use super::ArrowResult;
use arrow::datatypes::PhysicalType;
use arrow::error::ArrowError;
use arrow::io::parquet::write::{self, FileWriter, *};
use arrow::io::parquet::write::{array_to_pages, DynIter, DynStreamingIterator, Encoding};
use polars_core::prelude::*;
use rayon::prelude::*;
use std::collections::VecDeque;
use std::io::Write;

/// Holds the compressed pages of a single column and hands them to the
/// parquet `FileWriter` one at a time as a `FallibleStreamingIterator`.
struct Bla {
    columns: VecDeque<CompressedPage>,
    current: Option<CompressedPage>,
}

impl Bla {
    pub fn new(columns: VecDeque<CompressedPage>) -> Self {
        Self {
            columns,
            current: None,
        }
    }
}

impl FallibleStreamingIterator for Bla {
    type Item = CompressedPage;
    type Error = ArrowError;

    fn advance(&mut self) -> ArrowResult<()> {
        self.current = self.columns.pop_front();
        Ok(())
    }

    fn get(&self) -> Option<&Self::Item> {
        self.current.as_ref()
    }
}
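
/// Builder-style writer that serializes a polars `DataFrame` to the Apache
/// Parquet format through the `arrow::io::parquet::write` API.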
#[must_use]
pub struct ParquetWriter<W> {
    writer: W,
    compression: write::Compression,
    statistics: bool,
}

pub use write::Compression as ParquetCompression;

impl<W> ParquetWriter<W>
where
    W: Write,
{
    /// Create a new writer around any `Write` destination.
    pub fn new(writer: W) -> Self {
        ParquetWriter {
            writer,
            compression: write::Compression::Snappy,
            statistics: false,
        }
    }

    /// Set the compression used for the data pages. Defaults to `Snappy`.
    pub fn with_compression(mut self, compression: write::Compression) -> Self {
        self.compression = compression;
        self
    }

    /// Toggle whether column statistics are computed and written per row group.
    pub fn with_statistics(mut self, statistics: bool) -> Self {
        self.statistics = statistics;
        self
    }

    /// Write the `DataFrame` to the underlying writer, consuming `self`.
    pub fn finish(mut self, df: &mut DataFrame) -> Result<()> {
        // Aggregate the chunks of every column so `iter_chunks` yields
        // contiguous record batches.
        df.rechunk();
        let fields = df.schema().to_arrow().fields;
        let rb_iter = df.iter_chunks();

        let options = write::WriteOptions {
            write_statistics: self.statistics,
            compression: self.compression,
            version: write::Version::V2,
        };
        let schema = ArrowSchema::from(fields);
        let parquet_schema = write::to_parquet_schema(&schema)?;

        // Dictionary arrays are written with RLE dictionary encoding; every
        // other physical type uses plain encoding.
        let encodings = schema
            .fields
            .iter()
            .map(|field| match field.data_type().to_physical_type() {
                PhysicalType::Dictionary(_) => Encoding::RleDictionary,
                _ => Encoding::Plain,
            })
            .collect::<Vec<_>>();

        let row_group_iter = rb_iter.map(|batch| {
            // Encode and compress the pages of every column in parallel.
            let columns = batch
                .columns()
                .par_iter()
                .zip(parquet_schema.columns().par_iter())
                .zip(encodings.par_iter())
                .map(|((array, descriptor), encoding)| {
                    let encoded_pages =
                        array_to_pages(array.as_ref(), descriptor.clone(), options, *encoding)?;
                    encoded_pages
                        .map(|page| {
                            compress(page?, vec![], options.compression).map_err(|x| x.into())
                        })
                        .collect::<ArrowResult<VecDeque<_>>>()
                })
                .collect::<ArrowResult<Vec<VecDeque<CompressedPage>>>>()?;

            // Each record batch becomes one row group.
            let row_group = DynIter::new(
                columns
                    .into_iter()
                    .map(|column| Ok(DynStreamingIterator::new(Bla::new(column)))),
            );
            ArrowResult::Ok((row_group, batch.columns()[0].len()))
        });

        let mut writer = FileWriter::try_new(&mut self.writer, schema, options)?;
        writer.start()?;
        for group in row_group_iter {
            let (group, len) = group?;
            writer.write(group, len)?;
        }
        let _ = writer.end(None)?;

        Ok(())
    }
}
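
// Usage sketch (illustrative, not part of this module): write a small
// `DataFrame` to a file on disk. The `df!` invocation and the output path are
// assumptions made for the example.
//
//     use polars_core::df;
//     use std::fs::File;
//
//     let mut df = df!("a" => &[1i32, 2, 3], "b" => &["x", "y", "z"])?;
//     let file = File::create("example.parquet")?;
//     ParquetWriter::new(file)
//         .with_compression(ParquetCompression::Snappy)
//         .with_statistics(true)
//         .finish(&mut df)?;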