1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
use std::io::SeekFrom;
use async_stream::try_stream;
use futures::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt, Stream};
use parquet_format_async_temp::thrift::protocol::TCompactInputStreamProtocol;
use crate::compression::Compression;
use crate::error::Result;
use crate::metadata::{ColumnChunkMetaData, ColumnDescriptor};
use crate::page::{CompressedDataPage, ParquetPageHeader};
use super::page_iterator::{finish_page, get_page_header, FinishedPage};
use super::PageFilter;
/// Positions `reader` at the column chunk described by `column_metadata` and
/// returns a stream over its compressed data pages.
///
/// The reader is moved to the first offset of `column_metadata.byte_range()`
/// before the stream is built. `buffer` is handed to the stream as its scratch
/// space for page bodies, and `pages_filter` is forwarded to decide which data
/// pages are read.
///
/// # Errors
///
/// Returns an error if the initial seek fails. Errors hit while reading
/// individual pages are surfaced as items of the returned stream.
pub async fn get_page_stream<'a, RR: AsyncRead + Unpin + Send + AsyncSeek>(
    column_metadata: &'a ColumnChunkMetaData,
    reader: &'a mut RR,
    buffer: Vec<u8>,
    pages_filter: PageFilter,
) -> Result<impl Stream<Item = Result<CompressedDataPage>> + 'a> {
    // Only the start of the byte range matters here; the stream stops on its
    // own once enough values have been seen.
    let start_offset = column_metadata.byte_range().0;
    reader.seek(SeekFrom::Start(start_offset)).await?;

    let total_num_values = column_metadata.num_values();
    let compression = column_metadata.compression();
    let descriptor = column_metadata.descriptor();

    let pages = _get_page_stream(
        reader,
        total_num_values,
        compression,
        descriptor,
        buffer,
        pages_filter,
    );
    Ok(pages)
}
/// Builds a stream of [`CompressedDataPage`]s read sequentially from `reader`,
/// starting at the reader's current position.
///
/// Pages are consumed until the running total of `num_values` reported by the
/// data-page headers reaches `total_num_values`. For every page:
///
/// * the header is decoded with `read_page_header`;
/// * if the header is a data-page header and `pages_filter` rejects it, the
///   page body is skipped by seeking forward over it;
/// * otherwise the body is read into `buffer` and passed to `finish_page`.
///   Data pages are yielded, dictionary pages are retained and handed to
///   subsequent `finish_page` calls, and any other result is ignored.
///
/// # Errors
///
/// The stream yields an error when reading or seeking fails, when
/// `finish_page` fails, or when a page header declares a negative
/// `compressed_page_size`.
fn _get_page_stream<'a, R: AsyncRead + AsyncSeek + Unpin + Send>(
    reader: &'a mut R,
    total_num_values: i64,
    compression: Compression,
    descriptor: &'a ColumnDescriptor,
    mut buffer: Vec<u8>,
    pages_filter: PageFilter,
) -> impl Stream<Item = Result<CompressedDataPage>> + 'a {
    let mut seen_values = 0i64;
    let mut current_dictionary = None;
    try_stream! {
        while seen_values < total_num_values {
            // the header
            let page_header = read_page_header(reader).await?;

            let data_header = get_page_header(&page_header);
            // Values of filtered-out pages still count towards the total,
            // otherwise the loop would read past the column chunk.
            seen_values += data_header
                .as_ref()
                .map(|x| x.num_values() as i64)
                .unwrap_or_default();

            // A negative size can only come from a malformed header. Reject it
            // instead of seeking backwards or requesting a huge allocation.
            let compressed_size = page_header.compressed_page_size;
            let read_size = usize::try_from(compressed_size).map_err(|_| {
                std::io::Error::new(
                    std::io::ErrorKind::InvalidData,
                    "page header declares a negative compressed page size",
                )
            })?;

            if let Some(data_header) = data_header {
                if !pages_filter(descriptor, &data_header) {
                    // The page has been filtered out: skip over its body.
                    reader
                        .seek(SeekFrom::Current(i64::from(compressed_size)))
                        .await?;
                    continue;
                }
            }

            // followed by the page body
            if read_size > 0 {
                buffer.resize(read_size, 0);
                reader.read_exact(&mut buffer).await?;
            }

            let result = finish_page(
                page_header,
                &mut buffer,
                compression,
                &current_dictionary,
                descriptor,
            )?;

            match result {
                FinishedPage::Data(page) => {
                    yield page;
                }
                FinishedPage::Dict(dict) => {
                    // Keep the dictionary for the data pages that follow.
                    current_dictionary = Some(dict);
                }
                _ => {}
            }
        }
    }
}
/// Decodes a single [`ParquetPageHeader`] from `reader`.
///
/// The reader is wrapped in a `TCompactInputStreamProtocol` and the header is
/// read through `ParquetPageHeader::stream_from_in_protocol`.
///
/// # Errors
///
/// Returns an error if decoding the header fails.
async fn read_page_header<R: AsyncRead + Unpin + Send>(
    reader: &mut R,
) -> Result<ParquetPageHeader> {
    let mut protocol = TCompactInputStreamProtocol::new(reader);
    Ok(ParquetPageHeader::stream_from_in_protocol(&mut protocol).await?)
}