1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
use std::io::SeekFrom;
use futures::{io::Cursor, AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt};
use parquet_format_async_temp::thrift::protocol::TCompactInputStreamProtocol;
use parquet_format_async_temp::FileMetaData as TFileMetaData;
use super::super::{metadata::*, DEFAULT_FOOTER_READ_SIZE, FOOTER_SIZE, PARQUET_MAGIC};
use super::metadata::{metadata_len, parse_column_orders};
use crate::error::{ParquetError, Result};
/// Returns the total length of `seek` in bytes.
///
/// The length is discovered by seeking to the end of the stream. The position
/// observed on entry is restored afterwards, so callers see an unchanged cursor.
///
/// # Errors
///
/// Propagates any I/O error raised by the underlying seek operations.
async fn stream_len(
    seek: &mut (impl AsyncSeek + std::marker::Unpin),
) -> std::result::Result<u64, std::io::Error> {
    // A zero-offset relative seek reports the current position without moving.
    let origin = seek.seek(SeekFrom::Current(0)).await?;
    let end = seek.seek(SeekFrom::End(0)).await?;

    // Already at the end on entry: nothing to restore, skip the extra seek.
    if origin == end {
        return Ok(end);
    }

    seek.seek(SeekFrom::Start(origin)).await?;
    Ok(end)
}
/// Asynchronously reads and decodes the file-level metadata stored at the end
/// of `reader`.
///
/// The function first fetches up to `DEFAULT_FOOTER_READ_SIZE` trailing bytes
/// in a single read. When the whole footer (metadata plus `FOOTER_SIZE`) is
/// shorter than `DEFAULT_FOOTER_READ_SIZE`, the metadata is decoded straight
/// from that in-memory buffer; otherwise `reader` is repositioned to the start
/// of the metadata and decoded from there.
///
/// # Errors
///
/// Returns an error when:
/// * the stream is shorter than `FOOTER_SIZE`;
/// * the last four bytes read do not equal `PARQUET_MAGIC`;
/// * the length reported by `metadata_len` is negative;
/// * the footer would start before the beginning of the stream;
/// * any seek/read fails, or the Thrift payload or schema cannot be decoded.
pub async fn read_metadata<R: AsyncRead + AsyncSeek + Send + std::marker::Unpin>(
    reader: &mut R,
) -> Result<FileMetaData> {
    let file_size = stream_len(reader).await?;
    if file_size < FOOTER_SIZE {
        return Err(general_err!(
            "Invalid Parquet file. Size is smaller than footer"
        ));
    }

    // Fetch the tail of the stream in one read; never ask for more than exists.
    let tail_len = file_size.min(DEFAULT_FOOTER_READ_SIZE) as usize;
    let mut tail = vec![0; tail_len];
    reader.seek(SeekFrom::End(-(tail_len as i64))).await?;
    reader.read_exact(&mut tail).await?;

    // The stream must end with the magic marker.
    if tail[tail_len - 4..] != PARQUET_MAGIC {
        return Err(general_err!("Invalid Parquet file. Corrupt footer"));
    }

    let meta_len = metadata_len(&tail, tail_len);
    if meta_len < 0 {
        return Err(general_err!(
            "Invalid Parquet file. Metadata length is less than zero ({})",
            meta_len
        ));
    }

    // Total number of trailing bytes occupied by the metadata and the footer.
    let footer_len = FOOTER_SIZE + meta_len as u64;
    if footer_len > file_size {
        return Err(general_err!(
            "Invalid Parquet file. Metadata start is less than zero ({})",
            file_size as i64 - footer_len as i64
        ));
    }

    // Both sources are positioned the same way: `footer_len` bytes before
    // their respective ends.
    let metadata_start = SeekFrom::End(-(footer_len as i64));
    let t_file_metadata = if footer_len < DEFAULT_FOOTER_READ_SIZE {
        // The bytes already fetched cover the whole footer; decode from memory.
        let mut cursor = Cursor::new(tail);
        cursor.seek(metadata_start).await?;
        let mut prot = TCompactInputStreamProtocol::new(cursor);
        TFileMetaData::stream_from_in_protocol(&mut prot).await?
    } else {
        // The initial read was not taken to be long enough; decode directly
        // from the underlying reader instead.
        reader.seek(metadata_start).await?;
        let mut prot = TCompactInputStreamProtocol::new(reader);
        TFileMetaData::stream_from_in_protocol(&mut prot).await?
    };

    let schema: Vec<_> = t_file_metadata.schema.iter().collect();
    let schema_descr = SchemaDescriptor::try_from_thrift(&schema)?;

    // Convert row groups one by one, stopping at the first failure.
    let mut row_groups = Vec::with_capacity(t_file_metadata.row_groups.len());
    for rg in t_file_metadata.row_groups {
        row_groups.push(RowGroupMetaData::try_from_thrift(&schema_descr, rg)?);
    }

    let column_orders = t_file_metadata
        .column_orders
        .map(|orders| parse_column_orders(&orders, &schema_descr));

    Ok(FileMetaData::new(
        t_file_metadata.version,
        t_file_metadata.num_rows,
        t_file_metadata.created_by,
        row_groups,
        t_file_metadata.key_value_metadata,
        schema_descr,
        column_orders,
    ))
}