use std::convert::TryInto;
use std::sync::Arc;
use parquet_format_async_temp::{ColumnChunk, ColumnMetaData, Encoding};
use super::column_descriptor::ColumnDescriptor;
use crate::error::Result;
use crate::schema::types::{ParquetType, PhysicalType};
use crate::statistics::{deserialize_statistics, Statistics};
use crate::{compression::Compression, schema::types::Type};
/// Metadata for a column chunk, wrapping the Thrift-generated [`ColumnChunk`]
/// together with the [`ColumnDescriptor`] of the column it belongs to.
#[derive(Debug, Clone)]
pub struct ColumnChunkMetaData {
    column_chunk: ColumnChunk,
    column_descr: ColumnDescriptor,
}
impl ColumnChunkMetaData {
    /// Creates a new [`ColumnChunkMetaData`] from its parts.
    pub fn new(column_chunk: ColumnChunk, column_descr: ColumnDescriptor) -> Self {
        Self {
            column_chunk,
            column_descr,
        }
    }

    /// The file where the column chunk is stored.
    ///
    /// If not set, it is assumed to be in the same file as the metadata.
    pub fn file_path(&self) -> &Option<String> {
        &self.column_chunk.file_path
    }

    /// Byte offset of the column chunk within [`file_path()`](Self::file_path).
    pub fn file_offset(&self) -> i64 {
        self.column_chunk.file_offset
    }

    /// The column's [`ColumnMetaData`].
    ///
    /// # Panics
    /// Panics if the metadata is not set, which is invalid per the Thrift definition.
    fn column_metadata(&self) -> &ColumnMetaData {
        self.column_chunk.meta_data.as_ref().unwrap()
    }

    /// The (physical) [`Type`] of this column.
    pub fn type_(&self) -> &Type {
        &self.column_metadata().type_
    }

    /// The [`ColumnDescriptor`] of this column.
    pub fn descriptor(&self) -> &ColumnDescriptor {
        &self.column_descr
    }

    /// The [`PhysicalType`] of this column.
    ///
    /// # Panics
    /// Panics if the descriptor's type is not primitive; column chunks always
    /// describe leaf (primitive) columns.
    pub fn physical_type(&self) -> PhysicalType {
        match self.descriptor().type_() {
            ParquetType::PrimitiveType { physical_type, .. } => *physical_type,
            _ => unreachable!(),
        }
    }

    /// Decodes this column chunk's statistics, if present.
    pub fn statistics(&self) -> Option<Result<Arc<dyn Statistics>>> {
        self.column_metadata()
            .statistics
            .as_ref()
            .map(|x| deserialize_statistics(x, self.descriptor().clone()))
    }

    /// Total number of values in this column chunk.
    pub fn num_values(&self) -> i64 {
        self.column_metadata().num_values
    }

    /// The [`Compression`] of this column chunk.
    pub fn compression(&self) -> Compression {
        self.column_metadata().codec.try_into().unwrap()
    }

    /// The total compressed size (in bytes) of this column chunk.
    pub fn compressed_size(&self) -> i64 {
        self.column_metadata().total_compressed_size
    }

    /// The total uncompressed size (in bytes) of this column chunk.
    pub fn uncompressed_size(&self) -> i64 {
        self.column_metadata().total_uncompressed_size
    }

    /// The offset of the first data page.
    pub fn data_page_offset(&self) -> i64 {
        self.column_metadata().data_page_offset
    }

    /// Whether this column chunk contains an index page.
    pub fn has_index_page(&self) -> bool {
        self.column_metadata().index_page_offset.is_some()
    }

    /// The offset of the index page, if any.
    pub fn index_page_offset(&self) -> Option<i64> {
        self.column_metadata().index_page_offset
    }

    /// The offset of the dictionary page, if any.
    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.column_metadata().dictionary_page_offset
    }

    /// The encodings used in this column chunk.
    pub fn column_encoding(&self) -> &Vec<Encoding> {
        &self.column_metadata().encodings
    }

    /// The offset and length (in bytes) of this column chunk within the file.
    ///
    /// The chunk starts at the dictionary page when one exists, otherwise at
    /// the first data page.
    pub fn byte_range(&self) -> (u64, u64) {
        let col_start = self
            .dictionary_page_offset()
            .unwrap_or_else(|| self.data_page_offset());
        let col_len = self.compressed_size();
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }
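
    // A minimal usage sketch for `byte_range` (the `reader: impl Read + Seek`
    // and the surrounding file-reading code are assumptions, not part of this
    // module): the returned span is what a reader fetches from the file before
    // decompressing and decoding pages.
    //
    //     let (start, length) = meta.byte_range();
    //     reader.seek(SeekFrom::Start(start))?;
    //     let mut chunk = vec![0u8; length as usize];
    //     reader.read_exact(&mut chunk)?;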
    /// Creates a [`ColumnChunkMetaData`] from a Thrift-deserialized [`ColumnChunk`].
    pub fn try_from_thrift(
        column_descr: ColumnDescriptor,
        column_chunk: ColumnChunk,
    ) -> Result<Self> {
        Ok(Self {
            column_chunk,
            column_descr,
        })
    }

    /// Converts this metadata back into a Thrift [`ColumnChunk`].
    pub fn into_thrift(self) -> ColumnChunk {
        self.column_chunk
    }
}
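
// A hedged round-trip sketch (`descriptor` and `thrift_chunk` are assumed to
// come from a decoded file footer; this module does not construct them):
// converting from and back to Thrift leaves the underlying `ColumnChunk` intact.
//
//     let meta = ColumnChunkMetaData::try_from_thrift(descriptor, thrift_chunk)?;
//     let thrift_chunk = meta.into_thrift();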