1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#[cfg(any(feature = "ipc", feature = "parquet", feature = "avro"))]
use crate::ArrowSchema;
use dirs::home_dir;
use polars_core::frame::DataFrame;
#[cfg(any(feature = "ipc", feature = "avro", feature = "parquet"))]
use polars_core::prelude::*;
use std::path::{Path, PathBuf};
/// Expands a leading `~` path component into the current user's home directory.
///
/// When `path` begins with a `~` component and `home_dir()` yields a directory,
/// the remainder of `path` is joined onto that directory. In every other case
/// (no leading `~` component, or no home directory available) an owned copy of
/// `path` is returned unchanged.
///
/// The prefix check is component-wise, so a path such as `~user/file` is *not*
/// treated as starting with `~`.
pub fn resolve_homedir(path: &Path) -> PathBuf {
    path.strip_prefix("~")
        .ok()
        // Only query the home directory once we know there is a `~` to expand.
        .and_then(|remainder| home_dir().map(|home| home.join(remainder)))
        .unwrap_or_else(|| path.to_path_buf())
}
/// Builds a new schema containing only the fields selected by `projection`.
///
/// Each entry of `projection` is an index into `schema.fields`; the selected
/// fields are cloned into the result in the order given by `projection`.
///
/// # Panics
///
/// Panics if an index in `projection` is out of range for `schema.fields`.
#[cfg(any(feature = "ipc", feature = "parquet", feature = "avro"))]
pub(crate) fn apply_projection(schema: &ArrowSchema, projection: &[usize]) -> ArrowSchema {
    let mut selected = Vec::with_capacity(projection.len());
    for &position in projection {
        selected.push(schema.fields[position].clone());
    }
    ArrowSchema::from(selected)
}
/// Translates a list of column names into their positional indices in `schema`.
///
/// The returned vector has one index per entry of `columns`, in the same order.
///
/// Two lookup strategies are used:
/// * at most 100 requested columns: each name is resolved individually through
///   `schema.try_index_of`;
/// * more than 100 requested columns: a name -> index map over all schema
///   fields is built once and every name is resolved against it.
///
/// # Errors
///
/// * In the map-based path, a name missing from the schema yields
///   `PolarsError::NotFound` listing the valid field names.
/// * In the per-column path, whatever error `schema.try_index_of` reports is
///   propagated.
#[cfg(any(feature = "ipc", feature = "avro", feature = "parquet"))]
pub(crate) fn columns_to_projection(
    columns: Vec<String>,
    schema: &ArrowSchema,
) -> Result<Vec<usize>> {
    use ahash::AHashMap;

    let mut indices = Vec::with_capacity(columns.len());

    // Few columns: resolve each name directly against the schema.
    if columns.len() <= 100 {
        for name in &columns {
            indices.push(schema.try_index_of(name)?);
        }
        return Ok(indices);
    }

    // Many columns: index the schema's field names once, then do map lookups.
    let mut positions = AHashMap::with_capacity(schema.fields.len());
    for (position, field) in schema.fields.iter().enumerate() {
        positions.insert(field.name.as_str(), position);
    }

    for name in &columns {
        match positions.get(name.as_str()) {
            Some(&position) => indices.push(position),
            None => {
                // Collect the valid names only on the failure path.
                let valid_fields: Vec<String> =
                    schema.fields.iter().map(|f| f.name.clone()).collect();
                return Err(PolarsError::NotFound(format!(
                    "Unable to get field named \"{}\". Valid fields: {:?}",
                    name, valid_fields
                )));
            }
        }
    }

    Ok(indices)
}
/// Shifts the first column of every frame after the first by a row offset.
///
/// `dfs` pairs each frame with a `u32` count. The first frame is left as is.
/// For each later frame, the count stored with the *preceding* entry is added
/// to its first column (if it has one). An empty slice is a no-op.
///
/// NOTE(review): the offset is replaced by each entry's count rather than
/// accumulated. That is only correct if the counts are already running totals;
/// if they are per-frame row counts, frames from the third onwards would be
/// under-shifted. Confirm against the callers.
pub(crate) fn update_row_counts(dfs: &mut [(DataFrame, u32)]) {
    if let Some((first, rest)) = dfs.split_first_mut() {
        let mut offset = first.1;
        for entry in rest.iter_mut() {
            if let Some(first_column) = entry.0.get_columns_mut().get_mut(0) {
                *first_column = &*first_column + offset;
            }
            offset = entry.1;
        }
    }
}
#[cfg(test)]
mod tests {
    use super::resolve_homedir;
    use std::path::PathBuf;

    // Non-Windows: `~`-prefixed paths must become absolute while keeping their
    // file name; every other path must come back unchanged.
    #[cfg(not(target_os = "windows"))]
    #[test]
    fn test_resolve_homedir() {
        let in_home = PathBuf::from("~/dir1/dir2/test.csv");
        let expanded = resolve_homedir(&in_home);
        assert_eq!(expanded.file_name(), in_home.file_name());
        assert!(expanded.is_absolute());

        for raw in &["/abs/path/test.csv", "rel/path/test.csv", "/"] {
            let untouched = PathBuf::from(*raw);
            assert_eq!(resolve_homedir(&untouched), untouched);
        }

        assert!(resolve_homedir(&PathBuf::from("~")).is_absolute());
    }

    // Windows: same expectations, using backslash-separated paths.
    #[cfg(target_os = "windows")]
    #[test]
    fn test_resolve_homedir_windows() {
        let absolute = PathBuf::from(r#"c:\Users\user1\test.csv"#);
        assert_eq!(resolve_homedir(&absolute), absolute);

        let in_home = PathBuf::from(r#"~\user1\test.csv"#);
        let expanded = resolve_homedir(&in_home);
        assert_eq!(expanded.file_name(), in_home.file_name());
        assert!(expanded.is_absolute());

        assert!(resolve_homedir(&PathBuf::from("~")).is_absolute());
    }
}