1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#[cfg(any(feature = "ipc", feature = "parquet", feature = "avro"))]
use crate::ArrowSchema;
use dirs::home_dir;
use polars_core::frame::DataFrame;
#[cfg(any(feature = "ipc", feature = "avro", feature = "parquet"))]
use polars_core::prelude::*;
use std::path::{Path, PathBuf};

// used by python polars
pub fn resolve_homedir(path: &Path) -> PathBuf {
    // replace "~" with home directory
    if path.starts_with("~") {
        if let Some(homedir) = home_dir() {
            return homedir.join(path.strip_prefix("~").unwrap());
        }
    }

    path.into()
}

#[cfg(any(feature = "ipc", feature = "parquet", feature = "avro"))]
pub(crate) fn apply_projection(schema: &ArrowSchema, projection: &[usize]) -> ArrowSchema {
    let fields = &schema.fields;
    let fields = projection
        .iter()
        .map(|idx| fields[*idx].clone())
        .collect::<Vec<_>>();
    ArrowSchema::from(fields)
}

#[cfg(any(feature = "ipc", feature = "avro", feature = "parquet"))]
pub(crate) fn columns_to_projection(
    columns: Vec<String>,
    schema: &ArrowSchema,
) -> Result<Vec<usize>> {
    use ahash::AHashMap;

    let err = |column: &str| {
        let valid_fields: Vec<String> = schema.fields.iter().map(|f| f.name.clone()).collect();
        PolarsError::NotFound(format!(
            "Unable to get field named \"{}\". Valid fields: {:?}",
            column, valid_fields
        ))
    };

    let mut prj = Vec::with_capacity(columns.len());
    if columns.len() > 100 {
        let mut column_names = AHashMap::with_capacity(schema.fields.len());
        schema.fields.iter().enumerate().for_each(|(i, c)| {
            column_names.insert(c.name.as_str(), i);
        });

        for column in columns.iter() {
            if let Some(&i) = column_names.get(column.as_str()) {
                prj.push(i)
            } else {
                return Err(err(column));
            }
        }
    } else {
        for column in columns.iter() {
            let i = schema.try_index_of(column)?;
            prj.push(i);
        }
    }

    Ok(prj)
}

/// Because of threading every row starts from `0` or from `offset`.
/// We must correct that so that they are monotonically increasing.
pub(crate) fn update_row_counts(dfs: &mut [(DataFrame, u32)]) {
    if !dfs.is_empty() {
        let mut previous = dfs[0].1;
        for (df, n_read) in &mut dfs[1..] {
            if let Some(s) = df.get_columns_mut().get_mut(0) {
                *s = &*s + previous;
            }
            previous = *n_read;
        }
    }
}

#[cfg(test)]
mod tests {
    use super::resolve_homedir;
    use std::path::PathBuf;

    #[cfg(not(target_os = "windows"))]
    #[test]
    fn test_resolve_homedir() {
        let paths: Vec<PathBuf> = vec![
            "~/dir1/dir2/test.csv".into(),
            "/abs/path/test.csv".into(),
            "rel/path/test.csv".into(),
            "/".into(),
            "~".into(),
        ];

        let resolved: Vec<PathBuf> = paths.iter().map(|x| resolve_homedir(x)).collect();

        assert_eq!(resolved[0].file_name(), paths[0].file_name());
        assert!(resolved[0].is_absolute());
        assert_eq!(resolved[1], paths[1]);
        assert_eq!(resolved[2], paths[2]);
        assert_eq!(resolved[3], paths[3]);
        assert!(resolved[4].is_absolute());
    }

    #[cfg(target_os = "windows")]
    #[test]
    fn test_resolve_homedir_windows() {
        let paths: Vec<PathBuf> = vec![
            r#"c:\Users\user1\test.csv"#.into(),
            r#"~\user1\test.csv"#.into(),
            "~".into(),
        ];

        let resolved: Vec<PathBuf> = paths.iter().map(|x| resolve_homedir(x)).collect();

        assert_eq!(resolved[0], paths[0]);
        assert_eq!(resolved[1].file_name(), paths[1].file_name());
        assert!(resolved[1].is_absolute());
        assert!(resolved[2].is_absolute());
    }
}