tiger_lib/parse/
csv.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
use std::fs::read;
use std::iter::Peekable;
use std::path::Path;
use std::str::Chars;

use anyhow::{bail, Result};
use encoding_rs::WINDOWS_1252;

use crate::fileset::FileEntry;
use crate::report::ErrorLoc;
use crate::token::{Loc, Token};

/// Cursor-style parser over the text of a semicolon-separated file.
///
/// It walks `content` one character at a time, keeping the byte offset and the
/// line/column location in step with the character iterator.
#[derive(Clone, Debug)]
struct CsvParser<'a> {
    /// Location of the next unread character (the one `chars.peek()` would return).
    /// Line and column both start at 1.
    loc: Loc,
    /// Byte offset into `content` of the next unread character.
    offset: usize,
    /// The full text being parsed; field text is sliced out of it by byte offset.
    content: &'a str,
    /// Number of header lines that still have to be skipped before data lines are returned.
    header_lines: usize,
    /// Peekable character iterator over `content`.
    chars: Peekable<Chars<'a>>,
}

impl<'a> CsvParser<'a> {
    fn new(mut loc: Loc, header_lines: usize, content: &'a str) -> Self {
        loc.line = 1;
        loc.column = 1;
        let chars = content.chars().peekable();
        Self { loc, offset: 0, content, header_lines, chars }
    }

    fn next_char(&mut self) {
        // self.loc is always the loc of the peekable char
        if let Some(c) = self.chars.next() {
            self.offset += c.len_utf8();
            if c == '\n' {
                self.loc.line += 1;
                self.loc.column = 1;
            } else {
                self.loc.column += 1;
            }
        }
    }

    fn skip_whitespace(&mut self) {
        while let Some(c) = self.chars.peek() {
            if c.is_ascii_whitespace() {
                self.next_char();
            } else {
                break;
            }
        }
    }

    fn skip_line(&mut self) {
        while let Some(&c) = self.chars.peek() {
            if c == '\n' {
                break;
            }
            self.next_char();
        }
        self.next_char(); // Eat the newline
    }

    /// Return the next CSV line, or None if at end of file
    fn parse_csv(&mut self) -> Option<Vec<Token>> {
        // Loop until we have a non-comment line.
        loop {
            self.skip_whitespace();
            if self.chars.peek() == Some(&'#') {
                self.skip_line();
            } else if self.header_lines > 0 {
                self.skip_line();
                self.header_lines -= 1;
            } else {
                break;
            }
        }
        self.chars.peek()?;

        let mut vec = Vec::new();
        let mut loc = self.loc;
        let mut start_offset = self.offset;

        while let Some(c) = self.chars.peek() {
            match c {
                '#' | '\n' | ';' => {
                    let s = &self.content[start_offset..self.offset];
                    vec.push(Token::new(s, loc));
                    if c == &';' {
                        self.next_char();
                        loc = self.loc;
                        start_offset = self.offset;
                    } else {
                        break;
                    }
                }
                _ => self.next_char(),
            }
        }

        self.skip_line();
        Some(vec)
    }
}

/// Iterator that yields the lines of a CSV text one at a time.
///
/// Each item is the `Vec<Token>` produced by `CsvParser::parse_csv` for one line.
pub struct CsvReader<'a> {
    /// The parser that holds the position in the text and does the actual work.
    parser: CsvParser<'a>,
}

impl Iterator for CsvReader<'_> {
    /// The fields of one line, in order.
    type Item = Vec<Token>;

    /// Return the fields of the next line, or `None` once the parser reports end of input.
    fn next(&mut self) -> Option<Self::Item> {
        self.parser.parse_csv()
    }
}

/// Load the file at `fullpath` and decode its bytes as WINDOWS-1252 text.
///
/// No byte-order-mark handling is performed on the input.
///
/// # Errors
/// Returns an error if the file cannot be read, or if the decoder reports
/// malformed input.
pub fn read_csv(fullpath: &Path) -> Result<String> {
    let raw = read(fullpath)?;
    match WINDOWS_1252.decode_without_bom_handling(&raw) {
        (_, true) => bail!("invalid characters"),
        (text, false) => Ok(text.into_owned()),
    }
}

/// Build a [`CsvReader`] that iterates over the lines of `content`.
///
/// `entry` supplies the location (via `into_loc`) that the parser starts from.
/// `header_lines` is the number of leading lines to discard before any line is yielded;
/// blank lines and `#` comment lines are skipped without counting towards it.
// The `_csv` suffix repeats the module name, which the pedantic lint would flag.
#[allow(clippy::module_name_repetitions)]
pub fn parse_csv<'a>(entry: &FileEntry, header_lines: usize, content: &'a str) -> CsvReader<'a> {
    CsvReader { parser: CsvParser::new(entry.into_loc(), header_lines, content) }
}