tiger_lib/parse/
csv.rs

1use std::fs::read;
2use std::iter::Peekable;
3use std::path::Path;
4use std::str::Chars;
5
6use anyhow::{Result, bail};
7use encoding_rs::WINDOWS_1252;
8
9use crate::fileset::FileEntry;
10use crate::report::ErrorLoc;
11use crate::token::{Loc, Token};
12
13#[derive(Clone, Debug)]
14struct CsvParser<'a> {
15    loc: Loc,
16    offset: usize,
17    content: &'a str,
18    header_lines: usize,
19    chars: Peekable<Chars<'a>>,
20}
21
22impl<'a> CsvParser<'a> {
23    fn new(mut loc: Loc, header_lines: usize, content: &'a str) -> Self {
24        loc.line = 1;
25        loc.column = 1;
26        let chars = content.chars().peekable();
27        Self { loc, offset: 0, content, header_lines, chars }
28    }
29
30    fn next_char(&mut self) {
31        // self.loc is always the loc of the peekable char
32        if let Some(c) = self.chars.next() {
33            self.offset += c.len_utf8();
34            if c == '\n' {
35                self.loc.line += 1;
36                self.loc.column = 1;
37            } else {
38                self.loc.column += 1;
39            }
40        }
41    }
42
43    fn skip_whitespace(&mut self) {
44        while let Some(c) = self.chars.peek() {
45            if c.is_ascii_whitespace() {
46                self.next_char();
47            } else {
48                break;
49            }
50        }
51    }
52
53    fn skip_line(&mut self) {
54        while let Some(&c) = self.chars.peek() {
55            if c == '\n' {
56                break;
57            }
58            self.next_char();
59        }
60        self.next_char(); // Eat the newline
61    }
62
63    /// Return the next CSV line, or None if at end of file
64    fn parse_csv(&mut self) -> Option<Vec<Token>> {
65        // Loop until we have a non-comment line.
66        loop {
67            self.skip_whitespace();
68            if self.chars.peek() == Some(&'#') {
69                self.skip_line();
70            } else if self.header_lines > 0 {
71                self.skip_line();
72                self.header_lines -= 1;
73            } else {
74                break;
75            }
76        }
77        self.chars.peek()?;
78
79        let mut vec = Vec::new();
80        let mut loc = self.loc;
81        let mut start_offset = self.offset;
82
83        while let Some(c) = self.chars.peek() {
84            match c {
85                '#' | '\r' | '\n' | ';' => {
86                    let s = &self.content[start_offset..self.offset];
87                    vec.push(Token::new(s, loc));
88                    if c == &';' {
89                        self.next_char();
90                        loc = self.loc;
91                        start_offset = self.offset;
92                    } else {
93                        break;
94                    }
95                }
96                _ => self.next_char(),
97            }
98        }
99
100        self.skip_line();
101        Some(vec)
102    }
103}
104
105pub struct CsvReader<'a> {
106    parser: CsvParser<'a>,
107}
108
109impl Iterator for CsvReader<'_> {
110    type Item = Vec<Token>;
111
112    fn next(&mut self) -> Option<Self::Item> {
113        self.parser.parse_csv()
114    }
115}
116
117pub fn read_csv(fullpath: &Path) -> Result<String> {
118    let bytes = read(fullpath)?;
119    let (content, errors) = WINDOWS_1252.decode_without_bom_handling(&bytes);
120    if errors {
121        bail!("invalid characters");
122    }
123    Ok(content.into_owned())
124}
125
126#[allow(clippy::module_name_repetitions)]
127pub fn parse_csv<'a>(entry: &FileEntry, header_lines: usize, content: &'a str) -> CsvReader<'a> {
128    let parser = CsvParser::new(entry.into_loc(), header_lines, content);
129    CsvReader { parser }
130}