darkfi/zkas/
lexer.rs

1/* This file is part of DarkFi (https://dark.fi)
2 *
3 * Copyright (C) 2020-2025 Dyne.org foundation
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU Affero General Public License as
7 * published by the Free Software Foundation, either version 3 of the
8 * License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 * GNU Affero General Public License for more details.
14 *
15 * You should have received a copy of the GNU Affero General Public License
16 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
17 */
18
19use std::{io::Result, str::Chars};
20
21use super::error::ErrorEmitter;
22
/// Characters that the lexer turns into single-character punctuation tokens.
const SPECIAL_CHARS: [char; 9] = [
    '{', '}', // braces
    '(', ')', // parentheses
    '[', ']', // brackets
    ',', ';', '=', // comma, semicolon, assignment
];
24
/// Returns `true` when `ch` is an ASCII letter (either case) or an underscore.
fn is_letter(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '_')
}
28
/// Returns `true` when `ch` is one of the ASCII decimal digits `0` through `9`.
fn is_digit(ch: char) -> bool {
    matches!(ch, '0'..='9')
}
32
/// Category of a [`Token`] produced by the lexer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenType {
    /// Name starting with a letter or `_`, followed by letters, digits or `_`.
    Symbol,
    /// Text enclosed in double quotes; the quotes are not part of the token text.
    String,
    /// Run of ASCII decimal digits.
    Number,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,
    /// `,`
    Comma,
    /// `;`
    Semicolon,
    /// `=`
    Assign,
}
48
49#[derive(Clone, Debug)]
50pub struct Token {
51    pub token: String,
52    pub token_type: TokenType,
53    pub line: usize,
54    pub column: usize,
55}
56
57impl Token {
58    fn new(token: &str, token_type: TokenType, line: usize, column: usize) -> Self {
59        Self { token: token.to_string(), token_type, line, column }
60    }
61}
62
63pub struct Lexer<'a> {
64    source: Chars<'a>,
65    error: ErrorEmitter,
66}
67
68impl<'a> Lexer<'a> {
69    pub fn new(filename: &str, source: Chars<'a>) -> Self {
70        // For nice error reporting, we'll load everything into a string
71        // vector so we have references to lines.
72        let lines: Vec<String> = source.as_str().lines().map(|x| x.to_string()).collect();
73        let error = ErrorEmitter::new("Lexer", filename, lines);
74
75        Self { source, error }
76    }
77
78    pub fn lex(&self) -> Result<Vec<Token>> {
79        let mut tokens = vec![];
80        let mut lineno = 1;
81        let mut column = 0;
82
83        // We use this as a buffer to store a single token, which is then
84        // reset after a token is pushed to the returning vec.
85        let mut buf = String::new();
86
87        // We use these to keep state when iterating.
88        let mut in_comment = false;
89        let mut in_string = false;
90        let mut in_number = false;
91        let mut in_symbol = false;
92
93        macro_rules! new_symbol {
94            () => {
95                tokens.push(Token::new(&buf, TokenType::Symbol, lineno, column - buf.len()));
96                in_symbol = false;
97                buf = String::new();
98            };
99        }
100        macro_rules! new_string {
101            () => {
102                tokens.push(Token::new(&buf, TokenType::String, lineno, column - buf.len()));
103                in_string = false;
104                buf = String::new();
105            };
106        }
107        macro_rules! new_number {
108            () => {
109                tokens.push(Token::new(&buf, TokenType::Number, lineno, column - buf.len()));
110                in_number = false;
111                buf = String::new();
112            };
113        }
114
115        #[allow(clippy::explicit_counter_loop)]
116        for c in self.source.clone() {
117            column += 1;
118
119            if c == '\n' {
120                if in_symbol {
121                    new_symbol!();
122                }
123
124                if in_string {
125                    return Err(self.error.abort("Strings can't contain newlines", lineno, column))
126                }
127
128                if in_number {
129                    return Err(self.error.abort("Numbers can't contain newlines", lineno, column))
130                }
131
132                in_comment = false;
133                lineno += 1;
134                column = 0;
135                continue
136            }
137
138            if c == '#' || in_comment {
139                if in_symbol {
140                    new_symbol!();
141                }
142
143                if in_number {
144                    new_number!();
145                }
146
147                if in_string {
148                    buf.push(c);
149                    continue
150                }
151
152                in_comment = true;
153                continue
154            }
155
156            if c.is_whitespace() {
157                if in_symbol {
158                    new_symbol!();
159                }
160
161                if in_number {
162                    new_number!();
163                }
164
165                if in_string {
166                    // For now we forbid whitespace in strings.
167                    return Err(self.error.abort(
168                        "Strings/Namespaces can't contain whitespace",
169                        lineno,
170                        column,
171                    ))
172                }
173
174                continue
175            }
176
177            // Main cases, in_comment is already checked above.
178            if !in_number && !in_symbol && !in_string && is_digit(c) {
179                in_number = true;
180                buf.push(c);
181                continue
182            }
183
184            if in_number && !is_digit(c) {
185                new_number!();
186            }
187
188            if in_number && is_digit(c) {
189                buf.push(c);
190                continue
191            }
192
193            if !in_number && !in_symbol && !in_string && is_letter(c) {
194                in_symbol = true;
195                buf.push(c);
196                continue
197            }
198
199            if !in_number && !in_symbol && !in_string && c == '"' {
200                // " I need to fix my Rust vis lexer
201                in_string = true;
202                continue
203            }
204
205            if (in_symbol || in_string) && (is_letter(c) || is_digit(c)) {
206                buf.push(c);
207                continue
208            }
209
210            if in_string && c == '"' {
211                // " I need to fix my vis lexer
212                if buf.is_empty() {
213                    return Err(self.error.abort("String cannot be empty", lineno, column))
214                }
215                new_string!();
216                continue
217            }
218
219            if SPECIAL_CHARS.contains(&c) {
220                if in_symbol {
221                    new_symbol!();
222                }
223
224                if in_number {
225                    new_number!();
226                }
227
228                if in_string {
229                    // TODO: Perhaps forbid these chars inside strings.
230                }
231
232                match c {
233                    '{' => {
234                        tokens.push(Token::new("{", TokenType::LeftBrace, lineno, column));
235                        continue
236                    }
237                    '}' => {
238                        tokens.push(Token::new("}", TokenType::RightBrace, lineno, column));
239                        continue
240                    }
241                    '(' => {
242                        tokens.push(Token::new("(", TokenType::LeftParen, lineno, column));
243                        continue
244                    }
245                    ')' => {
246                        tokens.push(Token::new(")", TokenType::RightParen, lineno, column));
247                        continue
248                    }
249                    '[' => {
250                        tokens.push(Token::new("[", TokenType::LeftBracket, lineno, column));
251                        continue
252                    }
253                    ']' => {
254                        tokens.push(Token::new("]", TokenType::RightBracket, lineno, column));
255                        continue
256                    }
257                    ',' => {
258                        tokens.push(Token::new(",", TokenType::Comma, lineno, column));
259                        continue
260                    }
261                    ';' => {
262                        tokens.push(Token::new(";", TokenType::Semicolon, lineno, column));
263                        continue
264                    }
265                    '=' => {
266                        tokens.push(Token::new("=", TokenType::Assign, lineno, column));
267                        continue
268                    }
269                    _ => {
270                        return Err(self.error.abort(
271                            &format!("Invalid token `{}`", c),
272                            lineno,
273                            column - 1,
274                        ))
275                    }
276                }
277            }
278
279            return Err(self.error.abort(&format!("Invalid token `{}`", c), lineno, column - 1))
280        }
281
282        Ok(tokens)
283    }
284}