leo_parser/tokenizer/
lexer.rs

1// Copyright (C) 2019-2025 Provable Inc.
2// This file is part of the Leo library.
3
4// The Leo library is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8
9// The Leo library is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13
14// You should have received a copy of the GNU General Public License
15// along with the Leo library. If not, see <https://www.gnu.org/licenses/>.
16
17use crate::tokenizer::Token;
18use leo_errors::{ParserError, Result};
19use leo_span::{Span, Symbol};
20
21use serde::{Deserialize, Serialize};
22use std::{
23    fmt,
24    iter::{Peekable, from_fn},
25};
26
27/// Eat an identifier, that is, a string matching '[a-zA-Z][a-zA-Z\d_]*', if any.
28fn eat_identifier(input: &mut Peekable<impl Iterator<Item = char>>) -> Option<String> {
29    input.peek().filter(|c| c.is_ascii_alphabetic())?;
30    Some(from_fn(|| input.next_if(|c| c.is_ascii_alphanumeric() || c == &'_')).collect())
31}
32
33/// Checks if a char is a Unicode Bidirectional Override code point
34fn is_bidi_override(c: char) -> bool {
35    let i = c as u32;
36    (0x202A..=0x202E).contains(&i) || (0x2066..=0x2069).contains(&i)
37}
38
39/// Ensure that `string` contains no Unicode Bidirectional Override code points.
40fn ensure_no_bidi_override(string: &str) -> Result<()> {
41    if string.chars().any(is_bidi_override) {
42        return Err(ParserError::lexer_bidi_override().into());
43    }
44    Ok(())
45}
46
47impl Token {
48    // todo: remove this unused code or reference https://github.com/Geal/nom/blob/main/examples/string.rs
49    // // Eats the parts of the unicode character after \u.
50    // fn eat_unicode_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> {
51    //     let mut unicode = String::new();
52    //     // Account for the chars '\' and 'u'.
53    //     let mut len = 2;
54    //
55    //     if input.next_if_eq(&'{').is_some() {
56    //         len += 1;
57    //     } else if let Some(c) = input.next() {
58    //         return Err(ParserError::lexer_unopened_escaped_unicode_char(c).into());
59    //     } else {
60    //         return Err(ParserError::lexer_empty_input_tendril().into());
61    //     }
62    //
63    //     while let Some(c) = input.next_if(|c| c != &'}') {
64    //         len += 1;
65    //         unicode.push(c);
66    //     }
67    //
68    //     if input.next_if_eq(&'}').is_some() {
69    //         len += 1;
70    //     } else {
71    //         return Err(ParserError::lexer_unclosed_escaped_unicode_char(unicode).into());
72    //     }
73    //
74    //     // Max of 6 digits.
75    //     // Minimum of 1 digit.
76    //     if unicode.len() > 6 || unicode.is_empty() {
77    //         return Err(ParserError::lexer_invalid_escaped_unicode_length(unicode).into());
78    //     }
79    //
80    //     if let Ok(hex) = u32::from_str_radix(&unicode, 16) {
81    //         if let Some(character) = std::char::from_u32(hex) {
82    //             Ok((len, Char::Scalar(character)))
83    //         } else if hex <= 0x10FFFF {
84    //             Ok((len, Char::NonScalar(hex)))
85    //         } else {
86    //             Err(ParserError::lexer_invalid_character_exceeded_max_value(unicode).into())
87    //         }
88    //     } else {
89    //         Err(ParserError::lexer_expected_valid_hex_char(unicode).into())
90    //     }
91    // }
92
93    // // Eats the parts of the hex character after \x.
94    // fn eat_hex_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> {
95    //     let mut hex = String::new();
96    //     // Account for the chars '\' and 'x'.
97    //     let mut len = 2;
98    //
99    //     // First hex character.
100    //     if let Some(c) = input.next_if(|c| c != &'\'') {
101    //         len += 1;
102    //         hex.push(c);
103    //     } else if let Some(c) = input.next() {
104    //         return Err(ParserError::lexer_expected_valid_hex_char(c).into());
105    //     } else {
106    //         return Err(ParserError::lexer_empty_input_tendril().into());
107    //     }
108    //
109    //     // Second hex character.
110    //     if let Some(c) = input.next_if(|c| c != &'\'') {
111    //         len += 1;
112    //         hex.push(c);
113    //     } else if let Some(c) = input.next() {
114    //         return Err(ParserError::lexer_expected_valid_hex_char(c).into());
115    //     } else {
116    //         return Err(ParserError::lexer_empty_input_tendril().into());
117    //     }
118    //
119    //     if let Ok(ascii_number) = u8::from_str_radix(&hex, 16) {
120    //         // According to RFC, we allow only values less than 128.
121    //         if ascii_number > 127 {
122    //             return Err(ParserError::lexer_expected_valid_hex_char(hex).into());
123    //         }
124    //
125    //         Ok((len, Char::Scalar(ascii_number as char)))
126    //     } else {
127    //         Err(ParserError::lexer_expected_valid_hex_char(hex).into())
128    //     }
129    // }
130
131    // fn eat_escaped_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> {
132    //     match input.next() {
133    //         None => Err(ParserError::lexer_empty_input_tendril().into()),
134    //         // Length of 2 to account the '\'.
135    //         Some('0') => Ok((2, Char::Scalar(0 as char))),
136    //         Some('t') => Ok((2, Char::Scalar(9 as char))),
137    //         Some('n') => Ok((2, Char::Scalar(10 as char))),
138    //         Some('r') => Ok((2, Char::Scalar(13 as char))),
139    //         Some('\"') => Ok((2, Char::Scalar(34 as char))),
140    //         Some('\'') => Ok((2, Char::Scalar(39 as char))),
141    //         Some('\\') => Ok((2, Char::Scalar(92 as char))),
142    //         Some('u') => Self::eat_unicode_char(input),
143    //         Some('x') => Self::eat_hex_char(input),
144    //         Some(c) => Err(ParserError::lexer_expected_valid_escaped_char(c).into()),
145    //     }
146    // }
147
148    // /// Returns a `char` if a character can be eaten, otherwise returns [`None`].
149    // fn eat_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> {
150    //     match input.next() {
151    //         None => Err(ParserError::lexer_empty_input_tendril().into()),
152    //         Some('\\') => Self::eat_escaped_char(input),
153    //         Some(c) => Ok((c.len_utf8(), Char::Scalar(c))),
154    //     }
155    // }
156
157    /// Returns a tuple: [(integer length, integer token)] if an integer can be eaten.
158    /// An integer can be eaten if its characters are at the front of the given `input` string.
159    /// If there is no input, this function returns an error.
160    /// If there is input but no integer, this function returns the tuple consisting of
161    /// length 0 and a dummy integer token that contains an empty string.
162    /// However, this function is always called when the next character is a digit.
163    /// This function eats a sequence representing a decimal numeral, a hex
164    /// numeral (beginning with `0x`), an octal numeral (beginning with '0o'),
165    /// or a binary numeral (beginning with `0b`), optionally including underscores,
166    /// which corresponds to a numeral in the ABNF grammar.
167    fn eat_integer(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Token)> {
168        if input.peek().is_none() {
169            return Err(ParserError::lexer_empty_input().into());
170        }
171
172        if !input.peek().unwrap().is_ascii_digit() {
173            return Ok((0, Token::Integer("".into())));
174        }
175
176        let mut int = String::new();
177
178        let first = input.next().unwrap();
179        int.push(first);
180        if first == '0' && (input.peek() == Some(&'x') || input.peek() == Some(&'o') || input.peek() == Some(&'b')) {
181            int.push(input.next().unwrap());
182        }
183
184        // Allow only uppercase hex digits, so as not to interfere with parsing the `field` suffix.
185        int.extend(input.take_while(|&c| c.is_ascii_digit() || c == '_' || c.is_ascii_uppercase()));
186
187        let (s, radix) = if let Some(s) = int.strip_prefix("0x") {
188            (s, 16)
189        } else if let Some(s) = int.strip_prefix("0o") {
190            (s, 8)
191        } else if let Some(s) = int.strip_prefix("0b") {
192            (s, 2)
193        } else {
194            (int.as_str(), 10)
195        };
196
197        if let Some(c) = s.chars().find(|&c| c != '_' && !c.is_digit(radix)) {
198            return Err(ParserError::wrong_digit_for_radix(c, radix, int).into());
199        }
200
201        Ok((int.len(), Token::Integer(int)))
202    }
203
204    /// Returns a tuple: [(token length, token)] if the next token can be eaten, otherwise returns an error.
205    /// The next token can be eaten if the characters at the front of the given `input` string can be scanned into a token.
206    pub(crate) fn eat(input: &str) -> Result<(usize, Token)> {
207        if input.is_empty() {
208            return Err(ParserError::lexer_empty_input().into());
209        }
210
211        let input_str = input;
212        let mut input = input.chars().peekable();
213
214        // Returns one token matching one character.
215        let match_one = |input: &mut Peekable<_>, token| {
216            input.next();
217            Ok((1, token))
218        };
219
220        // Returns one token matching one or two characters.
221        // If the `second` character matches, return the `second_token` that represents two characters.
222        // Otherwise, return the `first_token` that matches the one character.
223        let match_two = |input: &mut Peekable<_>, first_token, second_char, second_token| {
224            input.next();
225            Ok(if input.next_if_eq(&second_char).is_some() { (2, second_token) } else { (1, first_token) })
226        };
227
228        // Returns one token matching one or two characters.
229        // If the `second_char` character matches, return the `second_token` that represents two characters.
230        // If the `third_char` character matches, return the `third_token` that represents two characters.
231        // Otherwise, return the `first_token` that matches the one character.
232        let match_three = |input: &mut Peekable<_>, first_token, second_char, second_token, third_char, third_token| {
233            input.next();
234            Ok(if input.next_if_eq(&second_char).is_some() {
235                (2, second_token)
236            } else if input.next_if_eq(&third_char).is_some() {
237                (2, third_token)
238            } else {
239                (1, first_token)
240            })
241        };
242
243        // Returns one token matching one, two, or three characters.
244        // The `fourth_token` expects both the `third_char` and `fourth_char` to be present.
245        // See the example with the different combinations for Mul, MulAssign, Pow, PowAssign below.
246        let match_four = |
247            input: &mut Peekable<_>,
248            first_token, // e.e. Mul '*'
249            second_char, // e.g. '='
250            second_token, // e.g. MulAssign '*='
251            third_char, // e.g. '*'
252            third_token, // e.g. Pow '**'
253            fourth_char, // e.g. '='
254            fourth_token // e.g. PowAssign '**='
255        | {
256            input.next();
257            Ok(if input.next_if_eq(&second_char).is_some() {
258                // '*='
259                (2, second_token)
260            } else if input.next_if_eq(&third_char).is_some() {
261                if input.next_if_eq(&fourth_char).is_some() {
262                    // '**='
263                    return Ok((3, fourth_token))
264                }
265                // '**'
266                (2, third_token)
267            } else {
268                // '*'
269                (1, first_token)
270            })
271        };
272
273        match *input.peek().ok_or_else(ParserError::lexer_empty_input)? {
274            x if x.is_ascii_whitespace() => return match_one(&mut input, Token::WhiteSpace),
275            '"' => {
276                // Find end string quotation mark.
277                // Instead of checking each `char` and pushing, we can avoid reallocations.
278                // This works because the code 34 of double quote cannot appear as a byte
279                // in the middle of a multi-byte UTF-8 encoding of a character,
280                // because those bytes all have the high bit set to 1;
281                // in UTF-8, the byte 34 can only appear as the single-byte encoding of double quote.
282                let rest = &input_str[1..];
283                let string = match rest.as_bytes().iter().position(|c| *c == b'"') {
284                    None => return Err(ParserError::lexer_string_not_closed(rest).into()),
285                    Some(idx) => rest[..idx].to_owned(),
286                };
287
288                ensure_no_bidi_override(&string)?;
289
290                // + 2 to account for parsing quotation marks.
291                return Ok((string.len() + 2, Token::StaticString(string)));
292            }
293
294            x if x.is_ascii_digit() => return Self::eat_integer(&mut input),
295            '!' => return match_two(&mut input, Token::Not, '=', Token::NotEq),
296            '?' => return match_one(&mut input, Token::Question),
297            '&' => {
298                return match_four(
299                    &mut input,
300                    Token::BitAnd,
301                    '=',
302                    Token::BitAndAssign,
303                    '&',
304                    Token::And,
305                    '=',
306                    Token::AndAssign,
307                );
308            }
309            '(' => return match_one(&mut input, Token::LeftParen),
310            ')' => return match_one(&mut input, Token::RightParen),
311            '_' => return match_one(&mut input, Token::Underscore),
312            '*' => {
313                return match_four(
314                    &mut input,
315                    Token::Mul,
316                    '=',
317                    Token::MulAssign,
318                    '*',
319                    Token::Pow,
320                    '=',
321                    Token::PowAssign,
322                );
323            }
324            '+' => return match_two(&mut input, Token::Add, '=', Token::AddAssign),
325            ',' => return match_one(&mut input, Token::Comma),
326            '-' => return match_three(&mut input, Token::Sub, '=', Token::SubAssign, '>', Token::Arrow),
327            '.' => return match_two(&mut input, Token::Dot, '.', Token::DotDot),
328            '/' => {
329                input.next();
330                if input.next_if_eq(&'/').is_some() {
331                    // Find the end of the comment line.
332                    // This works because the code 10 of line feed cannot appear as a byte
333                    // in the middle of a multi-byte UTF-8 encoding of a character,
334                    // because those bytes all have the high bit set to 1;
335                    // in UTF-8, the byte 10 can only appear as the single-byte encoding of line feed.
336                    let comment = match input_str.as_bytes().iter().position(|c| *c == b'\n') {
337                        None => input_str,
338                        Some(idx) => &input_str[..idx + 1],
339                    };
340
341                    ensure_no_bidi_override(comment)?;
342                    return Ok((comment.len(), Token::CommentLine(comment.to_owned())));
343                } else if input.next_if_eq(&'*').is_some() {
344                    let mut comment = String::from("/*");
345
346                    if input.peek().is_none() {
347                        return Err(ParserError::lexer_empty_block_comment().into());
348                    }
349
350                    let mut ended = false;
351                    while let Some(c) = input.next() {
352                        comment.push(c);
353                        if c == '*' && input.next_if_eq(&'/').is_some() {
354                            comment.push('/');
355                            ended = true;
356                            break;
357                        }
358                    }
359
360                    ensure_no_bidi_override(&comment)?;
361
362                    if !ended {
363                        return Err(ParserError::lexer_block_comment_does_not_close_before_eof(comment).into());
364                    }
365                    return Ok((comment.len(), Token::CommentBlock(comment)));
366                } else if input.next_if_eq(&'=').is_some() {
367                    // '/='
368                    return Ok((2, Token::DivAssign));
369                }
370                // '/'
371                return Ok((1, Token::Div));
372            }
373            '%' => return match_two(&mut input, Token::Rem, '=', Token::RemAssign),
374            ':' => return match_two(&mut input, Token::Colon, ':', Token::DoubleColon),
375            ';' => return match_one(&mut input, Token::Semicolon),
376            '<' => return match_four(&mut input, Token::Lt, '=', Token::LtEq, '<', Token::Shl, '=', Token::ShlAssign),
377            '>' => return match_four(&mut input, Token::Gt, '=', Token::GtEq, '>', Token::Shr, '=', Token::ShrAssign),
378            '=' => return match_three(&mut input, Token::Assign, '=', Token::Eq, '>', Token::BigArrow),
379            '[' => return match_one(&mut input, Token::LeftSquare),
380            ']' => return match_one(&mut input, Token::RightSquare),
381            '{' => return match_one(&mut input, Token::LeftCurly),
382            '}' => return match_one(&mut input, Token::RightCurly),
383            '|' => {
384                return match_four(
385                    &mut input,
386                    Token::BitOr,
387                    '=',
388                    Token::BitOrAssign,
389                    '|',
390                    Token::Or,
391                    '=',
392                    Token::OrAssign,
393                );
394            }
395            '^' => return match_two(&mut input, Token::BitXor, '=', Token::BitXorAssign),
396            '@' => return Ok((1, Token::At)),
397            _ => (),
398        }
399        if let Some(identifier) = eat_identifier(&mut input) {
400            return Ok((
401                identifier.len(),
402                // todo: match on symbols instead of hard-coded &str's
403                match &*identifier {
404                    x if x.starts_with("aleo1") => Token::AddressLit(identifier),
405                    "address" => Token::Address,
406                    "aleo" => Token::Aleo,
407                    "as" => Token::As,
408                    "assert" => Token::Assert,
409                    "assert_eq" => Token::AssertEq,
410                    "assert_neq" => Token::AssertNeq,
411                    "async" => Token::Async,
412                    "block" => Token::Block,
413                    "bool" => Token::Bool,
414                    "const" => Token::Const,
415                    "constant" => Token::Constant,
416                    "constructor" => Token::Constructor,
417                    "else" => Token::Else,
418                    "false" => Token::False,
419                    "field" => Token::Field,
420                    "Fn" => Token::Fn,
421                    "for" => Token::For,
422                    "function" => Token::Function,
423                    "Future" => Token::Future,
424                    "group" => Token::Group,
425                    "i8" => Token::I8,
426                    "i16" => Token::I16,
427                    "i32" => Token::I32,
428                    "i64" => Token::I64,
429                    "i128" => Token::I128,
430                    "if" => Token::If,
431                    "import" => Token::Import,
432                    "in" => Token::In,
433                    "inline" => Token::Inline,
434                    "let" => Token::Let,
435                    "leo" => Token::Leo,
436                    "mapping" => Token::Mapping,
437                    "private" => Token::Private,
438                    "program" => Token::Program,
439                    "public" => Token::Public,
440                    "record" => Token::Record,
441                    "return" => Token::Return,
442                    "scalar" => Token::Scalar,
443                    "script" => Token::Script,
444                    "self" => Token::SelfLower,
445                    "signature" => Token::Signature,
446                    "string" => Token::String,
447                    "struct" => Token::Struct,
448                    "transition" => Token::Transition,
449                    "true" => Token::True,
450                    "u8" => Token::U8,
451                    "u16" => Token::U16,
452                    "u32" => Token::U32,
453                    "u64" => Token::U64,
454                    "u128" => Token::U128,
455                    _ => Token::Identifier(Symbol::intern(&identifier)),
456                },
457            ));
458        }
459
460        Err(ParserError::could_not_lex(input.take_while(|c| *c != ';' && !c.is_whitespace()).collect::<String>())
461            .into())
462    }
463}
464
465#[derive(Clone, Serialize, Deserialize)]
466pub struct SpannedToken {
467    pub token: Token,
468    pub span: Span,
469}
470
471impl SpannedToken {
472    /// Returns a dummy token at a dummy span.
473    pub const fn dummy() -> Self {
474        Self { token: Token::Question, span: Span::dummy() }
475    }
476}
477
478impl fmt::Display for SpannedToken {
479    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
480        write!(f, "'{}' @ ", self.token.to_string().trim())?;
481        self.span.fmt(f)
482    }
483}
484
485impl fmt::Debug for SpannedToken {
486    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
487        <SpannedToken as fmt::Display>::fmt(self, f)
488    }
489}