// leo_parser_lossless/tokens.rs

// Copyright (C) 2019-2025 Provable Inc.
// This file is part of the Leo library.

// The Leo library is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// The Leo library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with the Leo library. If not, see <https://www.gnu.org/licenses/>.
17use leo_errors::{Handler, ParserError};
18use logos::Logos;
19use std::sync::LazyLock;
20
/// Distinguishes the shapes of name-like tokens produced by the lexer.
///
/// These are the payload of the `Token::IdVariants` variant; the
/// `id_variant` callback below decides which one applies.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum IdVariants {
    // A plain identifier like `abc`.
    Identifier,
    // An identifier starting with an underscore, like `_abc`.
    Intrinsic,
    // A `::`-separated path like `abc::def::ghi`.
    Path,
    // A program id like `abc.aleo`.
    ProgramId,
    // A locator like `abc.aleo/def`.
    Locator,
}
29
30fn id_variant(lex: &mut logos::Lexer<Token>) -> IdVariants {
31    // Use LazyLock to not recompile these regexes every time.
32    static REGEX_LOCATOR: LazyLock<regex::Regex> =
33        LazyLock::new(|| regex::Regex::new(r"^\.aleo/[a-zA-Z][a-zA-Z0-9_]*").unwrap());
34    static REGEX_PROGRAM_ID: LazyLock<regex::Regex> = LazyLock::new(|| regex::Regex::new(r"^\.aleo\b").unwrap());
35    static REGEX_PATH: LazyLock<regex::Regex> =
36        LazyLock::new(|| regex::Regex::new(r"^(?:::[a-zA-Z][a-zA-Z0-9_]*)+").unwrap());
37
38    if let Some(found) = REGEX_LOCATOR.find(lex.remainder()) {
39        lex.bump(found.len());
40        IdVariants::Locator
41    } else if let Some(found) = REGEX_PROGRAM_ID.find(lex.remainder()) {
42        lex.bump(found.len());
43        IdVariants::ProgramId
44    } else if let Some(found) = REGEX_PATH.find(lex.remainder()) {
45        lex.bump(found.len());
46        IdVariants::Path
47    } else if lex.remainder().starts_with("_") {
48        IdVariants::Intrinsic
49    } else {
50        IdVariants::Identifier
51    }
52}
53
54fn comment_block(lex: &mut logos::Lexer<Token>) -> bool {
55    let mut last_asterisk = false;
56    for (index, c) in lex.remainder().char_indices() {
57        if c == '*' {
58            last_asterisk = true;
59        } else if c == '/' && last_asterisk {
60            lex.bump(index + 1);
61            return true;
62        } else if matches!(c,
63            '\u{202A}'..='\u{202E}' |
64            '\u{2066}'..='\u{2069}'
65        ) {
66            // It's a bidi character - end the comment token
67            // so we can report that error.
68            lex.bump(index);
69            return true;
70        } else {
71            last_asterisk = false;
72        }
73    }
74    false
75}
76
/// The raw token kinds produced by the logos lexer.
///
/// Trivia (whitespace, linebreaks, comments) is lexed as ordinary tokens,
/// keeping the token stream lossless.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Logos)]
pub enum Token {
    #[regex(r"[ \t\f]+")]
    Whitespace,

    #[regex(r"\r?\n")]
    Linebreak,

    // Comments don't include line breaks or bidi characters.
    #[regex(r"//[^\r\n\u{202A}-\u{202E}\u{2066}-\u{2069}]*")]
    CommentLine,

    // Can't match block comments in a regex without lazy quantifiers,
    // so use a callback.
    #[token(r"/*", comment_block)]
    CommentBlock,

    // We want to lex these four categories as separate token types:
    // 1. identifiers like `abc`
    // 2. paths like `abc::def::ghi`
    // 3. program ids like `abc.aleo`
    // 4. locators like `abc.aleo/def`
    // We can't do this directly with logos regexes due to the lack of backtracking.
    // So we do it with this callback.
    //
    // As an alternative design, we could simply treat the individual components of these as separate tokens,
    // so that `abc.aleo/def` would be tokenized as `[abc, ., aleo, /, def]`. This is challenging to handle
    // with an LR(1) parser - we potentially get shift-reduce conflicts and other ambiguities between
    // member accesses, program ids, tuple accesses, etc. We could make it work but let's just cut to the
    // chase here.

    // Catch identifiers starting with underscore
    #[regex(r"_[a-zA-Z][a-zA-Z0-9_]*", |_| IdVariants::Intrinsic)]
    #[regex(r"[a-zA-Z][a-zA-Z0-9_]*", id_variant)]
    // We need to special case `group::abc` and `signature::abc` as otherwise these are keywords.
    #[regex(r"group::[a-zA-Z][a-zA-Z0-9_]*", |_| IdVariants::Path)]
    #[regex(r"signature::[a-zA-Z][a-zA-Z0-9_]*", |_| IdVariants::Path)]
    #[regex(r"Future::[a-zA-Z][a-zA-Z0-9_]*", |_| IdVariants::Path)]
    IdVariants(IdVariants),

    // Address literals should have exactly 58 characters, but we lex other lengths
    // and flag an error later.
    #[regex(r"aleo1[a-z0-9]*")]
    AddressLiteral,

    // As with the previous parser, avoid lowercase letters to avoid ambiguity with the `field` postfix.
    // Allow invalid digits for each radix so we can report an error about them later
    // (see the Integer handling in `Lexer::next`).
    #[regex(r"0x[0-9A-Z_]+")]
    #[regex(r"0o[0-9A-Z_]+")]
    #[regex(r"0b[0-9A-Z_]+")]
    #[regex(r"[0-9][0-9A-Z_]*")]
    Integer,

    #[regex(r#""[^"]*""#)]
    StaticString,

    // Symbols
    #[token("=")]
    Assign,
    #[token("!")]
    Not,
    #[token("&&")]
    And,
    #[token("&&=")]
    AndAssign,
    #[token("||")]
    Or,
    #[token("||=")]
    OrAssign,
    #[token("&")]
    BitAnd,
    #[token("&=")]
    BitAndAssign,
    #[token("|")]
    BitOr,
    #[token("|=")]
    BitOrAssign,
    #[token("^")]
    BitXor,
    #[token("^=")]
    BitXorAssign,
    #[token("==")]
    Eq,
    #[token("!=")]
    NotEq,
    #[token("<")]
    Lt,
    #[token("<=")]
    LtEq,
    #[token(">")]
    Gt,
    #[token(">=")]
    GtEq,
    #[token("+")]
    Add,
    #[token("+=")]
    AddAssign,
    #[token("-")]
    Sub,
    #[token("-=")]
    SubAssign,
    #[token("*")]
    Mul,
    #[token("*=")]
    MulAssign,
    #[token("/")]
    Div,
    #[token("/=")]
    DivAssign,
    #[token("**")]
    Pow,
    #[token("**=")]
    PowAssign,
    #[token("%")]
    Rem,
    #[token("%=")]
    RemAssign,
    #[token("<<")]
    Shl,
    #[token("<<=")]
    ShlAssign,
    #[token(">>")]
    Shr,
    #[token(">>=")]
    ShrAssign,
    #[token("(")]
    LeftParen,
    #[token(")")]
    RightParen,
    #[token("[")]
    LeftSquare,
    #[token("]")]
    RightSquare,
    #[token("{")]
    LeftCurly,
    #[token("}")]
    RightCurly,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token("..")]
    DotDot,
    #[token(";")]
    Semicolon,
    #[token(":")]
    Colon,
    #[token("::")]
    DoubleColon,
    #[token("?")]
    Question,
    #[token("->")]
    Arrow,
    #[token("=>")]
    BigArrow,
    #[token("_")]
    Underscore,
    #[token("@")]
    At,

    // Keywords
    #[token("true")]
    True,
    #[token("false")]
    False,
    #[token("none")]
    None,
    #[token("address")]
    Address,
    #[token("bool")]
    Bool,
    #[token("field")]
    Field,
    #[token("group")]
    Group,
    #[token("i8")]
    I8,
    #[token("i16")]
    I16,
    #[token("i32")]
    I32,
    #[token("i64")]
    I64,
    #[token("i128")]
    I128,
    #[token("record")]
    Record,
    #[token("scalar")]
    Scalar,
    #[token("signature")]
    Signature,
    #[token("string")]
    String,
    #[token("struct")]
    Struct,
    #[token("u8")]
    U8,
    #[token("u16")]
    U16,
    #[token("u32")]
    U32,
    #[token("u64")]
    U64,
    #[token("u128")]
    U128,

    // Other keywords and reserved names.
    #[token("aleo")]
    Aleo,
    #[token("as")]
    As,
    #[token("assert")]
    Assert,
    #[token("assert_eq")]
    AssertEq,
    #[token("assert_neq")]
    AssertNeq,
    #[token("async")]
    Async,
    #[token("block")]
    Block,
    #[token("const")]
    Const,
    #[token("constant")]
    Constant,
    #[token("constructor")]
    Constructor,
    #[token("else")]
    Else,
    #[token("Fn")]
    Fn,
    #[token("for")]
    For,
    #[token("function")]
    Function,
    #[token("Future")]
    Future,
    #[token("if")]
    If,
    #[token("import")]
    Import,
    #[token("in")]
    In,
    #[token("inline")]
    Inline,
    #[token("let")]
    Let,
    #[token("mapping")]
    Mapping,
    #[token("storage")]
    Storage,
    #[token("network")]
    Network,
    #[token("private")]
    Private,
    #[token("program")]
    Program,
    #[token("public")]
    Public,
    #[token("return")]
    Return,
    #[token("script")]
    Script,
    #[token("self")]
    SelfLower,
    #[token("transition")]
    Transition,

    // Unicode bidirectional control characters are a potential risk in
    // source. We detect them so we can report them as an error.
    #[regex(r"[\u{202A}-\u{202E}\u{2066}-\u{2069}]")]
    Bidi,

    // This token is never produced; we use it in grammar.lalrpop
    // to ensure a given production doesn't happen.
    Never,
}
353
354impl Token {
355    /// A `str` describing the token suitable for use in error messages.
356    ///
357    /// * `token_s` - The str as reported by logos.
358    pub fn str_user(token_s: &str) -> Option<&'static str> {
359        let v = match token_s {
360            // These variants we don't want to report to the user.
361            // Whitespace,
362            // Linebreak,
363            // CommentLine,
364            // CommentBlock,
365            "Identifier" => "an identifier",
366            "AddressLiteral" => "an address literal",
367            "ProgramId" => "a program id",
368
369            "Integer" => "an integer literal",
370
371            "StaticString" => "a static string",
372
373            // Symbols
374            "Assign" => "'='",
375            "Not" => "'!'",
376            "And" => "'&&'",
377            "AndAssign" => "'&&='",
378            "Or" => "'||'",
379            "OrAssign" => "'||='",
380            "BitAnd" => "'&'",
381            "BitAndAssign" => "'&='",
382            "BitOr" => "'|'",
383            "BitOrAssign" => "'|='",
384            "BitXor" => "'^'",
385            "BitXorAssign" => "'&='",
386            "Eq" => "'=='",
387            "NotEq" => "'!='",
388            "Lt" => "'<'",
389            "LtEq" => "'<='",
390            "Gt" => "'>'",
391            "GtEq" => "'>='",
392            "Add" => "'+'",
393            "AddAssign" => "'+='",
394            "Sub" => "'-'",
395            "SubAssign" => "'-='",
396            "Mul" => "'*'",
397            "MulAssign" => "'*='",
398            "Div" => "'/'",
399            "DivAssign" => "'/='",
400            "Pow" => "'**'",
401            "PowAssign" => "'**='",
402            "Rem" => "'%'",
403            "RemAssign" => "'%='",
404            "Shl" => "'<<'",
405            "ShlAssign" => "'<<='",
406            "Shr" => "'>>'",
407            "ShrAssign" => "'>>='",
408            "LeftParen" => "'('",
409            "RightParen" => "')'",
410            "LeftSquare" => "'['",
411            "RightSquare" => "']'",
412            "LeftCurly" => "'{'",
413            "RightCurly" => "'}'",
414            "Comma" => "','",
415            "Dot" => "'.'",
416            "DotDot" => "'..'",
417            "Semicolon" => "';'",
418            "Colon" => "':'",
419            "DoubleColon" => "'::'",
420            "Question" => "'?'",
421            "Arrow" => "'->'",
422            "BigArrow" => "'=>'",
423            "Underscore" => "'_'",
424            "At" => "'@'",
425
426            // Keywords
427            "True" => "'true'",
428            "False" => "'false'",
429            "Address" => "'address",
430            "Bool" => "'bool'",
431            "Field" => "'field'",
432            "Group" => "'group'",
433            "I8" => "'i8'",
434            "I16" => "'i16'",
435            "I32" => "'i32'",
436            "I64" => "'i64'",
437            "I128" => "'i128'",
438            "Record" => "'record'",
439            "Scalar" => "'scalar'",
440            "Signature" => "'signature'",
441            "String" => "a string",
442            "Struct" => "'struct'",
443            "U8" => "'u8'",
444            "U16" => "'u16'",
445            "U32" => "'u32'",
446            "U64" => "'u64'",
447            "U128" => "'u128'",
448
449            "Aleo" => "'aleo'",
450            "As" => "'as'",
451            "Assert" => "'assert'",
452            "AssertEq" => "'assert_eq'",
453            "AssertNeq" => "'assert_neq'",
454            "Async" => "'async'",
455            "Block" => "'block'",
456            "Const" => "'const'",
457            "Constant" => "'constant'",
458            "Constructor" => "'constructor'",
459            "Else" => "'else'",
460            "Fn" => "'Fn'",
461            "For" => "'for'",
462            "Function" => "'function'",
463            "Future" => "'future'",
464            "If" => "'if'",
465            "Import" => "'import'",
466            "In" => "'in'",
467            "Inline" => "'inline'",
468            "Let" => "'let'",
469            "Mapping" => "'mapping'",
470            "Storage" => "'storage'",
471            "Network" => "'network'",
472            "Private" => "'private'",
473            "Program" => "'program'",
474            "Public" => "'public'",
475            "Return" => "'return'",
476            "Script" => "'script'",
477            "SelfLower" => "'self'",
478            "Transition" => "'transition'",
479
480            "Never" => return None,
481
482            _ => return None,
483        };
484        Some(v)
485    }
486}
487
/// The token type we present to LALRPOP.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct LalrToken<'a> {
    // The kind of token, as produced by logos.
    pub token: Token,
    // The exact source text the token was lexed from.
    pub text: &'a str,
    // The token's absolute span in the source.
    pub span: leo_span::Span,
}
495
/// The lexer we present to LALRPOP.
pub struct Lexer<'a> {
    // The underlying logos lexer producing `Token`s.
    logos_lexer: logos::Lexer<'a, Token>,
    // Absolute offset of the start of the text being lexed; added to
    // logos's text-relative byte offsets to build spans.
    start_pos: u32,
    // Sink for lexing diagnostics.
    handler: Handler,
}
502
503impl<'a> Lexer<'a> {
504    pub fn new(text: &'a str, start_pos: u32, handler: Handler) -> Self {
505        Self { logos_lexer: Token::lexer(text), start_pos, handler }
506    }
507}
508
509impl<'a> Iterator for Lexer<'a> {
510    type Item = (usize, LalrToken<'a>, usize);
511
512    fn next(&mut self) -> Option<Self::Item> {
513        let next = self.logos_lexer.next()?;
514        let logos_span = self.logos_lexer.span();
515        let span =
516            leo_span::Span { lo: self.start_pos + logos_span.start as u32, hi: self.start_pos + logos_span.end as u32 };
517
518        let text = self.logos_lexer.slice();
519
520        let Ok(token) = next else {
521            self.handler.emit_err(ParserError::could_not_lex_span(text.trim(), span));
522            return None;
523        };
524
525        if matches!(token, Token::Bidi) {
526            self.handler.emit_err(ParserError::lexer_bidi_override_span(span));
527            return None;
528        } else if matches!(token, Token::Integer) {
529            let (s, radix) = if let Some(s) = text.strip_prefix("0x") {
530                (s, 16)
531            } else if let Some(s) = text.strip_prefix("0o") {
532                (s, 8)
533            } else if let Some(s) = text.strip_prefix("0b") {
534                (s, 2)
535            } else {
536                (text, 10)
537            };
538
539            if let Some(c) = s.chars().find(|&c| c != '_' && !c.is_digit(radix)) {
540                self.handler.emit_err(ParserError::wrong_digit_for_radix_span(c, radix, text, span));
541            }
542        }
543
544        let lalr_token = LalrToken { token, text, span };
545
546        Some((span.lo as usize, lalr_token, span.hi as usize))
547    }
548}