// leo_parser/tokenizer/lexer.rs
1// Copyright (C) 2019-2025 Provable Inc.
2// This file is part of the Leo library.
3
4// The Leo library is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8
9// The Leo library is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13
14// You should have received a copy of the GNU General Public License
15// along with the Leo library. If not, see <https://www.gnu.org/licenses/>.
16
17use crate::tokenizer::Token;
18use leo_errors::{ParserError, Result};
19use leo_span::{Span, Symbol};
20
21use serde::{Deserialize, Serialize};
22use std::{
23 fmt,
24 iter::{Peekable, from_fn},
25};
26
/// Eat an identifier, that is, a string matching '[a-zA-Z][a-zA-Z\d_]*', if any.
fn eat_identifier(input: &mut Peekable<impl Iterator<Item = char>>) -> Option<String> {
    // An identifier must begin with an ASCII letter; otherwise consume nothing.
    if !input.peek().is_some_and(|c| c.is_ascii_alphabetic()) {
        return None;
    }
    // Greedily consume the run of letters, digits, and underscores.
    let mut ident = String::new();
    while let Some(c) = input.next_if(|c| c.is_ascii_alphanumeric() || *c == '_') {
        ident.push(c);
    }
    Some(ident)
}
32
/// Checks if a char is a Unicode Bidirectional Override code point.
///
/// These are U+202A..=U+202E (LRE/RLE/PDF/LRO/RLO) and
/// U+2066..=U+2069 (LRI/RLI/FSI/PDI).
fn is_bidi_override(c: char) -> bool {
    matches!(c, '\u{202A}'..='\u{202E}' | '\u{2066}'..='\u{2069}')
}
38
39/// Ensure that `string` contains no Unicode Bidirectional Override code points.
40fn ensure_no_bidi_override(string: &str) -> Result<()> {
41 if string.chars().any(is_bidi_override) {
42 return Err(ParserError::lexer_bidi_override().into());
43 }
44 Ok(())
45}
46
47impl Token {
48 // todo: remove this unused code or reference https://github.com/Geal/nom/blob/main/examples/string.rs
49 // // Eats the parts of the unicode character after \u.
50 // fn eat_unicode_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> {
51 // let mut unicode = String::new();
52 // // Account for the chars '\' and 'u'.
53 // let mut len = 2;
54 //
55 // if input.next_if_eq(&'{').is_some() {
56 // len += 1;
57 // } else if let Some(c) = input.next() {
58 // return Err(ParserError::lexer_unopened_escaped_unicode_char(c).into());
59 // } else {
60 // return Err(ParserError::lexer_empty_input_tendril().into());
61 // }
62 //
63 // while let Some(c) = input.next_if(|c| c != &'}') {
64 // len += 1;
65 // unicode.push(c);
66 // }
67 //
68 // if input.next_if_eq(&'}').is_some() {
69 // len += 1;
70 // } else {
71 // return Err(ParserError::lexer_unclosed_escaped_unicode_char(unicode).into());
72 // }
73 //
74 // // Max of 6 digits.
75 // // Minimum of 1 digit.
76 // if unicode.len() > 6 || unicode.is_empty() {
77 // return Err(ParserError::lexer_invalid_escaped_unicode_length(unicode).into());
78 // }
79 //
80 // if let Ok(hex) = u32::from_str_radix(&unicode, 16) {
81 // if let Some(character) = std::char::from_u32(hex) {
82 // Ok((len, Char::Scalar(character)))
83 // } else if hex <= 0x10FFFF {
84 // Ok((len, Char::NonScalar(hex)))
85 // } else {
86 // Err(ParserError::lexer_invalid_character_exceeded_max_value(unicode).into())
87 // }
88 // } else {
89 // Err(ParserError::lexer_expected_valid_hex_char(unicode).into())
90 // }
91 // }
92
93 // // Eats the parts of the hex character after \x.
94 // fn eat_hex_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> {
95 // let mut hex = String::new();
96 // // Account for the chars '\' and 'x'.
97 // let mut len = 2;
98 //
99 // // First hex character.
100 // if let Some(c) = input.next_if(|c| c != &'\'') {
101 // len += 1;
102 // hex.push(c);
103 // } else if let Some(c) = input.next() {
104 // return Err(ParserError::lexer_expected_valid_hex_char(c).into());
105 // } else {
106 // return Err(ParserError::lexer_empty_input_tendril().into());
107 // }
108 //
109 // // Second hex character.
110 // if let Some(c) = input.next_if(|c| c != &'\'') {
111 // len += 1;
112 // hex.push(c);
113 // } else if let Some(c) = input.next() {
114 // return Err(ParserError::lexer_expected_valid_hex_char(c).into());
115 // } else {
116 // return Err(ParserError::lexer_empty_input_tendril().into());
117 // }
118 //
119 // if let Ok(ascii_number) = u8::from_str_radix(&hex, 16) {
120 // // According to RFC, we allow only values less than 128.
121 // if ascii_number > 127 {
122 // return Err(ParserError::lexer_expected_valid_hex_char(hex).into());
123 // }
124 //
125 // Ok((len, Char::Scalar(ascii_number as char)))
126 // } else {
127 // Err(ParserError::lexer_expected_valid_hex_char(hex).into())
128 // }
129 // }
130
131 // fn eat_escaped_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> {
132 // match input.next() {
133 // None => Err(ParserError::lexer_empty_input_tendril().into()),
134 // // Length of 2 to account the '\'.
135 // Some('0') => Ok((2, Char::Scalar(0 as char))),
136 // Some('t') => Ok((2, Char::Scalar(9 as char))),
137 // Some('n') => Ok((2, Char::Scalar(10 as char))),
138 // Some('r') => Ok((2, Char::Scalar(13 as char))),
139 // Some('\"') => Ok((2, Char::Scalar(34 as char))),
140 // Some('\'') => Ok((2, Char::Scalar(39 as char))),
141 // Some('\\') => Ok((2, Char::Scalar(92 as char))),
142 // Some('u') => Self::eat_unicode_char(input),
143 // Some('x') => Self::eat_hex_char(input),
144 // Some(c) => Err(ParserError::lexer_expected_valid_escaped_char(c).into()),
145 // }
146 // }
147
148 // /// Returns a `char` if a character can be eaten, otherwise returns [`None`].
149 // fn eat_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> {
150 // match input.next() {
151 // None => Err(ParserError::lexer_empty_input_tendril().into()),
152 // Some('\\') => Self::eat_escaped_char(input),
153 // Some(c) => Ok((c.len_utf8(), Char::Scalar(c))),
154 // }
155 // }
156
157 /// Returns a tuple: [(integer length, integer token)] if an integer can be eaten.
158 /// An integer can be eaten if its characters are at the front of the given `input` string.
159 /// If there is no input, this function returns an error.
160 /// If there is input but no integer, this function returns the tuple consisting of
161 /// length 0 and a dummy integer token that contains an empty string.
162 /// However, this function is always called when the next character is a digit.
163 /// This function eats a sequence representing a decimal numeral, a hex
164 /// numeral (beginning with `0x`), an octal numeral (beginning with '0o'),
165 /// or a binary numeral (beginning with `0b`), optionally including underscores,
166 /// which corresponds to a numeral in the ABNF grammar.
167 fn eat_integer(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Token)> {
168 if input.peek().is_none() {
169 return Err(ParserError::lexer_empty_input().into());
170 }
171
172 if !input.peek().unwrap().is_ascii_digit() {
173 return Ok((0, Token::Integer("".into())));
174 }
175
176 let mut int = String::new();
177
178 let first = input.next().unwrap();
179 int.push(first);
180 if first == '0' && (input.peek() == Some(&'x') || input.peek() == Some(&'o') || input.peek() == Some(&'b')) {
181 int.push(input.next().unwrap());
182 }
183
184 // Allow only uppercase hex digits, so as not to interfere with parsing the `field` suffix.
185 int.extend(input.take_while(|&c| c.is_ascii_digit() || c == '_' || c.is_ascii_uppercase()));
186
187 let (s, radix) = if let Some(s) = int.strip_prefix("0x") {
188 (s, 16)
189 } else if let Some(s) = int.strip_prefix("0o") {
190 (s, 8)
191 } else if let Some(s) = int.strip_prefix("0b") {
192 (s, 2)
193 } else {
194 (int.as_str(), 10)
195 };
196
197 if let Some(c) = s.chars().find(|&c| c != '_' && !c.is_digit(radix)) {
198 return Err(ParserError::wrong_digit_for_radix(c, radix, int).into());
199 }
200
201 Ok((int.len(), Token::Integer(int)))
202 }
203
    /// Returns a tuple: [(token length, token)] if the next token can be eaten, otherwise returns an error.
    /// The next token can be eaten if the characters at the front of the given `input` string can be scanned into a token.
    ///
    /// This function never consumes from the caller's string; the returned length tells
    /// the caller how many bytes of `input` the token occupies.
    pub(crate) fn eat(input: &str) -> Result<(usize, Token)> {
        if input.is_empty() {
            return Err(ParserError::lexer_empty_input().into());
        }

        // Keep the original `&str` around: the string-literal and line-comment branches
        // below scan it by byte position instead of collecting chars.
        let input_str = input;
        let mut input = input.chars().peekable();

        // Returns one token matching one character.
        let match_one = |input: &mut Peekable<_>, token| {
            input.next();
            Ok((1, token))
        };

        // Returns one token matching one or two characters.
        // If the `second` character matches, return the `second_token` that represents two characters.
        // Otherwise, return the `first_token` that matches the one character.
        let match_two = |input: &mut Peekable<_>, first_token, second_char, second_token| {
            input.next();
            Ok(if input.next_if_eq(&second_char).is_some() { (2, second_token) } else { (1, first_token) })
        };

        // Returns one token matching one or two characters.
        // If the `second_char` character matches, return the `second_token` that represents two characters.
        // If the `third_char` character matches, return the `third_token` that represents two characters.
        // Otherwise, return the `first_token` that matches the one character.
        let match_three = |input: &mut Peekable<_>, first_token, second_char, second_token, third_char, third_token| {
            input.next();
            Ok(if input.next_if_eq(&second_char).is_some() {
                (2, second_token)
            } else if input.next_if_eq(&third_char).is_some() {
                (2, third_token)
            } else {
                (1, first_token)
            })
        };

        // Returns one token matching one, two, or three characters.
        // The `fourth_token` expects both the `third_char` and `fourth_char` to be present.
        // See the example with the different combinations for Mul, MulAssign, Pow, PowAssign below.
        let match_four = |
            input: &mut Peekable<_>,
            first_token, // e.g. Mul '*'
            second_char, // e.g. '='
            second_token, // e.g. MulAssign '*='
            third_char, // e.g. '*'
            third_token, // e.g. Pow '**'
            fourth_char, // e.g. '='
            fourth_token // e.g. PowAssign '**='
        | {
            input.next();
            Ok(if input.next_if_eq(&second_char).is_some() {
                // '*='
                (2, second_token)
            } else if input.next_if_eq(&third_char).is_some() {
                if input.next_if_eq(&fourth_char).is_some() {
                    // '**='
                    return Ok((3, fourth_token))
                }
                // '**'
                (2, third_token)
            } else {
                // '*'
                (1, first_token)
            })
        };

        match *input.peek().ok_or_else(ParserError::lexer_empty_input)? {
            x if x.is_ascii_whitespace() => return match_one(&mut input, Token::WhiteSpace),
            '"' => {
                // Find end string quotation mark.
                // Instead of checking each `char` and pushing, we can avoid reallocations.
                // This works because the code 34 of double quote cannot appear as a byte
                // in the middle of a multi-byte UTF-8 encoding of a character,
                // because those bytes all have the high bit set to 1;
                // in UTF-8, the byte 34 can only appear as the single-byte encoding of double quote.
                let rest = &input_str[1..];
                let string = match rest.as_bytes().iter().position(|c| *c == b'"') {
                    None => return Err(ParserError::lexer_string_not_closed(rest).into()),
                    Some(idx) => rest[..idx].to_owned(),
                };

                // Reject Unicode bidi override code points, which can visually reorder source text.
                ensure_no_bidi_override(&string)?;

                // + 2 to account for parsing quotation marks.
                return Ok((string.len() + 2, Token::StaticString(string)));
            }

            x if x.is_ascii_digit() => return Self::eat_integer(&mut input),
            '!' => return match_two(&mut input, Token::Not, '=', Token::NotEq),
            '?' => return match_one(&mut input, Token::Question),
            '&' => {
                return match_four(
                    &mut input,
                    Token::BitAnd,
                    '=',
                    Token::BitAndAssign,
                    '&',
                    Token::And,
                    '=',
                    Token::AndAssign,
                );
            }
            '(' => return match_one(&mut input, Token::LeftParen),
            ')' => return match_one(&mut input, Token::RightParen),
            '_' => return match_one(&mut input, Token::Underscore),
            '*' => {
                return match_four(
                    &mut input,
                    Token::Mul,
                    '=',
                    Token::MulAssign,
                    '*',
                    Token::Pow,
                    '=',
                    Token::PowAssign,
                );
            }
            '+' => return match_two(&mut input, Token::Add, '=', Token::AddAssign),
            ',' => return match_one(&mut input, Token::Comma),
            '-' => return match_three(&mut input, Token::Sub, '=', Token::SubAssign, '>', Token::Arrow),
            '.' => return match_two(&mut input, Token::Dot, '.', Token::DotDot),
            '/' => {
                input.next();
                if input.next_if_eq(&'/').is_some() {
                    // Find the end of the comment line.
                    // This works because the code 10 of line feed cannot appear as a byte
                    // in the middle of a multi-byte UTF-8 encoding of a character,
                    // because those bytes all have the high bit set to 1;
                    // in UTF-8, the byte 10 can only appear as the single-byte encoding of line feed.
                    // `input_str` still starts at the first '/', so the token includes "//"
                    // and, when present, the trailing newline.
                    let comment = match input_str.as_bytes().iter().position(|c| *c == b'\n') {
                        None => input_str,
                        Some(idx) => &input_str[..idx + 1],
                    };

                    ensure_no_bidi_override(comment)?;
                    return Ok((comment.len(), Token::CommentLine(comment.to_owned())));
                } else if input.next_if_eq(&'*').is_some() {
                    let mut comment = String::from("/*");

                    if input.peek().is_none() {
                        return Err(ParserError::lexer_empty_block_comment().into());
                    }

                    // Scan until "*/" or end of input, accumulating the comment text.
                    let mut ended = false;
                    while let Some(c) = input.next() {
                        comment.push(c);
                        if c == '*' && input.next_if_eq(&'/').is_some() {
                            comment.push('/');
                            ended = true;
                            break;
                        }
                    }

                    ensure_no_bidi_override(&comment)?;

                    if !ended {
                        return Err(ParserError::lexer_block_comment_does_not_close_before_eof(comment).into());
                    }
                    return Ok((comment.len(), Token::CommentBlock(comment)));
                } else if input.next_if_eq(&'=').is_some() {
                    // '/='
                    return Ok((2, Token::DivAssign));
                }
                // '/'
                return Ok((1, Token::Div));
            }
            '%' => return match_two(&mut input, Token::Rem, '=', Token::RemAssign),
            ':' => return match_two(&mut input, Token::Colon, ':', Token::DoubleColon),
            ';' => return match_one(&mut input, Token::Semicolon),
            '<' => return match_four(&mut input, Token::Lt, '=', Token::LtEq, '<', Token::Shl, '=', Token::ShlAssign),
            '>' => return match_four(&mut input, Token::Gt, '=', Token::GtEq, '>', Token::Shr, '=', Token::ShrAssign),
            '=' => return match_three(&mut input, Token::Assign, '=', Token::Eq, '>', Token::BigArrow),
            '[' => return match_one(&mut input, Token::LeftSquare),
            ']' => return match_one(&mut input, Token::RightSquare),
            '{' => return match_one(&mut input, Token::LeftCurly),
            '}' => return match_one(&mut input, Token::RightCurly),
            '|' => {
                return match_four(
                    &mut input,
                    Token::BitOr,
                    '=',
                    Token::BitOrAssign,
                    '|',
                    Token::Or,
                    '=',
                    Token::OrAssign,
                );
            }
            '^' => return match_two(&mut input, Token::BitXor, '=', Token::BitXorAssign),
            // Note: unlike the other single-char branches this does not advance the local
            // iterator, which is harmless — the function returns and only the length matters.
            '@' => return Ok((1, Token::At)),
            _ => (),
        }
        // Not a symbol: try a keyword, address literal, or identifier.
        if let Some(identifier) = eat_identifier(&mut input) {
            return Ok((
                identifier.len(),
                // todo: match on symbols instead of hard-coded &str's
                match &*identifier {
                    x if x.starts_with("aleo1") => Token::AddressLit(identifier),
                    "address" => Token::Address,
                    "aleo" => Token::Aleo,
                    "as" => Token::As,
                    "assert" => Token::Assert,
                    "assert_eq" => Token::AssertEq,
                    "assert_neq" => Token::AssertNeq,
                    "async" => Token::Async,
                    "block" => Token::Block,
                    "bool" => Token::Bool,
                    "const" => Token::Const,
                    "constant" => Token::Constant,
                    "constructor" => Token::Constructor,
                    "else" => Token::Else,
                    "false" => Token::False,
                    "field" => Token::Field,
                    "Fn" => Token::Fn,
                    "for" => Token::For,
                    "function" => Token::Function,
                    "Future" => Token::Future,
                    "group" => Token::Group,
                    "i8" => Token::I8,
                    "i16" => Token::I16,
                    "i32" => Token::I32,
                    "i64" => Token::I64,
                    "i128" => Token::I128,
                    "if" => Token::If,
                    "import" => Token::Import,
                    "in" => Token::In,
                    "inline" => Token::Inline,
                    "let" => Token::Let,
                    "leo" => Token::Leo,
                    "mapping" => Token::Mapping,
                    "private" => Token::Private,
                    "program" => Token::Program,
                    "public" => Token::Public,
                    "record" => Token::Record,
                    "return" => Token::Return,
                    "scalar" => Token::Scalar,
                    "script" => Token::Script,
                    "self" => Token::SelfLower,
                    "signature" => Token::Signature,
                    "string" => Token::String,
                    "struct" => Token::Struct,
                    "transition" => Token::Transition,
                    "true" => Token::True,
                    "u8" => Token::U8,
                    "u16" => Token::U16,
                    "u32" => Token::U32,
                    "u64" => Token::U64,
                    "u128" => Token::U128,
                    _ => Token::Identifier(Symbol::intern(&identifier)),
                },
            ));
        }

        // Nothing matched: report the offending run of characters (up to the next
        // semicolon or whitespace) in the error.
        Err(ParserError::could_not_lex(input.take_while(|c| *c != ';' && !c.is_whitespace()).collect::<String>())
            .into())
    }
463}
464
/// A [`Token`] together with the [`Span`] of source text it was scanned from.
#[derive(Clone, Serialize, Deserialize)]
pub struct SpannedToken {
    /// The scanned token.
    pub token: Token,
    /// The location of the token in the source.
    pub span: Span,
}
470
471impl SpannedToken {
472 /// Returns a dummy token at a dummy span.
473 pub const fn dummy() -> Self {
474 Self { token: Token::Question, span: Span::dummy() }
475 }
476}
477
478impl fmt::Display for SpannedToken {
479 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
480 write!(f, "'{}' @ ", self.token.to_string().trim())?;
481 self.span.fmt(f)
482 }
483}
484
485impl fmt::Debug for SpannedToken {
486 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
487 <SpannedToken as fmt::Display>::fmt(self, f)
488 }
489}