Skip to main content

time/format_description/parse/
lexer_ast.rs

1//! Lexer for parsing format descriptions.
2
3use alloc::borrow::ToOwned as _;
4use alloc::string::String;
5use alloc::vec::Vec;
6
7use super::format_item::{Item, ident_eq};
8use super::{
9    Error, Location, Span, Spanned, SpannedValue, WithLocation, WithLocationValue as _, unused,
10};
11use crate::error::InvalidFormatDescription;
12use crate::hint;
13use crate::internal_macros::{const_try_opt, try_likely_ok};
14
/// The syntactic context the lexer is currently in.
#[must_use]
enum Context {
    /// Inside the brackets of a component.
    Component,
    /// Outside of any component brackets, where text is taken literally.
    Literal,
}

impl Context {
    /// Returns `true` when the context is [`Context::Component`].
    #[inline]
    const fn is_component(&self) -> bool {
        match self {
            Self::Component => true,
            Self::Literal => false,
        }
    }

    /// Returns `true` when the context is [`Context::Literal`].
    #[inline]
    const fn is_literal(&self) -> bool {
        match self {
            Self::Literal => true,
            Self::Component => false,
        }
    }
}
32
/// The outcome of attempting to lex one modifier inside a component.
enum NextModifier<'a> {
    /// Whitespace followed by a well-formed `key:value` token was consumed.
    Modifier(Modifier<'a>),
    /// Whitespace was consumed, but no component part followed it. The consumed whitespace is
    /// carried along so that the caller can account for it.
    TrailingWhitespace(Spanned<&'a str>),
    /// No whitespace was present, so nothing was consumed.
    None,
}
38
/// An iterator over the lexed tokens.
///
/// `VERSION` selects the format description syntax version being lexed.
pub(super) struct Lexer<'input, const VERSION: u8> {
    /// The bytes that have not yet been consumed. Shrinks from the front as input is consumed.
    input: &'input [u8],
    /// The current bracket nesting depth. Incremented when an opening bracket is consumed and
    /// decremented when a closing bracket is consumed; its parity determines the context (even
    /// is literal, odd is component).
    depth: u8,
    /// The number of bytes consumed so far, i.e. the byte offset of `input` within the original
    /// string.
    byte_pos: u32,
}
45
46impl<'input, const VERSION: u8> Lexer<'input, VERSION> {
47    /// Parse the string into a series of [`Token`]s.
48    ///
49    /// `VERSION` controls the version of the format description that is being parsed.
50    ///
51    /// - When `VERSION` is 1, `[[` is the only escape sequence, resulting in a literal `[`. For the
52    ///   start of a nested format description, a single `[` is used and is _never_ part of the
53    ///   escape sequence. For example, `[optional [[day]]]` will lex successfully, ultimately
54    ///   resulting in a component named `optional` with the nested component `day`.
55    /// - When `VERSION` is 2 or 3, all escape sequences begin with `\`. The only characters that
56    ///   may currently follow are `\`, `[`, and `]`, all of which result in the literal character.
57    ///   All other characters result in a lex error.
58    #[inline]
59    pub(super) const fn new(input: &'input str) -> Self {
60        Self {
61            input: input.as_bytes(),
62            depth: 0,
63            byte_pos: 0,
64        }
65    }
66
67    /// Advance the input by the given number of bytes.
68    #[inline]
69    fn advance(&mut self, bytes: u32) {
70        self.input = &self.input[bytes as usize..];
71        self.byte_pos += bytes;
72    }
73
74    /// Whether the lexer is currently parsing a component or a literal.
75    #[inline]
76    const fn context(&self) -> Context {
77        if self.depth.is_multiple_of(2) {
78            Context::Literal
79        } else {
80            Context::Component
81        }
82    }
83
84    /// Consume the next token if it is a component item that is whitespace.
85    #[inline]
86    fn consume_whitespace(&mut self) -> Option<Spanned<&'input str>> {
87        debug_assert!(self.context().is_component());
88
89        let bytes = self
90            .input
91            .iter()
92            .take_while(|byte| byte.is_ascii_whitespace())
93            .count() as u32;
94
95        if bytes == 0 {
96            return None;
97        }
98
99        let start_loc = Location {
100            byte: self.byte_pos,
101        };
102        let end_loc = Location {
103            byte: self.byte_pos + bytes,
104        };
105
106        // Safety: Runtime format descriptions always originate with a string passed as a parameter
107        // and we have only consumed full codepoints, ensuring that a valid string remains.
108        let value = unsafe { str::from_utf8_unchecked(&self.input[..bytes as usize]) };
109        self.advance(bytes);
110
111        Some(value.spanned(start_loc.to(end_loc)))
112    }
113
114    /// Consume the next token if it is a component item that is not whitespace.
115    #[inline]
116    fn consume_component_part(&mut self) -> Option<Spanned<&'input str>> {
117        debug_assert!(self.context().is_component());
118
119        let bytes = self
120            .input
121            .iter()
122            .take_while(|byte| !byte.is_ascii_whitespace() && !matches!(byte, b'\\' | b'[' | b']'))
123            .count() as u32;
124
125        if bytes == 0 {
126            hint::cold_path();
127            return None;
128        }
129
130        let start_loc = Location {
131            byte: self.byte_pos,
132        };
133        let end_loc = Location {
134            byte: self.byte_pos + bytes,
135        };
136
137        // Safety: Runtime format descriptions always originate with a string passed as a parameter
138        // and we have only consumed full codepoints, ensuring that a valid string remains.
139        let value = unsafe { str::from_utf8_unchecked(&self.input[..bytes as usize]) };
140        self.advance(bytes);
141
142        Some(value.spanned(start_loc.to(end_loc)))
143    }
144
145    /// Consume the next token if it is a closing bracket.
146    #[inline]
147    fn consume_closing_bracket(&mut self) -> Option<Location> {
148        if self.input.first() != Some(&b']') {
149            hint::cold_path();
150            return None;
151        }
152
153        self.depth -= 1;
154
155        let location = Location {
156            byte: self.byte_pos,
157        };
158        self.advance(1);
159        Some(location)
160    }
161
162    /// Consume the next token if it is a component name. The caller is expected to be inside a
163    /// component header.
164    #[inline]
165    fn consume_component_name(
166        &mut self,
167        opening_bracket: Location,
168    ) -> Result<Spanned<&'input str>, Error> {
169        let leading_whitespace = self.consume_whitespace().is_some();
170
171        let Some(name) = self.consume_component_part() else {
172            hint::cold_path();
173            let location = if leading_whitespace {
174                opening_bracket.offset(1)
175            } else {
176                opening_bracket
177            };
178            return Err(Error {
179                _inner: unused(location.error("expected component name")),
180                public: InvalidFormatDescription::MissingComponentName {
181                    index: location.byte as usize,
182                },
183            });
184        };
185
186        Ok(name)
187    }
188
189    #[inline]
190    fn consume_modifier(&mut self) -> Result<NextModifier<'input>, Error> {
191        let Some(whitespace) = self.consume_whitespace() else {
192            hint::cold_path();
193            return Ok(NextModifier::None);
194        };
195
196        let Some(token) = self.consume_component_part() else {
197            hint::cold_path();
198            return Ok(NextModifier::TrailingWhitespace(whitespace));
199        };
200
201        let modifier = try_likely_ok!(self.modifier_from_token(token));
202        Ok(NextModifier::Modifier(modifier))
203    }
204
    /// Parse a component.
    ///
    /// The input must begin with the component's opening bracket, whose location is provided as
    /// `opening_bracket`. In order, this consumes the opening bracket, the component name, any
    /// `key:value` modifiers, any nested format descriptions, optional whitespace, and the
    /// closing bracket.
    ///
    /// The components named `optional` and `first` are handled specially: `optional` is
    /// delegated to `Item::optional_from_parts`, while `first` produces [`Item::First`]. Every
    /// other component is built by `component_from_ast` and returned as [`Item::Component`].
    ///
    /// # Errors
    ///
    /// Returns an error when:
    ///
    /// - the nesting depth would overflow,
    /// - the component name is missing or a modifier is malformed,
    /// - the closing bracket is missing,
    /// - the first nested format description is not preceded by whitespace,
    /// - `first` is given any modifier or, in version 3 and later, no nested format description,
    /// - a component other than `optional` or `first` has a nested format description, or
    /// - `Item::optional_from_parts` or `component_from_ast` reports an error.
    fn consume_component(
        &mut self,
        opening_bracket: Location,
    ) -> Result<Item<'input, VERSION>, Error> {
        // `depth` is a `u8`, so the addition fails once the depth is already at its maximum.
        match self.depth.checked_add(1) {
            Some(depth) => self.depth = depth,
            None => {
                hint::cold_path();
                return Err(Error {
                    _inner: unused(opening_bracket.error("too much nesting")),
                    public: InvalidFormatDescription::NotSupported {
                        what: "highly-nested format description",
                        context: "",
                        index: opening_bracket.byte as usize,
                    },
                });
            }
        };
        // consume the opening bracket, which was checked prior to calling this method
        self.advance(1);

        let name = try_likely_ok!(self.consume_component_name(opening_bracket));
        let modifiers = try_likely_ok!(Modifiers::parse(self));

        // NOTE(review): an `Err` from `consume_nested` ends this loop and is discarded rather
        // than propagated, even though `consume_nested` may already have advanced the input and
        // incremented `depth` by that point. Confirm that this is intended.
        //
        // Also note that `modifiers.end_location()` is `Location::DUMMY` when there are no
        // modifiers, and that the same location is passed for every nested description.
        let mut nested_format_descriptions = Vec::new();
        while self.is_nested_description_start()
            && let Ok(description) = self.consume_nested(modifiers.end_location())
        {
            nested_format_descriptions.push(description);
        }

        // Whitespace between the last modifier (or the name) and the first nested description
        // was already consumed by `Modifiers::parse`, so `consume_nested` could not see it.
        // Attribute it to the first nested description here.
        if modifiers.trailing_whitespace.is_some()
            && let Some(first_nested) = nested_format_descriptions.first_mut()
        {
            first_nested.leading_whitespace = modifiers.trailing_whitespace;
        }

        // Skip whitespace before the closing bracket. When modifier parsing already consumed
        // trailing whitespace and nothing was consumed after it, there is none left to skip.
        if modifiers.trailing_whitespace.is_none() || !nested_format_descriptions.is_empty() {
            self.consume_whitespace();
        }

        let Some(closing_bracket) = self.consume_closing_bracket() else {
            hint::cold_path();
            return Err(Error {
                _inner: unused(opening_bracket.error("unclosed bracket")),
                public: InvalidFormatDescription::UnclosedOpeningBracket {
                    index: opening_bracket.byte as usize,
                },
            });
        };

        // Only the first nested description is checked for leading whitespace.
        if let Some(first_nested_fd) = nested_format_descriptions.first()
            && first_nested_fd.leading_whitespace.is_none()
        {
            hint::cold_path();
            return Err(Error {
                _inner: unused(
                    opening_bracket
                        .to(closing_bracket)
                        .error("missing leading whitespace before nested format description"),
                ),
                public: InvalidFormatDescription::Expected {
                    what: "whitespace before nested format description",
                    index: first_nested_fd.opening_bracket.byte as usize,
                },
            });
        }

        // `optional` is validated and constructed elsewhere.
        if ident_eq::<VERSION>(*name, "optional") {
            hint::cold_path();
            return Item::optional_from_parts(
                opening_bracket,
                &modifiers.modifiers,
                nested_format_descriptions,
                closing_bracket,
            );
        }

        if ident_eq::<VERSION>(*name, "first") {
            hint::cold_path();
            // `first` accepts no modifiers; report the first one that was provided.
            if !modifiers.modifiers.is_empty() {
                hint::cold_path();
                let modifier = &modifiers.modifiers[0];
                return Err(Error {
                    _inner: unused(modifier.key_span().error("invalid modifier key")),
                    public: InvalidFormatDescription::InvalidModifier {
                        value: (*modifier.key).to_owned(),
                        index: modifier.key.location.byte as usize,
                    },
                });
            }

            // An empty `first` is only rejected from version 3 onwards.
            if version!(3..) && nested_format_descriptions.is_empty() {
                hint::cold_path();
                return Err(Error {
                    _inner: unused(opening_bracket.to(closing_bracket).error(
                        "the `first` component requires at least one nested format description",
                    )),
                    public: InvalidFormatDescription::Expected {
                        what: "at least one nested format description",
                        index: closing_bracket.byte as usize,
                    },
                });
            }

            // Keep only the items of each nested description; the bracket locations and
            // whitespace are no longer needed.
            let items = nested_format_descriptions
                .into_iter()
                .map(|nested_format_description| nested_format_description.items)
                .collect();

            return Ok(Item::First {
                value: items,
                span: opening_bracket.to(closing_bracket),
            });
        }

        // Any component reaching this point is neither `optional` nor `first`, so it must not
        // have a nested format description.
        if !nested_format_descriptions.is_empty() {
            hint::cold_path();
            return Err(Error {
                _inner: unused(
                    opening_bracket
                        .to(closing_bracket)
                        .error("this component does not support nested format descriptions"),
                ),
                public: InvalidFormatDescription::NotSupported {
                    what: "nested format descriptions",
                    context: "on this component",
                    index: opening_bracket.byte as usize,
                },
            });
        }

        let component = try_likely_ok!(super::format_item::component_from_ast::<VERSION>(
            &name,
            &modifiers.modifiers
        ));

        Ok(Item::Component(component))
    }
345
346    /// Parse a nested format description. The location provided is the most recent one consumed.
347    #[inline]
348    fn consume_nested(
349        &mut self,
350        last_location: Location,
351    ) -> Result<NestedFormatDescription<'input, VERSION>, Error> {
352        let leading_whitespace = self.consume_whitespace();
353
354        let opening_bracket = {
355            match self.depth.checked_add(1) {
356                Some(depth) => self.depth = depth,
357                None => {
358                    hint::cold_path();
359                    return Err(Error {
360                        _inner: unused(last_location.error("too much nesting")),
361                        public: InvalidFormatDescription::NotSupported {
362                            what: "highly-nested format description",
363                            context: "",
364                            index: last_location.byte as usize,
365                        },
366                    });
367                }
368            }
369            let location = Location {
370                byte: self.byte_pos,
371            };
372            self.advance(1);
373            location
374        };
375
376        let mut items = Vec::new();
377        loop {
378            // If we're in a literal context and the next byte is a closing bracket, stop so that we
379            // can consume it.
380            if self.context().is_literal() && self.input.first() == Some(&b']') {
381                break;
382            }
383
384            let Some(token) = self.next() else {
385                break;
386            };
387            items.push(try_likely_ok!(token));
388        }
389
390        let Some(closing_bracket) = self.consume_closing_bracket() else {
391            hint::cold_path();
392            return Err(Error {
393                _inner: unused(opening_bracket.error("unclosed bracket")),
394                public: InvalidFormatDescription::UnclosedOpeningBracket {
395                    index: opening_bracket.byte as usize,
396                },
397            });
398        };
399
400        Ok(NestedFormatDescription {
401            leading_whitespace,
402            opening_bracket,
403            items,
404            closing_bracket,
405        })
406    }
407
408    fn modifier_from_token(&self, token: Spanned<&'input str>) -> Result<Modifier<'input>, Error> {
409        let Some(colon_index) = token.bytes().position(|b| b == b':') else {
410            hint::cold_path();
411            return Err(Error {
412                _inner: unused(token.span.error("modifier must be of the form `key:value`")),
413                public: InvalidFormatDescription::InvalidModifier {
414                    value: (*token).to_owned(),
415                    index: token.span.start.byte as usize,
416                },
417            });
418        };
419        let key = &token[..colon_index];
420        let value = &token[colon_index + 1..];
421
422        if key.is_empty() {
423            hint::cold_path();
424            return Err(Error {
425                _inner: unused(token.span.shrink_to_start().error("expected modifier key")),
426                public: InvalidFormatDescription::InvalidModifier {
427                    value: String::new(),
428                    index: token.span.start.byte as usize,
429                },
430            });
431        }
432        if value.is_empty() {
433            hint::cold_path();
434            return Err(Error {
435                _inner: unused(token.span.shrink_to_end().error("expected modifier value")),
436                public: InvalidFormatDescription::InvalidModifier {
437                    value: String::new(),
438                    index: token.span.start.byte as usize + colon_index,
439                },
440            });
441        }
442
443        Ok(Modifier {
444            key: key.with_location(token.span.start),
445            value,
446        })
447    }
448
449    /// Check whether the next tokens start a nested format description. Does not consume any
450    /// input.
451    ///
452    /// Note that this call is strictly an optimization, as checking the error path on
453    /// `parse_nested` is sufficient for knowing if a nested format description is present. This
454    /// method avoids the overhead of constructing an error only to throw it away.
455    #[inline]
456    fn is_nested_description_start(&self) -> bool {
457        debug_assert!(self.context().is_component());
458
459        let Some(index) = self
460            .input
461            .iter()
462            .position(|&byte| !byte.is_ascii_whitespace())
463        else {
464            return false;
465        };
466
467        self.input[index] == b'['
468            && (version!(2..)
469                || self.context().is_component()
470                || self.input.get(index + 1) != Some(&b'['))
471    }
472
473    #[inline]
474    fn consume_literal(&mut self) -> &'input str {
475        let bytes = self
476            .input
477            .iter()
478            .take_while(|&&byte| byte != b'[' && byte != b']' && (version!(1) || byte != b'\\'))
479            .count() as u32;
480
481        // Safety: A string was passed to this function, and only UTF-8 has been consumed,
482        // leaving behind a string known to begin at a character boundary.
483        let value = unsafe { str::from_utf8_unchecked(&self.input[..bytes as usize]) };
484        self.advance(bytes);
485
486        value
487    }
488
489    #[inline]
490    fn consume_backslash_escape_sequence(
491        &mut self,
492        location: Location,
493    ) -> Result<&'input str, Error> {
494        let backslash_loc = location;
495
496        Ok(match self.input.get(1) {
497            Some(b'\\' | b'[' | b']') => {
498                // The escaped character is emitted as-is.
499                // Safety: We know that this is either a left bracket, right bracket, or
500                // backslash.
501                let char = unsafe { str::from_utf8_unchecked(&self.input[1..2]) };
502                self.advance(2);
503                char
504            }
505            Some(_) => {
506                hint::cold_path();
507                let loc = Location {
508                    byte: self.byte_pos + 1,
509                };
510                return Err(Error {
511                    _inner: unused(loc.error("invalid escape sequence")),
512                    public: InvalidFormatDescription::Expected {
513                        what: "valid escape sequence",
514                        index: loc.byte as usize,
515                    },
516                });
517            }
518            None => {
519                hint::cold_path();
520                return Err(Error {
521                    _inner: unused(backslash_loc.error("unexpected end of input")),
522                    public: InvalidFormatDescription::Expected {
523                        what: "valid escape sequence",
524                        index: backslash_loc.byte as usize,
525                    },
526                });
527            }
528        })
529    }
530}
531
532impl<'input, const VERSION: u8> Iterator for Lexer<'input, VERSION> {
533    type Item = Result<Item<'input, VERSION>, Error>;
534
535    #[inline]
536    fn next(&mut self) -> Option<Self::Item> {
537        let byte = *const_try_opt!(self.input.first());
538
539        let location = Location {
540            byte: self.byte_pos,
541        };
542
543        match byte {
544            b'[' if version!(1) && self.input.get(1) == Some(&b'[') => {
545                self.advance(2);
546                Some(Ok(Item::Literal("[")))
547            }
548            b'[' => Some(self.consume_component(location)),
549            b']' if version!(3..) => {
550                hint::cold_path();
551                Some(Err(Error {
552                    _inner: unused(location.error("right brackets must be escaped")),
553                    public: InvalidFormatDescription::Expected {
554                        what: "right bracket to be escaped",
555                        index: location.byte as usize,
556                    },
557                }))
558            }
559            b']' => {
560                self.advance(1);
561                Some(Ok(Item::Literal("]")))
562            }
563            b'\\' if version!(2..) => Some(
564                self.consume_backslash_escape_sequence(location)
565                    .map(Item::Literal),
566            ),
567            _ => Some(Ok(Item::Literal(self.consume_literal()))),
568        }
569    }
570}
571
/// A format description that is nested within another format description.
///
/// The bracket locations refer to the brackets delimiting the nested description itself, not
/// those of the component containing it.
pub(super) struct NestedFormatDescription<'a, const VERSION: u8> {
    /// Whitespace between the end of the previous item and the opening bracket, if any.
    pub(super) leading_whitespace: Option<Spanned<&'a str>>,
    /// Where the opening bracket was in the format string.
    pub(super) opening_bracket: Location,
    /// The items within the nested format description, in the order they were lexed.
    pub(super) items: Vec<Item<'a, VERSION>>,
    /// Where the closing bracket was in the format string.
    pub(super) closing_bracket: Location,
}
583
/// A modifier for a component, written as `key:value` in the format string.
pub(super) struct Modifier<'a> {
    /// The key of the modifier (the text before the first colon), along with the location at
    /// which it begins.
    pub(super) key: WithLocation<&'a str>,
    /// The value of the modifier (the text after the first colon).
    pub(super) value: &'a str,
}
591
592impl Modifier<'_> {
593    #[inline]
594    pub(super) fn key_value_span(&self) -> Span {
595        self.key
596            .location
597            .with_length(self.key.len() + self.value.len() + 1)
598    }
599
600    #[inline]
601    pub(super) fn key_span(&self) -> Span {
602        self.key.location.with_length(self.key.len())
603    }
604
605    #[inline]
606    pub(super) fn value_span(&self) -> Span {
607        self.key
608            .location
609            .offset(self.key.len() as u32 + 1)
610            .with_length(self.value.len())
611    }
612}
613
/// The modifiers of a single component, along with any whitespace that followed them.
pub(super) struct Modifiers<'a> {
    /// The modifiers, in the order in which they appear in the format string.
    pub(super) modifiers: Vec<Modifier<'a>>,
    /// Whitespace that was consumed while looking for another modifier but was not followed by
    /// one, if any.
    pub(super) trailing_whitespace: Option<Spanned<&'a str>>,
}
618
619impl<'a> Modifiers<'a> {
620    /// Parse modifiers until there are none left. Returns the modifiers along with any trailing
621    /// whitespace after the last modifier.
622    #[inline]
623    pub(super) fn parse<const VERSION: u8>(tokens: &mut Lexer<'a, VERSION>) -> Result<Self, Error> {
624        let mut modifiers = Vec::new();
625        loop {
626            match try_likely_ok!(tokens.consume_modifier()) {
627                NextModifier::Modifier(modifier) => modifiers.push(modifier),
628                NextModifier::TrailingWhitespace(whitespace) => {
629                    return Ok(Self {
630                        modifiers,
631                        trailing_whitespace: Some(whitespace),
632                    });
633                }
634                NextModifier::None => {
635                    return Ok(Self {
636                        modifiers,
637                        trailing_whitespace: None,
638                    });
639                }
640            }
641        }
642    }
643
644    #[inline]
645    pub(super) fn end_location(&self) -> Location {
646        match &*self.modifiers {
647            [] => Location::DUMMY,
648            [.., modifier] => modifier.value_span().end,
649        }
650    }
651}