Skip to main content

time/format_description/parse/
lexer_ast.rs

1//! Lexer for parsing format descriptions.
2
3use alloc::borrow::ToOwned as _;
4use alloc::string::String;
5use alloc::vec::Vec;
6
7use super::format_item::Item;
8use super::{Error, Location, Span, Spanned, SpannedValue, Unused, unused};
9use crate::error::InvalidFormatDescription;
10use crate::internal_macros::{const_try_opt, try_likely_ok};
11
/// The lexing context the lexer is currently in: inside a component's brackets or in literal
/// text.
#[must_use]
enum Context {
    Component,
    Literal,
}

impl Context {
    /// Returns `true` only for [`Context::Component`].
    #[inline]
    const fn is_component(&self) -> bool {
        match self {
            Self::Component => true,
            Self::Literal => false,
        }
    }

    /// Returns `true` only for [`Context::Literal`].
    #[inline]
    const fn is_literal(&self) -> bool {
        match self {
            Self::Literal => true,
            Self::Component => false,
        }
    }
}
29
/// The outcome of attempting to lex a single modifier inside a component.
enum NextModifier<'a> {
    /// Whitespace followed by a well-formed modifier token was consumed.
    Modifier(Modifier<'a>),
    /// Whitespace was consumed, but no component token followed it.
    TrailingWhitespace(Spanned<&'a str>),
    /// No whitespace was present, so nothing was consumed.
    None,
}
35
/// An iterator over the lexed tokens.
pub(super) struct Lexer<'input, const VERSION: u8> {
    /// The bytes that have not been consumed yet; shrinks from the front as the lexer advances.
    input: &'input [u8],
    /// The current bracket nesting depth. Its parity selects the lexing context: even depths are
    /// literal context, odd depths are component context.
    depth: u8,
    /// The number of bytes consumed so far, used to build locations for spans and errors.
    byte_pos: u32,
}
42
43impl<'input, const VERSION: u8> Lexer<'input, VERSION> {
44    /// Parse the string into a series of [`Token`]s.
45    ///
46    /// `VERSION` controls the version of the format description that is being parsed.
47    ///
48    /// - When `VERSION` is 1, `[[` is the only escape sequence, resulting in a literal `[`. For the
49    ///   start of a nested format description, a single `[` is used and is _never_ part of the
50    ///   escape sequence. For example, `[optional [[day]]]` will lex successfully, ultimately
51    ///   resulting in a component named `optional` with the nested component `day`.
52    /// - When `VERSION` is 2 or 3, all escape sequences begin with `\`. The only characters that
53    ///   may currently follow are `\`, `[`, and `]`, all of which result in the literal character.
54    ///   All other characters result in a lex error.
55    #[inline]
56    pub(super) const fn new(input: &'input str) -> Self {
57        Self {
58            input: input.as_bytes(),
59            depth: 0,
60            byte_pos: 0,
61        }
62    }
63
64    /// Advance the input by the given number of bytes.
65    #[inline]
66    fn advance(&mut self, bytes: u32) {
67        self.input = &self.input[bytes as usize..];
68        self.byte_pos += bytes;
69    }
70
71    /// Whether the lexer is currently parsing a component or a literal.
72    #[inline]
73    const fn context(&self) -> Context {
74        if self.depth.is_multiple_of(2) {
75            Context::Literal
76        } else {
77            Context::Component
78        }
79    }
80
81    /// Consume the next token if it is a component item that is whitespace.
82    #[inline]
83    fn consume_whitespace(&mut self) -> Option<Spanned<&'input str>> {
84        debug_assert!(self.context().is_component());
85
86        let bytes = self
87            .input
88            .iter()
89            .take_while(|byte| byte.is_ascii_whitespace())
90            .count() as u32;
91
92        if bytes == 0 {
93            return None;
94        }
95
96        let start_loc = Location {
97            byte: self.byte_pos,
98        };
99        let end_loc = Location {
100            byte: self.byte_pos + bytes,
101        };
102
103        // Safety: Runtime format descriptions always originate with a string passed as a parameter
104        // and we have only consumed full codepoints, ensuring that a valid string remains.
105        let value = unsafe { str::from_utf8_unchecked(&self.input[..bytes as usize]) };
106        self.advance(bytes);
107
108        Some(value.spanned(start_loc.to(end_loc)))
109    }
110
111    /// Consume the next token if it is a component item that is not whitespace.
112    #[inline]
113    fn consume_component_part(&mut self) -> Option<Spanned<&'input str>> {
114        debug_assert!(self.context().is_component());
115
116        let bytes = self
117            .input
118            .iter()
119            .take_while(|byte| !byte.is_ascii_whitespace() && !matches!(byte, b'\\' | b'[' | b']'))
120            .count() as u32;
121
122        if bytes == 0 {
123            return None;
124        }
125
126        let start_loc = Location {
127            byte: self.byte_pos,
128        };
129        let end_loc = Location {
130            byte: self.byte_pos + bytes,
131        };
132
133        // Safety: Runtime format descriptions always originate with a string passed as a parameter
134        // and we have only consumed full codepoints, ensuring that a valid string remains.
135        let value = unsafe { str::from_utf8_unchecked(&self.input[..bytes as usize]) };
136        self.advance(bytes);
137
138        Some(value.spanned(start_loc.to(end_loc)))
139    }
140
141    /// Consume the next token if it is a closing bracket.
142    #[inline]
143    fn consume_closing_bracket(&mut self) -> Option<Location> {
144        if self.input.first() != Some(&b']') {
145            return None;
146        }
147
148        self.depth -= 1;
149
150        let location = Location {
151            byte: self.byte_pos,
152        };
153        self.advance(1);
154        Some(location)
155    }
156
157    /// Consume the next token if it is a component name. The caller is expected to be inside a
158    /// component header.
159    #[inline]
160    fn consume_component_name(
161        &mut self,
162        opening_bracket: Location,
163    ) -> Result<(Option<Spanned<&'input str>>, Spanned<&'input str>), Error> {
164        let leading_whitespace = self.consume_whitespace();
165
166        let Some(name) = self.consume_component_part() else {
167            let span = match leading_whitespace {
168                Some(Spanned { value: _, span }) => span,
169                None => opening_bracket.to_self(),
170            };
171            return Err(Error {
172                _inner: unused(span.error("expected component name")),
173                public: InvalidFormatDescription::MissingComponentName {
174                    index: span.start.byte as usize,
175                },
176            });
177        };
178
179        Ok((leading_whitespace, name))
180    }
181
182    #[inline]
183    fn consume_modifier(&mut self) -> Result<NextModifier<'input>, Error> {
184        let Some(whitespace) = self.consume_whitespace() else {
185            return Ok(NextModifier::None);
186        };
187
188        let Some(token) = self.consume_component_part() else {
189            return Ok(NextModifier::TrailingWhitespace(whitespace));
190        };
191
192        let modifier =
193            try_likely_ok!(self.modifier_from_leading_whitespace_and_token(whitespace, token));
194        Ok(NextModifier::Modifier(modifier))
195    }
196
197    /// Parse a component.
198    fn consume_component(
199        &mut self,
200        opening_bracket: Location,
201    ) -> Result<Item<'input, VERSION>, Error> {
202        match self.depth.checked_add(1) {
203            Some(depth) => self.depth = depth,
204            None => {
205                return Err(Error {
206                    _inner: unused(opening_bracket.error("too much nesting")),
207                    public: InvalidFormatDescription::NotSupported {
208                        what: "highly-nested format description",
209                        context: "",
210                        index: opening_bracket.byte as usize,
211                    },
212                });
213            }
214        };
215        // consume the opening bracket, which was checked prior to calling this method
216        self.advance(1);
217
218        let (_leading_whitespace, name) =
219            try_likely_ok!(self.consume_component_name(opening_bracket));
220        let modifiers = try_likely_ok!(Modifiers::parse(self));
221
222        let mut nested_format_descriptions = Vec::new();
223        while self.is_nested_description_start() {
224            if let Ok(description) = self.consume_nested(modifiers.span().end) {
225                nested_format_descriptions.push(description);
226            } else {
227                break;
228            }
229        }
230
231        if modifiers.trailing_whitespace.is_some()
232            && let Some(first_nested) = nested_format_descriptions.first_mut()
233        {
234            first_nested.leading_whitespace = modifiers.trailing_whitespace;
235        }
236
237        let _nested_fds_trailing_whitespace =
238            if modifiers.trailing_whitespace.is_some() && nested_format_descriptions.is_empty() {
239                modifiers.trailing_whitespace
240            } else {
241                self.consume_whitespace()
242            };
243
244        let Some(closing_bracket) = self.consume_closing_bracket() else {
245            return Err(Error {
246                _inner: unused(opening_bracket.error("unclosed bracket")),
247                public: InvalidFormatDescription::UnclosedOpeningBracket {
248                    index: opening_bracket.byte as usize,
249                },
250            });
251        };
252
253        if let Some(first_nested_fd) = nested_format_descriptions.first()
254            && first_nested_fd.leading_whitespace.is_none()
255        {
256            return Err(Error {
257                _inner: unused(
258                    opening_bracket
259                        .to(closing_bracket)
260                        .error("missing leading whitespace before nested format description"),
261                ),
262                public: InvalidFormatDescription::Expected {
263                    what: "whitespace before nested format description",
264                    index: first_nested_fd.opening_bracket.byte as usize,
265                },
266            });
267        }
268
269        if super::format_item::ident_eq::<VERSION>(*name, "optional") {
270            return Item::optional_from_parts(
271                opening_bracket,
272                &modifiers.modifiers,
273                nested_format_descriptions,
274                closing_bracket,
275            );
276        }
277
278        if super::format_item::ident_eq::<VERSION>(*name, "first") {
279            if !modifiers.modifiers.is_empty() {
280                let modifier = &modifiers.modifiers[0];
281                return Err(Error {
282                    _inner: unused(modifier.key.span.error("invalid modifier key")),
283                    public: InvalidFormatDescription::InvalidModifier {
284                        value: (**modifier.key).to_owned(),
285                        index: modifier.key.span.start.byte as usize,
286                    },
287                });
288            }
289
290            if version!(3..) && nested_format_descriptions.is_empty() {
291                return Err(Error {
292                    _inner: unused(opening_bracket.to(closing_bracket).error(
293                        "the `first` component requires at least one nested format description",
294                    )),
295                    public: InvalidFormatDescription::Expected {
296                        what: "at least one nested format description",
297                        index: closing_bracket.byte as usize,
298                    },
299                });
300            }
301
302            let items = nested_format_descriptions
303                .into_iter()
304                .map(|nested_format_description| nested_format_description.items)
305                .collect();
306
307            return Ok(Item::First {
308                value: items,
309                span: opening_bracket.to(closing_bracket),
310            });
311        }
312
313        if !nested_format_descriptions.is_empty() {
314            return Err(Error {
315                _inner: unused(
316                    opening_bracket
317                        .to(closing_bracket)
318                        .error("this component does not support nested format descriptions"),
319                ),
320                public: InvalidFormatDescription::NotSupported {
321                    what: "nested format descriptions",
322                    context: "on this component",
323                    index: opening_bracket.byte as usize,
324                },
325            });
326        }
327
328        let component = try_likely_ok!(super::format_item::component_from_ast::<VERSION>(
329            &name,
330            &modifiers.modifiers
331        ));
332
333        Ok(Item::Component(component))
334    }
335
336    /// Parse a nested format description. The location provided is the most recent one consumed.
337    #[inline]
338    fn consume_nested(
339        &mut self,
340        last_location: Location,
341    ) -> Result<NestedFormatDescription<'input, VERSION>, Error> {
342        let leading_whitespace = self.consume_whitespace();
343
344        let opening_bracket = {
345            match self.depth.checked_add(1) {
346                Some(depth) => self.depth = depth,
347                None => {
348                    return Err(Error {
349                        _inner: unused(last_location.error("too much nesting")),
350                        public: InvalidFormatDescription::NotSupported {
351                            what: "highly-nested format description",
352                            context: "",
353                            index: last_location.byte as usize,
354                        },
355                    });
356                }
357            }
358            let location = Location {
359                byte: self.byte_pos,
360            };
361            self.advance(1);
362            location
363        };
364
365        let mut items = Vec::new();
366        loop {
367            // If we're in a literal context and the next byte is a closing bracket, stop so that we
368            // can consume it.
369            if self.context().is_literal() && self.input.first() == Some(&b']') {
370                break;
371            }
372
373            let Some(token) = self.next() else {
374                break;
375            };
376            items.push(try_likely_ok!(token));
377        }
378
379        let Some(closing_bracket) = self.consume_closing_bracket() else {
380            return Err(Error {
381                _inner: unused(opening_bracket.error("unclosed bracket")),
382                public: InvalidFormatDescription::UnclosedOpeningBracket {
383                    index: opening_bracket.byte as usize,
384                },
385            });
386        };
387
388        Ok(NestedFormatDescription {
389            leading_whitespace,
390            opening_bracket,
391            items,
392            closing_bracket,
393        })
394    }
395
396    fn modifier_from_leading_whitespace_and_token(
397        &self,
398        leading_whitespace: Spanned<&'input str>,
399        token: Spanned<&'input str>,
400    ) -> Result<Modifier<'input>, Error> {
401        let Some(colon_index) = token.bytes().position(|b| b == b':') else {
402            return Err(Error {
403                _inner: unused(token.span.error("modifier must be of the form `key:value`")),
404                public: InvalidFormatDescription::InvalidModifier {
405                    value: (*token).to_owned(),
406                    index: token.span.start.byte as usize,
407                },
408            });
409        };
410        let key = &token[..colon_index];
411        let value = &token[colon_index + 1..];
412
413        if key.is_empty() {
414            return Err(Error {
415                _inner: unused(token.span.shrink_to_start().error("expected modifier key")),
416                public: InvalidFormatDescription::InvalidModifier {
417                    value: String::new(),
418                    index: token.span.start.byte as usize,
419                },
420            });
421        }
422        if value.is_empty() {
423            return Err(Error {
424                _inner: unused(token.span.shrink_to_end().error("expected modifier value")),
425                public: InvalidFormatDescription::InvalidModifier {
426                    value: String::new(),
427                    index: token.span.start.byte as usize + colon_index,
428                },
429            });
430        }
431
432        Ok(Modifier {
433            _leading_whitespace: unused(leading_whitespace),
434            key: key.spanned(
435                token
436                    .span
437                    .start
438                    .to(token.span.start.offset(colon_index as u32)),
439            ),
440            _colon: unused(token.span.start.offset(colon_index as u32)),
441            value: value.spanned(
442                token
443                    .span
444                    .start
445                    .offset(colon_index as u32 + 1)
446                    .to(token.span.end),
447            ),
448        })
449    }
450
451    /// Check whether the next tokens start a nested format description. Does not consume any
452    /// input.
453    ///
454    /// Note that this call is strictly an optimization, as checking the error path on
455    /// `parse_nested` is sufficient for knowing if a nested format description is present. This
456    /// method avoids the overhead of constructing an error only to throw it away.
457    #[inline]
458    fn is_nested_description_start(&self) -> bool {
459        debug_assert!(self.context().is_component());
460
461        let Some(index) = self
462            .input
463            .iter()
464            .position(|&byte| !byte.is_ascii_whitespace())
465        else {
466            return false;
467        };
468
469        self.input[index] == b'['
470            && (version!(2..)
471                || self.context().is_component()
472                || self.input.get(index + 1) != Some(&b'['))
473    }
474
475    #[inline]
476    fn consume_literal(&mut self) -> &'input str {
477        let bytes = self
478            .input
479            .iter()
480            .take_while(|&&byte| byte != b'[' && byte != b']' && (version!(1) || byte != b'\\'))
481            .count() as u32;
482
483        // Safety: A string was passed to this function, and only UTF-8 has been consumed,
484        // leaving behind a string known to begin at a character boundary.
485        let value = unsafe { str::from_utf8_unchecked(&self.input[..bytes as usize]) };
486        self.advance(bytes);
487
488        value
489    }
490
491    #[inline]
492    fn consume_backslash_escape_sequence(
493        &mut self,
494        location: Location,
495    ) -> Result<&'input str, Error> {
496        let backslash_loc = location;
497
498        Ok(match self.input.get(1) {
499            Some(b'\\' | b'[' | b']') => {
500                // The escaped character is emitted as-is.
501                // Safety: We know that this is either a left bracket, right bracket, or
502                // backslash.
503                let char = unsafe { str::from_utf8_unchecked(&self.input[1..2]) };
504                self.advance(2);
505                if self.context().is_literal() {
506                    char
507                } else {
508                    // TODO find a way to handle this
509                    return Err(Error {
510                        _inner: unused(
511                            backslash_loc.error("escape sequences are not allowed in components"),
512                        ),
513                        public: InvalidFormatDescription::NotSupported {
514                            what: "escape sequence",
515                            context: "components",
516                            index: backslash_loc.byte as usize,
517                        },
518                    });
519                }
520            }
521            Some(_) => {
522                let loc = Location {
523                    byte: self.byte_pos + 1,
524                };
525                return Err(Error {
526                    _inner: unused(loc.error("invalid escape sequence")),
527                    public: InvalidFormatDescription::Expected {
528                        what: "valid escape sequence",
529                        index: loc.byte as usize,
530                    },
531                });
532            }
533            None => {
534                return Err(Error {
535                    _inner: unused(backslash_loc.error("unexpected end of input")),
536                    public: InvalidFormatDescription::Expected {
537                        what: "valid escape sequence",
538                        index: backslash_loc.byte as usize,
539                    },
540                });
541            }
542        })
543    }
544}
545
546impl<'input, const VERSION: u8> Iterator for Lexer<'input, VERSION> {
547    type Item = Result<Item<'input, VERSION>, Error>;
548
549    #[inline]
550    fn next(&mut self) -> Option<Self::Item> {
551        let byte = *const_try_opt!(self.input.first());
552
553        let location = Location {
554            byte: self.byte_pos,
555        };
556
557        match byte {
558            b'[' if version!(1) && self.input.get(1) == Some(&b'[') => {
559                self.advance(2);
560                Some(Ok(Item::Literal("[")))
561            }
562            b'[' => Some(self.consume_component(location)),
563            b']' if version!(3..) => Some(Err(Error {
564                _inner: unused(location.error("right brackets must be escaped")),
565                public: InvalidFormatDescription::Expected {
566                    what: "right bracket to be escaped",
567                    index: location.byte as usize,
568                },
569            })),
570            b']' => {
571                self.advance(1);
572                Some(Ok(Item::Literal("]")))
573            }
574            b'\\' if version!(2..) => Some(
575                self.consume_backslash_escape_sequence(location)
576                    .map(Item::Literal),
577            ),
578            _ => Some(Ok(Item::Literal(self.consume_literal()))),
579        }
580    }
581}
582
/// A format description that is nested within another format description.
///
/// Produced by the lexer when it encounters a bracketed sequence of items inside a component.
pub(super) struct NestedFormatDescription<'a, const VERSION: u8> {
    /// Whitespace between the end of the previous item and the opening bracket.
    ///
    /// `None` if the opening bracket immediately followed the previous item.
    pub(super) leading_whitespace: Option<Spanned<&'a str>>,
    /// Where the opening bracket was in the format string.
    pub(super) opening_bracket: Location,
    /// The items within the nested format description.
    pub(super) items: Vec<Item<'a, VERSION>>,
    /// Where the closing bracket was in the format string.
    pub(super) closing_bracket: Location,
}
594
595/// A modifier for a component.
596pub(super) struct Modifier<'a> {
597    /// Whitespace preceding the modifier.
598    pub(super) _leading_whitespace: Unused<Spanned<&'a str>>,
599    /// The key of the modifier.
600    pub(super) key: Spanned<&'a str>,
601    /// Where the colon of the modifier was in the format string.
602    pub(super) _colon: Unused<Location>,
603    /// The value of the modifier.
604    pub(super) value: Spanned<&'a str>,
605}
606
607impl Modifier<'_> {
608    #[inline]
609    pub(super) const fn key_value_span(&self) -> Span {
610        self.key.span.start.to(self.value.span.end)
611    }
612}
613
614pub(super) struct Modifiers<'a> {
615    pub(super) modifiers: Vec<Modifier<'a>>,
616    pub(super) trailing_whitespace: Option<Spanned<&'a str>>,
617}
618
619impl<'a> Modifiers<'a> {
620    /// Parse modifiers until there are none left. Returns the modifiers along with any trailing
621    /// whitespace after the last modifier.
622    #[inline]
623    pub(super) fn parse<const VERSION: u8>(tokens: &mut Lexer<'a, VERSION>) -> Result<Self, Error> {
624        let mut modifiers = Vec::new();
625        loop {
626            match try_likely_ok!(tokens.consume_modifier()) {
627                NextModifier::Modifier(modifier) => modifiers.push(modifier),
628                NextModifier::TrailingWhitespace(whitespace) => {
629                    return Ok(Self {
630                        modifiers,
631                        trailing_whitespace: Some(whitespace),
632                    });
633                }
634                NextModifier::None => {
635                    return Ok(Self {
636                        modifiers,
637                        trailing_whitespace: None,
638                    });
639                }
640            }
641        }
642    }
643
644    #[inline]
645    pub(super) fn span(&self) -> Span {
646        match &*self.modifiers {
647            [] => Span::DUMMY,
648            [modifier] => modifier.key.span.start.to(modifier.value.span.end),
649            [first, .., last] => first.key.span.start.to(last.value.span.end),
650        }
651    }
652}