time/format_description/parse/
lexer.rs

1//! Lexer for parsing format descriptions.
2
3use core::iter;
4
5use super::{attach_location, unused, Error, Location, Spanned, SpannedValue};
6
7/// An iterator over the lexed tokens.
8pub(super) struct Lexed<I: Iterator> {
9    /// The internal iterator.
10    iter: iter::Peekable<I>,
11}
12
13impl<I: Iterator> Iterator for Lexed<I> {
14    type Item = I::Item;
15
16    fn next(&mut self) -> Option<Self::Item> {
17        self.iter.next()
18    }
19}
20
21impl<'iter, 'token: 'iter, I: Iterator<Item = Result<Token<'token>, Error>> + 'iter> Lexed<I> {
22    /// Peek at the next item in the iterator.
23    #[inline]
24    pub(super) fn peek(&mut self) -> Option<&I::Item> {
25        self.iter.peek()
26    }
27
28    /// Consume the next token if it is whitespace.
29    #[inline]
30    pub(super) fn next_if_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> {
31        if let Some(&Ok(Token::ComponentPart {
32            kind: ComponentKind::Whitespace,
33            value,
34        })) = self.peek()
35        {
36            self.next(); // consume
37            Some(value)
38        } else {
39            None
40        }
41    }
42
43    /// Consume the next token if it is a component item that is not whitespace.
44    #[inline]
45    pub(super) fn next_if_not_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> {
46        if let Some(&Ok(Token::ComponentPart {
47            kind: ComponentKind::NotWhitespace,
48            value,
49        })) = self.peek()
50        {
51            self.next(); // consume
52            Some(value)
53        } else {
54            None
55        }
56    }
57
58    /// Consume the next token if it is an opening bracket.
59    #[inline]
60    pub(super) fn next_if_opening_bracket(&mut self) -> Option<Location> {
61        if let Some(&Ok(Token::Bracket {
62            kind: BracketKind::Opening,
63            location,
64        })) = self.peek()
65        {
66            self.next(); // consume
67            Some(location)
68        } else {
69            None
70        }
71    }
72
73    /// Peek at the next token if it is a closing bracket.
74    #[inline]
75    pub(super) fn peek_closing_bracket(&'iter mut self) -> Option<&'iter Location> {
76        if let Some(Ok(Token::Bracket {
77            kind: BracketKind::Closing,
78            location,
79        })) = self.peek()
80        {
81            Some(location)
82        } else {
83            None
84        }
85    }
86
87    /// Consume the next token if it is a closing bracket.
88    #[inline]
89    pub(super) fn next_if_closing_bracket(&mut self) -> Option<Location> {
90        if let Some(&Ok(Token::Bracket {
91            kind: BracketKind::Closing,
92            location,
93        })) = self.peek()
94        {
95            self.next(); // consume
96            Some(location)
97        } else {
98            None
99        }
100    }
101}
102
/// A token emitted by the lexer. There is no semantic meaning at this stage.
///
/// Every variant records where in the input it came from, either as a [`Location`] or through
/// the span attached to its [`Spanned`] value.
pub(super) enum Token<'a> {
    /// A literal string, formatted and parsed as-is.
    ///
    /// The lexer only produces this while it is outside of any brackets.
    Literal(Spanned<&'a [u8]>),
    /// An opening or closing bracket. May or may not be the start or end of a component.
    ///
    /// In version 1 the `[[` escape is lexed as two consecutive opening brackets, so an opening
    /// bracket does not necessarily begin a component.
    Bracket {
        /// Whether the bracket is opening or closing.
        kind: BracketKind,
        /// Where the bracket was in the format string.
        location: Location,
    },
    /// One part of a component. This could be its name, a modifier, or whitespace.
    ///
    /// The lexer only produces this while it is inside at least one bracket. Consecutive bytes
    /// are grouped into a single part until the whitespace-ness changes or a `\`, `[`, or `]` is
    /// encountered.
    ComponentPart {
        /// Whether the part is whitespace or not.
        kind: ComponentKind,
        /// The part itself.
        value: Spanned<&'a [u8]>,
    },
}
122
/// What type of bracket is present.
pub(super) enum BracketKind {
    /// An opening bracket: `[`
    Opening,
    /// A closing bracket: `]`
    ///
    /// The lexer only emits this while inside at least one opening bracket; a `]` encountered
    /// outside of any brackets is treated as literal text instead.
    Closing,
}
130
/// Indicates whether the component is whitespace or not.
pub(super) enum ComponentKind {
    /// The part is a run of ASCII whitespace, as determined by `u8::is_ascii_whitespace`.
    Whitespace,
    /// The part is a run of non-whitespace bytes. In version 2 this is also the kind used for a
    /// single escaped character that appears inside brackets.
    NotWhitespace,
}
136
137/// Parse the string into a series of [`Token`]s.
138///
139/// `VERSION` controls the version of the format description that is being parsed. Currently, this
140/// must be 1 or 2.
141///
142/// - When `VERSION` is 1, `[[` is the only escape sequence, resulting in a literal `[`.
143/// - When `VERSION` is 2, all escape sequences begin with `\`. The only characters that may
144///   currently follow are `\`, `[`, and `]`, all of which result in the literal character. All
145///   other characters result in a lex error.
146#[inline]
147pub(super) fn lex<const VERSION: usize>(
148    mut input: &[u8],
149) -> Lexed<impl Iterator<Item = Result<Token<'_>, Error>>> {
150    validate_version!(VERSION);
151
152    let mut depth: u8 = 0;
153    let mut iter = attach_location(input.iter()).peekable();
154    let mut second_bracket_location = None;
155
156    let iter = iter::from_fn(move || {
157        // The flag is only set when version is zero.
158        if version!(..=1) {
159            // There is a flag set to emit the second half of an escaped bracket pair.
160            if let Some(location) = second_bracket_location.take() {
161                return Some(Ok(Token::Bracket {
162                    kind: BracketKind::Opening,
163                    location,
164                }));
165            }
166        }
167
168        Some(Ok(match iter.next()? {
169            // possible escape sequence
170            (b'\\', backslash_loc) if version!(2..) => {
171                match iter.next() {
172                    Some((b'\\' | b'[' | b']', char_loc)) => {
173                        // The escaped character is emitted as-is.
174                        let char = &input[1..2];
175                        input = &input[2..];
176                        if depth == 0 {
177                            Token::Literal(char.spanned(backslash_loc.to(char_loc)))
178                        } else {
179                            Token::ComponentPart {
180                                kind: ComponentKind::NotWhitespace,
181                                value: char.spanned(backslash_loc.to(char_loc)),
182                            }
183                        }
184                    }
185                    Some((_, loc)) => {
186                        return Some(Err(Error {
187                            _inner: unused(loc.error("invalid escape sequence")),
188                            public: crate::error::InvalidFormatDescription::Expected {
189                                what: "valid escape sequence",
190                                index: loc.byte as usize,
191                            },
192                        }));
193                    }
194                    None => {
195                        return Some(Err(Error {
196                            _inner: unused(backslash_loc.error("unexpected end of input")),
197                            public: crate::error::InvalidFormatDescription::Expected {
198                                what: "valid escape sequence",
199                                index: backslash_loc.byte as usize,
200                            },
201                        }));
202                    }
203                }
204            }
205            // potentially escaped opening bracket
206            (b'[', location) if version!(..=1) => {
207                if let Some((_, second_location)) = iter.next_if(|&(&byte, _)| byte == b'[') {
208                    // Escaped bracket. Store the location of the second so we can emit it later.
209                    second_bracket_location = Some(second_location);
210                    input = &input[2..];
211                } else {
212                    // opening bracket
213                    depth += 1;
214                    input = &input[1..];
215                }
216
217                Token::Bracket {
218                    kind: BracketKind::Opening,
219                    location,
220                }
221            }
222            // opening bracket
223            (b'[', location) => {
224                depth += 1;
225                input = &input[1..];
226
227                Token::Bracket {
228                    kind: BracketKind::Opening,
229                    location,
230                }
231            }
232            // closing bracket
233            (b']', location) if depth > 0 => {
234                depth -= 1;
235                input = &input[1..];
236
237                Token::Bracket {
238                    kind: BracketKind::Closing,
239                    location,
240                }
241            }
242            // literal
243            (_, start_location) if depth == 0 => {
244                let mut bytes = 1;
245                let mut end_location = start_location;
246
247                while let Some((_, location)) =
248                    iter.next_if(|&(&byte, _)| !((version!(2..) && byte == b'\\') || byte == b'['))
249                {
250                    end_location = location;
251                    bytes += 1;
252                }
253
254                let value = &input[..bytes];
255                input = &input[bytes..];
256
257                Token::Literal(value.spanned(start_location.to(end_location)))
258            }
259            // component part
260            (byte, start_location) => {
261                let mut bytes = 1;
262                let mut end_location = start_location;
263                let is_whitespace = byte.is_ascii_whitespace();
264
265                while let Some((_, location)) = iter.next_if(|&(byte, _)| {
266                    !matches!(byte, b'\\' | b'[' | b']')
267                        && is_whitespace == byte.is_ascii_whitespace()
268                }) {
269                    end_location = location;
270                    bytes += 1;
271                }
272
273                let value = &input[..bytes];
274                input = &input[bytes..];
275
276                Token::ComponentPart {
277                    kind: if is_whitespace {
278                        ComponentKind::Whitespace
279                    } else {
280                        ComponentKind::NotWhitespace
281                    },
282                    value: value.spanned(start_location.to(end_location)),
283                }
284            }
285        }))
286    });
287
288    Lexed {
289        iter: iter.peekable(),
290    }
291}