Skip to main content

time/format_description/parse/
lexer.rs

1//! Lexer for parsing format descriptions.
2
3use core::iter;
4
5use super::{Error, Location, Spanned, SpannedValue, attach_location, unused};
6use crate::format_description::FormatDescriptionVersion;
7
8/// An iterator over the lexed tokens.
9pub(super) struct Lexed<I>
10where
11    I: Iterator,
12{
13    /// The internal iterator.
14    iter: iter::Peekable<I>,
15}
16
17impl<I> Iterator for Lexed<I>
18where
19    I: Iterator,
20{
21    type Item = I::Item;
22
23    fn next(&mut self) -> Option<Self::Item> {
24        self.iter.next()
25    }
26}
27
28impl<'iter, 'token, I> Lexed<I>
29where
30    'token: 'iter,
31    I: Iterator<Item = Result<Token<'token>, Error>> + 'iter,
32{
33    /// Peek at the next item in the iterator.
34    #[inline]
35    pub(super) fn peek(&mut self) -> Option<&I::Item> {
36        self.iter.peek()
37    }
38
39    /// Consume the next token if it is whitespace.
40    #[inline]
41    pub(super) fn next_if_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> {
42        if let Some(&Ok(Token::ComponentPart {
43            kind: ComponentKind::Whitespace,
44            value,
45        })) = self.peek()
46        {
47            self.next(); // consume
48            Some(value)
49        } else {
50            None
51        }
52    }
53
54    /// Consume the next token if it is a component item that is not whitespace.
55    #[inline]
56    pub(super) fn next_if_not_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> {
57        if let Some(&Ok(Token::ComponentPart {
58            kind: ComponentKind::NotWhitespace,
59            value,
60        })) = self.peek()
61        {
62            self.next(); // consume
63            Some(value)
64        } else {
65            None
66        }
67    }
68
69    /// Consume the next token if it is an opening bracket.
70    #[inline]
71    pub(super) fn next_if_opening_bracket(&mut self) -> Option<Location> {
72        if let Some(&Ok(Token::Bracket {
73            kind: BracketKind::Opening,
74            location,
75        })) = self.peek()
76        {
77            self.next(); // consume
78            Some(location)
79        } else {
80            None
81        }
82    }
83
84    /// Peek at the next token if it is a closing bracket.
85    #[inline]
86    pub(super) fn peek_closing_bracket(&'iter mut self) -> Option<&'iter Location> {
87        if let Some(Ok(Token::Bracket {
88            kind: BracketKind::Closing,
89            location,
90        })) = self.peek()
91        {
92            Some(location)
93        } else {
94            None
95        }
96    }
97
98    /// Consume the next token if it is a closing bracket.
99    #[inline]
100    pub(super) fn next_if_closing_bracket(&mut self) -> Option<Location> {
101        if let Some(&Ok(Token::Bracket {
102            kind: BracketKind::Closing,
103            location,
104        })) = self.peek()
105        {
106            self.next(); // consume
107            Some(location)
108        } else {
109            None
110        }
111    }
112}
113
/// A token emitted by the lexer. There is no semantic meaning at this stage.
///
/// The lifetime `'a` is that of the input bytes that the token values borrow from.
pub(super) enum Token<'a> {
    /// A literal string, formatted and parsed as-is.
    ///
    /// [`lex`] emits this for bytes that are found outside of any brackets.
    Literal(Spanned<&'a [u8]>),
    /// An opening or closing bracket. May or may not be the start or end of a component.
    Bracket {
        /// Whether the bracket is opening or closing.
        kind: BracketKind,
        /// Where the bracket was in the format string.
        location: Location,
    },
    /// One part of a component. This could be its name, a modifier, or whitespace.
    ///
    /// [`lex`] emits this for bytes that are found inside brackets.
    ComponentPart {
        /// Whether the part is whitespace or not.
        kind: ComponentKind,
        /// The part itself.
        value: Spanned<&'a [u8]>,
    },
}
133
/// What type of bracket is present in a [`Token::Bracket`].
pub(super) enum BracketKind {
    /// An opening bracket: `[`
    Opening,
    /// A closing bracket: `]`
    Closing,
}
141
/// Indicates whether the component is whitespace or not.
///
/// The lexer classifies a component part by testing its first byte with
/// `u8::is_ascii_whitespace` and then extending the part over following bytes of the same class.
pub(super) enum ComponentKind {
    /// The part consists of ASCII whitespace bytes.
    Whitespace,
    /// The part consists of bytes that are not ASCII whitespace. An escaped character found inside
    /// brackets is also reported with this kind.
    NotWhitespace,
}
147
/// Parse the string into a series of [`Token`]s.
///
/// `version` controls the version of the format description that is being parsed.
///
/// - When `version` is 1, `[[` is the only escape sequence, resulting in a literal `[`. For the
///   start of a nested format description, a single `[` is used and is _never_ part of the escape
///   sequence. For example, `[optional [[day]]]` will lex successfully, ultimately resulting in a
///   component named `optional` with the nested component `day`.
/// - When `version` is 2 or 3, all escape sequences begin with `\`. The only characters that may
///   currently follow are `\`, `[`, and `]`, all of which result in the literal character. All
///   other characters result in a lex error.
///
/// The lexer itself reports a version 1 `[[` escape as two consecutive opening-bracket tokens and
/// does not change the nesting depth for them; collapsing the pair into a literal `[` is
/// presumably done by the consumer of the token stream.
///
/// Bytes outside of any bracket are emitted as [`Token::Literal`]; bytes inside brackets are
/// emitted as [`Token::ComponentPart`], split into alternating runs of whitespace and
/// non-whitespace.
///
/// # Errors
///
/// When `version` is at least 2, the returned iterator yields an `Err` if a `\` is followed by a
/// byte other than `\`, `[`, or `]`, or if a `\` is the final byte of the input.
#[inline]
pub(super) fn lex(
    version: FormatDescriptionVersion,
    mut input: &[u8],
) -> Lexed<impl Iterator<Item = Result<Token<'_>, Error>>> {
    // Current bracket nesting depth. Zero means we are in literal text; anything greater means we
    // are inside at least one bracket.
    let mut depth: u32 = 0;
    // Whether, within a nested format description, we have seen the component name. This is used to
    // distinguish between `[[` as an escaped literal and `[[` as the start of a nested format
    // description (and the start of a component). This is only relevant for v1 format descriptions.
    let mut nested_component_name_seen = false;
    // Each item is a `(byte, location)` pair. `input` is advanced alongside this iterator so that
    // it always begins at the first byte that has not yet been turned into a token, which lets
    // token values be sliced from its front.
    let mut iter = attach_location(input.iter()).peekable();
    // Location of the second `[` of a v1 `[[` escape, waiting to be emitted on the next call.
    let mut second_bracket_location = None;

    let iter = iter::from_fn(move || {
        // The pending second bracket is only ever set for v1 format descriptions.
        if version.is_v1() {
            // There is a flag set to emit the second half of an escaped bracket pair.
            if let Some(location) = second_bracket_location.take() {
                return Some(Ok(Token::Bracket {
                    kind: BracketKind::Opening,
                    location,
                }));
            }
        }

        // The order of the match arms matters: earlier arms take priority over the catch-all
        // literal and component part arms at the end. Running out of input ends the iteration.
        Some(Ok(match iter.next()? {
            // possible escape sequence
            (b'\\', backslash_loc) if version.is_at_least_v2() => {
                match iter.next() {
                    Some((b'\\' | b'[' | b']', char_loc)) => {
                        // The escaped character is emitted as-is.
                        // `input[0]` is the backslash, `input[1]` is the escaped character. The
                        // span covers both bytes even though the value is only the character.
                        let char = &input[1..2];
                        input = &input[2..];
                        if depth == 0 {
                            Token::Literal(char.spanned(backslash_loc.to(char_loc)))
                        } else {
                            Token::ComponentPart {
                                kind: ComponentKind::NotWhitespace,
                                value: char.spanned(backslash_loc.to(char_loc)),
                            }
                        }
                    }
                    // NOTE(review): `input` is not advanced on this error path even though two
                    // items were taken from `iter`, so tokens lexed after this error would be
                    // sliced from the wrong bytes. Callers are presumably expected to stop at the
                    // first error -- confirm.
                    Some((_, loc)) => {
                        return Some(Err(Error {
                            _inner: unused(loc.error("invalid escape sequence")),
                            public: crate::error::InvalidFormatDescription::Expected {
                                what: "valid escape sequence",
                                index: loc.byte as usize,
                            },
                        }));
                    }
                    // The backslash was the last byte of the input.
                    None => {
                        return Some(Err(Error {
                            _inner: unused(backslash_loc.error("unexpected end of input")),
                            public: crate::error::InvalidFormatDescription::Expected {
                                what: "valid escape sequence",
                                index: backslash_loc.byte as usize,
                            },
                        }));
                    }
                }
            }
            // potentially escaped opening bracket
            // If we have seen a nested component name, then we know for sure that this is not
            // an escaped bracket. If we have not, then we check for the escape sequence.
            (b'[', location) if version.is_v1() && !nested_component_name_seen => {
                if let Some((_, second_location)) = iter.next_if(|&(&byte, _)| byte == b'[') {
                    // Escaped bracket. Store the location of the second so we can emit it later.
                    // The depth is deliberately left unchanged for an escaped pair.
                    second_bracket_location = Some(second_location);
                    input = &input[2..];
                } else {
                    // opening bracket
                    depth += 1;
                    input = &input[1..];
                }

                // In both cases the first bracket is emitted now.
                Token::Bracket {
                    kind: BracketKind::Opening,
                    location,
                }
            }
            // opening bracket
            (b'[', location) => {
                depth += 1;
                input = &input[1..];

                Token::Bracket {
                    kind: BracketKind::Opening,
                    location,
                }
            }
            // closing bracket
            // A `]` outside of any bracket does not match this arm and is lexed as literal text.
            (b']', location) if depth > 0 => {
                depth -= 1;
                if version.is_v1() {
                    // If the depth is zero, then we are no longer in a nested component. As such we
                    // have not seen the component name. If the depth is not zero, then we have just
                    // completed a nested format description or nested component. In either case,
                    // the nested component name comes before this, so we have seen it.
                    nested_component_name_seen = depth != 0;
                }
                input = &input[1..];

                Token::Bracket {
                    kind: BracketKind::Closing,
                    location,
                }
            }
            // literal
            (_, start_location) if depth == 0 => {
                let mut bytes = 1;
                let mut end_location = start_location;

                // Extend the literal until the next byte that could start something else: an
                // opening bracket, or (for v2 and later) a backslash.
                while let Some((_, location)) = iter.next_if(|&(&byte, _)| {
                    !((version.is_at_least_v2() && byte == b'\\') || byte == b'[')
                }) {
                    end_location = location;
                    bytes += 1;
                }

                let value = &input[..bytes];
                input = &input[bytes..];

                Token::Literal(value.spanned(start_location.to(end_location)))
            }
            // component part
            (byte, start_location) => {
                let mut bytes = 1;
                let mut end_location = start_location;
                let is_whitespace = byte.is_ascii_whitespace();

                // Extend the part over following bytes of the same whitespace class, stopping at
                // a backslash or either bracket. The backslash stops the run in every version.
                while let Some((_, location)) = iter.next_if(|&(byte, _)| {
                    !matches!(byte, b'\\' | b'[' | b']')
                        && is_whitespace == byte.is_ascii_whitespace()
                }) {
                    end_location = location;
                    bytes += 1;
                }

                let value = &input[..bytes];
                input = &input[bytes..];

                // If what we just consumed is not whitespace, then it is either the component name
                // or a modifier (which comes after the component name). In either situation, we
                // have seen the component name, so we set the flag. This is only relevant for v1
                // format descriptions.
                if version.is_v1() && !is_whitespace {
                    nested_component_name_seen = true;
                }

                Token::ComponentPart {
                    kind: if is_whitespace {
                        ComponentKind::Whitespace
                    } else {
                        ComponentKind::NotWhitespace
                    },
                    value: value.spanned(start_location.to(end_location)),
                }
            }
        }))
    });

    // Wrap the token stream so that callers get one item of lookahead.
    Lexed {
        iter: iter.peekable(),
    }
}