// time/format_description/parse/lexer.rs
//! Lexer for parsing format descriptions.
2
3use core::iter;
4
5use super::{Error, Location, Spanned, SpannedValue, attach_location, unused};
6use crate::format_description::FormatDescriptionVersion;
7
8/// An iterator over the lexed tokens.
9pub(super) struct Lexed<I>
10where
11 I: Iterator,
12{
13 /// The internal iterator.
14 iter: iter::Peekable<I>,
15}
16
17impl<I> Iterator for Lexed<I>
18where
19 I: Iterator,
20{
21 type Item = I::Item;
22
23 fn next(&mut self) -> Option<Self::Item> {
24 self.iter.next()
25 }
26}
27
28impl<'iter, 'token, I> Lexed<I>
29where
30 'token: 'iter,
31 I: Iterator<Item = Result<Token<'token>, Error>> + 'iter,
32{
33 /// Peek at the next item in the iterator.
34 #[inline]
35 pub(super) fn peek(&mut self) -> Option<&I::Item> {
36 self.iter.peek()
37 }
38
39 /// Consume the next token if it is whitespace.
40 #[inline]
41 pub(super) fn next_if_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> {
42 if let Some(&Ok(Token::ComponentPart {
43 kind: ComponentKind::Whitespace,
44 value,
45 })) = self.peek()
46 {
47 self.next(); // consume
48 Some(value)
49 } else {
50 None
51 }
52 }
53
54 /// Consume the next token if it is a component item that is not whitespace.
55 #[inline]
56 pub(super) fn next_if_not_whitespace(&mut self) -> Option<Spanned<&'token [u8]>> {
57 if let Some(&Ok(Token::ComponentPart {
58 kind: ComponentKind::NotWhitespace,
59 value,
60 })) = self.peek()
61 {
62 self.next(); // consume
63 Some(value)
64 } else {
65 None
66 }
67 }
68
69 /// Consume the next token if it is an opening bracket.
70 #[inline]
71 pub(super) fn next_if_opening_bracket(&mut self) -> Option<Location> {
72 if let Some(&Ok(Token::Bracket {
73 kind: BracketKind::Opening,
74 location,
75 })) = self.peek()
76 {
77 self.next(); // consume
78 Some(location)
79 } else {
80 None
81 }
82 }
83
84 /// Peek at the next token if it is a closing bracket.
85 #[inline]
86 pub(super) fn peek_closing_bracket(&'iter mut self) -> Option<&'iter Location> {
87 if let Some(Ok(Token::Bracket {
88 kind: BracketKind::Closing,
89 location,
90 })) = self.peek()
91 {
92 Some(location)
93 } else {
94 None
95 }
96 }
97
98 /// Consume the next token if it is a closing bracket.
99 #[inline]
100 pub(super) fn next_if_closing_bracket(&mut self) -> Option<Location> {
101 if let Some(&Ok(Token::Bracket {
102 kind: BracketKind::Closing,
103 location,
104 })) = self.peek()
105 {
106 self.next(); // consume
107 Some(location)
108 } else {
109 None
110 }
111 }
112}
113
114/// A token emitted by the lexer. There is no semantic meaning at this stage.
115pub(super) enum Token<'a> {
116 /// A literal string, formatted and parsed as-is.
117 Literal(Spanned<&'a [u8]>),
118 /// An opening or closing bracket. May or may not be the start or end of a component.
119 Bracket {
120 /// Whether the bracket is opening or closing.
121 kind: BracketKind,
122 /// Where the bracket was in the format string.
123 location: Location,
124 },
125 /// One part of a component. This could be its name, a modifier, or whitespace.
126 ComponentPart {
127 /// Whether the part is whitespace or not.
128 kind: ComponentKind,
129 /// The part itself.
130 value: Spanned<&'a [u8]>,
131 },
132}
133
134/// What type of bracket is present.
135pub(super) enum BracketKind {
136 /// An opening bracket: `[`
137 Opening,
138 /// A closing bracket: `]`
139 Closing,
140}
141
142/// Indicates whether the component is whitespace or not.
143pub(super) enum ComponentKind {
144 Whitespace,
145 NotWhitespace,
146}
147
148/// Parse the string into a series of [`Token`]s.
149///
150/// `version` controls the version of the format description that is being parsed.
151///
152/// - When `version` is 1, `[[` is the only escape sequence, resulting in a literal `[`. For the
153/// start of a nested format description, a single `[` is used and is _never_ part of the escape
154/// sequence. For example, `[optional [[day]]]` will lex successfully, ultimately resulting in a
155/// component named `optional` with the nested component `day`.
156/// - When `version` is 2 or 3, all escape sequences begin with `\`. The only characters that may
157/// currently follow are `\`, `[`, and `]`, all of which result in the literal character. All
158/// other characters result in a lex error.
159#[inline]
160pub(super) fn lex(
161 version: FormatDescriptionVersion,
162 mut input: &[u8],
163) -> Lexed<impl Iterator<Item = Result<Token<'_>, Error>>> {
164 let mut depth: u32 = 0;
165 // Whether, within a nested format description, we have seen the component name. This is used to
166 // distinguish between `[[` as an escaped literal and `[[` as the start of a nested format
167 // description (and the start of a component). This is only relevant for v1 format descriptions.
168 let mut nested_component_name_seen = false;
169 let mut iter = attach_location(input.iter()).peekable();
170 let mut second_bracket_location = None;
171
172 let iter = iter::from_fn(move || {
173 // The flag is only set when version is zero.
174 if version.is_v1() {
175 // There is a flag set to emit the second half of an escaped bracket pair.
176 if let Some(location) = second_bracket_location.take() {
177 return Some(Ok(Token::Bracket {
178 kind: BracketKind::Opening,
179 location,
180 }));
181 }
182 }
183
184 Some(Ok(match iter.next()? {
185 // possible escape sequence
186 (b'\\', backslash_loc) if version.is_at_least_v2() => {
187 match iter.next() {
188 Some((b'\\' | b'[' | b']', char_loc)) => {
189 // The escaped character is emitted as-is.
190 let char = &input[1..2];
191 input = &input[2..];
192 if depth == 0 {
193 Token::Literal(char.spanned(backslash_loc.to(char_loc)))
194 } else {
195 Token::ComponentPart {
196 kind: ComponentKind::NotWhitespace,
197 value: char.spanned(backslash_loc.to(char_loc)),
198 }
199 }
200 }
201 Some((_, loc)) => {
202 return Some(Err(Error {
203 _inner: unused(loc.error("invalid escape sequence")),
204 public: crate::error::InvalidFormatDescription::Expected {
205 what: "valid escape sequence",
206 index: loc.byte as usize,
207 },
208 }));
209 }
210 None => {
211 return Some(Err(Error {
212 _inner: unused(backslash_loc.error("unexpected end of input")),
213 public: crate::error::InvalidFormatDescription::Expected {
214 what: "valid escape sequence",
215 index: backslash_loc.byte as usize,
216 },
217 }));
218 }
219 }
220 }
221 // potentially escaped opening bracket
222 // If we have seen a nested component name, then we know for sure that this is not
223 // an escaped bracket. If we have not, then we check for the escape sequence.
224 (b'[', location) if version.is_v1() && !nested_component_name_seen => {
225 if let Some((_, second_location)) = iter.next_if(|&(&byte, _)| byte == b'[') {
226 // Escaped bracket. Store the location of the second so we can emit it later.
227 second_bracket_location = Some(second_location);
228 input = &input[2..];
229 } else {
230 // opening bracket
231 depth += 1;
232 input = &input[1..];
233 }
234
235 Token::Bracket {
236 kind: BracketKind::Opening,
237 location,
238 }
239 }
240 // opening bracket
241 (b'[', location) => {
242 depth += 1;
243 input = &input[1..];
244
245 Token::Bracket {
246 kind: BracketKind::Opening,
247 location,
248 }
249 }
250 // closing bracket
251 (b']', location) if depth > 0 => {
252 depth -= 1;
253 if version.is_v1() {
254 // If the depth is zero, then we are no longer in a nested component. As such we
255 // have not seen the component name. If the depth is not zero, then we have just
256 // completed a nested format description or nested component. In either case,
257 // the nested component name comes before this, so we have seen it.
258 nested_component_name_seen = depth != 0;
259 }
260 input = &input[1..];
261
262 Token::Bracket {
263 kind: BracketKind::Closing,
264 location,
265 }
266 }
267 // literal
268 (_, start_location) if depth == 0 => {
269 let mut bytes = 1;
270 let mut end_location = start_location;
271
272 while let Some((_, location)) = iter.next_if(|&(&byte, _)| {
273 !((version.is_at_least_v2() && byte == b'\\') || byte == b'[')
274 }) {
275 end_location = location;
276 bytes += 1;
277 }
278
279 let value = &input[..bytes];
280 input = &input[bytes..];
281
282 Token::Literal(value.spanned(start_location.to(end_location)))
283 }
284 // component part
285 (byte, start_location) => {
286 let mut bytes = 1;
287 let mut end_location = start_location;
288 let is_whitespace = byte.is_ascii_whitespace();
289
290 while let Some((_, location)) = iter.next_if(|&(byte, _)| {
291 !matches!(byte, b'\\' | b'[' | b']')
292 && is_whitespace == byte.is_ascii_whitespace()
293 }) {
294 end_location = location;
295 bytes += 1;
296 }
297
298 let value = &input[..bytes];
299 input = &input[bytes..];
300
301 // If what we just consumed is not whitespace, then it is either the component name
302 // or a modifier (which comes after the component name). In either situation, we
303 // have seen the component name, so we set the flag. This is only relevant for v1
304 // format descriptions.
305 if version.is_v1() && !is_whitespace {
306 nested_component_name_seen = true;
307 }
308
309 Token::ComponentPart {
310 kind: if is_whitespace {
311 ComponentKind::Whitespace
312 } else {
313 ComponentKind::NotWhitespace
314 },
315 value: value.spanned(start_location.to(end_location)),
316 }
317 }
318 }))
319 });
320
321 Lexed {
322 iter: iter.peekable(),
323 }
324}