Inja 3.3.0
A Template Engine for Modern C++
Loading...
Searching...
No Matches
lexer.hpp
1#ifndef INCLUDE_INJA_LEXER_HPP_
2#define INCLUDE_INJA_LEXER_HPP_
3
4#include <cctype>
5#include <locale>
6
7#include "config.hpp"
8#include "token.hpp"
9#include "utils.hpp"
10
11namespace inja {
12
16class Lexer {
/// Scanning modes of the lexer; scan() dispatches on the current value.
enum class State {
  /// Plain template text; scan() searches for the next opening sequence.
  Text,
  /// Positioned at `expression_open`; the next scan() emits ExpressionOpen.
  ExpressionStart,
  /// Positioned at `expression_open_force_lstrip`; the next scan() emits ExpressionOpen.
  ExpressionStartForceLstrip,
  /// Inside an expression; tokens are produced by scan_body().
  ExpressionBody,
  /// Positioned at `line_statement`; the next scan() emits LineStatementOpen.
  LineStart,
  /// Inside a line statement; scan_body() closes it on "\n".
  LineBody,
  /// Positioned at `statement_open`; the next scan() emits StatementOpen.
  StatementStart,
  /// Positioned at `statement_open_no_lstrip`; the next scan() emits StatementOpen.
  StatementStartNoLstrip,
  /// Positioned at `statement_open_force_lstrip`; the next scan() emits StatementOpen.
  StatementStartForceLstrip,
  /// Inside a statement; tokens are produced by scan_body().
  StatementBody,
  /// Positioned at `comment_open`; the next scan() emits CommentOpen.
  CommentStart,
  /// Positioned at `comment_open_force_lstrip`; the next scan() emits CommentOpen.
  CommentStartForceLstrip,
  /// Inside a comment; scan() jumps straight to `comment_close`.
  CommentBody,
};
32
/// Decides how scan_body() treats a '-' character.
enum class MinusState {
  /// '-' is emitted as a Minus token.
  Operator,
  /// '-' is handed to scan_number() as the start of a number.
  Number,
};
37
38 const LexerConfig &config;
39
40 State state;
41 MinusState minus_state;
42 nonstd::string_view m_in;
43 size_t tok_start;
44 size_t pos;
45
46
47 Token scan_body(nonstd::string_view close, Token::Kind closeKind, nonstd::string_view close_trim = nonstd::string_view(), bool trim = false) {
48 again:
49 // skip whitespace (except for \n as it might be a close)
50 if (tok_start >= m_in.size()) {
51 return make_token(Token::Kind::Eof);
52 }
53 const char ch = m_in[tok_start];
54 if (ch == ' ' || ch == '\t' || ch == '\r') {
55 tok_start += 1;
56 goto again;
57 }
58
59 // check for close
60 if (!close_trim.empty() && inja::string_view::starts_with(m_in.substr(tok_start), close_trim)) {
61 state = State::Text;
62 pos = tok_start + close_trim.size();
63 const Token tok = make_token(closeKind);
64 skip_whitespaces_and_newlines();
65 return tok;
66 }
67
68 if (inja::string_view::starts_with(m_in.substr(tok_start), close)) {
69 state = State::Text;
70 pos = tok_start + close.size();
71 const Token tok = make_token(closeKind);
72 if (trim) {
73 skip_whitespaces_and_first_newline();
74 }
75 return tok;
76 }
77
78 // skip \n
79 if (ch == '\n') {
80 tok_start += 1;
81 goto again;
82 }
83
84 pos = tok_start + 1;
85 if (std::isalpha(ch)) {
86 minus_state = MinusState::Operator;
87 return scan_id();
88 }
89
90 const MinusState current_minus_state = minus_state;
91 if (minus_state == MinusState::Operator) {
92 minus_state = MinusState::Number;
93 }
94
95 switch (ch) {
96 case '+':
97 return make_token(Token::Kind::Plus);
98 case '-':
99 if (current_minus_state == MinusState::Operator) {
100 return make_token(Token::Kind::Minus);
101 }
102 return scan_number();
103 case '*':
104 return make_token(Token::Kind::Times);
105 case '/':
106 return make_token(Token::Kind::Slash);
107 case '^':
108 return make_token(Token::Kind::Power);
109 case '%':
110 return make_token(Token::Kind::Percent);
111 case '.':
112 return make_token(Token::Kind::Dot);
113 case ',':
114 return make_token(Token::Kind::Comma);
115 case ':':
116 return make_token(Token::Kind::Colon);
117 case '(':
118 return make_token(Token::Kind::LeftParen);
119 case ')':
120 minus_state = MinusState::Operator;
121 return make_token(Token::Kind::RightParen);
122 case '[':
123 return make_token(Token::Kind::LeftBracket);
124 case ']':
125 minus_state = MinusState::Operator;
126 return make_token(Token::Kind::RightBracket);
127 case '{':
128 return make_token(Token::Kind::LeftBrace);
129 case '}':
130 minus_state = MinusState::Operator;
131 return make_token(Token::Kind::RightBrace);
132 case '>':
133 if (pos < m_in.size() && m_in[pos] == '=') {
134 pos += 1;
135 return make_token(Token::Kind::GreaterEqual);
136 }
137 return make_token(Token::Kind::GreaterThan);
138 case '<':
139 if (pos < m_in.size() && m_in[pos] == '=') {
140 pos += 1;
141 return make_token(Token::Kind::LessEqual);
142 }
143 return make_token(Token::Kind::LessThan);
144 case '=':
145 if (pos < m_in.size() && m_in[pos] == '=') {
146 pos += 1;
147 return make_token(Token::Kind::Equal);
148 }
149 return make_token(Token::Kind::Unknown);
150 case '!':
151 if (pos < m_in.size() && m_in[pos] == '=') {
152 pos += 1;
153 return make_token(Token::Kind::NotEqual);
154 }
155 return make_token(Token::Kind::Unknown);
156 case '\"':
157 return scan_string();
158 case '0':
159 case '1':
160 case '2':
161 case '3':
162 case '4':
163 case '5':
164 case '6':
165 case '7':
166 case '8':
167 case '9':
168 minus_state = MinusState::Operator;
169 return scan_number();
170 case '_':
171 case '@':
172 case '$':
173 minus_state = MinusState::Operator;
174 return scan_id();
175 default:
176 return make_token(Token::Kind::Unknown);
177 }
178 }
179
180 Token scan_id() {
181 for (;;) {
182 if (pos >= m_in.size()) {
183 break;
184 }
185 const char ch = m_in[pos];
186 if (!std::isalnum(ch) && ch != '.' && ch != '/' && ch != '_' && ch != '-') {
187 break;
188 }
189 pos += 1;
190 }
191 return make_token(Token::Kind::Id);
192 }
193
194 Token scan_number() {
195 for (;;) {
196 if (pos >= m_in.size()) {
197 break;
198 }
199 const char ch = m_in[pos];
200 // be very permissive in lexer (we'll catch errors when conversion happens)
201 if (!std::isdigit(ch) && ch != '.' && ch != 'e' && ch != 'E' && ch != '+' && ch != '-') {
202 break;
203 }
204 pos += 1;
205 }
206 return make_token(Token::Kind::Number);
207 }
208
209 Token scan_string() {
210 bool escape {false};
211 for (;;) {
212 if (pos >= m_in.size()) {
213 break;
214 }
215 const char ch = m_in[pos++];
216 if (ch == '\\') {
217 escape = true;
218 } else if (!escape && ch == m_in[tok_start]) {
219 break;
220 } else {
221 escape = false;
222 }
223 }
224 return make_token(Token::Kind::String);
225 }
226
227 Token make_token(Token::Kind kind) const { return Token(kind, string_view::slice(m_in, tok_start, pos)); }
228
229 void skip_whitespaces_and_newlines() {
230 if (pos < m_in.size()) {
231 while (pos < m_in.size() && (m_in[pos] == ' ' || m_in[pos] == '\t' || m_in[pos] == '\n' || m_in[pos] == '\r')) {
232 pos += 1;
233 }
234 }
235 }
236
237 void skip_whitespaces_and_first_newline() {
238 if (pos < m_in.size()) {
239 while (pos < m_in.size() && (m_in[pos] == ' ' || m_in[pos] == '\t')) {
240 pos += 1;
241 }
242 }
243
244 if (pos < m_in.size()) {
245 const char ch = m_in[pos];
246 if (ch == '\n') {
247 pos += 1;
248 } else if (ch == '\r') {
249 pos += 1;
250 if (pos < m_in.size() && m_in[pos] == '\n') {
251 pos += 1;
252 }
253 }
254 }
255 }
256
257 static nonstd::string_view clear_final_line_if_whitespace(nonstd::string_view text) {
258 nonstd::string_view result = text;
259 while (!result.empty()) {
260 const char ch = result.back();
261 if (ch == ' ' || ch == '\t') {
262 result.remove_suffix(1);
263 } else if (ch == '\n' || ch == '\r') {
264 break;
265 } else {
266 return text;
267 }
268 }
269 return result;
270 }
271
272public:
273 explicit Lexer(const LexerConfig &config) : config(config), state(State::Text), minus_state(MinusState::Number) {}
274
275 SourceLocation current_position() const {
276 return get_source_location(m_in, tok_start);
277 }
278
279 void start(nonstd::string_view input) {
280 m_in = input;
281 tok_start = 0;
282 pos = 0;
283 state = State::Text;
284 minus_state = MinusState::Number;
285
286 // Consume byte order mark (BOM) for UTF-8
287 if (inja::string_view::starts_with(m_in, "\xEF\xBB\xBF")) {
288 m_in = m_in.substr(3);
289 }
290 }
291
292 Token scan() {
293 tok_start = pos;
294
295 again:
296 if (tok_start >= m_in.size()) {
297 return make_token(Token::Kind::Eof);
298 }
299
300 switch (state) {
301 default:
302 case State::Text: {
303 // fast-scan to first open character
304 const size_t open_start = m_in.substr(pos).find_first_of(config.open_chars);
305 if (open_start == nonstd::string_view::npos) {
306 // didn't find open, return remaining text as text token
307 pos = m_in.size();
308 return make_token(Token::Kind::Text);
309 }
310 pos += open_start;
311
312 // try to match one of the opening sequences, and get the close
313 nonstd::string_view open_str = m_in.substr(pos);
314 bool must_lstrip = false;
315 if (inja::string_view::starts_with(open_str, config.expression_open)) {
316 if (inja::string_view::starts_with(open_str, config.expression_open_force_lstrip)) {
317 state = State::ExpressionStartForceLstrip;
318 must_lstrip = true;
319 } else {
320 state = State::ExpressionStart;
321 }
322 } else if (inja::string_view::starts_with(open_str, config.statement_open)) {
323 if (inja::string_view::starts_with(open_str, config.statement_open_no_lstrip)) {
324 state = State::StatementStartNoLstrip;
325 } else if (inja::string_view::starts_with(open_str, config.statement_open_force_lstrip )) {
326 state = State::StatementStartForceLstrip;
327 must_lstrip = true;
328 } else {
329 state = State::StatementStart;
330 must_lstrip = config.lstrip_blocks;
331 }
332 } else if (inja::string_view::starts_with(open_str, config.comment_open)) {
333 if (inja::string_view::starts_with(open_str, config.comment_open_force_lstrip)) {
334 state = State::CommentStartForceLstrip;
335 must_lstrip = true;
336 } else {
337 state = State::CommentStart;
338 must_lstrip = config.lstrip_blocks;
339 }
340 } else if ((pos == 0 || m_in[pos - 1] == '\n') && inja::string_view::starts_with(open_str, config.line_statement)) {
341 state = State::LineStart;
342 } else {
343 pos += 1; // wasn't actually an opening sequence
344 goto again;
345 }
346
347 nonstd::string_view text = string_view::slice(m_in, tok_start, pos);
348 if (must_lstrip) {
349 text = clear_final_line_if_whitespace(text);
350 }
351
352 if (text.empty()) {
353 goto again; // don't generate empty token
354 }
355 return Token(Token::Kind::Text, text);
356 }
357 case State::ExpressionStart: {
358 state = State::ExpressionBody;
359 pos += config.expression_open.size();
360 return make_token(Token::Kind::ExpressionOpen);
361 }
362 case State::ExpressionStartForceLstrip: {
363 state = State::ExpressionBody;
364 pos += config.expression_open_force_lstrip.size();
365 return make_token(Token::Kind::ExpressionOpen);
366 }
367 case State::LineStart: {
368 state = State::LineBody;
369 pos += config.line_statement.size();
370 return make_token(Token::Kind::LineStatementOpen);
371 }
372 case State::StatementStart: {
373 state = State::StatementBody;
374 pos += config.statement_open.size();
375 return make_token(Token::Kind::StatementOpen);
376 }
377 case State::StatementStartNoLstrip: {
378 state = State::StatementBody;
379 pos += config.statement_open_no_lstrip.size();
380 return make_token(Token::Kind::StatementOpen);
381 }
382 case State::StatementStartForceLstrip: {
383 state = State::StatementBody;
384 pos += config.statement_open_force_lstrip.size();
385 return make_token(Token::Kind::StatementOpen);
386 }
387 case State::CommentStart: {
388 state = State::CommentBody;
389 pos += config.comment_open.size();
390 return make_token(Token::Kind::CommentOpen);
391 }
392 case State::CommentStartForceLstrip: {
393 state = State::CommentBody;
394 pos += config.comment_open_force_lstrip.size();
395 return make_token(Token::Kind::CommentOpen);
396 }
397 case State::ExpressionBody:
398 return scan_body(config.expression_close, Token::Kind::ExpressionClose, config.expression_close_force_rstrip);
399 case State::LineBody:
400 return scan_body("\n", Token::Kind::LineStatementClose);
401 case State::StatementBody:
402 return scan_body(config.statement_close, Token::Kind::StatementClose, config.statement_close_force_rstrip, config.trim_blocks);
403 case State::CommentBody: {
404 // fast-scan to comment close
405 const size_t end = m_in.substr(pos).find(config.comment_close);
406 if (end == nonstd::string_view::npos) {
407 pos = m_in.size();
408 return make_token(Token::Kind::Eof);
409 }
410
411 // Check for trim pattern
412 const bool must_rstrip = inja::string_view::starts_with(m_in.substr(pos + end - 1), config.comment_close_force_rstrip);
413
414 // return the entire comment in the close token
415 state = State::Text;
416 pos += end + config.comment_close.size();
417 Token tok = make_token(Token::Kind::CommentClose);
418
419 if (must_rstrip || config.trim_blocks) {
420 skip_whitespaces_and_first_newline();
421 }
422 return tok;
423 }
424 }
425 }
426
427 const LexerConfig &get_config() const {
428 return config;
429 }
430};
431
432} // namespace inja
433
434#endif // INCLUDE_INJA_LEXER_HPP_
Class for lexing an inja Template.
Definition: lexer.hpp:16
Class for lexer configuration.
Definition: config.hpp:14
Definition: exceptions.hpp:9
Helper-class for the inja Lexer.
Definition: token.hpp:13