| | #include "lexer.h"
|
| | #include "runtime.h"
|
| |
|
| | #include <cctype>
|
| | #include <functional>
|
| | #include <map>
|
| | #include <string>
|
| | #include <vector>
|
| |
|
| | #define FILENAME "jinja-lexer"
|
| |
|
| | namespace jinja {
|
| |
|
/// Remove every leading character of `s` that appears in `chars` (in place).
/// If `s` consists only of such characters it becomes empty.
static void string_lstrip(std::string & s, const char * chars) {
    // Index of the first character to keep, or npos when nothing survives.
    const std::string::size_type first_kept = s.find_first_not_of(chars);
    // erase() clamps the count to the string length, so a count of npos
    // removes the whole string - the same outcome as clearing it.
    s.erase(0, first_kept);
}
|
| |
|
/// Remove every trailing character of `s` that appears in `chars` (in place).
/// If `s` consists only of such characters it becomes empty.
static void string_rstrip(std::string & s, const char * chars) {
    // Index of the last character to keep, or npos when nothing survives.
    const std::string::size_type last_kept = s.find_last_not_of(chars);
    const std::string::size_type new_length =
        (last_kept == std::string::npos) ? 0 : last_kept + 1;
    // The new length never exceeds the current one, so this only truncates.
    s.resize(new_length);
}
|
| |
|
| | lexer_result lexer::tokenize(const std::string & source) {
|
| | std::vector<token> tokens;
|
| |
|
| |
|
| |
|
| | std::string src = source;
|
| |
|
| | if (source.empty()) {
|
| | return {tokens, src};
|
| | }
|
| |
|
| |
|
| | for (std::string::size_type pos = 0; (pos = src.find("\r\n", pos)) != std::string::npos; ) {
|
| | src.erase(pos, 1);
|
| | ++pos;
|
| | }
|
| | for (std::string::size_type pos = 0; (pos = src.find("\r", pos)) != std::string::npos; ) {
|
| | src.replace(pos, 1, 1, '\n');
|
| | ++pos;
|
| | }
|
| |
|
| |
|
| |
|
| |
|
| | if (source.back() == '\n') {
|
| | src.pop_back();
|
| | }
|
| |
|
| | size_t pos = 0;
|
| | size_t start_pos = 0;
|
| | size_t curly_bracket_depth = 0;
|
| |
|
| | using pred = std::function<bool(char)>;
|
| | auto consume_while = [&](const pred & predicate) -> std::string {
|
| | std::string str;
|
| | while (predicate(src[pos])) {
|
| |
|
| | if (src[pos] == '\\') {
|
| |
|
| | ++pos;
|
| |
|
| | if (pos >= src.size()) {
|
| | throw lexer_exception("unexpected end of input after escape character", source, pos);
|
| | }
|
| |
|
| | char escaped_char = src[pos++];
|
| | if (escape_chars.find(escaped_char) == escape_chars.end()) {
|
| | throw lexer_exception(std::string("unknown escape character \\") + escaped_char, source, pos);
|
| | }
|
| | char unescaped_char = escape_chars.at(escaped_char);
|
| | str += unescaped_char;
|
| | continue;
|
| | }
|
| |
|
| | str += src[pos++];
|
| | if (pos > src.size()) {
|
| | throw lexer_exception("unexpected end of input during consume_while", source, pos);
|
| | }
|
| | }
|
| | return str;
|
| | };
|
| |
|
| | auto consume_numeric = [&]() -> std::string {
|
| | std::string num = consume_while(is_integer);
|
| | if (pos < src.size() && src[pos] == '.' && pos + 1 < src.size() && is_integer(src[pos + 1])) {
|
| | ++pos;
|
| | std::string frac = consume_while(is_integer);
|
| | num += "." + frac;
|
| | }
|
| | return num;
|
| | };
|
| |
|
| | auto next_pos_is = [&](std::initializer_list<char> chars, size_t n = 1) -> bool {
|
| | if (pos + n >= src.size()) return false;
|
| | for (char c : chars) {
|
| | if (src[pos + n] == c) return true;
|
| | }
|
| | return false;
|
| | };
|
| |
|
| |
|
| |
|
| |
|
| | bool opt_lstrip_blocks = true;
|
| |
|
| |
|
| | bool opt_trim_blocks = true;
|
| |
|
| |
|
| | bool is_lstrip_block = false;
|
| | bool is_rstrip_block = false;
|
| |
|
| | while (pos < src.size()) {
|
| | start_pos = pos;
|
| |
|
| |
|
| |
|
| | token::type last_token_type = tokens.empty()
|
| | ? token::close_statement
|
| | : tokens.back().t;
|
| | if (last_token_type == token::close_statement ||
|
| | last_token_type == token::close_expression ||
|
| | last_token_type == token::comment) {
|
| |
|
| | bool last_block_can_rm_newline = false;
|
| | is_rstrip_block = false;
|
| | if (pos > 3) {
|
| | char c0 = src[pos - 3];
|
| | char c1 = src[pos - 2];
|
| | char c2 = src[pos - 1];
|
| |
|
| | is_rstrip_block = c0 == '-'
|
| | && (c1 == '%' || c1 == '}' || c1 == '#')
|
| | && c2 == '}';
|
| |
|
| | last_block_can_rm_newline = (c1 == '#' || c1 == '%' || c1 == '-') && c2 == '}';
|
| | }
|
| |
|
| | size_t start = pos;
|
| | size_t end = start;
|
| | while (pos < src.size() &&
|
| |
|
| | !(
|
| | src[pos] == '{' &&
|
| | next_pos_is( {'%', '{', '#'} )
|
| | )) {
|
| | end = ++pos;
|
| | }
|
| |
|
| |
|
| | if (opt_lstrip_blocks && src[pos] == '{' && next_pos_is({'%', '#', '-'})) {
|
| | size_t current = end;
|
| | while (current > start) {
|
| | char c = src[current - 1];
|
| | if (current == 1) {
|
| | end = 0;
|
| | break;
|
| | }
|
| | if (c == '\n') {
|
| | end = current;
|
| | break;
|
| | }
|
| | if (!std::isspace(static_cast<unsigned char>(c))) {
|
| | break;
|
| | }
|
| | --current;
|
| | }
|
| | }
|
| |
|
| | std::string text = src.substr(start, end - start);
|
| |
|
| |
|
| | if (opt_trim_blocks && last_block_can_rm_newline) {
|
| | if (!text.empty() && text.front() == '\n') {
|
| | text.erase(text.begin());
|
| | }
|
| | }
|
| |
|
| | if (is_rstrip_block) {
|
| |
|
| |
|
| |
|
| | string_lstrip(text, " \t\r\n");
|
| | }
|
| |
|
| | is_lstrip_block = src[pos] == '{' && next_pos_is({'{', '%', '#'}) && next_pos_is({'-'}, 2);
|
| | if (is_lstrip_block) {
|
| |
|
| |
|
| |
|
| | string_rstrip(text, " \t\r\n");
|
| | }
|
| |
|
| | if (!text.empty()) {
|
| |
|
| | tokens.push_back({token::text, text, start_pos});
|
| | continue;
|
| | }
|
| | }
|
| |
|
| |
|
| |
|
| | if (src[pos] == '{' && next_pos_is( {'#'} )) {
|
| | start_pos = pos;
|
| | pos += 2;
|
| | std::string comment;
|
| | while (!(src[pos] == '#' && next_pos_is( {'}'} ))) {
|
| | if (pos + 2 >= src.size()) {
|
| | throw lexer_exception("missing end of comment tag", source, pos);
|
| | }
|
| | comment += src[pos++];
|
| | }
|
| | JJ_DEBUG("consumed comment: '%s'", comment.c_str());
|
| | tokens.push_back({token::comment, comment, start_pos});
|
| | pos += 2;
|
| | continue;
|
| | }
|
| |
|
| | if (src[pos] == '-' && (
|
| | last_token_type == token::open_expression ||
|
| | last_token_type == token::open_statement)
|
| | ) {
|
| | JJ_DEBUG("lexer main loop at pos %zu: '%s...'", pos, src.substr(pos, 10).c_str());
|
| | pos++;
|
| | if (pos >= src.size()) break;
|
| | }
|
| |
|
| |
|
| | consume_while([](char c) { return std::isspace(static_cast<unsigned char>(c)); });
|
| |
|
| | if (pos >= src.size()) break;
|
| |
|
| | char ch = src[pos];
|
| |
|
| | bool is_closing_block = ch == '-' && next_pos_is( {'%', '}'} );
|
| |
|
| |
|
| | if (!is_closing_block && (ch == '-' || ch == '+')) {
|
| | start_pos = pos;
|
| | token::type last_token_type = tokens.empty() ? token::eof : tokens.back().t;
|
| | if (last_token_type == token::text || last_token_type == token::eof) {
|
| | throw lexer_exception(std::string("unexpected character: ") + ch, source, pos);
|
| | }
|
| | switch (last_token_type) {
|
| | case token::identifier:
|
| | case token::numeric_literal:
|
| | case token::string_literal:
|
| | case token::close_paren:
|
| | case token::close_square_bracket:
|
| |
|
| |
|
| |
|
| | break;
|
| | default: {
|
| |
|
| |
|
| | ++pos;
|
| |
|
| |
|
| | std::string num = consume_numeric();
|
| | std::string value = std::string(1, ch) + num;
|
| | token::type t = num.empty() ? token::unary_operator : token::numeric_literal;
|
| |
|
| | tokens.push_back({t, value, start_pos});
|
| | continue;
|
| | }
|
| | }
|
| | }
|
| |
|
| |
|
| | bool matched = false;
|
| | for (const auto & [seq, typ] : ordered_mapping_table) {
|
| | start_pos = pos;
|
| |
|
| | if (seq == "}}" && curly_bracket_depth > 0) {
|
| | continue;
|
| | }
|
| | if (pos + seq.size() <= src.size() && src.substr(pos, seq.size()) == seq) {
|
| | tokens.push_back({typ, seq, start_pos});
|
| | if (typ == token::open_expression) {
|
| | curly_bracket_depth = 0;
|
| | } else if (typ == token::open_curly_bracket) {
|
| | ++curly_bracket_depth;
|
| | } else if (typ == token::close_curly_bracket) {
|
| | --curly_bracket_depth;
|
| | }
|
| |
|
| | pos += seq.size();
|
| | matched = true;
|
| | break;
|
| | }
|
| | }
|
| | if (matched) continue;
|
| |
|
| |
|
| | if (ch == '\'' || ch == '"') {
|
| | start_pos = pos;
|
| | ++pos;
|
| | std::string str = consume_while([ch](char c) { return c != ch; });
|
| |
|
| | tokens.push_back({token::string_literal, str, start_pos});
|
| | ++pos;
|
| | continue;
|
| | }
|
| |
|
| |
|
| | if (is_integer(ch)) {
|
| | start_pos = pos;
|
| | std::string num = consume_numeric();
|
| |
|
| | tokens.push_back({token::numeric_literal, num, start_pos});
|
| | continue;
|
| | }
|
| |
|
| |
|
| | if (is_word(ch)) {
|
| | start_pos = pos;
|
| | std::string word = consume_while(is_word);
|
| |
|
| | tokens.push_back({token::identifier, word, start_pos});
|
| | continue;
|
| | }
|
| |
|
| | throw lexer_exception(std::string("unexpected character: ") + ch, source, pos);
|
| | }
|
| |
|
| | return {std::move(tokens), src};
|
| | }
|
| |
|
| | }
|
| |
|