From 2aea19febdc6588c6e5a47564bdf1ea7f93085e7 Mon Sep 17 00:00:00 2001
From: MihailRis
Date: Sun, 23 Mar 2025 08:49:09 +0300
Subject: [PATCH] add YAML parser

---
 src/coders/BasicParser.hpp |  12 +-
 src/coders/BasicParser.inl |  45 +++-
 src/coders/json.cpp        |   9 +-
 src/coders/toml.cpp        |  11 +-
 src/coders/yaml.cpp        | 406 +++++++++++++++++++++++++++++++++++++
 src/coders/yaml.hpp        |  16 ++
 src/frontend/locale.cpp    |  10 +-
 test/coders/yaml.cpp       |  30 +++
 8 files changed, 514 insertions(+), 25 deletions(-)
 create mode 100644 src/coders/yaml.cpp
 create mode 100644 src/coders/yaml.hpp
 create mode 100644 test/coders/yaml.cpp

diff --git a/src/coders/BasicParser.hpp b/src/coders/BasicParser.hpp
index e3a3dda8..ba1cf5a8 100644
--- a/src/coders/BasicParser.hpp
+++ b/src/coders/BasicParser.hpp
@@ -7,16 +7,26 @@ template <typename CharT>
 class BasicParser {
     using StringT = std::basic_string<CharT>;
     using StringViewT = std::basic_string_view<CharT>;
+
+    /// @brief skipWhitespace implementation used when hashComment is set
+    void skipWhitespaceHashComment(bool newline = true);
 protected:
     std::string_view filename;
     StringViewT source;
     uint pos = 0;
     uint line = 1;
     uint linestart = 0;
+    /// @brief Treat '#' as the start of a comment in skipWhitespace
+    bool hashComment = false;
 
-    virtual void skipWhitespace();
+    /// @brief Skip whitespace (and '#' comments if hashComment is set)
+    /// @param newline if false, stop at a line break instead of skipping it
+    void skipWhitespace(bool newline = true);
     void skip(size_t n);
     void skipLine();
+    /// @brief Skip whitespace, then move back to the start of the line
+    /// that was reached (pos = linestart)
+    void skipEmptyLines();
     bool skipTo(const StringT& substring);
     void expect(CharT expected);
     void expect(const StringT& substring);
diff --git a/src/coders/BasicParser.inl b/src/coders/BasicParser.inl
index 2d15d8e9..72f644ba 100644
--- a/src/coders/BasicParser.inl
+++ b/src/coders/BasicParser.inl
@@ -31,10 +31,17 @@ namespace {
 }
 
 template <typename CharT>
-void BasicParser<CharT>::skipWhitespace() {
+void BasicParser<CharT>::skipWhitespace(bool newline) {
+    if (hashComment) {
+        skipWhitespaceHashComment(newline);
+        return;
+    }
     while (hasNext()) {
         char next = source[pos];
         if (next == '\n') {
+            if (!newline) {
+                break;
+            }
             line++;
             linestart = ++pos;
             continue;
@@ -47,6 +54,36 @@ void BasicParser<CharT>::skipWhitespace() {
     }
 }
 
+template <typename CharT>
+void BasicParser<CharT>::skipWhitespaceHashComment(bool newline) {
+    while (hasNext()) {
+        char next = source[pos];
+        if (next == '\n') {
+            if (!newline) {
+                break;
+            }
+            line++;
+            linestart = ++pos;
+            continue;
+        }
+        if (is_whitespace(next)) {
+            pos++;
+        } else {
+            break;
+        }
+    }
+    if (hasNext() && source[pos] == '#') {
+        if (!newline) {
+            readUntilEOL();
+            return;
+        }
+        skipLine();
+        if (hasNext() && (is_whitespace(source[pos]) || source[pos] == '#')) {
+            skipWhitespaceHashComment(newline);
+        }
+    }
+}
+
 template <typename CharT>
 void BasicParser<CharT>::skip(size_t n) {
     n = std::min(n, source.length() - pos);
@@ -73,6 +110,12 @@ void BasicParser<CharT>::skipLine() {
     }
 }
 
+template <typename CharT>
+void BasicParser<CharT>::skipEmptyLines() {
+    skipWhitespace();
+    pos = linestart;
+}
+
 template <typename CharT>
 bool BasicParser<CharT>::skipTo(const std::basic_string<CharT>& substring) {
     size_t idx = source.find(substring, pos);
diff --git a/src/coders/json.cpp b/src/coders/json.cpp
index 19110f90..bbc56360 100644
--- a/src/coders/json.cpp
+++ b/src/coders/json.cpp
@@ -13,13 +13,14 @@ using namespace json;
 
 namespace {
     class Parser : BasicParser<char> {
-        dv::value parseList();
-        dv::value parseObject();
-        dv::value parseValue();
-        public:
+    public:
         Parser(std::string_view filename, std::string_view source);
 
         dv::value parse();
+    private:
+        dv::value parseList();
+        dv::value parseObject();
+        dv::value parseValue();
     };
 }
 
diff --git a/src/coders/toml.cpp b/src/coders/toml.cpp
index 8fe05391..1652745a 100644
--- a/src/coders/toml.cpp
+++ b/src/coders/toml.cpp
@@ -16,16 +16,6 @@ using namespace toml;
 class TomlReader : BasicParser<char> {
     dv::value root;
 
-    void skipWhitespace() override {
-        BasicParser::skipWhitespace();
-        if (hasNext() && source[pos] == '#') {
-            skipLine();
-            if (hasNext() && is_whitespace(peek())) {
-                skipWhitespace();
-            }
-        }
-    }
-
     // modified version of BaseParser.parseString
     // todo: extract common part
     std::string parseMultilineString() {
@@ -214,6 +204,7 @@ class TomlReader : BasicParser<char> {
 public:
     TomlReader(std::string_view file, std::string_view source)
         : BasicParser(file, source), root(dv::object()) {
+        hashComment = true;
     }
 
     dv::value read() {
diff --git a/src/coders/yaml.cpp b/src/coders/yaml.cpp
new file mode 100644
index 00000000..a7a1101f
--- /dev/null
+++ b/src/coders/yaml.cpp
@@ -0,0 +1,406 @@
+#include "yaml.hpp"
+#include "BasicParser.hpp"
+
+#include <sstream>
+#include <string>
+#include <utility>
+
+using namespace yaml;
+
+namespace {
+    /// @brief Trailing line break handling of a block scalar, selected by
+    /// the indicator after '|' or '>': none (CLIP), '-' (STRIP), '+' (KEEP)
+    enum Chomping {
+        CLIP, STRIP, KEEP
+    };
+
+    class Parser : BasicParser<char> {
+    public:
+        Parser(std::string_view filename, std::string_view source);
+
+        dv::value parseValue();
+        dv::value parseFullValue(int indent);
+        dv::value parseArray(int indent = 0);
+        dv::value parseObject(dv::value&& object, int indent = 0);
+        dv::value parseInlineArray();
+        dv::value parseInlineObject();
+    private:
+        int countIndent();
+        bool expectIndent(int indent);
+        std::string_view readYamlIdentifier();
+        std::string_view readInlineLiteral();
+        std::string readMultilineString(int indent, bool eols, Chomping chomp);
+    };
+}
+
+/// @brief Check whether a character may appear in an unquoted key
+/// @param c character code; negative values (bytes >= 0x80 stored in a
+/// signed char) are rejected
+inline bool is_yaml_identifier_char(int c) {
+    // 0x20 is the space character, so this bound also excludes every
+    // whitespace and control character; it used to be the decimal literal
+    // 20, which let control characters 21..31 through
+    return c > 0x20 && c != ':';
+}
+
+/// @brief Convert an unquoted scalar to a typed value
+/// @return boolean for true/True/false/False, null for null/Null,
+/// otherwise a string copy of the literal
+static dv::value perform_literal(std::string_view literal) {
+    if (literal == "true" || literal == "True") {
+        return true;
+    }
+    if (literal == "false" || literal == "False") {
+        return false;
+    }
+    if (literal == "null" || literal == "Null") {
+        return nullptr;
+    }
+    return std::string(literal);
+}
+
+Parser::Parser(std::string_view filename, std::string_view source)
+    : BasicParser(filename, source) {
+    // YAML comments start with '#'
+    hashComment = true;
+}
+
+/// @brief Consume up to `required` leading spaces
+/// @return true if `required` spaces were consumed; spaces consumed before
+/// a failure are not restored
+bool Parser::expectIndent(int required) {
+    int indent = 0;
+    while (hasNext() && source[pos] == ' ' && indent < required) {
+        indent++;
+        pos++;
+    }
+    return indent >= required;
+}
+
+/// @brief Read the body of a block scalar ('|' or '>')
+/// @param indent indentation of the owning entry; the body must be deeper
+/// @param eols join lines with '\n' if true, with ' ' otherwise
+/// @param chomp trailing line break handling
+/// @throws error("indentation error") if the first line is not indented
+/// deeper than `indent`
+std::string Parser::readMultilineString(int indent, bool eols, Chomping chomp) {
+    int next_indent = countIndent();
+    if (next_indent <= indent) {
+        throw error("indentation error");
+    }
+    std::stringstream ss;
+    ss << readUntilEOL();
+    if (hasNext()) {
+        skip(1);
+    }
+    int trailingEmpties = 0;
+    while (true) {
+        // lines indented at least as deep as the first one extend the body
+        while (expectIndent(next_indent)) {
+            trailingEmpties = 0;
+            ss << (eols ? '\n' : ' ');
+            ss << readUntilEOL();
+            if (hasNext()) {
+                skip(1);
+            }
+        }
+        // count blank lines; the count is reset if the body continues
+        while (true) {
+            skipWhitespace(false);
+            if (!hasNext() || source[pos] != '\n') {
+                break;
+            }
+            skip(1);
+            trailingEmpties++;
+        }
+        if (!expectIndent(next_indent)) {
+            break;
+        }
+        pos = linestart;
+    }
+    if (chomp == KEEP) {
+        for (int i = 0; i < trailingEmpties - 1; i++) {
+            ss << (eols ? '\n' : ' ');
+        }
+    }
+    ss << '\n';
+
+    // go back to the start of the current line (linestart)
+    pos = linestart;
+
+    auto string = ss.str();
+    if (chomp == STRIP) {
+        util::trim(string);
+    }
+    return string;
+}
+
+/// @brief Read an unquoted key
+/// @throws error("identifier expected") if the next character (peek())
+/// cannot be part of a key
+std::string_view Parser::readYamlIdentifier() {
+    char c = peek();
+    if (!is_yaml_identifier_char(c)) {
+        throw error("identifier expected");
+    }
+    const size_t start = pos;
+    while (hasNext() && is_yaml_identifier_char(source[pos])) {
+        pos++;
+    }
+    return source.substr(start, pos - start);
+}
+
+/// @brief Read an unquoted scalar inside an inline collection
+/// @return text up to the next ',', ']', '}' or line break, without
+/// trailing spaces and tabs
+std::string_view Parser::readInlineLiteral() {
+    const size_t start = pos;
+    while (hasNext()) {
+        const char c = source[pos];
+        if (c == ',' || c == ']' || c == '}' || c == '\n' || c == '\r') {
+            break;
+        }
+        pos++;
+    }
+    size_t end = pos;
+    while (end > start &&
+           (source[end - 1] == ' ' || source[end - 1] == '\t')) {
+        end--;
+    }
+    return source.substr(start, end - start);
+}
+
+/// @brief Consume leading spaces
+/// @return number of spaces consumed
+int Parser::countIndent() {
+    int indent = 0;
+    while (hasNext() && source[pos] == ' ') {
+        indent++;
+        pos++;
+    }
+    return indent;
+}
+
+/// @brief Parse a value inside an inline collection ('[...]' or '{...}')
+dv::value Parser::parseValue() {
+    char c = peek();
+    if (is_digit(c)) {
+        return parseNumber(1);
+    } else if (c == '-' || c == '+') {
+        skip(1);
+        return parseNumber(c == '-' ? -1 : 1);
+    } else if (c == '"' || c == '\'') {
+        skip(1);
+        return parseString(c, true);
+    } else if (c == '[') {
+        return parseInlineArray();
+    } else if (c == '{') {
+        return parseInlineObject();
+    }
+    // reading to the end of the line here would swallow the ',' / ']' / '}'
+    // that follows the scalar
+    return perform_literal(readInlineLiteral());
+}
+
+/// @brief Parse an inline sequence: '[' values separated by ',' ']'
+/// @throws error("',' expected") if a value is followed by anything else
+dv::value Parser::parseInlineArray() {
+    expect('[');
+    auto list = dv::list();
+    while (peek() != ']') {
+        if (peek() == '#') {
+            skipLine();
+            continue;
+        }
+        list.add(parseValue());
+
+        char next = peek();
+        if (next == ',') {
+            pos++;
+        } else if (next == ']') {
+            break;
+        } else {
+            throw error("',' expected");
+        }
+    }
+    pos++;
+    return list;
+}
+
+/// @brief Parse an inline mapping: '{' key ':' value pairs split by ',' '}'
+/// @throws error("',' expected") if a pair is followed by anything else
+dv::value Parser::parseInlineObject() {
+    expect('{');
+    dv::value object = dv::object();
+    while (peek() != '}') {
+        if (peek() == '#') {
+            skipLine();
+            continue;
+        }
+        auto name = readYamlIdentifier();
+        expect(':');
+        object[std::string(name)] = parseValue();
+
+        char next = peek();
+        if (next == ',') {
+            pos++;
+        } else if (next == '}') {
+            break;
+        } else {
+            throw error("',' expected");
+        }
+    }
+    pos++;
+    return object;
+}
+
+/// @brief Parse the value that follows 'key:' or '- ' in block context
+/// @param indent indentation of the entry that owns the value
+/// @return null if the input ends before a value
+/// @throws error("indentation error") if a nested block is indented less
+/// than `indent`
+dv::value Parser::parseFullValue(int indent) {
+    if (!hasNext()) {
+        // 'key:' at the very end of the input: there is no character to read
+        return nullptr;
+    }
+    char c = source[pos];
+    if (c == '\n') {
+        // the value is a nested block starting on the next non-empty line
+        skip(1);
+        skipEmptyLines();
+        int init_pos = pos;
+        int next_indent = countIndent();
+        if (next_indent < indent) {
+            throw error("indentation error");
+        }
+        if (hasNext() && source[pos] == '-') {
+            pos = init_pos;
+            return parseArray(next_indent);
+        } else {
+            pos = init_pos;
+            return parseObject(dv::object(), next_indent);
+        }
+    } else if (is_digit(c)) {
+        return parseNumber(1);
+    } else if (c == '-' || c == '+') {
+        skip(1);
+        return parseNumber(c == '-' ? -1 : 1);
+    } else if (c == '"' || c == '\'') {
+        skip(1);
+        return parseString(c, true);
+    } else if (c == '[') {
+        return parseInlineArray();
+    } else if (c == '{') {
+        return parseInlineObject();
+    } else if (c == '|' || c == '>') {
+        skip(1);
+        Chomping chomp = CLIP;
+        if (hasNext() && (source[pos] == '-' || source[pos] == '+')) {
+            chomp = source[pos] == '-' ? STRIP : KEEP;
+            skip(1);
+        }
+        skipWhitespace(false);
+        expectNewLine();
+        return readMultilineString(indent, c == '|', chomp);
+    } else {
+        return perform_literal(readUntilEOL());
+    }
+}
+
+/// @brief Parse a block sequence of '- item' entries
+/// @param indent indentation of the sequence; a less indented line ends it
+dv::value Parser::parseArray(int indent) {
+    dv::value list = dv::list();
+
+    while (hasNext()) {
+        skipEmptyLines();
+        int next_indent = countIndent();
+        if (!hasNext()) {
+            // nothing but indentation was left: no more entries
+            break;
+        }
+        if (next_indent < indent) {
+            pos = linestart;
+            break;
+        }
+        expect('-');
+        skipWhitespace();
+        size_t nlpos = source.find('\n', pos);
+        size_t colonpos = source.find(':', pos);
+        if (nlpos == std::string::npos && colonpos == std::string::npos) {
+            list.add(perform_literal(readUntilEOL()));
+            break;
+        }
+        if (nlpos < colonpos) {
+            // no ':' before the end of the line: plain value entry
+            list.add(parseFullValue(next_indent));
+            skipLine();
+        } else {
+            // '- key: value' starts a mapping entry; following deeper
+            // indented lines add more keys to the same mapping
+            auto name = readYamlIdentifier();
+            expect(':');
+            skipWhitespace(false);
+            dv::value object = dv::object();
+            object[std::string(name)] = parseFullValue(next_indent);
+            skipEmptyLines();
+            next_indent = countIndent();
+            if (next_indent > indent) {
+                pos = linestart;
+                object = parseObject(std::move(object), next_indent);
+            } else {
+                pos = linestart;
+            }
+            list.add(std::move(object));
+        }
+    }
+    return list;
+}
+
+/// @brief Parse 'key: value' entries into `object`
+/// @param object mapping to fill
+/// @param indent indentation of the mapping; a less indented line ends it
+/// @throws error("invalid character") if the next character (peek()) is
+/// neither whitespace nor a valid key character
+dv::value Parser::parseObject(dv::value&& object, int indent) {
+    skipEmptyLines();
+    while (hasNext()) {
+        size_t prev_pos = pos;
+        int next_indent = countIndent();
+        if (!hasNext()) {
+            // only indentation was left before the end of the input
+            break;
+        }
+        if (source[pos] == '\n') {
+            skip(1);
+            continue;
+        }
+        if (next_indent < indent) {
+            pos = prev_pos;
+            break;
+        }
+        char c = peek();
+        if (!is_yaml_identifier_char(c)) {
+            if (!is_whitespace(c)) {
+                throw error("invalid character");
+            }
+            continue;
+        }
+        auto name = readYamlIdentifier();
+        expect(':');
+        skipWhitespace(false);
+        object[std::string(name)] = parseFullValue(indent);
+        skipEmptyLines();
+    }
+    return std::move(object);
+}
+
+/// @brief Parse `source` as a top-level mapping
+dv::value yaml::parse(std::string_view filename, std::string_view source) {
+    return Parser(filename, source).parseObject(dv::object());
+}
+
+dv::value yaml::parse(std::string_view source) {
+    return parse("[string]", source);
+}
diff --git a/src/coders/yaml.hpp b/src/coders/yaml.hpp
new file mode 100644
index 00000000..2802e523
--- /dev/null
+++ b/src/coders/yaml.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <string_view>
+
+#include "data/dv.hpp"
+
+namespace yaml {
+    /// @brief Parse YAML text into a dv::value
+    /// @param filename name handed to the parser together with the text
+    /// @param source YAML text
+    /// @return value built by parsing the text as a top-level mapping
+    dv::value parse(std::string_view filename, std::string_view source);
+
+    /// @brief Same as parse(filename, source) with "[string]" as the name
+    dv::value parse(std::string_view source);
+}
diff --git a/src/frontend/locale.cpp b/src/frontend/locale.cpp
index 669d0b4c..4e1812dd 100644
--- a/src/frontend/locale.cpp
+++ b/src/frontend/locale.cpp
@@ -40,18 +40,10 @@ const std::string& langs::Lang::getId() const {
 /// @brief Language key-value txt files parser
 namespace {
     class Reader : BasicParser<char> {
-        void skipWhitespace() override {
-            BasicParser::skipWhitespace();
-            if (hasNext() && source[pos] == '#') {
-                skipLine();
-                if (hasNext() && is_whitespace(peek())) {
-                    skipWhitespace();
-                }
-            }
-        }
     public:
         Reader(std::string_view file, std::string_view source)
             : BasicParser(file, source) {
+            hashComment = true;
         }
 
         void read(langs::Lang& lang, const std::string &prefix) {
diff --git a/test/coders/yaml.cpp b/test/coders/yaml.cpp
new file mode 100644
index 00000000..272b3b0e
--- /dev/null
+++ b/test/coders/yaml.cpp
@@ -0,0 +1,30 @@
+#include <gtest/gtest.h>
+
+#include <filesystem>
+#include <iostream>
+#include <memory>
+
+#include "coders/yaml.hpp"
+#include "coders/json.hpp"
+#include "coders/commons.hpp"
+
+#include "io/io.hpp"
+#include "io/devices/StdfsDevice.hpp"
+
+namespace fs = std::filesystem;
+
+// Smoke test: parses a workflow file from the repository and prints it as
+// JSON; it fails only if the parser throws
+TEST(YAML, EncodeDecode) {
+    io::set_device("root", std::make_shared<io::StdfsDevice>(fs::u8path("../../")));
+    auto filename = "root:.github/workflows/windows-clang.yml";
+    try {
+        auto value = yaml::parse(io::read_string(filename));
+        std::cout << json::stringify(value, true) << std::endl;
+    } catch (const parsing_error& error) {
+        std::cerr << error.errorLog() << std::endl;
+        // rethrow the caught object itself; `throw error;` would throw a
+        // copy of static type parsing_error
+        throw;
+    }
+}