From ff6b5785ae0c8e277bca346f3a0c31a067e64552 Mon Sep 17 00:00:00 2001 From: Karutoh Date: Tue, 29 Apr 2025 00:24:13 -0700 Subject: [PATCH] Improved the tokenizer. --- include/arctyx/compiler/Compiler.h | 4 +- include/arctyx/compiler/Language.h | 9 +- include/arctyx/compiler/Token.h | 5 +- src/Arctyx.cpp | 14 ++- src/ArctyxLang.cpp | 6 +- src/compiler/Compiler.cpp | 148 +++++++++++++++++++---------- src/compiler/Language.cpp | 36 ++++--- 7 files changed, 148 insertions(+), 74 deletions(-) diff --git a/include/arctyx/compiler/Compiler.h b/include/arctyx/compiler/Compiler.h index 5df8c67..321370b 100644 --- a/include/arctyx/compiler/Compiler.h +++ b/include/arctyx/compiler/Compiler.h @@ -30,6 +30,8 @@ public: ehs::Array Compile(const ehs::Str_8 &code) const; private: + static bool IsEOL(const ehs::Array &eols, const ehs::Char_8 *c); + static bool IsSeparator(const ehs::Array &separators, const ehs::Char_8 *c); static bool IsPrimitive(const ehs::Array &primitives, const ehs::Str_8 &value); @@ -38,7 +40,5 @@ private: static const Operator *IsOperator(const ehs::Array &operators, const ehs::Str_8 &value); - Token ParseValue(const ehs::Array &primitives, const ehs::Array &keywords, const ehs::Array &operators, const ehs::Str_8 &value) const; - ehs::Vector Parse(const ehs::Str_8 &code) const; }; \ No newline at end of file diff --git a/include/arctyx/compiler/Language.h b/include/arctyx/compiler/Language.h index 0f17a35..1524241 100644 --- a/include/arctyx/compiler/Language.h +++ b/include/arctyx/compiler/Language.h @@ -17,7 +17,7 @@ private: ehs::UInt_64 id; ehs::Str_8 name; ehs::Version version; - ehs::Char_8 eol; + ehs::Array eols; ehs::Array separators; ehs::Array primitives; ehs::Array keywords; @@ -45,9 +45,11 @@ public: ehs::Version GetVersion() const; - void SetEOL(const ehs::Char_8 &newEOL); + ehs::Array GetEOLs() const; - ehs::Char_8 GetEOL() const; + bool HasEOL(const ehs::Char_8 &eol) const; + + bool AddEOL(const ehs::Char_8 &eol); ehs::Array GetSeparators() const; @@ -77,7 +79,6 @@ public: bool HasOperator(const ehs::Str_8 &delimeter) const; - const Operator *GetOperator(const ehs::Str_8 &delimeter) const; bool AddOperator(Operator primitive); diff --git a/include/arctyx/compiler/Token.h b/include/arctyx/compiler/Token.h index d9be08c..94ed7d7 100644 --- a/include/arctyx/compiler/Token.h +++ b/include/arctyx/compiler/Token.h @@ -5,7 +5,10 @@ enum class TokenT : ehs::UInt_8 { UNKNOWN, - VALUE, + SEPARATOR, + NUMBER, + STRING, + CHARACTER, KEYWORD, TYPE, IDENTIFIER, diff --git a/src/Arctyx.cpp b/src/Arctyx.cpp index 4d58eb7..1b1f210 100644 --- a/src/Arctyx.cpp +++ b/src/Arctyx.cpp @@ -79,13 +79,13 @@ void Arctyx::LoadLanguagePlugins() if (!initFunc()) { - ehs::Console::Write_8("Failed to initialize plugin " + pluginName + " v" + pluginVersion.major + "." + + ehs::Console::Write_8("Failed to load plugin " + pluginName + " v" + pluginVersion.major + "." + pluginVersion.minor + "." + pluginVersion.patch); continue; } - ehs::Console::Write_8("Successfully initialized plugin " + pluginName + " v" + pluginVersion.major + "." + + ehs::Console::Write_8("Successfully loaded plugin " + pluginName + " v" + pluginVersion.major + "." + pluginVersion.minor + "." + pluginVersion.patch); plugins.Push((ehs::Open &&)plugin); @@ -94,8 +94,12 @@ void Arctyx::LoadLanguagePlugins() void Arctyx::Initialize() { + ehs::Console::Write_8("Loading plugins..."); + LoadArchitecturePlugins(); LoadLanguagePlugins(); + + ehs::Console::Write_8(""); } void Arctyx::Uninitialize() @@ -110,6 +114,8 @@ void Arctyx::Uninitialize() Language::languages.Clear(); + ehs::Console::Write_8("\nUnloading plugins..."); + for (ehs::UInt_64 i = 0; i < plugins.Size(); ++i) { GetPluginName nameFunc = (GetPluginName)plugins[i].Retrieve("_Z13GetPluginNamev"); @@ -121,11 +127,11 @@ void Arctyx::Uninitialize() if (!shutFunc) { - ehs::Console::Write_8("Failed to shutdown plugin " + pluginName + " v" + pluginVersion.major + "." + + ehs::Console::Write_8("Failed to unload plugin " + pluginName + " v" + pluginVersion.major + "." + pluginVersion.minor + "." + pluginVersion.patch); } - ehs::Console::Write_8("successfully shutdown plugin " + pluginName + " v" + pluginVersion.major + "." + + ehs::Console::Write_8("successfully unloaded plugin " + pluginName + " v" + pluginVersion.major + "." + pluginVersion.minor + "." + pluginVersion.patch); } diff --git a/src/ArctyxLang.cpp b/src/ArctyxLang.cpp index 462d843..48d58f6 100644 --- a/src/ArctyxLang.cpp +++ b/src/ArctyxLang.cpp @@ -15,10 +15,10 @@ bool InitializePlugin() Language arctyx("Arctyx", GetPluginVersion()); - arctyx.AddSeparator(' '); - arctyx.AddSeparator('\t'); + arctyx.AddSeparator(','); - arctyx.SetEOL('\n'); + arctyx.AddEOL('\n'); + arctyx.AddEOL(';'); arctyx.AddPrimitive({"Byte", 1, Signedness::UNSIGNED}); arctyx.AddPrimitive({"Char_8", 1, Signedness::UNSIGNED}); diff --git a/src/compiler/Compiler.cpp b/src/compiler/Compiler.cpp index c9a2c28..d5dcfda 100644 --- a/src/compiler/Compiler.cpp +++ b/src/compiler/Compiler.cpp @@ -1,5 +1,8 @@ #include "arctyx/compiler/Compiler.h" +#include +#include + Compiler::Compiler() : architecture(nullptr), language(nullptr) { @@ -65,13 +68,51 @@ Compiler& Compiler::operator=(const Compiler& other) ehs::Array Compiler::Compile(const ehs::Str_8 &code) const { + ehs::Console::Write_8("Code:"); + ehs::Console::Write_8(code); + + ehs::Console::Write_8("Tokens:"); ehs::Vector tokens = Parse(code); + for (ehs::UInt_64 i = 0; i < tokens.Size(); ++i) + { + if (tokens[i].GetType() == TokenT::UNKNOWN) + ehs::Console::Write_8("UNKNOWN, ", false); + else if (tokens[i].GetType() == TokenT::SEPARATOR) + ehs::Console::Write_8("SEPARATOR, ", false); + else if (tokens[i].GetType() == TokenT::NUMBER) + ehs::Console::Write_8("NUMBER, ", false); + else if (tokens[i].GetType() == TokenT::STRING) + ehs::Console::Write_8("STRING, ", false); + else if (tokens[i].GetType() == TokenT::CHARACTER) + ehs::Console::Write_8("CHARACTER, ", false); + else if (tokens[i].GetType() == TokenT::KEYWORD) + ehs::Console::Write_8("KEYWORD, ", false); + else if (tokens[i].GetType() == TokenT::TYPE) + ehs::Console::Write_8("TYPE, ", false); + else if (tokens[i].GetType() == TokenT::IDENTIFIER) + ehs::Console::Write_8("IDENTIFIER, ", false); + else if (tokens[i].GetType() == TokenT::UNARY_OPERATOR) + ehs::Console::Write_8("UNARY_OPERATOR, ", false); + else if (tokens[i].GetType() == TokenT::COMPOUND_OPERATOR) + ehs::Console::Write_8("COMPOUND_OPERATOR, ", false); + else if (tokens[i].GetType() == TokenT::EOL) + ehs::Console::Write_8("EOL"); + } ehs::Array machineCode; return machineCode; } +bool Compiler::IsEOL(const ehs::Array& eols, const ehs::Char_8* c) +{ + for (ehs::UInt_64 s = 0; s < eols.Size(); ++s) + if (*c == eols[s]) + return true; + + return false; +} + bool Compiler::IsSeparator(const ehs::Array &separators, const ehs::Char_8 *c) { for (ehs::UInt_64 s = 0; s < separators.Size(); ++s) @@ -108,33 +149,11 @@ const Operator *Compiler::IsOperator(const ehs::Array& operators, cons return nullptr; } -Token Compiler::ParseValue(const ehs::Array &primitives, const ehs::Array &keywords, const ehs::Array &operators, const ehs::Str_8& value) const -{ - if (IsPrimitive(primitives, value)) - return {TokenT::TYPE, value}; - - if (IsKeyword(keywords, value)) - return {TokenT::KEYWORD, value}; - - const Operator *op = IsOperator(operators, value); - if (op) - { - if (op->IsUnary()) - return {TokenT::UNARY_OPERATOR, value}; - else - return {TokenT::COMPOUND_OPERATOR, value}; - } - - if (value[0] >= '0' && value[0] <= '9') - return {TokenT::VALUE, value}; - - return {TokenT::IDENTIFIER, value}; -} - ehs::Vector Compiler::Parse(const ehs::Str_8 &code) const { ehs::Vector tokens; + const ehs::Array eols = language->GetEOLs(); const ehs::Array separators = language->GetSeparators(); const ehs::Array primitives = language->GetPrimitives(); const ehs::Array keywords = language->GetKeywords(); @@ -142,45 +161,78 @@ ehs::Vector Compiler::Parse(const ehs::Str_8 &code) const for (ehs::Char_8 *i = &code[0], *start = i; i < &code[code.Size()]; ++i) { - if (*i == language->GetEOL()) - { - if (start != i) - tokens.Push(ParseValue(primitives, keywords, operators, ehs::Str_8(start, i - start))); - - tokens.Push({TokenT::EOL, {i, 1}}); - - start = i + 1; - - continue; - } - if (*start == '\"') { - if (i != start && *i == '\"') + if (*i == '\"') { - tokens.Push({TokenT::VALUE, {start, (ehs::UInt_64)(i - (start - 1))}}); + ++start; + tokens.Push({TokenT::STRING, ehs::Str_8(start, i - 1 - start)}); + + if (i + 1 < &code[code.Size()]) + start = i + 1; + } + } + else if (*start == '\'') + { + if (*i == '\'') + { + ++start; + + const ehs::UInt_64 size = i - 1 - start; + if (size > 1) + { + EHS_LOG(ehs::LogType::ERR, 0, "Characters cannot extend to more than one."); + + return {}; + } + + tokens.Push({TokenT::CHARACTER, {start, size}}); + + if (i + 1 < &code[code.Size()]) + start = i + 1; + } + } + else if (const bool isEOL = IsEOL(eols, i), isSep = IsSeparator(separators, i); *i == ' ' || *i == '\t' || isEOL || isSep) + { + if ((*start == ' ' || *start == '\t') && i + 1 < &code[code.Size()]) + { start = i + 1; continue; } - continue; - } + const ehs::Str_8 value(start, i - start); - if (!IsSeparator(separators, i)) - continue; + if (value.Size()) + { + if (IsPrimitive(primitives, value)) + tokens.Push({TokenT::TYPE, value}); + else if (IsKeyword(keywords, value)) + tokens.Push({TokenT::KEYWORD, value}); + else if (const Operator *op = IsOperator(operators, value); op) + { + if (op->IsUnary()) + tokens.Push({TokenT::UNARY_OPERATOR, value}); + else + tokens.Push({TokenT::COMPOUND_OPERATOR, value}); + } + else if (value[0] >= '0' && value[0] <= '9') + tokens.Push({TokenT::NUMBER, value}); + else + tokens.Push({TokenT::IDENTIFIER, value}); + } + + if (isEOL) + tokens.Push({TokenT::EOL, {i, 1}}); + else if (isSep) + tokens.Push({TokenT::SEPARATOR, {i, 1}}); - if (i - (start - 1) == 1 && IsSeparator(separators, start)) - { start = i + 1; - continue; } - - tokens.Push(ParseValue(primitives, keywords, operators, ehs::Str_8(start, i - start))); - - start = i + 1; } + EHS_LOG_SUCCESS(); + return tokens; } diff --git a/src/compiler/Language.cpp b/src/compiler/Language.cpp index 2881dbf..9603ca2 100644 --- a/src/compiler/Language.cpp +++ b/src/compiler/Language.cpp @@ -3,28 +3,27 @@ ehs::Array Language::languages; Language::Language() - : id(0), eol('\n') + : id(0) { } Language::Language(ehs::Str_8 name, const ehs::Version& version) - : id(name.Hash_64()), name((ehs::Str_8 &&)name), version(version), eol('\n') + : id(name.Hash_64()), name((ehs::Str_8 &&)name), version(version) { } Language::Language(Language&& lang) noexcept - : id(lang.id), name((ehs::Str_8 &&)lang.name), version(lang.version), eol(lang.eol), + : id(lang.id), name((ehs::Str_8 &&)lang.name), version(lang.version), eols((ehs::Array &&)lang.eols), separators((ehs::Array &&)lang.separators), primitives((ehs::Array &&)lang.primitives), keywords((ehs::Array &&)lang.keywords), operators((ehs::Array &&)lang.operators), interpretations((ehs::Array &&)lang.interpretations) { lang.id = 0; lang.version = {}; - lang.eol = '\n'; } Language::Language(const Language& lang) - : id(lang.id), name(lang.name), version(lang.version), eol(lang.eol), separators(lang.separators), + : id(lang.id), name(lang.name), version(lang.version), eols(lang.eols), separators(lang.separators), primitives(lang.primitives), keywords(lang.keywords), operators(lang.operators), interpretations(lang.interpretations) { @@ -38,7 +37,7 @@ Language& Language::operator=(Language&& lang) noexcept id = lang.id; name = (ehs::Str_8 &&)lang.name; version = lang.version; - eol = lang.eol; + eols = (ehs::Array &&)lang.eols; separators = (ehs::Array &&)lang.separators; primitives = (ehs::Array &&)lang.primitives; keywords = (ehs::Array &&)lang.keywords; @@ -47,7 +46,6 @@ Language& Language::operator=(Language&& lang) noexcept lang.id = 0; lang.version = {}; - lang.eol = '\n'; return *this; } @@ -60,7 +58,7 @@ Language& Language::operator=(const Language& lang) id = lang.id; name = lang.name; version = lang.version; - eol = lang.eol; + eols = lang.eols; separators = lang.separators; primitives = lang.primitives; keywords = lang.keywords; @@ -85,14 +83,28 @@ ehs::Version Language::GetVersion() const return version; } -void Language::SetEOL(const ehs::Char_8& newEOL) +ehs::Array Language::GetEOLs() const { - eol = newEOL; + return eols; } -ehs::Char_8 Language::GetEOL() const +bool Language::HasEOL(const ehs::Char_8 &eol) const { - return eol; + for (ehs::UInt_64 i = 0; i < eols.Size(); ++i) + if (eols[i] == eol) + return true; + + return false; +} + +bool Language::AddEOL(const ehs::Char_8 &eol) +{ + if (HasEOL(eol)) + return false; + + eols.Push(eol); + + return true; } ehs::Array Language::GetSeparators() const