Improved the tokenizer.

This commit is contained in:
Karutoh 2025-04-29 00:24:13 -07:00
parent 0fc10f4c76
commit ff6b5785ae
7 changed files with 148 additions and 74 deletions

View File

@ -30,6 +30,8 @@ public:
ehs::Array<ehs::Byte> Compile(const ehs::Str_8 &code) const; ehs::Array<ehs::Byte> Compile(const ehs::Str_8 &code) const;
private: private:
static bool IsEOL(const ehs::Array<ehs::Char_8> &eols, const ehs::Char_8 *c);
static bool IsSeparator(const ehs::Array<ehs::Char_8> &separators, const ehs::Char_8 *c); static bool IsSeparator(const ehs::Array<ehs::Char_8> &separators, const ehs::Char_8 *c);
static bool IsPrimitive(const ehs::Array<Primitive> &primitives, const ehs::Str_8 &value); static bool IsPrimitive(const ehs::Array<Primitive> &primitives, const ehs::Str_8 &value);
@ -38,7 +40,5 @@ private:
static const Operator *IsOperator(const ehs::Array<Operator> &operators, const ehs::Str_8 &value); static const Operator *IsOperator(const ehs::Array<Operator> &operators, const ehs::Str_8 &value);
Token ParseValue(const ehs::Array<Primitive> &primitives, const ehs::Array<ehs::Str_8> &keywords, const ehs::Array<Operator> &operators, const ehs::Str_8 &value) const;
ehs::Vector<Token> Parse(const ehs::Str_8 &code) const; ehs::Vector<Token> Parse(const ehs::Str_8 &code) const;
}; };

View File

@ -17,7 +17,7 @@ private:
ehs::UInt_64 id; ehs::UInt_64 id;
ehs::Str_8 name; ehs::Str_8 name;
ehs::Version version; ehs::Version version;
ehs::Char_8 eol; ehs::Array<ehs::Char_8> eols;
ehs::Array<ehs::Char_8> separators; ehs::Array<ehs::Char_8> separators;
ehs::Array<Primitive> primitives; ehs::Array<Primitive> primitives;
ehs::Array<ehs::Str_8> keywords; ehs::Array<ehs::Str_8> keywords;
@ -45,9 +45,11 @@ public:
ehs::Version GetVersion() const; ehs::Version GetVersion() const;
void SetEOL(const ehs::Char_8 &newEOL); ehs::Array<ehs::Char_8> GetEOLs() const;
ehs::Char_8 GetEOL() const; bool HasEOL(const ehs::Char_8 &eol) const;
bool AddEOL(const ehs::Char_8 &eol);
ehs::Array<ehs::Char_8> GetSeparators() const; ehs::Array<ehs::Char_8> GetSeparators() const;
@ -77,7 +79,6 @@ public:
bool HasOperator(const ehs::Str_8 &delimeter) const; bool HasOperator(const ehs::Str_8 &delimeter) const;
const Operator *GetOperator(const ehs::Str_8 &delimeter) const; const Operator *GetOperator(const ehs::Str_8 &delimeter) const;
bool AddOperator(Operator primitive); bool AddOperator(Operator primitive);

View File

@ -5,7 +5,10 @@
enum class TokenT : ehs::UInt_8 enum class TokenT : ehs::UInt_8
{ {
UNKNOWN, UNKNOWN,
VALUE, SEPARATOR,
NUMBER,
STRING,
CHARACTER,
KEYWORD, KEYWORD,
TYPE, TYPE,
IDENTIFIER, IDENTIFIER,

View File

@ -79,13 +79,13 @@ void Arctyx::LoadLanguagePlugins()
if (!initFunc()) if (!initFunc())
{ {
ehs::Console::Write_8("Failed to initialize plugin " + pluginName + " v" + pluginVersion.major + "." + ehs::Console::Write_8("Failed to load plugin " + pluginName + " v" + pluginVersion.major + "." +
pluginVersion.minor + "." + pluginVersion.patch); pluginVersion.minor + "." + pluginVersion.patch);
continue; continue;
} }
ehs::Console::Write_8("Successfully initialized plugin " + pluginName + " v" + pluginVersion.major + "." + ehs::Console::Write_8("Successfully loaded plugin " + pluginName + " v" + pluginVersion.major + "." +
pluginVersion.minor + "." + pluginVersion.patch); pluginVersion.minor + "." + pluginVersion.patch);
plugins.Push((ehs::Open &&)plugin); plugins.Push((ehs::Open &&)plugin);
@ -94,8 +94,12 @@ void Arctyx::LoadLanguagePlugins()
void Arctyx::Initialize() void Arctyx::Initialize()
{ {
ehs::Console::Write_8("Loading plugins...");
LoadArchitecturePlugins(); LoadArchitecturePlugins();
LoadLanguagePlugins(); LoadLanguagePlugins();
ehs::Console::Write_8("");
} }
void Arctyx::Uninitialize() void Arctyx::Uninitialize()
@ -110,6 +114,8 @@ void Arctyx::Uninitialize()
Language::languages.Clear(); Language::languages.Clear();
ehs::Console::Write_8("\nUnloading plugins...");
for (ehs::UInt_64 i = 0; i < plugins.Size(); ++i) for (ehs::UInt_64 i = 0; i < plugins.Size(); ++i)
{ {
GetPluginName nameFunc = (GetPluginName)plugins[i].Retrieve("_Z13GetPluginNamev"); GetPluginName nameFunc = (GetPluginName)plugins[i].Retrieve("_Z13GetPluginNamev");
@ -121,11 +127,11 @@ void Arctyx::Uninitialize()
if (!shutFunc) if (!shutFunc)
{ {
ehs::Console::Write_8("Failed to shutdown plugin " + pluginName + " v" + pluginVersion.major + "." + ehs::Console::Write_8("Failed to unload plugin " + pluginName + " v" + pluginVersion.major + "." +
pluginVersion.minor + "." + pluginVersion.patch); pluginVersion.minor + "." + pluginVersion.patch);
} }
ehs::Console::Write_8("successfully shutdown plugin " + pluginName + " v" + pluginVersion.major + "." + ehs::Console::Write_8("successfully unloaded plugin " + pluginName + " v" + pluginVersion.major + "." +
pluginVersion.minor + "." + pluginVersion.patch); pluginVersion.minor + "." + pluginVersion.patch);
} }

View File

@ -15,10 +15,10 @@ bool InitializePlugin()
Language arctyx("Arctyx", GetPluginVersion()); Language arctyx("Arctyx", GetPluginVersion());
arctyx.AddSeparator(' '); arctyx.AddSeparator(',');
arctyx.AddSeparator('\t');
arctyx.SetEOL('\n'); arctyx.AddEOL('\n');
arctyx.AddEOL(';');
arctyx.AddPrimitive({"Byte", 1, Signedness::UNSIGNED}); arctyx.AddPrimitive({"Byte", 1, Signedness::UNSIGNED});
arctyx.AddPrimitive({"Char_8", 1, Signedness::UNSIGNED}); arctyx.AddPrimitive({"Char_8", 1, Signedness::UNSIGNED});

View File

@ -1,5 +1,8 @@
#include "arctyx/compiler/Compiler.h" #include "arctyx/compiler/Compiler.h"
#include <iterator>
#include <ehs/io/Console.h>
Compiler::Compiler() Compiler::Compiler()
: architecture(nullptr), language(nullptr) : architecture(nullptr), language(nullptr)
{ {
@ -65,13 +68,51 @@ Compiler& Compiler::operator=(const Compiler& other)
ehs::Array<ehs::Byte> Compiler::Compile(const ehs::Str_8 &code) const ehs::Array<ehs::Byte> Compiler::Compile(const ehs::Str_8 &code) const
{ {
ehs::Console::Write_8("Code:");
ehs::Console::Write_8(code);
ehs::Console::Write_8("Tokens:");
ehs::Vector<Token> tokens = Parse(code); ehs::Vector<Token> tokens = Parse(code);
for (ehs::UInt_64 i = 0; i < tokens.Size(); ++i)
{
if (tokens[i].GetType() == TokenT::UNKNOWN)
ehs::Console::Write_8("UNKNOWN, ", false);
else if (tokens[i].GetType() == TokenT::SEPARATOR)
ehs::Console::Write_8("SEPARATOR, ", false);
else if (tokens[i].GetType() == TokenT::NUMBER)
ehs::Console::Write_8("NUMBER, ", false);
else if (tokens[i].GetType() == TokenT::STRING)
ehs::Console::Write_8("STRING, ", false);
else if (tokens[i].GetType() == TokenT::CHARACTER)
ehs::Console::Write_8("CHARACTER, ", false);
else if (tokens[i].GetType() == TokenT::KEYWORD)
ehs::Console::Write_8("KEYWORD, ", false);
else if (tokens[i].GetType() == TokenT::TYPE)
ehs::Console::Write_8("TYPE, ", false);
else if (tokens[i].GetType() == TokenT::IDENTIFIER)
ehs::Console::Write_8("IDENTIFIER, ", false);
else if (tokens[i].GetType() == TokenT::UNARY_OPERATOR)
ehs::Console::Write_8("UNARY_OPERATOR, ", false);
else if (tokens[i].GetType() == TokenT::COMPOUND_OPERATOR)
ehs::Console::Write_8("COMPOUND_OPERATOR, ", false);
else if (tokens[i].GetType() == TokenT::EOL)
ehs::Console::Write_8("EOL");
}
ehs::Array<ehs::Byte> machineCode; ehs::Array<ehs::Byte> machineCode;
return machineCode; return machineCode;
} }
// Reports whether the character pointed to by c matches any entry of eols.
// Returns true on the first match and false when no entry matches
// (including when eols is empty).
bool Compiler::IsEOL(const ehs::Array<ehs::Char_8>& eols, const ehs::Char_8* c)
{
    ehs::UInt_64 idx = 0;

    while (idx < eols.Size())
    {
        // c is only dereferenced while there are entries left to compare against.
        if (eols[idx] == *c)
            return true;

        ++idx;
    }

    return false;
}
bool Compiler::IsSeparator(const ehs::Array<ehs::Char_8> &separators, const ehs::Char_8 *c) bool Compiler::IsSeparator(const ehs::Array<ehs::Char_8> &separators, const ehs::Char_8 *c)
{ {
for (ehs::UInt_64 s = 0; s < separators.Size(); ++s) for (ehs::UInt_64 s = 0; s < separators.Size(); ++s)
@ -108,33 +149,11 @@ const Operator *Compiler::IsOperator(const ehs::Array<Operator>& operators, cons
return nullptr; return nullptr;
} }
// Classifies a raw piece of source text and wraps it in a Token.
// The checks run in a fixed order and the first one that succeeds wins:
// primitive type, keyword, operator (unary or compound), leading decimal digit.
// Text that matches none of them is returned as an identifier.
Token Compiler::ParseValue(const ehs::Array<Primitive> &primitives, const ehs::Array<ehs::Str_8> &keywords, const ehs::Array<Operator> &operators, const ehs::Str_8& value) const
{
    TokenT type = TokenT::IDENTIFIER;

    if (IsPrimitive(primitives, value))
        type = TokenT::TYPE;
    else if (IsKeyword(keywords, value))
        type = TokenT::KEYWORD;
    else if (const Operator *match = IsOperator(operators, value))
        type = match->IsUnary() ? TokenT::UNARY_OPERATOR : TokenT::COMPOUND_OPERATOR;
    // NOTE(review): value[0] is read without a length check; presumably callers
    // never pass an empty string - confirm.
    else if (value[0] >= '0' && value[0] <= '9')
        type = TokenT::VALUE;

    return {type, value};
}
ehs::Vector<Token> Compiler::Parse(const ehs::Str_8 &code) const ehs::Vector<Token> Compiler::Parse(const ehs::Str_8 &code) const
{ {
ehs::Vector<Token> tokens; ehs::Vector<Token> tokens;
const ehs::Array<ehs::Char_8> eols = language->GetEOLs();
const ehs::Array<ehs::Char_8> separators = language->GetSeparators(); const ehs::Array<ehs::Char_8> separators = language->GetSeparators();
const ehs::Array<Primitive> primitives = language->GetPrimitives(); const ehs::Array<Primitive> primitives = language->GetPrimitives();
const ehs::Array<ehs::Str_8> keywords = language->GetKeywords(); const ehs::Array<ehs::Str_8> keywords = language->GetKeywords();
@ -142,45 +161,78 @@ ehs::Vector<Token> Compiler::Parse(const ehs::Str_8 &code) const
for (ehs::Char_8 *i = &code[0], *start = i; i < &code[code.Size()]; ++i) for (ehs::Char_8 *i = &code[0], *start = i; i < &code[code.Size()]; ++i)
{ {
if (*i == language->GetEOL())
{
if (start != i)
tokens.Push(ParseValue(primitives, keywords, operators, ehs::Str_8(start, i - start)));
tokens.Push({TokenT::EOL, {i, 1}});
start = i + 1;
continue;
}
if (*start == '\"') if (*start == '\"')
{ {
if (i != start && *i == '\"') if (*i == '\"')
{ {
tokens.Push({TokenT::VALUE, {start, (ehs::UInt_64)(i - (start - 1))}}); ++start;
tokens.Push({TokenT::STRING, ehs::Str_8(start, i - 1 - start)});
if (i + 1 < &code[code.Size()])
start = i + 1;
}
}
else if (*start == '\'')
{
if (*i == '\'')
{
++start;
const ehs::UInt_64 size = i - 1 - start;
if (size > 1)
{
EHS_LOG(ehs::LogType::ERR, 0, "Characters cannot extend to more than one.");
return {};
}
tokens.Push({TokenT::CHARACTER, {start, size}});
if (i + 1 < &code[code.Size()])
start = i + 1;
}
}
else if (const bool isEOL = IsEOL(eols, i), isSep = IsSeparator(separators, i); *i == ' ' || *i == '\t' || isEOL || isSep)
{
if ((*start == ' ' || *start == '\t') && i + 1 < &code[code.Size()])
{
start = i + 1; start = i + 1;
continue; continue;
} }
continue; const ehs::Str_8 value(start, i - start);
}
if (!IsSeparator(separators, i)) if (value.Size())
continue; {
if (IsPrimitive(primitives, value))
tokens.Push({TokenT::TYPE, value});
else if (IsKeyword(keywords, value))
tokens.Push({TokenT::KEYWORD, value});
else if (const Operator *op = IsOperator(operators, value); op)
{
if (op->IsUnary())
tokens.Push({TokenT::UNARY_OPERATOR, value});
else
tokens.Push({TokenT::COMPOUND_OPERATOR, value});
}
else if (value[0] >= '0' && value[0] <= '9')
tokens.Push({TokenT::NUMBER, value});
else
tokens.Push({TokenT::IDENTIFIER, value});
}
if (isEOL)
tokens.Push({TokenT::EOL, {i, 1}});
else if (isSep)
tokens.Push({TokenT::SEPARATOR, {i, 1}});
if (i - (start - 1) == 1 && IsSeparator(separators, start))
{
start = i + 1; start = i + 1;
continue;
} }
tokens.Push(ParseValue(primitives, keywords, operators, ehs::Str_8(start, i - start)));
start = i + 1;
} }
EHS_LOG_SUCCESS();
return tokens; return tokens;
} }

View File

@ -3,28 +3,27 @@
ehs::Array<const Language *> Language::languages; ehs::Array<const Language *> Language::languages;
Language::Language() Language::Language()
: id(0), eol('\n') : id(0)
{ {
} }
Language::Language(ehs::Str_8 name, const ehs::Version& version) Language::Language(ehs::Str_8 name, const ehs::Version& version)
: id(name.Hash_64()), name((ehs::Str_8 &&)name), version(version), eol('\n') : id(name.Hash_64()), name((ehs::Str_8 &&)name), version(version)
{ {
} }
Language::Language(Language&& lang) noexcept Language::Language(Language&& lang) noexcept
: id(lang.id), name((ehs::Str_8 &&)lang.name), version(lang.version), eol(lang.eol), : id(lang.id), name((ehs::Str_8 &&)lang.name), version(lang.version), eols((ehs::Array<ehs::Char_8> &&)lang.eols),
separators((ehs::Array<ehs::Char_8> &&)lang.separators), primitives((ehs::Array<Primitive> &&)lang.primitives), separators((ehs::Array<ehs::Char_8> &&)lang.separators), primitives((ehs::Array<Primitive> &&)lang.primitives),
keywords((ehs::Array<ehs::Str_8> &&)lang.keywords), operators((ehs::Array<Operator> &&)lang.operators), keywords((ehs::Array<ehs::Str_8> &&)lang.keywords), operators((ehs::Array<Operator> &&)lang.operators),
interpretations((ehs::Array<Interpretation> &&)lang.interpretations) interpretations((ehs::Array<Interpretation> &&)lang.interpretations)
{ {
lang.id = 0; lang.id = 0;
lang.version = {}; lang.version = {};
lang.eol = '\n';
} }
Language::Language(const Language& lang) Language::Language(const Language& lang)
: id(lang.id), name(lang.name), version(lang.version), eol(lang.eol), separators(lang.separators), : id(lang.id), name(lang.name), version(lang.version), eols(lang.eols), separators(lang.separators),
primitives(lang.primitives), keywords(lang.keywords), operators(lang.operators), primitives(lang.primitives), keywords(lang.keywords), operators(lang.operators),
interpretations(lang.interpretations) interpretations(lang.interpretations)
{ {
@ -38,7 +37,7 @@ Language& Language::operator=(Language&& lang) noexcept
id = lang.id; id = lang.id;
name = (ehs::Str_8 &&)lang.name; name = (ehs::Str_8 &&)lang.name;
version = lang.version; version = lang.version;
eol = lang.eol; eols = (ehs::Array<ehs::Char_8> &&)lang.eols;
separators = (ehs::Array<ehs::Char_8> &&)lang.separators; separators = (ehs::Array<ehs::Char_8> &&)lang.separators;
primitives = (ehs::Array<Primitive> &&)lang.primitives; primitives = (ehs::Array<Primitive> &&)lang.primitives;
keywords = (ehs::Array<ehs::Str_8> &&)lang.keywords; keywords = (ehs::Array<ehs::Str_8> &&)lang.keywords;
@ -47,7 +46,6 @@ Language& Language::operator=(Language&& lang) noexcept
lang.id = 0; lang.id = 0;
lang.version = {}; lang.version = {};
lang.eol = '\n';
return *this; return *this;
} }
@ -60,7 +58,7 @@ Language& Language::operator=(const Language& lang)
id = lang.id; id = lang.id;
name = lang.name; name = lang.name;
version = lang.version; version = lang.version;
eol = lang.eol; eols = lang.eols;
separators = lang.separators; separators = lang.separators;
primitives = lang.primitives; primitives = lang.primitives;
keywords = lang.keywords; keywords = lang.keywords;
@ -85,14 +83,28 @@ ehs::Version Language::GetVersion() const
return version; return version;
} }
void Language::SetEOL(const ehs::Char_8& newEOL) ehs::Array<ehs::Char_8> Language::GetEOLs() const
{ {
eol = newEOL; return eols;
} }
ehs::Char_8 Language::GetEOL() const bool Language::HasEOL(const ehs::Char_8 &eol) const
{ {
return eol; for (ehs::UInt_64 i = 0; i < eols.Size(); ++i)
if (eols[i] == eol)
return true;
return false;
}
// Registers eol as an end-of-line character for this language.
// Returns false and leaves the list untouched when HasEOL already reports it,
// otherwise appends it and returns true.
bool Language::AddEOL(const ehs::Char_8 &eol)
{
    const bool alreadyRegistered = HasEOL(eol);

    if (!alreadyRegistered)
        eols.Push(eol);

    return !alreadyRegistered;
}
ehs::Array<ehs::Char_8> Language::GetSeparators() const ehs::Array<ehs::Char_8> Language::GetSeparators() const