better structure etc

This commit is contained in:
2025-11-05 23:21:29 +03:00
parent 595fdbe653
commit dcd33c9578
7 changed files with 14 additions and 279 deletions

252
src/lexer.h Normal file
View File

@@ -0,0 +1,252 @@
#include <assert.h>
#include <ctype.h>
#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Token kinds produced by the lexer.
 * The numeric values are spelled out; they match plain declaration order.
 */
typedef enum {
    TOKEN_PLUS       = 0,  /* '+' */
    TOKEN_MINUS      = 1,  /* '-' */
    TOKEN_INTEGER    = 2,  /* digits without a '.' */
    TOKEN_FLOAT      = 3,  /* digits containing at least one '.' */
    TOKEN_SPACE      = 4,  /* ' ' */
    TOKEN_STRING     = 5,  /* double-quoted text */
    TOKEN_IDENTIFIER = 6,  /* run of alphabetic characters */
    TOKEN_MUL        = 7,  /* '*' */
    TOKEN_DIV        = 8,  /* '/' */
    TOKEN_UNKNOWN    = 9,  /* any character the lexer does not recognise */
    TOKEN_EOF        = 10, /* end-of-input marker */
    TOKEN_NEWLINE    = 11, /* '\n' */
    TOKEN_LPAREN     = 12, /* '(' */
    TOKEN_RPAREN     = 13, /* ')' */
    TOKEN_COMMA      = 14, /* ',' */
    TOKEN_LCURLY     = 15, /* '{' */
    TOKEN_RCURLY     = 16, /* '}' */
    TOKEN_COLON      = 17, /* ':' */
    TOKEN_SEMI       = 18  /* ';' */
} symbols;
/*
 * Behaviour tag stored alongside every token.
 * The numeric values are spelled out; they match plain declaration order.
 */
typedef enum {
    BHV_STACK     = 0, /* operators and punctuation */
    BHV_UNDEFINED = 1, /* space, newline, unknown characters, EOF */
    BHV_NUMBER    = 2, /* integer literals */
    BHV_STRING    = 3, /* string literals */
    BHV_FLOAT     = 4, /* float literals */
    BHV_IDENT     = 5  /* identifiers */
} symbol_bhv;
/*
 * Returns the enumerator name of a token kind as a string literal.
 *
 * type: the token kind to describe.
 * Returns "UNKNOWN_SYMBOL" for any value that is not a known enumerator.
 * The result points at static storage: do not modify or free it.
 */
char *token_type_to_string(symbols type) {
    static char *const names[] = {
        [TOKEN_PLUS]       = "TOKEN_PLUS",
        [TOKEN_MINUS]      = "TOKEN_MINUS",
        [TOKEN_INTEGER]    = "TOKEN_INTEGER",
        [TOKEN_FLOAT]      = "TOKEN_FLOAT",
        [TOKEN_SPACE]      = "TOKEN_SPACE",
        [TOKEN_STRING]     = "TOKEN_STRING",
        [TOKEN_IDENTIFIER] = "TOKEN_IDENTIFIER",
        [TOKEN_MUL]        = "TOKEN_MUL",
        [TOKEN_DIV]        = "TOKEN_DIV",
        [TOKEN_UNKNOWN]    = "TOKEN_UNKNOWN",
        [TOKEN_EOF]        = "TOKEN_EOF",
        [TOKEN_NEWLINE]    = "TOKEN_NEWLINE",
        [TOKEN_LPAREN]     = "TOKEN_LPAREN",
        [TOKEN_RPAREN]     = "TOKEN_RPAREN",
        [TOKEN_COMMA]      = "TOKEN_COMMA",
        [TOKEN_LCURLY]     = "TOKEN_LCURLY",
        [TOKEN_RCURLY]     = "TOKEN_RCURLY",
        [TOKEN_COLON]      = "TOKEN_COLON",
        [TOKEN_SEMI]       = "TOKEN_SEMI",
    };
    const size_t known = sizeof names / sizeof names[0];
    /* A negative value converts to a huge size_t and fails the range test. */
    const size_t slot = (size_t)type;

    if (slot < known && names[slot] != NULL) {
        return names[slot];
    }
    return "UNKNOWN_SYMBOL";
}
/*
 * Growable list of tokens stored as parallel arrays: index i of every array
 * below describes token number i. `size` entries are in use out of `capacity`
 * allocated slots. Set up with token_init, extended with token_push.
 */
typedef struct {
/* Kind of each token. */
symbols *type;
/* Heap copy of each token's text, made by token_push and freed by token_free. */
char **text;
/* Printable name of each token's kind (result of token_type_to_string). */
char **tktype;
/* strlen() of each token's text. */
size_t *text_len;
/* Behaviour tag supplied by the caller of token_push. */
symbol_bhv *behaviour;
/* cursor_skip value supplied to token_push; the lexer passes the number of input characters consumed. */
unsigned int *cursor_skip;
/* Kind of the token at index i - 1, or TOKEN_UNKNOWN for the first token. */
symbols *previous_token;
/* Number of slots allocated in each array. */
size_t capacity;
/* Number of tokens currently stored. */
size_t size;
} Token;
/*
 * Prepares an empty token list with room for `capacity` tokens.
 *
 * tok:      list to initialise; any previous contents are overwritten, not freed.
 * capacity: number of slots to pre-allocate. Zero is allowed: nothing is
 *           allocated and the arrays stay NULL until the list first grows.
 *
 * Aborts the process if any allocation fails. The check is explicit rather
 * than an assert so it is still performed when NDEBUG is defined, and it
 * covers every array including tktype.
 */
void token_init(Token *tok, size_t capacity) {
    tok->capacity = capacity;
    tok->size = 0;
    tok->type = NULL;
    tok->text = NULL;
    tok->text_len = NULL;
    tok->behaviour = NULL;
    tok->cursor_skip = NULL;
    tok->previous_token = NULL;
    tok->tktype = NULL;

    /* malloc(0) may legitimately return NULL, so do not treat it as a failure. */
    if (capacity == 0) {
        return;
    }

    tok->type = malloc(capacity * sizeof *tok->type);
    tok->text = malloc(capacity * sizeof *tok->text);
    tok->text_len = malloc(capacity * sizeof *tok->text_len);
    tok->behaviour = malloc(capacity * sizeof *tok->behaviour);
    tok->cursor_skip = malloc(capacity * sizeof *tok->cursor_skip);
    tok->previous_token = malloc(capacity * sizeof *tok->previous_token);
    tok->tktype = malloc(capacity * sizeof *tok->tktype);

    if (!tok->type || !tok->text || !tok->text_len || !tok->behaviour ||
        !tok->cursor_skip || !tok->previous_token || !tok->tktype) {
        fprintf(stderr, "token_init: malloc failed\n");
        abort();
    }
}
/*
 * Doubles the capacity of every parallel array in `tok` (or allocates 8 slots
 * when the current capacity is 0). Existing entries are preserved.
 *
 * Aborts the process if any reallocation fails. The check is explicit rather
 * than an assert so it is still performed when NDEBUG is defined, and it
 * covers every array including tktype.
 */
void token_grow(Token *tok) {
    size_t new_capacity = (tok->capacity == 0 ? 8 : tok->capacity * 2);

    /* Overwriting the pointers directly is acceptable only because any
     * failure below terminates the process. */
    tok->type = realloc(tok->type, new_capacity * sizeof *tok->type);
    tok->text = realloc(tok->text, new_capacity * sizeof *tok->text);
    tok->text_len = realloc(tok->text_len, new_capacity * sizeof *tok->text_len);
    tok->behaviour = realloc(tok->behaviour, new_capacity * sizeof *tok->behaviour);
    tok->cursor_skip = realloc(tok->cursor_skip, new_capacity * sizeof *tok->cursor_skip);
    tok->previous_token = realloc(tok->previous_token, new_capacity * sizeof *tok->previous_token);
    tok->tktype = realloc(tok->tktype, new_capacity * sizeof *tok->tktype);

    if (!tok->type || !tok->text || !tok->text_len || !tok->behaviour ||
        !tok->cursor_skip || !tok->previous_token || !tok->tktype) {
        fprintf(stderr, "token_grow: realloc failed\n");
        abort();
    }
    tok->capacity = new_capacity;
}
/*
 * Appends one token to `tok`, growing the list when it is full.
 *
 * tok:         destination list.
 * type:        kind of the new token.
 * text:        NUL-terminated token text; it is copied, so the caller keeps
 *              ownership of its own buffer.
 * behaviour:   behaviour tag stored with the token.
 * cursor_skip: stored with the token (narrowed to unsigned int).
 *
 * Also records the printable kind name and the kind of the preceding token
 * (TOKEN_UNKNOWN for the first entry). Aborts if the text copy cannot be
 * allocated, instead of storing a NULL text pointer.
 */
void token_push(Token *tok, symbols type, const char *text,
                symbol_bhv behaviour, size_t cursor_skip) {
    if (tok->size >= tok->capacity) {
        token_grow(tok);
    }

    /* Copy with malloc/memcpy: portable ISO C, and strlen is computed once. */
    const size_t len = strlen(text);
    char *copy = malloc(len + 1);
    if (!copy) {
        fprintf(stderr, "token_push: malloc failed\n");
        abort();
    }
    memcpy(copy, text, len + 1);

    const size_t i = tok->size;
    tok->type[i] = type;
    tok->text[i] = copy;
    tok->text_len[i] = len;
    tok->behaviour[i] = behaviour;
    tok->cursor_skip[i] = (unsigned int)cursor_skip;
    tok->tktype[i] = token_type_to_string(type);
    tok->previous_token[i] = (i > 0) ? tok->type[i - 1] : TOKEN_UNKNOWN;
    tok->size++;
}
/*
 * Releases everything owned by `tok`: the per-token text copies and all of
 * the parallel arrays, including tktype. The strings tktype points at are
 * string literals and are not freed; only the array itself is.
 *
 * Afterwards the list is left empty with NULL arrays, so a second call is a
 * harmless no-op.
 */
void token_free(Token *tok) {
    for (size_t i = 0; i < tok->size; i++) {
        free(tok->text[i]);
    }
    free(tok->type);
    free(tok->text);
    free(tok->tktype);
    free(tok->text_len);
    free(tok->behaviour);
    free(tok->cursor_skip);
    free(tok->previous_token);

    tok->type = NULL;
    tok->text = NULL;
    tok->tktype = NULL;
    tok->text_len = NULL;
    tok->behaviour = NULL;
    tok->cursor_skip = NULL;
    tok->previous_token = NULL;
    tok->size = 0;
    tok->capacity = 0;
}
/*
 * Parses a base-10 integer from the start of `strint`.
 *
 * Leading whitespace and an optional sign are accepted; parsing stops at the
 * first character that is not part of the number. Returns 0 when no digits
 * are found. Values outside the range of int are clamped to INT_MIN/INT_MAX
 * (atoi has undefined behaviour in that case).
 */
int str_to_int(char *strint) {
    const long parsed = strtol(strint, NULL, 10);

    if (parsed > INT_MAX) {
        return INT_MAX;
    }
    if (parsed < INT_MIN) {
        return INT_MIN;
    }
    return (int)parsed;
}
/*
 * Parses a floating-point number from the start of `strif` with strtof.
 * Returns 0 when no conversion can be performed.
 */
float str_to_float(char *strif) {
    const float parsed = strtof(strif, NULL);
    return parsed;
}
/* Returns a freshly allocated, NUL-terminated copy of span[0..len). Exits on allocation failure. */
static char *read_from_tok_copy_span(const char *span, size_t len) {
    char *copy = malloc(len + 1);
    if (!copy) {
        fprintf(stderr, "read_from_tok: malloc failed\n");
        exit(1);
    }
    memcpy(copy, span, len);
    copy[len] = '\0';
    return copy;
}

/*
 * Lexes exactly one token starting at input[cursor] and appends it to `tok`.
 *
 * tok:    list receiving the token.
 * input:  NUL-terminated source text.
 * cursor: index of the first character to examine.
 *
 * Returns the number of characters consumed, or 0 when input[cursor] is the
 * terminating NUL (no token is pushed in that case).
 *
 * Recognised forms:
 *   - number:     digits and '.' characters; any '.' makes it TOKEN_FLOAT
 *                 (so "1.2.3" is accepted as a single float token);
 *   - string:     text between double quotes, quotes not included in the
 *                 token text; an unterminated string runs to end of input;
 *   - identifier: a run of alphabetic characters;
 *   - single-character operators and punctuation; anything else becomes
 *     TOKEN_UNKNOWN.
 *
 * Token text is copied from the input at its full length, so long literals
 * and identifiers are no longer split at a fixed 63-character limit.
 */
size_t read_from_tok(Token *tok, const char *input, size_t cursor) {
    const size_t start = cursor;

    if (isdigit((unsigned char)input[cursor])) {
        int dots_seen = 0;
        while (isdigit((unsigned char)input[cursor]) || input[cursor] == '.') {
            if (input[cursor] == '.') {
                dots_seen++;
            }
            cursor++;
        }
        const size_t consumed = cursor - start;
        char *text = read_from_tok_copy_span(input + start, consumed);
        token_push(tok, dots_seen == 0 ? TOKEN_INTEGER : TOKEN_FLOAT,
                   text, dots_seen == 0 ? BHV_NUMBER : BHV_FLOAT,
                   consumed);
        free(text);
        return consumed;
    }

    if (input[cursor] == '"') {
        cursor++; /* skip opening quote */
        const size_t body = cursor;
        while (input[cursor] != '"' && input[cursor] != '\0') {
            cursor++;
        }
        const size_t body_len = cursor - body;
        if (input[cursor] == '"') {
            cursor++; /* skip closing quote */
        }
        const size_t consumed = cursor - start;
        char *text = read_from_tok_copy_span(input + body, body_len);
        token_push(tok, TOKEN_STRING, text, BHV_STRING, consumed);
        free(text);
        return consumed;
    }

    if (isalpha((unsigned char)input[cursor])) {
        while (isalpha((unsigned char)input[cursor])) {
            cursor++;
        }
        const size_t consumed = cursor - start;
        char *text = read_from_tok_copy_span(input + start, consumed);
        token_push(tok, TOKEN_IDENTIFIER, text, BHV_IDENT, consumed);
        free(text);
        return consumed;
    }

    /* Single-character tokens and symbols */
    switch (input[cursor]) {
    case '+': token_push(tok, TOKEN_PLUS, "+", BHV_STACK, 1); break;
    case '-': token_push(tok, TOKEN_MINUS, "-", BHV_STACK, 1); break;
    case '*': token_push(tok, TOKEN_MUL, "*", BHV_STACK, 1); break;
    case '/': token_push(tok, TOKEN_DIV, "/", BHV_STACK, 1); break;
    case '{': token_push(tok, TOKEN_LCURLY, "{", BHV_STACK, 1); break;
    case '}': token_push(tok, TOKEN_RCURLY, "}", BHV_STACK, 1); break;
    case ';': token_push(tok, TOKEN_SEMI, ";", BHV_STACK, 1); break;
    case ':': token_push(tok, TOKEN_COLON, ":", BHV_STACK, 1); break;
    case '(': token_push(tok, TOKEN_LPAREN, "(", BHV_STACK, 1); break;
    case ')': token_push(tok, TOKEN_RPAREN, ")", BHV_STACK, 1); break;
    case ',': token_push(tok, TOKEN_COMMA, ",", BHV_STACK, 1); break;
    case ' ':
        /* space tokens are kept; later stages ignore them */
        token_push(tok, TOKEN_SPACE, " ", BHV_UNDEFINED, 1);
        break;
    case '\n':
        /* the token text is the two characters backslash + 'n', not a real newline */
        token_push(tok, TOKEN_NEWLINE, "\\n", BHV_UNDEFINED, 1);
        break;
    case '\0':
        return 0; /* end of input */
    default: {
        const char one[2] = { input[cursor], '\0' };
        token_push(tok, TOKEN_UNKNOWN, one, BHV_UNDEFINED, 1);
        break;
    }
    }

    cursor++; /* symbol cases consume exactly one character */
    return cursor - start;
}
/*
 * Lexes the whole of `input` into a new token list and appends a final
 * TOKEN_EOF entry. The caller owns the returned list and releases it with
 * token_free.
 */
Token tokenize_all(const char *input) {
    Token tokens;
    token_init(&tokens, 8);

    const size_t input_len = strlen(input);
    for (size_t pos = 0; pos < input_len; ) {
        /* each call consumes one token and reports how far it advanced */
        pos += read_from_tok(&tokens, input, pos);
    }

    token_push(&tokens, TOKEN_EOF, "EOF", BHV_UNDEFINED, 0);
    return tokens;
}

219
src/parser.h Normal file
View File

@@ -0,0 +1,219 @@
#include "./lexer.h"
#define NB_IMPLEMENTATION
#include "./nb.h"
/*
 * Binding strength of a binary operator token:
 * 2 for '*' and '/', 1 for '+' and '-', 0 for every other token kind.
 */
int get_prec(symbols op) {
    if (op == TOKEN_MUL || op == TOKEN_DIV) {
        return 2;
    }
    if (op == TOKEN_PLUS || op == TOKEN_MINUS) {
        return 1;
    }
    return 0;
}
// parse
/*
 * True when `op` is one of the left-associative binary operators
 * ('*', '/', '+', '-'); false for every other token kind.
 */
bool is_left_asc(symbols op) {
    return op == TOKEN_MUL || op == TOKEN_DIV ||
           op == TOKEN_PLUS || op == TOKEN_MINUS;
}
/* File-scope pointer to a token list, initially NULL. Nothing in the code visible here reads or assigns it. */
Token *global_tok = NULL;
/* What a symbol-table entry names. */
typedef enum {
    SYM_VAR  = 0, /* identifier not followed by '(' */
    SYM_FUNC = 1  /* identifier immediately followed by '(' */
} SymbolKind;
/* One entry of the symbol table: a named variable or function. */
typedef struct {
/* Symbol name; symbol_lookup compares it with strcmp. build_rpn stores a strdup'd copy here. */
const char* name;
/* Not assigned explicitly anywhere in the visible code; presumably the number of return values. */
size_t ret_count;
/* build_rpn always sets this to 0; presumably the number of parameters. */
size_t arg_count;
/* Not assigned explicitly anywhere in the visible code; presumably the parameter types (at most 16). */
symbols arg_types[16];
/* build_rpn stores TOKEN_EOF for functions and TOKEN_UNKNOWN for variables. */
symbols ret_type;
/* Whether the entry is a variable or a function. */
SymbolKind symbol_kind;
/* build_rpn always sets this to false for the symbols it creates. */
bool builtin;
} Symbol;
// static Symbol builtins[] = {
// { "print", 1, 1, { TOKEN_UNKNOWN }, TOKEN_EOF, SYM_FUNC, true },
// };
/* Growable array of Symbol entries, searched linearly by symbol_lookup. */
typedef struct {
/* Heap array of entries; allocated by symbol_table_init, resized by symbol_table_add. */
Symbol *symbols;
/* Number of entries in use. */
size_t size;
/* Number of entries the array can hold before it must grow. */
size_t capacity;
} SymbolTable;
// static int builtin_num = sizeof(builtins)/sizeof(builtins[0]);
// static SymbolTable global_env = {
// .size = sizeof(builtins)/sizeof(builtins[0]),
// .capacity = sizeof(builtins)/sizeof(builtins[0]),
// .symbols = builtins};
/*
 * Finds the first entry in `table` whose name equals `n` (strcmp).
 * Returns a pointer into the table's own array, or NULL when no entry
 * matches.
 */
Symbol *symbol_lookup(SymbolTable *table, const char *n) {
    size_t idx = 0;
    while (idx < table->size) {
        Symbol *candidate = &table->symbols[idx];
        if (strcmp(n, candidate->name) == 0) {
            return candidate;
        }
        idx++;
    }
    return NULL;
}
// fn add(x: int, y: int) int {
// return x+y;
// }
/*
 * Initialises `table` as empty with room for `initial_capacity` symbols.
 *
 * An initial capacity of 0 is valid: no memory is allocated and the first
 * symbol_table_add performs the allocation. (malloc(0) may return NULL, which
 * must not be mistaken for an out-of-memory condition.)
 *
 * Prints a message and exits with status 1 if the allocation fails or the
 * requested size would overflow size_t.
 */
void symbol_table_init(SymbolTable *table, size_t initial_capacity) {
    table->symbols = NULL;
    table->size = 0;
    table->capacity = 0;

    if (initial_capacity == 0) {
        return;
    }

    if (initial_capacity <= (size_t)-1 / sizeof *table->symbols) {
        table->symbols = malloc(initial_capacity * sizeof *table->symbols);
    }
    if (!table->symbols) {
        fprintf(stderr, "symbol_table_init: malloc failed\n");
        exit(1);
    }
    table->capacity = initial_capacity;
}
/*
 * Appends a copy of `sym` to `table`. When the table is full its capacity is
 * doubled first (or set to 8 if it was 0). Prints a message and exits with
 * status 1 if the table cannot be enlarged.
 */
void symbol_table_add(SymbolTable *table, Symbol sym) {
    const bool is_full = table->size >= table->capacity;

    if (is_full) {
        if (table->capacity == 0) {
            table->capacity = 8;
        } else {
            table->capacity *= 2;
        }
        table->symbols = realloc(table->symbols, table->capacity * sizeof(Symbol));
        if (table->symbols == NULL) {
            fprintf(stderr, "symbol_table_add: realloc failed\n");
            exit(1);
        }
    }

    table->symbols[table->size] = sym;
    table->size += 1;
}
/*
 * Releases the table's entry array and resets the table to the empty state
 * (NULL array, size 0, capacity 0). The name strings referenced by the
 * entries are not freed here.
 */
void symbol_table_free(SymbolTable *table) {
    free(table->symbols);
    *table = (SymbolTable){0};
}
/*
 * Adds `name` to `symtab` unless an entry with that name already exists.
 * Functions are recorded as SYM_FUNC with return type TOKEN_EOF, variables
 * as SYM_VAR with return type TOKEN_UNKNOWN.
 */
static void build_rpn_declare(SymbolTable *symtab, const char *name, bool is_function) {
    if (symbol_lookup(symtab, name)) {
        return;
    }
    char *owned_name = strdup(name);
    if (!owned_name) {
        fprintf(stderr, "build_rpn: strdup failed\n");
        exit(1);
    }
    Symbol sym = {
        .name = owned_name,
        .arg_count = 0,
        .ret_type = is_function ? TOKEN_EOF : TOKEN_UNKNOWN,
        .symbol_kind = is_function ? SYM_FUNC : SYM_VAR,
        .builtin = false
    };
    symbol_table_add(symtab, sym);
}

/* Removes the top entry of a non-empty stack and releases the text it owns. */
static void build_rpn_drop_top(Token *stack) {
    stack->size--;
    free(stack->text[stack->size]);
    stack->text[stack->size] = NULL;
}

/* Appends a copy of the top entry of a non-empty stack to `output`, then removes it. */
static void build_rpn_move_top(Token *stack, Token *output) {
    const size_t top = stack->size - 1;
    token_push(output, stack->type[top], stack->text[top], stack->behaviour[top], 0);
    build_rpn_drop_top(stack);
}

/*
 * Converts an infix token stream to reverse Polish notation using an
 * operator stack (shunting-yard style).
 *
 * inp:    tokens to convert; not modified.
 * symtab: every identifier encountered is registered here if not yet known:
 *         as a function when the very next token is '(', otherwise as a
 *         variable.
 *
 * Returns a new token list, terminated by TOKEN_EOF, that the caller releases
 * with token_free. Literals and variables go straight to the output; function
 * names wait on the stack until their closing ')'. Token kinds not handled
 * below (spaces, newlines, commas, braces, EOF, ...) are skipped.
 *
 * The temporary operator stack, including the text of every entry popped
 * from it, is released before returning.
 */
Token build_rpn(Token *inp, SymbolTable *symtab) {
    Token output;
    Token stack;
    token_init(&output, 16);
    token_init(&stack, 16);

    for (size_t i = 0; i < inp->size; ++i) {
        const symbols type = inp->type[i];
        const char *text = inp->text[i];

        if (type == TOKEN_IDENTIFIER) {
            const bool is_call = i + 1 < inp->size && inp->type[i + 1] == TOKEN_LPAREN;
            build_rpn_declare(symtab, text, is_call);
            /* a call is emitted after its arguments, so park the name on the stack */
            token_push(is_call ? &stack : &output, type, text, inp->behaviour[i], 0);
        } else if (type == TOKEN_LPAREN) {
            token_push(&stack, type, text, inp->behaviour[i], 0);
        } else if (type == TOKEN_RPAREN) {
            while (stack.size > 0 && stack.type[stack.size - 1] != TOKEN_LPAREN) {
                build_rpn_move_top(&stack, &output);
            }
            /* discard the matching '(' if there is one */
            if (stack.size > 0 && stack.type[stack.size - 1] == TOKEN_LPAREN) {
                build_rpn_drop_top(&stack);
            }
            /* a function name directly below the '(' is the callee */
            if (stack.size > 0 && stack.type[stack.size - 1] == TOKEN_IDENTIFIER) {
                build_rpn_move_top(&stack, &output);
            }
        } else if (type == TOKEN_INTEGER || type == TOKEN_FLOAT || type == TOKEN_STRING) {
            token_push(&output, type, text, inp->behaviour[i], 0);
        } else if (is_left_asc(type)) {
            /* every operator here is left-associative, so equal precedence pops too */
            while (stack.size > 0 && stack.type[stack.size - 1] != TOKEN_LPAREN &&
                   get_prec(stack.type[stack.size - 1]) >= get_prec(type)) {
                build_rpn_move_top(&stack, &output);
            }
            token_push(&stack, type, text, inp->behaviour[i], 0);
        }
    }

    while (stack.size > 0) {
        build_rpn_move_top(&stack, &output);
    }
    token_free(&stack);

    token_push(&output, TOKEN_EOF, "EOF", BHV_UNDEFINED, 0);
    return output;
}
/* Prints one line per token in `tk` to stdout: its index, kind name and text. */
void print_token(Token *tk) {
    size_t idx = 0;
    while (idx < tk->size) {
        printf("TokenNum: %zu Type: %s Value: %s\n", idx, tk->tktype[idx], tk->text[idx]);
        idx++;
    }
}
// int main(int argc, char **argv){
// if (argc < 2) return -1;
// const char ts[] = "\"hello\" hi + 2 2.312";
// const char math[] = "print(((1+2)*6)/18)"; // = 1
// const char print[] = "print(\"hello\")";
// const char simple[] = "1 + ( 3 + 3 )/4+4*3";
// char* read = nb_read_file(argv[1]);
// Token tk = tokenize_all(read);
// printf("INPUT: %s\n", read);
// SymbolTable table = {0};
// symbol_table_init(&table, 32);
// Token rpn = build_rpn(&tk, &table);
// print_token(&rpn);
// }

204
src/vm.c Normal file
View File

@@ -0,0 +1,204 @@
#include "parser.h"
#include <string.h>
/*
 * Instruction set of the stack VM.
 * The numeric values are spelled out; they match plain declaration order.
 */
typedef enum {
    OP_PUSH_INT    = 0, /* push the instruction's number as an integer */
    OP_PUSH_FLOAT  = 1, /* push the instruction's number as a float */
    OP_PUSH_STRING = 2, /* push a copy of the instruction's string */
    OP_ADD         = 3, /* pop b, pop a, push a + b */
    OP_SUB         = 4, /* pop b, pop a, push a - b */
    OP_MUL         = 5, /* pop b, pop a, push a * b */
    OP_DIV         = 6, /* pop b, pop a, push a / b */
    OP_PRINT       = 7, /* pop a value and print it */
    OP_HALT        = 8  /* stop execution */
} OPcode;
/* One VM instruction: an opcode plus its optional operand. */
typedef struct {
/* Operation to perform. */
OPcode op;
/* Numeric operand; rpn_to_bytecode sets it for OP_PUSH_INT and OP_PUSH_FLOAT. */
double num;
/* String operand; rpn_to_bytecode stores a strdup'd copy for OP_PUSH_STRING, NULL otherwise. */
char *strlit;
} instruct;
/* Discriminator telling which member of a Value's union is in use. */
typedef enum {
    VAL_INT    = 0, /* integer value */
    VAL_FLOAT  = 1, /* floating-point value */
    VAL_STRING = 2  /* string value */
} ValueType;
/* A runtime value on the VM stack: a type tag plus the matching union member. */
typedef struct {
/* Selects which union member below holds the value. */
ValueType type;
union {
/* Used when type is VAL_INT. */
long i;
/* Used when type is VAL_FLOAT. */
double f;
/* Used when type is VAL_STRING; vm_run stores a strdup'd copy and frees it when the value is printed. */
char *s;
};
} Value;
/* State of the stack-based virtual machine executed by vm_run. */
typedef struct {
/* Instructions to execute. */
instruct *program;
/* Index of the next instruction to fetch. */
size_t inst_p;
/* Number of instructions in `program`. */
size_t program_size;
/* Fixed-size value stack. */
Value stack[256];
/* Number of values currently on the stack (index of the next free slot). */
size_t st_p;
/* Execution continues while this is true; vm_run clears it on OP_HALT or on an error. */
bool running;
} VM;
/*
 * Translates an RPN token list into a VM program.
 *
 * rpn: tokens in reverse Polish order.
 * out: receives the number of instructions produced.
 *
 * Returns a heap-allocated instruction array owned by the caller, who frees
 * each instruction's strlit and then the array itself.
 *
 * Mapping: integer/float/string literals become push instructions, the four
 * arithmetic operators become their opcodes, the identifier "print" becomes
 * OP_PRINT and TOKEN_EOF becomes OP_HALT. Other token kinds produce nothing.
 * An unknown identifier prints a warning and is skipped; it no longer emits a
 * zero-initialised instruction, which would have executed as OP_PUSH_INT 0.
 *
 * Prints a message and exits with status 1 when memory cannot be allocated.
 */
instruct *rpn_to_bytecode(Token *rpn, size_t *out) {
    size_t cap = 64;
    size_t size = 0;
    instruct *prog = malloc(cap * sizeof *prog);
    if (!prog) {
        fprintf(stderr, "rpn_to_bytecode: malloc failed\n");
        exit(1);
    }

    for (size_t i = 0; i < rpn->size; ++i) {
        const symbols t = rpn->type[i];
        const char *text = rpn->text[i];
        instruct ins = {0};

        switch (t) {
        case TOKEN_INTEGER: ins.op = OP_PUSH_INT;   ins.num = strtod(text, NULL); break;
        case TOKEN_FLOAT:   ins.op = OP_PUSH_FLOAT; ins.num = strtod(text, NULL); break;
        case TOKEN_STRING:
            ins.op = OP_PUSH_STRING;
            ins.strlit = strdup(text);
            if (!ins.strlit) {
                fprintf(stderr, "rpn_to_bytecode: strdup failed\n");
                exit(1);
            }
            break;
        case TOKEN_PLUS:  ins.op = OP_ADD; break;
        case TOKEN_MINUS: ins.op = OP_SUB; break;
        case TOKEN_MUL:   ins.op = OP_MUL; break;
        case TOKEN_DIV:   ins.op = OP_DIV; break;
        case TOKEN_IDENTIFIER:
            /* TODO: unhardcode this */
            if (strcmp(text, "print") != 0) {
                printf("[WARNING] Uknown Identifier '%s'\n", text);
                continue; /* nothing to emit for an unknown identifier */
            }
            ins.op = OP_PRINT;
            break;
        case TOKEN_EOF: ins.op = OP_HALT; break;
        default: continue;
        }

        if (size >= cap) {
            cap *= 2;
            instruct *grown = realloc(prog, cap * sizeof *prog);
            if (!grown) {
                fprintf(stderr, "rpn_to_bytecode: realloc failed\n");
                exit(1);
            }
            prog = grown;
        }
        prog[size++] = ins;
    }

    *out = size;
    return prog;
}
void vm_run(VM *vm) {
vm->running = true;
vm->inst_p = 0;
vm->st_p = 0;
while (vm->running && vm->inst_p < vm->program_size) {
instruct ins = vm->program[vm->inst_p++];
switch (ins.op) {
case OP_PUSH_INT: {
Value v = { .type = VAL_INT, .i = ins.num };
vm->stack[vm->st_p++] = v;
} break;
case OP_PUSH_FLOAT: {
Value v = { .type = VAL_FLOAT, .f = ins.num };
vm->stack[vm->st_p++] = v;
} break;
case OP_PUSH_STRING: {
Value v = { .type = VAL_STRING, .s = strdup(ins.strlit) };
vm->stack[vm->st_p++] = v;
} break;
case OP_ADD:
case OP_SUB:
case OP_MUL:
case OP_DIV: {
if (vm->st_p < 2) {
fprintf(stderr, "not enough values on stack.\n");
vm->running = false;
break;
}
Value b = vm->stack[--vm->st_p];
Value a = vm->stack[--vm->st_p];
double av = (a.type == VAL_INT) ? a.i : a.f;
double bv = (b.type == VAL_INT) ? b.i : b.f;
double result = 0;
switch (ins.op) {
case OP_ADD: result = av + bv; break;
case OP_SUB: result = av - bv; break;
case OP_MUL: result = av * bv; break;
case OP_DIV:
if (bv == 0) {
fprintf(stderr, "division by zero.\n");
vm->running = false;
} else result = av / bv;
break;
default: break;
}
Value v = { .type = VAL_FLOAT, .f = result };
vm->stack[vm->st_p++] = v;
} break;
case OP_PRINT: {
if (vm->st_p == 0) {
fprintf(stderr, "cant print an empty stack\n");
vm->running = false;
break;
}
Value v = vm->stack[--vm->st_p];
switch (v.type) {
case VAL_INT: printf("%ld\n", v.i); break;
case VAL_FLOAT: printf("%g\n", v.f); break;
case VAL_STRING:
printf("%s\n", v.s);
free(v.s);
break;
}
} break;
case OP_HALT:
vm->running = false;
break;
default:
fprintf(stderr, "unknown opcode %d\n", ins.op);
vm->running = false;
break;
}
}
}
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s <source file>\n", argv[0]);
return 1;
}
char* read = nb_read_file(argv[1]);
//printf("INPUT: %s\n", read);
Token tk = tokenize_all(read);
SymbolTable table = {0};
symbol_table_init(&table, 32);
Token rpn = build_rpn(&tk, &table);
//print_token(&rpn);
size_t prog_size = 0;
instruct *prog = rpn_to_bytecode(&rpn, &prog_size);
VM vm = {
.program = prog,
.program_size = prog_size,
.inst_p = 0,
.st_p = 0,
.running = true,
};
vm_run(&vm);
return 0;
}