better structure etc

2025-11-05 23:21:29 +03:00
parent 595fdbe653
commit dcd33c9578
7 changed files with 14 additions and 279 deletions
--- a/src/lexer.h
+++ b/src/lexer.h
@@ -0,0 +1,252 @@
+#include <assert.h>
+#include <ctype.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+typedef enum { // Kinds of token produced by the lexer; values are implicit and start at 0.
+  TOKEN_PLUS, // '+'
+  TOKEN_MINUS, // '-'
+  TOKEN_INTEGER, // run of digits with no '.'
+  TOKEN_FLOAT, // run of digits containing at least one '.'
+  TOKEN_SPACE, // a single ' '
+  TOKEN_STRING, // idx 5; double-quoted text, stored without the quotes
+  TOKEN_IDENTIFIER, // run of alphabetic characters
+  TOKEN_MUL, // '*'
+  TOKEN_DIV, // '/'
+  TOKEN_UNKNOWN, // idx 9; any character the lexer does not recognise
+  TOKEN_EOF, // end-of-input marker appended by tokenize_all
+  TOKEN_NEWLINE, // '\n'
+  TOKEN_LPAREN, // '('
+  TOKEN_RPAREN, // ')'
+  TOKEN_COMMA, // ','
+  TOKEN_LCURLY, // '{'
+  TOKEN_RCURLY, // '}'
+  TOKEN_COLON, // ':'
+  TOKEN_SEMI // ';'
+} symbols;
+
+typedef enum { // Behaviour tag stored alongside each token's type.
+  BHV_STACK, // the lexer uses this for operators and punctuation
+  BHV_UNDEFINED, // the lexer uses this for spaces, newlines, unknown characters and EOF
+  BHV_NUMBER, // integer literals
+  BHV_STRING, // string literals
+  BHV_FLOAT, // float literals
+  BHV_IDENT, // identifiers
+} symbol_bhv;
+
+
+
+// Map a token kind to its enumerator name, e.g. TOKEN_PLUS -> "TOKEN_PLUS".
+// A value that is not one of the enumerators yields "UNKNOWN_SYMBOL".
+// The returned pointer refers to a string literal and must not be freed.
+char *token_type_to_string(symbols type) {
+  static char *const names[] = {
+    [TOKEN_PLUS] = "TOKEN_PLUS",
+    [TOKEN_MINUS] = "TOKEN_MINUS",
+    [TOKEN_INTEGER] = "TOKEN_INTEGER",
+    [TOKEN_FLOAT] = "TOKEN_FLOAT",
+    [TOKEN_SPACE] = "TOKEN_SPACE",
+    [TOKEN_STRING] = "TOKEN_STRING",
+    [TOKEN_IDENTIFIER] = "TOKEN_IDENTIFIER",
+    [TOKEN_MUL] = "TOKEN_MUL",
+    [TOKEN_DIV] = "TOKEN_DIV",
+    [TOKEN_UNKNOWN] = "TOKEN_UNKNOWN",
+    [TOKEN_EOF] = "TOKEN_EOF",
+    [TOKEN_NEWLINE] = "TOKEN_NEWLINE",
+    [TOKEN_LPAREN] = "TOKEN_LPAREN",
+    [TOKEN_RPAREN] = "TOKEN_RPAREN",
+    [TOKEN_COMMA] = "TOKEN_COMMA",
+    [TOKEN_LCURLY] = "TOKEN_LCURLY",
+    [TOKEN_RCURLY] = "TOKEN_RCURLY",
+    [TOKEN_COLON] = "TOKEN_COLON",
+    [TOKEN_SEMI] = "TOKEN_SEMI",
+  };
+  const size_t count = sizeof names / sizeof names[0];
+  const long long idx = (long long)type;
+
+  // Reject negative or too-large values before indexing the table.
+  if (idx >= 0 && (unsigned long long)idx < count && names[idx] != NULL) {
+    return names[idx];
+  }
+  return "UNKNOWN_SYMBOL";
+}
+
+typedef struct { // Growable token list kept as parallel arrays: index i in every array describes token i.
+  symbols *type; // kind of each token
+  char **text; // heap copy of each token's text; token_free releases the copies
+  char **tktype; // printable name of each token's type, as returned by token_type_to_string
+  size_t *text_len; // strlen of the matching text entry
+  symbol_bhv *behaviour; // behaviour tag of each token
+  unsigned int *cursor_skip; // skip count given to token_push (the lexer passes the characters consumed)
+  symbols *previous_token; // type of the preceding token; TOKEN_UNKNOWN for the first one
+  size_t capacity; // number of slots allocated in each array
+  size_t size; // number of slots in use
+} Token;
+
+
+// Set up `tok` as an empty token list with room for `capacity` tokens.
+//
+// All seven parallel arrays, including `tktype`, are allocated and verified.
+// The check is an explicit test rather than assert(), so it still runs when
+// NDEBUG is defined; on failure a message is printed and the process exits.
+// A capacity of 0 is accepted: the arrays stay NULL and token_grow()
+// allocates them on the first push.
+void token_init(Token *tok, size_t capacity) {
+  tok->capacity = capacity;
+  tok->size = 0;
+
+  tok->type = NULL;
+  tok->text = NULL;
+  tok->tktype = NULL;
+  tok->text_len = NULL;
+  tok->behaviour = NULL;
+  tok->cursor_skip = NULL;
+  tok->previous_token = NULL;
+
+  if (capacity == 0) {
+    return; // nothing to allocate; an allocator may return NULL for 0 bytes
+  }
+
+  // calloc() rejects element counts whose byte size would overflow.
+  tok->type = calloc(capacity, sizeof *tok->type);
+  tok->text = calloc(capacity, sizeof *tok->text);
+  tok->tktype = calloc(capacity, sizeof *tok->tktype);
+  tok->text_len = calloc(capacity, sizeof *tok->text_len);
+  tok->behaviour = calloc(capacity, sizeof *tok->behaviour);
+  tok->cursor_skip = calloc(capacity, sizeof *tok->cursor_skip);
+  tok->previous_token = calloc(capacity, sizeof *tok->previous_token);
+
+  if (!tok->type || !tok->text || !tok->tktype || !tok->text_len ||
+      !tok->behaviour || !tok->cursor_skip || !tok->previous_token) {
+    fprintf(stderr, "token_init: allocation failed\n");
+    exit(1);
+  }
+}
+
+// Resize one Token array to `count` elements of `elem_size` bytes each.
+// Prints a message and exits if the byte count would overflow or if
+// realloc() fails, so the caller never sees a NULL result.
+static void *token_grow_array(void *old, size_t count, size_t elem_size) {
+  if (count > (size_t)-1 / elem_size) {
+    fprintf(stderr, "token_grow: capacity overflow\n");
+    exit(1);
+  }
+  void *grown = realloc(old, count * elem_size);
+  if (!grown) {
+    fprintf(stderr, "token_grow: allocation failed\n");
+    exit(1);
+  }
+  return grown;
+}
+
+// Double the capacity of `tok` (or set it to 8 when it is currently 0).
+//
+// Every parallel array, including `tktype`, is resized and verified. The
+// checks do not depend on assert(), so they remain active under NDEBUG.
+void token_grow(Token *tok) {
+  if (tok->capacity > (size_t)-1 / 2) {
+    fprintf(stderr, "token_grow: capacity overflow\n");
+    exit(1);
+  }
+  size_t new_capacity = (tok->capacity == 0 ? 8 : tok->capacity * 2);
+
+  tok->type = token_grow_array(tok->type, new_capacity, sizeof *tok->type);
+  tok->text = token_grow_array(tok->text, new_capacity, sizeof *tok->text);
+  tok->text_len = token_grow_array(tok->text_len, new_capacity, sizeof *tok->text_len);
+  tok->behaviour = token_grow_array(tok->behaviour, new_capacity, sizeof *tok->behaviour);
+  tok->cursor_skip = token_grow_array(tok->cursor_skip, new_capacity, sizeof *tok->cursor_skip);
+  tok->previous_token = token_grow_array(tok->previous_token, new_capacity, sizeof *tok->previous_token);
+  tok->tktype = token_grow_array(tok->tktype, new_capacity, sizeof *tok->tktype);
+
+  tok->capacity = new_capacity;
+}
+
+// Append one token to `tok`, growing the arrays first when they are full.
+//
+//   type        kind of the new token
+//   text        NUL-terminated token text; a private copy is stored
+//   behaviour   behaviour tag to store
+//   cursor_skip skip count to store (kept as unsigned int)
+//
+// The entry also records the printable type name and the type of the
+// previous token (TOKEN_UNKNOWN for the first token). Exits with a message
+// if the text copy cannot be allocated.
+void token_push(Token *tok, symbols type, const char *text,
+                symbol_bhv behaviour, size_t cursor_skip) {
+  if (tok->size >= tok->capacity) {
+    token_grow(tok);
+  }
+
+  // Copy the text by hand: the length is needed anyway, and strdup() is a
+  // POSIX function that strict ISO C11 builds do not declare.
+  size_t len = strlen(text);
+  char *copy = malloc(len + 1);
+  if (!copy) {
+    fprintf(stderr, "token_push: allocation failed\n");
+    exit(1);
+  }
+  memcpy(copy, text, len + 1);
+
+  size_t i = tok->size;
+
+  tok->type[i] = type;
+  tok->text[i] = copy;
+  tok->text_len[i] = len;
+  tok->behaviour[i] = behaviour;
+  tok->cursor_skip[i] = (unsigned int)cursor_skip;
+  tok->tktype[i] = token_type_to_string(type);
+  tok->previous_token[i] = (i > 0) ? tok->type[i - 1] : TOKEN_UNKNOWN;
+
+  tok->size++;
+}
+
+// Release everything owned by `tok` and leave it as an empty list.
+//
+// Frees each stored text copy and all seven parallel arrays, including
+// `tktype`. The `tktype` entries themselves come from
+// token_type_to_string() and are not freed. Pointers are reset to NULL and
+// size/capacity to 0, so a second call is harmless.
+void token_free(Token *tok) {
+  for (size_t i = 0; i < tok->size; i++) {
+    free(tok->text[i]);
+  }
+  free(tok->type);
+  free(tok->text);
+  free(tok->tktype);
+  free(tok->text_len);
+  free(tok->behaviour);
+  free(tok->cursor_skip);
+  free(tok->previous_token);
+
+  tok->type = NULL;
+  tok->text = NULL;
+  tok->tktype = NULL;
+  tok->text_len = NULL;
+  tok->behaviour = NULL;
+  tok->cursor_skip = NULL;
+  tok->previous_token = NULL;
+  tok->size = 0;
+  tok->capacity = 0;
+}
+
+
+// Parse a base-10 integer from the start of `strint`.
+// Returns 0 when the text does not start with a number. Values outside the
+// range of int are clamped to INT_MIN / INT_MAX; strtol() is used because
+// atoi() has undefined behaviour for such input.
+int str_to_int(char *strint) {
+  long parsed = strtol(strint, NULL, 10);
+  if (parsed > INT_MAX) return INT_MAX;
+  if (parsed < INT_MIN) return INT_MIN;
+  return (int)parsed;
+}
+// Parse a floating-point value from the start of `strif` using strtof().
+// Text that does not start with a number yields 0.
+float str_to_float(char *strif) {
+  float parsed = strtof(strif, NULL);
+  return parsed;
+}
+
+
+
+// Push a token whose text is the `len` bytes starting at `src`.
+// token_push() needs a NUL-terminated string, so the span is copied into a
+// temporary heap buffer of exactly the right size.
+static void lexer_push_span(Token *tok, symbols type, symbol_bhv bhv,
+                            const char *src, size_t len, size_t skip) {
+  char *copy = malloc(len + 1);
+  if (!copy) {
+    fprintf(stderr, "read_from_tok: allocation failed\n");
+    exit(1);
+  }
+  memcpy(copy, src, len);
+  copy[len] = '\0';
+  token_push(tok, type, copy, bhv, skip);
+  free(copy);
+}
+
+// Lex exactly one token starting at input[cursor] and append it to `tok`.
+//
+// Returns the number of input characters consumed, or 0 at the terminating
+// NUL (in which case nothing is pushed).
+//
+// Numbers, strings and identifiers are taken straight from the input at
+// their full length. A fixed 64-byte scratch buffer used to cut any lexeme
+// longer than 63 characters into several tokens; that limit is gone.
+size_t read_from_tok(Token *tok, const char *input, size_t cursor) {
+  const size_t start = cursor;
+  const unsigned char first = (unsigned char)input[cursor];
+
+  // Number: digits and dots; any dot makes it a float.
+  if (isdigit(first)) {
+    int dots_seen = 0;
+    while (isdigit((unsigned char)input[cursor]) || input[cursor] == '.') {
+      if (input[cursor] == '.') dots_seen++;
+      cursor++;
+    }
+    lexer_push_span(tok, dots_seen == 0 ? TOKEN_INTEGER : TOKEN_FLOAT,
+                    dots_seen == 0 ? BHV_NUMBER : BHV_FLOAT,
+                    input + start, cursor - start, cursor - start);
+    return cursor - start;
+  }
+
+  // String: text between double quotes; the quotes are consumed but not stored.
+  if (first == '"') {
+    cursor++; // skip opening quote
+    const size_t body = cursor;
+    while (input[cursor] != '"' && input[cursor] != '\0') {
+      cursor++;
+    }
+    const size_t body_len = cursor - body;
+    if (input[cursor] == '"') cursor++; // skip closing quote
+    lexer_push_span(tok, TOKEN_STRING, BHV_STRING, input + body, body_len,
+                    cursor - start);
+    return cursor - start;
+  }
+
+  // Identifier: a run of alphabetic characters.
+  if (isalpha(first)) {
+    while (isalpha((unsigned char)input[cursor])) {
+      cursor++;
+    }
+    lexer_push_span(tok, TOKEN_IDENTIFIER, BHV_IDENT, input + start,
+                    cursor - start, cursor - start);
+    return cursor - start;
+  }
+
+  // Single-character tokens and symbols
+  symbols type = TOKEN_UNKNOWN;
+  symbol_bhv bhv = BHV_STACK;
+
+  switch (input[cursor]) {
+    case '\0':
+      return 0; // end of input
+    case '+': type = TOKEN_PLUS; break;
+    case '-': type = TOKEN_MINUS; break;
+    case '*': type = TOKEN_MUL; break;
+    case '/': type = TOKEN_DIV; break;
+    case '{': type = TOKEN_LCURLY; break;
+    case '}': type = TOKEN_RCURLY; break;
+    case ';': type = TOKEN_SEMI; break;
+    case ':': type = TOKEN_COLON; break;
+    case '(': type = TOKEN_LPAREN; break;
+    case ')': type = TOKEN_RPAREN; break;
+    case ',': type = TOKEN_COMMA; break;
+    case ' ':
+      // you can skip space tokens if you don't need them
+      type = TOKEN_SPACE;
+      bhv = BHV_UNDEFINED;
+      break;
+    case '\n':
+      // Stored as the two characters backslash + 'n', not as a raw newline.
+      token_push(tok, TOKEN_NEWLINE, "\\n", BHV_UNDEFINED, 1);
+      return 1;
+    default:
+      type = TOKEN_UNKNOWN;
+      bhv = BHV_UNDEFINED;
+      break;
+  }
+
+  // Every remaining case is one character whose text is that character.
+  lexer_push_span(tok, type, bhv, input + cursor, 1, 1);
+  return 1;
+}
+
+
+// Lex the whole NUL-terminated `input` into a fresh token list.
+// read_from_tok() is called repeatedly until every character has been
+// consumed, then a TOKEN_EOF entry with the text "EOF" is appended.
+// The caller owns the returned list.
+Token tokenize_all(const char *input) {
+  Token tokens;
+  const size_t total = strlen(input);
+
+  token_init(&tokens, 8);
+
+  for (size_t pos = 0; pos < total; ) {
+    pos += read_from_tok(&tokens, input, pos);
+  }
+
+  token_push(&tokens, TOKEN_EOF, "EOF", BHV_UNDEFINED, 0);
+  return tokens;
+}
+
+
+
--- a/src/parser.h
+++ b/src/parser.h
@@ -0,0 +1,219 @@
+#include "./lexer.h"
+#define NB_IMPLEMENTATION
+#include "./nb.h"
+
+// Precedence of a binary operator token:
+// 2 for '*' and '/', 1 for '+' and '-', 0 for every other token kind.
+int get_prec(symbols op){
+  if (op == TOKEN_MUL || op == TOKEN_DIV) {
+    return 2;
+  }
+  if (op == TOKEN_PLUS || op == TOKEN_MINUS) {
+    return 1;
+  }
+  return 0;
+}
+// parse
+
+// True for the four arithmetic operators ('*', '/', '+', '-'), which are
+// all treated as left-associative; false for every other token kind.
+bool is_left_asc(symbols op){
+  return op == TOKEN_MUL || op == TOKEN_DIV ||
+         op == TOKEN_PLUS || op == TOKEN_MINUS;
+}
+
+Token *global_tok = NULL; // Global token-list pointer, initially NULL. NOTE(review): nothing in the visible code reads or writes it; confirm it is still needed.
+
+typedef enum { // What a symbol-table entry names.
+  SYM_VAR, // build_rpn uses this for an identifier not followed by '('
+  SYM_FUNC, // build_rpn uses this for an identifier directly followed by '('
+} SymbolKind;
+
+typedef struct { // One symbol-table entry.
+  const char* name; // identifier text; symbol_lookup compares against it
+  size_t ret_count; // NOTE(review): presumably the number of return values; never set explicitly by the visible code
+  size_t arg_count; // build_rpn always stores 0
+  symbols arg_types[16]; // NOTE(review): presumably one type tag per argument (at most 16); not filled in by the visible code
+  symbols ret_type; // build_rpn stores TOKEN_EOF for functions and TOKEN_UNKNOWN for variables
+  SymbolKind symbol_kind; // SYM_VAR or SYM_FUNC
+  bool builtin; // build_rpn stores false; NOTE(review): presumably true for predefined entries
+} Symbol;
+
+
+// static Symbol builtins[] = {
+//     { "print", 1, 1, { TOKEN_UNKNOWN }, TOKEN_EOF, SYM_FUNC, true },
+// };
+
+
+typedef struct { // Growable array of Symbol entries.
+  Symbol *symbols; // heap array holding the entries
+  size_t size; // number of entries in use
+  size_t capacity; // number of entries allocated
+} SymbolTable; // searched linearly by symbol_lookup
+
+
+// static int builtin_num = sizeof(builtins)/sizeof(builtins[0]);
+
+// static SymbolTable global_env = {
+//   .size = sizeof(builtins)/sizeof(builtins[0]),
+//   .capacity = sizeof(builtins)/sizeof(builtins[0]),
+//   .symbols = builtins};
+
+
+// Find the first entry in `table` whose name equals `n`.
+// Returns a pointer into the table's array, or NULL when no entry matches.
+Symbol *symbol_lookup(SymbolTable *table, const char *n){
+  size_t idx = 0;
+  while (idx < table->size) {
+    Symbol *entry = &table->symbols[idx];
+    if (!strcmp(n, entry->name)) {
+      return entry;
+    }
+    idx++;
+  }
+  return NULL;
+}
+
+// fn add(x: int, y: int) int {
+//   return x+y;
+// }
+
+
+// Initialise `table` as empty with room for `initial_capacity` symbols.
+//
+// A capacity of 0 is accepted: no memory is allocated and
+// symbol_table_add() allocates on the first insertion. Without this, an
+// allocator that returns NULL for a zero-byte request would make the call
+// exit. calloc() is used so an oversized capacity is rejected instead of
+// overflowing the size multiplication. Prints a message and exits when the
+// allocation fails.
+void symbol_table_init(SymbolTable *table, size_t initial_capacity) {
+    table->symbols = NULL;
+    table->size = 0;
+    table->capacity = initial_capacity;
+
+    if (initial_capacity == 0) {
+        return;
+    }
+
+    table->symbols = calloc(initial_capacity, sizeof *table->symbols);
+    if (!table->symbols) {
+        fprintf(stderr, "symbol_table_init: malloc failed\n");
+        exit(1);
+    }
+}
+
+// Append a copy of `sym` to `table`.
+// When the array is full its capacity is doubled first (or set to 8 if it
+// was 0); the process exits with a message if that reallocation fails.
+void symbol_table_add(SymbolTable *table, Symbol sym) {
+    const bool full = table->size >= table->capacity;
+
+    if (full) {
+        if (table->capacity == 0) {
+            table->capacity = 8;
+        } else {
+            table->capacity *= 2;
+        }
+        table->symbols = realloc(table->symbols, sizeof(Symbol) * table->capacity);
+        if (table->symbols == NULL) {
+            fprintf(stderr, "symbol_table_add: realloc failed\n");
+            exit(1);
+        }
+    }
+
+    Symbol *slot = &table->symbols[table->size];
+    *slot = sym;
+    table->size += 1;
+}
+
+
+// Release the table's entry array and reset the table to the empty state
+// (NULL array, size 0, capacity 0).
+// NOTE(review): the entries' name strings are not released here; confirm
+// who owns them.
+void symbol_table_free(SymbolTable *table) {
+    free(table->symbols);
+    *table = (SymbolTable){ .symbols = NULL, .size = 0, .capacity = 0 };
+}
+
+
+// Add `name` to `symtab` with the given kind and return type, unless an
+// entry with that name already exists. The table stores its own copy of
+// the name.
+static void rpn_declare_symbol(SymbolTable *symtab, const char *name,
+                               SymbolKind kind, symbols ret_type) {
+    if (symbol_lookup(symtab, name)) {
+        return;
+    }
+    char *owned = strdup(name);
+    if (!owned) {
+        fprintf(stderr, "build_rpn: allocation failed\n");
+        exit(1);
+    }
+    Symbol sym = {
+        .name = owned,
+        .arg_count = 0,
+        .ret_type = ret_type,
+        .symbol_kind = kind,
+        .builtin = false
+    };
+    symbol_table_add(symtab, sym);
+}
+
+// Remove the top entry of the operator stack and free its text copy.
+// Only decrementing `size` would leak the copy, because token_free()
+// releases just the first `size` entries.
+static void rpn_drop_top(Token *stack) {
+    stack->size--;
+    free(stack->text[stack->size]);
+    stack->text[stack->size] = NULL;
+}
+
+// Append the top entry of the operator stack to `output`, then remove it.
+static void rpn_move_top(Token *stack, Token *output) {
+    size_t top = stack->size - 1;
+    token_push(output, stack->type[top], stack->text[top],
+               stack->behaviour[top], 0);
+    rpn_drop_top(stack);
+}
+
+// Convert the infix token list `inp` to reverse Polish notation using an
+// operator stack.
+//
+// Identifiers are registered in `symtab`: as SYM_FUNC when the next token
+// is '(' and as SYM_VAR otherwise. Function identifiers wait on the stack
+// and are emitted after their closing ')'. Literals and variables go
+// straight to the output. Token kinds not handled below (spaces, commas,
+// the input's EOF, ...) are skipped. The result ends with a TOKEN_EOF
+// entry and is owned by the caller.
+//
+// The operator stack is fully released before returning, including the
+// text of every popped entry.
+Token build_rpn(Token *inp, SymbolTable *symtab) {
+    Token output;
+    Token stack;
+
+    token_init(&output, 16);
+    token_init(&stack, 16);
+
+    for (size_t i = 0; i < inp->size; ++i) {
+        symbols type = inp->type[i];
+        const char *text = inp->text[i];
+        symbol_bhv bhv = inp->behaviour[i];
+
+        if (type == TOKEN_IDENTIFIER) {
+            bool is_call = i + 1 < inp->size && inp->type[i + 1] == TOKEN_LPAREN;
+            if (is_call) {
+                rpn_declare_symbol(symtab, text, SYM_FUNC, TOKEN_EOF);
+                token_push(&stack, type, text, bhv, 0);
+            } else {
+                rpn_declare_symbol(symtab, text, SYM_VAR, TOKEN_UNKNOWN);
+                token_push(&output, type, text, bhv, 0);
+            }
+        } else if (type == TOKEN_LPAREN) {
+            token_push(&stack, type, text, bhv, 0);
+        } else if (type == TOKEN_RPAREN) {
+            // Flush operators back to the matching '('.
+            while (stack.size > 0 && stack.type[stack.size - 1] != TOKEN_LPAREN) {
+                rpn_move_top(&stack, &output);
+            }
+            // If anything is left, the top is that '('; discard it.
+            if (stack.size > 0) {
+                rpn_drop_top(&stack);
+            }
+            // A function identifier below the '(' is emitted after its arguments.
+            if (stack.size > 0 && stack.type[stack.size - 1] == TOKEN_IDENTIFIER) {
+                rpn_move_top(&stack, &output);
+            }
+        } else if (type == TOKEN_INTEGER || type == TOKEN_FLOAT || type == TOKEN_STRING) {
+            token_push(&output, type, text, bhv, 0);
+        } else if (is_left_asc(type)) {
+            // Every operator is left-associative, so pop while the stacked
+            // operator binds at least as tightly as the incoming one.
+            while (stack.size > 0 && stack.type[stack.size - 1] != TOKEN_LPAREN &&
+                   get_prec(stack.type[stack.size - 1]) >= get_prec(type)) {
+                rpn_move_top(&stack, &output);
+            }
+            token_push(&stack, type, text, bhv, 0);
+        }
+    }
+
+    while (stack.size > 0) {
+        rpn_move_top(&stack, &output);
+    }
+    token_free(&stack);
+
+    token_push(&output, TOKEN_EOF, "EOF", BHV_UNDEFINED, 0);
+    return output;
+}
+
+// Print one line per token to stdout: its index, its type name and its text.
+void print_token(Token *tk){
+  size_t idx = 0;
+  while (idx < tk->size) {
+    printf("TokenNum: %zu Type: %s Value: %s\n", idx, tk->tktype[idx], tk->text[idx]);
+    idx++;
+  }
+}
+
+
+
+
+// int main(int argc, char **argv){
+//   if (argc < 2) return -1;
+//   const char ts[] = "\"hello\" hi + 2 2.312"; 
+//   const char math[] = "print(((1+2)*6)/18)"; // = 1
+//   const char print[] = "print(\"hello\")";
+//   const char simple[] = "1 + (  3 + 3  )/4+4*3";
+  
+
+//   char* read = nb_read_file(argv[1]);
+//   Token tk = tokenize_all(read);
+//   printf("INPUT: %s\n", read);
+//   SymbolTable table = {0};
+//   symbol_table_init(&table, 32);
+
+
+//   Token rpn = build_rpn(&tk, &table);
+//   print_token(&rpn); 
+// }
--- a/src/vm.c
+++ b/src/vm.c
@@ -0,0 +1,204 @@
+#include "parser.h"
+#include <string.h>
+
+typedef enum { // Instruction set of the stack VM.
+  OP_PUSH_INT, // push the instruction's number as an integer value
+  OP_PUSH_FLOAT, // push the instruction's number as a float value
+  OP_PUSH_STRING, // push a copy of the instruction's string literal
+  OP_ADD, // pop b, pop a, push a + b
+  OP_SUB, // pop b, pop a, push a - b
+  OP_MUL, // pop b, pop a, push a * b
+  OP_DIV, // pop b, pop a, push a / b; a zero divisor is reported and stops the VM
+  OP_PRINT, // pop one value and print it on its own line
+  OP_HALT // stop execution
+} OPcode;
+
+typedef struct { // One bytecode instruction.
+  OPcode op; // operation to perform
+  double num; // numeric operand; set for OP_PUSH_INT and OP_PUSH_FLOAT
+  char *strlit; // heap-allocated string operand; set for OP_PUSH_STRING
+} instruct;
+
+typedef enum { // Tag that says which member of a Value's union is in use.
+    VAL_INT, // the long member
+    VAL_FLOAT, // the double member
+    VAL_STRING, // the char * member
+} ValueType;
+
+typedef struct { // Tagged runtime value held on the VM stack.
+    ValueType type; // selects the valid union member
+    union { // anonymous union: members are accessed directly as v.i, v.f, v.s
+        long i; // payload for VAL_INT
+        double f; // payload for VAL_FLOAT
+        char *s; // payload for VAL_STRING; a heap string that vm_run frees after printing it
+    };
+} Value;
+
+typedef struct { // Execution state of the stack VM.
+  instruct *program; // instructions to execute
+  size_t inst_p; // index of the next instruction to fetch
+  size_t program_size; // number of instructions in program
+  Value stack[256]; // operand stack
+  size_t st_p; // number of values on the stack, i.e. the index of the next free slot
+  bool running; // vm_run loops while this is true
+} VM;
+
+// Translate an RPN token list into a heap-allocated instruction array.
+//
+//   rpn  token list in reverse Polish order
+//   out  receives the number of instructions written
+//
+// Literals become push instructions, the four operators become arithmetic
+// instructions, the identifier "print" becomes OP_PRINT and TOKEN_EOF
+// becomes OP_HALT. Other token kinds are skipped. An unknown identifier is
+// reported and produces no instruction; it used to fall through and emit a
+// zero-initialised instruction, which is OP_PUSH_INT with the value 0.
+//
+// The caller owns the returned array and the `strlit` of every
+// OP_PUSH_STRING instruction. Prints a message and exits if memory runs out.
+instruct *rpn_to_bytecode(Token *rpn, size_t *out){
+  size_t cap = 64;
+  size_t size = 0;
+
+  instruct *prog = malloc(sizeof *prog * cap);
+  if (!prog) {
+    fprintf(stderr, "rpn_to_bytecode: allocation failed\n");
+    exit(1);
+  }
+
+  for (size_t i=0; i<rpn->size; ++i){
+    symbols t = rpn->type[i];
+    const char *text = rpn->text[i];
+
+    instruct ins = {0};
+
+    switch (t){
+      case TOKEN_INTEGER: ins.op = OP_PUSH_INT; ins.num = strtod(text, NULL); break;
+      case TOKEN_FLOAT: ins.op = OP_PUSH_FLOAT; ins.num = strtod(text, NULL); break;
+      case TOKEN_STRING:
+        ins.op = OP_PUSH_STRING;
+        ins.strlit = strdup(text);
+        if (!ins.strlit) {
+          fprintf(stderr, "rpn_to_bytecode: allocation failed\n");
+          exit(1);
+        }
+        break;
+      case TOKEN_PLUS: ins.op = OP_ADD; break;
+      case TOKEN_MINUS: ins.op = OP_SUB; break;
+      case TOKEN_MUL: ins.op = OP_MUL; break;
+      case TOKEN_DIV: ins.op = OP_DIV; break;
+
+      case TOKEN_IDENTIFIER:
+        if (strcmp(text, "print") != 0) {
+          printf("[WARNING] Uknown Identifier '%s'\n", text);
+          continue; // emit nothing for an identifier the VM cannot run
+        }
+        ins.op = OP_PRINT;
+        break; //TODO: unhardcode this
+      case TOKEN_EOF: ins.op = OP_HALT; break;
+      default: continue;
+    }
+    if (size >= cap){
+      cap*=2;
+      // Keep the old pointer until the new block is known to be valid.
+      instruct *grown = realloc(prog, sizeof *prog * cap);
+      if (!grown) {
+        fprintf(stderr, "rpn_to_bytecode: allocation failed\n");
+        exit(1);
+      }
+      prog = grown;
+    }
+    prog[size++] = ins;
+  }
+  *out = size;
+  return prog;
+}
+
+// Push `v` onto the VM's operand stack.
+// When the stack is already full, reports the overflow, stops the VM and
+// returns false without storing the value.
+static bool vm_push(VM *vm, Value v) {
+    if (vm->st_p >= sizeof vm->stack / sizeof vm->stack[0]) {
+        fprintf(stderr, "stack overflow.\n");
+        vm->running = false;
+        return false;
+    }
+    vm->stack[vm->st_p++] = v;
+    return true;
+}
+
+// Execute vm->program from the first instruction until OP_HALT, an error,
+// or the end of the program.
+//
+// The instruction pointer and stack pointer are reset on entry. Arithmetic
+// always yields a float value. Any error (stack underflow or overflow,
+// division by zero, arithmetic on a string, unknown opcode) is reported on
+// stderr and clears vm->running.
+//
+// Pushes are bounds-checked against the fixed-size stack. String operands
+// to an arithmetic instruction are rejected instead of being read through
+// the numeric union member. A division by zero no longer pushes a result.
+void vm_run(VM *vm) {
+    vm->running = true;
+    vm->inst_p = 0;
+    vm->st_p = 0;
+
+    while (vm->running && vm->inst_p < vm->program_size) {
+        instruct ins = vm->program[vm->inst_p++];
+
+        switch (ins.op) {
+        case OP_PUSH_INT: {
+            Value v = { .type = VAL_INT, .i = (long)ins.num };
+            vm_push(vm, v);
+        } break;
+
+        case OP_PUSH_FLOAT: {
+            Value v = { .type = VAL_FLOAT, .f = ins.num };
+            vm_push(vm, v);
+        } break;
+
+        case OP_PUSH_STRING: {
+            // The stack gets its own copy; OP_PRINT frees it after printing.
+            Value v = { .type = VAL_STRING, .s = strdup(ins.strlit) };
+            if (!v.s) {
+                fprintf(stderr, "out of memory.\n");
+                vm->running = false;
+                break;
+            }
+            if (!vm_push(vm, v)) {
+                free(v.s);
+            }
+        } break;
+
+        case OP_ADD:
+        case OP_SUB:
+        case OP_MUL:
+        case OP_DIV: {
+            if (vm->st_p < 2) {
+                fprintf(stderr, "not enough values on stack.\n");
+                vm->running = false;
+                break;
+            }
+
+            Value b = vm->stack[--vm->st_p];
+            Value a = vm->stack[--vm->st_p];
+
+            if (a.type == VAL_STRING || b.type == VAL_STRING) {
+                fprintf(stderr, "arithmetic on a string value.\n");
+                if (a.type == VAL_STRING) free(a.s);
+                if (b.type == VAL_STRING) free(b.s);
+                vm->running = false;
+                break;
+            }
+
+            double av = (a.type == VAL_INT) ? (double)a.i : a.f;
+            double bv = (b.type == VAL_INT) ? (double)b.i : b.f;
+
+            if (ins.op == OP_DIV && bv == 0) {
+                fprintf(stderr, "division by zero.\n");
+                vm->running = false;
+                break;
+            }
+
+            double result = 0;
+            switch (ins.op) {
+            case OP_ADD: result = av + bv; break;
+            case OP_SUB: result = av - bv; break;
+            case OP_MUL: result = av * bv; break;
+            case OP_DIV: result = av / bv; break;
+            default: break;
+            }
+
+            Value v = { .type = VAL_FLOAT, .f = result };
+            vm_push(vm, v);
+        } break;
+
+        case OP_PRINT: {
+            if (vm->st_p == 0) {
+                fprintf(stderr, "cant print an empty stack\n");
+                vm->running = false;
+                break;
+            }
+
+            Value v = vm->stack[--vm->st_p];
+            switch (v.type) {
+            case VAL_INT:   printf("%ld\n", v.i); break;
+            case VAL_FLOAT: printf("%g\n", v.f); break;
+            case VAL_STRING:
+                printf("%s\n", v.s);
+                free(v.s);
+                break;
+            }
+        } break;
+
+        case OP_HALT:
+            vm->running = false;
+            break;
+
+        default:
+            fprintf(stderr, "unknown opcode %d\n", (int)ins.op);
+            vm->running = false;
+            break;
+        }
+    }
+}
+
+// Entry point: read the source file named by argv[1], tokenize it, convert
+// the tokens to RPN, compile the RPN to bytecode and run it on the VM.
+// Returns 1 on a usage error or when the file content is unavailable,
+// 0 otherwise.
+int main(int argc, char **argv) {
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s <source file>\n", argv[0]);
+        return 1;
+    }
+
+    char* read = nb_read_file(argv[1]);
+    // Guard against a NULL buffer before it reaches strlen() in tokenize_all.
+    // NOTE(review): confirm how nb_read_file reports failure.
+    if (!read) {
+        fprintf(stderr, "could not read '%s'\n", argv[1]);
+        return 1;
+    }
+    //printf("INPUT: %s\n", read);
+
+    Token tk = tokenize_all(read);
+    SymbolTable table = {0};
+    symbol_table_init(&table, 32);
+
+    Token rpn = build_rpn(&tk, &table);
+    //print_token(&rpn);
+
+    size_t prog_size = 0;
+    instruct *prog = rpn_to_bytecode(&rpn, &prog_size);
+    VM vm = {
+        .program = prog,
+        .program_size = prog_size,
+        .inst_p = 0,
+        .st_p = 0,
+        .running = true,
+    };
+
+    vm_run(&vm);
+
+    // Release what this function created. strlit is NULL for instructions
+    // without a string operand, and free(NULL) is a no-op.
+    // NOTE(review): `read` is left alone because its ownership is defined by
+    // nb_read_file, which is not visible here.
+    for (size_t i = 0; i < prog_size; ++i) {
+        free(prog[i].strlit);
+    }
+    free(prog);
+    token_free(&rpn);
+    token_free(&tk);
+    symbol_table_free(&table);
+
+    return 0;
+}