military grade tokenizer

2025-09-23 16:17:21 +03:00
parent e1727a46e3
commit b1f1b80142
2 changed files with 166 additions and 492 deletions

lexer.c (594 changed lines)

@@ -1,26 +1,13 @@
#include <assert.h>
#include <ctype.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <sys/types.h>
#include <stdbool.h>
-#define NB_IMPLEMENTATION
-#include "nb.h"
-int str_to_int(char *strint){
-    int new_int = atoi(strint);
-    return new_int;
-}
-float str_to_float(char *strif){
-    char *fptr;
-    float new_int = strtof(strif, &fptr);
-    return new_int;
-}
-typedef enum{
+typedef enum {
    TOKEN_PLUS,
    TOKEN_MINUS,
    TOKEN_INTEGER,
@@ -37,7 +24,7 @@ typedef enum{
    TOKEN_COMMA
} symbols;
-typedef enum{
+typedef enum {
    BHV_STACK,
    BHV_UNDEFINED,
    BHV_NUMBER,
@@ -45,346 +32,89 @@ typedef enum{
    BHV_FLOAT,
} symbol_bhv;
-typedef struct{
-    symbols type;
-    char* text;
-    size_t text_len;
-    symbol_bhv behaviour;
-    uint cursor_skip;
-    symbols previous_token;
+typedef struct {
+    symbols *type;
+    char **text;
+    size_t *text_len;
+    symbol_bhv *behaviour;
+    unsigned int *cursor_skip;
+    symbols *previous_token;
+    size_t capacity;
+    size_t size;
} Token;
-typedef struct{
-    Token* unit;
-    size_t size;
-    size_t capacity;
-} TokenArr;
-typedef struct{
-    char *content;
-    // size_t cursor;
-    // size_t line;
-} Lexer;
-typedef enum{
-    AST_NUMBER,
-    AST_BINARY_OP,
-} ASTNodeType;
-typedef struct ASTNode ASTNode;
-struct ASTNode {
-    ASTNodeType type;
-    union {
-        struct { double value; } number;
-        struct {
-            char op;
-            ASTNode* left;
-            ASTNode* right;
-        } binary;
-        struct {
-            char *name;
-            ASTNode** args;
-            size_t arg_count;
-        } func_call;
-    } data;
-};
-typedef struct{
-    Token* tokens;
-    size_t cursor;
-} parser;
-// Lexer
-void lexer_new(char *content, size_t content_len){
-    (void) content;
-    (void) content_len;
-}
-// Token
-void lexer_next(Lexer *mylexer){
-    (void) mylexer;
+void token_init(Token *tok, size_t capacity) {
+    tok->capacity = capacity;
+    tok->size = 0;
+    tok->type = malloc(sizeof(symbols) * capacity);
+    tok->text = malloc(sizeof(char *) * capacity);
+    tok->text_len = malloc(sizeof(size_t) * capacity);
+    tok->behaviour = malloc(sizeof(symbol_bhv) * capacity);
+    tok->cursor_skip = malloc(sizeof(unsigned int) * capacity);
+    tok->previous_token = malloc(sizeof(symbols) * capacity);
+    assert(tok->type && tok->text && tok->text_len &&
+           tok->behaviour && tok->cursor_skip && tok->previous_token);
}
-Token parser_peek(parser* p){
-    return p->tokens[p->cursor];
+void token_grow(Token *tok) {
+    size_t new_capacity = (tok->capacity == 0 ? 8 : tok->capacity * 2);
+    tok->type = realloc(tok->type, new_capacity * sizeof(symbols));
+    tok->text = realloc(tok->text, new_capacity * sizeof(char *));
+    tok->text_len = realloc(tok->text_len, new_capacity * sizeof(size_t));
+    tok->behaviour = realloc(tok->behaviour, new_capacity * sizeof(symbol_bhv));
+    tok->cursor_skip = realloc(tok->cursor_skip, new_capacity * sizeof(unsigned int));
+    tok->previous_token = realloc(tok->previous_token, new_capacity * sizeof(symbols));
+    assert(tok->type && tok->text && tok->text_len &&
+           tok->behaviour && tok->cursor_skip && tok->previous_token);
+    tok->capacity = new_capacity;
}
-Token parser_advance(parser* p){
-    return p->tokens[p->cursor++];
+void token_push(Token *tok, symbols type, const char *text,
+                symbol_bhv behaviour, size_t cursor_skip) {
+    if (tok->size >= tok->capacity) {
+        token_grow(tok);
+    }
+    size_t i = tok->size;
+    tok->type[i] = type;
+    tok->text[i] = strdup(text);
+    tok->text_len[i] = strlen(text);
+    tok->behaviour[i] = behaviour;
+    tok->cursor_skip[i] = cursor_skip;
+    if (i > 0)
+        tok->previous_token[i] = tok->type[i - 1];
+    else
+        tok->previous_token[i] = TOKEN_UNKNOWN;
+    tok->size++;
}
-bool parser_match(parser* p, symbols tokent){
-    if (parser_peek(p).type == tokent){
-        parser_advance(p);
-        return true;
-    } else {
-        return false;
+void token_free(Token *tok) {
+    for (size_t i = 0; i < tok->size; i++) {
+        free(tok->text[i]);
    }
-}
-ASTNode* ast_new_number(double val){
-    ASTNode* node = malloc(sizeof(ASTNode));
-    node->type = AST_NUMBER;
-    node->data.number.value = val;
-    return node;
-}
-ASTNode* ast_new_binary(char op, ASTNode* l, ASTNode* r){
-    ASTNode* node = malloc(sizeof(ASTNode));
-    node->type = AST_BINARY_OP;
-    node->data.binary.op = op;
-    node->data.binary.left = l;
-    node->data.binary.right = r;
-    // maybe need to fix
-    return node;
-}
-ASTNode* parse_factor(parser* p) {
-    Token tok = parser_peek(p);
-    if (tok.type == TOKEN_EOF){
-        fprintf(stderr, "Unexpected end of input in factor\n");
-        exit(EXIT_FAILURE);
-    }
-    if (tok.type == TOKEN_INTEGER || tok.type == TOKEN_FLOAT){
-        parser_advance(p);
-        double v = atof(tok.text);
-        return ast_new_number(v);
-    }
-    // if (tok.type == TOKEN_STRING){
-    //     parser_advance(p);
-    //     char* func_name = tok.text;
-    //     if (parser_match(p, TOKEN_LPAREN)){
-    //         size_t argc_count = 0;
-    //
-    //     }
-    // }
-    fprintf(stderr, "Unexpected token '%s' in factor\n", tok.text);
-    exit(EXIT_FAILURE);
-}
-ASTNode* parse_term(parser* p) {
-    ASTNode* node = parse_factor(p);
-    while (true) {
-        Token tok = parser_peek(p);
-        if (tok.type == TOKEN_MUL || tok.type == TOKEN_DIV) {
-            parser_advance(p);
-            ASTNode* right = parse_factor(p);
-            node = ast_new_binary(tok.text[0], node, right);
-        } else {
-            break;
-        }
-    }
-    return node;
-}
-ASTNode* parse_expression(parser* p) {
-    ASTNode* node = parse_term(p);
-    while (true) {
-        Token tok = parser_peek(p);
-        if (tok.type == TOKEN_PLUS || tok.type == TOKEN_MINUS) {
-            parser_advance(p);
-            ASTNode* right = parse_term(p);
-            node = ast_new_binary(tok.text[0], node, right);
-        } else {
-            break;
-        }
-    }
-    return node;
-}
-double eval_ast(ASTNode* node) {
-    if (node->type == AST_NUMBER) {
-        return node->data.number.value;
-    }
-    double L = eval_ast(node->data.binary.left);
-    double R = eval_ast(node->data.binary.right);
-    switch (node->data.binary.op) {
-        case '+': return L + R;
-        case '-': return L - R;
-        case '*': return L * R;
-        case '/': return L / R;
-        default:
-            fprintf(stderr, "Unknown op '%c'\n", node->data.binary.op);
-            exit(EXIT_FAILURE);
-    }
-}
-Token read_from_tok(char* text, uint cursor){
-    Token mytoks;
-    static char buf[64];
-    size_t i = 0;
-    mytoks.cursor_skip = 1;
-    if (isdigit(text[cursor])) {
-        size_t start = cursor;
-        int dots_seen = 0;
-        while ( isdigit(text[cursor]) || text[cursor] == '.') {
-            if (text[cursor] == '.') {
-                dots_seen +=1;
-                assert(dots_seen < 2);
-            }
-            buf[i++] = text[cursor++];
-        }
-        buf[i] = '\0';
-        if (!dots_seen){
-            mytoks.type = TOKEN_INTEGER;
-            mytoks.behaviour = BHV_NUMBER;
-        } else {
-            mytoks.type = TOKEN_FLOAT;
-            mytoks.behaviour = BHV_FLOAT;
-        }
-        mytoks.cursor_skip = cursor - start;
-        mytoks.text = strdup(buf);
-        mytoks.text_len = i;
-    }
-    else if (isalpha(text[cursor])){
-        size_t start = cursor;
-        while (isalpha(text[cursor])) {
-            buf[i++] = text[cursor++];
-        }
-        buf[i] = '\0';
-        mytoks.type = TOKEN_STRING;
-        mytoks.behaviour = BHV_STRING;
-        mytoks.cursor_skip = cursor - start;
-        mytoks.text = strdup(buf);
-        mytoks.text_len = i;
-    }
-    else {
-        buf[0] = text[cursor];
-        buf[1] = '\0';
-        switch (text[cursor])
-        {
-            case '+':
-                mytoks.type = TOKEN_PLUS;
-                mytoks.text = strdup("+");
-                mytoks.behaviour = BHV_STACK;
-                break;
-            case '-':
-                mytoks.type = TOKEN_MINUS;
-                mytoks.text = strdup("-");
-                mytoks.behaviour = BHV_STACK;
-                break;
-            case ' ':
-                mytoks.type = TOKEN_SPACE;
-                mytoks.text = strdup("space");
-                break;
-            case '*':
-                mytoks.type = TOKEN_MUL;
-                mytoks.text = strdup("*");
-                mytoks.behaviour = BHV_STACK;
-                break;
-            case '/':
-                mytoks.type = TOKEN_DIV;
-                mytoks.text = strdup("/");
-                mytoks.behaviour = BHV_STACK;
-                break;
-            case '\n':
-                mytoks.type = TOKEN_NEWLINE;
-                mytoks.text = strdup("newline");
-                mytoks.cursor_skip = 1;
-                break;
-            case '(':
-                mytoks.type = TOKEN_LPAREN;
-                mytoks.text = strdup("(");
-                mytoks.behaviour = BHV_STACK;
-                break;
-            case ')':
-                mytoks.type = TOKEN_RPAREN;
-                mytoks.text = strdup(")");
-                mytoks.behaviour = BHV_STACK;
-                break;
-            case ',':
-                mytoks.type = TOKEN_COMMA;
-                mytoks.text = strdup(",");
-                mytoks.behaviour = BHV_STACK;
-                break;
-            default:
-                mytoks.type = TOKEN_UNKNOWN;
-                mytoks.behaviour = BHV_UNDEFINED;
-                mytoks.text = strdup(buf);
-        }
-    }
-    return mytoks;
+    free(tok->type);
+    free(tok->text);
+    free(tok->text_len);
+    free(tok->behaviour);
+    free(tok->cursor_skip);
+    free(tok->previous_token);
}
-void tokenarr_push(TokenArr* arr, Token tok) {
-    if (arr->size >= arr->capacity) {
-        arr->capacity = arr->capacity ? arr->capacity * 2 : 8;
-        arr->unit = realloc(arr->unit, arr->capacity * sizeof(Token));
-        assert(arr->unit != NULL);
-    }
-    arr->unit[arr->size++] = tok;
-}
-TokenArr tokenize_all(const char* input) {
-    TokenArr arr = {NULL, 0, 0};
-    size_t i = 0;
-    size_t len = strlen(input);
-    while (i < len) {
-        Token tok = read_from_tok((char*)input, i);
-        i += tok.cursor_skip;
-        if (tok.type == TOKEN_SPACE || tok.type == TOKEN_NEWLINE) {
-            free(tok.text);
-            continue;
-        }
-        tokenarr_push(&arr, tok);
-    }
-    Token eof = {0};
-    eof.type = TOKEN_EOF;
-    eof.text = strdup("EOF");
-    eof.text_len = 3;
-    eof.behaviour = BHV_UNDEFINED;
-    eof.cursor_skip = 0;
-    tokenarr_push(&arr, eof);
-    return arr;
-}
-// Token* c
-void token_parser(Token mytok, char* input){
-    int length1 = strlen(input);
-    int i=0;
-    while (i < length1) {
-        mytok = read_from_tok(input, i);
-        printf("Text: %s\n", mytok.text);
-        printf("Behaviour: %d\n", mytok.behaviour);
-        if (mytok.behaviour == BHV_STACK){
-            printf("this is stack lil bro\n");
-        }
-        i++;
-    }
-}
-// operators accepted in int/digit or whatever type def only when they have a digit before AND after them
-/*
-int main(){
-    Token newtok;
-    char* input = "8";
-    parser(newtok, input);
-}
-*/
-char* token_type_to_string(symbols type) {
+int str_to_int(char *strint) { return atoi(strint); }
+float str_to_float(char *strif) { return strtof(strif, NULL); }
+char *token_type_to_string(symbols type) {
    switch (type) {
    case TOKEN_PLUS: return "TOKEN_PLUS";
    case TOKEN_MINUS: return "TOKEN_MINUS";
@@ -392,144 +122,88 @@ char* token_type_to_string(symbols type) {
    case TOKEN_FLOAT: return "TOKEN_FLOAT";
    case TOKEN_SPACE: return "TOKEN_SPACE";
    case TOKEN_STRING: return "TOKEN_STRING";
    case TOKEN_MUL: return "TOKEN_MUL";
    case TOKEN_DIV: return "TOKEN_DIV";
    case TOKEN_LPAREN: return "TOKEN_LPAREN";
    case TOKEN_RPAREN: return "TOKEN_RPAREN";
    case TOKEN_COMMA: return "TOKEN_COMMA";
    case TOKEN_EOF: return "TOKEN_EOF";
    case TOKEN_NEWLINE: return "TOKEN_NEWLINE";
    case TOKEN_UNKNOWN: return "TOKEN_UNKNOWN";
    default: return "UNKNOWN_SYMBOL";
    }
}
-// void main2() {
-//     char* input = "323.23 + Hello world 102102";
-//     int length1 = strlen(input);
-//     int i = 0;
-//     printf("input: %s\n\n", input);
-//     while (i < length1) {
-//         Token result = read_from_tok(input, i);
-//         printf("text: %s\ntype: %u (%s)\n\n", result.text, result.type, token_type_to_string(result.type));
-//         i += result.cursor_skip;
-//     }
-// }
-void mathparser(const char* input) {
-    TokenArr stack = tokenize_all(input);
-    float result = 0;
-    float current = 0;
-    float sign = 1;
-    float op = 0;
-    for (size_t i = 0; i < stack.size; ++i) {
-        switch (stack.unit[i].type) {
-            case TOKEN_INTEGER:
-            {
-                float value = str_to_float(stack.unit[i].text);
-                if (op == 1) {
-                    current *= value;
-                    op = 0;
-                } else if (op == 2) {
-                    current /= value;
-                    op = 0;
+size_t read_from_tok(Token *tok, const char *input, size_t cursor) {
+    char buf[64];
+    size_t start = cursor;
+    size_t i = 0;
+    if (isdigit(input[cursor])) {
+        int dots_seen = 0;
+        while (isdigit(input[cursor]) || input[cursor] == '.') {
+            if (input[cursor] == '.') dots_seen++;
+            buf[i++] = input[cursor++];
+        }
+        buf[i] = '\0';
+        if (dots_seen == 0) {
+            token_push(tok, TOKEN_INTEGER, buf, BHV_NUMBER, cursor - start);
        } else {
-                    current = value;
+            token_push(tok, TOKEN_FLOAT, buf, BHV_FLOAT, cursor - start);
        }
-                break;
+    } else if (isalpha(input[cursor])) {
+        while (isalpha(input[cursor])) {
+            buf[i++] = input[cursor++];
+        }
+        buf[i] = '\0';
+        token_push(tok, TOKEN_STRING, buf, BHV_STRING, cursor - start);
+    } else {
+        buf[0] = input[cursor];
+        buf[1] = '\0';
+        switch (input[cursor]) {
+        case '+': token_push(tok, TOKEN_PLUS, "+", BHV_STACK, 1); break;
+        case '-': token_push(tok, TOKEN_MINUS, "-", BHV_STACK, 1); break;
+        case '*': token_push(tok, TOKEN_MUL, "*", BHV_STACK, 1); break;
+        case '/': token_push(tok, TOKEN_DIV, "/", BHV_STACK, 1); break;
+        case ' ': token_push(tok, TOKEN_SPACE, " ", BHV_UNDEFINED, 1); break;
+        case '\n': token_push(tok, TOKEN_NEWLINE, "\\n", BHV_UNDEFINED, 1); break;
+        case '(': token_push(tok, TOKEN_LPAREN, "(", BHV_STACK, 1); break;
+        case ')': token_push(tok, TOKEN_RPAREN, ")", BHV_STACK, 1); break;
+        case ',': token_push(tok, TOKEN_COMMA, ",", BHV_STACK, 1); break;
+        default: token_push(tok, TOKEN_UNKNOWN, buf, BHV_UNDEFINED, 1); break;
+        }
+        cursor++;
    }
-            case TOKEN_FLOAT:
-            {
-                float value = str_to_float(stack.unit[i].text);
-                if (op == 1) {
-                    current *= value;
-                    op = 0;
-                } else if (op == 2) {
-                    current /= value;
-                    op = 0;
-                } else {
-                    current = value;
+    return cursor - start;
+}
+Token tokenize_all(const char *input) {
+    Token tok;
+    token_init(&tok, 8);
+    size_t i = 0;
+    size_t length = strlen(input);
+    while (i < length) {
+        i += read_from_tok(&tok, input, i);
    }
-                break;
-            }
-            case TOKEN_PLUS:
-                result += sign * current;
-                sign = 1;
-                op = 0;
-                break;
-            case TOKEN_MINUS:
-                result += sign * current;
-                sign = -1;
-                op = 0;
-                break;
-            case TOKEN_MUL:
-                op = 1;
-                break;
-            case TOKEN_DIV:
-                op = 2;
-                break;
-            default:
-                break;
-        }
-    }
-    result += sign * current;
-    printf("%f\n", result);
-    for (size_t j = 0; j < stack.size; ++j) {
-        free(stack.unit[j].text);
-    }
-    free(stack.unit);
+    token_push(&tok, TOKEN_EOF, "EOF", BHV_UNDEFINED, 0);
+    return tok;
}
-// int main4() {
-//     char* input = "print(5) hello";
-//     printf("input: %s\n\n", input);
-//
-//     TokenArr arr = tokenize_all(input);
-//
-//     for (size_t j = 0; j < arr.size; ++j) {
-//         Token* result = &arr.unit[j];
-//         printf("text: %s\ntype: %u (%s)\n\n", result->text, result->type, token_type_to_string(result->type));
-//     }
-//
-//     printf("================ Tokenized =================\n");
-//
-//     for (size_t j = 0; j < arr.size; ++j) {
-//         Token* result = &arr.unit[j];
-//         printf("text: %s, type: %u (%s) || ", result->text, result->type, token_type_to_string(result->type));
-//     }
-//     printf("\n");
-//     for (size_t j = 0; j < arr.size; ++j) {
-//         free(arr.unit[j].text);
-//     }
-//     free(arr.unit);
-//     return 0;
-// }
-// int main5(){
-//     char* input = "40/2.3 * 10 + 400";
-//     printf("input: %s\n", input);
-//     mathparser(input);
-//     return 0;
-// }
-int main(int argc, char** argv) {
-    if (argc > 1){
-        char* input = nb_read_file(argv[1]);
-        // printf("Input: %s\n", input);
-        TokenArr toks = tokenize_all(input);
-        parser p = { toks.unit, 0 };
-        ASTNode* root = parse_expression(&p);
-        double result = eval_ast(root);
-        printf("%f\n", result);
-    } else {
-        printf("Usage: %s <file>\n", argv[0]);
+int main() {
+    char *expr = "1 + 2 * 3";
+    Token tokens = tokenize_all(expr);
+    for (size_t i = 0; i < tokens.size; i++) {
+        printf("[%s] \"%s\"\n", token_type_to_string(tokens.type[i]), tokens.text[i]);
    }
+    token_free(&tokens);
    return 0;
}
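Note: unlike the removed TokenArr-based tokenize_all, the new read_from_tok pushes TOKEN_SPACE tokens instead of dropping them, so for the hard-coded expression "1 + 2 * 3" the new main() should print output along these lines (a sketch of the expected run, assuming the code builds exactly as shown above):

[TOKEN_INTEGER] "1"
[TOKEN_SPACE] " "
[TOKEN_PLUS] "+"
[TOKEN_SPACE] " "
[TOKEN_INTEGER] "2"
[TOKEN_SPACE] " "
[TOKEN_MUL] "*"
[TOKEN_SPACE] " "
[TOKEN_INTEGER] "3"
[TOKEN_EOF] "EOF"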

nb.h (8 changed lines)

@@ -86,7 +86,7 @@ void nb_init(nb_arr *newarr, int initial_capacity){
void nb_append(nb_arr *newarr, char *newval){
    if (newarr->value == NULL){
        newarr->capacity =16;
-        if (newarr->capacity > 16 | newarr->arrsize > newarr->capacity) {
+        if ((newarr->capacity > 16) | (newarr->arrsize > newarr->capacity)) {
            newarr->capacity *=2;
        }
        newarr->value = (char**)realloc(newarr->value, sizeof(char*) * newarr->capacity);
@@ -178,9 +178,9 @@ void nb_com(nb_arr *newarr){
}
-void append_c_file(FILE *filepointer){
-}
+// void append_c_file(FILE *filepointer){
+//     filepointer = NULL;
+// }
void nb_copy_file(char* old_file_name, char* new_file_name){ // old name shouldnt be nobuild.c. it should be the name of the current file.
    nb_file old_file;