military grade tokenizer

2025-09-23 16:17:21 +03:00
parent e1727a46e3
commit b1f1b80142
2 changed files with 166 additions and 492 deletions

lexer.c (594 changed lines)

@@ -1,26 +1,13 @@
#include <assert.h>
#include <ctype.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <sys/types.h>
#include <stdbool.h>
-#define NB_IMPLEMENTATION
-#include "nb.h"
-int str_to_int(char *strint){
-    int new_int = atoi(strint);
-    return new_int;
-}
-float str_to_float(char *strif){
-    char *fptr;
-    float new_int = strtof(strif, &fptr);
-    return new_int;
-}
-typedef enum{
+typedef enum {
    TOKEN_PLUS,
    TOKEN_MINUS,
    TOKEN_INTEGER,
@@ -37,7 +24,7 @@ typedef enum{
    TOKEN_COMMA
} symbols;
-typedef enum{
+typedef enum {
    BHV_STACK,
    BHV_UNDEFINED,
    BHV_NUMBER,
@@ -45,346 +32,89 @@ typedef enum{
    BHV_FLOAT,
} symbol_bhv;
-typedef struct{
-    symbols type;
-    char* text;
-    size_t text_len;
-    symbol_bhv behaviour;
-    uint cursor_skip;
-    symbols previous_token;
+typedef struct {
+    symbols *type;
+    char **text;
+    size_t *text_len;
+    symbol_bhv *behaviour;
+    unsigned int *cursor_skip;
+    symbols *previous_token;
+    size_t capacity;
+    size_t size;
} Token;
-typedef struct{
-    Token* unit;
-    size_t size;
-    size_t capacity;
-} TokenArr;
-typedef struct{
-    char *content;
-    // size_t cursor;
-    // size_t line;
-} Lexer;
-typedef enum{
-    AST_NUMBER,
-    AST_BINARY_OP,
-} ASTNodeType;
-typedef struct ASTNode ASTNode;
-struct ASTNode {
-    ASTNodeType type;
-    union {
-        struct { double value; } number;
-        struct {
-            char op;
-            ASTNode* left;
-            ASTNode* right;
-        } binary;
-        struct {
-            char *name;
-            ASTNode** args;
-            size_t arg_count;
-        } func_call;
-    } data;
-};
-typedef struct{
-    Token* tokens;
-    size_t cursor;
-} parser;
-// Lexer
-void lexer_new(char *content, size_t content_len){
-    (void) content;
-    (void) content_len;
-}
-// Token
-void lexer_next(Lexer *mylexer){
-    (void) mylexer;
+void token_init(Token *tok, size_t capacity) {
+    tok->capacity = capacity;
+    tok->size = 0;
+    tok->type = malloc(sizeof(symbols) * capacity);
+    tok->text = malloc(sizeof(char *) * capacity);
+    tok->text_len = malloc(sizeof(size_t) * capacity);
+    tok->behaviour = malloc(sizeof(symbol_bhv) * capacity);
+    tok->cursor_skip = malloc(sizeof(unsigned int) * capacity);
+    tok->previous_token = malloc(sizeof(symbols) * capacity);
+    assert(tok->type && tok->text && tok->text_len &&
+           tok->behaviour && tok->cursor_skip && tok->previous_token);
}
-Token parser_peek(parser* p){
-    return p->tokens[p->cursor];
+void token_grow(Token *tok) {
+    size_t new_capacity = (tok->capacity == 0 ? 8 : tok->capacity * 2);
+    tok->type = realloc(tok->type, new_capacity * sizeof(symbols));
+    tok->text = realloc(tok->text, new_capacity * sizeof(char *));
+    tok->text_len = realloc(tok->text_len, new_capacity * sizeof(size_t));
+    tok->behaviour = realloc(tok->behaviour, new_capacity * sizeof(symbol_bhv));
+    tok->cursor_skip = realloc(tok->cursor_skip, new_capacity * sizeof(unsigned int));
+    tok->previous_token = realloc(tok->previous_token, new_capacity * sizeof(symbols));
+    assert(tok->type && tok->text && tok->text_len &&
+           tok->behaviour && tok->cursor_skip && tok->previous_token);
+    tok->capacity = new_capacity;
}
-Token parser_advance(parser* p){
-    return p->tokens[p->cursor++];
+void token_push(Token *tok, symbols type, const char *text,
+                symbol_bhv behaviour, size_t cursor_skip) {
+    if (tok->size >= tok->capacity) {
+        token_grow(tok);
+    }
+    size_t i = tok->size;
+    tok->type[i] = type;
+    tok->text[i] = strdup(text);
+    tok->text_len[i] = strlen(text);
+    tok->behaviour[i] = behaviour;
+    tok->cursor_skip[i] = cursor_skip;
+    if (i > 0)
+        tok->previous_token[i] = tok->type[i - 1];
+    else
+        tok->previous_token[i] = TOKEN_UNKNOWN;
+    tok->size++;
}
-bool parser_match(parser* p, symbols tokent){
-    if (parser_peek(p).type == tokent){
-        parser_advance(p);
-        return true;
-    } else {
-        return false;
+void token_free(Token *tok) {
+    for (size_t i = 0; i < tok->size; i++) {
+        free(tok->text[i]);
    }
-}
-ASTNode* ast_new_number(double val){
-    ASTNode* node = malloc(sizeof(ASTNode));
-    node->type = AST_NUMBER;
-    node->data.number.value = val;
-    return node;
-}
-ASTNode* ast_new_binary(char op, ASTNode* l, ASTNode* r){
-    ASTNode* node = malloc(sizeof(ASTNode));
-    node->type = AST_BINARY_OP;
-    node->data.binary.op = op;
-    node->data.binary.left = l;
-    node->data.binary.right = r;
-    // maybe need to fix
-    return node;
-}
-ASTNode* parse_factor(parser* p) {
-    Token tok = parser_peek(p);
-    if (tok.type == TOKEN_EOF){
-        fprintf(stderr, "Unexpected end of input in factor\n");
-        exit(EXIT_FAILURE);
-    }
-    if (tok.type == TOKEN_INTEGER || tok.type == TOKEN_FLOAT){
-        parser_advance(p);
-        double v = atof(tok.text);
-        return ast_new_number(v);
-    }
-    // if (tok.type == TOKEN_STRING){
-    //     parser_advance(p);
-    //     char* func_name = tok.text;
-    //     if (parser_match(p, TOKEN_LPAREN)){
-    //         size_t argc_count = 0;
-    //
-    //     }
-    // }
-    fprintf(stderr, "Unexpected token '%s' in factor\n", tok.text);
-    exit(EXIT_FAILURE);
-}
-ASTNode* parse_term(parser* p) {
-    ASTNode* node = parse_factor(p);
-    while (true) {
-        Token tok = parser_peek(p);
-        if (tok.type == TOKEN_MUL || tok.type == TOKEN_DIV) {
-            parser_advance(p);
-            ASTNode* right = parse_factor(p);
-            node = ast_new_binary(tok.text[0], node, right);
-        } else {
-            break;
-        }
-    }
-    return node;
-}
-ASTNode* parse_expression(parser* p) {
-    ASTNode* node = parse_term(p);
-    while (true) {
-        Token tok = parser_peek(p);
-        if (tok.type == TOKEN_PLUS || tok.type == TOKEN_MINUS) {
-            parser_advance(p);
-            ASTNode* right = parse_term(p);
-            node = ast_new_binary(tok.text[0], node, right);
-        } else {
-            break;
-        }
-    }
-    return node;
-}
-double eval_ast(ASTNode* node) {
-    if (node->type == AST_NUMBER) {
-        return node->data.number.value;
-    }
-    double L = eval_ast(node->data.binary.left);
-    double R = eval_ast(node->data.binary.right);
-    switch (node->data.binary.op) {
-        case '+': return L + R;
-        case '-': return L - R;
-        case '*': return L * R;
-        case '/': return L / R;
-        default:
-            fprintf(stderr, "Unknown op '%c'\n", node->data.binary.op);
-            exit(EXIT_FAILURE);
-    }
-}
-Token read_from_tok(char* text, uint cursor){
-    Token mytoks;
-    static char buf[64];
-    size_t i = 0;
-    mytoks.cursor_skip = 1;
-    if (isdigit(text[cursor])) {
-        size_t start = cursor;
-        int dots_seen = 0;
-        while ( isdigit(text[cursor]) || text[cursor] == '.') {
-            if (text[cursor] == '.') {
-                dots_seen +=1;
-                assert(dots_seen < 2);
-            }
-            buf[i++] = text[cursor++];
-        }
-        buf[i] = '\0';
-        if (!dots_seen){
-            mytoks.type = TOKEN_INTEGER;
-            mytoks.behaviour = BHV_NUMBER;
-        } else {
-            mytoks.type = TOKEN_FLOAT;
-            mytoks.behaviour = BHV_FLOAT;
-        }
-        mytoks.cursor_skip = cursor - start;
-        mytoks.text = strdup(buf);
-        mytoks.text_len = i;
-    }
-    else if (isalpha(text[cursor])){
-        size_t start = cursor;
-        while (isalpha(text[cursor])) {
-            buf[i++] = text[cursor++];
-        }
-        buf[i] = '\0';
-        mytoks.type = TOKEN_STRING;
-        mytoks.behaviour = BHV_STRING;
-        mytoks.cursor_skip = cursor - start;
-        mytoks.text = strdup(buf);
-        mytoks.text_len = i;
-    }
-    else {
-        buf[0] = text[cursor];
-        buf[1] = '\0';
-        switch (text[cursor])
-        {
-            case '+':
-                mytoks.type = TOKEN_PLUS;
-                mytoks.text = strdup("+");
-                mytoks.behaviour = BHV_STACK;
-                break;
-            case '-':
-                mytoks.type = TOKEN_MINUS;
-                mytoks.text = strdup("-");
-                mytoks.behaviour = BHV_STACK;
-                break;
-            case ' ':
-                mytoks.type = TOKEN_SPACE;
-                mytoks.text = strdup("space");
-                break;
-            case '*':
-                mytoks.type = TOKEN_MUL;
-                mytoks.text = strdup("*");
-                mytoks.behaviour = BHV_STACK;
-                break;
-            case '/':
-                mytoks.type = TOKEN_DIV;
-                mytoks.text = strdup("/");
-                mytoks.behaviour = BHV_STACK;
-                break;
-            case '\n':
-                mytoks.type = TOKEN_NEWLINE;
-                mytoks.text = strdup("newline");
-                mytoks.cursor_skip = 1;
-                break;
-            case '(':
-                mytoks.type = TOKEN_LPAREN;
-                mytoks.text = strdup("(");
-                mytoks.behaviour = BHV_STACK;
-                break;
-            case ')':
-                mytoks.type = TOKEN_RPAREN;
-                mytoks.text = strdup(")");
-                mytoks.behaviour = BHV_STACK;
-                break;
-            case ',':
-                mytoks.type = TOKEN_COMMA;
-                mytoks.text = strdup(",");
-                mytoks.behaviour = BHV_STACK;
-                break;
-            default:
-                mytoks.type = TOKEN_UNKNOWN;
-                mytoks.behaviour = BHV_UNDEFINED;
-                mytoks.text = strdup(buf);
-        }
-    }
-    return mytoks;
+    free(tok->type);
+    free(tok->text);
+    free(tok->text_len);
+    free(tok->behaviour);
+    free(tok->cursor_skip);
+    free(tok->previous_token);
}
-void tokenarr_push(TokenArr* arr, Token tok) {
-    if (arr->size >= arr->capacity) {
-        arr->capacity = arr->capacity ? arr->capacity * 2 : 8;
-        arr->unit = realloc(arr->unit, arr->capacity * sizeof(Token));
-        assert(arr->unit != NULL);
-    }
-    arr->unit[arr->size++] = tok;
-}
-TokenArr tokenize_all(const char* input) {
-    TokenArr arr = {NULL, 0, 0};
-    size_t i = 0;
-    size_t len = strlen(input);
-    while (i < len) {
-        Token tok = read_from_tok((char*)input, i);
-        i += tok.cursor_skip;
-        if (tok.type == TOKEN_SPACE || tok.type == TOKEN_NEWLINE) {
-            free(tok.text);
-            continue;
-        }
-        tokenarr_push(&arr, tok);
-    }
-    Token eof = {0};
-    eof.type = TOKEN_EOF;
-    eof.text = strdup("EOF");
-    eof.text_len = 3;
-    eof.behaviour = BHV_UNDEFINED;
-    eof.cursor_skip = 0;
-    tokenarr_push(&arr, eof);
-    return arr;
-}
-// Token* c
-void token_parser(Token mytok, char* input){
-    int length1 = strlen(input);
-    int i=0;
-    while (i < length1) {
-        mytok = read_from_tok(input, i);
-        printf("Text: %s\n", mytok.text);
-        printf("Behaviour: %d\n", mytok.behaviour);
-        if (mytok.behaviour == BHV_STACK){
-            printf("this is stack lil bro\n");
-        }
-        i++;
-    }
-}
-// operators accepted in int/digit or whatever type def only when they have a digit before AND after them
-/*
-int main(){
-    Token newtok;
-    char* input = "8";
-    parser(newtok, input);
-}
-*/
-char* token_type_to_string(symbols type) {
+int str_to_int(char *strint) { return atoi(strint); }
+float str_to_float(char *strif) { return strtof(strif, NULL); }
+char *token_type_to_string(symbols type) {
    switch (type) {
    case TOKEN_PLUS: return "TOKEN_PLUS";
    case TOKEN_MINUS: return "TOKEN_MINUS";
@@ -392,144 +122,88 @@ char* token_type_to_string(symbols type) {
    case TOKEN_FLOAT: return "TOKEN_FLOAT";
    case TOKEN_SPACE: return "TOKEN_SPACE";
    case TOKEN_STRING: return "TOKEN_STRING";
    case TOKEN_MUL: return "TOKEN_MUL";
    case TOKEN_DIV: return "TOKEN_DIV";
    case TOKEN_LPAREN: return "TOKEN_LPAREN";
    case TOKEN_RPAREN: return "TOKEN_RPAREN";
    case TOKEN_COMMA: return "TOKEN_COMMA";
    case TOKEN_EOF: return "TOKEN_EOF";
    case TOKEN_NEWLINE: return "TOKEN_NEWLINE";
    case TOKEN_UNKNOWN: return "TOKEN_UNKNOWN";
    default: return "UNKNOWN_SYMBOL";
    }
}
-// void main2() {
-//     char* input = "323.23 + Hello world 102102";
-//     int length1 = strlen(input);
-//     int i = 0;
-//     printf("input: %s\n\n", input);
-//     while (i < length1) {
-//         Token result = read_from_tok(input, i);
-//         printf("text: %s\ntype: %u (%s)\n\n", result.text, result.type, token_type_to_string(result.type));
-//         i += result.cursor_skip;
-//     }
-// }
-void mathparser(const char* input) {
-    TokenArr stack = tokenize_all(input);
-    float result = 0;
-    float current = 0;
-    float sign = 1;
-    float op = 0;
-    for (size_t i = 0; i < stack.size; ++i) {
-        switch (stack.unit[i].type) {
-            case TOKEN_INTEGER:
-            {
-                float value = str_to_float(stack.unit[i].text);
-                if (op == 1) {
-                    current *= value;
-                    op = 0;
-                } else if (op == 2) {
-                    current /= value;
-                    op = 0;
+size_t read_from_tok(Token *tok, const char *input, size_t cursor) {
+    char buf[64];
+    size_t start = cursor;
+    size_t i = 0;
+    if (isdigit(input[cursor])) {
+        int dots_seen = 0;
+        while (isdigit(input[cursor]) || input[cursor] == '.') {
+            if (input[cursor] == '.') dots_seen++;
+            buf[i++] = input[cursor++];
+        }
+        buf[i] = '\0';
+        if (dots_seen == 0) {
+            token_push(tok, TOKEN_INTEGER, buf, BHV_NUMBER, cursor - start);
        } else {
-                    current = value;
+            token_push(tok, TOKEN_FLOAT, buf, BHV_FLOAT, cursor - start);
        }
-                break;
+    } else if (isalpha(input[cursor])) {
+        while (isalpha(input[cursor])) {
+            buf[i++] = input[cursor++];
+        }
+        buf[i] = '\0';
+        token_push(tok, TOKEN_STRING, buf, BHV_STRING, cursor - start);
+    } else {
+        buf[0] = input[cursor];
+        buf[1] = '\0';
+        switch (input[cursor]) {
+        case '+': token_push(tok, TOKEN_PLUS, "+", BHV_STACK, 1); break;
+        case '-': token_push(tok, TOKEN_MINUS, "-", BHV_STACK, 1); break;
+        case '*': token_push(tok, TOKEN_MUL, "*", BHV_STACK, 1); break;
+        case '/': token_push(tok, TOKEN_DIV, "/", BHV_STACK, 1); break;
+        case ' ': token_push(tok, TOKEN_SPACE, " ", BHV_UNDEFINED, 1); break;
+        case '\n': token_push(tok, TOKEN_NEWLINE, "\\n", BHV_UNDEFINED, 1); break;
+        case '(': token_push(tok, TOKEN_LPAREN, "(", BHV_STACK, 1); break;
+        case ')': token_push(tok, TOKEN_RPAREN, ")", BHV_STACK, 1); break;
+        case ',': token_push(tok, TOKEN_COMMA, ",", BHV_STACK, 1); break;
+        default: token_push(tok, TOKEN_UNKNOWN, buf, BHV_UNDEFINED, 1); break;
+        }
+        cursor++;
    }
-            case TOKEN_FLOAT:
-            {
-                float value = str_to_float(stack.unit[i].text);
-                if (op == 1) {
-                    current *= value;
-                    op = 0;
-                } else if (op == 2) {
-                    current /= value;
-                    op = 0;
-                } else {
-                    current = value;
+    return cursor - start;
+}
+Token tokenize_all(const char *input) {
+    Token tok;
+    token_init(&tok, 8);
+    size_t i = 0;
+    size_t length = strlen(input);
+    while (i < length) {
+        i += read_from_tok(&tok, input, i);
    }
-                break;
-            }
-            case TOKEN_PLUS:
-                result += sign * current;
-                sign = 1;
-                op = 0;
-                break;
-            case TOKEN_MINUS:
-                result += sign * current;
-                sign = -1;
-                op = 0;
-                break;
-            case TOKEN_MUL:
-                op = 1;
-                break;
-            case TOKEN_DIV:
-                op = 2;
-                break;
-            default:
-                break;
-        }
-    }
-    result += sign * current;
-    printf("%f\n", result);
-    for (size_t j = 0; j < stack.size; ++j) {
-        free(stack.unit[j].text);
-    }
-    free(stack.unit);
+    token_push(&tok, TOKEN_EOF, "EOF", BHV_UNDEFINED, 0);
+    return tok;
}
-// int main4() {
-//     char* input = "print(5) hello";
-//     printf("input: %s\n\n", input);
-//
-//     TokenArr arr = tokenize_all(input);
-//
-//     for (size_t j = 0; j < arr.size; ++j) {
-//         Token* result = &arr.unit[j];
-//         printf("text: %s\ntype: %u (%s)\n\n", result->text, result->type, token_type_to_string(result->type));
-//     }
-//
-//     printf("================ Tokenized =================\n");
-//
-//     for (size_t j = 0; j < arr.size; ++j) {
-//         Token* result = &arr.unit[j];
-//         printf("text: %s, type: %u (%s) || ", result->text, result->type, token_type_to_string(result->type));
-//     }
-//     printf("\n");
-//     for (size_t j = 0; j < arr.size; ++j) {
-//         free(arr.unit[j].text);
-//     }
-//     free(arr.unit);
-//     return 0;
-// }
-// int main5(){
-//     char* input = "40/2.3 * 10 + 400";
-//     printf("input: %s\n", input);
-//     mathparser(input);
-//     return 0;
-// }
-int main(int argc, char** argv) {
-    if (argc > 1){
-        char* input = nb_read_file(argv[1]);
-        // printf("Input: %s\n", input);
-        TokenArr toks = tokenize_all(input);
-        parser p = { toks.unit, 0 };
-        ASTNode* root = parse_expression(&p);
-        double result = eval_ast(root);
-        printf("%f\n", result);
-    } else {
-        printf("Usage: %s <file>\n", argv[0]);
+int main() {
+    char *expr = "1 + 2 * 3";
+    Token tokens = tokenize_all(expr);
+    for (size_t i = 0; i < tokens.size; i++) {
+        printf("[%s] \"%s\"\n", token_type_to_string(tokens.type[i]), tokens.text[i]);
    }
+    token_free(&tokens);
    return 0;
}
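Note: unlike the removed TokenArr-based tokenize_all, the new read_from_tok pushes TOKEN_SPACE tokens instead of dropping them, so for the hard-coded expression "1 + 2 * 3" the new main() should print output along these lines (a sketch of the expected run, assuming the code builds exactly as shown above):

[TOKEN_INTEGER] "1"
[TOKEN_SPACE] " "
[TOKEN_PLUS] "+"
[TOKEN_SPACE] " "
[TOKEN_INTEGER] "2"
[TOKEN_SPACE] " "
[TOKEN_MUL] "*"
[TOKEN_SPACE] " "
[TOKEN_INTEGER] "3"
[TOKEN_EOF] "EOF"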

nb.h (8 changed lines)

@@ -86,7 +86,7 @@ void nb_init(nb_arr *newarr, int initial_capacity){
void nb_append(nb_arr *newarr, char *newval){
    if (newarr->value == NULL){
        newarr->capacity =16;
-        if (newarr->capacity > 16 | newarr->arrsize > newarr->capacity) {
+        if ((newarr->capacity > 16) | (newarr->arrsize > newarr->capacity)) {
            newarr->capacity *=2;
        }
        newarr->value = (char**)realloc(newarr->value, sizeof(char*) * newarr->capacity);
@@ -178,9 +178,9 @@ void nb_com(nb_arr *newarr){
}
-void append_c_file(FILE *filepointer){
-}
+// void append_c_file(FILE *filepointer){
+//     filepointer = NULL;
+// }
void nb_copy_file(char* old_file_name, char* new_file_name){ // old name shouldnt be nobuild.c. it should be the name of the current file.
    nb_file old_file;