From d46febe6fcc6a3e78926d9e4b036782868bb95fe Mon Sep 17 00:00:00 2001 From: victor Date: Tue, 29 Apr 2025 01:30:30 +0200 Subject: [PATCH] first lexing --- .project.gf | 5 +- Makefile | 5 +- src/core/string/is_alpha.s | 46 +++++ src/core/string/is_num.s | 8 +- src/core/string/split.s | 2 +- src/inc/asm_output.s | 2 + src/{parse => inc}/expression.s | 8 +- src/inc/lexer.s | 22 +++ src/inc/macros.inc | 7 - src/inc/token.s | 15 ++ src/parse/create_expressions.s | 17 +- src/parse/debug_expression.s | 11 +- src/parse/debug_token.s | 56 +++++- src/parse/lexer.s | 309 +++++++++++++++++++++++++++++++- src/parse/parse.s | 34 ++-- src/start.s | 5 +- test.lang | 3 - 17 files changed, 493 insertions(+), 62 deletions(-) create mode 100644 src/inc/asm_output.s rename src/{parse => inc}/expression.s (61%) create mode 100644 src/inc/lexer.s delete mode 100644 src/inc/macros.inc create mode 100644 src/inc/token.s diff --git a/.project.gf b/.project.gf index 0982c60..32aed5c 100644 --- a/.project.gf +++ b/.project.gf @@ -1,4 +1,3 @@ [executable] -path=/home/victor/git/ctools/lang/debug -arguments= -ask_directory=1 +path=/home/victor/git/lang/debug +arguments=/home/victor/git/lang/test.lang diff --git a/Makefile b/Makefile index d6d1dd3..b4095f0 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ MATHSRC := $(addprefix $(MATHDIR)/, $(addsuffix .s, \ operators \ )) STRSRC := $(addprefix $(STRDIR)/, $(addsuffix .s, \ - strlen split strcpy substr is_num strcmp\ + strlen split strcpy substr is_num strcmp is_alpha \ )) MEMSRC := $(addprefix $(MEMDIR)/, $(addsuffix .s, \ malloc memchr \ @@ -28,7 +28,8 @@ SYSCALLSRC := $(addprefix $(SYSCALLDIR)/, $(addsuffix .s, \ exit file_ops syscall_err\ )) PARSESRC := $(addprefix $(PARSEDIR)/, $(addsuffix .s, \ - parse debug_token create_expressions expression debug_expression \ + parse debug_token create_expressions debug_expression \ + lexer \ )) # Collect all source files - now using the file variables, not directory variables diff --git 
a/src/core/string/is_alpha.s b/src/core/string/is_alpha.s
index e69de29..036fe8d 100644
--- a/src/core/string/is_alpha.s
+++ b/src/core/string/is_alpha.s
@@ -0,0 +1,46 @@
+section .text
+    global is_alpha
+    global is_alpha_str
+
+is_alpha: ; rax: bool (rdi: int c) - 1 if the low byte of c is A-Z or a-z, else 0
+    mov al, dil
+    sub al, 'A'                 ; al = c - 'A': 0..25 for an uppercase letter
+    cmp al, 25
+    jbe .is_alpha_true          ; unsigned <= 25 so 'Z' is accepted (jc stopped at 'Y')
+
+    mov al, dil
+    sub al, 'a'                 ; same range test for lowercase
+    cmp al, 25
+    ja .not_alpha
+
+.is_alpha_true:
+    mov rax, 1
+    ret
+
+.not_alpha:
+    xor rax, rax
+    ret
+
+is_alpha_str: ; rax: bool (rdi: char *) - 1 if every byte before the NUL is a letter
+    xor rcx, rcx                ; rcx = index into the string
+    mov r8, rdi                 ; r8 = string base, dil is reused per character
+
+.loop:
+    mov dil, byte [r8 + rcx]
+    cmp dil, 0
+    je .done                    ; reached the terminator: all bytes passed
+
+    call is_alpha               ; is_alpha only writes rax, so rcx and r8 survive
+    test rax, rax
+    jz .not_alpha
+
+    inc rcx
+    jmp .loop
+
+.done:
+    mov rax, 1                  ; also the result for an empty string
+    ret
+
+.not_alpha:
+    xor rax, rax
+    ret
diff --git a/src/core/string/is_num.s b/src/core/string/is_num.s
index b18a007..9da2599 100644
--- a/src/core/string/is_num.s
+++ b/src/core/string/is_num.s
@@ -3,10 +3,10 @@ section .text
     global is_num_str
 
 is_num: ; rax: bool (rdi: int c)
-    cmp rdi, 48
-    jl not_num
-    cmp rdi, 57
-    jg not_num
+    sub dil, '0'                ; dil = c - '0': 0..9 for a decimal digit
+    cmp dil, 9
+    ja not_num                  ; reject only values above 9 (jnc also rejected '9')
+
     mov rax, 1
     ret
 not_num:
diff --git a/src/core/string/split.s b/src/core/string/split.s
index 60ae01d..7c26ac6 100644
--- a/src/core/string/split.s
+++ b/src/core/string/split.s
@@ -78,7 +78,7 @@ split: ; RAX: char ** split(RDI: char *, RSI: int)
     mov [rbp - 16], rax
     mov rcx, rax
 
-    cmp rbx, 1
+    cmp rbx, 0
     je .no_match
 
     call strlen
diff --git a/src/inc/asm_output.s b/src/inc/asm_output.s
new file mode 100644
index 0000000..36dfdf7
--- /dev/null
+++ b/src/inc/asm_output.s
@@ -0,0 +1,2 @@
+section .data
+    ADD_INST: db "mov eax, esi", 0xa, "add eax, edi", 0xa, 0
diff --git a/src/parse/expression.s b/src/inc/expression.s
similarity index 61%
rename from src/parse/expression.s
rename to src/inc/expression.s
index 5af749c..0fc591a 100644
--- a/src/parse/expression.s
+++ b/src/inc/expression.s
@@ -1,8 +1,10 @@
-section .data
-
-
 %define EXPR_SIZE 32
+%define EXPR_TYPE 0
+%define EXPR_TOK_CNT 8
+%define EXPR_TOK 16
+
+
 
 ; struct
expression size = 32 ; .type ; .tok_count + 8 diff --git a/src/inc/lexer.s b/src/inc/lexer.s new file mode 100644 index 0000000..b91d4dc --- /dev/null +++ b/src/inc/lexer.s @@ -0,0 +1,22 @@ +%define LEX_EXPR_CNT 0 +%define LEX_VAR_CNT 4 +%define LEX_EXPR 8 +%define LEX_VAR 16 +%define LEX_OUT 24 + +%define LEX_SIZE 32 + +%define VAR_NAME 0 +%define VAR_OFFS 8 +%define VAR_SIZE 16 + +; struct var +; .name* 0 +; .stack_off 8 + +; struct lexer +; .expr_cnt 0 +; .var_cnt 4 +; .expr* 8 +; .vars 16 +; .output** 24 diff --git a/src/inc/macros.inc b/src/inc/macros.inc deleted file mode 100644 index 7b43970..0000000 --- a/src/inc/macros.inc +++ /dev/null @@ -1,7 +0,0 @@ -%define EXPR_SIZE 32 - -%define EXPR_TYPE 0 - -%define EXPR_TOK_CNT 8 - -%define EXPR_TOK 16 diff --git a/src/inc/token.s b/src/inc/token.s new file mode 100644 index 0000000..d93645c --- /dev/null +++ b/src/inc/token.s @@ -0,0 +1,15 @@ +%define SIZE_TOK 16 +%define TOK_TYPE 0 +%define TOK_VALUE 8 + +%define TOK_LOAD 0 +%define TOK_VAR 1 +%define TOK_CONST 2 +%define TOK_ADD 3 +%define TOK_SUB 4 +%define TOK_FUNC 5 + +; struct token +; .type 0 +; .value 8 + diff --git a/src/parse/create_expressions.s b/src/parse/create_expressions.s index 7f53f54..07541d9 100644 --- a/src/parse/create_expressions.s +++ b/src/parse/create_expressions.s @@ -1,4 +1,4 @@ -%include "./lang/src/inc/macros.inc" +%include "./src/inc/expression.s" section .text global create_expressions @@ -11,11 +11,13 @@ section .text extern print_expression -create_expressions: ; rax: exp* (rdi: char *filecontent) +create_expressions: ; rax: exp* (rdi: char *filecontent, rsi: *cnt) push rbp mov rbp, rsp sub rsp, 32 ; allocate stack + push rsi + mov rsi, 0x0a call split @@ -55,7 +57,7 @@ create_expressions: ; rax: exp* (rdi: char *filecontent) .splitting_done: ; allocate expressions - mov rax, [expr_size] + mov rax, EXPR_SIZE mul rcx ; rcx contains the amount of splits aka expr count mov rdi, rax call malloc @@ -88,10 +90,8 @@ create_expressions: 
; rax: exp* (rdi: char *filecontent)
     mul rcx
     lea rax, [rbx + rax]
     pop rbx
-    mov rdx, [expr_tok]
-    mov [rax + rdx], rbx
-    mov rdx, [expr_tok_cnt]
-    mov [rax + rdx], rdi
+    mov [rax + EXPR_TOK], rbx
+    mov [rax + EXPR_TOK_CNT], rdi
     inc rcx
     jmp .loop_expressions
 
@@ -114,6 +114,9 @@ create_expressions: ; rax: exp* (rdi: char *filecontent)
     jmp .expr_loop_print
 
 .done:
+    pop rsi
+    mov rdi, [rbp - 24]
+    mov dword [rsi], edi
     mov rax, [rbp - 16]
     add rsp, 32
     mov rsp, rbp
diff --git a/src/parse/debug_expression.s b/src/parse/debug_expression.s
index 29fb021..c0646b8 100644
--- a/src/parse/debug_expression.s
+++ b/src/parse/debug_expression.s
@@ -1,11 +1,10 @@
+%include "./src/inc/expression.s"
+
 section .data
     header: db 0xa, "---------", 0xa, "Expr ", 0xa, "---------", 0
     type: db 0xa, "type: ", 0
 section .text
     extern print_tokens
-    extern expr_type
-    extern expr_tok
-    extern expr_tok_cnt
     extern putendl
     global print_expression
 
@@ -18,10 +17,8 @@ print_expression: ; (rdi: expr*)
     call putendl
     pop rdi
 
-    mov rbx, [expr_tok_cnt]
-    mov rsi, [rdi + rbx]
-    add rdi, [expr_tok] ; tok**
-    mov r12, [rdi] ; r12 = tok*
+    mov rsi, [rdi + EXPR_TOK_CNT]
+    mov r12, [rdi + EXPR_TOK] ; r12 = tok*
     mov rdi, r12
     call print_tokens
 
diff --git a/src/parse/debug_token.s b/src/parse/debug_token.s
index 0ef5f21..b6ddaaa 100644
--- a/src/parse/debug_token.s
+++ b/src/parse/debug_token.s
@@ -1,7 +1,15 @@
+%include "./src/inc/token.s"
+
 section .data
     token: db 0xa, "Token ", 0
     type: db "type = ", 0
     value: db "value = ", 0
+    VAL_CONST: db "const", 0
+    VAL_VAR: db "variable", 0
+    VAL_OP_ADD: db "operator '+'", 0
+    VAL_OP_SUB: db "operator '-'", 0
+    VAL_OP_LOAD: db "operator '='", 0
+    VAL_FUNC: db "function call", 0
 
 section .text
     global print_tokens
@@ -11,11 +19,52 @@ section .text
     extern putnumberendl
     extern get_split_count
 
+; print_token_type(rdi: int type)
+; Passes the description string of a TOK_* type to putendl.
+; A type matching none of the known constants goes to putnumberendl instead.
+print_token_type: ; (rdi: int)
+    cmp rdi, TOK_LOAD
+    je .tok_load
+    cmp rdi, TOK_ADD
+    je .tok_add
+    cmp rdi, TOK_SUB
+    je .tok_sub
+    cmp rdi, TOK_CONST
+    je .tok_const
+    cmp rdi, TOK_VAR
+    je .tok_var
+    cmp rdi, TOK_FUNC
+    je .tok_func
+    call putnumberendl          ; unknown type: show the raw number, not '='
+    ret
 
+.tok_load:
+    mov rdi, VAL_OP_LOAD
+    jmp .print
 
-; struct token
-; .type 0
-; .value +8
+.tok_add:
+    mov rdi, VAL_OP_ADD
+    jmp .print
+
+.tok_sub:
+    mov rdi, VAL_OP_SUB
+    jmp .print
+
+.tok_const:
+    mov rdi, VAL_CONST
+    jmp .print
+
+.tok_var:
+    mov rdi, VAL_VAR
+    jmp .print
+
+.tok_func:
+    mov rdi, VAL_FUNC
+    jmp .print
+
+.print:
+    call putendl
+    ret
 
 print_tokens: ; (rdi: tok*, rsi: tok_count)
     push rbp
@@ -49,9 +98,9 @@ print_tokens: ; (rdi: tok*, rsi: tok_count)
     mul r12
     mov rbx, [rbp - 8]
     lea r13, [rbx + rax]
-    mov rdi, [r13]
+    mov rdi, [r13 + TOK_TYPE]
     push rax
-    call putnumberendl
+    call print_token_type
 
     mov rdi, value
     call putstr
@@ -59,7 +108,7 @@ print_tokens: ; (rdi: tok*, rsi: tok_count)
 
     mov rbx, [rbp - 8]
     lea r13, [rbx + rax]
-    mov rdi, [r13 + 8]
+    mov rdi, [r13 + TOK_VALUE]
     call putendl
 
     mov rcx, r12
diff --git a/src/parse/lexer.s b/src/parse/lexer.s
index 4689d9e..bd03412 100644
--- a/src/parse/lexer.s
+++ b/src/parse/lexer.s
@@ -1,8 +1,326 @@
-section .text
+%include "./src/inc/token.s"
+%include "./src/inc/lexer.s"
+%include "./src/inc/expression.s"
+%include "./src/inc/asm_output.s"
 
-; struct lexer
-; .cnt 0
-; .expr* 8
+%define LEX_ERROR 0xa, "[LEX_ERROR] "
+
+section .data
+    EEXPECT: db LEX_ERROR, "expected: ", 0
+
+    MOV: db "mov ", 0
+    OPEN_STACK_VAR: db "[rbp - ", 0
+    CLOSE_STACK_VAR: db "], ", 0
+
+
+section .text
+    extern malloc
+    extern err_malloc
+    extern exit
+    extern putstr
+    extern create_expressions
+    extern strcmp
+    extern VAL_OP_LOAD
+    extern putchar
+    extern putnumber
+    extern putendl
+
+; lex_eexpect(rdi: char *expected)
+; Prints "[LEX_ERROR] expected: " followed by the string in rdi, then
+; calls exit with status 1. The caller in this file passes a description
+; string (VAL_OP_LOAD), so rdi is a string pointer, not a token type.
+lex_eexpect: ; (rdi: char *expected)
+    push rdi
+    mov rdi, EEXPECT
+    call putstr
+    pop rdi
+    call putstr
+    mov rdi, 1
+    call exit
+
+
+; count_vars(rdi: lex*)
+; Counts every TOK_VAR token over all expressions of the lexer and stores
+; the total in lex->var_cnt. A repeated name is counted once per occurrence.
+; rdi is left unchanged; rbx and r12 are saved and restored.
+count_vars: ; rdi: lex*
+    push rbp
+    mov rbp, rsp
+    sub rsp, 16
+    push rbx
+    push r12
+    xor r12, r12                    ; r12 = running TOK_VAR count
+
+    mov rbx, [rdi + LEX_EXPR]       ; rbx = expression array
+
+    xor rcx, rcx
+    push rcx                        ; expression index lives on the stack
+.loop_expr:
+    pop rcx
+    cmp ecx, dword [rdi + LEX_EXPR_CNT]
+    je .done
+    mov rax, EXPR_SIZE
+    mul rcx
+    lea rax, [rbx + rax]            ; rax = &expr[rcx]
+    mov rdx, [rax + EXPR_TOK_CNT]   ; loaded after mul, which clobbers rdx
+    inc rcx
+    push rcx                        ; save the next expression index
+    xor rcx, rcx                    ; rcx = token index
+    mov rax, [rax + EXPR_TOK]       ; rax = token cursor
+.loop_toks:
+    cmp rcx, rdx
+    je .loop_expr
+    cmp qword [rax + TOK_TYPE], TOK_VAR
+    jne .no_var
+    inc r12
+.no_var:
+    inc rcx
+    add rax, SIZE_TOK
+    jmp .loop_toks
+
+.done:
+    mov dword [rdi + LEX_VAR_CNT], r12d
+    pop r12
+    pop rbx
+    mov rsp, rbp
+    pop rbp
+    ret
+
+; get_vars(rdi: lex*)
+; Allocates lex->vars with one VAR_SIZE entry per TOK_VAR token and fills
+; each entry with the token's name pointer and a stack offset (8, 16, ...).
+; Jumps to err_malloc when malloc returns 0.
+get_vars: ; (rdi: lex*)
+    push rbp
+    mov rbp, rsp
+    sub rsp, 32
+    push rbx
+    push r12
+
+    call count_vars                 ; sets lex->var_cnt, keeps rdi
+
+    mov [rbp - 24], rdi             ; store lex
+    mov eax, dword [rdi + LEX_VAR_CNT]
+
+    mov rdi, VAR_SIZE
+    mul rdi
+    mov rdi, rax                    ; bytes = var_cnt * VAR_SIZE
+    call malloc
+    cmp rax, 0
+    je err_malloc
+
+    mov rdi, [rbp - 24]
+    mov [rdi + LEX_VAR], rax
+
+    mov rax, [rdi + LEX_EXPR]
+    mov [rbp - 16], rax             ; expression array
+
+    ; Counters are cleared only after malloc: r9 is caller-saved and may
+    ; not survive the call.
+    xor r12, r12                    ; r12 = byte offset of the next free var slot
+    xor r9, r9                      ; r9 = number of slots filled so far
+
+    xor rcx, rcx
+    push rcx                        ; expression index lives on the stack
+.loop_expr:
+    pop rcx
+    cmp ecx, dword [rdi + LEX_EXPR_CNT]
+    je .done
+    mov rax, EXPR_SIZE
+    mul rcx
+    mov rbx, [rbp - 16]
+    lea rax, [rbx + rax]            ; rax = &expr[rcx]
+    mov rdx, [rax + EXPR_TOK_CNT]
+    inc rcx
+    push rcx
+    xor rcx, rcx
+    mov rax, [rax + EXPR_TOK]       ; rax = token cursor
+
+.loop_toks:
+    cmp rcx, rdx
+    je .loop_expr
+    cmp qword [rax + TOK_TYPE], TOK_VAR
+    jne .skip_alloc
+    mov rbx, [rbp - 24]
+    mov rbx, [rbx + LEX_VAR]
+    lea rbx, [rbx + r12]            ; rbx = next free var entry
+    mov r8, [rax + TOK_VALUE]
+    mov [rbx + VAR_NAME], r8
+    inc r9
+    lea r8, [r9 * 8]                ; offset = 8 * slot number; rax and rdx stay intact
+    mov [rbx + VAR_OFFS], r8
+    add r12, VAR_SIZE               ; advance to the next slot
+
+.skip_alloc:
+    add rax, SIZE_TOK
+    inc rcx
+    jmp .loop_toks
+
+.done:
+    pop r12
+    pop rbx
+    mov rsp, rbp
+    pop rbp
+    ret
 
     global lex
-lex: ; (rdi: expr*, rsi: cnt)
+; lex(rdi: char *file_content) -> rax: lex*
+; Allocates a lexer, builds its expressions with create_expressions,
+; collects variables with get_vars and runs lex_assignment on every
+; expression. Returns the lexer pointer.
+lex: ; rax: lex* (rdi: char *file_content)
+    push rbp
+    mov rbp, rsp
+    sub rsp, 32
+    push rbx
+
+    mov [rbp - 8], rdi
+
+    ; allocate lexer
+    mov rdi, LEX_SIZE
+    call malloc
+    cmp rax, 0
+    je err_malloc
+    mov [rbp - 24], rax             ; store lex on stack
+
+    lea rsi, [rbp - 16]             ; int* expr_cnt
+    mov rdi, [rbp - 8]              ; restore file_content
+
+    call create_expressions
+
+    mov rdi, [rbp - 24]
+    mov [rdi + LEX_EXPR], rax
+    mov eax, dword [rbp - 16]       ; create_expressions stores the count as a dword
+    mov dword [rdi + LEX_EXPR_CNT], eax
+
+    call get_vars
+
+    xor rcx, rcx
+
+.process_expressions:
+    mov rdi, [rbp - 24]
+    cmp ecx, dword [rdi + LEX_EXPR_CNT]
+    je .done
+
+    mov rbx, [rdi + LEX_EXPR]
+    mov rax, EXPR_SIZE
+    mul rcx
+    push rcx
+    mov rdi, [rbx + rax + EXPR_TOK]
+    mov rsi, [rbx + rax + EXPR_TOK_CNT] ; n = token count of this expression
+    mov rdx, [rbp - 24]
+    call lex_assignment
+    pop rcx
+
+    inc rcx
+
+    jmp .process_expressions
+.done:
+    mov rax, [rbp - 24]             ; return the lexer as the signature states
+    pop rbx
+    mov rsp, rbp
+    pop rbp
+    ret
+
+; lex_assignment(rdi: tok*, rsi: n, rdx: lex*) -> rax: bool
+; Handles "<var> = <operand>". Prints "mov [rbp - <offset>], " for the
+; target variable and, for a constant operand, passes the constant to putendl.
+; Returns 1 when the operand is a constant or a variable, 0 when the first
+; token is not a variable, the operand is missing or its type is not
+; handled. A missing or different second token ends in lex_eexpect.
+lex_assignment: ; (rdi: tok*, rsi: n, rdx: lex*)
+    push rbp
+    mov rbp, rsp
+    sub rsp, 32
+
+    mov [rbp - 16], rdi             ; store tok array
+    mov [rbp - 32], rsi             ; token count
+    mov eax, dword [rdx + LEX_VAR_CNT]
+    mov [rbp - 8], rax              ; var_cnt, zero-extended to 64 bits
+    mov rax, [rdx + LEX_VAR]
+    mov [rbp - 24], rax             ; vars
+
+    test rsi, rsi                   ; no tokens: nothing to read
+    jz .done_false
+
+    ; check first token: if not TOK_VAR, it can't be an assignment
+    cmp qword [rdi + TOK_TYPE], TOK_VAR
+    jne .done_false
+
+    mov rsi, [rdi + TOK_VALUE]
+    mov rdi, [rbp - 24]
+    mov rdx, [rbp - 8]
+    call look_up_var
+    push rax                        ; keep the offset across the putstr calls
+
+    mov rdi, MOV
+    call putstr
+    mov rdi, OPEN_STACK_VAR
+    call putstr
+    pop rdi
+    call putnumber
+    mov rdi, CLOSE_STACK_VAR
+    call putstr
+
+    mov rdi, [rbp - 16]             ; reload tok array; rdi was reused above
+    cmp qword [rbp - 32], 2
+    jb .err_found                   ; no second token, so '=' is missing
+    add rdi, SIZE_TOK
+    cmp qword [rdi + TOK_TYPE], TOK_LOAD
+    jne .err_found
+
+    cmp qword [rbp - 32], 3
+    jb .done_false                  ; no operand token to read
+    add rdi, SIZE_TOK
+    mov rdx, [rdi + TOK_TYPE]
+    cmp rdx, TOK_CONST
+    je .print_const
+    cmp rdx, TOK_VAR
+    je .done_true
+
+.done_false:
+    xor eax, eax
+    mov rsp, rbp                    ; unwind the frame on every path before ret
+    pop rbp
+    ret
+
+.print_const:
+    mov rdi, [rdi + TOK_VALUE]
+    call putendl
+
+.done_true:
+    mov eax, 1
+    mov rsp, rbp
+    pop rbp
+    ret
+
+.err_found:
+    mov rdi, VAL_OP_LOAD
+    call lex_eexpect
+
+; look_up_var(rdi: vars*, rsi: name*, rdx: n) -> rax: stack offset
+; Scans n entries for one whose name pointer equals rsi (pointer
+; comparison, not a string compare). Returns that entry's VAR_OFFS, or 0
+; when no entry matches.
+look_up_var: ; rax: offset or 0 (rdi: vars*, rsi: name*, rdx: n)
+    push rbp
+    mov rbp, rsp
+    xor rax, rax
+
+    xor rcx, rcx
+
+.loop_vars:
+    cmp rcx, rdx
+    je .done
+    cmp [rdi + VAR_NAME], rsi
+    je .found
+    inc rcx
+    add rdi, VAR_SIZE
+    jmp .loop_vars
+
+.found:
+    mov rax, [rdi + VAR_OFFS]
+.done:
+    mov rsp, rbp
+    pop rbp
+    ret
diff --git a/src/parse/parse.s b/src/parse/parse.s
index fd8d63e..4cc6e84 100644 --- a/src/parse/parse.s +++ b/src/parse/parse.s @@ -1,11 +1,24 @@ -%define SIZE_TOK 16 -%define TOK_ASSIGN 0 -%define TOK_ADD 1 -%define TOK_PRINT 2 -%define TOK_VAR 3 -%define TOK_CONST 4 +%include "./src/inc/token.s" section .data + global VAL_CONST + VAL_CONST: db "const", 0 + + global VAL_VAR + VAL_VAR: db "variable", 0 + + global VAL_OP_ADD + VAL_OP_ADD: db "operator '+'", 0 + + global VAL_OP_SUB + VAL_OP_SUB: db "operator '-'", 0 + + global VAL_OP_LOAD + VAL_OP_LOAD: db "operator '='", 0 + + global VAL_FUNC + VAL_FUNC: db "function call", 0 + OP_ASSIGN: db "=", 0 OP_ADD: db "+", 0 OP_PRINT: db "print", 0 @@ -21,11 +34,6 @@ section .text extern get_split_count -; struct token -; .type 0 -; .value +8 - - token_alloc: ; rax: tok* (rdi: int cnt) mov rax, rdi mov rdi, SIZE_TOK @@ -81,7 +89,7 @@ parse: ; rax: tok* (rdi: char**) .is_assign: pop rcx push rdi - mov rdi, TOK_ASSIGN + mov rdi, TOK_LOAD jmp .set_token .is_add: @@ -93,7 +101,7 @@ parse: ; rax: tok* (rdi: char**) .is_print: pop rcx push rdi - mov rdi, TOK_PRINT + mov rdi, TOK_FUNC jmp .set_token .is_const: diff --git a/src/start.s b/src/start.s index b237b6a..eb9a83b 100644 --- a/src/start.s +++ b/src/start.s @@ -23,6 +23,7 @@ section .text extern err_malloc extern get_file_content extern create_expressions + extern lex print_usage: mov rdi, usage @@ -40,9 +41,11 @@ _start: call get_file_content mov rdi, rax - call create_expressions mov [rbp - 8], rax + mov rdi, rax + call lex + mov rsp, rbp pop rbp diff --git a/test.lang b/test.lang index 60bfbb4..6f4c91b 100644 --- a/test.lang +++ b/test.lang @@ -1,4 +1 @@ a = 5 -b = a + 6 -print b -b = a + a