1 /* A minimal FORTH interpreter for Linux / i386 systems. -*- asm -*-
2 * By Richard W.M. Jones <rich@annexia.org>
4 * gcc -m32 -nostdlib -static -Wl,-Ttext,0 -o jonesforth jonesforth.S
7 #include <asm-i386/unistd.h>
9 /* NOTES-------------------------------------------------------------------------------------------------------------------
11 Need to say something about $ before constants.
13 And about je/jne/ja/jb/jbe/etc
26 /* Macros to deal with the return stack. */
28 lea -4(%ebp),%ebp // push reg on to return stack
33 mov (%ebp),\reg // pop top of return stack to reg
37 /* ELF entry point. */
42 mov $return_stack,%ebp // Initialise the return stack.
44 mov $cold_start,%esi // Initialise interpreter.
45 NEXT // Run interpreter!
48 cold_start: // High-level code without a codeword.
51 /* DOCOL - the interpreter! */
55 PUSHRSP %esi // push %esi on to the return stack
56 addl $4,%eax // %eax points to codeword, so make
57 movl %eax,%esi // %esi point to first data word
60 /*----------------------------------------------------------------------
61 * Fixed sized buffers for everything.
65 /* FORTH return stack. */
66 #define RETURN_STACK_SIZE 8192
68 .space RETURN_STACK_SIZE
71 /* Space for user-defined words. */
72 #define USER_DEFS_SIZE 16384
82 /*----------------------------------------------------------------------
83 * Built-in words defined the long way.
88 // Store the chain of links.
91 .macro defcode name, namelen, flags=0, label
98 .byte \flags+\namelen // flags + length byte
99 .ascii "\name" // the name
103 .int code_\label // codeword
107 code_\label : // assembler code follows
110 .macro defword name, namelen, flags=0, label
116 .set link,name_\label
117 .byte \flags+\namelen // flags + length byte
118 .ascii "\name" // the name
122 .int DOCOL // codeword - the interpreter
123 // list of word pointers follow
126 .macro defvar name, namelen, flags=0, label, initial=0
127 defcode \name,\namelen,\flags,\label
136 // Some easy ones, written in assembly for speed
137 defcode "DROP",4,,DROP
138 pop %eax // drop top of stack
142 pop %eax // duplicate top of stack
147 defcode "SWAP",4,,SWAP
148 pop %eax // swap top of stack
154 defcode "OVER",4,,OVER
155 mov 4(%esp),%eax // get the second element of stack
156 push %eax // and push it on top
160 incl (%esp) // increment top of stack
164 decl (%esp) // decrement top of stack
181 push %eax // ignore overflow
189 push %eax // push quotient
197 push %edx // push remainder
200 defcode "=",1,,EQU // top two words are equal?
210 defcode "<>",2,,NEQU // top two words are not equal?
220 defcode "0=",2,,ZEQU // top of stack equals 0?
239 defcode "INVERT",6,,INVERT
243 // COLD must not return (ie. must not call EXIT).
244 defword "COLD",4,,COLD
245 // XXX reinitialisation of the interpreter
246 .int INTERPRETER // call the interpreter loop (never returns)
247 .int LIT,1,SYSEXIT // hmmm, but in case it does, exit(1).
249 defcode "EXIT",4,,EXIT
250 POPRSP %esi // pop return stack into %esi
254 // %esi points to the next command, but in this case it points to the next
255 // literal 32 bit integer. Get that literal into %eax and increment %esi.
256 // On x86, it's a convenient single byte instruction! (cf. NEXT macro)
258 push %eax // push the literal number on to stack
261 defcode "LITSTRING",9,,LITSTRING
262 lodsl // get the length of the string
263 push %eax // push it on the stack
264 push %esi // push the address of the start of the string
265 addl %eax,%esi // skip past the string
266 addl $3,%esi // but round up to next 4 byte boundary
270 defcode "BRANCH",6,,BRANCH
271 add (%esi),%esi // add the offset to the instruction pointer
274 defcode "0BRANCH",7,,ZBRANCH
276 test %eax,%eax // top of stack is zero?
277 jz code_BRANCH // if so, jump back to the branch function above
278 lodsl // otherwise we need to skip the offset
282 pop %ebx // address to store at
283 pop %eax // data to store there
284 mov %eax,(%ebx) // store it
288 pop %ebx // address to fetch
289 mov (%ebx),%eax // fetch it
290 push %eax // push value onto stack
293 /* ! and @ (STORE and FETCH) store 32-bit words. It's also useful to be able to read and write bytes.
294 * I don't know whether FORTH has these words, so I invented my own, called !b and @b.
295 * Byte-oriented operations only work on architectures which permit them (i386 is one of those).
297 defcode "!b",2,,STOREBYTE
298 pop %ebx // address to store at
299 pop %eax // data to store there
300 movb %al,(%ebx) // store it
303 defcode "@b",2,,FETCHBYTE
304 pop %ebx // address to fetch
306 movb (%ebx),%al // fetch it
307 push %eax // push value onto stack
310 // The STATE variable is 0 for execute mode, != 0 for compile mode
311 defvar "STATE",5,,STATE
313 // This points to where compiled words go.
314 defvar "HERE",4,,HERE,user_defs_start
316 // This is the last definition in the dictionary.
317 defvar "LATEST",6,,LATEST,name_SYSEXIT // SYSEXIT must be last in built-in dictionary
319 // _X, _Y and _Z are scratch variables used by standard words.
324 defcode "DSP@",4,,DSPFETCH
329 defcode "DSP!",4,,DSPSTORE
334 pop %eax // pop parameter stack into %eax
335 PUSHRSP %eax // push it on to the return stack
338 defcode "R>",2,,FROMR
339 POPRSP %eax // pop return stack on to %eax
340 push %eax // and push on to parameter stack
343 defcode "RSP@",4,,RSPFETCH
347 defcode "RSP!",4,,RSPSTORE
351 defcode "RDROP",5,,RDROP
352 lea 4(%ebp),%ebp // pop return stack and throw away
357 push %eax // push return value on stack
369 1: // out of input; use read(2) to fetch more input from stdin
370 xor %ebx,%ebx // 1st param: stdin
371 mov $buffer,%ecx // 2nd param: buffer
373 mov $buffend-buffer,%edx // 3rd param: max length
374 mov $__NR_read,%eax // syscall: read
376 test %eax,%eax // If %eax <= 0, then exit.
378 addl %eax,%ecx // buffer+%eax = bufftop
382 2: // error or out of input: exit
384 mov $__NR_exit,%eax // syscall: exit
387 defcode "EMIT",4,,EMIT
392 mov $1,%ebx // 1st param: stdout
394 // write needs the address of the byte to write
396 mov $2f,%ecx // 2nd param: address
398 mov $1,%edx // 3rd param: nbytes = 1
400 mov $__NR_write,%eax // write syscall
405 2: .space 1 // scratch used by EMIT
407 defcode "WORD",4,,WORD
409 push %ecx // push length
410 push %edi // push base address
414 /* Search for first non-blank character. Also skip \ comments. */
416 call _KEY // get next key, returned in %eax
417 cmpb $'\\',%al // start of a comment?
418 je 3f // if so, skip the comment
420 jbe 1b // if so, keep looking
422 /* Search for the end of the word, storing chars as we go. */
423 mov $5f,%edi // pointer to return buffer
425 stosb // add character to return buffer
426 call _KEY // get next key, returned in %al
427 cmpb $' ',%al // is blank?
428 ja 2b // if not, keep looping
430 /* Return the word (well, the static buffer) and length. */
432 mov %edi,%ecx // return length of the word
433 mov $5f,%edi // return address of the word
436 /* Code to skip \ comments to end of the current line. */
439 cmpb $'\n',%al // end of line yet?
444 // A static buffer where WORD returns. Subsequent calls
445 // overwrite this buffer. Maximum word length is 32 chars.
448 defcode "EMITSTRING",10,,EMITSTRING
449 mov $1,%ebx // 1st param: stdout
450 pop %ecx // 2nd param: address of string
451 pop %edx // 3rd param: length of string
453 mov $__NR_write,%eax // write syscall
459 pop %eax // Get the number to print into %eax
460 call _DOT // Easier to do this recursively ...
463 mov $10,%ecx // Base 10
467 xor %edx,%edx // %edx:%eax / %ecx -> quotient %eax, remainder %edx
481 // Parse a number from a string on the stack -- almost the opposite of . (DOT)
482 // Note that there is absolutely no error checking. In particular the length of the
483 // string must be >= 1 bytes.
484 defcode "SNUMBER",7,,SNUMBER
494 imull $10,%eax // %eax *= 10
497 subb $'0',%bl // ASCII -> digit
503 defcode "FIND",4,,FIND
504 pop %edi // %edi = address
505 pop %ecx // %ecx = length
511 push %esi // Save %esi so we can use it in string comparison.
513 // Now we start searching backwards through the dictionary for this word.
514 mov var_LATEST,%edx // LATEST points to name header of the latest word in the dictionary
516 test %edx,%edx // NULL pointer? (end of the linked list)
519 // Compare the length expected and the length of the word.
520 // Note that if the F_HIDDEN flag is set on the word, then by a bit of trickery
521 // this won't pick the word (the length will appear to be wrong).
523 movb 4(%edx),%al // %al = flags+length field
524 andb $(F_HIDDEN|0x1f),%al // %al = name length
525 cmpb %cl,%al // Length is the same?
528 // Compare the strings in detail.
529 push %ecx // Save the length
530 push %edi // Save the address (repe cmpsb will move this pointer)
531 lea 5(%edx),%esi // Dictionary string we are checking against.
532 repe cmpsb // Compare the strings.
535 jne 2f // Not the same.
537 // The strings are the same - return the header pointer in %eax
543 mov (%edx),%edx // Move back through the link field to the previous word
544 jmp 1b // .. and loop.
548 xor %eax,%eax // Return zero to indicate not found.
551 defcode ">CFA",4,,TCFA // DEA -> Codeword address
558 add $4,%edi // Skip link pointer.
559 movb (%edi),%al // Load flags+len into %al.
560 inc %edi // Skip flags+len byte.
561 andb $0x1f,%al // Just the length, not the flags.
562 add %eax,%edi // Skip the name.
563 addl $3,%edi // The codeword is 4-byte aligned.
567 defcode "CHAR",4,,CHAR
568 call _WORD // Returns %ecx = length, %edi = pointer to word.
570 movb (%edi),%al // Get the first character of the word.
571 push %eax // Push it onto the stack.
576 // Get the word and create a dictionary entry header for it.
577 call _WORD // Returns %ecx = length, %edi = pointer to word.
578 mov %edi,%ebx // %ebx = address of the word
580 movl var_HERE,%edi // %edi is the address of the header
581 movl var_LATEST,%eax // Get link pointer
582 stosl // and store it in the header.
584 mov %cl,%al // Get the length.
585 orb $F_HIDDEN,%al // Set the HIDDEN flag on this entry.
586 stosb // Store the length/flags byte.
588 mov %ebx,%esi // %esi = word
589 rep movsb // Copy the word
591 addl $3,%edi // Align to next 4 byte boundary.
594 movl $DOCOL,%eax // The codeword for user-created words is always DOCOL (the interpreter)
597 // Header built, so now update LATEST and HERE.
598 // We'll be compiling words and putting them HERE.
603 // And go into compile mode by setting STATE to 1.
608 pop %eax // Code pointer to store.
612 movl var_HERE,%edi // HERE
614 movl %edi,var_HERE // Update HERE (incremented)
617 defcode "HIDDEN",6,,HIDDEN
621 movl var_LATEST,%edi // LATEST word.
622 addl $4,%edi // Point to name/flags byte.
623 xorb $F_HIDDEN,(%edi) // Toggle the HIDDEN bit.
626 defcode "IMMEDIATE",9,F_IMMED,IMMEDIATE
630 movl var_LATEST,%edi // LATEST word.
631 addl $4,%edi // Point to name/flags byte.
632 xorb $F_IMMED,(%edi) // Toggle the IMMED bit.
635 defcode ";",1,F_IMMED,SEMICOLON
636 movl $EXIT,%eax // EXIT is the final codeword in compiled words.
637 call _COMMA // Store it.
638 call _HIDDEN // Toggle the HIDDEN flag (unhides the new word).
639 xor %eax,%eax // Set STATE to 0 (back to execute mode).
643 /* This definiton of ' (TICK) is strictly cheating - it also only works in compiled code. */
645 lodsl // Get the address of the next word and skip it.
646 pushl %eax // Push it on the stack.
649 /* This interpreter is pretty simple, but remember that in FORTH you can always override
650 * it later with a more powerful one!
652 defword "INTERPRETER",11,,INTERPRETER
653 .int INTERPRET,RDROP,INTERPRETER
655 defcode "INTERPRET",9,,INTERPRET
656 call _WORD // Returns %ecx = length, %edi = pointer to word.
658 // Is it in the dictionary?
660 movl %eax,interpret_is_lit // Not a literal number (not yet anyway ...)
661 call _FIND // Returns %eax = pointer to header or 0 if not found.
662 test %eax,%eax // Found?
665 // In the dictionary. Is it an IMMEDIATE codeword?
666 mov %eax,%edi // %edi = dictionary entry
667 movb 4(%edi),%al // Get name+flags.
668 push %ax // Just save it for now.
669 call _TCFA // Convert dictionary entry (in %edi) to codeword pointer.
671 andb $F_IMMED,%al // Is IMMED flag set?
673 jnz 4f // If IMMED, jump straight to executing.
677 1: // Not in the dictionary (not a word) so assume it's a literal number.
678 incl interpret_is_lit
679 call _SNUMBER // Returns the parsed number in %eax
681 mov $LIT,%eax // The word is LIT
683 2: // Are we compiling or executing?
686 jz 4f // Jump if executing.
688 // Compiling - just append the word to the current dictionary definition.
690 mov interpret_is_lit,%ecx // Was it a literal?
693 mov %ebx,%eax // Yes, so LIT is followed by a number.
697 4: // Executing - run it!
698 mov interpret_is_lit,%ecx // Literal?
699 test %ecx,%ecx // Literal?
702 // Not a literal, execute it now. This never returns, but the codeword will
703 // eventually call NEXT which will reenter the loop in INTERPRETER.
706 5: // Executing a literal, which means push it on the stack.
713 .int 0 // Flag used to record if reading a literal
715 // NB: SYSEXIT must be the last entry in the built-in dictionary.
716 defcode SYSEXIT,7,,SYSEXIT
721 /*----------------------------------------------------------------------
722 * Input buffer & initial input.
727 // XXX gives 'Warning: unterminated string; newline inserted' messages which you can ignore
729 \\ Define some character constants
735 \\ CR prints a carriage return
738 \\ SPACE prints a space
739 : SPACE 'SPACE' EMIT ;
741 \\ Primitive . (DOT) function doesn't follow with a blank, so redefine it to behave like FORTH.
742 \\ Notice how we can trivially redefine existing functions.
745 \\ DUP, DROP are defined in assembly for speed, but this is how you might define them
746 \\ in FORTH. Notice use of the scratch variables _X and _Y.
747 \\ : DUP _X ! _X @ _X @ ;
750 \\ [ and ] allow you to break into immediate mode while compiling a word.
751 : [ IMMEDIATE \\ define [ as an immediate word
752 0 STATE ! \\ go into immediate mode
756 1 STATE ! \\ go back to compile mode
759 \\ LITERAL takes whatever is on the stack and compiles LIT <foo>
761 ' LIT , \\ compile LIT
762 , \\ compile the literal itself (from the stack)
765 \\ condition IF true-part THEN rest
767 \\ condition 0BRANCH OFFSET true-part rest
768 \\ where OFFSET is the offset of 'rest'
769 \\ condition IF true-part ELSE false-part THEN
771 \\ condition 0BRANCH OFFSET true-part BRANCH OFFSET2 false-part rest
772 \\ where OFFSET if the offset of false-part and OFFSET2 is the offset of rest
774 \\ IF is an IMMEDIATE word which compiles 0BRANCH followed by a dummy offset, and places
775 \\ the address of the 0BRANCH on the stack. Later when we see THEN, we pop that address
776 \\ off the stack, calculate the offset, and back-fill the offset.
778 ' 0BRANCH , \\ compile 0BRANCH
779 HERE @ \\ save location of the offset on the stack
780 0 , \\ compile a dummy offset
785 HERE @ SWAP - \\ calculate the offset from the address saved on the stack
786 SWAP ! \\ store the offset in the back-filled location
790 ' BRANCH , \\ definite branch to just over the false-part
791 HERE @ \\ save location of the offset on the stack
792 0 , \\ compile a dummy offset
793 SWAP \\ now back-fill the original (IF) offset
794 DUP \\ same as for THEN word above
799 \\ BEGIN loop-part condition UNTIL
801 \\ loop-part condition 0BRANCH OFFSET
802 \\ where OFFSET points back to the loop-part
803 \\ This is like do { loop-part } while (condition) in the C language
805 HERE @ \\ save location on the stack
809 ' 0BRANCH , \\ compile 0BRANCH
810 HERE @ - \\ calculate the offset from the address saved on the stack
811 , \\ compile the offset here
814 \\ BEGIN loop-part AGAIN
816 \\ loop-part BRANCH OFFSET
817 \\ where OFFSET points back to the loop-part
818 \\ In other words, an infinite loop which can only be returned from with EXIT
820 ' BRANCH , \\ compile BRANCH
821 HERE @ - \\ calculate the offset back
822 , \\ compile the offset here
825 \\ BEGIN condition WHILE loop-part REPEAT
827 \\ condition 0BRANCH OFFSET2 loop-part BRANCH OFFSET
828 \\ where OFFSET points back to condition (the beginning) and OFFSET2 points to after the whole piece of code
829 \\ So this is like a while (condition) { loop-part } loop in the C language
831 ' 0BRANCH , \\ compile 0BRANCH
832 HERE @ \\ save location of the offset2 on the stack
833 0 , \\ compile a dummy offset2
837 ' BRANCH , \\ compile BRANCH
838 SWAP \\ get the original offset (from BEGIN)
839 HERE @ - , \\ and compile it after BRANCH
841 HERE @ SWAP - \\ calculate the offset2
842 SWAP ! \\ and back-fill it in the original location
845 \\ With the looping constructs, we can now write SPACES, which writes n spaces to stdout.
848 SPACE \\ print a space
849 1- \\ until we count down to 0
854 \\ .\" is the print string operator in FORTH. Example: .\" Something to print\"
855 \\ The space after the operator is the ordinary space required between words.
856 \\ This is tricky to define because it has to do different things depending on whether
857 \\ we are compiling or in immediate mode. (Thus the word is marked IMMEDIATE so it can
858 \\ detect this and do different things).
859 \\ In immediate mode we just keep reading characters and printing them until we get to
860 \\ the next double quote.
861 \\ In compile mode we have the problem of where we're going to store the string (remember
862 \\ that the input buffer where the string comes from may be overwritten by the time we
863 \\ come round to running the function). We store the string in the compiled function
865 \\ LITSTRING, string length, string rounded up to 4 bytes, EMITSTRING, ...
867 STATE @ \\ compiling?
869 ' LITSTRING , \\ compile LITSTRING
870 HERE @ \\ save the address of the length word on the stack
871 0 , \\ dummy length - we don't know what it is yet
873 KEY \\ get next character of the string
876 HERE @ !b \\ store the character in the compiled image
877 HERE @ 1+ HERE ! \\ increment HERE pointer by 1 byte
879 DROP \\ drop the double quote character at the end
880 DUP \\ get the saved address of the length word
881 HERE @ SWAP - \\ calculate the length
882 4 - \\ subtract 4 (because we measured from the start of the length word)
883 SWAP ! \\ and back-fill the length location
884 HERE @ \\ round up to next multiple of 4 bytes for the remaining code
888 ' EMITSTRING , \\ compile the final EMITSTRING
890 \\ In immediate mode, just read characters and print them until we get
891 \\ to the ending double quote. Much simpler!
894 DUP '\"' = IF EXIT THEN
900 : TEST .\" hello, world..!\" CR ;
903 \\ Finally print the welcome prompt.