1 /* A somewhat minimal FORTH interpreter for Linux / i386 systems. -*- asm -*-
2 * By Richard W.M. Jones <rich@annexia.org>
4 * gcc -m32 -nostdlib -static -Wl,-Ttext,0 -o jonesforth jonesforth.S
7 #include <asm-i386/unistd.h>
9 /* NOTES-------------------------------------------------------------------------------------------------------------------
11 Need to say something about $ before constants.
13 And about je/jne/ja/jb/jbe/etc
26 /* Macros to deal with the return stack. */
28 lea -4(%ebp),%ebp // push reg on to return stack
33 mov (%ebp),\reg // pop top of return stack to reg
37 /* ELF entry point. */
42 mov %esp,var_S0 // Store the initial data stack pointer.
43 mov $return_stack,%ebp // Initialise the return stack.
45 mov $cold_start,%esi // Initialise interpreter.
46 NEXT // Run interpreter!
49 cold_start: // High-level code without a codeword.
52 /* DOCOL - the interpreter! */
56 PUSHRSP %esi // push %esi on to the return stack
57 addl $4,%eax // %eax points to codeword, so make
58 movl %eax,%esi // %esi point to first data word
61 /*----------------------------------------------------------------------
62 * Fixed sized buffers for everything.
66 /* FORTH return stack. */
67 #define RETURN_STACK_SIZE 8192
69 .space RETURN_STACK_SIZE
72 /* Space for user-defined words. */
73 #define USER_DEFS_SIZE 16384
83 /*----------------------------------------------------------------------
84 * Built-in words defined the long way.
89 // Store the chain of links.
92 .macro defcode name, namelen, flags=0, label
99 .byte \flags+\namelen // flags + length byte
100 .ascii "\name" // the name
104 .int code_\label // codeword
108 code_\label : // assembler code follows
111 .macro defword name, namelen, flags=0, label
117 .set link,name_\label
118 .byte \flags+\namelen // flags + length byte
119 .ascii "\name" // the name
123 .int DOCOL // codeword - the interpreter
124 // list of word pointers follow
127 .macro defvar name, namelen, flags=0, label, initial=0
128 defcode \name,\namelen,\flags,\label
137 // Some easy ones, written in assembly for speed
138 defcode "DROP",4,,DROP
139 pop %eax // drop top of stack
143 pop %eax // duplicate top of stack
148 defcode "SWAP",4,,SWAP
149 pop %eax // swap top of stack
155 defcode "OVER",4,,OVER
156 mov 4(%esp),%eax // get the second element of stack
157 push %eax // and push it on top
169 defcode "-ROT",4,,NROT
179 incl (%esp) // increment top of stack
183 decl (%esp) // decrement top of stack
186 defcode "4+",2,,INCR4
187 addl $4,(%esp) // increment top of stack
190 defcode "4-",2,,DECR4
191 subl $4,(%esp) // decrement top of stack
208 push %eax // ignore overflow
216 push %eax // push quotient
224 push %edx // push remainder
227 defcode "=",1,,EQU // top two words are equal?
237 defcode "<>",2,,NEQU // top two words are not equal?
247 defcode "0=",2,,ZEQU // top of stack equals 0?
266 defcode "INVERT",6,,INVERT
270 // COLD must not return (ie. must not call EXIT).
271 defword "COLD",4,,COLD
272 // XXX reinitialisation of the interpreter
273 .int INTERPRETER // call the interpreter loop (never returns)
274 .int LIT,1,SYSEXIT // hmmm, but in case it does, exit(1).
276 defcode "EXIT",4,,EXIT
277 POPRSP %esi // pop return stack into %esi
281 // %esi points to the next command, but in this case it points to the next
282 // literal 32 bit integer. Get that literal into %eax and increment %esi.
283 // On x86, it's a convenient single byte instruction! (cf. NEXT macro)
285 push %eax // push the literal number on to stack
288 defcode "LITSTRING",9,,LITSTRING
289 lodsl // get the length of the string
290 push %eax // push it on the stack
291 push %esi // push the address of the start of the string
292 addl %eax,%esi // skip past the string
293 addl $3,%esi // but round up to next 4 byte boundary
297 defcode "BRANCH",6,,BRANCH
298 add (%esi),%esi // add the offset to the instruction pointer
301 defcode "0BRANCH",7,,ZBRANCH
303 test %eax,%eax // top of stack is zero?
304 jz code_BRANCH // if so, jump back to the branch function above
305 lodsl // otherwise we need to skip the offset
309 pop %ebx // address to store at
310 pop %eax // data to store there
311 mov %eax,(%ebx) // store it
315 pop %ebx // address to fetch
316 mov (%ebx),%eax // fetch it
317 push %eax // push value onto stack
320 defcode "+!",2,ADDSTORE
322 pop %eax // the amount to add
323 addl %eax,(%ebx) // add it
326 defcode "-!",2,SUBSTORE
328 pop %eax // the amount to subtract
329 subl %eax,(%ebx) // add it
332 /* ! and @ (STORE and FETCH) store 32-bit words. It's also useful to be able to read and write bytes.
333 * I don't know whether FORTH has these words, so I invented my own, called !b and @b.
334 * Byte-oriented operations only work on architectures which permit them (i386 is one of those).
335 * UPDATE: writing a byte to the dictionary pointer is called C, in FORTH.
337 defcode "!b",2,,STOREBYTE
338 pop %ebx // address to store at
339 pop %eax // data to store there
340 movb %al,(%ebx) // store it
343 defcode "@b",2,,FETCHBYTE
344 pop %ebx // address to fetch
346 movb (%ebx),%al // fetch it
347 push %eax // push value onto stack
350 // The STATE variable is 0 for execute mode, != 0 for compile mode
351 defvar "STATE",5,,STATE
353 // This points to where compiled words go.
354 defvar "HERE",4,,HERE,user_defs_start
356 // This is the last definition in the dictionary.
357 defvar "LATEST",6,,LATEST,name_SYSEXIT // SYSEXIT must be last in built-in dictionary
359 // _X, _Y and _Z are scratch variables used by standard words.
364 // This stores the top of the data stack.
367 // This stores the top of the return stack.
368 defvar "R0",2,,RZ,return_stack
370 defcode "DSP@",4,,DSPFETCH
375 defcode "DSP!",4,,DSPSTORE
380 pop %eax // pop parameter stack into %eax
381 PUSHRSP %eax // push it on to the return stack
384 defcode "R>",2,,FROMR
385 POPRSP %eax // pop return stack on to %eax
386 push %eax // and push on to parameter stack
389 defcode "RSP@",4,,RSPFETCH
393 defcode "RSP!",4,,RSPSTORE
397 defcode "RDROP",5,,RDROP
398 lea 4(%ebp),%ebp // pop return stack and throw away
403 push %eax // push return value on stack
415 1: // out of input; use read(2) to fetch more input from stdin
416 xor %ebx,%ebx // 1st param: stdin
417 mov $buffer,%ecx // 2nd param: buffer
419 mov $buffend-buffer,%edx // 3rd param: max length
420 mov $__NR_read,%eax // syscall: read
422 test %eax,%eax // If %eax <= 0, then exit.
424 addl %eax,%ecx // buffer+%eax = bufftop
428 2: // error or out of input: exit
430 mov $__NR_exit,%eax // syscall: exit
433 defcode "EMIT",4,,EMIT
438 mov $1,%ebx // 1st param: stdout
440 // write needs the address of the byte to write
442 mov $2f,%ecx // 2nd param: address
444 mov $1,%edx // 3rd param: nbytes = 1
446 mov $__NR_write,%eax // write syscall
451 2: .space 1 // scratch used by EMIT
453 defcode "WORD",4,,WORD
455 push %ecx // push length
456 push %edi // push base address
460 /* Search for first non-blank character. Also skip \ comments. */
462 call _KEY // get next key, returned in %eax
463 cmpb $'\\',%al // start of a comment?
464 je 3f // if so, skip the comment
466 jbe 1b // if so, keep looking
468 /* Search for the end of the word, storing chars as we go. */
469 mov $5f,%edi // pointer to return buffer
471 stosb // add character to return buffer
472 call _KEY // get next key, returned in %al
473 cmpb $' ',%al // is blank?
474 ja 2b // if not, keep looping
476 /* Return the word (well, the static buffer) and length. */
478 mov %edi,%ecx // return length of the word
479 mov $5f,%edi // return address of the word
482 /* Code to skip \ comments to end of the current line. */
485 cmpb $'\n',%al // end of line yet?
490 // A static buffer where WORD returns. Subsequent calls
491 // overwrite this buffer. Maximum word length is 32 chars.
494 defcode "EMITSTRING",10,,EMITSTRING
495 mov $1,%ebx // 1st param: stdout
496 pop %ecx // 2nd param: address of string
497 pop %edx // 3rd param: length of string
499 mov $__NR_write,%eax // write syscall
505 pop %eax // Get the number to print into %eax
506 call _DOT // Easier to do this recursively ...
509 mov $10,%ecx // Base 10
513 xor %edx,%edx // %edx:%eax / %ecx -> quotient %eax, remainder %edx
527 // Parse a number from a string on the stack -- almost the opposite of . (DOT)
528 // Note that there is absolutely no error checking. In particular the length of the
529 // string must be >= 1 bytes.
530 defcode "SNUMBER",7,,SNUMBER
540 imull $10,%eax // %eax *= 10
543 subb $'0',%bl // ASCII -> digit
549 defcode "FIND",4,,FIND
550 pop %edi // %edi = address
551 pop %ecx // %ecx = length
557 push %esi // Save %esi so we can use it in string comparison.
559 // Now we start searching backwards through the dictionary for this word.
560 mov var_LATEST,%edx // LATEST points to name header of the latest word in the dictionary
562 test %edx,%edx // NULL pointer? (end of the linked list)
565 // Compare the length expected and the length of the word.
566 // Note that if the F_HIDDEN flag is set on the word, then by a bit of trickery
567 // this won't pick the word (the length will appear to be wrong).
569 movb 4(%edx),%al // %al = flags+length field
570 andb $(F_HIDDEN|0x1f),%al // %al = name length
571 cmpb %cl,%al // Length is the same?
574 // Compare the strings in detail.
575 push %ecx // Save the length
576 push %edi // Save the address (repe cmpsb will move this pointer)
577 lea 5(%edx),%esi // Dictionary string we are checking against.
578 repe cmpsb // Compare the strings.
581 jne 2f // Not the same.
583 // The strings are the same - return the header pointer in %eax
589 mov (%edx),%edx // Move back through the link field to the previous word
590 jmp 1b // .. and loop.
594 xor %eax,%eax // Return zero to indicate not found.
597 defcode ">CFA",4,,TCFA // DEA -> Codeword address
604 add $4,%edi // Skip link pointer.
605 movb (%edi),%al // Load flags+len into %al.
606 inc %edi // Skip flags+len byte.
607 andb $0x1f,%al // Just the length, not the flags.
608 add %eax,%edi // Skip the name.
609 addl $3,%edi // The codeword is 4-byte aligned.
613 defcode "CHAR",4,,CHAR
614 call _WORD // Returns %ecx = length, %edi = pointer to word.
616 movb (%edi),%al // Get the first character of the word.
617 push %eax // Push it onto the stack.
622 // Get the word and create a dictionary entry header for it.
623 call _WORD // Returns %ecx = length, %edi = pointer to word.
624 mov %edi,%ebx // %ebx = address of the word
626 movl var_HERE,%edi // %edi is the address of the header
627 movl var_LATEST,%eax // Get link pointer
628 stosl // and store it in the header.
630 mov %cl,%al // Get the length.
631 orb $F_HIDDEN,%al // Set the HIDDEN flag on this entry.
632 stosb // Store the length/flags byte.
634 mov %ebx,%esi // %esi = word
635 rep movsb // Copy the word
637 addl $3,%edi // Align to next 4 byte boundary.
640 movl $DOCOL,%eax // The codeword for user-created words is always DOCOL (the interpreter)
643 // Header built, so now update LATEST and HERE.
644 // We'll be compiling words and putting them HERE.
649 // And go into compile mode by setting STATE to 1.
654 pop %eax // Code pointer to store.
658 movl var_HERE,%edi // HERE
660 movl %edi,var_HERE // Update HERE (incremented)
663 defcode "HIDDEN",6,,HIDDEN
667 movl var_LATEST,%edi // LATEST word.
668 addl $4,%edi // Point to name/flags byte.
669 xorb $F_HIDDEN,(%edi) // Toggle the HIDDEN bit.
672 defcode "IMMEDIATE",9,F_IMMED,IMMEDIATE
676 movl var_LATEST,%edi // LATEST word.
677 addl $4,%edi // Point to name/flags byte.
678 xorb $F_IMMED,(%edi) // Toggle the IMMED bit.
681 defcode ";",1,F_IMMED,SEMICOLON
682 movl $EXIT,%eax // EXIT is the final codeword in compiled words.
683 call _COMMA // Store it.
684 call _HIDDEN // Toggle the HIDDEN flag (unhides the new word).
685 xor %eax,%eax // Set STATE to 0 (back to execute mode).
689 /* This definiton of ' (TICK) is strictly cheating - it also only works in compiled code. */
691 lodsl // Get the address of the next word and skip it.
692 pushl %eax // Push it on the stack.
695 /* This interpreter is pretty simple, but remember that in FORTH you can always override
696 * it later with a more powerful one!
698 defword "INTERPRETER",11,,INTERPRETER
699 .int INTERPRET,RDROP,INTERPRETER
701 defcode "INTERPRET",9,,INTERPRET
702 call _WORD // Returns %ecx = length, %edi = pointer to word.
704 // Is it in the dictionary?
706 movl %eax,interpret_is_lit // Not a literal number (not yet anyway ...)
707 call _FIND // Returns %eax = pointer to header or 0 if not found.
708 test %eax,%eax // Found?
711 // In the dictionary. Is it an IMMEDIATE codeword?
712 mov %eax,%edi // %edi = dictionary entry
713 movb 4(%edi),%al // Get name+flags.
714 push %ax // Just save it for now.
715 call _TCFA // Convert dictionary entry (in %edi) to codeword pointer.
717 andb $F_IMMED,%al // Is IMMED flag set?
719 jnz 4f // If IMMED, jump straight to executing.
723 1: // Not in the dictionary (not a word) so assume it's a literal number.
724 incl interpret_is_lit
725 call _SNUMBER // Returns the parsed number in %eax
727 mov $LIT,%eax // The word is LIT
729 2: // Are we compiling or executing?
732 jz 4f // Jump if executing.
734 // Compiling - just append the word to the current dictionary definition.
736 mov interpret_is_lit,%ecx // Was it a literal?
739 mov %ebx,%eax // Yes, so LIT is followed by a number.
743 4: // Executing - run it!
744 mov interpret_is_lit,%ecx // Literal?
745 test %ecx,%ecx // Literal?
748 // Not a literal, execute it now. This never returns, but the codeword will
749 // eventually call NEXT which will reenter the loop in INTERPRETER.
752 5: // Executing a literal, which means push it on the stack.
759 .int 0 // Flag used to record if reading a literal
761 // NB: SYSEXIT must be the last entry in the built-in dictionary.
762 defcode SYSEXIT,7,,SYSEXIT
767 /*----------------------------------------------------------------------
768 * Input buffer & initial input.
773 // XXX gives 'Warning: unterminated string; newline inserted' messages which you can ignore
775 \\ Define some character constants
781 \\ CR prints a carriage return
784 \\ SPACE prints a space
785 : SPACE 'SPACE' EMIT ;
787 \\ Primitive . (DOT) function doesn't follow with a blank, so redefine it to behave like FORTH.
788 \\ Notice how we can trivially redefine existing functions.
791 \\ DUP, DROP are defined in assembly for speed, but this is how you might define them
792 \\ in FORTH. Notice use of the scratch variables _X and _Y.
793 \\ : DUP _X ! _X @ _X @ ;
796 \\ The 2... versions of the standard operators work on pairs of stack entries. They're not used
797 \\ very commonly so not really worth writing in assembler. Here is how they are defined in FORTH.
801 \\ More standard FORTH words.
805 \\ [ and ] allow you to break into immediate mode while compiling a word.
806 : [ IMMEDIATE \\ define [ as an immediate word
807 0 STATE ! \\ go into immediate mode
811 1 STATE ! \\ go back to compile mode
814 \\ LITERAL takes whatever is on the stack and compiles LIT <foo>
816 ' LIT , \\ compile LIT
817 , \\ compile the literal itself (from the stack)
820 \\ condition IF true-part THEN rest
822 \\ condition 0BRANCH OFFSET true-part rest
823 \\ where OFFSET is the offset of 'rest'
824 \\ condition IF true-part ELSE false-part THEN
826 \\ condition 0BRANCH OFFSET true-part BRANCH OFFSET2 false-part rest
827 \\ where OFFSET if the offset of false-part and OFFSET2 is the offset of rest
829 \\ IF is an IMMEDIATE word which compiles 0BRANCH followed by a dummy offset, and places
830 \\ the address of the 0BRANCH on the stack. Later when we see THEN, we pop that address
831 \\ off the stack, calculate the offset, and back-fill the offset.
833 ' 0BRANCH , \\ compile 0BRANCH
834 HERE @ \\ save location of the offset on the stack
835 0 , \\ compile a dummy offset
840 HERE @ SWAP - \\ calculate the offset from the address saved on the stack
841 SWAP ! \\ store the offset in the back-filled location
845 ' BRANCH , \\ definite branch to just over the false-part
846 HERE @ \\ save location of the offset on the stack
847 0 , \\ compile a dummy offset
848 SWAP \\ now back-fill the original (IF) offset
849 DUP \\ same as for THEN word above
854 \\ BEGIN loop-part condition UNTIL
856 \\ loop-part condition 0BRANCH OFFSET
857 \\ where OFFSET points back to the loop-part
858 \\ This is like do { loop-part } while (condition) in the C language
860 HERE @ \\ save location on the stack
864 ' 0BRANCH , \\ compile 0BRANCH
865 HERE @ - \\ calculate the offset from the address saved on the stack
866 , \\ compile the offset here
869 \\ BEGIN loop-part AGAIN
871 \\ loop-part BRANCH OFFSET
872 \\ where OFFSET points back to the loop-part
873 \\ In other words, an infinite loop which can only be returned from with EXIT
875 ' BRANCH , \\ compile BRANCH
876 HERE @ - \\ calculate the offset back
877 , \\ compile the offset here
880 \\ BEGIN condition WHILE loop-part REPEAT
882 \\ condition 0BRANCH OFFSET2 loop-part BRANCH OFFSET
883 \\ where OFFSET points back to condition (the beginning) and OFFSET2 points to after the whole piece of code
884 \\ So this is like a while (condition) { loop-part } loop in the C language
886 ' 0BRANCH , \\ compile 0BRANCH
887 HERE @ \\ save location of the offset2 on the stack
888 0 , \\ compile a dummy offset2
892 ' BRANCH , \\ compile BRANCH
893 SWAP \\ get the original offset (from BEGIN)
894 HERE @ - , \\ and compile it after BRANCH
896 HERE @ SWAP - \\ calculate the offset2
897 SWAP ! \\ and back-fill it in the original location
900 \\ With the looping constructs, we can now write SPACES, which writes n spaces to stdout.
903 SPACE \\ print a space
904 1- \\ until we count down to 0
909 \\ .S prints the contents of the stack. Very useful for debugging.
911 DSP@ \\ get current stack pointer
913 DUP @ . \\ print the stack element
915 DUP S0 @ 4- = \\ stop when we get to the top
920 \\ DEPTH returns the depth of the stack.
921 : DEPTH S0 @ DSP@ - ;
923 \\ .\" is the print string operator in FORTH. Example: .\" Something to print\"
924 \\ The space after the operator is the ordinary space required between words.
925 \\ This is tricky to define because it has to do different things depending on whether
926 \\ we are compiling or in immediate mode. (Thus the word is marked IMMEDIATE so it can
927 \\ detect this and do different things).
928 \\ In immediate mode we just keep reading characters and printing them until we get to
929 \\ the next double quote.
930 \\ In compile mode we have the problem of where we're going to store the string (remember
931 \\ that the input buffer where the string comes from may be overwritten by the time we
932 \\ come round to running the function). We store the string in the compiled function
934 \\ LITSTRING, string length, string rounded up to 4 bytes, EMITSTRING, ...
936 STATE @ \\ compiling?
938 ' LITSTRING , \\ compile LITSTRING
939 HERE @ \\ save the address of the length word on the stack
940 0 , \\ dummy length - we don't know what it is yet
942 KEY \\ get next character of the string
945 HERE @ !b \\ store the character in the compiled image
946 HERE 1 +! \\ increment HERE pointer by 1 byte
948 DROP \\ drop the double quote character at the end
949 DUP \\ get the saved address of the length word
950 HERE @ SWAP - \\ calculate the length
951 4- \\ subtract 4 (because we measured from the start of the length word)
952 SWAP ! \\ and back-fill the length location
953 HERE @ \\ round up to next multiple of 4 bytes for the remaining code
957 ' EMITSTRING , \\ compile the final EMITSTRING
959 \\ In immediate mode, just read characters and print them until we get
960 \\ to the ending double quote. Much simpler than the above code!
963 DUP '\"' = IF EXIT THEN
969 \\ While compiling, [COMPILE] WORD compiles WORD if it would otherwise be IMMEDIATE.
970 : [COMPILE] IMMEDIATE
971 WORD \\ get the next word
972 FIND \\ find it in the dictionary
973 >CFA \\ get its codeword
974 , \\ and compile that
977 \\ RECURSE makes a recursive call to the current word that is being compiled.
978 \\ Normally while a word is being compiled, it is marked HIDDEN so that references to the
979 \\ same word within are calls to the previous definition of the word.
981 LATEST @ >CFA \\ LATEST points to the word being compiled at the moment
985 \\ ALLOT is used to allocate (static) memory when compiling. It increases HERE by
986 \\ the amount given on the stack.
990 \\ Finally print the welcome prompt.