-- HamsterSpeak Compiler -- (C) Copyright 2001-2024 James Paige, Ralph Versteegen, and the OHRRPGCE Developers -- Dual licensed under the GNU GPL v2+ and MIT Licenses. Read LICENSE.txt for terms and disclaimer of liability. -- This is a compiler for HamsterSpeak scripts used for plotscripting in -- the O.H.R.RPG.C.E. For more info, visit http://HamsterRepublic.com -- This code is written in Euphoria 4.0. You can get the Free Open-Source -- Euphoria compiler from http://OpenEuphoria.com . I also highly recommend -- David Cuny's EE editor --------------------------------------------------------------------------- --Changelog --3W 2024-02-06 Add srcpos debug info, local var names, and srcfiles.txt --3Vb 2024-02-06 Disallow local/subscript or script/global with the same name -- 2024-01-11 Fix lexing of identifiers ending in : & | or - --3Va 2022-04-08 Add exit --3V 2022-04-07 Add showvalueof, and tracevalueof alias --3Ug 2021-12-12 Change license to Dual GPL+MIT (with consent of all contributors) --3Uf 2020-02-29 Accept non-breaking space as whitespace --3Ue 2019-09-09 Windows: fix double-inclusion detection --3Ud 2019-07-12 Windows: double-include detection now case-insensitive --3Uc 2019-03-25 Enforce at most 32 args allowed --3Ub 2018-11-17 Fix some error messages --3Ua 2018-08-18 Script can also have attributes --3U 2018-08-12 @obsolete attribute --3Tl 2018-05-14 Delete (possibly-bad) .hs output file on error --3Tk 2018-03-18 Optimise binary_compile; puts nodes in opposite order --3Tj 2018-03-18 Fix some number and - lexing bugs --3Ti 2018-03-17 More compiler global_scope optimisations --3Th 2018-03-15 Optimize compiler, rewriting global_scope lookup --3Tg 2017-12-31 Fix crash due to missing KIND_LONGNAMES[KIND_KEYWORD] --3Tf 2017-11-08 Specialise assert(x==y) to use _asserteq --3Te 2017-10-29 Look for plotscr.hsd/scancode.hsi first in global dirs --3Td 2017-10-07 Add parent ID to scripts.bin, for --reuse-ids --3Tc 2017-10-02 Make -j wait for warnings too --3Tb 2017-09-28 Added -j option; warn on unknown options --3Ta 2017-07-28 Increased MAXGLOBAL to 50000 -- 2017-06-13 0x, 0o and 0b number prefixes --3T 2017-06-11 Allow subscript vars/args to shadow outer locals --3Se 2017-05-17 Disallow extra commas --3Sd 2017-03-17 --incdir option; check ../share/games/ohrrpgce/ instead --3Sc 2016-04-11 Check /usr/share/games/ohrrpgce/ for include files --3Sb 2014-11-11 Better 'include' line lexing & garbage detection --3Sa 2014-08-23 Fix printing of a lexer error message --3S 2014-07-22 Automatically include plotscr.hsd and ignore -- double inclusion; added --include and -b --3Rg 2013-02-23 Fix a subscript related crash --3Rf 2013-02-17 Fix several 31-bit int overflows --3Re 2014-02-17 Trivial change to math builtin warnings ----- 2013-11-09 Prevent misuse of betweenable-operators in -- declarations, and allow constants for global ids --3Rc 2013-10-11 subscript implementation changes (add nonlocal kind) --3Rb 2013-10-10 Remove variable() later, for syntax checking --3Ra 2013-10-09 Fixed \r lineendings (AGAIN) and slow \r\n handling --3R 2013-10-05 Add subscript --3Qc 2013-10-04 Disallow flow statements used as expressions --3Qb 2013-07-24 Require begin after then, do, etc. (else excluded) --3Qa 2013-04-11 Warn when plotscr.hsd not included ----- 2013-02-28 Allow non-constant expressions in case blocks again --3Pe 2013-01-03 Faster compile times --3Pd 2012-10-24 Fix \x escape code --3Pc 2012-10-09 Change srcpos encoding to remove source size limits --3Pb 2012-10-08 \x escape code --3Pa 2012-08-24 --reuse-ids option --3P 2012-08-18 Change way include is lexed: quotes no longer needed -- [], {}, . now totally disallowed -- Add < and > operators -- Add plotscrversion block and new HS header lump -- New switch syntax: do after case optional, case(else) -- Added elseif --3Of 2012-08-18 Lexer bugfix --3Oe 2012-04-11 Add \n and \t escape codes --3Od 2012-03-03 Change directory search order slightly --3Oc 2012-02-07 Increased MAXGLOBAL to 16383 --3Ob 2012-02-07 Make the max script global number set by a constant --3Oa 2012-01-20 Fix lexer allowing newlines in strings --3O 2012-01-19 Unicode support --3Nd 2011-10-16 Fix commands.bin generation bug --3Nc 2011-08-28 Additional expression optimisations --3Nb 2011-08-28 Fix overflow_int32 and bitwise operator overflow --3Na 2011-07-25 Add -x option for Hamster Whisper's benefit --3N 2011-07-12 Add assert --3Ma 2010-10-21 Correctly strip \r to fix newlines ... again. --3M 2010-05-02 abs, sign, sqrt; fix error reporting --3L 2010-04-29 tracevalue support, involving a few internal changes --3Ke 2010-04-18 Add -t commandline option --3Kd 2010-04-14 Quite a few bugfixes in optimizer --3Kc 2010-04-14 Start checking well-formedness of numbers with lexer --3Kb 2010-04-12 Whole lot of bugfixes in script declarations and -- checking for unreserved identifier names --3Ka 2010-04-12 Throw error if source file contains binary --3K 2010-04-12 Error column reporting (introducing srcpos); lump -- original source files into source.lumped --3Ja 2010-04-07 Wrote a (nearly) proper lexer --3J 2009-12-05 Multiple function names may now alias the same id --3I 2009-07-27 Export a backup copy of full source to source.txt lump -- disable with -n option --3H 2009-06-18 Export function names to commands.bin lump --3Gk 2009-05-25 Fix short references crash --3Gj 2009-01-27 Show future-compatability warning when using square -- brackets [] and curly brackets {} in names --3Gi 2008-08-03 Disallow orphaned then() and else(), a common mistake --3Gh 2008-07-14 Disable warnings for unused variables by default, and -- Add -u option to enable warnings for unused variables. --3Gg 2008-03-01 Raise global variable limits --3Gf 2008-02-11 Give user another chance on missing Y key :P --3Ge 2007-07-01 String conversion rewrite to allow more whitespace --3Gd 2007-07-01 Allow constants for new syntax default args --3Gc 2007-05-29 Handle empty paratheses correctly, and use -- lowercase ".hs" extension --3Gb 2007-05-09 Throw error on missing include file --3Ga 2007-05-09 String length not limited to 40 --3G 2007-02-09 Offset to string literal tables given as a 32-bit -- int, script format version++ to 2. --3F 2006-12-11 Support for variable number of arguments to functions -- by giving -1 as the number of args in the define block --3Ec 2006-11-30 while(true) no longer produces a warning --3Eb 2006-10-23 Two small fixes to some rare bugs causing crashes --3Ea 2006-09-03 Actually export triggers in scripts.bin --3E 2006-08-30 "define trigger" support --3D 2006-08-16 String literal lookup tables added to end of -- HSZ lumps replacing setstring + appendstring --3C 2006-08-15 New syntax for giving default script arguments, -- and for leaving out definescript --3B 2006-08-13 HSX lumps renamed to HSZ --3A 2006-08-10 HamsterSpeak becomes 32-bit. Also, script format -- version added to HSX header. Current is 1 --2Na 2006-07-07 Minor update to add new logical operators --2N 2006-07-07 Added not() logic function --2M 2006-07-06 Set exit code on warnings --2L 2006-05-13 Added switch statement (+ case keyword) --2K 2006-05-01 Added @scriptname and @globalvariable syntax to -- return script or global ID number at compile-time -- not run-time (for use with "run script by ID" and -- "read global" and "write global") --2J 2006-04-10 Added break, continue, exitscript, exitreturning -- flow statements. Also fixed some return bugs --2I 2006-04-04 Extended HSX header to include number of arguments -- to a script, to really fix arguments-overflow-into -- -locals bug --2H 2006-03-29 Display better help and wait for keypress when run -- by double-clicking the icon. Added -k command line -- option to skip waiting for keypress --2G 2005-10-03 Additional operators $+ and $= -- Mention GPL in help text --2F 2005-07-24 Strings implemented: -- $id="..." -> setstring -- $id+"..." -> appendstring --2E 2005-02-15 Changed license to GPL -- Added += and -= operators --2D 2002-08-03 Only a small change, strip out \r from lines of -- the script as we read them to avoid errors related -- to busted newlines --2C 2002-03-05 Fixed some bugs that could cause crashes when -- non-printable characters exist in the input file. -- (naturally the script will not compile right, but -- at least it will not crash) --2B 2001-06-06 Added := as a commaless separater so it can be -- defined as an operator --2A 2001-05-04 Fixed -w command line option when used with -z --2 First Release --------------------------------------------------------------------------- without warning --to avoid annoying warnings without type_check --for a small speed boost --with profile_time --time profiling include hsspiffy.e --various routines, sequence manipulation - James Paige include machine.e --needed for int_to_bytes include std/math.e --needed for mod include std/graphics.e --needed for color output include std/filesys.e include std/map.e include euphoria/unicode.e include euphoria/ucstypes.e include euphoria/io_with_unicode.e --------------------------------------------------------------------------- --constants-- constant false=0 constant true=1 -- increment COMPILER_VERSION only for major language-altering changes constant COMPILER_VERSION="3" -- For small changes increment the second letter of COMPILER_SUB_VERSION -- For large changes increment the first letter and reset the second letter --*Please make sure this is 2 chararacters long! Append a space if required.* constant COMPILER_SUB_VERSION="W " constant COPYRIGHT_DATE="2024" --Known single-character commandline options constant VALID_OPTIONS="abcdfjknstwuyz" --This is the oldest version of plotscr.hsd which is supported (feel free to --increment this whenever using an older plotscr.hsd means you're missing out --on new but nonessential features; there's no harm) --*Should be 3 characters long, same captialisation as COMPILER_SUB_VERSION* constant MIN_PLOTSCR_VERSION="3Tf" --these constants are color-flags. May add more colors, up to #F8FF --These values are also passed directly to Hamster Whisper. --(these codes are at the end of the first private use area) constant COL_FIRST=#F8E0 constant COLYEL=#F8E0 constant COLRED=#F8E1 constant COLPNK=#F8E2 constant COLWHI=#F8E3 constant COLBWHI=#F8E4 constant COL_LAST=#F8FF --Map from the above to the color constants in std/graphics.e --(which incidentally vary from the constants in graphics.e) constant HS_TO_EU_COLORS={YELLOW, RED, BRIGHT_RED, WHITE, BRIGHT_WHITE} --These are character classes used in lexer_table constant LEX_CHAR=0 --Most character which are valid in identifiers. Everything above char 255 constant LEX_SPACE=1 --space and tab constant LEX_BEGIN=2 --( constant LEX_END=3 --) constant LEX_OPERATOR=4 --The first character of an operator constant LEX_COMMENT=5 --# constant LEX_STRING=6 --" constant LEX_COMMA=7 --, constant LEX_WARN=8 --[]{}. constant LEX_BINARY=9 --Nonprintable characters constant LEX_DIGIT=10 --0-9 constant LEX_HEXDIGIT=11 --a-fA-F constant LEX_OPERATOR2=12 --Only used in lexer_table_key2!! constant LEX_NEWLINE=13 --\n constant FILE_NAME=1 --With whatever path was needed to open it constant FILE_TEXT=2 --Entire contents of the file constant FILE_LINE_EXTENTS=3 --List of {start,end} pairs; inclusive indices, not including newlines constant FILE_OFFSET=4 --Used by srcpos's constant FILE_LUMP=5 --POS_* index sequence returned by decode_srcpos constant POS_FILENAME=1 --FILE_NAME of the file constant POS_LINE=2 --1-based line number constant POS_COLUMN=3 --0-based column number constant POS_LENGTH=4 --token length in characters (might be 0) constant POS_LINETEXT=5 --The line containing the token constant POS_TOKTEXT=6 --The text of the token constant CMD_TEXT=1 constant CMD_POS=2 --For global_scope constant SYM_KIND=1 --KIND_* constant SYM_ID=2 --ID or 0 constant SYM_POS=3 --srcpos or 0 constant PAIR_NUM=1 constant PAIR_NAME=2 constant PAIR_START=1 constant PAIR_END=2 constant OPER_TRUENAME=3 constant OPER_POS=4 --FUNC_* are for elements of math_list, function_list or script_list --(When changing these, also update create_function() and math_list) constant FUNC_ID=1 --(aka PAIR_NUM) constant FUNC_NAME=2 --(aka PAIR_NAME) constant FUNC_ARGS=3 --Either list of default values (each an integer or NO_DEFAULT), or VAR_ARGS constant FUNC_ATTRS=4 --List of attributes as a list of {string,TokenList} pairs where the --TokenList are the arguments. Like a Node except children are Tokens instead of Nodes. constant FUNC_FLAGS=5 --Bitvector of FLAG_* bits. Also indicates presence of attributes. See attribute_list constant FUNC_POS=6 --srcpos of start of function/script. Not used by math_list --ATTR_* are for attribute_list constant ATTR_NAME=1 constant ATTR_FLAGS=2 --An integer which gets OR'd into FUNC_FLAGS constant FLAG_OBSOLETE=#01 --Has @obsolete attribute constant GLB_POS=3 --srcpos of globalvariable definition constant CONST_VALUE=1 constant CONST_POS=2 constant VAR_FRAME=3 --VarToken index, integer giving frame number of a variable (0 is current script, 1 is enclosing...) constant VAR_ID=4 --VarToken index, integer giving ID in frame constant REUSE_PARENT_AND_NAME=2 constant NO_DEFAULT={} --when appears in script_list[FUNC_ARGS], indicates arg with no default value constant VAR_ARGS=0 --when appears AS function_list[FUNC_ARGS], indicates unlimited num of args --KIND_* constants are used BOTH to describe the types of raw tokens, and tokens --which been parsed into Nodes in an AST, although many of them can only appear --as one or the other. constant KIND_NUMBER=1 --int32. ID is the value constant KIND_FLOW=2 --Flow control (flow_list), including begin and end. constant KIND_GLOBAL=3 --Global variable (global_list) constant KIND_LOCAL=4 --Local variable constant KIND_MATH=5 --Math builtin function (math_list) constant KIND_FUNCTION=6 --Builtin command (function_list) constant KIND_SCRIPT=7 --Call to script (script_list) constant KIND_NONLOCAL=8 --Nonlocal variable --The following never appear in compiled scripts constant KIND_REFERENCE=9 --Reference to a global or script. Converted to KIND_NUMBER in compiled script constant KIND_OPERATOR=10 --Infix binary operator created by defineoperator. (operator_list) --If a name like 'mod' is both operator and something else, it gets added --to global_scope as the other thing, not KIND_OPERATOR. constant KIND_PARENS=11 --Node created by extra pair of brackets around an expression constant KIND_MACRO=12 --Expanded by hspeak, e.g. assert constant KIND_KEYWORD=13 --Other builtin keywords that can appear in scripts but aren't flow constructs. --Currently just 'variable' and 'subscript'. constant KIND_CONSTANT=14 --A constant which hasn't been translated to a KIND_NUMBER yet constant KIND_SEPARATOR=15 --Auto-delimiting tokens like +, if they haven't been turned into a --KIND_OPERATOR yet. Can only be used in defineoperator. (separator_list) constant KIND_ATTRIBUTE=16 --An @attribute constant KIND_LAST_VALID_IN_SCRIPT=16 constant KIND_TOPLEVEL=17 --Toplevel keyword such as defineconstant, or script trigger, eg. plotscript constant KIND_UNIMPLEMENTED=18 constant KIND_LAST=18 constant KIND_LONGNAMES={"number" ,"flow control statement" ,"global variable" ,"local variable" ,"built-in function" ,"hard-coded function" ,"script" ,"non-local variable" ,"reference" ,"operator" ,"order-of-operations-enforcing parentheses" ,"hard-coded function/macro" ,"keyword" ,"constant" ,"untranslated operator" ,"attribute declaration" ,"top-level declaration" ,"unimplemented keyword" ,"function attribute" } constant nil_sym={0,0,0} constant TREE_TRUNK=1 constant TREE_BRANCHES=2 constant TIMING_DEPTH=1 constant TIMING_DESCRIPTION=2 constant TIMING_TIME=3 constant TIMING_ACTIVE=4 --Version number for the .hs/.hsp format (not including HSZ files!) constant HSP_FORMAT_VERSION=1 constant CODE_START_BYTE_OFFSET=32 constant HSZ_FORMAT_VERSION=3 constant MAXGLOBAL=50000 --largest ID constant MAX_LOCALS=100 --max number of locals per script (not including nonlocals) constant MAX_ARGS=32 --max number of arguments to a script, function or builting constant MAX_NEST_DEPTH=4 --------------------------------------------------------------------------- --globals-- --initializations-- string compiler_dir compiler_dir="" string include_dir include_dir="" string source_file source_file="" string dest_file dest_file="" integer dest_file_fh dest_file_fh=0 --For use by error handlers only! integer cleanup_dest cleanup_dest=false --Whether to delete dest_file on error sequence optlist optlist={} sequence file_list file_list={} --type {FILE_NAME,FILE_TEXT,FILE_LINE_EXTENTS} sequence additional_includes additional_includes={} srcpos max_srcpos max_srcpos=0 --Maximum possible value of a valid srcpos (for typechecking) srcpos script_position script_position=0 --All srcpos's emitted to a .hsz are relative to this. (Length 0) integer debug_info debug_info=false integer total_lines total_lines=0 sequence cmd cmd={} --TokenList (but type checking is very slow) sequence reuse_ids reuse_ids={} --type {PAIR_NUM, REUSE_PARENT_AND_NAME={parent_id, PAIR_NAME}} map constant_list constant_list=map:new() sequence trigger_list trigger_list={} --type {PAIR_NUM, PAIR_NAME} sequence operator_list operator_list={} --type {PAIR_NUM, PAIR_NAME, OPER_TRUENAME, OPER_POS} sequence function_list function_list={} --type {PAIR_NUM, PAIR_NAME, FUNC_ARGS, FUNC_POS, FUNC_ATTRS, FUNC_FLAGS} sequence global_list global_list={{},{},{}} --NOTE: three lists {PAIR_NUM, PAIR_NAME, GLB_POS}, not a list of globals sequence string_list string_list={} --list of strings sequence script_list script_list={} --type {PAIR_NUM, PAIR_NAME, FUNC_ARGS, FUNC_POS, FUNC_ATTRS, FUNC_FLAGS} --See also math_list, below --(PAIR_NAME is the full name) map global_scope global_scope=map:new() --Maps all names in global scope EXCEPT operators to {SYM_KIND, SYM_ID, SYM_POS} tuples --SCRIPT_* are for all_scripts. --The number of data fields used is variable, between _SCRIPT_MIN_LAST and _SCRIPT_MAX_LAST constant SCRIPT_TRIGGER_TOK=1 --Trigger token (type {CMD_TEXT, CMD_POS}), e.g. "script" constant SCRIPT_NAME=2 --String, not fully-scoped constant SCRIPT_POS=3 --srcpos of script name token constant SCRIPT_FULL_NAME=4 --Full-scoped script name constant SCRIPT_ARG_TOKS=5 --List of argument name tokens (type {CMD_TEXT, CMD_POS}) --Initially the tokens may include default values (like "a=4"), which are later removed constant SCRIPT_BODY_TOKS=6 --List of tokens (type {CMD_TEXT, CMD_POS}) in the body, of the form do,begin,...,end constant SCRIPT_PARENT_IDX=7 --Index in all_scripts of parent, or 0 for none constant SCRIPT_NEST_DEPTH=8 --Nesting depth. A script is 0, subscript 1, sub-subscript 2... constant SCRIPT_ATTRIBUTES=9 --Temporary store, do not use. These attributes get copied to script_list's FUNC_ATTRS constant SCRIPT_END_POS=10 --A srcpos of the last token of the script ('end' or ')') constant _SCRIPT_MIN_LAST=10 --The following are added later constant SCRIPT_ID=11 --Script ID. Value not appended until all IDs are assigned in check_script_declarations constant SCRIPT_TRIGGER_ID=12 --trigger ID. SCRIPT_TRIGGER_ID and later not appended until compile_a_script constant SCRIPT_NUM_NONLOCALS=13 constant SCRIPT_VARIABLES=14 --VarList of locals and nonlocals constant SCRIPT_AST=15 constant SCRIPT_BINARY=16 constant _SCRIPT_MAX_LAST=16 sequence all_scripts all_scripts={} --list of 'Script's == {SCRIPT_*} (variable length) atom start_time start_time=time() atom run_time run_time=0 sequence timing_data timing_data={} integer timing_depth timing_depth=-1 integer get_cmd_pointer get_cmd_pointer=0 integer autonumber_id autonumber_id=32767 sequence flow_list flow_list={ {0,"do"} ,{1,"begin"} ,{2,"end"} ,{3,"return"} ,{4,"if"} ,{5,"then"} ,{6,"else"} ,{7,"for"} ,{10,"while"} ,{11,"break"} ,{12,"continue"} ,{13,"exitscript"} ,{14,"exitreturning"} ,{15,"switch"} ,{16,"case"} --never appears in compiled script ,{17,"subscript"} --never appears in compiled script } --elseif is not a flow type sequence flow_requiring_brackets --else is excluded, because of case(else) flow_requiring_brackets={ "do", "if", "then", "elseif", "for", "while", "exitreturning", "switch", "case" } sequence math_list math_list={ --{FUNC_ID, FUNC_NAME, FUNC_ARGS, FUNC_ATTRS, FUNC_FLAGS} (FUNC_POS omitted) {0,"random", {0,1}, {}, 0} ,{1,"exponent", {0,2}, {}, 0} ,{2,"modulus", {0,1}, {}, 0} ,{3,"divide", {0,1}, {}, 0} ,{4,"multiply", {0,0}, {}, 0} ,{5,"subtract", {0,0}, {}, 0} ,{6,"add", {0,0}, {}, 0} ,{7,"xor", {0,0}, {}, 0} ,{8,"or", {0,0}, {}, 0} ,{9,"and", {0,0}, {}, 0} ,{10,"equal", {0,0}, {}, 0} ,{11,"notequal", {0,0}, {}, 0} ,{12,"lessthan", {0,0}, {}, 0} ,{13,"greaterthan", {0,0}, {}, 0} ,{14,"lessthanorequalto", {0,0}, {}, 0} ,{15,"greaterthanorequalto", {0,0}, {}, 0} ,{16,"setvariable", {0,0}, {}, 0} ,{17,"increment", {0,1}, {}, 0} ,{18,"decrement", {0,1}, {}, 0} ,{19,"not", {0}, {}, 0} ,{20,"logand", {0,0}, {}, 0} ,{21,"logor", {0,0}, {}, 0} ,{22,"logxor", {0,0}, {}, 0} ,{23,"abs", {0}, {}, 0} ,{24,"sign", {0}, {}, 0} ,{25,"sqrt", {0}, {}, 0} } --update is_simple_math_op on adding new operators sequence attribute_list attribute_list={ --type {ATTR_NAME, ATTR_FLAGS} {"@obsolete",FLAG_OBSOLETE} } sequence separator_list separator_list={ "+=","-=","$+","$=","$","+","--","/","*","^^","^" ,"==","<>",">>","<<","<=",">=",">","<",":=","=","&&","||" } sequence lexer_table lexer_table={} --maps from ASCII value+1 (note +1!) to LEX_* constant sequence lexer_table_key2 lexer_table_key2={} string hex_chars hex_chars="0123456789abcedfABCDEF" sequence hex_char_values hex_char_values={0,1,2,3,4,5,6,7,8,9,#a,#b,#c,#e,#d,#f,#A,#B,#C,#D,#E,#F} sequence compiled_data compiled_data={} --used inside binary_compile only; compiled data for the current script map node_lookup --used inside binary_compile only; map from serialised nodes to offsets string current_script current_script="" --name of current script integer cur_script_num cur_script_num=0 --index into all_scripts of current script integer colors_enabled colors_enabled=true integer simple_colorcodes simple_colorcodes=false integer error_file error_file=false map used_globals used_globals=map:new() --set of ids (keys are ids, values are true) map used_locals used_locals=map:new() --set of ids (keys are ids, values are true) sequence string_table string_table={} --binary data string plotscr_version plotscr_version="" integer max_used_function max_used_function=0 --maximum id of a function used in a script integer fast_mode fast_mode=false integer end_anchor_kludge end_anchor_kludge=false integer was_warnings was_warnings=false --------------------------------------------------------------------------- --types-- --A position in the script source (where newlines are one character). --Encoded in 32-bit integer, where: --bits 0..7 : Length of the token in characters, including whitespace -- (aside from leading/trailing whitespace), capped to 254. -- (255 reserved for future use) --bit 8 : Virtual flag --bits 9..31: The (1-based) character number of the start of the token in the file -- plus the file's FILE_OFFSET -- --The first file has FILE_OFFSET=1, to ensure srcpos 0 is never used. --Offset FILE_OFFSET+0 is reserved to refer to the file as a whole (unused); --FILE_OFFSET+1 is the start of the file. -- --See also encode_srcpos and decode_srcpos type srcpos(object pos) --need to allow 0 as a dummy value, and ignore max_srcpos if it's uninitialised) if atom(pos) and pos=floor(pos) and pos>=0 and (max_srcpos=0 or pos<=max_srcpos) then --odd, 'or' and 'and' only seem to shortcut when used in if condition, not --in a general expression return true else return false end if end type --Not the same as Euphoria's string type, which is limited to extended ASCII type string(object str) if not sequence(str) then return false end if --Too slow -- for i=1 to length(str) do -- if not integer(str[i]) or str[i]<0 then -- return false -- end if -- end for return true end type type Kind(integer kind) return kind>=0 and kind<=KIND_LAST end type --SYM_KIND,SYM_ID,SYM_POS tuple type Symbol(object sym) return length(sym)=3 and Kind(sym[SYM_KIND]) and srcpos(sym[SYM_POS]) end type --A CMD_TEXT,CMD_POS pair type Token(object tok) if sequence(tok) and length(tok)=2 and string(tok[1]) and srcpos(tok[2]) then return true else return false end if end type --A CMD_TEXT,CMD_POS,VAR_FRAME,VAR_ID tuple describing a local/nonlocal variable type VarToken(object var) if length(var)=4 and string(var[1]) and srcpos(var[2]) and integer(var[3]) and integer(var[4]) then return true end if return false end type --sequence of Tokens type TokenList(object list) if not sequence(list) then return false end if for i=1 to length(list) do if not Token(list[i]) then return false end if end for return true end type --sequence of VarTokens type VarList(object vars) if not sequence(vars) then return false end if for i=1 to length(vars) do if not VarToken(vars[i]) then return false end if end for return true end type --{Token,children} type Node(object node) if sequence(node) and length(node)=2 and Token(node[1]) and sequence(node[2]) then for i=1 to length(node[2]) do if not Node(node[2][i]) then return false end if end for return true else return false end if end type --sequence of Nodes type NodeList(object list) if not sequence(list) then return false end if for i=1 to length(list) do if not Node(list[i]) then return false end if end for return true end type --An element of all_scripts type Script(object script) if sequence(script) and length(script)>=_SCRIPT_MIN_LAST and length(script)<=_SCRIPT_MAX_LAST then return true else return false end if end type --------------------------------------------------------------------------- --time spent waiting for a user-keypress shouldnt count function timeless_wait_key() atom skip_time integer key skip_time=time() key=wait_key() skip_time=time()-skip_time start_time+=skip_time for i=1 to length(timing_data) do if timing_data[i][TIMING_ACTIVE] then timing_data[i][TIMING_TIME]+=skip_time end if end for return(key) end function --------------------------------------------------------------------------- --Euphoria's built-in sprintf truncates all string elements to 8 bits, --so this is an Unicode-enabled replacement. Supports %d, %s, %g, %% --Beware! This will be way slower than sprintf, but luckily it's not --needed anywhere where speed matters. function sprintf_utf(string s, sequence printf_args) integer seg_start, seg_end integer arg_ctr integer code string ret if length(printf_args)=0 then return s end if ret="" arg_ctr=1 seg_start=1 seg_end=find('%',s) while seg_end do ret&=s[seg_start..seg_end-1] code=s[seg_end+1] if code='s' then ret&=printf_args[arg_ctr] arg_ctr+=1 elsif code='d' or code='g' then ret&=sprintf({'%',code},printf_args[arg_ctr]) arg_ctr+=1 elsif code='%' then ret&='%' else simple_error(sprintf("sprintf_utf: unrecognised format code %%%s\n",{s[seg_end+1]})) end if seg_start=seg_end+2 seg_end=find('%',s,seg_start) end while if seg_start<=length(s) then ret&=s[seg_start..$] end if if arg_ctr!=length(printf_args)+1 then simple_error(sprintf("sprintf_utf: recieved %d format arguments, only used %d\n",{length(printf_args),arg_ctr-1})) end if return ret end function --------------------------------------------------------------------------- --fprintf as UTF8 procedure print_utf(object fh, string s, sequence printf_args) if length(printf_args) then s=sprintf_utf(s,printf_args) end if s=toUTF(s,utf_32,utf_8) puts(fh,s) end procedure --------------------------------------------------------------------------- --prints a string with printf to stdout converting color codes procedure color_print(string s, sequence printf_args) string buffer s=sprintf_utf(s,printf_args) if simple_colorcodes then buffer=s else buffer="" for i=1 to length(s) do if s[i]>=COL_FIRST and s[i]<=COL_LAST then print_utf(stdout,buffer,{}) buffer="" if colors_enabled then text_color(HS_TO_EU_COLORS[s[i]-COL_FIRST+1]) end if else buffer&=s[i] end if end for end if if length(buffer) then print_utf(stdout,buffer,{}) end if end procedure --------------------------------------------------------------------------- procedure opt_wait_for_key(integer error_or_warning) if (not error_or_warning and find('j',optlist)) or find('k',optlist) then --skip else color_print("[Press Any Key]\n",{}) timeless_wait_key() end if end procedure --------------------------------------------------------------------------- procedure enter_timing_zone(string description) if not find('t',optlist) then return end if timing_depth+=1 timing_data=append(timing_data,{timing_depth,description,-time(),true}) end procedure --------------------------------------------------------------------------- procedure reenter_timing_zone(string description) if not find('t',optlist) then return end if for i=1 to length(timing_data) do if equal(timing_data[i][TIMING_DESCRIPTION],description) then timing_depth+=1 timing_data[i][TIMING_DEPTH]=timing_depth --could change... timing_data[i][TIMING_TIME]-=time() timing_data[i][TIMING_ACTIVE]=true return end if end for enter_timing_zone(description) end procedure --------------------------------------------------------------------------- procedure exit_timing_zone() if not find('t',optlist) then return end if for i=length(timing_data) to 1 by -1 do --there can only be one active timing job at each depth at a time if timing_data[i][TIMING_ACTIVE] and timing_data[i][TIMING_DEPTH]=timing_depth then timing_data[i][TIMING_TIME]+=time() timing_data[i][TIMING_ACTIVE]=false exit end if end for timing_depth-=1 end procedure --------------------------------------------------------------------------- procedure print_timing_data() string indent sequence data string tmp1, tmp2 if run_time=0 then --so that all percentages show as 0% run_time=1e100 end if for i=1 to length(timing_data) do data=timing_data[i] indent=repeat(' ',data[TIMING_DEPTH]*4) tmp1=sprintf("%.2f",{data[TIMING_TIME]}) tmp2=sprintf("%5.2g",{100*data[TIMING_TIME]/run_time}) color_print("%s "&COLBWHI&"%s"&COLWHI&"s %s%% %s\n",{indent,tmp1,tmp2,data[TIMING_DESCRIPTION]}) end for end procedure --------------------------------------------------------------------------- function html_char_convert(string s) string buffer string result result="" buffer="" for i=1 to length(s) do if s[i]=' ' and i>1 then if s[i-1]=' ' then buffer&=" " else buffer&=s[i] end if elsif s[i]='<' then buffer&="<" elsif s[i]='>' then buffer&=">" else buffer&=s[i] end if end for if length(buffer) then result&=buffer end if return(result) end function --------------------------------------------------------------------------- function error_string_convert(string s) string buffer string result result="" buffer="" for i=1 to length(s) do if s[i]>=COL_FIRST and s[i]<=COL_LAST then if s[i]=COLYEL then buffer&="<font color=\"#F0F000\">" elsif s[i]=COLRED then buffer&="</font>" elsif s[i]=COLPNK then buffer&="<font color=\"#F07070\">" elsif s[i]=COLWHI then buffer&="<font color=\"#909090\">" elsif s[i]=COLBWHI then buffer&="<font color=\"#F0F0F0\">" end if elsif s[i]='\n' then buffer&="<br>\n" else buffer&=s[i] end if end for if length(buffer) then result&=buffer end if return(result) end function --------------------------------------------------------------------------- --prints a long string wrapped at 80 columns procedure wrap_print(string s, sequence arguments) string outstring s=sprintf_utf(s,arguments) while length(s) do outstring=before_wrap_point(s) s=after_wrap_point(s) color_print("%s\n",{outstring}) end while end procedure --------------------------------------------------------------------------- procedure error_file_print(string s) integer fh if error_file then fh=open(compiler_dir&"hs_error.htm","a") if fh!=failure then print_utf(fh,error_string_convert(s)&"\n",{}) close(fh) end if end if end procedure --------------------------------------------------------------------------- --prints out warning message in red with word wrap procedure simple_warn(string s) sequence pos if not find('w',optlist) then --do not warn if -w is set pos=get_position() if pos[2]>1 then print_utf(stdout,"\n",{}) end if wrap_print(COLRED&"WARNING: %s"&COLWHI&"\n",{s}) error_file_print(sprintf_utf("<Font color=\"#FF0000\">WARNING: %s</font>",{html_char_convert(s)})) was_warnings = true end if end procedure --------------------------------------------------------------------------- --prints out an error message in red with word wrap, then aborts procedure simple_error(string s) sequence pos pos=get_position() if pos[2]>1 then print_utf(stdout,"\n",{}) end if wrap_print(COLRED&"ERROR: %s"&COLWHI&"\n",{s}) error_file_print(sprintf_utf("<font color=\"#FF0000\">ERROR: %s</font>",{html_char_convert(s)})) if end_anchor_kludge then error_file_print("</a>\n<hr>\n") end if opt_wait_for_key(true) --Custom doesn't check hspeak's error code (due to difficulty retrieving it after spawning), --it only checks whether the output .hs file was created. if cleanup_dest then if dest_file_fh then close(dest_file_fh) end if delete_file(dest_file) end if abort(1) end procedure --------------------------------------------------------------------------- --prints out usage info if not enough arguments procedure check_arg_count(sequence args) if length(args)=2 then wrap_print("HamsterSpeak Compiler v%s%s\n",{COMPILER_VERSION,COMPILER_SUB_VERSION}) wrap_print("(C)2001-%s James Paige, Ralph Versteegen, and the OHRRPGCE Developers\n",{COPYRIGHT_DATE}) wrap_print("Please read LICENSE.txt for GNU GPL & MIT dual license details and disclaimer of liability",{}) wrap_print(COLYEL&"%s [-%s] [long-options] source.hss [dest.hs]"&COLWHI&"\n\n",{file_only(args[2]),VALID_OPTIONS}) color_print(" -f fast mode. Disables some optimization\n",{}) color_print(" -j don't wait for a keypress when finished if there's no error or warning\n",{}) color_print(" -k don't wait for a keypress when finished\n",{}) color_print(" -n don't add debug info, or a copy of the source code, to .hs file\n",{}) color_print(" -w suppress warnings\n",{}) color_print(" -u show warnings for unused variables\n",{}) color_print(" -y overwrite the destination file without asking\n",{}) color_print(" --incdir <path>\n" & " also search this directory for include files\n",{}) color_print(" --include <filename>\n" & " include an additional file\n",{}) color_print("\nAdvanced options (not intended for normal use):\n",{}) color_print(" -b don't include plotscr.hsd automatically\n",{}) color_print(" -c colors will be disabled\n",{}) color_print(" -d dump debug report to hs_debug.txt\n",{}) color_print(" -s print the name of each plotscript as it is compiled\n",{}) color_print(" -a same as -s, but printing all scripts\n",{}) color_print(" -t print detailed timing info (for HSpeak developers)\n",{}) color_print(" -z write error messages to hs_error.htm\n",{}) --Undocumented arguments: -- --unicode-cols -- used by Hamster Whisper for piped output -- --reuse-ids scripts.bin -- Read the provided scripts.bin file from a previous compilation, and reuse script ID mapping wrap_print("\nFor more info about HamsterSpeak visit "&COLBWHI&"http://HamsterRepublic.com/ohrrpgce"&COLWHI&"\n",{}) wrap_print("\nThis is a command-line program. You should either run it from the command-line, or drag and drop your script file onto it. You don't ever need to run HSpeak yourself; Custom will do that for you.\n",{}) opt_wait_for_key(false) abort(0) end if end procedure --------------------------------------------------------------------------- --Build the lexer tables, which for each code point 0<=i<256, contains the type --of that character in entry i+1 (have to handle NUL) --Other characters above code 255 are always LEX_CHAR procedure init_lexer() sequence table_insert --main table --(\xa0 is non-breaking space) table_insert={" (),\n#\t\"[]{}.\xa0", {LEX_SPACE,LEX_BEGIN,LEX_END,LEX_COMMA,LEX_NEWLINE,LEX_COMMENT,LEX_SPACE,LEX_STRING,LEX_WARN,LEX_WARN,LEX_WARN,LEX_WARN,LEX_WARN,LEX_SPACE}} --by default each character is either binary, or an allowed identifier --character. This is for backwards compatibility --NOTE the +1! lexer_table=repeat(LEX_BINARY,32) & repeat(LEX_CHAR,127-32) & repeat(LEX_BINARY,#A1-127) & repeat(LEX_CHAR,256-#A1) for i=1 to length(table_insert[1]) do lexer_table[table_insert[1][i]+1]=table_insert[2][i] end for for i=0 to 9 do lexer_table['0'+i+1]=LEX_DIGIT end for for i=0 to 5 do lexer_table['a'+i+1]=LEX_HEXDIGIT lexer_table['A'+i+1]=LEX_HEXDIGIT end for --add the one-character prefix of each operator for i=1 to length(separator_list) do lexer_table[separator_list[i][1]+1]=LEX_OPERATOR end for --keyword suffix parsing table lexer_table_key2=repeat(LEX_CHAR,256) lexer_table_key2[' '+1]=LEX_SPACE lexer_table_key2['\t'+1]=LEX_SPACE --2nd character of each operator for i=1 to length(separator_list) do if length(separator_list[i])=2 then lexer_table_key2[separator_list[i][2]+1]=LEX_OPERATOR2 end if end for --when floating point is added will want a table for number parsing end procedure --------------------------------------------------------------------------- --initializes global variables, and generally gets things ready to roll procedure init() sequence args integer index integer key integer fh sequence sym --Process arguments args=command_line() compiler_dir=path_only(args[2]) check_arg_count(args) wrap_print("HamsterSpeak Compiler v%s%s\n",{COMPILER_VERSION,COMPILER_SUB_VERSION}) optlist={} index=3 while index<=length(args) do --These are undocumented if equal(args[index],"--unicode-cols") then simple_colorcodes=true args=delete_element(args,index) elsif equal(args[index],"--reuse-ids") and index+1<=length(args) then color_print(COLWHI&"Reading script IDs from previous compile\n",{}) reuse_ids=read_scripts_dot_bin(args[index+1]) args=delete_slice(args,index,index+1) elsif equal(args[index],"--include") and index+1<=length(args) then additional_includes&={args[index+1]} args=delete_slice(args,index,index+1) elsif equal(args[index],"--incdir") and index+1<=length(args) then include_dir=args[index+1] args=delete_slice(args,index,index+1) elsif args[index][1]='-' then optlist=optlist&hs_lower(args[index][2..length(args[index])]) args=delete_element(args,index) else index+=1 end if end while for i=1 to length(optlist) do if not find(optlist[i], VALID_OPTIONS) then simple_warn("Unknown commandline option -" & optlist[i]) end if end for check_arg_count(args) source_file=normalize_filename(args[3]) if length(args)>3 then dest_file=normalize_filename(args[4]) else dest_file=normalize_filename(alter_extension(source_file,"hs")) end if if find('f',optlist) then fast_mode=true color_print("Using fast mode. Some size optimization disabled\n",{}) end if if find('c',optlist) then colors_enabled=false end if if not find('n',optlist) then debug_info=true end if --'x' used by old versions of Hamster Whisper to specify color code format --However this clashes with Latin-1 characters and doesn't work with UTF-8 anyway --Replaced by --unicode-cols --the semi-undocumented command line argument -z writes a file called --hs_error.htm formatted for HssEd to read if find('z',optlist) then error_file=true if file_exists(compiler_dir&"hs_error.htm") then fh=open(compiler_dir&"hs_error.htm","w") if fh!=failure then puts(fh,"") close(fh) end if end if end if wrap_print("Compiling "&COLBWHI&"%s"&COLWHI&" to "&COLBWHI&"%s"&COLWHI&"\n",{source_file,dest_file}) if file_exists(dest_file) then if find(extension_only(dest_file),{"hss","txt"}) then simple_error("Refusing to write output to "&dest_file&" because only input files should have hss/txt extension!") end if if find('y',optlist) then --found the -y command line arg, overwrite automatically else while true do --prompt the user to overwrite wrap_print("file "&COLBWHI&"%s"&COLWHI&" already exists. Overwrite it? (Y/N)",{dest_file}) key=timeless_wait_key() color_print(" "&COLYEL&"%s"&COLWHI&"\n",{key}) if hs_lower(key)='y' then exit elsif hs_lower(key)='n' then simple_error("output file overwrite cancelled by user") else wrap_print(COLYEL&"%s"&COLWHI&"? "&COLYEL&"%s"&COLWHI&"!? How is that Y or N?",{key,hs_upper(key)}) end if end while end if end if --From now on, delete dest_file on error cleanup_dest=true --Init global data tables global_scope=map:new_from_kvpairs({ {"defineconstant",{KIND_TOPLEVEL,0,0}} ,{"definetrigger" ,{KIND_TOPLEVEL,0,0}} ,{"defineoperator",{KIND_TOPLEVEL,0,0}} ,{"globalvariable",{KIND_TOPLEVEL,0,0}} ,{"definefunction",{KIND_TOPLEVEL,0,0}} ,{"definescript" ,{KIND_TOPLEVEL,0,0}} ,{"plotscrversion",{KIND_TOPLEVEL,0,0}} ,{"include" ,{KIND_TOPLEVEL,0,0}} --should never appear, in theory ,{"cfor" ,{KIND_UNIMPLEMENTED,0,0}} ,{"foreach" ,{KIND_UNIMPLEMENTED,0,0}} ,{"=" ,{KIND_UNIMPLEMENTED,0,0}} ,{"elseif" ,{KIND_FLOW,-1,0}} --not in flow_list... shouldn't that be a problem? ,{"exit" ,{KIND_FLOW,-1,0}} --not in flow_list --keywords have special case handling ,{"subscript" ,{KIND_KEYWORD,0,0}} --subscript is also in flow_list! ,{"variable" ,{KIND_KEYWORD,0,0}} ,{"tracevalue" ,{KIND_MACRO,0,0}} ,{"tracevalueof" ,{KIND_MACRO,0,0}} ,{"showvalueof" ,{KIND_MACRO,0,0}} ,{"assert" ,{KIND_MACRO,0,0}} -- Stray "$"'s are picked up by the lexer (not totally ideal maybe) }) for i=1 to length(flow_list) do sym=flow_list[i] if map:has(global_scope, sym[PAIR_NAME])=false then --omit subscript map:put(global_scope, sym[PAIR_NAME], {KIND_FLOW,sym[PAIR_NUM],0}) end if end for for i=1 to length(math_list) do sym=math_list[i] map:put(global_scope, sym[PAIR_NAME], {KIND_MATH,sym[PAIR_NUM],0}) end for -- Adding the separators (operators) here prevents them from being used before -- defineoperator is handled (either a block before include,plotscr.hsd, or in -- e.g. defineconstant) for i=1 to length(separator_list) do if map:has(global_scope,separator_list[i])=false then --omit = map:put(global_scope,separator_list[i],{KIND_SEPARATOR,i,0}) end if end for for i=1 to length(attribute_list) do map:put(global_scope,attribute_list[i][ATTR_NAME],{KIND_ATTRIBUTE,i,0}) end for --Append subscript trigger id (this is needed for scripts.bin) trigger_list=append(trigger_list,{-1,"subscript"}) --Lexer finite state transition tables init_lexer() end procedure --------------------------------------------------------------------------- --Returns offset component of a srcpos (usually you want to use decode_srcpos_file --instead if you want to know the file offset) function srcpos_offset(srcpos pos) integer ret --It appears to be a Euphoria euc bug that it's necessary to explicitly cast to integer ret=floor(pos/power(2,9)) return ret end function --------------------------------------------------------------------------- --Returns length of a srcpos (between 0 and 254) function srcpos_length(srcpos pos) return and_bits(pos,power(2,8)-1) end function --------------------------------------------------------------------------- --returns {filenumber,file position} pair, later being 1-based character offset function decode_srcpos_file(srcpos pos) integer len, offset --0 is an invalid srcpos if pos=0 then simple_error("compiler bug: can't decode uninitialised srcpos") end if integer point point=srcpos_offset(pos) if not integer(point) or point<=0 then simple_error(sprintf("compiler bug: bad srcpos offset %d", {point})) end if for i=1 to length(file_list) do len=length(file_list[i][FILE_TEXT]) offset=file_list[i][FILE_OFFSET] if point<=offset+len then return {i,point-offset} exit end if end for simple_error(sprintf_utf("compiler bug: can't decode invalid srcpos %d",{pos})) end function --------------------------------------------------------------------------- function srcpos_file_number(srcpos pos) sequence temp temp=decode_srcpos_file(pos) return temp[1] end function --------------------------------------------------------------------------- --Returns 1-based character number in file function srcpos_file_offset(srcpos pos) sequence temp temp=decode_srcpos_file(pos) return temp[2] end function --------------------------------------------------------------------------- --Combine base/file offset (usually FILE_OFFSET), token offset (in --characters from start of file counting from 1) and token length into --a non-virtual a srcpos. The redundancy between baseoffset and --offset is for clarity. function encode_srcpos(integer baseoffset, integer offset, integer len) if len > 254 then len = 254 end if if offset < 1 or len < 0 then --offset 0 is allowed, it refers to the file. But if you wanted it you'd --just write file_list[i][FILE_OFFSET]. So disallow it here. simple_error(sprintf("Compiler bug! encode_srcpos(%d,%d,%d)\n",{baseoffset,offset,len})) end if return (baseoffset+offset)*power(2,9)+len end function --------------------------------------------------------------------------- --This transforms a srcpos into one marked as virtual. Indicates a roughly relevant position, but --not something that actually occurs in the source. Use for inserted dummy code function virtual_pos(srcpos pos) return(or_bits(pos,256)) end function --------------------------------------------------------------------------- --Return just the original text of a token. --Unlike decode_srcpos, this is quite fast function get_srcpos_text(srcpos pos) integer point, tokenlen, filenum sequence temp temp=decode_srcpos_file(pos) filenum=temp[1] point=temp[2] tokenlen=srcpos_length(pos) if tokenlen=0 then return "" else return file_list[filenum][FILE_TEXT][point..point+tokenlen-1] end if end function --------------------------------------------------------------------------- --Probably slow: avoid heavy use. --Returns {file name, line number, column number, token length, line text, token text}: index with POS_* --Column number returned is 0-based function decode_srcpos(srcpos pos) integer point, tokenlen, filenum, columnnum integer lineno, line_start, line_end string filetext, linetext, tokentext sequence lines sequence temp if pos=0 then simple_error("compiler bug: can't decode uninitialised srcpos") end if temp=decode_srcpos_file(pos) filenum=temp[1] point=temp[2] tokenlen=srcpos_length(pos) --Find the line it's on filetext=file_list[filenum][FILE_TEXT] lines=file_list[filenum][FILE_LINE_EXTENTS] for i=1 to length(lines) do if point>=lines[i][PAIR_START] then lineno=i else exit end if end for line_start=lines[lineno][PAIR_START] line_end=lines[lineno][PAIR_END] linetext=filetext[line_start..line_end] -- +1 for 1-based column indexing columnnum=point-line_start+1 if tokenlen=0 then tokentext="" else tokentext=linetext[columnnum..min({length(linetext),columnnum+tokenlen-1})] end if return({file_list[filenum][FILE_NAME],lineno,columnnum,tokenlen,linetext,tokentext}) end function --------------------------------------------------------------------------- function form_error_text(string s, srcpos pos) integer poscol, poslen, posend string posline, line_display sequence src_position string column_display, caret_display integer tab_compensate, token_tab_compensate src_position=decode_srcpos(pos) --?pos --pretty_print(1,src_position,{2}) posline=src_position[POS_LINETEXT] poscol=src_position[POS_COLUMN] poslen=src_position[POS_LENGTH] --Expand tabs, because we don't know how the terminal displays then. line_display=substring_replace(posline,"\t"," ") posline&=" " --Used to point to the newline at the end posend=poscol+large(0,poslen-1) if posend>length(posline) then --One past the end of the actual line is allowed simple_warn(sprintf_utf("Compiler bug: srcpos past end of line %d", {src_position[POS_LINE]})) posend=length(posline) end if --Count number of tabs in the line before and in the token, --so that we can determine their display length tab_compensate=3*count('\t',posline[1..poscol-1]) token_tab_compensate=3*count('\t',posline[poscol..posend]) caret_display=repeat('^',large(1,poslen)+token_tab_compensate) column_display=repeat(' ',poscol-1+tab_compensate) error_file_print(sprintf_utf("<a href=\"%s#%d\">\n",{src_position[POS_FILENAME],src_position[POS_LINE]})) if length(current_script) then return( sprintf_utf( "in script "&COLYEL&"%s"&COLRED&" on line %d in "&COLPNK&"%s"&COLRED&"\n" &COLBWHI&"%s\n" &"%s%s\n" &COLRED&"%s\n" ,{current_script,src_position[POS_LINE],src_position[POS_FILENAME],line_display,column_display,caret_display,s} ) ) else return( sprintf_utf( "in line %d of "&COLPNK&"%s"&COLRED&"\n" &COLBWHI&"%s\n" &"%s%s\n" &COLRED&"%s\n" ,{src_position[POS_LINE],src_position[POS_FILENAME],line_display,column_display,caret_display,s} ) ) end if --note that the calling procedure must send/deal with closing tags to error_file_print end function --------------------------------------------------------------------------- procedure src_warn(string s, srcpos pos) if not find('w',optlist) then --do not warn if -w is set simple_warn(form_error_text(s,pos)) error_file_print("</a>\n<hr>\n") end if end procedure --------------------------------------------------------------------------- procedure src_error(string s, srcpos pos) end_anchor_kludge=true simple_error(form_error_text(s,pos)) end procedure --------------------------------------------------------------------------- --Find newlines in a file (possibly mixed type), returns line extents --({start of line, end of line} pairs, both inclusive, newlines excluded) --and the file_text with Mac lineendings converted to Unix, because the engine's --decode_srcpos expects source.lumped contents don't use Mac lineendings. --Note also read_file() adds a newline at the end if none. function split_lines(string file_text) integer line_start=1 integer last_13=-1 sequence line_extents = {} for i=1 to length(file_text) do if file_text[i]=13 then line_extents=append(line_extents, {line_start,i-1}) last_13=i line_start=i+1 if i+1<=length(file_text) and file_text[i+1]!=10 then -- A \r lineending file_text[i]=10 end if elsif file_text[i]=10 then if last_13!=i-1 then -- handle {13,10} "\r\n" line_extents=append(line_extents, {line_start,i-1}) end if line_start=i+1 end if end for if line_start<=length(file_text) then line_extents=append(line_extents, {line_start,length(file_text)}) end if return {line_extents,file_text} end function --------------------------------------------------------------------------- --Read a file, adding to file_list. --pos is used only for location of include statement, may be 0. procedure load_source(string filename, string reading_how, srcpos pos) integer fh string file_text sequence line_extents -- {start of line, end of line} pairs integer offset -- FILE_OFFSET value -- First file starts at FILE_OFFSET=1, to ensure srcpos 0 is never used, -- and each file's FILE_OFFSET+0 is reserved. First char of first file -- has offset 2. offset=1 if length(file_list) then offset=file_list[$][FILE_OFFSET]+length(file_list[$][FILE_TEXT])+1 end if fh=open(filename,"r") if fh!=failure then wrap_print("%s "&COLBWHI&"%s"&COLWHI&"\n",{reading_how,filename}) reenter_timing_zone("read_file") --Autodetect the encoding file_text=read_file(fh, TEXT_MODE, UTF) exit_timing_zone() close(fh) sequence temp=split_lines(file_text) line_extents=temp[1] file_text=temp[2] total_lines+=length(line_extents) file_list=append(file_list,{filename,file_text,line_extents,offset,""}) --Maximum valid srcpos points to length 1 virtual token at end of file. --max_srcpos is used to do srcpos type checking... so we need to disable that --check before we can assign to it! max_srcpos=0 max_srcpos=virtual_pos(encode_srcpos(offset,length(file_text),1)) else if pos then src_error(sprintf_utf("file "&COLYEL&"%s"&COLRED&" not found\n",{filename}),pos) else simple_error(sprintf_utf("file "&COLYEL&"%s"&COLRED&" not found\n",{filename})) end if end if end procedure --------------------------------------------------------------------------- --Load and lex, recursively finding includes procedure load_and_lex_source(string filename, string reading_how, srcpos pos, sequence include_stack) string canon_filename --We don't really support \ in paths on Unix. canon_filename=normalize_filename(filename) ifdef WINDOWS then canon_filename=hs_lower(canon_filename) end ifdef if find(canon_filename,include_stack) then src_error(sprintf_utf("File "&COLPNK&"%s"&COLRED&" is being included recursively",{filename}),pos) end if include_stack&={canon_filename} if find(canon_filename,column(file_list,FILE_NAME)) then --This exact same include file has already been included. --Silently ignore this include. Double includes are allowed because of automatic inclusion. --src_warn(sprintf_utf("File "&COLPNK&"%s"&COLRED&" is being included multiple times",{include_name}),pos) return end if --Reads one additional file reenter_timing_zone("load_and_lex_source/reading") load_source(canon_filename,reading_how,pos) exit_timing_zone() --Lex that file, and load and lex all included files lex_file(length(file_list), include_stack) end procedure --------------------------------------------------------------------------- function find_include_file_globally(string include_name) if file_exists(compiler_dir&include_name) then return normalize_filename(compiler_dir&include_name) end if --in case we're installed at $prefix/games, try $prefix/share/games/ohrrpgce/ if file_exists(compiler_dir&"../share/games/ohrrpgce/"&include_name) then return normalize_filename(compiler_dir&"../share/games/ohrrpgce/"&include_name) end if --Failure: return filename, used for error messages return include_name end function --------------------------------------------------------------------------- --Search for an include file function find_include_file(string include_name) sequence ret if equal(include_name,"plotscr.hsd") or equal(include_name,"scancode.hsi") then --Prefer to use fundamental include files found in global locations, because --plotscr.hsd and scancode.hsi will be exported too, when exporting scripts --from Custom. return find_include_file_globally(include_name) end if --try source directory if file_exists(path_only(source_file)&include_name) then return normalize_filename(path_only(source_file)&include_name) end if if file_exists(include_dir&include_name) then return include_dir&include_name end if --try current directory (which will be where the .rpg is) if file_exists(include_name) then return include_name end if --try global locations return find_include_file_globally(include_name) end function --------------------------------------------------------------------------- procedure show_source_info() if total_lines then wrap_print("%d lines read from %d files\n",{total_lines,length(file_list)}) else simple_error("no data to compile\n") end if if file_list[$][FILE_OFFSET]+length(file_list[$][FILE_TEXT])>power(2,23) then simple_warn("You have over 8 MB of scripts! Unfortunately compile and in-game errors will report the wrong line numbers for some errors.") end if end procedure --------------------------------------------------------------------------- procedure lexer_binary_error(srcpos pos) --better use src_error, to track down garbage if accidentally creeping in src_error("This file contains binary gunk. Are you sure that this is the text file you want to compile?",pos) end procedure --------------------------------------------------------------------------- --Lex a string, processing the escape codes. --s is the source text (typically whole file contents), i is the starting offset in s, --1 past the opening ", and stop is last permissible offset in s to include. --Returns {closing_quote_offset,quoted_string} (i.e. enclosed in quote marks, but with escape codes processed) function lexer_read_string(string s, integer i, integer stop, integer posoff) string string integer escaping integer at1, at2 escaping=false string="\"" while true do if i>stop then --did not find a closing " so not a valid string src_error("Expected \" at end of line to end string (multi-line strings are not supported)", encode_srcpos(posoff,i,1)) end if if s[i]<256 and lexer_table[s[i]+1]=LEX_BINARY then lexer_binary_error(encode_srcpos(posoff,i,1)) end if if escaping then if s[i]='"' or s[i]='\\' or s[i]='\'' then --including ' for closer compatibility with Python string&=s[i] elsif s[i]='n' then string&=10 elsif s[i]='t' then string&=9 elsif s[i]='x' then --escape 8 bit values (later will add \u and \U for 16 bit and 32 bit escapes) at1=0 at2=0 if i+2<=stop then at1=find(s[i+1],hex_chars) at2=find(s[i+2],hex_chars) end if if at1=0 or at2=0 then src_error("Expected two hexidecimal characters to follow \\x escape code",encode_srcpos(posoff,i-1,4)) end if string&={16*hex_char_values[at1]+hex_char_values[at2]} i+=2 else --invalid sequence string&='\\' string&=s[i] end if escaping=false else if s[i]='"' then return({i,string&'"'}) elsif s[i]='\\' then escaping=true else string&=s[i] end if end if i+=1 end while end function --------------------------------------------------------------------------- --Lex a signed or unsigned integer literal. --Returns {newi, text} with text is the string representation of an integer. function lexer_read_number(string s, integer i, integer stop, integer posoff) integer state atom val integer sign,base string text integer textstart integer textend integer digits textstart=i textend=i --Process negative prefix, because it isn't treated as a unary operator if s[i]='-' then text="-" i+=1 sign=-1 else text="" sign=1 end if --Skip whitespace while i<=stop and (s[i]=' ' or s[i]='\t') do i+=1 end while --Check for base prefix 0x 0o or 0b. No separating space allowed! base=10 if i+1<=stop and s[i]='0' then if s[i+1]='x' or s[i+1]='X' then base=16 i+=2 elsif s[i+1]='o' or s[i+1]='O' then base=8 i+=2 elsif s[i+1]='b' or s[i+1]='B' then base=2 i+=2 end if end if val=0 digits=0 while i<=stop do if s[i]<256 then state=lexer_table[s[i]+1] else state=LEX_CHAR end if --color_print("lexing number i=%d %d=%s state %d\n",{i,s[i],{s[i]},state}) if state=LEX_DIGIT then --0-9 integer digit digit=s[i]-'0' if digit>=base then src_error(sprintf_utf("Found digit "&COLYEL&"%s"&COLRED&" which is not permitted in a base-%d number",{s[i],base}),encode_srcpos(posoff,i,1)) end if text&=s[i] textend=i val=val*base+digit digits+=1 elsif base=16 and state=LEX_HEXDIGIT then --a-fA-F text&=s[i] textend=i val=val*16+(toLower(s[i])-'a'+10) digits+=1 elsif state=LEX_SPACE then --yes, allowed in numbers too elsif state=LEX_COMMA or state=LEX_BEGIN or state=LEX_END or state=LEX_OPERATOR or state=LEX_COMMENT or state=LEX_BINARY or state=LEX_NEWLINE then --backtrack one character --FIXME: bug: if this looks like part of a separator (KEYWORD) but isn't, then have granted a free comma where probably should throw an error instead i-=1 exit else --currently, state=LEX_CHAR or state=LEX_HEXDIGIT or state=LEX_WARN or state=LEX_STRING if s[i]='.' then src_error("Non-whole (decimal) numbers are not yet implemented", encode_srcpos(posoff,textstart,i-textstart+1)) end if if equal(text,"-") then exit --Jumps down to the 'if digits=0' error case end if src_error( sprintf_utf("Found unexpected character "&COLYEL&"%s"&COLRED&" while reading a number, "&COLYEL&"%s"&COLRED&". Names such as variables can't start with a digit. Or maybe you forgot a comma or operator?" ,{s[i],text}) ,encode_srcpos(posoff,i,1) ) end if i+=1 end while if digits=0 then --Encountered just '-' or '0x' or '-0x' (etc) if equal(text,"-") then src_error(COLYEL&"-"&COLRED&" can only be used to take the negation of a number, not other values/expressions (sorry!). Write "&COLYEL&"-1 * ..."&COLRED&" instead. Write "&COLYEL&"x -- y"&COLRED&" for subtraction.", encode_srcpos(posoff, textstart, 1)) else src_error(sprintf_utf(COLYEL&"%s"&COLRED&" looks like the start of a number, but there are no digits!",{s[textstart..i-1]}), encode_srcpos(posoff, textstart, i-textstart+1)) end if end if if base!=10 then --Special behaviour for non-decimals: silently overflow large 32 bit unsigned values to negatives if val>=#80000000 and val<#100000000 then val=val-#100000000 end if --Convert to text text=sprintf("%d",{sign*val}) end if if not int32(sign*val) then src_error(sprintf_utf(COLYEL&"%s"&COLRED&" is too big to be stored in a 32-bit signed integer. Numbers must be between -2147483648 and 2147483647",{text}),encode_srcpos(posoff,textstart,textend-textstart+1)) end if return({textend,text}) end function --------------------------------------------------------------------------- --Clean the rest of a line for seek_include after 'include' occurs --s is the whole line, i is end of the 'include', posoff is the file position at start of line function lexer_read_include_line(TokenList tokens, string s, integer i, integer stop, integer posoff) integer at integer whitespace_len sequence temp string filename_string srcpos pos integer isquoted if not equal(tokens[1][CMD_TEXT],"include") then src_error(COLYEL&"include"&COLRED&" must be the only statement on the line. Maybe you tried to use this reserved keyword illegally?",tokens[$][CMD_POS]) end if filename_string="" --Trim the 'include', comments, and whitespace, i+=1 --Skip the comma or whatever character ended the 'include' s=s[i..stop] if length(s) then --Need to increment i while stripping while s[1]=' ' or s[1]='\t' do s=s[2..length(s)] i+=1 end while whitespace_len=0 --Optionally allow quotes around the filename. --A quote mark in the middle of a filename doesn't need any escaping isquoted=(s[1]='"') if isquoted then temp=lexer_read_string(s,2,length(s),posoff+i-1) -- temp is {closing_quote_offset,quoted_string} pos=encode_srcpos(posoff,i,temp[1]) s=s[temp[1]+1..$] --s is now whatever is after the string i+=temp[1] --1 past the closing " filename_string=temp[2] --enclosed in quotes whitespace_len=length(s) --Trim comment and whitespace to check for anything after the filename string at=find('#',s) if at then s=s[1..at-1] whitespace_len=at-1 end if s=trim_whitespace(s) if length(s) then src_error("Found unexpected text after "&COLYEL&"include, \"filename\""&COLRED,encode_srcpos(posoff,i,whitespace_len)) end if else --Trim comment and whitespace: for an unquoted filename this delimits the filename at=find('#',s) if at then s=s[1..at-1] end if s=trim_whitespace(s) --Trims only from the right because we already striped left filename_string=s pos=encode_srcpos(posoff,i,length(s)) end if tokens=append(tokens,{filename_string,pos}) end if if length(tokens)=1 then src_error("Expected name of file to follow "&COLYEL&"include"&COLRED,tokens[1][CMD_POS]) end if if length(tokens)>2 then src_error(sprintf_utf("Found unexpected syntax "&COLYEL&"%s"&COLRED&" instead of filename to include. Syntax of an include statement is "&COLYEL&"include, filename"&COLRED&" possibly with a path",{tokens[2][CMD_TEXT]}),tokens[2][CMD_POS]) end if return(tokens) end function --------------------------------------------------------------------------- -- s is usually the text for the whole file, the interval i..stop is a single line, -- and posoff is the srcpos offset for the beginning of the file function lex_line(sequence s, integer i, integer stop, integer posoff) integer state TokenList tokens string ident --identifier text being built up sequence temp string masked integer found integer textstart --the offset at which this identifier starts, otherwise 0 integer textend --the offset at which the last non-whitespace character was read string remem_ident integer remem_textstart integer backtracked integer last_state --A LEX_* constant; the last state that was not LEX_SPACE integer last_state_i --Value at i where last_state was set ident="" last_state=LEX_NEWLINE last_state_i=0 textstart=0 tokens={} --pretty_print(stdout, {"lex_line posoff: ", posoff, "i:", i, "srcpos:", decode_srcpos(encode_srcpos(posoff,i,0))}, {2}) while i<=stop do if s[i]<256 then state=lexer_table[s[i]+1] else state=LEX_CHAR --other unicode end if --color_print("lexing i=%d %d=%s state %d ident=%s\n",{i,s[i],{s[i]},state,ident}) if state=LEX_CHAR or state=LEX_HEXDIGIT then ident=ident & toLower(s[i]) if textstart=0 then textstart=i end if textend=i elsif state=LEX_SPACE then elsif state=LEX_DIGIT then if length(ident) then ident=ident & s[i] textend=i else --Start of a number not starting with - (those are parsed below) temp=lexer_read_number(s,i,stop,posoff) --{offset of end of number, number string} tokens=append(tokens,{temp[2],encode_srcpos(posoff,i,temp[1]-i+1)}) i=temp[1] end if elsif state=LEX_WARN then ident=ident & toLower(s[i]) if textstart=0 then textstart=i end if textend=i src_error(sprintf_utf(COLYEL&"%s"&COLRED&" is reserved and not allowed in names (identifiers)!", {s[i]}), encode_srcpos(posoff,i,1)) else --We've reached the end of a token... unless it's only the first character --of a two-character operator. remem_ident=ident --for backtracking remem_textstart=textstart backtracked=false if length(ident) then tokens=append(tokens,{ident,encode_srcpos(posoff,textstart,textend+1-textstart)}) end if ident="" textstart=0 if state=LEX_BEGIN then --( tokens=append(tokens,{"begin",encode_srcpos(posoff,i,1)}) elsif state=LEX_END then --) if last_state=LEX_COMMA then --foo(1,). This probably doesn't particularly need to be disallowed --Could also check for <op>) but that will be caught later end if tokens=append(tokens,{"end",encode_srcpos(posoff,i,1)}) elsif state=LEX_COMMA then --Check we didn't just finish an identifier, to allow using : (which is LEX_OPERATOR) on the end of an identifier. if length(remem_ident)=0 and (last_state=LEX_COMMA or last_state=LEX_BEGIN or last_state=LEX_OPERATOR or last_state=LEX_OPERATOR2) then --foo(, 1) or foo(1, , 2) or <op>, but not ), src_error( sprintf_utf("Found an extra comma which previous HSpeak versions ignored, but which might have a different meaning in the future. Remove it.",{}), encode_srcpos(posoff,i,1)) end if elsif state=LEX_NEWLINE then elsif state=LEX_COMMENT then exit elsif state=LEX_STRING then temp=lexer_read_string(s,i+1,stop,posoff) --{final offset, string} tokens=append(tokens,{temp[2],encode_srcpos(posoff,i,temp[1]-i+1)}) i=temp[1] elsif state=LEX_OPERATOR then --A character which the start of some auto-separating sequence such as +=. --Read the next non-space character. If it's LEX_OPERATOR2 we greedily --add it, otherwise we do nothing (we only support length 1 and 2 --separators). However, we might have have been too greedy (not a value --two-char separator), and need to put it back. textstart=i masked=s[i..i] integer lookahead_state --This while loop is in order to allow whitespace in the middle of separators. while i<stop do i+=1 if s[i]<256 then lookahead_state=lexer_table_key2[s[i]+1] else lookahead_state=LEX_CHAR end if if lookahead_state=LEX_OPERATOR2 then --Could be a second character. Stop masked&=s[i] exit elsif lookahead_state=LEX_CHAR then --includes binary gunk, warning characters and comments i-=1 exit end if --otherwise, it's a space character end while --now i+1 is the start of another token, or is > stop --Lookup for separators which are prefixes of masked found=false for j=1 to length(separator_list) do if match(separator_list[j],masked)=1 then --very ugly if length(separator_list[j])=1 then --Backtrack if necessary to first character in masked, otherwise remain at second i=textstart end if tokens=append(tokens,{separator_list[j],encode_srcpos(posoff,textstart,i-textstart+1)}) found=true exit end if end for if found=false then --need to backtrack, in particular for - --color_print("backtracking on '%s'\n", {masked}) backtracked=true ident=remem_ident & masked[1] i=textstart if length(remem_ident) then --This character appeared inside a normal token, resume reading that token tokens=tokens[1..$-1] textstart=remem_textstart elsif equal(ident,"-") then --'-' appears at the start of a token: this is only allowed to be a number. --We haven't appended anything to tokens that needs removing. ident="" temp=lexer_read_number(s,i,stop,posoff) tokens=append(tokens,{temp[2],encode_srcpos(posoff,i,temp[1]-i+1)}) i=temp[1] textstart=0 state=LEX_DIGIT --Sensible last_state to avoid 'extra comma' error else --a keyword character followed by garbage src_error( "Expected a two-character operator (like "&COLYEL&":="&COLRED&") but the second character (e.g. "&COLYEL&"="&COLRED&") is missing", encode_srcpos(posoff,i,1)) end if else textstart=0 if last_state=LEX_COMMA then -- ,<op> --(Could check for (<op> but that will be detected later anyway.) --Allowed if it's a unary operator. There's only one, since - is not an operator. if s[i]!='$' then src_error( "Found an extra comma which previous HSpeak versions ignored, but which might have a different meaning in the future. Remove it.", encode_srcpos(posoff,last_state_i,1)) end if end if end if elsif state=LEX_BINARY then lexer_binary_error(encode_srcpos(posoff,i,1)) end if if backtracked=false then if equal(remem_ident,"include") then --include is a special case, as normal lexing rules don't apply for rest of line if length(ident) then simple_error("compiler bug! [before lexer_read_include_line]\n") end if return lexer_read_include_line(tokens,s,i,stop,posoff) end if end if --End of a complete token end if if state!=LEX_SPACE then last_state_i=i last_state=state end if i+=1 end while if length(ident) then --ought to check whether ident is 'include'... tokens=append(tokens,{ident,encode_srcpos(posoff,textstart,textend+1-textstart)}) end if return(tokens) end function --------------------------------------------------------------------------- --this also checks for rogue strings and $'s function translate_plotstrings(TokenList tokens) integer state integer start_token integer i Token string_func Token string_token srcpos pos,vpos state=0 i=1 while i<=length(tokens) do if compare(tokens[i][CMD_TEXT],"$")=0 then pos=tokens[i][CMD_POS] vpos=virtual_pos(pos) if state=0 then state=1 start_token=i else --jump to error throw state=1 exit end if end if if state=2 then if tokens[i][CMD_TEXT][1]='"' then string_list=append(string_list,tokens[i][CMD_TEXT][2..$-1]) string_token={sprintf("@$string%d",{length(string_list)}),tokens[i][CMD_POS]} tokens=tokens[1..start_token-1]&{string_func,{"begin",vpos}}&tokens[start_token+1..i-2]&{string_token,{"end",vpos}}&tokens[i+1..$] state=0 --may have multiple strings on one line else --we saw a + (or illegal =) inside the string number expression state=1 end if end if if state=1 and compare(tokens[i][CMD_TEXT],"=")=0 then string_func={"setstringfromtable",pos} state=2 elsif state=1 and compare(tokens[i][CMD_TEXT],"+")=0 then string_func={"appendstringfromtable",pos} state=2 end if i+=1 end while if state!=0 then --we've seen a surplus $, not right src_error(COLYEL&"$"&COLRED&" may only be used as part of a $...=\"...\" or $...+\"...\" construct",pos) end if return(tokens) end function --------------------------------------------------------------------------- --Given a command tree, get the original text from the file --OK, this function is horribly complicated just so tracevalue can print --"hero X (me)" instead of "herox(me)". It could be implemented much simpler if --macros were expanded in the lexer, but then build_ast and --convert_macros would have to be as well function tree_original_text(Node tree) TokenList script_toks Node subtree srcpos leftmost, rightmost integer left_at, right_at integer filenum integer left_point, right_point integer depth TokenList tokens string text script_toks=all_scripts[cur_script_num][SCRIPT_BODY_TOKS] --Find leftmost and rightmost tokens making up the expression, not including brackets --(Do we care about virtual flags?) --NOTE: we're making an assumption about srcpos encoding here, that --if token A begins before token B, then srcpos A < srcpos B leftmost=tree[TREE_TRUNK][CMD_POS] subtree=tree while length(subtree[TREE_BRANCHES]) do subtree=subtree[TREE_BRANCHES][1] if subtree[TREE_TRUNK][CMD_POS]<leftmost then leftmost=subtree[TREE_TRUNK][CMD_POS] end if end while rightmost=tree[TREE_TRUNK][CMD_POS] subtree=tree while length(subtree[TREE_BRANCHES]) do subtree=subtree[TREE_BRANCHES][$] rightmost=subtree[TREE_TRUNK][CMD_POS] end while --grab those tokens from the script's token list left_at=find_in_column(leftmost,script_toks,CMD_POS) right_at=find_in_column(rightmost,script_toks,CMD_POS) tokens=script_toks[left_at..right_at] --pretty_print(stdout,{"tree_original_text tokens: ",tokens},{2}) --keep grabbing extra tokens until brackets are balanced --note: this will read 'foo(bar())' but will read 'foo()' as 'foo' depth=count("begin",column(tokens,CMD_TEXT))-count("end",column(tokens,CMD_TEXT)) if depth<0 then src_error("Compiler bug: negative depth in tree_original_text",tokens[1][CMD_POS]) end if while depth do right_at+=1 if equal(script_toks[right_at][CMD_TEXT],"begin") then depth+=1 else --only other possibility is "end" depth-=1 end if end while --grab original text corresponding to this range of tokens filenum=srcpos_file_number(leftmost) left_point=srcpos_file_offset(leftmost) --this +1 is safe, guaranteed to be followed by either another argument or "end" right_point=srcpos_file_offset(script_toks[right_at+1][CMD_POS]) text=file_list[filenum][FILE_TEXT][left_point..right_point-1] --pretty_print(stdout, {"tree_original_text left/right_point:", left_point, right_point, "text:", text, decode_srcpos(leftmost)},{2}) --NOTE: should be able to make use of srcpos lengths instead. (Again, how to handle virtuals?) --clean up the grabbed text text=trim_whitespace(substitute(text,'\n',',')) while find(text[$],", \t") do text=text[1..$-1] end while return(text) end function --------------------------------------------------------------------------- function seek_include(TokenList tokens) string ret if length(tokens) and equal(tokens[1][CMD_TEXT],"include") then if length(tokens)=1 or length(tokens[2][CMD_TEXT])=0 then src_error(COLYEL&"include"&COLRED&" is a keyword which should be followed on the same line by the filename to include",tokens[1][CMD_POS]) end if ret=tokens[2][CMD_TEXT] if ret[1]='"' then --trim quote marks ret=ret[2..$-1] end if return(ret) -- --Alternatively use original text (escape sequences won't work)... so might be useful on Wdinwos -- srcdata=decode_srcpos(s[at+1][CMD_POS]) -- return(srcdata[POS_LINETEXT][srcdata[POS_COLUMN]+1..srcdata[POS_COLUMN]+srcdata[POS_LENGTH]]) end if return("") end function --------------------------------------------------------------------------- procedure lex_file(integer file_num, sequence include_stack) TokenList broken string include_name sequence lines integer posoff -- object temp posoff=file_list[file_num][FILE_OFFSET] lines=file_list[file_num][FILE_LINE_EXTENTS] for i=1 to length(lines) do broken=lex_line(file_list[file_num][FILE_TEXT],lines[i][PAIR_START],lines[i][PAIR_END],posoff) -- for j=1 to length(broken) do -- temp = decode_srcpos(broken[j][CMD_POS]) -- pretty_print(stdout,{broken[j][CMD_TEXT], temp},{2}) -- end for include_name=seek_include(broken) if length(include_name) then load_and_lex_source(find_include_file(include_name),"including",broken[2][CMD_POS],include_stack) else --this is probably not the best place to do this translation, but for now we simply emulate old string translation --this also checks for stray strings and $'s cmd&=translate_plotstrings(broken) end if end for end procedure --------------------------------------------------------------------------- procedure load_and_lex_all() enter_timing_zone("Loading & lexing files") if find('b',optlist)=0 then load_and_lex_source(find_include_file("plotscr.hsd"),"reading",0,{}) end if --Load any additional includes first, because it is almost certainly --an exported .hsi of constants. Due to the two-pass parsing --the order of includes matters for almost nothing aside from defineconstant, --eg including the .hsi first allows "defineconstant(song:foobar, victory song)" for i=1 to length(additional_includes) do load_and_lex_source(additional_includes[i],"reading",0,{}) end for --Load the main file load_and_lex_source(source_file,"reading",0,{}) exit_timing_zone() end procedure --------------------------------------------------------------------------- --Warning: you normally want to use check_undefined_string instead of this --This does not check whether s is defined as a constant, subscript, or local variable, --and may miss scripts too. procedure check_for_reserved(string s, srcpos pos, string expect) string extra Symbol sym sequence src_position if map:has(global_scope,s) then extra="" sym=map:get(global_scope,s) if sym[SYM_POS] then src_position=decode_srcpos(sym[SYM_POS]) extra&=sprintf_utf( " (which was declared on line %d in "&COLPNK&"%s"&COLRED&")" ,{src_position[POS_LINE],src_position[POS_FILENAME]} ) end if if equal("top-level declaration",expect) then extra&=". Perhaps there is an extra "&COLYEL&"end"&COLRED&" or "&COLYEL&")"&COLRED&" earlier in the file" end if src_error( sprintf_utf( "Expected %s, but found %s "&COLYEL&"%s"&COLRED&"%s" ,{expect, KIND_LONGNAMES[sym[SYM_KIND]], s, extra} ) ,pos ) end if end procedure --------------------------------------------------------------------------- procedure mustnt_be_a_number(Token s) --note that identifiers are allowed to start with - if length(exclude(s[CMD_TEXT],"-0123456789"))=0 and count('-',s[CMD_TEXT])!=length(s[CMD_TEXT]) then --Work backwards to figured out if constant substitution has occurred on s[CMD_TEXT] sequence src_position sequence const_src_position string original_text string msg src_position=decode_srcpos(s[CMD_POS]) original_text=src_position[POS_TOKTEXT] object const_data const_data=map:get(constant_list,original_text,{}) if length(const_data) then const_src_position=decode_srcpos(const_data[CONST_POS]) msg=sprintf_utf("Expected a name, but found "&COLYEL&"%s"&COLRED&" which is defined as a constant on line %d of "&COLPNK&"%s"&COLRED&" with the value "&COLYEL&"%d"&COLRED, { original_text, const_src_position[POS_LINE], const_src_position[POS_FILENAME], const_data[CONST_VALUE] }) else msg="Expected a name, but found a number "&COLYEL &s[CMD_TEXT] &COLRED end if src_error(msg, s[CMD_POS]) end if end procedure --------------------------------------------------------------------------- procedure check_undefined_constant(Token s, atom value) sequence const_data sequence src_position string err_string string whats_happening if map:has(constant_list,s[CMD_TEXT]) then --constant is already defined const_data=map:get(constant_list,s[CMD_TEXT]) src_position=decode_srcpos(const_data[CONST_POS]) if const_data[CONST_VALUE]=value then whats_happening="will be ignored because" else whats_happening="is being redefined;" end if err_string=sprintf_utf("Constant "&COLYEL&"%s"&COLRED&" %s it is already defined in line %d of "&COLPNK&"%s"&COLRED&" with the value "&COLYEL&"%d"&COLRED ,{ s[CMD_TEXT] ,whats_happening ,src_position[POS_LINE] ,src_position[POS_FILENAME] ,const_data[CONST_VALUE] } ) if const_data[CONST_VALUE]=value then src_warn(err_string,s[CMD_POS]) else src_error(err_string,s[CMD_POS]) end if --if it's in constant_list, then it's also in global_scope. else check_for_reserved(s[CMD_TEXT],s[CMD_POS],"constant name") end if mustnt_be_a_number(s) end procedure --------------------------------------------------------------------------- --Check whether an identifier is unused and available. If cur_script_num is set, also --checks local variables and subscripts in the current scope, but not nonlocal vars. procedure check_undefined_string(Token s, string expect) sequence src_position integer at --Check global_scope check_for_reserved(s[CMD_TEXT],s[CMD_POS],expect) --Check constant_list and for number literals mustnt_be_a_number(s) --Search for an in-scope subscript of that name (subscripts aren't shadowed) --Also, when check_for_reserved is called from inside parse_script, script names haven't been added to global_scope yet --so non-subscripts are missed by check_for_reserved. at=lookup_scoped_script_name(s[CMD_TEXT]) if at then src_position=decode_srcpos(all_scripts[at][SCRIPT_POS]) src_error( sprintf_utf( COLYEL&"%s"&COLRED&" can't be used as a %s because it's already declared as a %s on line %d of "&COLPNK&"%s"&COLRED, {s[CMD_TEXT], expect, all_scripts[at][SCRIPT_TRIGGER_TOK][CMD_TEXT], src_position[POS_LINE], src_position[POS_FILENAME]} ) ,s[CMD_POS] ) end if if cur_script_num!=0 then --Search for a local with that name. Ignore non-locals because they are allowed to be shadowed. VarList local_vars --Might be in the middle of building this script and not have SCRIPT_VARIABLES yet if length(all_scripts[cur_script_num]) >= SCRIPT_VARIABLES then local_vars=all_scripts[cur_script_num][SCRIPT_VARIABLES] for idx=1 to length(local_vars) do if equal(local_vars[idx][CMD_TEXT],s[CMD_TEXT]) then if local_vars[idx][VAR_FRAME]=0 then --is local src_position=decode_srcpos(local_vars[idx][CMD_POS]) src_error( sprintf_utf( COLYEL&"%s"&COLRED&" can't be used as a %s because it's already defined as a local variable/argument on line %d of "&COLPNK&"%s"&COLRED, {s[CMD_TEXT],expect,src_position[POS_LINE],src_position[POS_FILENAME]} ) ,s[CMD_POS] ) end if end if end for end if end if end procedure --------------------------------------------------------------------------- function force_16_bit(atom n, srcpos pos) if n>32767 then src_warn(sprintf_utf("number "&COLYEL&"%d"&COLRED&" is out of range for a 16-bit signed integer, and will be truncated to "&COLYEL&"32767"&COLRED,{n}),pos) n=32767 elsif n<-32768 then src_warn(sprintf_utf("number "&COLYEL&"%d"&COLRED&" is out of range for a 16-bit signed integer, and will be truncated to "&COLYEL&"-32768"&COLRED,{n}),pos) n=-32768 end if return(n) end function --------------------------------------------------------------------------- function try_string_to_number(Token s) atom result result=floor(string_to_object(s[CMD_TEXT],0)) if not string_is_int32(s[CMD_TEXT]) then src_error(sprintf_utf("Expected number but found "&COLYEL&"%s"&COLRED,{s[CMD_TEXT]}),s[CMD_POS]) end if return(result) end function --------------------------------------------------------------------------- function enforce_constants(string s) --enforces both constants object v v=map:get(constant_list,s,{}) if length(v) then v=v[CONST_VALUE] if int32(v) then return(sprintf("%d",{v})) end if end if return(s) end function --------------------------------------------------------------------------- function get_cmd() Token result if get_cmd_pointer>length(cmd) then src_error("Unexpected end of file",cmd[length(cmd)][CMD_POS]) end if result=cmd[get_cmd_pointer] get_cmd_pointer+=1 result[CMD_TEXT]=enforce_constants(result[CMD_TEXT]) return(result) end function --------------------------------------------------------------------------- function get_cmd_no_constants() Token result if get_cmd_pointer>length(cmd) then src_error("Unexpected end of file",cmd[length(cmd)][CMD_POS]) end if result=cmd[get_cmd_pointer] get_cmd_pointer+=1 result[CMD_TEXT]=result[CMD_TEXT] return(result) end function --------------------------------------------------------------------------- --Returns a list of tokens enclosed in begin,end, with no nested begin,end allowed. function get_cmd_block(integer convert_constants) Token this TokenList result result={} this=get_cmd() if compare("begin",this[CMD_TEXT])!=0 then src_error(sprintf_utf("Expected "&COLYEL&"begin"&COLRED&" or "&COLYEL&"("&COLRED&" bracket but found "&COLYEL&"%s"&COLRED,{this[CMD_TEXT]}),this[CMD_POS]) end if while true do if convert_constants then this=get_cmd() else this=get_cmd_no_constants() end if if compare("begin",this[CMD_TEXT])=0 then src_error("Recursive "&COLYEL&"begin"&COLRED&" and "&COLYEL&"("&COLRED&" brackets are not permitted in this block",this[CMD_POS]) elsif compare("end",this[CMD_TEXT])=0 then exit--break out of the while else result=append(result,this) end if end while return(result) end function --------------------------------------------------------------------------- --Get a block containing balanced begin,end pairs. --Returns {ptr,TokenList} --Initial ptr is 1 if we already grabbed the opening 'begin'. function get_cmd_depth(integer ptr, TokenList data, integer depth) TokenList result Token this result={} while true do --if get_key()=27 then abort(1/0) end if if ptr>length(data) then src_error("block ended prematurely. Missing "&COLYEL&"end"&COLRED&" or "&COLYEL&")"&COLRED&"?",data[length(data)][CMD_POS]) end if this=data[ptr] ptr+=1 if compare("end",this[CMD_TEXT])=0 then depth-=1 elsif compare("begin",this[CMD_TEXT])=0 then depth+=1 end if if depth=0 then exit --break out of the while else result=append(result,this) end if end while return({ptr,result}) end function --------------------------------------------------------------------------- --Like get_cmd_block, but allowed to contain balanced begin,end pairs. function get_cmd_block_nestable(integer convert_constants) sequence ptr_and_block TokenList block ptr_and_block=get_cmd_depth(get_cmd_pointer,cmd,0) get_cmd_pointer=ptr_and_block[1] block=ptr_and_block[2][2..$] --trim leading begin if convert_constants then for i=1 to length(block) do block[i][CMD_TEXT]=enforce_constants(block[i][CMD_TEXT]) end for end if return block end function --------------------------------------------------------------------------- procedure parse_constant_block(TokenList block) atom num string name srcpos pos for i=1 to length(block) by 2 do if i+1>length(block) then src_error("Expected name to follow but defineconstant block ended",block[i][CMD_POS]) end if num=try_string_to_number({enforce_constants(block[i][CMD_TEXT]),block[i][CMD_POS]}) check_undefined_constant(block[i+1],num) name=block[i+1][CMD_TEXT] pos=block[i+1][CMD_POS] map:put(constant_list,name,{num,pos}) map:put(global_scope,name,{KIND_CONSTANT,num,pos}) end for end procedure --------------------------------------------------------------------------- procedure parse_trigger_block(TokenList block) integer num string name srcpos pos for i=1 to length(block) by 2 do if i+1>length(block) then src_error("Expected name but script trigger definition block ended",block[i][CMD_POS]) end if num=force_16_bit(try_string_to_number(block[i]),block[i][CMD_POS]) check_undefined_string(block[i+1],"script trigger name") name=block[i+1][CMD_TEXT] pos=block[i+1][CMD_POS] trigger_list=append(trigger_list,{num,name}) map:put(global_scope,name,{KIND_TOPLEVEL,0,pos}) end for end procedure --------------------------------------------------------------------------- procedure parse_version_block(TokenList block) string req_version if length(block)<2 then --Don't have a srcpos simple_error(COLYEL&"plotscr.hsd"&COLRED&" is malformed (empty 'plotscr version' block)") end if plotscr_version=block[1][CMD_TEXT] --version ID of this plotscr.hsd version (equal to the version of HSpeak at the time) req_version=block[2][CMD_TEXT] --the minimum version of HSpeak which supports this plotscr.hsd --trim quote marks if plotscr_version[1]!='"' or req_version[1]!='"' then simple_error(COLYEL&"plotscr.hsd"&COLRED&" is malformed (expected strings in 'plotscr version')") end if plotscr_version=plotscr_version[2..$-1] req_version=req_version[2..$-1] if compare(plotscr_version, MIN_PLOTSCR_VERSION) < 0 then simple_error("You have included a copy of "&COLYEL&"plotscr.hsd"&COLRED&" (version "&trim_whitespace(plotscr_version)&") from an old release of the OHRRPGCE. Please use the copies of plotscr.hsd and HSpeak provided with the OHRRPGCE.") end if --printf(stdout,"'%s' '%s'\n", {req_version, COMPILER_VERSION&COMPILER_SUB_VERSION}) if compare(req_version, COMPILER_VERSION&COMPILER_SUB_VERSION) > 0 then simple_error("You have included a copy of "&COLYEL&"plotscr.hsd"&COLRED&" (version "&trim_whitespace(plotscr_version)&") from a newer version of the OHRRPGCE than supported by this version of HSpeak. Please use the copies of plotscr.hsd and HSpeak provided with the OHRRPGCE.") end if end procedure --------------------------------------------------------------------------- procedure create_global(atom id, string name, srcpos pos) integer at at=find(id,global_list[PAIR_NUM]) if at then src_error(sprintf_utf("global variable ID "&COLYEL&"%d"&COLRED&" is already defined as "&COLYEL&"%s"&COLRED,{id,global_list[PAIR_NAME][at]}),pos) else if id>=0 and id<=MAXGLOBAL then global_list[PAIR_NUM]=append(global_list[PAIR_NUM],id) global_list[PAIR_NAME]=append(global_list[PAIR_NAME],name) global_list[GLB_POS]=append(global_list[GLB_POS],pos) map:put(global_scope,name,{KIND_GLOBAL,id,pos}) else src_error(sprintf_utf("global variable ID "&COLYEL&"%d"&COLRED&" is not permitted. Valid IDs are 0 to %d",{id,MAXGLOBAL}),pos) end if end if end procedure --------------------------------------------------------------------------- procedure parse_global_block(TokenList block) atom num for i=1 to length(block) by 2 do if i+1>length(block) then src_error("expected name but globalvariable block ended",block[i][CMD_POS]) end if num=try_string_to_number({enforce_constants(block[i][CMD_TEXT]),block[i][CMD_POS]}) check_undefined_string(block[i+1],"global variable name") create_global(num,block[i+1][CMD_TEXT],block[i][CMD_POS]) end for end procedure --------------------------------------------------------------------------- procedure parse_operator_block(TokenList block) atom num Symbol sym Kind kind string name,truename srcpos pos for i=1 to length(block) by 3 do if i+2>length(block) then src_error("expected name but defineoperator block ended",block[i][CMD_POS]) end if num=try_string_to_number(block[i]) mustnt_be_a_number(block[i+1]) mustnt_be_a_number(block[i+2]) name=block[i+1][CMD_TEXT] truename=block[i+2][CMD_TEXT] pos=block[i+2][CMD_POS] operator_list=append(operator_list,{num,name,truename,pos}) sym=map:get(global_scope,name,nil_sym) kind=sym[SYM_KIND] if kind then if kind!=KIND_SEPARATOR and kind!=KIND_MATH then src_error(sprintf_utf("%s "&COLYEL&"%s"&COLRED&" may not be used as an operator",{KIND_LONGNAMES[kind],name}),pos) else --Existing names do not get added to global_scope, because we need to be able to use --global_scope to map names like 'mod' to the math function rather than the operator. --Instead, operator_list is consulted directly to lookup operators. end if else --Add to global_scope to prevent it from being reused for anything else. map:put(global_scope,name,{KIND_OPERATOR,length(operator_list),pos}) end if end for end procedure --------------------------------------------------------------------------- --Decide which ID number to assign to an autonumbered script. --name isn't fully-scoped. function pick_script_id(integer parent_id, string name) integer at, id sequence encoded if length(reuse_ids) then encoded=encode_ohr(name) if length(encoded)>36 then encoded=encoded[1..36] end if at=find_in_column({parent_id,encoded},reuse_ids,REUSE_PARENT_AND_NAME) if at then return reuse_ids[at][PAIR_NUM] end if end if loop do autonumber_id-=1 id=autonumber_id until find_in_column(id,reuse_ids,PAIR_NUM)=0 end loop return id end function --------------------------------------------------------------------------- --Compute bits to add to FUNC_FLAGS from a FUNC_ATTRS sequence function compute_attribute_bits(sequence attributes) integer at integer attr_bits attr_bits=0 for i=1 to length(attributes) do at=find_in_column(attributes[i][CMD_TEXT],attribute_list,ATTR_NAME) if at=0 then simple_error("Compiler bug! compute_attribute_bits can't handle "&attributes[i][CMD_TEXT]) end if attr_bits=or_bits(attr_bits,attribute_list[at][ATTR_FLAGS]) end for return attr_bits end function --------------------------------------------------------------------------- --arglist is either a sequence of default values (which may equal to the NO_DEFAULT constant), or it's equal to --VAR_ARGS indicating a variable (unlimited) number of arguments. --id may only be -1 (autonumbered) when there is no parent script (so that the unscoped --and fully-scoped name are the same), otherwise pick_script_id must be called explicitly! function create_function(sequence list, integer id, string name, object arglist, integer func_type, srcpos pos, sequence attributes) integer at, attr_bits if func_type=KIND_SCRIPT then at=find(id,column(list,PAIR_NUM)) if at then src_error(sprintf_utf("%s ID "&COLYEL&"%d"&COLRED&" is already defined as "&COLYEL&"%s"&COLRED,{KIND_LONGNAMES[func_type],id,list[at][PAIR_NAME]}),pos) end if if id=0 or id<-1 then src_error(sprintf_utf("ID "&COLYEL&"%d"&COLRED&" is not valid",{id}),pos) elsif id=-1 then --autonumber id=pick_script_id(0,name) end if end if attr_bits=compute_attribute_bits(attributes) list=append(list,{id,name,arglist,attributes,attr_bits,pos}) map:put(global_scope,name,{func_type,id,pos}) return(list) end function --------------------------------------------------------------------------- --Read zero or more attributes (token prefixed with @ to set them apart, optionally with parameters) --from current position 'ptr' in 'block'. --We assume that this is never called on a token that might be a reference instead of an attribute --(eg by first checking for KIND_ATTRIBUTE) function parse_attributes(TokenList block, integer ptr) sequence attributes sequence attr_args, ptr_and_block string attr_name integer at attributes={} while ptr<=length(block) and block[ptr][CMD_TEXT][1]='@' do attr_name=block[ptr][CMD_TEXT] at=find_in_column(attr_name,attribute_list,ATTR_NAME) if at=0 then src_error("Unrecognised function attribute "&COLYEL&attr_name&COLRED, block[ptr][CMD_POS]) end if if ptr+1<length(block) and equal(block[ptr+1][CMD_TEXT],"begin") then --FIXME: should probably enforce constants in the params, because when called from parse_top_level, --that hasn't been done yet. ptr_and_block=get_cmd_depth(ptr+2,block,1) ptr=ptr_and_block[1] attr_args=ptr_and_block[2] else ptr+=1 attr_args={} end if attributes&={{attr_name,attr_args}} end while return {attributes,ptr} end function --------------------------------------------------------------------------- function parse_define_block(TokenList block, sequence list, integer func_type) atom num string name atom args object arglist --sequence of strings, or VAR_ARGS sequence attributes sequence attrs_and_ptr srcpos name_pos integer i i=1 while i<=length(block) do --Parse any attributes attrs_and_ptr=parse_attributes(block,i) attributes=attrs_and_ptr[1] i=attrs_and_ptr[2] if length(attributes)>0 and i>length(block) then src_error(sprintf_utf("expected %s id to follow, but define block ended",{KIND_LONGNAMES[func_type]}),block[i-1][CMD_POS]) end if --ID num=force_16_bit(try_string_to_number(block[i]),block[i][CMD_POS]) if i+1>length(block) then src_error(sprintf_utf("expected %s name but define block ended",{KIND_LONGNAMES[func_type]}),block[i][CMD_POS]) end if --Name i+=1 check_undefined_string(block[i],KIND_LONGNAMES[func_type]&" name") name=block[i][CMD_TEXT] name_pos=block[i][CMD_POS] if i+1>length(block) then src_error("expected argument count but define block ended",block[i][CMD_POS]) end if --Get arglist i+=1 args=try_string_to_number(block[i]) arglist={} if args<0 then if func_type=KIND_SCRIPT then src_error("Number of arguments for a script can't be negative (variable number of arguments aren't supported)",block[i][CMD_POS]) end if arglist=VAR_ARGS else for j=1 to args do if i+1>length(block) then src_error("expected argument default but define block ended",block[i][CMD_POS]) end if i+=1 arglist=append(arglist,try_string_to_number(block[i])) end for if args>MAX_ARGS then src_error(sprintf_utf("Number of arguments (%d) is more than allowed (%d)",{args,MAX_ARGS}),block[i][CMD_POS]) end if end if i+=1 list=create_function(list,num,name,arglist,func_type,name_pos,attributes) end while return(list) end function --------------------------------------------------------------------------- procedure preliminary_parse_pass() Token this enter_timing_zone("Preliminary pass") color_print("preliminary pass\n",{}) get_cmd_pointer=1 while get_cmd_pointer<=length(cmd) do --read a top-level command this=get_cmd() if equal("defineconstant",this[CMD_TEXT]) then parse_constant_block(get_cmd_block(false)) elsif equal("definetrigger",this[CMD_TEXT]) then parse_trigger_block(get_cmd_block(false)) elsif equal("plotscrversion",this[CMD_TEXT]) then parse_version_block(get_cmd_block(false)) end if end while --Check for very old plotscr.hsd if equal(plotscr_version,"") then if find("plotscr.hsd",column(file_list,FILE_NAME)) then simple_error("You have included a copy of "&COLYEL&"plotscr.hsd"&COLRED&" from an old release of the OHRRPGCE. Please use the copies of plotscr.hsd and HSpeak provided with the OHRRPGCE..") end if --If plotscr.hsd is not included then we're compiling with -b, so don't complain end if exit_timing_zone() end procedure --------------------------------------------------------------------------- --Parse a script or subscript, finding the list of tokens (NOT yet transformed into a tree), --and appending an entry to all_scripts. --parent_name is a fully-scoped name of parent script, or "" --if this script isn't a subscript of another. --Returns Token with name of parsed script function parse_script(Token trigger, string parent_name, integer parent_idx, sequence attributes) Token name string full_name TokenList arglist TokenList body Token this string err_string string previous_script sequence src_position string msg integer depth integer nest_depth Symbol sym integer temp integer idx sequence subscript_attributes sequence attrs_and_ptr name=get_cmd() if parent_idx=0 then full_name=name[CMD_TEXT] nest_depth=0 else full_name=sprintf_utf("%s<-%s",{parent_name,name[CMD_TEXT]}) nest_depth=all_scripts[parent_idx][SCRIPT_NEST_DEPTH]+1 if nest_depth>MAX_NEST_DEPTH then src_error( sprintf_utf("%s "&COLYEL&"%s"&COLRED&" is nested too deeply (%d layers deep), which is more than is supported. Reorganise your scripts to not use so much nesting." ,{trigger[CMD_TEXT],name[CMD_TEXT],nest_depth}) ,name[CMD_POS] ) end if end if --Check name not already in use... unless it appears in a definescript block; if it did the name is already reserved. --Note that we won't add it to script_list or global_scope yet; that happens later in check_script_declarations if not find_in_column(full_name,script_list,PAIR_NAME) then cur_script_num=parent_idx --temporarily to check for subscripts check_undefined_string(name, COLYEL &trigger[CMD_TEXT] &COLRED &" name") cur_script_num=0 --This probably isn't needed end if previous_script=current_script current_script=full_name --name[CMD_TEXT] --Parse arguments arglist={} while true do if get_cmd_pointer>length(cmd) then src_error(sprintf_utf("%s "&COLYEL&"%s"&COLRED&" is missing "&COLYEL&"begin"&COLRED&" or "&COLYEL&"("&COLRED,{trigger[CMD_TEXT],name[CMD_TEXT]}),name[CMD_POS]) end if this=get_cmd() if compare("begin",this[CMD_TEXT])=0 then exit--break the while end if --Note that arglist isn't the actual list of arguments, it's just a string of tokens right now (including defaults). --The actual arglist is built in process_arglist, while compiling each script. arglist=append(arglist,this) end while --Create all_scripts entry now for recursive calls; SCRIPT_BODY_TOKS and SCRIPT_END_POS filled in later, --and more fields are appended in compile_a_script all_scripts=append(all_scripts,{ trigger ,name[CMD_TEXT] ,name[CMD_POS] ,full_name ,arglist ,{} --SCRIPT_BODY_TOKS ,parent_idx ,nest_depth ,attributes ,0 --SCRIPT_END_POS }) idx=length(all_scripts) --Read the body --Every script is nested inside a big fat do() block body={{"do",virtual_pos(name[CMD_POS])}} depth=0 subscript_attributes={} while true do --If we just read some attribute statements, they must precede a subscript if length(subscript_attributes)>0 and equal("subscript",this[CMD_TEXT])=0 then src_error( sprintf_utf( "Expected a subscript to follow function attribute "&COLYEL&"%s"&COLRED&", but found "&COLYEL&"%s"&COLRED ,{subscript_attributes[$][CMD_TEXT],this[CMD_TEXT]} ) , this[CMD_POS]) end if body=append(body,this) if compare("end",this[CMD_TEXT])=0 then depth-=1 if depth=0 then all_scripts[idx][SCRIPT_END_POS]=this[CMD_POS] exit--break while end if elsif compare("begin",this[CMD_TEXT])=0 then depth+=1 elsif compare("subscript",this[CMD_TEXT])=0 then --Recurse, so body of subscript doesn't appear in this script. --The subscript node stays, for syntax checking, but is removed in normalize_flow_control parse_script(this,full_name,idx,subscript_attributes) subscript_attributes={} body&={{"begin",this[CMD_POS]},{"end",this[CMD_POS]}} else sym=map:get(global_scope,this[CMD_TEXT],nil_sym) temp=sym[SYM_KIND] if temp=KIND_ATTRIBUTE then --Read all subscript attributes, and don't put them in this script's body body=body[1..$-1] attrs_and_ptr=parse_attributes(cmd,get_cmd_pointer-1) subscript_attributes=attrs_and_ptr[1] get_cmd_pointer=attrs_and_ptr[2] elsif temp>KIND_LAST_VALID_IN_SCRIPT then err_string=sprintf_utf( "%s "&COLYEL&"%s"&COLRED&" is not permitted inside a script.", {KIND_LONGNAMES[temp], this[CMD_TEXT]} ) if temp=KIND_TOPLEVEL then err_string&=sprintf_utf(" Perhaps "&COLYEL&"%s"&COLRED&" has an extra "&COLYEL&"begin"&COLRED&" or "&COLYEL&"("&COLRED&" or is missing a "&COLYEL&")"&COLRED&" or "&COLYEL&"end"&COLRED&".",{name[CMD_TEXT]}) end if src_error(err_string,this[CMD_POS]) end if end if if get_cmd_pointer>length(cmd) then src_error( sprintf_utf( "%s "&COLYEL&"%s"&COLRED&" is missing "&COLYEL&"end"&COLRED&" or "&COLYEL&")"&COLRED ,{trigger[CMD_TEXT],name[CMD_TEXT]} ) ,name[CMD_POS] ) end if this=get_cmd() end while all_scripts[idx][SCRIPT_BODY_TOKS]=body current_script=previous_script return name end function --------------------------------------------------------------------------- procedure parse_top_level() Token this sequence triggers, attributes sequence attrs_and_ptr integer triggeridx enter_timing_zone("Top level pass") color_print("parsing top-level\n",{}) triggers=column(trigger_list,PAIR_NAME) attributes={} get_cmd_pointer=1 while get_cmd_pointer<=length(cmd) do --read a top-level command this=get_cmd() triggeridx=find(this[CMD_TEXT],triggers) --If we just read some attribute statements, they must precede a script if length(attributes)>0 and triggeridx=0 then src_error( sprintf_utf( "Expected a script to follow function attribute "&COLYEL&"%s"&COLRED&", but found "&COLYEL&"%s"&COLRED ,{attributes[$][CMD_TEXT],this[CMD_TEXT]} ) , this[CMD_POS]) end if if compare("defineconstant",this[CMD_TEXT])=0 then get_cmd_block(true) elsif compare("definetrigger",this[CMD_TEXT])=0 then get_cmd_block(true) elsif compare("globalvariable",this[CMD_TEXT])=0 then parse_global_block(get_cmd_block(true)) elsif compare("defineoperator",this[CMD_TEXT])=0 then parse_operator_block(get_cmd_block(true)) elsif compare("definefunction",this[CMD_TEXT])=0 then function_list=parse_define_block(get_cmd_block_nestable(true),function_list,KIND_FUNCTION) elsif compare("definescript",this[CMD_TEXT])=0 then script_list=parse_define_block(get_cmd_block(true),script_list,KIND_SCRIPT) elsif compare("plotscrversion",this[CMD_TEXT])=0 then --ignore get_cmd_block(false) elsif triggeridx>0 then --"script"/etc. Negative trigger IDs are subscripts if trigger_list[triggeridx][PAIR_NUM]<0 then src_error( sprintf_utf( COLYEL&"%s"&COLRED&" can only occur inside another script. Use "&COLYEL&"script"&COLRED&" instead for regular scripts." ,{this[CMD_TEXT]} ) ,this[CMD_POS] ) else parse_script(this,{},0,attributes) attributes={} end if elsif this[CMD_TEXT][1]='@' then --This will keep reading all consecutive attributes until there are no more attrs_and_ptr=parse_attributes(cmd,get_cmd_pointer-1) attributes=attrs_and_ptr[1] get_cmd_pointer=attrs_and_ptr[2] else check_for_reserved(this[CMD_TEXT],this[CMD_POS],"top-level declaration") src_error( sprintf_utf( "Expected top-level declaration but found "&COLYEL&"%s"&COLRED ,{this[CMD_TEXT]} ) ,this[CMD_POS] ) end if end while if length(attributes)>0 then --If we just read some attribute statements, they must precede a script src_error("Expected a script to follow function attribute, but file ended", this[CMD_POS]) end if cmd={} --Free memory exit_timing_zone() end procedure --------------------------------------------------------------------------- procedure dump_script_and_function_info(integer fh, sequence list) sequence this string id_string sequence src_position for i=1 to length(list) do this=list[i] if this[PAIR_NUM]>autonumber_id then id_string=sprintf("AUTONUMBER=%d",{this[PAIR_NUM]}) else id_string=sprintf("ID=%d",{this[PAIR_NUM]}) end if src_position=decode_srcpos(this[FUNC_POS]) print_utf(fh,"%s %d\t%s\t%s(",{ src_position[POS_FILENAME] ,src_position[POS_LINE] ,id_string ,this[PAIR_NAME] }) if equal(this[FUNC_ARGS],VAR_ARGS) then print_utf(fh,"VARIABLE ARGS",{}) else for j=1 to length(this[FUNC_ARGS]) do if j>1 then print_utf(fh,",",{}) end if if equal(this[FUNC_ARGS][j],NO_DEFAULT) then print_utf(fh,"REQ",{}) else print_utf(fh,"%d",{this[FUNC_ARGS][j]}) end if end for end if print_utf(fh,")\n",{}) end for end procedure --------------------------------------------------------------------------- function seek_string_by_id(integer id, sequence list, string name) integer at at=find_in_column(id,list,PAIR_NUM) if at then return(list[at][PAIR_NAME]) else simple_error(sprintf_utf("decompiler couldnt find %s ID "&COLYEL&"%d"&COLRED,{name,id})) end if return("") end function --------------------------------------------------------------------------- --Used while dumping script binary function name_lookup(sequence pair, VarList locals) integer at integer frame,id if pair[1]=KIND_NUMBER then return(sprintf("%d",{pair[2]})) elsif pair[1]=KIND_LOCAL or pair[1]=KIND_NONLOCAL then frame=and_bits(pair[2],#FF00)/#100 id=and_bits(pair[2],#FF) for i=1 to length(locals) do if locals[i][VAR_FRAME]=frame and locals[i][VAR_ID]=id then return locals[i][CMD_TEXT] end if end for simple_error(sprintf_utf("HSpeak bug: decompiler found illegal variable ID "&COLYEL&"%d*256+%d"&COLRED,{frame,id})) elsif pair[1]=KIND_GLOBAL then at=find(pair[2],global_list[PAIR_NUM]) if at then return(global_list[PAIR_NAME][at]) else simple_error(sprintf_utf("decompiler couldnt find global variable ID "&COLYEL&"%d"&COLRED,{pair[2]})) end if elsif pair[1]=KIND_FLOW then return(seek_string_by_id(pair[2],flow_list,"flow control structure")) elsif pair[1]=KIND_SCRIPT then return(seek_string_by_id(pair[2],script_list,"user script")) elsif pair[1]=KIND_FUNCTION then return(seek_string_by_id(pair[2],function_list,"hardcoded function")) elsif pair[1]=KIND_MATH then return(seek_string_by_id(pair[2],math_list,"built-in function")) else simple_error(sprintf_utf("HSpeak bug: decompiler found illegal kind "&COLYEL&"%d"&COLRED,{pair[1]})) end if end function --------------------------------------------------------------------------- function binstring_to_int(sequence encoded) --bytes_to_int is NOT the opposite of int_to_bytes, it can't handle negative numbers, which int_to_bytes mangles atom temp temp=and_bits(encoded[1],#FF)+and_bits(encoded[2],#FF)*#100+and_bits(encoded[3],#FF)*#10000+and_bits(encoded[4],#FF)*#1000000 if and_bits(temp,#80000000) then return(temp-#100000000) end if return(temp) end function --------------------------------------------------------------------------- function read_word(sequence encoded) --opposite of output_word. encoded should be length 2 or more integer temp temp=and_bits(encoded[1],#FF)+and_bits(encoded[2],#FF)*#100 if and_bits(temp,#8000) then return(temp-#10000) end if return(temp) end function --------------------------------------------------------------------------- --Convenience wrapper around read_word, to read an int16 with offset given --in int16 words and starting from 0 instead of 1. function read_int16(sequence encoded, integer offset) return read_word(encoded[1+2*offset..2+2*offset]) end function --------------------------------------------------------------------------- function dump_script_binary(sequence bin, integer offset, integer depth, VarList locals) string result sequence kind_and_id integer kind integer argcount integer new_offset result="" kind_and_id={binstring_to_int(bin[offset*4+1..offset*4+4]),binstring_to_int(bin[offset*4+5..offset*4+8])} kind=kind_and_id[1] result&=sprintf_utf("%s%s",{ repeat(' ',depth) --indent ,name_lookup(kind_and_id,locals) }) if kind=KIND_FLOW or kind=KIND_SCRIPT or kind=KIND_FUNCTION or kind=KIND_MATH then argcount=binstring_to_int(bin[1+offset*4+8..1+offset*4+11]) if argcount then result&="(\n" for i=0 to argcount-1 do new_offset=binstring_to_int(bin[1+(offset+3+i)*4..1+(offset+3+i)*4+3]) result&=dump_script_binary(bin,new_offset,depth+2,locals) end for result&=repeat(' ',depth)&")\n" else result&="()\n" end if else result&="\n" end if return(result) end function --------------------------------------------------------------------------- function dump_script_tree(NodeList tree, integer depth) string result result="" for i=1 to length(tree) do result&=sprintf_utf("%s%s",{ repeat(' ',depth)--indent ,tree[i][TREE_TRUNK][CMD_TEXT] }) if length(tree[i][TREE_BRANCHES])>0 then result&="(\n" result&=dump_script_tree(tree[i][TREE_BRANCHES],depth+2) result&=repeat(' ',depth)&")\n" else result&="\n" end if end for return(result) end function --------------------------------------------------------------------------- function dump_script_strings(sequence bin) string result integer table_start integer offset integer len result="" table_start=binstring_to_int(bin[9..12]) if table_start=0 then return("") end if bin=bin[table_start+1..$] offset=1 while offset<length(bin) do len=binstring_to_int(bin[offset..offset+3]) result&=sprintf("string offset %-4d: \"",{offset/4}) result&=escape_string(bin[offset+4..offset+4+len-1]) & "\"\n" offset+=4+len if remainder(len,4) then offset+=4-remainder(len,4) end if end while return(result) end function --------------------------------------------------------------------------- procedure dump_debug_report() integer fh string debug_file sequence src_position Script script integer idx VarList locals --only do this if the -d debug option was on the command line if find('d',optlist) then enter_timing_zone("Writing hs_debug.txt") if length(path_only(dest_file))>1 then debug_file=normalize_filename(path_only(dest_file)&"hs_debug.txt") else debug_file="hs_debug.txt" end if fh=open(debug_file,"w") if fh!=failure then wrap_print("writing debug report file "&COLBWHI&"%s"&COLWHI&"\n",{debug_file}) ------------------------------------- print_utf(fh,"[Scripts]\n",{}) dump_script_and_function_info(fh,script_list) print_utf(fh,"\n",{}) ------------------------------------- print_utf(fh,"[Global Variables]\n",{}) for i=1 to length(global_list[PAIR_NUM]) do src_position=decode_srcpos(global_list[GLB_POS][i]) print_utf(fh,"%s %d\tID=%d\t%s\n",{ src_position[POS_FILENAME] ,src_position[POS_LINE] ,global_list[PAIR_NUM][i] ,global_list[PAIR_NAME][i] }) end for print_utf(fh,"\n",{}) ------------------------------------- print_utf(fh,"[Builtin Functions]\n",{}) dump_script_and_function_info(fh,function_list) print_utf(fh,"\n",{}) ------------------------------------- print_utf(fh,"[Operators]\n",{}) for i=1 to length(operator_list) do src_position=decode_srcpos(operator_list[i][OPER_POS]) print_utf(fh,"%s %d\t%s\t%s\tPriority=%d\n",{ src_position[POS_FILENAME] ,src_position[POS_LINE] ,operator_list[i][PAIR_NAME] ,operator_list[i][OPER_TRUENAME] ,operator_list[i][PAIR_NUM] }) end for print_utf(fh,"\n",{}) ------------------------------------- print_utf(fh,"[Script Dumps]\n",{}) for i=1 to length(all_scripts) do script=all_scripts[i] src_position=decode_srcpos(script[SCRIPT_POS]) print_utf(fh,"%s %d\tID=%d\t%s (%s)\n",{ src_position[POS_FILENAME] ,src_position[POS_LINE] ,script[SCRIPT_ID] ,script[SCRIPT_NAME] ,script[SCRIPT_FULL_NAME] }) idx=script[SCRIPT_PARENT_IDX] if idx then src_position=decode_srcpos(script[SCRIPT_POS]) print_utf(fh,"%s %d\tparent=ID %d\t%s (access to %d nonlocals)\n",{ src_position[POS_FILENAME] ,src_position[POS_LINE] ,all_scripts[idx][SCRIPT_ID] ,all_scripts[idx][SCRIPT_NAME] ,script[SCRIPT_NUM_NONLOCALS] }) end if src_position=decode_srcpos(script[SCRIPT_TRIGGER_TOK][CMD_POS]) print_utf(fh,"%s %d\tTrigger=%d\t%s\n",{ src_position[POS_FILENAME] ,src_position[POS_LINE] ,script[SCRIPT_TRIGGER_ID] ,script[SCRIPT_TRIGGER_TOK][CMD_TEXT] }) locals=script[SCRIPT_VARIABLES] for j=1 to length(locals) do src_position=decode_srcpos(locals[j][CMD_POS]) print_utf(fh,"%s %d\tvar(frame %d id %d)=%s\n",{ src_position[POS_FILENAME] ,src_position[POS_LINE] ,locals[j][VAR_FRAME] ,locals[j][VAR_ID] ,locals[j][CMD_TEXT] }) end for print_utf(fh,"%d bytes compiled\n",{length(script[SCRIPT_BINARY])}) -- print_utf(fh,"%s\n\n",{dump_script_tree(script[SCRIPT_AST],0)}) print_utf(fh,"%s",{dump_script_binary(script[SCRIPT_BINARY][CODE_START_BYTE_OFFSET+1..$],0,0,script[SCRIPT_VARIABLES])}) print_utf(fh,"%s\n\n",{dump_script_strings(script[SCRIPT_BINARY])}) end for print_utf(fh,"\n",{}) ------------------------------------- close(fh) else wrap_print("Error opening debug report file "&COLBWHI&"%s"&COLRED&"\n",{debug_file}) end if exit_timing_zone() end if end procedure --------------------------------------------------------------------------- --Look up the name of a [sub]script in all current scopes, returning all_scripts --index if found, or 0 if not function lookup_scoped_script_name(string name) string s integer at integer idx idx=cur_script_num while true do if idx then s=all_scripts[idx][SCRIPT_FULL_NAME]&"<-"&name else s=name end if at=find_in_column(s,all_scripts,SCRIPT_FULL_NAME) if at then return at end if if idx=0 then return 0 end if idx=all_scripts[idx][SCRIPT_PARENT_IDX] end while end function --------------------------------------------------------------------------- --identify the kind and id of a text command. Does not support untranslated operators or floaty parethesis function what_kind_and_id(Token command, VarList local_vars, integer look_for_operators) integer kind integer at atom id Symbol sym string s s=command[CMD_TEXT] if length(s)=0 then kind=KIND_PARENS id=0 elsif (s[1] >= '0' and s[1] <= '9') or s[1] = '-' then kind=KIND_NUMBER id=string_to_object(s,{}) elsif look_for_operators and find_in_column(s,operator_list,PAIR_NAME) then kind=KIND_OPERATOR --this MUST go before KIND_MATH, because some operators and math functions have the same name id=0 else sym=map:get(global_scope,s,nil_sym) if sym[SYM_KIND] then kind=sym[SYM_KIND] id=sym[SYM_ID] elsif s[1] = '@' then kind=KIND_REFERENCE id=0 -- ID always resolves to 0 for references here, since it is too early to know all -- script IDs (really? Surely false). The real work is done in binary_compile_recurse elsif s[1] = '"' then src_error( sprintf_utf( "The string "&COLYEL&"%s"&COLRED&" is illegal here: strings may only be used as part of a $...=\"...\" or $...+\"...\" construct", {shorten_string(s,15)}), command[CMD_POS] ) else --Local variables can't shadow any names in global scope, they only shadow --nonlocal variables. So cen check this last at=find_in_column(s,local_vars,CMD_TEXT) if at then if local_vars[at][VAR_FRAME]=0 then kind=KIND_LOCAL else kind=KIND_NONLOCAL end if id=local_vars[at][VAR_ID] + 256*local_vars[at][VAR_FRAME] else --It may be a subscript, in which case the scoped name needs to be looked up at=lookup_scoped_script_name(s) if at then kind=KIND_SCRIPT id=all_scripts[at][SCRIPT_ID] else src_error(sprintf_utf("Unrecognised name "&COLYEL&"%s"&COLRED&". It has not been defined as script, constant, variable, or anything else",{s}),command[CMD_POS]) end if end if end if end if return({kind,id}) end function --------------------------------------------------------------------------- --Check whether a math operator is not an assignment, a compare operator, or 'random' function is_simple_math_op(integer id) return (id >= 1 and id <= 9) or id >= 19 end function --------------------------------------------------------------------------- --Given name (CMD_TEXT) of a Node, return whether it is a block: all its arguments are --statements as opposed to expressions (if, for, while, switch, which take a mix need special handling) function block_command(string name) return compare("do",name)=0 or compare("then",name)=0 or compare("else",name)=0 end function --------------------------------------------------------------------------- procedure check_is_expression(Node node, string parent) Symbol sym Kind kind --Local variables are always expressions sym=map:get(global_scope,node[TREE_TRUNK][CMD_TEXT],nil_sym) kind=sym[SYM_KIND] --This function is always called after convert_macros, so we don't need to --worry about whether a particular macro is an expression if kind=KIND_FLOW or kind=KIND_KEYWORD then src_error( sprintf_utf(COLYEL&"%s"&COLRED&" may not be used as part of an expression (found as an argument to "&COLYEL&"%s"&COLRED&")" ,{node[TREE_TRUNK][CMD_TEXT],parent}) ,node[TREE_TRUNK][CMD_POS]) end if end procedure --------------------------------------------------------------------------- --this function not used anywhere function how_many_args(string name, integer kind) integer result integer at if kind=KIND_PARENS then result=-1 --parens support (n,operator,n) but if one of n is an operator, it comes out to be more :P elsif kind=KIND_FLOW then result=-1 --flow supports an unknown number of args elsif kind=KIND_OPERATOR then result=0 -- its important that operators behave as zero-arg-thingamabobs before they are translated into builtin math functions elsif kind=KIND_MATH then at=find_in_column(name[CMD_TEXT],math_list,PAIR_NAME) result=length(math_list[at][FUNC_ARGS]) elsif kind=KIND_FUNCTION then at=find_in_column(name[CMD_TEXT],function_list,PAIR_NAME) if equal(function_list[at][FUNC_ARGS],VAR_ARGS) then result=-1 else result=length(function_list[at][FUNC_ARGS]) end if elsif kind=KIND_SCRIPT then at=find_in_column(name[CMD_TEXT],script_list,PAIR_NAME) result=length(script_list[at][FUNC_ARGS]) elsif kind=KIND_MACRO then result=-1 --unknown number of args else --numbers, variables, etc do not permit args result=0 end if return(result) end function --------------------------------------------------------------------------- --Something that potentially takes args function takes_args(integer kind) if kind=KIND_PARENS or kind=KIND_FLOW or kind=KIND_MATH or kind=KIND_FUNCTION or kind=KIND_SCRIPT or kind=KIND_MACRO or kind=KIND_KEYWORD then return(true) else return(false) end if end function --------------------------------------------------------------------------- --Get next AST node from a list of tokens by pairing up begin,end --Returns {new_ptr,{token,children}} where children is a TokenList not a NodeList function get_script_node(integer ptr, TokenList data, VarList vars) Token command sequence ptr_and_block TokenList block sequence temp integer kind integer followed_by_begin block={} command=data[ptr] ptr+=1 if compare("end",command[CMD_TEXT])=0 then src_error(COLYEL&"end"&COLRED&" or "&COLYEL&")"&COLRED&" without "&COLYEL&"begin"&COLRED&" or "&COLYEL&"("&COLRED,command[CMD_POS]) elsif compare("begin",command[CMD_TEXT])=0 then --floaty brackets for order-of-operations-enforcement ptr-=1 command[CMD_TEXT]="" end if temp=what_kind_and_id(command,vars,true) --check for KIND_OPERATOR too kind=temp[1] followed_by_begin=false if ptr<=length(data) then --there is room for args followed_by_begin=equal("begin",data[ptr][CMD_TEXT]) --distinguishing between functions with and without args means wait() would be ok but noop() would not if takes_args(kind) then if followed_by_begin then --yes, it has args ptr+=1--only increment the pointer when we have args ptr_and_block=get_cmd_depth(ptr,data,1) ptr=ptr_and_block[1] block=ptr_and_block[2] if length(block)=0 and kind=KIND_PARENS then src_error("found empty parentheses not associated with a function call",command[CMD_POS]) end if end if end if end if if kind=KIND_FLOW then --Require certain flow control types to be followed by parentheses. --We have to check this here or earlier, because after building the parse tree --there will be no difference between "()" and omitted parentheses. --(Note that several later checks in normalize_flow_control are redundant to this) --FIXME: Ideally we wouldn't actually check anything here, but instead preserve presence --of empty parentheses to check later (it will matter in future anyway according to typing plan) --FIXME: excludes else, because of 'case(else)' if find(command[CMD_TEXT],flow_requiring_brackets) then if not followed_by_begin then src_error(sprintf_utf(COLYEL&"%s"&COLRED&" must be followed by "&COLYEL&"begin"&COLRED&" or "&COLYEL&"("&COLRED,{command[CMD_TEXT]}),command[CMD_POS]) end if end if --else has a special check for 'else,if' mistake. --'not followed_by_begin' excludes 'else(...)if' if equal("else",command[CMD_TEXT]) and ptr<=length(data) and equal("if",data[ptr][CMD_TEXT]) and not followed_by_begin then src_error(COLYEL&"else if"&COLRED&" may not be written with a separator in-between",command[CMD_POS]) --temp=get_script_node(ptr, data, vars) --hack to allow if necessary --temp[2][1][CMD_TEXT]="elseif" --return temp end if end if return({ptr,{command,block}}) end function --------------------------------------------------------------------------- --Returns a NodeList function build_ast(TokenList script_body_toks, VarList vars) integer ptr sequence ptr_and_seminode sequence seminode --Not a Node, has a TokenList instead of NodeList Node node NodeList result result={} ptr=1 while true do ptr_and_seminode=get_script_node(ptr,script_body_toks,vars) ptr=ptr_and_seminode[1] seminode=ptr_and_seminode[2] if length(seminode[TREE_BRANCHES])>0 then --this seminode has arguments that need parsing node={seminode[TREE_TRUNK],build_ast(seminode[TREE_BRANCHES],vars)} else node=seminode end if result=append(result,node) if ptr>length(script_body_toks) then exit --break out of while when there is no more data end if end while return(result) end function --------------------------------------------------------------------------- --When variables are inherited by a subscript all frame numbers need to be incremented function shift_variables_frame(VarList vars) for i=1 to length(vars) do vars[i][VAR_FRAME]+=1 end for return vars end function --------------------------------------------------------------------------- --Given a VarList with all local & nonlocal variables, returns a VarList --of just the locals, in-order function filter_local_variables(VarList vars) sequence ret integer next_idx ret={} next_idx=0 for i=length(vars) to 1 by -1 do if vars[i][VAR_FRAME]=0 then --Is local if vars[i][VAR_ID]!=next_idx then --This is really an assumption, not a bug simple_error("compiler bug! Local vars are not in order") end if next_idx+=1 ret=append(ret,vars[i]) end if end for return ret end function --------------------------------------------------------------------------- procedure add_local_variable(Token var) Script script VarList vars integer id script=all_scripts[cur_script_num] vars=script[SCRIPT_VARIABLES] --First check it's not already defined at global scope check_undefined_string(var,"local variable name") --Always add to scope of current script (frame == 0) id=length(vars)-script[SCRIPT_NUM_NONLOCALS] --Extend the Token into a VarToken, and prefix to the list, so that it takes --precedence over shadowed variables all_scripts[cur_script_num][SCRIPT_VARIABLES]={var & {0,id}} & vars end procedure --------------------------------------------------------------------------- --Find the variable declarations in a script procedure gather_local_vars(TokenList data) Token this integer ptr ptr=1 while true do this=data[ptr] ptr+=1 if compare("variable",this[CMD_TEXT])=0 then if ptr>length(data) then src_error(sprintf_utf(COLYEL&"variable"&COLRED&" should be followed by "&COLYEL&"(name)"&COLRED,{}),this[CMD_POS]) end if this=data[ptr] ptr+=1 if compare("begin",this[CMD_TEXT])=0 then while true do if ptr>length(data) then src_error(sprintf_utf(COLYEL&"variable"&COLRED&" should be followed by "&COLYEL&"(name)"&COLRED,{}),this[CMD_POS]) end if this=data[ptr] ptr+=1 if compare("end",this[CMD_TEXT])=0 then exit--break the while end if add_local_variable(this) end while else src_error(sprintf_utf(COLYEL&"variable"&COLRED&" should be followed by "&COLYEL&"(name)"&COLRED,{}),this[CMD_POS]) end if end if if ptr>length(data) then exit --break out of the while end if end while end procedure --------------------------------------------------------------------------- --There are only two, hardcoded, macros right now: tracevalue, assert. function expand_macros(NodeList tree) string cmd string tracecmd NodeList newtree NodeList newargs string new_string Node string_node, assertstr_node sequence src_position sequence temp srcpos pos, vpos integer ptr ptr=1 while ptr<=length(tree) do --recurse first, because tree[ptr] might be split up (as in 'assert') if length(tree[ptr][TREE_BRANCHES]) then tree[ptr][TREE_BRANCHES]=expand_macros(tree[ptr][TREE_BRANCHES]) end if cmd=tree[ptr][TREE_TRUNK][CMD_TEXT] if equal(cmd,"tracevalue") or equal(cmd,"tracevalueof") then tracecmd="tracevalueinternal" elsif equal(cmd,"showvalueof") then tracecmd="showvalueofinternal" else tracecmd="" end if if length(tracecmd) then newargs={} tree[ptr][TREE_TRUNK][CMD_TEXT]=tracecmd pos=tree[ptr][TREE_TRUNK][CMD_POS] for i=1 to length(tree[ptr][TREE_BRANCHES]) do new_string=tree_original_text(tree[ptr][TREE_BRANCHES][i]) string_list=append(string_list,new_string) string_node={ {sprintf("@$string%d",{length(string_list)}),virtual_pos(pos)}, {} } newargs&={string_node,tree[ptr][TREE_BRANCHES][i]} end for tree[ptr][TREE_BRANCHES]=newargs end if if equal(cmd,"assert") then --assert(condition) --> if (not(condition)) then ($assert expression string="condition", assertfailure) -- and --assert(x == y) --> if(_asserteq(x, y, assert expression string, "condition")) then (assertfailure) pos=tree[ptr][TREE_TRUNK][CMD_POS] vpos=virtual_pos(pos) src_position=decode_srcpos(pos) if length(tree[ptr][TREE_BRANCHES])!=1 then src_error(sprintf_utf(COLYEL&"assert"&COLRED&" statement takes exactly one argument: a condition. It has %d",{length(tree[ptr][TREE_BRANCHES])}),pos) end if new_string=sprintf_utf("%s:%d: %s",{src_position[POS_FILENAME],src_position[POS_LINE],tree_original_text(tree[ptr][TREE_BRANCHES][1])}) string_list=append(string_list,new_string) string_node={ {sprintf("@$string%d",{length(string_list)}),vpos}, {} } assertstr_node={ {enforce_constants("assertexpressionstring"),vpos}, {} } newtree={{{"if",vpos},{}}, {{"then",vpos},{}}} if equal(tree[ptr][TREE_BRANCHES][1][TREE_TRUNK][CMD_TEXT],"equal") then --Use _asserteq sequence x,y temp=tree[ptr][TREE_BRANCHES][1][TREE_BRANCHES] --x, y x=temp[1] y=temp[2] --if(...) newtree[1][TREE_BRANCHES]={{ {"_asserteq",vpos}, {x,y,assertstr_node,string_node} }} --then(assertfailure) newtree[2][TREE_BRANCHES]={{ {"assertfailure",vpos}, {} }} else --if(not(condition)) newtree[1][TREE_BRANCHES]={{ {"not",vpos}, {tree[ptr][TREE_BRANCHES][1]} }} temp={{{"setstringfromtable",vpos},{}}, {{"assertfailure",vpos},{}}} temp[1][TREE_BRANCHES]={assertstr_node,string_node} --stringnum, string --then(...) newtree[2][TREE_BRANCHES]=temp end if tree=replace_slice(tree,ptr,ptr,newtree) --tree[ptr..ptr]=newtree ptr+=1 end if ptr+=1 end while return(tree) end function --------------------------------------------------------------------------- --tree[ptr] is a partially constructed if Node (lacking else child), followed either by an else or elseif Node. --Handle else/elseif, parenting to tree[pt], and return the new tree. function build_else_node(NodeList tree, integer ptr) NodeList elseargs string nextcmd integer eptr srcpos pos nextcmd=tree[ptr+1][TREE_TRUNK][CMD_TEXT] pos=tree[ptr+1][TREE_TRUNK][CMD_POS] if equal("else",nextcmd) then --found else tree[ptr][TREE_BRANCHES]&={tree[ptr+1]} tree=delete_element(tree,ptr+1) else -- equal("elseif",nextcmd) --Split off the if elseargs={ {{"if",pos},tree[ptr+1][TREE_BRANCHES]} } --Grab the rest of this if,elseif,...,else chain, to put inside the else eptr=ptr+2 while eptr<=length(tree) do nextcmd=tree[eptr][TREE_TRUNK][CMD_TEXT] --Stop when we see an else but otherwise we don't check correctness until we recurse if equal(nextcmd,"else") then eptr+=1 exit elsif find(nextcmd,{"elseif","then"}) then --continue else exit end if eptr+=1 end while elseargs&=tree[ptr+2..eptr-1] tree=tree[1..ptr]&tree[eptr..$] tree[ptr][TREE_BRANCHES]&={ {{"else",pos},elseargs} } end if return tree end function --------------------------------------------------------------------------- --Build switch AST Nodes, return new tree function build_switch_node(NodeList tree, VarList vars, integer ptr) srcpos pos srcpos argpos NodeList args Token thiscmd NodeList rawargs NodeList caseargs Token casecmd string lastnodename sequence collected --either a Node or {} sequence temp integer have_else srcpos elsepos integer expect_case integer kind atom id string else_error_msg else_error_msg=COLYEL&"else"&COLRED&" case should be placed last inside a "&COLYEL&"switch"&COLRED&" block" pos=tree[ptr][TREE_TRUNK][CMD_POS] args=tree[ptr][TREE_BRANCHES] --start with the condition, length 1 if length(args)>1 then src_error(sprintf_utf( COLYEL&"switch"&COLRED&" statement has %d expressions. It should have only one. (The body of the switch should go in its "&COLYEL&"do(...)"&COLRED&" block instead.)" ,{length(args)} ),args[2][TREE_TRUNK][CMD_POS]) elsif length(args)=0 then src_error(COLYEL&"switch"&COLRED&" statement has no expression to match! Write "&COLYEL&"switch (expression) do (...)"&COLRED,pos) end if check_is_expression(args[1],"switch") if ptr>=length(tree) then src_error(COLYEL&"switch"&COLRED&" should be followed by "&COLYEL&"do"&COLRED,pos) end if thiscmd=tree[ptr+1][TREE_TRUNK] if equal("do",thiscmd[CMD_TEXT])=0 then src_error(sprintf_utf(COLYEL&"switch"&COLRED&" should be followed by "&COLYEL&"do"&COLRED&", not by "&COLYEL&"%s"&COLRED&".",{thiscmd[CMD_TEXT]}),thiscmd[CMD_POS]) end if --found do, processing starts here (build arguments to switch, after the expression) rawargs=tree[ptr+1][TREE_BRANCHES] --contents of the 'do' lastnodename="" have_else=false expect_case=false collected={} for j=1 to length(rawargs) do thiscmd=rawargs[j][TREE_TRUNK] argpos=thiscmd[CMD_POS] if j=1 and equal("case",thiscmd[CMD_TEXT])=0 then src_error(sprintf_utf(COLYEL&"switch() do("&COLRED&" should be followed with a "&COLYEL&"case"&COLRED&", not with "&COLYEL&"%s"&COLRED&"." ,{thiscmd[CMD_TEXT]}),argpos) end if --to resolve the ambiguity of an else, we always suppose that they belong to an 'if' (or elseif) if possible if equal("else",thiscmd[CMD_TEXT]) and find(lastnodename,{"if","then","elseif"})=0 then --Old syntax else(...) block if length(collected) then args&={collected} collected={} end if if have_else then src_error(else_error_msg,elsepos) end if expect_case=false if length(args) and equal(args[$][TREE_TRUNK][CMD_TEXT],"do")=false then --Last case is new-style syntax and wasn't followed by any statements --Lets just disallow this to prevent a little confusion src_error(COLYEL&"case(...) else(...)"&COLRED&" is ambiguous. "& "Write either "&COLYEL&"case(..., else) ..."&COLRED&" or "&COLYEL&"case(...) do() else(...)"&COLRED&" as fits your intentions." ,rawargs[j-1][TREE_TRUNK][CMD_POS]) end if if j!=length(rawargs) then if length(rawargs[j][TREE_BRANCHES])=0 then else_error_msg&=". Did you forget to enclose the else block with brackets, like "&COLYEL&"else(...)"&COLRED&"?" end if src_error(else_error_msg,argpos) end if --convert it to a do args&={ {{"do",argpos},rawargs[j][TREE_BRANCHES]} } elsepos=argpos have_else=true elsif equal("case",thiscmd[CMD_TEXT]) then if length(collected) then args&={collected} collected={} end if if have_else then --Note we disallow case(else) case(1) but allow case(else,1) src_error(else_error_msg,elsepos) end if expect_case=false caseargs=rawargs[j][TREE_BRANCHES] --Call a couple passes early here, so that constant expressions are allowed caseargs=collapse_floaty_brackets(caseargs) caseargs=optimize_script(caseargs,vars) --Check contents are only either integers, or "else", and append to args for k=1 to length(caseargs) do casecmd=caseargs[k][TREE_TRUNK] if equal(casecmd[CMD_TEXT],"else") then if have_else then src_error(COLYEL&"else"&COLRED&" appears multiple times inside this "&COLYEL&"switch"&COLRED&" block",casecmd[CMD_POS]) end if have_else=true elsepos=argpos --don't emit anything: next do is the 'else' one else temp=what_kind_and_id(casecmd,vars,false) kind=temp[1] id=temp[2] if kind=KIND_FLOW or (kind=KIND_MATH and is_simple_math_op(id)=false) then src_error(sprintf_utf(COLYEL&"%s"&COLRED&" is not allowed inside a "&COLYEL&"case"&COLRED ,{casecmd[CMD_TEXT]}),casecmd[CMD_POS]) end if args&={caseargs[k]} end if end for else --Disallow "case() do() garbage" if expect_case then src_error(sprintf_utf("Expected "&COLYEL&"case"&COLRED&" or "&COLYEL&"else"&COLRED&" to follow "&COLYEL&"case(...) do(...)"&COLRED&" but found "&COLYEL&"%s"&COLRED,{thiscmd[CMD_TEXT]}),argpos) end if if length(collected)=0 then --Start collecting into a do. But if it is a do (old syntax), don't double encapsulate it if equal("do",thiscmd[CMD_TEXT]) then args&={rawargs[j]} expect_case=true else collected={{"do",argpos},{rawargs[j]}} end if else --Continue collecting collected[TREE_BRANCHES]&={rawargs[j]} end if end if lastnodename=thiscmd[CMD_TEXT] end for if length(collected) then args&={collected} end if if length(args)>1 and equal(args[$][TREE_TRUNK][CMD_TEXT],"do")=false then --Last case (possibly an else) is new-style syntax and wasn't followed by any statements, --so add the missing do. Don't need to do this, but I'd like to keep things tidy --(srcpos is of the last case) args&={ {{"do",virtual_pos(argpos)},{}} } end if if have_else=false then --Insert a dummy do default block since else has been left out args&={ {{"do",virtual_pos(pos)},{}} } end if tree[ptr][TREE_BRANCHES]=args --delete the do block return delete_element(tree,ptr+1) end function --------------------------------------------------------------------------- --Given a variable token for 'for' or 'setvariable', etc. (parent) and list of locals/non-locals 'vars', --return a an integer ID token, or throw an error if not a variable. function convert_variable_to_id(Token var, VarList locals, string parent) sequence kind_and_id atom id integer argkind integer var_at kind_and_id=what_kind_and_id(var,locals,false) argkind=kind_and_id[1] id=kind_and_id[2] if argkind=KIND_LOCAL or argkind=KIND_NONLOCAL then id=-1-id elsif argkind=KIND_GLOBAL then --id unmodified else --It's not a variable. bad! src_error(sprintf_utf( "first argument of "&COLYEL&"%s"&COLRED&" must be a variable, not %s "&COLYEL&"%s"&COLRED ,{parent,KIND_LONGNAMES[argkind],var[CMD_TEXT]} ),var[CMD_POS]) end if return {sprintf("%d",{id}),var[CMD_POS]} end function --------------------------------------------------------------------------- --parse the script tree and make if absorb then and else, for and while absorb do, switch absorb stuff, check correctness of flow statements --Note that several of these checks are now redundant to the check for 'begin' performed in get_script_node function normalize_flow_control(NodeList tree, VarList vars, string parent) integer ptr string s string nextcmd NodeList args Token ctr srcpos pos Symbol sym Kind kind sequence temp integer var_at ptr=1 while ptr<=length(tree) do s=tree[ptr][TREE_TRUNK][CMD_TEXT] pos=tree[ptr][TREE_TRUNK][CMD_POS] args=tree[ptr][TREE_BRANCHES] sym=map:get(global_scope,s,nil_sym) kind=sym[SYM_KIND] if kind=KIND_FLOW or kind=KIND_KEYWORD then if equal(s,"then") or equal(s,"else") or equal(s,"do") then --these are checked for correct parents below. elsif block_command(parent)=false then src_error(COLYEL&s&COLRED&" may not be used as part of an expression (found as an argument to "&COLYEL&parent&COLRED&")",pos) end if end if if kind=KIND_FLOW then if compare("if",s)=0 then if length(args)>1 then src_error(sprintf_utf( COLYEL&"if"&COLRED&" statement has %d conditions. It should have only one. Use "&COLYEL&"&&"&COLRED&" and "&COLYEL&"||"&COLRED&" to combine multiple conditions" ,{length(args)} ),args[2][TREE_TRUNK][CMD_POS]) elsif length(args)=0 then src_error(sprintf_utf(COLYEL&"if"&COLRED&" statement has no condition. It should have one.",{}),pos) end if check_is_expression(args[1],s) if ptr<length(tree) then --there is room nextcmd=tree[ptr+1][TREE_TRUNK][CMD_TEXT] if compare("then",nextcmd)=0 then tree[ptr][TREE_BRANCHES]&={tree[ptr+1]} tree=delete_element(tree,ptr+1) if ptr<length(tree) then --there is room for else nextcmd=tree[ptr+1][TREE_TRUNK][CMD_TEXT] if equal("else",nextcmd) or equal("elseif",nextcmd) then tree=build_else_node(tree,ptr) else --no else found, but thats okay. --add dummy else tree[ptr][TREE_BRANCHES]&={ {{"else",virtual_pos(pos)},{}} } end if else --add dummy else tree[ptr][TREE_BRANCHES]&={ {{"else",virtual_pos(pos)},{}} } end if elsif equal("else",nextcmd) or equal("elseif",nextcmd) then --add dummy then before else tree[ptr][TREE_BRANCHES]&={ {{"then",virtual_pos(pos)},{}} } tree=build_else_node(tree,ptr) else --found neither then nor else src_error(sprintf_utf("expected "&COLYEL&"then"&COLRED&" or "&COLYEL&"else"&COLRED&" but found "&COLYEL&"%s"&COLRED,{nextcmd}),tree[ptr+1][TREE_TRUNK][CMD_POS]) end if else --no room for then or else src_error(COLYEL&"if"&COLRED&" should be followed by "&COLYEL&"then"&COLRED&" or "&COLYEL&"else"&COLRED,pos) end if elsif compare("then",s)=0 then if compare("if",parent)!=0 or ptr!=2 then src_error(COLYEL&"then"&COLRED&" should follow an "&COLYEL&"if"&COLRED&" or "&COLYEL&"else if"&COLRED&" block",pos) end if elsif compare("else",s)=0 then if compare("if",parent)!=0 or ptr!=3 then src_error(COLYEL&"else"&COLRED&" should follow an "&COLYEL&"if"&COLRED&" or "&COLYEL&"then"&COLRED&" or "&COLYEL&"else if"&COLRED&" block",pos) end if elsif compare("elseif",s)=0 then --Should be absorbed by now src_error(COLYEL&"else if"&COLRED&" should follow an "&COLYEL&"if"&COLRED&" or "&COLYEL&"then"&COLRED&" block",pos) elsif compare("do",s)=0 then if equal(parent,"")=false and block_command(parent)=false then --These check their arguments are expressions, so a do which is a child of one of these is correct if equal(parent,"for")=false and equal(parent,"while")=false and equal(parent,"switch")=false then src_error(COLYEL&s&COLRED&" may not be used as part of an expression (found as an argument to "&COLYEL&parent&COLRED&")",pos) end if end if elsif compare("while",s)=0 then if length(args)>1 then src_error(sprintf_utf( COLYEL&"while"&COLRED&" statement has %d conditions. It should have only one. Use "&COLYEL&"&&"&COLRED&" and "&COLYEL&"||"&COLRED&" to combine multiple conditions" ,{length(args)} ),args[2][TREE_TRUNK][CMD_POS]) elsif length(args)=0 then src_error(sprintf_utf(COLYEL&"while"&COLRED&" statement has no condition. It should have one.",{}),pos) end if check_is_expression(args[1],s) if ptr<length(tree) then --there is room if compare("do",tree[ptr+1][TREE_TRUNK][CMD_TEXT])=0 then --found do tree[ptr][TREE_BRANCHES]&={tree[ptr+1]} tree=delete_element(tree,ptr+1) else src_error(sprintf_utf(COLYEL&"while"&COLRED&" should be followed by "&COLYEL&"do"&COLRED&", not by "&COLYEL&"%s"&COLRED&".",{tree[ptr+1][TREE_TRUNK][CMD_TEXT]}),tree[ptr+1][TREE_TRUNK][CMD_POS]) end if else src_error(sprintf_utf(COLYEL&"while"&COLRED&" should be followed by "&COLYEL&"do"&COLRED,{}),pos) end if elsif compare("for",s)=0 then --Handle regular arguments if length(args)<3 then src_error(sprintf_utf(COLYEL&"for"&COLRED&" statement needs three or four arguments",{}),pos) elsif length(args)>4 then src_error(sprintf_utf(COLYEL&"for"&COLRED&" statement has too many arguments (%d)",{length(args)}),args[5][TREE_TRUNK][CMD_POS]) elsif length(args)=3 then --append default step value args&={ {{"1",virtual_pos(pos)},{}} } end if ctr=args[1][TREE_TRUNK] temp=what_kind_and_id(ctr,vars,false) kind=temp[1] if kind=KIND_LOCAL then map:put(used_locals,temp[2],true) elsif kind=KIND_GLOBAL then --warn, then translate into a numeric reference to a variable src_warn(sprintf_utf( "Using global variable "&COLYEL&"%s"&COLRED&" as the counter in a "&COLYEL&"for"&COLRED&" loop" ,{ctr[CMD_TEXT]} ),ctr[CMD_POS]) end if args[1][TREE_TRUNK]=convert_variable_to_id(ctr,vars,"for") check_is_expression(args[2],s) check_is_expression(args[3],s) check_is_expression(args[4],s) --Handle "do" if ptr<length(tree) then --there is room if compare("do",tree[ptr+1][TREE_TRUNK][CMD_TEXT])=0 then --found do args&={tree[ptr+1]} tree=delete_element(tree,ptr+1) else src_error(sprintf_utf(COLYEL&"for"&COLRED&" should be followed by "&COLYEL&"do"&COLRED&", not by "&COLYEL&"%s"&COLRED&".",{tree[ptr+1][TREE_TRUNK][CMD_TEXT]}),tree[ptr+1][TREE_TRUNK][CMD_POS]) end if else src_error(sprintf_utf(COLYEL&"for"&COLRED&" should be followed by "&COLYEL&"do"&COLRED,{}),pos) end if tree[ptr][TREE_BRANCHES]=args elsif compare("exit",s)=0 then if length(args)>1 then src_error(sprintf_utf( COLYEL&s&COLRED&" statement has %d arguments. It should have only zero or one." ,{length(args)} ),args[2][TREE_TRUNK][CMD_POS]) elsif length(args)=1 then tree[ptr][TREE_TRUNK][CMD_TEXT]="exitreturning" elsif length(args)=0 then tree[ptr][TREE_TRUNK][CMD_TEXT]="exitscript" end if elsif compare("return",s)=0 or compare("exitreturning",s)=0 then if length(args)>1 then src_error(sprintf_utf( COLYEL&s&COLRED&" statement has %d arguments. It should have only one." ,{length(args)} ),args[2][TREE_TRUNK][CMD_POS]) elsif length(args)=0 then src_error(sprintf_utf(COLYEL&s&COLRED&" statement has no argument. It should have one. Prehaps you meant to use "&COLYEL&"exit"&COLRED&" or "&COLYEL&"exit script"&COLRED,{}),pos) end if elsif compare("break",s)=0 or compare("continue",s)=0 then if length(args)>1 then src_error(sprintf_utf( COLYEL&s&COLRED&" statement has %d arguments. It should have no more than one." ,{length(args)} ),args[2][TREE_TRUNK][CMD_POS]) elsif length(args)=0 then --append default value tree[ptr][TREE_BRANCHES]={{{"1",virtual_pos(pos)},{}}} end if elsif compare("exitscript",s)=0 then if length(args)>0 then src_error(sprintf_utf( COLYEL&s&COLRED&" statement has %d arguments. It should have none. Prehaps you meant to use "&COLYEL&"exit"&COLRED&" or "&COLYEL&"exit returning"&COLRED ,{length(args)} ),args[1][TREE_TRUNK][CMD_POS]) end if elsif compare("switch",s)=0 then tree=build_switch_node(tree,vars,ptr) elsif compare("case",s)=0 then src_error(COLYEL&"case"&COLRED&" is not allowed outside of "&COLYEL&"switch"&COLRED,pos) end if elsif kind=KIND_KEYWORD then if compare("subscript",s)=0 or compare("variable",s)=0 then --Does nothing; remove. (Kept until now to check they isn't placed in a silly place) --(Note: any attributes prefixing a subscript have already been removed in --parse_script()) tree=delete_element(tree,ptr) continue end if end if if equal(s,"") then --Floaty brackets (they haven't been removed yet). Look to parent instead. s=parent end if tree[ptr][TREE_BRANCHES]=normalize_flow_control(tree[ptr][TREE_BRANCHES],vars,s) ptr+=1 end while return(tree) end function --------------------------------------------------------------------------- --A modification of Dijkstra's shunting-yard algorithm --(This could probably subsume build_ast too) function convert_operators(NodeList tree) NodeList new_tree sequence stack --{int,Node} pairs Node opnode integer ptr integer at integer priority integer nonoperator_before stack={} new_tree={} ptr=1 nonoperator_before=false while ptr<=length(tree) do --for each peer branch on the tree at=find_in_column(tree[ptr][TREE_TRUNK][CMD_TEXT],operator_list,PAIR_NAME) if at>0 then if nonoperator_before=false then --there is no room for the before-operand. Bad! src_error(sprintf_utf("operator "&COLYEL&"%s"&COLRED&" is missing its left-side operand (did not expect to see an operator here)",{tree[ptr][TREE_TRUNK][CMD_TEXT]}),tree[ptr][TREE_TRUNK][CMD_POS]) elsif ptr=length(tree) then --there is no room for the after-operand. Bad! src_error(sprintf_utf("operator "&COLYEL&"%s"&COLRED&" is missing its right-side operand",{tree[ptr][TREE_TRUNK][CMD_TEXT]}),tree[ptr][TREE_TRUNK][CMD_POS]) end if priority=operator_list[at][PAIR_NUM] --convert the operator to its true function name (which might be the same) tree[ptr][TREE_TRUNK][CMD_TEXT]=operator_list[at][OPER_TRUENAME] while length(stack) and priority>=stack[$][1] do --pop the operator from the stack and "apply" it to the tree opnode=stack[$][2] stack=stack[1..$-1] opnode[TREE_BRANCHES]=new_tree[$-1..$] new_tree=new_tree[1..$-2]&{opnode} end while stack&={{priority,tree[ptr]}} nonoperator_before=false else if nonoperator_before then --Two non-ops in a row means moved onto the new argument in this argument list, so finish processing operators first while length(stack) do --pop the operator from the stack and "apply" it to the tree opnode=stack[$][2] stack=stack[1..$-1] opnode[TREE_BRANCHES]=new_tree[$-1..$] new_tree=new_tree[1..$-2]&{opnode} end while end if tree[ptr][TREE_BRANCHES]=convert_operators(tree[ptr][TREE_BRANCHES]) new_tree&={tree[ptr]} nonoperator_before=true end if ptr+=1 end while --copy of above while length(stack) do --pop the operator from the stack and "apply" it to the tree opnode=stack[$][2] stack=stack[1..$-1] opnode[TREE_BRANCHES]=new_tree[$-1..$] new_tree=new_tree[1..$-2]&{opnode} end while return new_tree end function --------------------------------------------------------------------------- --Get the parameters for a certain attribute of a script or function, if it exists --func_defn: an element of script_list or function_list --Returns a NodeList, possible empty function attribute_params(string name, sequence func_defn) integer attridx attridx=find_in_column(name,func_defn[FUNC_ATTRS],1) if attridx=0 then return {} else return func_defn[FUNC_ATTRS][attridx][TREE_BRANCHES] end if end function --------------------------------------------------------------------------- --Ensure correct number of arguments, add defaults, and translate variables to IDs in assignments --And also some extra warnings... function fix_arguments(Node tree, integer kind, integer id, sequence list, VarList vars) integer at integer argnum,maxargs NodeList args, params integer fatal string message, param string funcname srcpos funcpos funcname=tree[TREE_TRUNK][CMD_TEXT] funcpos=tree[TREE_TRUNK][CMD_POS] args=tree[TREE_BRANCHES] argnum=length(args) --Lookup ID because name might be scoped at=find_in_column(id,list,PAIR_NUM) if equal(list[at][FUNC_ARGS],VAR_ARGS) then return(args) else maxargs=length(list[at][FUNC_ARGS]) end if if maxargs < argnum then src_error(sprintf_utf( "%s "&COLYEL&"%s"&COLRED&" takes at most %d arguments but is being passed %d arguments" ,{KIND_LONGNAMES[kind],funcname,maxargs,argnum} ),args[maxargs+1][TREE_TRUNK][CMD_POS]) elsif maxargs > argnum then --add defaults if not enough args are present if kind=KIND_MATH then --special processing for math if list[at][PAIR_NUM]<=15 or list[at][PAIR_NUM]>=19 then --math shouldnt have defaults src_error(sprintf_utf( "math function "&COLYEL&"%s"&COLRED&" is being passed %d arguments but it should always have %d" ,{funcname,argnum,maxargs} ),funcpos) else --variable assignment commands can have defaults if argnum = 0 then --no defaults for first argument of variable function src_error(sprintf_utf( "variable manipulation function "&COLYEL&"%s"&COLRED&" has %d arguments - it needs at least 1" ,{funcname,argnum} ),funcpos) elsif argnum = 1 then --make defaults for second arg of variable function if list[at][PAIR_NUM]=16 then --setvariable args=append(args,{ {"0",virtual_pos(funcpos)} ,{} }) else --increment and decrement args=append(args,{ {"1",virtual_pos(funcpos)} ,{} }) end if end if end if else --normal processing for script and function for i=argnum+1 to maxargs do if equal(list[at][FUNC_ARGS][i],NO_DEFAULT) then src_error(sprintf_utf( "%s "&COLYEL&"%s"&COLRED&" has no default for missing argument %d" ,{KIND_LONGNAMES[kind],funcname,i} ),funcpos) end if args=append(args,{ {sprintf("%d",{list[at][FUNC_ARGS][i]}),virtual_pos(funcpos)} ,{} }) end for end if end if --Translate variables in the variable manipulation functions if kind=KIND_MATH and list[at][PAIR_NUM]>=16 and list[at][PAIR_NUM]<=18 then args[1][TREE_TRUNK]=convert_variable_to_id(args[1][TREE_TRUNK],vars,funcname) end if --Obsolescence warnings --(This seems as good a spot to do this as any, because we've looked up the function in 'list' already) --Syntax: @obsolete([error,] [replacement_cmd,] ["message"]) (arg order can var) if and_bits(list[at][FUNC_FLAGS],FLAG_OBSOLETE)!=0 then params=attribute_params("@obsolete",list[at]) message="" fatal=false for i=1 to length(params) do param=params[i][CMD_TEXT] if equal(param,"error") then fatal=true elsif param[1]='"' then message&=param[2..$-1] else message&=sprintf_utf("Use replacement function "&COLYEL&"%s"&COLRED&" instead. ",{param}) end if end for if fatal then message=sprintf_utf("Function "&COLYEL&"%s"&COLRED&" is obsolete and can no longer be used. %s",{funcname,message}) src_error(message,funcpos) else message=sprintf_utf("Function "&COLYEL&"%s"&COLRED&" is obsolete; avoid it. %s",{funcname,message}) src_warn(message,funcpos) end if end if return(args) end function --------------------------------------------------------------------------- --Calls fix_arguments on each non-flow node with arguments function normalize_arguments(NodeList tree, VarList vars) sequence kind_and_id integer kind atom id --Unlike the flow normalization and operator translation we do not insert/delete elements from --the current level, so we can use a "for" safely, and dont need a "while" for i=1 to length(tree) do kind_and_id=what_kind_and_id(tree[i][TREE_TRUNK],vars,false) kind=kind_and_id[1] id=kind_and_id[2] if kind=KIND_SCRIPT then tree[i][TREE_BRANCHES]=fix_arguments(tree[i],kind,id,script_list,vars) elsif kind=KIND_FUNCTION then tree[i][TREE_BRANCHES]=fix_arguments(tree[i],kind,id,function_list,vars) elsif kind=KIND_MATH then tree[i][TREE_BRANCHES]=fix_arguments(tree[i],kind,id,math_list,vars) end if --number, flow, global, local, parens need no argchecking if length(tree[i][TREE_BRANCHES]) then --if there are sub-arguments, recurse tree[i][TREE_BRANCHES]=normalize_arguments(tree[i][TREE_BRANCHES],vars) end if end for return(tree) end function --------------------------------------------------------------------------- --returns a two-char string that represents a 16-bit word in least-signifigant-byte-first order function output_word(integer n) integer b1,b2 b1=and_bits(n,#FF) b2=floor(and_bits(n,#FFFF)/256) return({b1,b2}) end function --------------------------------------------------------------------------- function convert_to_bytes(sequence s) sequence result result={} for i=1 to length(s) do result&=int_to_bytes(s[i]) end for return(result) end function --------------------------------------------------------------------------- --Encode a unicode (UTF32/native Euphoria) string in the format the engine --requires, which is currently Latin-1 function encode_ohr(string string, integer maxbytes=0) for i=1 to length(string) do if string[i]>=256 then string[i]='?' end if end for if maxbytes and maxbytes<length(string) then string=string[1..maxbytes] end if return string end function --------------------------------------------------------------------------- --Returns byte sequence of "32bitstring"-encoded string: --32 bit length followed one-char-per-byte padded to 4 byte multiple function encode_32bitstring(string string) sequence ret integer len string=encode_ohr(string) len=length(string) ret=int_to_bytes(len)&string if remainder(len,4) then ret&=repeat(0,4-remainder(len,4)) end if return ret end function --------------------------------------------------------------------------- --add a string to the table for this script, return its offset function table_string(integer id) integer offset offset=length(string_table)/4 string_table&=encode_32bitstring(string_list[id]) return(offset) end function --------------------------------------------------------------------------- --Used to turn the local var list into a string table. --vars must exclude nonlocals, and be in order. function variable_names_to_table(VarList vars) string varname sequence result result={} for i=1 to length(vars) do --varname=vars[i][CMD_TEXT] varname=get_srcpos_text(vars[i][CMD_POS]) result&=encode_32bitstring(varname) end for return(result) end function --------------------------------------------------------------------------- --returns the offset where this node was either added, or references an existing node function binary_compile_recurse(Node tree, VarList vars) sequence result integer kind atom id integer numargs integer at string s sequence kind_and_id sequence value_temp string str_temp integer recurse_result integer can_reference --whether can return a pointer to an earlier node integer nodeoffset srcpos cmdpos result={} can_reference=true nodeoffset=length(compiled_data) kind_and_id=what_kind_and_id(tree[TREE_TRUNK],vars,false) kind=kind_and_id[1] id=kind_and_id[2] s=tree[TREE_TRUNK][CMD_TEXT] cmdpos=tree[TREE_TRUNK][CMD_POS] if kind=KIND_REFERENCE then str_temp = s[2..length(s)] --is it a global variable? at=find(str_temp,global_list[PAIR_NAME]) if at then --yes, it is a global, compile to global ID id=global_list[PAIR_NUM][at] else --is it a script? at=lookup_scoped_script_name(str_temp) if at then --Yes, it is a script. Compile to a script ID unless it's a reference to a subscript, as --I want to eventually make subscript handles closures (without which subscript handles can't work) if all_scripts[at][SCRIPT_PARENT_IDX] then src_error("References to subscripts haven't been implemented yet. Use a regular script instead if you need its ID.",cmdpos) end if id=all_scripts[at][SCRIPT_ID] else --is it a string literal? if length(str_temp)>7 and equal("$string",str_temp[1..7]) then value_temp=value(str_temp[8..length(str_temp)]) id=table_string(value_temp[2]) else src_error(sprintf_utf("Reference "&COLYEL&"@%s"&COLRED&" not understood: references must be the name of a global variable or a script",{str_temp}),cmdpos) end if end if end if kind=KIND_NUMBER end if result&=kind result&=id if kind=KIND_NUMBER then --No children and no debug info elsif kind=KIND_GLOBAL or kind=KIND_LOCAL or kind=KIND_NONLOCAL then --No children, debug info if debug_info then result&=cmdpos-script_position end if elsif kind=KIND_SCRIPT or kind=KIND_FUNCTION or kind=KIND_FLOW or kind=KIND_MATH then --Use id from what_kind_and_id if kind=KIND_FUNCTION then if max_used_function<id then max_used_function=id end if end if numargs=length(tree[TREE_BRANCHES]) if kind!=KIND_FLOW and numargs>MAX_ARGS then src_error(sprintf_utf("Number of arguments being passed (%d) is more than allowed (%d)",{numargs,MAX_ARGS}),cmdpos) end if result&=numargs --Add placeholders for each argoffset result&=repeat(0,numargs) if debug_info then result&=cmdpos-script_position end if if nodeoffset=0 then --Special case: Most nodes are added after their children, but the root --node needs to have offset 0, so append it already compiled_data&=result end if for i=1 to numargs do --actually evaluate each argument and set the real offsets recurse_result=binary_compile_recurse(tree[TREE_BRANCHES][i],vars) if recurse_result >= nodeoffset then can_reference=false end if result[3+i]=recurse_result end for if nodeoffset=0 then --Root node: replace the dummy argument offsets, and we're done. compiled_data[1..length(result)]=result return(nodeoffset) end if nodeoffset=length(compiled_data) else src_error(sprintf_utf("Compiler Bug! Illegal kind "&COLYEL&"%d"&COLRED&" for "&COLYEL&"%s"&COLRED,{kind,s}),cmdpos) end if --If we've have multiple copies of the exact same node (subtree), deduplicate them. --Don't bother with nodes with >4 children: unlikely to repeat. --The integer(result[2]) test here is to work around a bug in Euphoria (at least in 4.0.3 32-bit): --map:hash throws an error if given an int which doesn't fit in a 31-bit integer if not fast_mode and length(result)<=7 and integer(result[2]) then if can_reference then --If false, this node must be unique, so map:get won't find anything. at=map:get(node_lookup,result,-1) if at>-1 then return at end if end if map:put(node_lookup,result,nodeoffset) end if --append data directly compiled_data&=result return(nodeoffset) end function --------------------------------------------------------------------------- function get_parent_id(Script script) integer parent_idx parent_idx=script[SCRIPT_PARENT_IDX] if parent_idx=0 then return(0) else return(all_scripts[parent_idx][SCRIPT_ID]) end if end function --------------------------------------------------------------------------- function binary_compile(Script script) NodeList tree sequence result integer recurse_ret sequence localvars string_table={} compiled_data={} node_lookup=map:new() --binary data is in a mix of 16-bit and 32-bit signed words. --the header is (mostly) 16-bit and the data (script format version 1+) is 32-bit. --output header (0-based offsets given here) --0..1: the zero-rooted byte-offset of the first executable code byte result=output_word(CODE_START_BYTE_OFFSET) --2..3: the number of local variables (including args but not nonlocals) result&=output_word(length(script[SCRIPT_VARIABLES]) - script[SCRIPT_NUM_NONLOCALS]) --4..5: the number of arguments the script takes (also in SCRIPTS.TXT) result&=output_word(length(script[SCRIPT_ARG_TOKS])) --6..7: the format of the command data (presently used to indicate 32-bit encoding) result&=output_word(HSZ_FORMAT_VERSION) --8..11: a 32bit offset to the string literal table (in bytes), we don't know it yet result&=output_word(0) result&=output_word(0) --12..13: parent script ID or 0 if none result&=output_word(get_parent_id(script)) --14..15: Nesting depth result&=output_word(script[SCRIPT_NEST_DEPTH]) --16..17: number of nonlocals result&=output_word(script[SCRIPT_NUM_NONLOCALS]) --18..21: the length of the string table in 32-bit words, also unknown result&=output_word(0) result&=output_word(0) --22..23: a bitfield detailing optional features. if debug_info then --First bit: non-leaf cmd nodes appended with srcpos result&=output_word(1) else result&=output_word(0) end if --24..27: the offset of the local variable name table, in 32-bit words --after the header, or 0 if not present. Currently unknown. result&=output_word(0) result&=output_word(0) --28..31: the srcpos offset of the start of the script integer script_start script_start=srcpos_offset(script[SCRIPT_TRIGGER_TOK][CMD_POS]) result&=int_to_bytes(script_start) script_position=script_start*power(2,9) --aka encode_srcpos(0,script_start-1,0) --what follows is command data in the format [kindID,Value,argcount,argpointerlist] --numbers and variables have no argcount or argpointerlist --an argpointer is the zero-rooted word-offset of the argument relative --to the start of the executable commands. I realise that this format is --unnecessarily complicated. I had hoped to get benefits of being able to --store frequently reused commands only once and then just point to them, --but in actual practice, it isnt worth the trouble, since the only --commands that tend to be redundant are the really short ones. --the first command is always a "do". there can be only one top-level command tree=script[SCRIPT_AST] if length(tree)!=1 then simple_error(sprintf_utf("compiler bug! script tree has %s root nodes",{length(tree)})) end if reenter_timing_zone("binary_compile_recurse") recurse_ret=binary_compile_recurse(tree[1],script[SCRIPT_VARIABLES]) if recurse_ret!=0 then simple_error(sprintf_utf("compiler bug! Root binary_compile_recurse call returned %d",{recurse_ret})) end if exit_timing_zone() result&=convert_to_bytes(compiled_data) compiled_data={} --append with a table of string literals and give its offset and length if length(string_table)>0 then if remainder((length(result)-CODE_START_BYTE_OFFSET),4) != 0 then simple_error("compiler bug! string table not aligned") end if if remainder(length(string_table),4) != 0 then simple_error("compiler bug! string table not padded") end if result[9..12]=int_to_bytes(length(result)) result[19..22]=int_to_bytes(length(string_table)/4) result&=string_table end if if debug_info then --append local variable names table if remainder(length(result)-CODE_START_BYTE_OFFSET,4) != 0 then simple_error("compiler bug! varname table not aligned") end if localvars=filter_local_variables(script[SCRIPT_VARIABLES]) if length(localvars) then result[25..28]=int_to_bytes((length(result)-CODE_START_BYTE_OFFSET)/4) --Could use seek_string_by_id instead of filter_local_variables result&=variable_names_to_table(localvars) end if --Otherwise remains 0 end if return(result) end function --------------------------------------------------------------------------- --floaty brackets are un-needed after the operators have been translated. function collapse_floaty_brackets(NodeList tree) integer i NodeList graft i=1 while i<=length(tree) do if length(tree[i][TREE_TRUNK][CMD_TEXT])=0 then --found a floaty-bracket graft=tree[i][TREE_BRANCHES] if length(graft)!=1 then src_error(sprintf_utf("Found unexpected pair of brackets containing %d expressions. Either the brackets should not exist at all, or should only enclose a single expression",{length(graft)}),tree[i][TREE_TRUNK][CMD_POS]) end if tree[i]=graft[1] --delete_element(tree,i) --tree=insert_sequence(tree,graft,i) else if length(tree[i][TREE_BRANCHES]) then tree[i][TREE_BRANCHES]=collapse_floaty_brackets(tree[i][TREE_BRANCHES]) end if i+=1 end if end while return(tree) end function --------------------------------------------------------------------------- --parent is CMD_TEXT of parent node function sanity_check(NodeList tree, VarList vars, string parent) string s sequence kind_and_id integer kind atom id srcpos pos for i=1 to length(tree) do s=tree[i][TREE_TRUNK][CMD_TEXT] pos=tree[i][TREE_TRUNK][CMD_POS] kind_and_id=what_kind_and_id(tree[i][TREE_TRUNK],vars,false) kind=kind_and_id[1] id=kind_and_id[2] if i=1 and equal("if",parent) then if kind=KIND_NUMBER then if id then src_warn(sprintf_utf("Condition is always true ("&COLYEL&"%d"&COLRED&")",{id}),pos) else src_warn("Condition is always false",pos) end if end if elsif block_command(parent) then if kind=KIND_NUMBER then src_warn(sprintf_utf("Expected script, function, or flow control, but found an expression with value "&COLYEL&"%d"&COLRED&". It will do nothing here." ,{id}),pos) elsif kind=KIND_GLOBAL then src_warn(sprintf_utf("Expected script, function, or flow control, but found global variable "&COLYEL&"%s"&COLRED&". It will do nothing here." ,{s}),pos) elsif kind=KIND_LOCAL or kind=KIND_NONLOCAL then src_warn(sprintf_utf("Expected script, function, or flow control, but found local variable "&COLYEL&"%s"&COLRED&". It will do nothing here." ,{s}),pos) elsif kind=KIND_MATH and (id<=15 or id=19 or id>=22) then -- setvariable, increment, decrement, logand, logor have side effects src_warn(sprintf_utf("Expected a statement but found built-in function "&COLYEL&"%s"&COLRED&", returning a value that is being discarded" ,{s}),pos) end if end if if kind=KIND_GLOBAL then map:put(used_globals,id,true) elsif kind=KIND_LOCAL then map:put(used_locals,id,true) end if if length(tree[i][TREE_BRANCHES]) then --if there are sub-arguments, recurse tree[i][TREE_BRANCHES]=sanity_check(tree[i][TREE_BRANCHES],vars,tree[i][TREE_TRUNK][CMD_TEXT]) end if end for return(tree) end function --------------------------------------------------------------------------- --simulate a result overflowing a signed 32 bit register function overflow_int32(atom val) --maybe check whether val is so large that this is could be inaccurate? atom ret ret=mod(val,#100000000) if ret>2147483647 then ret-=#100000000 end if return(ret) end function --------------------------------------------------------------------------- function optimized_arg(Node tree, VarList vars) sequence kind_and_id integer kind atom id object arg1,arg2 kind_and_id=what_kind_and_id(tree[TREE_TRUNK],vars,false) kind=kind_and_id[1] id=kind_and_id[2] if kind=KIND_NUMBER then return(id) elsif kind=KIND_MATH and (id<=15 or id>=19) then arg1=optimized_arg(tree[TREE_BRANCHES][1],vars) if equal(arg1,{}) then arg1=tree[TREE_BRANCHES][1] end if if length(tree[TREE_BRANCHES])>=2 then arg2=optimized_arg(tree[TREE_BRANCHES][2],vars) else arg2=0 end if if equal(arg2,{}) then arg2=tree[TREE_BRANCHES][2] end if if atom(arg1) and atom(arg2) then if id=0 then --random if arg1=arg2 then return(arg1) end if elsif id=1 then --exponent if arg2>0 then --if too large, Euphoria will signal an overflow, but I'm also worried --about overflow_int32 giving inaccurate results if log(abs(arg1)+1)+log(abs(arg2)+1)<log(1e14) then return(overflow_int32(power(arg1,arg2))) end if elsif arg2=0 then --FB takes 0^0=1 (the most useful definition), Euphoria throws an error return(1) elsif arg2<0 then if arg1=0 then src_error(sprintf_utf("Found the expression (or equivalent) "&COLYEL&"0 ^ %d"&COLRED&". This is division by zero.",{arg2}),tree[TREE_TRUNK][CMD_POS]) elsif arg1=1 or arg1=-1 then return(power(arg1,arg2)) else return(0) end if end if elsif id=2 then --modulus if arg2=0 then src_error(sprintf_utf("Found the expression (or equivalent) "&COLYEL&"%d, mod, 0"&COLRED&". This is division by zero.",{arg1}),tree[TREE_TRUNK][CMD_POS]) end if return(floor(remainder(arg1,arg2))) elsif id=3 then --divide if arg2=0 then src_error(sprintf_utf("Found the expression (or equivalent) "&COLYEL&"%d / 0"&COLRED,{arg1}),tree[TREE_TRUNK][CMD_POS]) end if --Euphoria doesn't have a truncating function? if arg1*arg2<0 then return(-floor(-arg1/arg2)) else return(floor(arg1/arg2)) end if elsif id=4 then --multiply return(overflow_int32(arg1*arg2)) elsif id=5 then --subtract return(overflow_int32(arg1-arg2)) elsif id=6 then --add return(overflow_int32(arg1+arg2)) elsif id=7 then --xor return(overflow_int32(xor_bits(arg1,arg2))) elsif id=8 then --or return(overflow_int32(or_bits(arg1,arg2))) elsif id=9 then --and return(overflow_int32(and_bits(arg1,arg2))) elsif id=10 then --equal return(arg1=arg2) elsif id=11 then --notequal return(arg1!=arg2) elsif id=12 then --lessthan return(arg1<arg2) elsif id=13 then --greaterthan return(arg1>arg2) elsif id=14 then --lessthanorequalto return(arg1<=arg2) elsif id=15 then --greaterthanorequalto return(arg1>=arg2) elsif id=19 then --not return(arg1=0) elsif id=22 then --logxor return((arg1=0)!=(arg2=0)) elsif id=23 then --abs return(overflow_int32(abs(arg1))) elsif id=24 then --sign return((arg1>0)-(arg1<0)) elsif id=25 then --sqrt if arg1<0 then src_error(sprintf_utf("Found the expression (or equivalent) "&COLYEL&"sqrt(%d)"&COLRED,{arg1}),tree[TREE_TRUNK][CMD_POS]) end if return(floor(sqrt(arg1)+0.5)) end if end if --have to allow for side effects of shortcut evaluating operators by only checking first arg if atom(arg1) then if id=4 then --multiply if arg1=0 then return 0 elsif arg1=1 then return arg2 end if elsif id=6 then --add if arg1=0 then return arg2 end if elsif id=20 then --logand if arg1=0 then return 0 end if elsif id=21 then --logor if arg1!=0 then return 1 end if end if end if if atom(arg2) then if id=4 then --multiply if arg2=0 then return 0 elsif arg2=1 then return arg1 end if elsif id=5 then --subtract if arg2=0 then return arg1 end if elsif id=6 then --add if arg2=0 then return arg1 end if end if end if end if return({}) --return nonatom end function --------------------------------------------------------------------------- --goes through a script simplifying expressions that always have the same value function optimize_script(NodeList tree, VarList vars) object arg for i=1 to length(tree) do arg=optimized_arg(tree[i],vars) if atom(arg) then tree[i][TREE_TRUNK][CMD_TEXT]=sprintf("%d",arg) tree[i][TREE_BRANCHES]={} elsif sequence(arg) and length(arg) then tree[i]=arg end if if length(tree[i][TREE_BRANCHES]) then --if there are sub-arguments, recurse tree[i][TREE_BRANCHES]=optimize_script(tree[i][TREE_BRANCHES],vars) end if end for return(tree) end function --------------------------------------------------------------------------- --FIXME: if a local is used only in a subscript it is reported as unused procedure warn_unused_locals(VarList vars) for i=1 to length(vars) do if vars[i][VAR_FRAME]=0 and not map:has(used_locals,vars[i][VAR_ID]) then src_warn(sprintf_utf("local variable "&COLYEL&"%s"&COLRED&" is never used",{vars[i][CMD_TEXT]}),vars[i][CMD_POS]) end if end for end procedure --------------------------------------------------------------------------- procedure warn_unused_globals() integer id for i=1 to length(global_list[PAIR_NUM]) do id=global_list[PAIR_NUM][i] if not map:has(used_globals,id) then src_warn(sprintf_utf("global variable "&COLYEL&"%s"&COLRED&" ID "&COLYEL&"%d"&COLRED&" is never used",{global_list[PAIR_NAME][i],id}),global_list[GLB_POS][i]) end if end for end procedure --------------------------------------------------------------------------- --script_idx is all_scripts index procedure compile_a_script(integer script_idx) Script script NodeList script_tree VarList local_vars TokenList args integer num_nonlocals integer parent_idx integer trigger_idx integer trigger cur_script_num=script_idx script=all_scripts[script_idx] trigger_idx=find_in_column(script[SCRIPT_TRIGGER_TOK][CMD_TEXT],trigger_list,PAIR_NAME) trigger=trigger_list[trigger_idx][PAIR_NUM] current_script=script[SCRIPT_FULL_NAME] parent_idx=script[SCRIPT_PARENT_IDX] reenter_timing_zone("gather_local_vars") if parent_idx then local_vars=shift_variables_frame(all_scripts[parent_idx][SCRIPT_VARIABLES]) else local_vars={} end if num_nonlocals=length(local_vars) script&={ trigger --trigger type ,num_nonlocals ,local_vars } all_scripts[script_idx]=script --Get all locals (modifies all_scripts[cur_script_num]) args=script[SCRIPT_ARG_TOKS] --The arguments are the first several local variables for i=1 to length(args) do --Checks for name validity add_local_variable(args[i]) end for gather_local_vars(script[SCRIPT_BODY_TOKS]) --script[SCRIPT_VARIABLES] is now outdated script=all_scripts[script_idx] local_vars=script[SCRIPT_VARIABLES] if length(local_vars)-num_nonlocals > MAX_LOCALS then simple_error( sprintf_utf("Script "&COLYEL&"%s"&COLRED&" has %d arguments and variables but at most %d are supported", {script[SCRIPT_FULL_NAME],length(local_vars)-num_nonlocals,MAX_LOCALS}) ) end if exit_timing_zone() used_locals=map:new() reenter_timing_zone("build_ast") script_tree=build_ast(script[SCRIPT_BODY_TOKS],local_vars) exit_timing_zone() reenter_timing_zone("convert_operators") script_tree=convert_operators(script_tree) exit_timing_zone() reenter_timing_zone("expand_macros") script_tree=expand_macros(script_tree) exit_timing_zone() reenter_timing_zone("normalize_flow_control") script_tree=normalize_flow_control(script_tree,local_vars,"") exit_timing_zone() reenter_timing_zone("normalize_arguments") script_tree=normalize_arguments(script_tree,local_vars) exit_timing_zone() reenter_timing_zone("collapse_floaty_brackets") script_tree=collapse_floaty_brackets(script_tree) exit_timing_zone() if not fast_mode then reenter_timing_zone("optimize_script") script_tree=optimize_script(script_tree,local_vars) exit_timing_zone() reenter_timing_zone("sanity_check") script_tree=sanity_check(script_tree,local_vars,"") exit_timing_zone() if find('u',optlist) then warn_unused_locals(local_vars) end if end if script&={ script_tree ,{} --compiled data to go into the HSZ lumps } all_scripts[script_idx]=script reenter_timing_zone("binary_compile") all_scripts[script_idx][SCRIPT_BINARY]=binary_compile(script) exit_timing_zone() current_script="" cur_script_num=0 end procedure --------------------------------------------------------------------------- --Parse argument declarations (initially containing default values) --in a script argument list, returns {arg_names_list, default_vals_list} pair --where any default values have been removed from the argument names function process_arglist(TokenList args) TokenList arglist sequence defaults integer i integer using_defaults arglist={} defaults={} using_defaults=false i=1 while i<=length(args) do if compare("=",args[i][CMD_TEXT])=0 then src_error("Syntax error: spurious = in script argument list",args[i][CMD_POS]) end if check_undefined_string(args[i],"argument name") if find(args[i][CMD_TEXT],column(arglist,CMD_TEXT)) then src_error(sprintf_utf("Multiple script arguments named "&COLYEL&"%s"&COLRED&" in argument list",{args[i][CMD_TEXT]}),args[i][CMD_POS]) end if if i<length(args) and compare("=",args[i+1][CMD_TEXT])=0 then if i+2>length(args) then src_error(sprintf_utf("Syntax error: expected default value for script argument "&COLYEL&"%s"&COLRED,{args[i][CMD_TEXT]}),args[i+1][CMD_POS]) end if arglist=append(arglist,args[i]) defaults&=try_string_to_number({enforce_constants(args[i+2][CMD_TEXT]),args[i+2][CMD_POS]}) using_defaults=true i+=3 else if using_defaults=true then --any argument with a default must be followed with arguments with defaults src_error(sprintf_utf("Default value missing for argument "&COLYEL&"%s"&COLRED&": the previous argument had a default value, so all following ones must too",{args[i][CMD_TEXT]}),args[i][CMD_POS]) end if arglist=append(arglist,args[i]) defaults=append(defaults,NO_DEFAULT) i+=1 end if end while return({arglist,defaults}) end function --------------------------------------------------------------------------- --Check that every declared script has been defined, with the same number of arguments, --and each script appears exactly once. Basically, compare script_list and all_scripts --Also handle argument default values. --Before this function, scripts only have IDs (stored in script_list) if they are declared in a definescript. --Afterwards all scripts have IDs assigned (in all_scripts) procedure check_script_declarations() integer at sequence args_and_defaults, args, defaults sequence scripts_found sequence src_position, attributes string fullname srcpos script_pos integer id, parent_id scripts_found=repeat(false,length(script_list)) for i=1 to length(all_scripts) do fullname=all_scripts[i][SCRIPT_FULL_NAME] parent_id=get_parent_id(all_scripts[i]) current_script=fullname --global script_pos=all_scripts[i][SCRIPT_POS] attributes=all_scripts[i][SCRIPT_ATTRIBUTES] --Attributes get copied to script_list, where they belong --at=find(fullname,column(script_list,PAIR_NAME)) at=find_in_column(fullname,script_list,PAIR_NAME) args_and_defaults=process_arglist(all_scripts[i][SCRIPT_ARG_TOKS]) args=args_and_defaults[1] defaults=args_and_defaults[2] if not at then --define script optional, add to script list id=pick_script_id(parent_id,all_scripts[i][PAIR_NAME]) script_list=create_function(script_list,id,current_script,defaults,KIND_SCRIPT,script_pos,attributes) at=length(script_list) scripts_found&={true} else if scripts_found[at]=true then simple_error(sprintf_utf("Compiler bug; two scripts named %s but this error not handled properly",{current_script})) end if scripts_found[at]=true --The srcpos stored for this script will be from definescript. Repoint it at the actual script --(doesn't really matter, all_scripts would have ended up with that srcpos anyway) script_list[at][FUNC_POS]=script_pos script_list[at][FUNC_ATTRS]&=attributes if length(args) != length(script_list[at][FUNC_ARGS]) then src_error( sprintf_utf( "Script "&COLYEL&"%s"&COLRED&" has %d arguments named, but has %d arguments in its definescript declaration" ,{current_script,length(args),length(script_list[at][FUNC_ARGS])} ) ,script_pos ) end if for j=1 to length(script_list[at][FUNC_ARGS]) do if not equal(defaults[j],NO_DEFAULT) and not equal(defaults[j],script_list[at][FUNC_ARGS][j]) then src_error( sprintf_utf( "Default value for argument "&COLYEL&"%s"&COLRED&" does not match default given in script definition, %d" ,{all_scripts[i][SCRIPT_ARG_TOKS][j][CMD_TEXT],script_list[at][FUNC_ARGS][j]} ) ,script_pos ) end if end for end if if length(args)>MAX_ARGS then src_error(sprintf_utf("Number of arguments (%d) is more than allowed (%d)",{length(args),MAX_ARGS}),script_pos) end if all_scripts[i][SCRIPT_ARG_TOKS]=args all_scripts[i]&={script_list[at][PAIR_NUM]} --append SCRIPT_ID end for current_script="" for i=1 to length(scripts_found) do if scripts_found[i]=false then src_error( sprintf_utf("Script "&COLYEL&"%s"&COLRED&" was declared using "&COLYEL&"define script"&COLRED&" but is missing.",{script_list[i][PAIR_NAME]}) ,script_list[i][FUNC_POS] ) end if end for end procedure --------------------------------------------------------------------------- procedure compile_each_script() sequence count enter_timing_zone("Compiling scripts") color_print("compiling scripts",{}) count=repeat(0,length(file_list)) enter_timing_zone("check_script_declarations") check_script_declarations() exit_timing_zone() for i=1 to length(all_scripts) do -- color_print("%s\n",{all_scripts[i][SCRIPT_NAME]}) reenter_timing_zone("compile_a_script") compile_a_script(i) exit_timing_zone() count[srcpos_file_number(all_scripts[i][SCRIPT_POS])]+=1 color_print(".",{}) end for color_print("\n",{}) if not fast_mode and find('u',optlist) then warn_unused_globals() end if for i=1 to length(count) do if count[i] then wrap_print("compiled %d scripts from "&COLBWHI&"%s"&COLWHI&"\n",{count[i],file_list[i][FILE_NAME]}) end if end for exit_timing_zone() end procedure --------------------------------------------------------------------------- function generate_scripts_dot_txt() sequence result integer print_scripts result="" print_scripts = 0 if find('a',optlist) then print_scripts = 2 elsif find('s',optlist) then print_scripts = 1 end if for i=1 to length(script_list) do result&=encode_ohr(sprintf_utf("%s\r\n%d\r\n%d\r\n",{script_list[i][PAIR_NAME],script_list[i][PAIR_NUM],length(script_list[i][FUNC_ARGS])})) for j=1 to length(script_list[i][FUNC_ARGS]) do if equal(script_list[i][FUNC_ARGS][j],NO_DEFAULT) then result&="0\r\n" else result&=sprintf("%d\r\n",{script_list[i][FUNC_ARGS][j]}) end if end for if (print_scripts=1 and script_list[i][PAIR_NUM]<autonumber_id) or print_scripts=2 then wrap_print("%s\r\n",{script_list[i][PAIR_NAME]}) end if end for return(result) end function --------------------------------------------------------------------------- function generate_scripts_dot_bin() sequence result sequence encoded_name --header size in bytes result=output_word(4) --record size in bytes result&=output_word(44) for i=1 to length(all_scripts) do result&=output_word(all_scripts[i][SCRIPT_ID]) result&=output_word(all_scripts[i][SCRIPT_TRIGGER_ID]) --trigger type encoded_name=encode_ohr(all_scripts[i][SCRIPT_NAME],36) result&=output_word(length(encoded_name)) result&=encoded_name result&=repeat(0,36-length(encoded_name)) result&=output_word(get_parent_id(all_scripts[i])) end for return(result) end function --------------------------------------------------------------------------- --Returns a list of {id, {parent_id, encode_ohr(script_name)[1..36]}} pairs function read_scripts_dot_bin(string filename) object data integer header_size integer rec_size integer offset sequence record, name integer id, parent_id integer name_len sequence ret data=read_file(filename) if equal(data,-1) then simple_error("Could not read "&filename) end if header_size=read_int16(data,0) rec_size=read_int16(data,1) if header_size!=4 or rec_size<42 then simple_error(filename&" in unrecognised format. Use the latest version of HSpeak") end if ret={} for i=0 to (length(data)-header_size) / rec_size - 1 do offset=header_size + i*rec_size record=data[offset+1..offset+rec_size] id=read_int16(record,0) name_len=read_int16(record,2) name=record[7..7+name_len-1] parent_id=0 if rec_size>=44 then --Scripts compiled by older copies of hspeak didn't include the parent ID parent_id=read_int16(record,21) end if ret&={ {id, {parent_id,name}} } end for --pretty_print(stdout,ret,{3}) return ret end function --------------------------------------------------------------------------- function generate_commands_dot_bin() sequence result sequence encoded_names integer i, id, name_num integer offset integer records function_list=sort(function_list) if length(function_list) then records=function_list[$][PAIR_NUM]+1 else records=0 end if if records>2048 then simple_warn(sprintf_utf("Maximum function id was %d, but will not export function names above id 2047",{records-1})) records=2048 end if --header size in bytes (offset to location table) result=output_word(6) --file format version result&=output_word(0) --number of records in location table result&=output_word(records) offset=length(result)+records*2 id=0 i=1 encoded_names={} while i<=length(function_list) do if function_list[i][PAIR_NUM]<id then --skip aliases i+=1 elsif function_list[i][PAIR_NUM]=id then result&=output_word(offset) encoded_names&={encode_ohr(function_list[i][PAIR_NAME])} offset+=4+length(encoded_names[$]) i+=1 id+=1 else --no function with this id result&=output_word(0) id+=1 end if end while i=1 name_num=1 id=-1 while i<=length(function_list) and function_list[i][PAIR_NUM]<records do if function_list[i][PAIR_NUM]=id then --skip aliases i+=1 continue end if if equal(function_list[i][FUNC_ARGS],VAR_ARGS) then result&=output_word(-1) else result&=output_word(length(function_list[i][FUNC_ARGS])) end if result&=output_word(length(encoded_names[name_num])) result&=encoded_names[name_num] name_num+=1 id=function_list[i][PAIR_NUM] i+=1 end while return(result) end function --------------------------------------------------------------------------- procedure write_script_files(integer fh) string fname string temp, ext --because script files with identical names in different directories can be --included, append a number to lump names which would be duplicate integer suffix sequence names_used names_used={} for i=1 to length(file_list) do fname=file_only(file_list[i][FILE_NAME]) ext=extension_only(fname) --Lump names can be at most 50 characters, so trim if longer if length(fname)>47 then temp=without_extension(fname) fname=temp[1..46-length(ext)]&"."&ext end if suffix=count(fname,names_used) names_used=append(names_used,fname) if suffix>0 then temp=without_extension(fname) --you can't slice a function result directly! :( fname=sprintf_utf("%s~%d.%s",{temp[1..$-1],suffix,ext}) end if file_list[i][FILE_LUMP]=fname --color_print("writing %d %s\n",{i,fname}) if write_lump(fh,fname,file_list[i][FILE_TEXT])=false then simple_error("unable to write a script source lump") end if end for end procedure --------------------------------------------------------------------------- function generate_srcfiles_dot_txt() sequence result result="" for i=1 to length(file_list) do result&=sprintf_utf("file=%s\nlump=%s\noffset=%d\nlength=%d\n", { absolutize_path(file_list[i][FILE_NAME]), file_list[i][FILE_LUMP], file_list[i][FILE_OFFSET], length(file_list[i][FILE_TEXT]) }) end for return(result) end function --------------------------------------------------------------------------- procedure write_output_file() sequence header integer fh object lh enter_timing_zone("Writing output file") header=sprintf("HamsterSpeak\n%s\n%d\n%s\n%d\n%d\n",{COMPILER_VERSION & COMPILER_SUB_VERSION,HSP_FORMAT_VERSION,plotscr_version,HSZ_FORMAT_VERSION,max_used_function}) if length(all_scripts) then fh=open(dest_file,"wb") dest_file_fh=fh if fh!=-1 then wrap_print("writing output file "&COLBWHI&"%s"&COLWHI&"\n",{dest_file}) --write header and version if write_lump(fh,"HS",header)=false then simple_error("unable to write header") end if --write script index (old file) if write_lump(fh,"scripts.txt",generate_scripts_dot_txt())=false then simple_error("unable to write script index") end if --write script index (new file) if write_lump(fh,"scripts.bin",generate_scripts_dot_bin())=false then simple_error("unable to write binary script index") end if --write script commands list if write_lump(fh,"commands.bin",generate_commands_dot_bin())=false then simple_error("unable to write commands listing") end if --write each script for i=1 to length(all_scripts) do if write_lump(fh,sprintf("%d.hsz",{all_scripts[i][SCRIPT_ID]}),all_scripts[i][SCRIPT_BINARY])=false then simple_error(sprintf_utf("internal problem: unable to write script "&COLYEL&"%s"&COLRED,{all_scripts[i][SCRIPT_NAME]})) end if end for if debug_info then --write copy of scripts wrap_print("copying script source code into "&COLBWHI&"%s"&COLWHI&"\n",{dest_file}) lh=begin_lump(fh,"source.lumped") if equal(lh,false) then simple_error("unable to write scripts source file") end if write_script_files(fh) if end_lump(lh)=false then simple_error("unable to finish write of scripts source file") end if --write source file table (must occur after write_script_files) if write_lump(fh,"srcfiles.txt",generate_srcfiles_dot_txt())=false then simple_error("unable to write source files listing") end if end if close(fh) else simple_error(sprintf_utf("attempt to open "&COLYEL&"%s"&COLRED&" failed",{dest_file})) end if else color_print("no scripts to output\n",{}) end if exit_timing_zone() end procedure --------------------------------------------------------------------------- init() load_and_lex_all() show_source_info() preliminary_parse_pass() parse_top_level() compile_each_script() dump_debug_report() write_output_file() run_time=time()-start_time color_print("done (%g seconds)\n",{run_time}) if find('t',optlist) then print_timing_data() end if opt_wait_for_key(was_warnings) if was_warnings = true then abort(2) end if