include std/machine.e
include std/search.e
include euphoria/unicode.e

-- Line-handling modes accepted by the ##as_text## parameter of
-- read_file() and write_file().
public enum
	BINARY_MODE,
	TEXT_MODE,
	UNIX_TEXT,
	DOS_TEXT

-- Character encodings accepted by the ##encoding## parameter of
-- read_file() and write_file().
public enum
	ANSI,
	UTF,
	UTF_8,
	UTF_16,
	UTF_16BE,
	UTF_16LE,
	UTF_32,
	UTF_32BE,
	UTF_32LE,
	$

--**
-- Read the contents of a file as a single sequence of bytes.
--
-- Parameters:
--   # ##file## : an object, either a file path or the handle to an open file.
--   # ##as_text## : integer, **BINARY_MODE** (the default) assumes //binary mode// that
--       causes every byte to be read in,
--       and **TEXT_MODE** assumes //text mode// that ensures that
--       lines end with just a Ctrl-J (NewLine) character,
--       and the first byte value of 26 (Ctrl-Z) is interpreted as End-Of-File.
--   # ##encoding##: An integer. One of ANSI, UTF, UTF_8, UTF_16, UTF_16BE,
--       UTF_16LE, UTF_32, UTF_32BE, UTF_32LE. The default is ANSI.
--
-- Returns:
--   A **sequence**, holding the entire file, or the integer -1 if the file
--   could not be opened.
--
-- Comments:
-- * When using BINARY_MODE, each byte in the file is returned as an element in
--   the return sequence.
-- * When not using BINARY_MODE, the file will be interpreted as a text file. This
--   means that all line endings will be transformed to a single 0x0A character and
--   the first 0x1A character (Ctrl-Z) will indicate the end of file (all data after this
--   will not be returned to the caller.)
--   NOTE(review): in the current implementation the CR/LF to LF conversion is
--   commented out, so line endings are returned as found in the file; only the
--   Ctrl-Z truncation and a guaranteed trailing 0x0A are applied -- confirm
--   which behavior is intended.
-- * Text files are always returned as UTF_32 encoded files.
-- * Encoding ...
-- ** ANSI: no interpretation of the file data is done. All bytes are simply returned
--    as characters.
-- ** UTF: The file data is examined to work out which UTF encoding method was used
--    to create the file. If the file starts with a valid Byte Order Marker (BOM) it can
--    quickly decide between UTF_8, UTF_16 and UTF_32. For files without a BOM,
--    if the file is completely valid UTF_8 encoding then that is what is used. Failing
--    that, if there are no null bytes, then ANSI is assumed. Failing that, it is tested
--    for being a valid UTF_16 or UTF_32 format. As a last resort, it will be assumed to
--    be an ANSI file.
-- ** UTF_8: Any valid UTF_8 BOM is removed and the data is converted to UTF_32
--    format before returning. This means that if it contains any invalidly encoded
--    Unicode characters, they will be ignored.
-- ** UTF_16: Any valid UTF_16 BOM is removed and the data is converted to UTF_32
--    format before returning. This means that if it contains any invalidly encoded
--    Unicode characters, they will be ignored.
-- ** UTF_16LE: Any valid little-endian UTF_16 BOM is removed and the data is converted to UTF_32
--    format before returning. This means that if it contains any invalidly encoded
--    Unicode characters, they will be ignored.
-- ** UTF_16BE: Any valid big-endian UTF_16 BOM is removed and the data is converted to UTF_32
--    format before returning. This means that if it contains any invalidly encoded
--    Unicode characters, they will be ignored.
-- ** UTF_32: Any valid UTF_32 BOM is removed.
-- ** UTF_32LE: Any valid little-endian UTF_32 BOM is removed.
-- ** UTF_32BE: Any valid big-endian UTF_32 BOM is removed.
-- * If one of the UTF_32 encodings is supplied, invalid Unicode characters are
--   not stripped out but are returned in the file data.
--
-- Example 1:
--
--   data = read_file("my_file.txt")
--   -- data contains the entire contents of ##my_file.txt##
--
--
-- Example 2:
--
--   fh = open("my_file.txt", "r")
--   data = read_file(fh)
--   close(fh)
--
--   -- data contains the entire contents of ##my_file.txt##
--
--
-- Example 3:
--
--   data = read_file("my_file.txt", TEXT_MODE, UTF_8)
--   -- The UTF encoded contents of ##my_file.txt## is stored in 'data' as UTF_32
--
--
-- See Also:
--   [[:write_file]], [[:read_lines]]
public function read_file(object file, integer as_text = BINARY_MODE, integer encoding = ANSI)
	integer fn
	integer len
	sequence ret
	object temp
	atom adr

	-- Accept either a path (opened here, in binary mode) or an open handle.
	if sequence(file) then
		fn = open(file, "rb")
	else
		fn = file
	end if
	if fn < 0 then
		-- Could not open the file, or an invalid handle was supplied.
		return -1
	end if

	-- Find the size: move to the end, note the position, then rewind.
	temp = seek(fn, -1)
	len = where(fn)
	temp = seek(fn, 0)

	-- Read the raw content one byte at a time.
	ret = repeat(0, len)
	for i = 1 to len do
		ret[i] = getc(fn)
	end for

	-- Only close what was opened here; a caller-supplied handle stays open.
	if sequence(file) then
		close(fn)
	end if

	ifdef WINDOWS then
		-- Remove any extra -1 (EOF) characters in case file
		-- had been opened in Windows 'text mode'.
		-- If every element is -1 the result is trimmed to an empty sequence.
		while len > 0 and ret[len] = -1 do
			len -= 1
		end while
		ret = ret[1 .. len]
	end ifdef

	if as_text = BINARY_MODE then
		return ret
	end if

	-- Treat as a text file.
	-- The loop exists only so that 'retry' can re-enter the switch after a
	-- generic encoding (UTF, UTF_16, UTF_32) has been narrowed to a concrete one.
	while 1 label "ChkEnc" do
		switch encoding do
			case ANSI then
				break

			case UTF_8 then
				if length(ret) >= 3 then
					if equal(ret[1..3], x"ef bb bf") then
						-- strip out any BOM that might be present.
						ret = ret[4..$]
					end if
				end if
				ret = toUTF(ret, utf_8, utf_32)

			case UTF_16 then
				-- Pick a byte order from the BOM if there is one.
				if length(ret) >= 2 then
					if equal(ret[1 .. 2], x"fe ff") then
						encoding = UTF_16BE
					elsif equal(ret[1 .. 2], x"ff fe") then
						encoding = UTF_16LE
					else
						if validate(ret, utf_16) = 0 then -- is valid
							encoding = UTF_16BE
						else
							encoding = UTF_16LE -- assume little-endian and retest.
						end if
					end if
				else
					break
				end if
				retry "ChkEnc"

			case UTF_16BE then
				if length(ret) >= 2 then
					if equal(ret[1 .. 2], x"fe ff") then
						ret = ret[3..$]
					end if
				end if
				-- Swap each byte pair so the data can share the little-endian path.
				for i = 1 to length(ret) - 1 by 2 do
					temp = ret[i]
					ret[i] = ret[i+1]
					ret[i+1] = temp
				end for
				fallthru

			case UTF_16LE then
				if length(ret) >= 2 then
					if equal(ret[1 .. 2], x"ff fe") then
						ret = ret[3..$]
					end if
				end if
				-- Combine byte pairs into 16-bit units by going through memory.
				-- floor() keeps the unit count whole when the byte count is odd.
				-- NOTE(review): peek2u uses the host byte order; this path
				-- presumably assumes a little-endian machine -- confirm.
				adr = allocate(length(ret),1)
				poke(adr, ret)
				ret = peek2u({adr, floor(length(ret) / 2)})
				ret = toUTF(ret, utf_16, utf_32)

			case UTF_32 then
				-- Pick a byte order from the BOM if there is one.
				if length(ret) >= 4 then
					if equal(ret[1 .. 4], x"00 00 fe ff") then
						encoding = UTF_32BE
					elsif equal(ret[1 .. 4], x"ff fe 00 00") then
						encoding = UTF_32LE
					else
						if validate(ret, utf_32) = 0 then -- is valid
							encoding = UTF_32BE
						else
							encoding = UTF_32LE -- assume little-endian and retest.
						end if
					end if
				else
					break
				end if
				retry "ChkEnc"

			case UTF_32BE then
				if length(ret) >= 4 then
					if equal(ret[1 .. 4], x"00 00 fe ff") then
						ret = ret[5..$]
					end if
				end if
				-- Reverse each group of four bytes so the data can share the
				-- little-endian path.
				for i = 1 to length(ret) - 3 by 4 do
					temp = ret[i]
					ret[i] = ret[i+3]
					ret[i+3] = temp
					temp = ret[i+1]
					ret[i+1] = ret[i+2]
					ret[i+2] = temp
				end for
				fallthru

			case UTF_32LE then
				if length(ret) >= 4 then
					-- Compare all four bytes; a two-element slice can never
					-- equal the four-byte BOM.
					if equal(ret[1 .. 4], x"ff fe 00 00") then
						ret = ret[5..$]
					end if
				end if
				-- Combine byte quads into 32-bit units by going through memory.
				-- floor() keeps the unit count whole when the byte count is
				-- not a multiple of four.
				adr = allocate(length(ret),1)
				poke(adr, ret)
				ret = peek4u({adr, floor(length(ret) / 4)})

			case UTF then
				-- Auto-detect. A BOM, when present, decides immediately. The
				-- four-byte UTF_32 marks are tested before the two-byte UTF_16
				-- ones because x"ff fe 00 00" also starts with x"ff fe".
				if length(ret) >= 4 then
					if equal(ret[1 .. 4], x"ff fe 00 00") then
						encoding = UTF_32LE
						retry "ChkEnc"
					end if
					if equal(ret[1 .. 4], x"00 00 fe ff") then
						encoding = UTF_32BE
						retry "ChkEnc"
					end if
				end if
				if length(ret) >= 2 then
					if equal(ret[1 .. 2], x"ff fe") then
						encoding = UTF_16LE
						retry "ChkEnc"
					end if
					if equal(ret[1 .. 2], x"fe ff") then
						encoding = UTF_16BE
						retry "ChkEnc"
					end if
				end if
				if length(ret) >= 3 then
					if equal(ret[1 .. 3], x"ef bb bf") then
						encoding = UTF_8
						retry "ChkEnc"
					end if
				end if

				-- No BOM: fall back to content inspection.
				if validate(ret, utf_8) = 0 then
					encoding = UTF_8
					retry "ChkEnc"
				end if
				if find(0, ret) = 0 then
					-- No nulls, so assume ANSI
					exit "ChkEnc"
				end if

				-- Probe the bytes as little-endian 16-bit, then 32-bit, units.
				adr = allocate(length(ret), 1)
				poke(adr, ret)
				temp = peek2u({adr, floor(length(ret) / 2)})
				if validate(temp, utf_16) = 0 then
					encoding = UTF_16LE
					retry "ChkEnc"
				end if
				temp = peek4u({adr, floor(length(ret) / 4)})
				if validate(temp, utf_32) = 0 then
					encoding = UTF_32LE
					retry "ChkEnc"
				end if

				-- Probe a pair-swapped copy: a match means big-endian UTF_16.
				temp = ret
				for i = 1 to length(temp) - 1 by 2 do
					integer tmp = temp[i]
					temp[i] = temp[i+1]
					temp[i+1] = tmp
				end for
				poke(adr, temp)
				temp = peek2u({adr, floor(length(ret) / 2)})
				if validate(temp, utf_16) = 0 then
					encoding = UTF_16BE
					retry "ChkEnc"
				end if

				-- Probe a quad-reversed copy: a match means big-endian UTF_32.
				temp = ret
				for i = 1 to length(temp) - 3 by 4 do
					integer tmp = temp[i]
					temp[i] = temp[i+3]
					temp[i+3] = tmp
					tmp = temp[i+1]
					temp[i+1] = temp[i+2]
					temp[i+2] = tmp
				end for
				poke(adr, temp)
				temp = peek4u({adr, floor(length(ret) / 4)})
				if validate(temp, utf_32) = 0 then
					encoding = UTF_32BE
					retry "ChkEnc"
				end if
				-- assume ANSI at this point.
		end switch
		exit
	end while

	fn = find(26, ret) -- Any Ctrl-Z found?
	if fn then
		-- Ok, so truncate the file data
		ret = ret[1 .. fn - 1]
	end if

	-- Convert Windows endings
	-- (commented out because this is quadratic time and doesn't handle Mac lineendings anyway;
	-- so instead do lineending handling in split_lines)
	--ret = match_replace({13,10}, ret, {10})

	-- Text results always end with a NewLine.
	if length(ret) > 0 then
		if ret[$] != 10 then
			ret &= 10
		end if
	else
		ret = {10}
	end if

	return ret
end function

--**
-- Write a sequence of bytes to a file.
--
-- Parameters:
-- # ##file## : an object, either a file path or the handle to an open file.
-- # ##data## : the sequence of bytes to write
-- # ##as_text## : integer
-- ** **BINARY_MODE** (the default) assumes //binary mode// that
--    causes every byte to be written out as is,
-- ** **TEXT_MODE** assumes //text mode// that causes a NewLine
--    to be written out according to the operating system's
--    end of line convention. In Unix this is Ctrl-J and in
--    Windows this is the pair {Ctrl-M, Ctrl-J}.
-- ** **UNIX_TEXT** ensures that lines are written out with unix style
--    line endings (Ctrl-J).
-- ** **DOS_TEXT** ensures that lines are written out with Windows style
--    line endings {Ctrl-M, Ctrl-J}.
-- # ##encoding##: an integer. One of ANSI, UTF_8, UTF_16LE, UTF_16BE,
--    UTF_32LE, UTF_32BE. The default is ANSI. Any other value is
--    handled like ANSI.
-- # ##with_bom##: an integer. Either 0 or 1. If 1 (the default) then when
--    encoding as a UTF file, this will prepend a Byte Order Marker
--    (BOM) to the file output.
--
-- Returns:
-- An **integer**, 1 on success, -1 on failure.
--
-- Comments:
-- * UTF_16LE, and UTF_32LE create little-endian files, which are the normal ones
--   for Intel based CPUs. Big-endian files are more commonly found on Motorola CPUs.
--
-- Errors:
-- If [[:puts]] cannot write ##data##, a runtime error will occur.
--
-- Comments:
-- * When ##file## is a file handle, the file is not closed after writing is finished. When ##file## is a
--   file name, it is opened, written to and then closed.
-- * Note that when writing the file in any of the text modes, the file is truncated
--   at the first Ctrl-Z character in the input data.
-- * When one of the UTF encodings is used and ##file## is a file name, the
--   file is always opened in binary mode, whatever ##as_text## says.
--
-- Example 1:
--
--   if write_file("data.txt", "This is important data\nGoodbye") = -1 then
--       puts(STDERR, "Failed to write data\n")
--   end if
--
--
-- See Also:
-- [[:read_file]], [[:write_lines]]
public function write_file(object file, sequence data, integer as_text = BINARY_MODE, integer encoding = ANSI, integer with_bom = 1)
	integer handle
	integer eof_at
	integer unit_count
	atom buf
	sequence bom = {}

	if as_text != BINARY_MODE then
		-- Nothing from the first Ctrl-Z onwards is written.
		eof_at = find(26, data)
		if eof_at then
			data = data[1 .. eof_at - 1]
		end if

		-- The output must finish with a line-end marker.
		if length(data) = 0 then
			data = {10}
		elsif data[$] != 10 then
			data &= 10
		end if

		-- Bring every CR/LF pair down to a lone LF; DOS_TEXT then expands
		-- each LF back into a CR/LF pair.
		switch as_text do
			case TEXT_MODE, UNIX_TEXT then
				data = match_replace({13,10}, data, {10})

			case DOS_TEXT then
				data = match_replace({13,10}, data, {10})
				data = match_replace({10}, data, {13,10})
		end switch
	end if

	-- Turn the character data into the bytes of the requested encoding.
	-- Every UTF branch also forces binary mode for the later open().
	if encoding = UTF_8 then
		data = toUTF(data, utf_32, utf_8)
		bom = x"ef bb bf"
		as_text = BINARY_MODE

	elsif encoding = UTF_16LE or encoding = UTF_16BE then
		data = toUTF(data, utf_32, utf_16)
		-- Spread each 16-bit unit over two bytes by going through memory.
		unit_count = length(data)
		buf = allocate(unit_count * 2, 1)
		poke2(buf, data)
		data = peek({buf, unit_count * 2})
		if encoding = UTF_16BE then
			-- Exchange the two bytes of every unit.
			for i = 1 to length(data) - 1 by 2 do
				data[i .. i+1] = {data[i+1], data[i]}
			end for
			bom = x"fe ff"
		else
			bom = x"ff fe"
		end if
		as_text = BINARY_MODE

	elsif encoding = UTF_32LE or encoding = UTF_32BE then
		-- Spread each 32-bit unit over four bytes by going through memory.
		unit_count = length(data)
		buf = allocate(unit_count * 4, 1)
		poke4(buf, data)
		data = peek({buf, unit_count * 4})
		if encoding = UTF_32BE then
			-- Reverse the four bytes of every unit.
			for i = 1 to length(data) - 3 by 4 do
				data[i .. i+3] = {data[i+3], data[i+2], data[i+1], data[i]}
			end for
			bom = x"00 00 fe ff"
		else
			bom = x"ff fe 00 00"
		end if
		as_text = BINARY_MODE
	end if
	-- ANSI, and any unrecognised encoding, leaves the data untouched and the
	-- BOM empty.

	if with_bom = 1 then
		data = bom & data
	end if

	-- Use the caller's handle as is, or open the named file: text mode only
	-- for TEXT_MODE, binary mode for everything else.
	if atom(file) then
		handle = file
	elsif as_text = TEXT_MODE then
		handle = open(file, "w")
	else
		handle = open(file, "wb")
	end if

	if handle < 0 then
		return -1
	end if

	puts(handle, data)

	-- Only a file opened here is closed here.
	if sequence(file) then
		close(handle)
	end if

	return 1
end function