include std/machine.e
include std/search.e
include euphoria/unicode.e
-- File modes accepted by the ##as_text## parameter of read_file() and write_file().
public enum
	BINARY_MODE, -- data is passed through byte for byte
	TEXT_MODE,   -- text; write_file() opens the file in "w" mode for this value
	UNIX_TEXT,   -- text; write_file() emits {10} line endings
	DOS_TEXT     -- text; write_file() emits {13,10} line endings
-- Encodings accepted by the ##encoding## parameter of read_file() and write_file().
public enum
	-- Bytes are used as they are, with no decoding.
	ANSI,
	-- UTF: read_file() works out the actual encoding from the data.
	UTF, UTF_8,
	-- 16-bit forms: BOM-detected, explicit big-endian, explicit little-endian.
	UTF_16, UTF_16BE, UTF_16LE,
	-- 32-bit forms: BOM-detected, explicit big-endian, explicit little-endian.
	UTF_32, UTF_32BE, UTF_32LE
--**
-- Read the contents of a file as a single sequence of bytes.
--
-- Parameters:
-- # ##file## : an object, either a file path or the handle to an open file.
-- # ##as_text## : integer, **BINARY_MODE** (the default) assumes //binary mode// that
-- causes every byte to be read in,
-- and **TEXT_MODE** assumes //text mode// that ensures that
-- lines end with just a Ctrl-J (NewLine) character,
-- and the first byte value of 26 (Ctrl-Z) is interpreted as End-Of-File.
-- # ##encoding##: An integer. One of ANSI, UTF, UTF_8, UTF_16, UTF_16BE,
-- UTF_16LE, UTF_32, UTF_32BE, UTF_32LE. The default is ANSI.
--
-- Returns:
-- A **sequence**, holding the entire file, or the integer -1 if the file could not be opened.
--
-- Comments:
-- * When using BINARY_MODE, each byte in the file is returned as an element in
-- the return sequence.
-- * When not using BINARY_MODE, the file will be interpreted as a text file. This
-- means that all line endings will be transformed to a single 0x0A character and
-- the first 0x1A character (Ctrl-Z) will indicate the end of file (all data after this
-- will not be returned to the caller.)
-- * Text files are always returned as UTF_32 encoded files.
-- * Encoding ...
-- ** ANSI: no interpretation of the file data is done. All bytes are simply returned
-- as characters.
-- ** UTF: The file data is examined to work out which UTF encoding method was used
-- to create the file. If the file starts with a valid Byte Order Marker (BOM) it can
-- quickly decide between UTF_8, UTF_16 and UTF_32. For files without a BOM,
-- if the file is completely valid UTF_8 encoding then that is what is used. Failing
-- that, if there are no null bytes, then ANSI is assumed. Failing that, it is tested
-- for being a valid UTF_16 or UTF_32 format. As a last resort, it will be assumed to
-- be an ANSI file.
-- ** UTF_8: Any valid UTF_8 BOM is removed and the data is converted to UTF_32
-- format before returning. This means that if it contains any invalidly encoded
-- Unicode characters, they will be ignored.
-- ** UTF_16: Any valid UTF_16 BOM is removed and the data is converted to UTF_32
-- format before returning. This means that if it contains any invalidly encoded
-- Unicode characters, they will be ignored.
-- ** UTF_16LE: Any valid little-endian UTF_16 BOM is removed and the data is converted to UTF_32
-- format before returning. This means that if it contains any invalidly encoded
-- Unicode characters, they will be ignored.
-- ** UTF_16BE: Any valid big-endian UTF_16 BOM is removed and the data is converted to UTF_32
-- format before returning. This means that if it contains any invalidly encoded
-- Unicode characters, they will be ignored.
-- ** UTF_32: Any valid UTF_32 BOM is removed.
-- ** UTF_32LE: Any valid little-endian UTF_32 BOM is removed.
-- ** UTF_32BE: Any valid big-endian UTF_32 BOM is removed.
-- * If one of the UTF_32 encodings is supplied, invalid Unicode characters are
-- not stripped out but are returned in the file data.
--
-- Example 1:
--
-- data = read_file("my_file.txt")
-- -- data contains the entire contents of ##my_file.txt##
--
--
-- Example 2:
--
-- fh = open("my_file.txt", "r")
-- data = read_file(fh)
-- close(fh)
--
-- -- data contains the entire contents of ##my_file.txt##
--
--
-- Example 3:
--
-- data = read_file("my_file.txt", TEXT_MODE, UTF_8)
-- -- The UTF encoded contents of ##my_file.txt## is stored in 'data' as UTF_32
--
--
--
-- See Also:
-- [[:write_file]], [[:read_lines]]
public function read_file(object file, integer as_text = BINARY_MODE, integer encoding = ANSI)
	-- Reads all of ##file## (a path, or the handle of an already open file).
	--
	-- Returns -1 when the handle is negative (for a path: open() failed).
	-- In BINARY_MODE the bytes read are returned as they are. In any other
	-- mode the bytes are decoded according to ##encoding##, cut at the first
	-- Ctrl-Z (26) and given a trailing newline (10) if they lack one.
	--
	-- A file that was opened here is closed here; a handle passed in by the
	-- caller is left open.
	integer fn
	integer len
	sequence ret
	object temp
	atom adr

	if sequence(file) then
		fn = open(file, "rb")
	else
		fn = file
	end if
	if fn < 0 then return -1 end if

	-- Seek to the end to learn the size, rewind, then read byte by byte.
	temp = seek(fn, -1)
	len = where(fn)
	temp = seek(fn, 0)
	ret = repeat(0, len)
	for i = 1 to len do
		ret[i] = getc(fn)
	end for

	if sequence(file) then
		close(fn)
	end if

	ifdef WINDOWS then
		-- Remove any extra -1 (EOF) characters in case file
		-- had been opened in Windows 'text mode'.
		for i = len to 1 by -1 do
			if ret[i] != -1 then
				if i != len then
					ret = ret[1 .. i]
				end if
				exit
			end if
		end for
	end ifdef

	if as_text = BINARY_MODE then
		return ret
	end if

	-- Treat as a text file.
	-- The loop only exists so that a case can pick a more specific encoding
	-- and re-enter the switch with 'retry'.
	while 1 label "ChkEnc" do
		switch encoding do
			case ANSI then
				break

			case UTF_8 then
				if length(ret) >= 3 then
					if equal(ret[1..3], x"ef bb bf") then
						-- strip out any BOM that might be present.
						ret = ret[4..$]
					end if
				end if
				ret = toUTF(ret, utf_8, utf_32)

			case UTF_16 then
				if length(ret) < 2 then
					-- Too short to hold a BOM or a 16-bit unit: leave undecoded.
					-- An explicit loop exit avoids relying on what 'break' does
					-- inside a nested 'if' that is followed by 'retry'.
					exit "ChkEnc"
				end if
				if equal(ret[1 .. 2], x"fe ff") then
					encoding = UTF_16BE
				elsif equal(ret[1 .. 2], x"ff fe") then
					encoding = UTF_16LE
				elsif validate(ret, utf_16) = 0 then -- is valid
					encoding = UTF_16BE
				else
					encoding = UTF_16LE -- assume little-endian and retest.
				end if
				retry "ChkEnc"

			case UTF_16BE then
				if length(ret) >= 2 then
					if equal(ret[1 .. 2], x"fe ff") then
						ret = ret[3..$]
					end if
				end if
				-- Swap each byte pair so the little-endian code below can be reused.
				for i = 1 to length(ret) - 1 by 2 do
					temp = ret[i]
					ret[i] = ret[i+1]
					ret[i+1] = temp
				end for
				fallthru

			case UTF_16LE then
				if length(ret) >= 2 then
					if equal(ret[1 .. 2], x"ff fe") then
						ret = ret[3..$]
					end if
				end if
				adr = allocate(length(ret),1)
				poke(adr, ret)
				-- floor(): an odd trailing byte cannot form a 16-bit unit and
				-- the element count passed to peek2u must be a whole number.
				ret = peek2u({adr, floor(length(ret) / 2)})
				ret = toUTF(ret, utf_16, utf_32)

			case UTF_32 then
				if length(ret) < 4 then
					-- Too short to hold a BOM or a 32-bit unit: leave undecoded.
					exit "ChkEnc"
				end if
				if equal(ret[1 .. 4], x"00 00 fe ff") then
					encoding = UTF_32BE
				elsif equal(ret[1 .. 4], x"ff fe 00 00") then
					encoding = UTF_32LE
				elsif validate(ret, utf_32) = 0 then -- is valid
					encoding = UTF_32BE
				else
					encoding = UTF_32LE -- assume little-endian and retest.
				end if
				retry "ChkEnc"

			case UTF_32BE then
				if length(ret) >= 4 then
					if equal(ret[1 .. 4], x"00 00 fe ff") then
						ret = ret[5..$]
					end if
				end if
				-- Reverse each group of four bytes so the little-endian code
				-- below can be reused.
				for i = 1 to length(ret) - 3 by 4 do
					temp = ret[i]
					ret[i] = ret[i+3]
					ret[i+3] = temp
					temp = ret[i+1]
					ret[i+1] = ret[i+2]
					ret[i+2] = temp
				end for
				fallthru

			case UTF_32LE then
				if length(ret) >= 4 then
					-- The BOM is four bytes long, so four bytes must be compared;
					-- a two-element slice could never equal it.
					if equal(ret[1 .. 4], x"ff fe 00 00") then
						ret = ret[5..$]
					end if
				end if
				adr = allocate(length(ret),1)
				poke(adr, ret)
				ret = peek4u({adr, floor(length(ret) / 4)})

			case UTF then
				-- 1. A BOM settles the question. The 4-byte BOMs are tested
				--    first because the UTF_32LE one starts with the UTF_16LE one.
				if length(ret) >= 4 then
					if equal(ret[1 .. 4], x"ff fe 00 00") then
						encoding = UTF_32LE
						retry "ChkEnc"
					end if
					if equal(ret[1 .. 4], x"00 00 fe ff") then
						encoding = UTF_32BE
						retry "ChkEnc"
					end if
				end if
				if length(ret) >= 2 then
					if equal(ret[1 .. 2], x"ff fe") then
						encoding = UTF_16LE
						retry "ChkEnc"
					end if
					if equal(ret[1 .. 2], x"fe ff") then
						encoding = UTF_16BE
						retry "ChkEnc"
					end if
				end if
				if length(ret) >= 3 then
					if equal(ret[1 .. 3], x"ef bb bf") then
						encoding = UTF_8
						retry "ChkEnc"
					end if
				end if

				-- 2. No BOM: try UTF_8, then fall back on ANSI when there are no nulls.
				if validate(ret, utf_8) = 0 then
					encoding = UTF_8
					retry "ChkEnc"
				end if
				if find(0, ret) = 0 then
					-- No nulls, so assume ANSI
					exit "ChkEnc"
				end if

				-- 3. Try the data as it stands as 16-bit and then 32-bit units;
				--    these are the forms the LE cases decode.
				adr = allocate(length(ret), 1)
				poke(adr, ret)
				temp = peek2u({adr, floor(length(ret) / 2)})
				if validate(temp, utf_16) = 0 then
					encoding = UTF_16LE
					retry "ChkEnc"
				end if
				temp = peek4u({adr, floor(length(ret) / 4)})
				if validate(temp, utf_32) = 0 then
					encoding = UTF_32LE
					retry "ChkEnc"
				end if

				-- 4. Try byte-swapped copies. A swapped copy that validates means
				--    the file is big-endian, so the BE case (which swaps 'ret'
				--    itself before decoding) must be chosen.
				temp = ret
				for i = 1 to length(temp) - 1 by 2 do
					integer tmp = temp[i]
					temp[i] = temp[i+1]
					temp[i+1] = tmp
				end for
				poke(adr, temp)
				temp = peek2u({adr, floor(length(ret) / 2)})
				if validate(temp, utf_16) = 0 then
					encoding = UTF_16BE
					retry "ChkEnc"
				end if
				temp = ret
				for i = 1 to length(temp) - 3 by 4 do
					integer tmp = temp[i]
					temp[i] = temp[i+3]
					temp[i+3] = tmp
					tmp = temp[i+1]
					temp[i+1] = temp[i+2]
					temp[i+2] = tmp
				end for
				poke(adr, temp)
				temp = peek4u({adr, floor(length(ret) / 4)})
				if validate(temp, utf_32) = 0 then
					encoding = UTF_32BE
					retry "ChkEnc"
				end if
				-- assume ANSI at this point.
		end switch
		exit
	end while

	fn = find(26, ret) -- Any Ctrl-Z found?
	if fn then
		-- Ok, so truncate the file data
		ret = ret[1 .. fn - 1]
	end if

	-- Convert Windows endings
	-- (commented out because this is quadratic time and doesn't handle Mac lineendings anyway;
	-- so instead do lineending handling in split_lines)
	--ret = match_replace({13,10}, ret, {10})

	-- Make sure the text ends with a newline; an empty file becomes {10}.
	if length(ret) > 0 then
		if ret[$] != 10 then
			ret &= 10
		end if
	else
		ret = {10}
	end if
	return ret
end function
--**
-- Write a sequence of bytes to a file.
--
-- Parameters:
-- # ##file## : an object, either a file path or the handle to an open file.
-- # ##data## : the sequence of bytes to write
-- # ##as_text## : integer
-- ** **BINARY_MODE** (the default) assumes //binary mode// that
-- causes every byte to be written out as is,
-- ** **TEXT_MODE** assumes //text mode// that causes a NewLine
-- to be written out according to the operating system's
-- end of line convention. In Unix this is Ctrl-J and in
-- Windows this is the pair {Ctrl-M, Ctrl-J}.
-- ** **UNIX_TEXT** ensures that lines are written out with unix style
-- line endings (Ctrl-J).
-- ** **DOS_TEXT** ensures that lines are written out with Windows style
-- line endings {Ctrl-M, Ctrl-J}.
-- # ##encoding##: an integer. One of ANSI, UTF_8, UTF_16LE, UTF_16BE,
-- UTF_32LE, UTF_32BE. The default is ANSI.
-- # ##with_bom##: an integer. Either 0 or 1. If 1 then when encoding as a UTF
-- file, this will prepend a Byte Order Marker (BOM) to the
-- file output.
--
-- Returns:
-- An **integer**, 1 on success, -1 on failure.
--
-- Comments:
-- * UTF_16LE, and UTF_32LE create little-endian files, which are the normal ones
-- for Intel based CPUs. Big-endian files are more commonly found on Motorola CPUs.
--
-- Errors:
-- If [[:puts]] cannot write ##data##, a runtime error will occur.
--
-- Comments:
-- * When ##file## is a file handle, the file is not closed after writing is finished. When ##file## is a
-- file name, it is opened, written to and then closed.
-- * Note that when writing the file in any of the text modes, the file is truncated
-- at the first Ctrl-Z character in the input data.
--
-- Example 1:
--
-- if write_file("data.txt", "This is important data\nGoodbye") = -1 then
-- puts(STDERR, "Failed to write data\n")
-- end if
--
--
-- See Also:
-- [[:read_file]], [[:write_lines]]
public function write_file(object file, sequence data, integer as_text = BINARY_MODE, integer encoding = ANSI, integer with_bom = 1)
	-- Writes ##data## to ##file## (a path, or the handle of an open file).
	--
	-- Returns 1 after writing, or -1 when the handle is negative (for a
	-- path: open() failed). A file opened here is closed here; a handle
	-- supplied by the caller is left open.
	integer handle
	integer ctrl_z
	atom buf

	-- Step 1: text-mode clean-up.
	if as_text != BINARY_MODE then
		-- Anything from the first Ctrl-Z (26) onwards is dropped.
		ctrl_z = find(26, data)
		if ctrl_z then
			data = data[1 .. ctrl_z - 1]
		end if

		-- The output always ends with a newline (10).
		if length(data) = 0 then
			data = {10}
		elsif data[$] != 10 then
			data &= 10
		end if

		-- All three named text modes first collapse {13,10} to {10};
		-- DOS_TEXT then expands every {10} to {13,10}.
		if find(as_text, {TEXT_MODE, UNIX_TEXT, DOS_TEXT}) then
			data = match_replace({13,10}, data, {10})
			if as_text = DOS_TEXT then
				data = match_replace({10}, data, {13,10})
			end if
		end if
	end if

	-- Step 2: encode. Every UTF branch leaves ##data## as a byte sequence and
	-- switches to BINARY_MODE so the file is opened with "wb" below.
	switch encoding do
		case UTF_8 then
			data = toUTF(data, utf_32, utf_8)
			if with_bom = 1 then
				data = x"ef bb bf" & data
			end if
			as_text = BINARY_MODE

		case UTF_16LE, UTF_16BE then
			data = toUTF(data, utf_32, utf_16)
			-- Turn the 16-bit units into bytes by going through memory.
			buf = allocate( length(data) * 2, 1)
			poke2(buf, data)
			data = peek({buf, length(data) * 2})
			if encoding = UTF_16BE then
				-- Swap the two bytes of every unit.
				for at = 1 to length(data) - 1 by 2 do
					data[at .. at + 1] = {data[at + 1], data[at]}
				end for
				if with_bom = 1 then
					data = x"fe ff" & data
				end if
			elsif with_bom = 1 then
				data = x"ff fe" & data
			end if
			as_text = BINARY_MODE

		case UTF_32LE, UTF_32BE then
			-- Turn the 32-bit values into bytes by going through memory.
			buf = allocate( length(data) * 4, 1)
			poke4(buf, data)
			data = peek({buf, length(data) * 4})
			if encoding = UTF_32BE then
				-- Reverse the four bytes of every value.
				for at = 1 to length(data) - 3 by 4 do
					data[at .. at + 3] = {data[at + 3], data[at + 2], data[at + 1], data[at]}
				end for
				if with_bom = 1 then
					data = x"00 00 fe ff" & data
				end if
			elsif with_bom = 1 then
				data = x"ff fe 00 00" & data
			end if
			as_text = BINARY_MODE

		case else
			-- ANSI, or any unrecognised value: data is written unchanged.
	end switch

	-- Step 3: obtain a handle. Only TEXT_MODE opens the file in "w" mode.
	if atom(file) then
		handle = file
	elsif as_text = TEXT_MODE then
		handle = open(file, "w")
	else
		handle = open(file, "wb")
	end if
	if handle < 0 then
		return -1
	end if

	-- Step 4: write, and close only what was opened here.
	puts(handle, data)
	if sequence(file) then
		close(handle)
	end if
	return 1
end function