#define RELOADINTERNAL
#include "util.bi"
#include "reload.bi"
#include "cutil.bi"
#include "libxml/tree.bi"
#include "libxml/parser.bi"
using Reload
'How chug should interpret the text found directly inside an XML element
enum encoding_t
    encNone = 0      'ordinary text: leading/trailing whitespace is trimmed off
    encWS = 1        'text whose whitespace must be preserved exactly
    encBase64 = 2    'text is Base64 and gets decoded into binary content
end enum
declare function chug(node as xmlNodeptr, dc as DocPtr, encoded as encoding_t) as NodePtr
declare sub optimize(node as nodePtr)

'Result of looking up the RELOAD namespace URI in the input document. chug compares
'node->ns against this to recognise the special RELOAD-namespace elements and attributes.
dim shared reloadns as xmlNsPtr

'---------------------------------------------------------------------------
' Main program: read an XML file, convert it to a RELOAD tree, optimise the
' tree and write it out in binary form, printing timing and memory figures
' for each phase. Every failure exits with a non-zero status so that calling
' scripts can detect it.
'---------------------------------------------------------------------------
dim as string infile, outfile

xmlCheckVersion(LIBXML_VERSION)

infile = command(1)
outfile = command(2)

'Both filenames are mandatory
if infile = "" orelse outfile = "" then
    print "Usage:"
    print command(0) & " infile.xml outfile.rld"
    end 1
end if

'startTime is reset after every phase; realStart measures the whole run
dim as double startTime = Timer, realStart = Timer

dim xmlDoc as xmlDocPtr
xmlDoc = xmlReadFile(infile, 0, 0)
if xmlDoc = null then
    print "Could not read document!"
    end 1
end if

print "Loaded XML document in " & int((timer - starttime) * 1000) & " ms"
starttime = timer

dim rldDoc as Docptr
rldDoc = CreateDocument()

print "Memory usage: " & DocumentMemoryUsage(rldDoc)

dim xmlRoot as xmlNodeptr
xmlRoot = xmlDocGetRootElement(xmlDoc)
if xmlRoot = null then
    'chug dereferences its node argument, so a document without a root element cannot be converted
    print "Could not read document!"
    end 1
end if

reloadns = xmlSearchNsByHref(xmlDoc, xmlRoot, @"http://hamsterrepublic.com/ohrrpgce/RELOAD")

dim rldRoot as NodePtr
rldRoot = chug(xmlRoot, rldDoc, encNone)

print "Parsed XML document in " & int((timer - starttime) * 1000) & " ms"
print "Memory usage: " & DocumentMemoryUsage(rldDoc)
starttime = timer

'The libxml tree is no longer needed once the RELOAD tree has been built
xmlFreeDoc(xmlDoc)

print "Freed XML document in " & int((timer - starttime) * 1000) & " ms"
starttime = timer

optimize(rldRoot)

print "Optimised document in " & int((timer - starttime) * 1000) & " ms"
print "Memory usage: " & DocumentMemoryUsage(rldDoc)
starttime = timer

SetRootNode(rldDoc, rldRoot)
SerializeBin(outfile, rldDoc)

print "Serialized document in " & int((timer - starttime) * 1000) & " ms"
starttime = timer

FreeDocument(rldDoc)

print "Tore down memory in " & int((timer - starttime) * 1000) & " ms"
print "Finished in " & int((timer - realStart) * 1000) & " ms"
'Set a node's content to the binary data obtained by decoding a Base64 string.
'The decoding itself is done by base64_decode (the Base64 decoder in base64.c).
' this:    the RELOAD node that receives the decoded bytes
' encoded: NUL-terminated Base64 text; the caller in chug strips surrounding whitespace
'          first because the decoder does not accept it
'If decoding fails, an error is printed and the program exits with a non-zero status.
sub SetContent_base64(byval this as nodeptr, byval encoded as zstring ptr)
    dim inlen as integer = len(*encoded)

    'This does not compute the exact length (may overestimate); the decoder
    'reports the real length through outlen
    dim outlen as size_t = 3 * (inlen \ 4) + 2

    'Change to a string, then reserve enough space
    SetContent(this, NULL, outlen)  'An uninitialised binary blob

    if base64_decode(encoded, inlen, GetZString(this), @outlen) = 0 then
        print "Malformed Base64 string, decode failure after " & outlen & " bytes!"
        end 1
    end if

    'Now we set the length correctly
    ResizeZString(this, outlen)
    'optimize will still try to process this node, but that is harmless
end sub
'reload2xml slaps an iso-8859-1 (aka Latin 1) header on things, but libxml will parse it into
'unicode and feed us UTF8. Go back to Latin 1 to undo that mess (in the process, foil any attempts
'to create unicode RELOAD documents if the file was something other than iso-8859-1).
' this:    the RELOAD node whose content is set
' garbled: NUL-terminated UTF-8 text as delivered by libxml
'If the text cannot be expressed in Latin-1, a warning is printed and the raw UTF-8 bytes are
'stored instead. Any other conversion failure is fatal (non-zero exit status).
sub SetContent_utf8_garbage(byval this as nodeptr, byval garbled as zstring ptr)
    dim srclen as integer = len(*garbled)

    'Change to a string, then reserve enough space - length of the decoded string is less than or
    'equal to the length of the source string, so use that as estimation
    SetContent(this, NULL, srclen)  'An uninitialised binary blob

    dim outlen as integer = this->strSize
    dim inlen as integer = srclen
    'The return value is used as the number of bytes written; negative values signal errors
    outlen = UTF8Toisolat1(this->str, @outlen, garbled, @inlen)

    if outlen = -2 then
        print "Warning: this XML contains unicode not expressible in the Latin-1 encoding. Importing a string as raw UTF8"
        'NOTE(review): assumes SetContent reserved room for the terminating NUL as well - confirm
        *this->str = *garbled
        'The content is now the whole source string, so that is the length to keep.
        '(Previously the -2 error code itself was passed on to ResizeZString.)
        outlen = srclen
    elseif outlen < 0 then
        print "UTF8Toisolat1 unspecified failure!"
        end 1
    end if

    'Now we set the length correctly
    ResizeZString(this, outlen)
end sub
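'''' libxml-tree mini-documentation
'
'The following xml:
'
' <foo attr="value">bar <baz>more spam</baz></foo>
'
'is parsed by libxml to the following tree:
'
' ELEMENT:<name="foo", properties = {
'     ATTRIBUTE:<name="attr", children = {TEXT:<content="value">}>
' }, children = {
'     TEXT:<content="bar ">,
'     ELEMENT:<name="baz", children = {
'         TEXT:<content="more spam">}>
' }>
'
'where FOO:<a=b, c={...}> means an xmlNode of type XML_FOO_NODE where the value of a is b, and
'c points to a doubly linked list. content & name are "" if not specified, and children and
'properties are NULL if not specified.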
' Take a libxml node and create a RELOAD node based on it, recursing into its
' attributes and children.
'  node:    the libxml element, attribute or text node to convert (must not be NULL)
'  dc:      the RELOAD document in which the new nodes are created
'  encoded: how text directly inside the parent element is to be interpreted
' Returns the new RELOAD node, or NULL if the XML node produces nothing: blank text outside a
' whitespace-preserving element, processing instructions and unsupported node types.
' An unknown value for the RELOAD-namespace "encoding" attribute is fatal (non-zero exit status).
function chug(node as xmlNodeptr, dc as DocPtr, encoded as encoding_t) as NodePtr
    dim this as nodeptr  'stays NULL unless one of the cases below creates a node

    select case node->type
    case XML_ELEMENT_NODE, XML_ATTRIBUTE_NODE  'this is a container: either an element or an 'attribute="..."'
        dim child_enc as encoding_t = encNone
        dim nodename as zstring ptr = cast(zstring ptr, node->name)

        'create the RELOAD node
        if node->type = XML_ATTRIBUTE_NODE then
            'this is an attribute:
            'Except, RELOAD doesn't do attributes. So, we reserve @ for those
            this = CreateNode(dc, "@" & *nodename)
        else
            if node->ns = reloadns andalso *nodename = "_" then  'work around RELOAD supporting no-name nodes
                this = CreateNode(dc, "")
            elseif node->ns = reloadns andalso *nodename = "ws" then  'work around clobbering of whitespace
                this = CreateNode(dc, "$")  'this node will be squashed later
                child_enc = encWS
            else
                this = CreateNode(dc, *nodename)
            end if

            'take a look at the attributes
            dim cur_attr as xmlAttrPtr = node->properties
            do while cur_attr <> null
                dim ch as nodeptr
                if *cast(zstring ptr, cur_attr->name) = "encoding" andalso cur_attr->ns = reloadns then
                    'How terribly bothersome. Get the (TEXT) value of this attribute.
                    'An empty attribute value has no child text node, and a blank one makes
                    'chug return NULL, so both are treated as the encoding name "".
                    dim enc_name as string
                    if cur_attr->children <> null then
                        ch = chug(cur_attr->children, dc, encNone)
                        if ch <> null then
                            enc_name = GetString(ch)
                            FreeNode(ch)
                        end if
                    end if
                    if enc_name = "base64" then
                        child_enc = encBase64
                    else
                        print "Unknown encoding '" & enc_name & "'"
                        end 1
                    end if
                else
                    'an attribute node always yields a RELOAD node (see the attribute branch above)
                    ch = chug(cast(xmlNodePtr, cur_attr), dc, encNone)
                    'add the new child to the document tree
                    AddChild(this, ch)
                end if
                cur_attr = cur_attr->next
            loop
        end if

        'and the children
        dim cur_node as xmlNodePtr = node->children
        do while cur_node <> null
            'recurse to parse the children
            dim ch as nodeptr = chug(cur_node, dc, child_enc)
            'add the new child to the document tree; chug returns NULL for nodes that
            'produce nothing (e.g. the blank text between pretty-printed elements)
            if ch <> null then AddChild(this, ch)
            'move to the next child
            cur_node = cur_node->next
        loop

        'This is a hack to support the SerializeXML debugging option: an empty
        'whitespace-preserving element results in no child being appended above,
        'so give it an empty "$" child
        if child_enc = encWS andalso this->numChildren = 0 then
            AppendChildNode(this, "$", "")
        end if

    case XML_TEXT_NODE  'this is any text data - aka, the text between an element's tags
        'if the text node is blank, we don't care about it unless we're inside
        'a whitespace-preserving element
        if xmlIsBlankNode(node) = 0 orelse encoded = encWS then
            dim content as zstring ptr = node->content
            if encoded = encBase64 then
                'create a node with a special name
                this = CreateNode(dc, "$")  'to be squashed
                'Trim whitespace, which the decode library doesn't like
                SetContent_base64(this, trim(*content, any !" \t\n\r"))
            elseif encoded = encWS then
                'Preserve whitespace and string status
                this = CreateNode(dc, "$$")  'to be squashed
                SetContent_utf8_garbage(this, *content)
            elseif encoded = encNone then
                'and, set the content to the value of this node, less any padding of spaces, tabs or new lines
                this = CreateNode(dc, "$")  'to be squashed
                SetContent_utf8_garbage(this, trim(*content, any !" \t\n\r"))
            end if
        end if

    case XML_PI_NODE  'we don't support these.

    case else
        'Let's see, comments, CDATA sections, etc
        print "??? " & node->type
    end select

    return this
end function
'Every node converted from XML starts out holding a string. This pass walks the tree and
' - turns string content that is entirely a number into an integer or a float, and
' - folds the "$" / "$$" text wrapper children created by chug into their parent's content.
' node: root of the (sub)tree to process; it is modified in place
sub optimize(node as nodePtr)
    'Nodes named "$$" always keep their content as a string
    if NodeType(node) = rltString andalso NodeName(node) <> "$$" then
        'Basically, if the string can be parsed as a number, it will be. We need to back off a
        'little bit: FB will parse "1234 dots on the door!" as 1234, but that must stay a string.
        'The trick: appending a digit changes the parsed value only if the whole string was numeric.
        dim contents as string = GetString(node)
        dim as longint asInt = ValLng(contents)

        if contents = "0" orelse (asInt <> 0 andalso ValLng(contents & "1") <> asInt) then
            SetContent(node, asInt)
        else
            dim as double asFloat = Val(contents)
            if asFloat <> 0 andalso Val(contents & "1") <> asFloat then
                SetContent(node, asFloat)
            end if
        end if
    end if

    dim as nodeptr child, following
    child = FirstChild(node)
    do until child = null
        'remember the sibling now: the child may be freed below
        following = NextSibling(child)

        optimize(child)

        if NodeName(child) = "$" orelse NodeName(child) = "$$" then  'this is a text wrapper
            'figure out what kind of wrapper it is, hoist its value up a level
            'into this node, and then get rid of the wrapper
            dim squash as integer = -1
            select case NodeType(child)
            case rltInt
                SetContent(node, GetInteger(child))
            case rltFloat
                SetContent(node, GetFloat(child))
            case rltString
                SetContent(node, GetString(child))
            case rltNull  'uh... remove all content.
                SetContent(node)
            case else
                'any other kind of wrapper is left alone
                squash = 0
            end select
            if squash then FreeNode(child)
        end if

        child = following
    loop
end sub