bl791/IFDB / utilities /ifarchivexml.py
bl791's picture
download
raw
15 kB
import xml.sax
import xml.sax.handler
"""ifarchivexml:
This module parses the Master-Index.xml file that is available at
<http://www.ifarchive.org/indexes/Master-Index.xml>.
You can use this module like this:
import ifarchivexml
(root, dirs, files) = ifarchivexml.parse('Master-Index.xml')
root is an IFDir object representing the root directory ('if-archive').
dirs is a dictionary mapping directory names ('if-archive/games', for
example) to IFDir objects. files is a dictionary mapping file pathnames
('if-archive/games/playgame.FAQ', for example) to IFFile objects.
You can display the contents of either an IFDir or IFFile object with
the obj.dump() method.
There is also a callback form:
ifarchivexml.parse_callback('Master-Index.xml', dirfunc=FUNC, filefunc=FUNC)
The parse_callback() function returns nothing; it calls the given functions
on each IFDir and IFFile as they are encountered. (You don't have to
supply both callbacks.) Master-Index.xml is conventionally created in
directory-tree order, so your callbacks will encounter parents before
children. In this mode, the parentobj and directoryobj fields of IFDir and
IFFile will not be set.
Dec 2019: Updated to Python 3; added sha512 and metadata fields.
Apr 2025: Added parentdesc field; support date and metadata fields for
directories; removed xdir field. Added the parse_callback() form.
"""
# Parser states: which element the SAX handler is currently inside.
# Every state needs its own value so the handlers can tell them apart.
CONTEXT_NONE = 0      # not inside a <directory> or <file> entry
CONTEXT_DIR = 1       # inside a top-level <directory> entry
CONTEXT_FILE = 2      # inside a top-level <file> entry
CONTEXT_DIRLINK = 3   # inside a file's <symlink type="dir">
CONTEXT_FILELINK = 4  # inside a file's <symlink> of any other type
CONTEXT_METADATA = 5  # inside the <metadata> of a file or directory
CONTEXT_METAITEM = 6  # inside an <item> of a <metadata> block
class IFDir:
    """One directory entry read from the index.

    The parser assigns name, parent, subdircount and filecount from the
    matching XML elements. parentobj is only assigned when the whole index
    is collected (not in callback mode), so dump() tolerates its absence.
    """

    # Optional fields; they stay None when the entry has no such element.
    description = None
    date = None
    rawdate = None
    metadata = None

    def __init__(self):
        # Child IFDir / IFFile objects.
        self.subdirs = []
        self.files = []
        # Maps the 'dir' attribute of each <parentdesc> to its text.
        self.parentdescs = {}

    def __repr__(self):
        return '<IFDir \'' + self.name + '\'>'

    def dump(self):
        """Print the contents of this directory entry to stdout."""
        # parentobj is not set in callback mode; show None rather than
        # failing with AttributeError.
        parentobj = getattr(self, 'parentobj', None)
        print('name: ', self.name)
        print('parent: ', self.parent, ('(' + str(parentobj) + ')'))
        print('subdircount:', self.subdircount)
        print('filecount: ', self.filecount)
        if self.metadata is not None:
            print('metadata:')
            for (key, valls) in self.metadata.items():
                print(' ', key + ':', ', '.join(valls))
        if self.description is not None:
            print('description:')
            print(self.description)
        for (key, desc) in self.parentdescs.items():
            print('parentdesc (from %s)' % (key,))
            print(desc)
        print('subdirs:')
        for subdir in self.subdirs:
            print(' ', str(subdir))
        print('files:')
        for file in self.files:
            print(' ', str(file))
class IFFile:
    """One file entry read from the index.

    The parser assigns path, name, directory and orderindex. symlinkname or
    symlinkpath is assigned when the entry carries a <symlink> element.
    directoryobj is only assigned when the whole index is collected (not in
    callback mode), so dump() tolerates its absence.
    """

    # Optional fields; they stay None when the entry has no such element.
    size = None
    date = None
    md5 = None
    sha512 = None
    rawdate = None
    symlink = None      # 'dir' or 'file' once a <symlink> has been seen
    metadata = None
    description = None

    def __init__(self):
        # Maps the 'dir' attribute of each <parentdesc> to its text.
        self.parentdescs = {}

    def __repr__(self):
        return '<IFFile \'' + self.path + '\'>'

    def dump(self):
        """Print the contents of this file entry to stdout."""
        # directoryobj is not set in callback mode; show None rather than
        # failing with AttributeError.
        directoryobj = getattr(self, 'directoryobj', None)
        print('path: ', self.path)
        print('name: ', self.name)
        print('directory: ', self.directory, ('(' + str(directoryobj) + ')'))
        if self.symlink == 'dir':
            print('symlink to dir:')
            print(' name: ', self.symlinkname)
        if self.symlink == 'file':
            print('symlink to file:')
            print(' path: ', self.symlinkpath)
        print('size: ', self.size)
        print('date: ', self.date)
        print('rawdate:', self.rawdate)
        print('md5: ', self.md5)
        print('sha512: ', self.sha512)
        print('orderindex:', self.orderindex)
        if self.metadata is not None:
            print('metadata:')
            for (key, valls) in self.metadata.items():
                print(' ', key + ':', ', '.join(valls))
        if self.description is not None:
            print('description:')
            print(self.description)
        for (key, desc) in self.parentdescs.items():
            print('parentdesc (from %s)' % (key,))
            print(desc)
class IFAParser(xml.sax.handler.ContentHandler):
    """SAX content handler that builds IFDir and IFFile objects.

    Without callbacks, finished entries are stored in self.directories
    (keyed by directory name) and self.files (keyed by file path), and are
    linked to each other when the <ifarchive> element closes.

    With callbacks (a (dirfunc, filefunc) pair), each finished entry is
    passed to the matching function instead and nothing is retained. A None
    entry in the pair means "no callback for that kind".
    """

    def __init__(self, callbacks=None):
        super().__init__()
        if not callbacks:
            self.callbackmode = False
            self.directories = {}
            self.files = {}
        else:
            self.callbackmode = True
            self.dircallback = callbacks[0]
            self.filecallback = callbacks[1]
            self.directories = None
            self.files = None
        # Character data accumulated since the last leaf element started.
        self.grabbeddata = ''
        self.curdir = None
        self.curfile = None
        # Metadata item being built ([key, value, ...]), or the 'dir'
        # attribute of the <parentdesc> currently being read.
        self.curitem = None
        self.curmetaowner = None
        # Running counter handed out to files in document order.
        self.orderindex = 0
        self.context = CONTEXT_NONE
        # Element name -> (start handler, end handler). Elements that are
        # not listed here are ignored.
        self.elements = {
            'ifarchive': (self.ignore_start, self.ifarchive_end),
            'directory': (self.directory_start, self.directory_end),
            'file': (self.file_start, self.file_end),
            'metadata': (self.metadata_start, self.metadata_end),
            'item': (self.item_start, self.item_end),
            'key': (self.grabdata_start, self.key_end),
            'value': (self.grabdata_start, self.value_end),
            'name': (self.grabdata_start, self.name_end),
            'filecount': (self.grabdata_start, self.filecount_end),
            'subdircount': (self.grabdata_start, self.subdircount_end),
            'parent': (self.grabdata_start, self.parent_end),
            'path': (self.grabdata_start, self.path_end),
            'size': (self.grabdata_start, self.size_end),
            'date': (self.grabdata_start, self.date_end),
            'rawdate': (self.grabdata_start, self.rawdate_end),
            'md5': (self.grabdata_start, self.md5_end),
            'sha512': (self.grabdata_start, self.sha512_end),
            'description': (self.grabdata_start, self.description_end),
            'parentdesc': (self.parentdesc_start, self.parentdesc_end),
            'symlink': (self.symlink_start, self.symlink_end),
        }

    # --- SAX entry points -------------------------------------------------

    def characters(self, data):
        self.grabbeddata = self.grabbeddata + data

    def startElement(self, name, attrs):
        handlers = self.elements.get(name)
        if handlers is None:
            return
        handlers[0](attrs)

    def endElement(self, name):
        handlers = self.elements.get(name)
        if handlers is None:
            return
        handlers[1]()

    # --- generic helpers --------------------------------------------------

    def ignore_start(self, attrs):
        pass

    def ignore_end(self):
        pass

    def grabdata_start(self, attrs):
        """Start collecting the text of a leaf element."""
        self.grabbeddata = ''

    def grabdata(self):
        """Return the text collected so far and reset the buffer."""
        dat = self.grabbeddata
        self.grabbeddata = ''
        return dat

    def _store_text(self, owner, field, convert=None):
        """Consume the grabbed text and store it on owner.field.

        The text is always consumed; it is converted and stored only when
        there is an owner to receive it.
        """
        text = self.grabdata()
        if owner is not None:
            if convert is not None:
                text = convert(text)
            setattr(owner, field, text)

    # --- <directory> and <file> -------------------------------------------

    def directory_start(self, attrs):
        if self.context == CONTEXT_NONE:
            self.curdir = IFDir()
            self.context = CONTEXT_DIR
        elif self.context == CONTEXT_FILE:
            # Inside a file entry, <directory> is a text leaf naming the
            # containing directory.
            self.grabdata_start(None)

    def directory_end(self):
        if self.context == CONTEXT_DIR:
            name = self.curdir.name
            if self.callbackmode:
                if self.dircallback is not None:
                    self.dircallback(self.curdir)
            else:
                self.directories[name] = self.curdir
            self.curdir = None
            self.context = CONTEXT_NONE
        elif self.context == CONTEXT_FILE:
            self._store_text(self.curfile, 'directory')

    def file_start(self, attrs):
        if self.context == CONTEXT_NONE:
            self.curfile = IFFile()
            self.context = CONTEXT_FILE

    def file_end(self):
        if self.context != CONTEXT_FILE:
            return
        path = self.curfile.path
        self.curfile.orderindex = self.orderindex
        self.orderindex = self.orderindex + 1
        if self.callbackmode:
            if self.filecallback is not None:
                self.filecallback(self.curfile)
        else:
            self.files[path] = self.curfile
        self.curfile = None
        self.context = CONTEXT_NONE

    # --- <metadata> / <item> / <key> / <value> ----------------------------

    def metadata_start(self, attrs):
        if self.context == CONTEXT_FILE:
            owner = self.curfile
        elif self.context == CONTEXT_DIR:
            owner = self.curdir
        else:
            return
        self.curmetaowner = owner
        owner.metadata = {}
        self.context = CONTEXT_METADATA

    def metadata_end(self):
        if self.context != CONTEXT_METADATA:
            return
        # Return to whichever kind of entry opened the metadata block.
        if self.curmetaowner is self.curfile:
            self.context = CONTEXT_FILE
        elif self.curmetaowner is self.curdir:
            self.context = CONTEXT_DIR
        else:
            raise Exception(
                'metadata block does not belong to the current file or directory')
        self.curmetaowner = None

    def item_start(self, attrs):
        if self.context == CONTEXT_METADATA:
            # Slot 0 holds the key; values are appended after it.
            self.curitem = [None]
            self.context = CONTEXT_METAITEM

    def item_end(self):
        if self.context != CONTEXT_METAITEM:
            return
        key = self.curitem[0]
        # Items without a key or without any value are dropped.
        if key and len(self.curitem) > 1:
            self.curmetaowner.metadata[key] = self.curitem[1:]
        self.curitem = None
        self.context = CONTEXT_METADATA

    def key_end(self):
        if self.context == CONTEXT_METAITEM:
            val = self.grabdata()
            if self.curitem is not None:
                self.curitem[0] = val

    def value_end(self):
        if self.context == CONTEXT_METAITEM:
            val = self.grabdata()
            if self.curitem is not None:
                self.curitem.append(val)

    # --- <symlink> --------------------------------------------------------

    def symlink_start(self, attrs):
        if self.context != CONTEXT_FILE:
            return
        if attrs['type'] == 'dir':
            self.context = CONTEXT_DIRLINK
            self.curfile.symlink = 'dir'
        else:
            self.context = CONTEXT_FILELINK
            self.curfile.symlink = 'file'

    def symlink_end(self):
        if self.context in (CONTEXT_DIRLINK, CONTEXT_FILELINK):
            self.context = CONTEXT_FILE

    # --- text leaf elements -----------------------------------------------

    def name_end(self):
        if self.context == CONTEXT_DIR:
            self._store_text(self.curdir, 'name')
        elif self.context == CONTEXT_FILE:
            self._store_text(self.curfile, 'name')
        elif self.context == CONTEXT_DIRLINK:
            self._store_text(self.curfile, 'symlinkname')

    def parent_end(self):
        if self.context == CONTEXT_DIR:
            self._store_text(self.curdir, 'parent')

    def subdircount_end(self):
        if self.context == CONTEXT_DIR:
            self._store_text(self.curdir, 'subdircount', int)

    def filecount_end(self):
        if self.context == CONTEXT_DIR:
            self._store_text(self.curdir, 'filecount', int)

    def path_end(self):
        if self.context == CONTEXT_FILE:
            self._store_text(self.curfile, 'path')
        elif self.context == CONTEXT_FILELINK:
            self._store_text(self.curfile, 'symlinkpath')

    def size_end(self):
        if self.context == CONTEXT_FILE:
            self._store_text(self.curfile, 'size', int)

    def date_end(self):
        if self.context == CONTEXT_FILE:
            self._store_text(self.curfile, 'date')
        elif self.context == CONTEXT_DIR:
            self._store_text(self.curdir, 'date')

    def rawdate_end(self):
        if self.context == CONTEXT_FILE:
            self._store_text(self.curfile, 'rawdate', int)
        elif self.context == CONTEXT_DIR:
            self._store_text(self.curdir, 'rawdate', int)

    def md5_end(self):
        if self.context == CONTEXT_FILE:
            self._store_text(self.curfile, 'md5')

    def sha512_end(self):
        if self.context == CONTEXT_FILE:
            self._store_text(self.curfile, 'sha512')

    def parentdesc_start(self, attrs):
        if self.context in (CONTEXT_DIR, CONTEXT_FILE):
            self.grabbeddata = ''
            # Remember which directory this description comes from.
            self.curitem = attrs['dir']

    def parentdesc_end(self):
        if self.context == CONTEXT_DIR:
            owner = self.curdir
        elif self.context == CONTEXT_FILE:
            owner = self.curfile
        else:
            return
        data = self.grabdata()
        if owner is not None:
            owner.parentdescs[self.curitem] = data
        self.curitem = None

    def description_end(self):
        if self.context == CONTEXT_DIR:
            self._store_text(self.curdir, 'description')
        elif self.context == CONTEXT_FILE:
            self._store_text(self.curfile, 'description')

    # --- end of document --------------------------------------------------

    def ifarchive_end(self):
        """Link collected entries to their parents (collect mode only)."""
        if self.callbackmode:
            return
        for dirobj in self.directories.values():
            parent = dirobj.parent
            if parent == '':
                # An empty parent name marks a directory with no parent.
                dirobj.parentobj = None
            else:
                dirobj.parentobj = self.directories[parent]
                dirobj.parentobj.subdirs.append(dirobj)
        for fileobj in self.files.values():
            fileobj.directoryobj = self.directories[fileobj.directory]
            fileobj.directoryobj.files.append(fileobj)
def parse(filename):
    """Parse a Master-Index.xml file and return its whole contents.

    Returns a (root, dirs, files) tuple: root is the IFDir stored under the
    name 'if-archive', dirs is the dictionary of all directories collected by
    the parser and files is the dictionary of all files.

    Raises KeyError if the index contains no 'if-archive' directory.
    """
    parser = IFAParser()
    # Binary mode lets the XML parser apply the encoding declared in the
    # document instead of the locale default; the with-block closes the file
    # even when parsing fails.
    with open(filename, 'rb') as fl:
        xml.sax.parse(fl, parser)
    rootdir = parser.directories['if-archive']
    return (rootdir, parser.directories, parser.files)
def parse_callback(filename, dirfunc=None, filefunc=None):
    """Parse a Master-Index.xml file, reporting entries through callbacks.

    dirfunc is called with each IFDir and filefunc with each IFFile as soon
    as the entry has been read. Either may be omitted. Returns nothing.
    """
    # Substitute no-op functions so the parser always has something to call.
    if not dirfunc:
        dirfunc = lambda obj: None
    if not filefunc:
        filefunc = lambda obj: None
    parser = IFAParser(callbacks=(dirfunc, filefunc))
    # Binary mode lets the XML parser apply the encoding declared in the
    # document instead of the locale default; the with-block closes the file
    # even when parsing or a callback fails.
    with open(filename, 'rb') as fl:
        xml.sax.parse(fl, parser)

Xet Storage Details

Size:
15 kB
·
Xet hash:
2188c45b868faeba14226ae2c7fb0900cfb97a9c91517eeeddca950b6afbba01

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.