|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import absolute_import |
|
|
from __future__ import print_function |
|
|
from __future__ import unicode_literals |
|
|
from itertools import islice |
|
|
import json |
|
|
import struct |
|
|
|
|
|
from . import dataio |
|
|
from . import filestructure |
|
|
from .dataio import dumpbytes |
|
|
from .dataio import Eof |
|
|
from .dataio import UINT32 |
|
|
from .tagids import HWPTAG_BEGIN |
|
|
from .tagids import tagnames |
|
|
from .utils import JsonObjects |
|
|
|
|
|
|
|
|
def tagname(tagid):
    ''' Return the name registered for `tagid' in `tagnames'.

    When `tagid' has no entry, a generated name of the form ``HWPTAG<n>``
    is returned, where ``<n>`` is the offset of `tagid' from `HWPTAG_BEGIN'.
    '''
    # The fallback is computed up front, exactly as the lookup default was.
    fallback = 'HWPTAG%d' % (tagid - HWPTAG_BEGIN)
    return tagnames.get(tagid, fallback)
|
|
|
|
|
|
|
|
def Record(tagid, level, payload, size=None, seqno=None):
    ''' Build the dictionary that represents a single record.

    :param tagid: numeric tag id of the record
    :param level: nesting level of the record
    :param payload: payload of the record
    :param size: payload size; defaults to ``len(payload)`` when omitted
    :param seqno: optional sequence number; the ``seqno`` key is present
        only when a value other than None is given
    :returns: a dict with the keys ``tagid``, ``tagname``, ``level``,
        ``size``, ``payload`` and, optionally, ``seqno``
    '''
    effective_size = len(payload) if size is None else size
    record = dict(tagid=tagid, tagname=tagname(tagid), level=level,
                  size=effective_size, payload=payload)
    if seqno is None:
        return record
    record['seqno'] = seqno
    return record
|
|
|
|
|
|
|
|
def decode_record_header(f):
    ''' Read and decode one record header from `f'.

    The header is a single UINT32 word: bits 0-9 hold the tag id,
    bits 10-19 the level and bits 20-31 the size.  A size field of 0xfff
    means that the real size follows as a second UINT32.

    :param f: stream handed to ``UINT32.read``
    :returns: a ``(tagid, level, size)`` tuple, or None when `Eof' is
        raised while reading either word
    '''
    try:
        word = UINT32.read(f)
        size = (word >> 20) & 0xfff
        if size == 0xfff:
            # Escape value: the actual size is stored in the next word.
            size = UINT32.read(f)
    except Eof:
        return None

    tagid = word & 0x3ff
    level = (word >> 10) & 0x3ff
    return (tagid, level, size)
|
|
|
|
|
|
|
|
def encode_record_header(rec):
    ''' Encode the header of the record `rec' as little-endian bytes.

    The size is always taken from ``len(rec['payload'])``.  Sizes below
    0xfff are packed into the upper 12 bits of a single 32-bit word;
    larger sizes put 0xfff there and append the size as a second 32-bit
    word.  `level' and `tagid' are combined as given, without masking.

    :param rec: mapping with the keys ``payload``, ``level`` and ``tagid``
    :returns: 4 or 8 bytes produced by ``struct.pack``
    '''
    payload_size = len(rec['payload'])
    level = rec['level']
    tagid = rec['tagid']

    if payload_size >= 0xfff:
        # Does not fit in 12 bits: emit the escape value plus the size.
        word = (0xfff << 20) | (level << 10) | tagid
        return struct.pack('<II', word, payload_size)

    word = (payload_size << 20) | (level << 10) | tagid
    return struct.pack('<I', word)
|
|
|
|
|
|
|
|
def read_record(f, seqno):
    ''' Read one record (header followed by payload) from `f'.

    :param f: stream to read from
    :param seqno: sequence number stored in the resulting record
    :returns: the record built by `Record', or None when no header could
        be decoded
    '''
    header = decode_record_header(f)
    if header is not None:
        tagid, level, size = header
        payload = dataio.readn(f, size)
        return Record(tagid, level, payload, size, seqno)
    return None
|
|
|
|
|
|
|
|
def dump_record(f, record):
    ''' Write `record' to `f': the encoded header first, then the payload.

    :param f: object with a ``write`` method
    :param record: mapping accepted by `encode_record_header' whose
        ``payload`` value is written out as is
    '''
    f.write(encode_record_header(record))
    f.write(record['payload'])
|
|
|
|
|
|
|
|
def read_records(f):
    ''' Generate the records read from `f', numbered from 0.

    Records are read one at a time with `read_record'; iteration ends at
    the first falsy result.
    '''
    seqno = 0
    record = read_record(f, seqno)
    while record:
        yield record
        # The next record is read only once the consumer asks for it.
        seqno += 1
        record = read_record(f, seqno)
|
|
|
|
|
|
|
|
def link_records(records):
    ''' Annotate each record with links to its parent and previous sibling.

    Records are expected in stream order, each carrying a ``level`` key.
    Every record is yielded after being updated in place:

    - a record one level deeper than its predecessor gets ``parent`` set
      to that predecessor;
    - a record at the same level as, or shallower than, its predecessor
      gets ``sister`` set to the nearest preceding record of the same
      level (found by following ``parent`` links upwards) and ``parent``
      set to that sister's parent (None when the sister has none);
    - the first record, and a record whose level cannot be reached that
      way (for instance one that skips a level), is left unlinked.

    :param records: iterable of record dicts
    :returns: generator yielding the same dict objects
    '''
    prev = None
    for rec in records:
        if prev is not None:
            level = rec['level']
            if level == prev['level'] + 1:
                # First child of the preceding record.
                rec['parent'] = prev
            else:
                # Climb from the preceding record towards the root until
                # reaching a record that is not deeper than `rec'.  Without
                # this walk a record following a deeper subtree would be
                # left without any links.
                node = prev
                while node is not None and node['level'] > level:
                    node = node.get('parent')
                if node is not None and node['level'] == level:
                    rec['sister'] = node
                    rec['parent'] = node.get('parent')
        yield rec
        prev = rec
|
|
|
|
|
|
|
|
def record_to_json(record, *args, **kwargs):
    ''' Convert a record to a JSON string.

    In the serialized form the payload is replaced by
    ``list(dumpbytes(payload))``.  The conversion works on a shallow copy,
    so the caller's `record' keeps its original payload and can be
    converted (or written out) again afterwards.

    :param record: record dict with a ``payload`` key
    :param args: extra positional arguments passed to ``json.dumps``
    :param kwargs: extra keyword arguments passed to ``json.dumps``
    :returns: the JSON text produced by ``json.dumps``
    '''
    # Copy first: replacing the payload on `record' itself would leak the
    # dump lines back to the caller.
    serializable = dict(record)
    serializable['payload'] = list(dumpbytes(record['payload']))
    return json.dumps(serializable, *args, **kwargs)
|
|
|
|
|
|
|
|
def nth(iterable, n, default=None):
    ''' Return the item at position `n' (0-based) of `iterable'.

    `default' is returned when `iterable' has `n' items or fewer.  Items
    up to and including the returned one are consumed.
    '''
    for item in islice(iterable, n, None):
        # The first item left after skipping `n' items is the answer.
        return item
    return default
|
|
|
|
|
|
|
|
def group_records_by_toplevel(records, group_as_list=True):
    ''' Group records by top-level trees and return iterable of the groups.

    A group starts with the first record, and afterwards with every record
    whose ``level`` is 0; it extends up to (not including) the next
    level-0 record.

    :param records: any iterable of record dicts (a list works as well as
        an iterator)
    :param group_as_list: when true (the default) each group is a list;
        otherwise each group is a lazy generator over the shared record
        stream.  A lazy group is only valid until the next group is
        requested: whatever part of it was not consumed by then is
        skipped.
    :returns: generator of groups; empty when `records' is empty
    '''
    # next() below needs an iterator; this also makes the inner loop and
    # the initial next() share one cursor when a sequence is passed in.
    records = iter(records)

    # Holds the record that opens the next group under the key 'top'.
    # A dict is used (instead of nonlocal) to stay Python 2 compatible.
    context = dict()

    try:
        context['top'] = next(records)
    except StopIteration:
        return

    def records_in_a_tree():
        yield context.pop('top')

        for record in records:
            if record['level'] == 0:
                # This record opens the following group: stash it and
                # finish the current one.
                context['top'] = record
                return
            yield record

    while 'top' in context:
        group = records_in_a_tree()
        if group_as_list:
            group = list(group)
        yield group
        if not group_as_list:
            # Skip what the consumer left unread.  Otherwise 'top' would
            # never advance and the same tree would be yielded forever.
            for _ in group:
                pass
|
|
|
|
|
|
|
|
class RecordStream(filestructure.VersionSensitiveItem):
    ''' Stream item whose content is read as a sequence of records. '''

    def records(self, **kwargs):
        ''' Return the records of this stream.

        Recognized keyword arguments (``range`` wins when both are given):

        - ``range``: a pair ``(start, stop)``; only the records in that
          slice of the stream are returned
        - ``treegroup``: index of a top-level tree; the list of records
          of that tree is returned, or None if there is no such tree

        Without keyword arguments all records are returned as a generator.
        '''
        stream = read_records(self.open())
        if 'range' in kwargs:
            bounds = kwargs['range']
            return islice(stream, bounds[0], bounds[1])
        if 'treegroup' in kwargs:
            trees = group_records_by_toplevel(stream, group_as_list=True)
            return nth(trees, kwargs['treegroup'])
        return stream

    def record(self, idx):
        ''' Return the record at position `idx', or None if there is none. '''
        stream = self.records()
        return nth(stream, idx)

    def records_json(self, **kwargs):
        ''' Wrap the records selected by `kwargs' (see `records') in a
        `JsonObjects' that uses `record_to_json' as its converter.
        '''
        selected = self.records(**kwargs)
        return JsonObjects(selected, record_to_json)

    def records_treegrouped(self, group_as_list=True):
        ''' Return an iterable of the top-level trees of this stream.

        Each tree is a list of records, or a generator when
        `group_as_list' is false.
        '''
        stream = self.records()
        return group_records_by_toplevel(stream, group_as_list)

    def records_treegroup(self, n):
        ''' Return the list of records in the `n'th top-level tree, or
        None if there is no such tree.
        '''
        trees = self.records_treegrouped()
        return nth(trees, n)

    def other_formats(self):
        ''' Map the ``.records`` suffix to the ``open`` method of the JSON
        view of this stream.
        '''
        json_view = self.records_json()
        return {'.records': json_view.open}
|
|
|
|
|
|
|
|
class Sections(filestructure.Sections):
    ''' `filestructure.Sections' variant whose section class is
    `RecordStream'.
    '''

    # NOTE(review): presumably the base class wraps each section with this
    # class -- confirm in filestructure.
    section_class = RecordStream
|
|
|
|
|
|
|
|
class Hwp5File(filestructure.Hwp5File):
    ''' Hwp5File for the 'rec' layer.

    Uses `RecordStream' as the docinfo class and `Sections' as the
    bodytext class.
    '''

    # Class used for the docinfo item.
    docinfo_class = RecordStream

    # Class used for the bodytext item.
    bodytext_class = Sections
|
|
|