|
|
|
|
|
from __future__ import absolute_import |
|
|
from __future__ import print_function |
|
|
from __future__ import unicode_literals |
|
|
from io import BytesIO |
|
|
from unittest import TestCase |
|
|
from xml.etree import ElementTree |
|
|
import base64 |
|
|
import io |
|
|
|
|
|
from hwp5 import binmodel |
|
|
from hwp5 import xmlmodel |
|
|
from hwp5 import treeop |
|
|
from hwp5.binmodel import BinData |
|
|
from hwp5.binmodel import ControlChar |
|
|
from hwp5.binmodel import PageDef |
|
|
from hwp5.binmodel import ParaCharShape |
|
|
from hwp5.binmodel import ParaLineSeg |
|
|
from hwp5.binmodel import ParaText |
|
|
from hwp5.binmodel import SectionDef |
|
|
from hwp5.tagids import HWPTAG_PARA_LINE_SEG |
|
|
from hwp5.treeop import STARTEVENT, ENDEVENT |
|
|
from hwp5.utils import cached_property |
|
|
from hwp5.xmlmodel import DocInfo |
|
|
from hwp5.xmlmodel import Hwp5File |
|
|
from hwp5.xmlmodel import ModelEventStream |
|
|
from hwp5.xmlmodel import Section |
|
|
from hwp5.xmlmodel import XmlEvents |
|
|
from hwp5.xmlmodel import embed_bindata |
|
|
from hwp5.xmlmodel import line_segmented |
|
|
from hwp5.xmlmodel import make_ranged_shapes |
|
|
from hwp5.xmlmodel import merge_paragraph_text_charshape_lineseg |
|
|
from hwp5.xmlmodel import split_and_shape |
|
|
|
|
|
from . import test_binmodel |
|
|
from .fixtures import get_fixture_path |
|
|
|
|
|
|
|
|
class TestBase(test_binmodel.TestBase):
    """Shared base for the xmlmodel tests in this module.

    Adds an ``xmlmodel.Hwp5File`` view on top of what
    ``test_binmodel.TestBase`` offers.  ``self.olestg`` is not defined in
    this file; presumably the parent test base supplies it.
    """

    @cached_property
    def hwp5file_xml(self):
        """Return the fixture storage wrapped as an xmlmodel ``Hwp5File``."""
        storage = self.olestg
        return Hwp5File(storage)

    # Within this module ``hwp5file`` means the xmlmodel-level file object.
    hwp5file = hwp5file_xml
|
|
|
|
|
|
|
|
class TestXmlEvents(TestBase):
    """Check that ``XmlEvents`` escapes a carriage return in attributes.

    A raw CR inside an XML attribute value would be normalized away by a
    parser, so the serializers are expected to emit the character
    reference ``&#13;`` instead.
    """

    # Character reference that must appear in the output for '\r'.
    CR_REFERENCE = b'&#13;'

    @staticmethod
    def _cr_controlchar_events():
        """Build a start/end event pair for a ControlChar holding '\\r'."""
        context = dict()
        attrs = dict(char='\r')
        item = (ControlChar, attrs, context)
        return [(STARTEVENT, item),
                (ENDEVENT, item)]

    def test_dump_quoteattr_cr(self):
        """``dump()`` must write the CR attribute as ``&#13;``."""
        sio = BytesIO()

        xmlevents = XmlEvents(iter(self._cr_controlchar_events()))
        xmlevents.dump(sio)

        data = sio.getvalue()
        # Checking for a plain space would pass for any element with an
        # attribute; look for the escaped carriage return itself.
        self.assertIn(self.CR_REFERENCE, data)

    def test_bytechunks_quoteattr_cr(self):
        """``bytechunks()`` must yield the CR attribute as ``&#13;``."""
        xmlevents = XmlEvents(iter(self._cr_controlchar_events()))
        xml = b''.join(xmlevents.bytechunks())

        self.assertIn(self.CR_REFERENCE, xml)
|
|
|
|
|
|
|
|
class TestModelEventStream(TestBase):
    """Tests for ``ModelEventStream`` built over the DocInfo stream."""

    @cached_property
    def docinfo(self):
        """Return a ``ModelEventStream`` for the fixture's DocInfo."""
        binfile = self.hwp5file_bin
        return ModelEventStream(binfile['DocInfo'],
                                binfile.header.version)

    def test_modelevents(self):
        """There are exactly twice as many model events as models."""
        model_count = sum(1 for _ in self.docinfo.models())
        event_count = sum(1 for _ in self.docinfo.modelevents())
        self.assertEqual(model_count * 2, event_count)
|
|
|
|
|
|
|
|
|
|
|
class TestDocInfo(TestBase):
    """Tests for the xmlmodel ``DocInfo`` event stream."""

    # Index of the event whose attributes carry the 'bindata' dict that the
    # tests below inspect.
    BINDATA_EVENT_INDEX = 4

    @cached_property
    def docinfo(self):
        """Return a ``DocInfo`` built from the fixture's DocInfo stream."""
        binfile = self.hwp5file_bin
        return DocInfo(binfile['DocInfo'],
                       binfile.header.version)

    def _bindata_attrs(self, events):
        """Return the 'bindata' dict from the inspected event's attributes."""
        ev, item = events[self.BINDATA_EVENT_INDEX]
        return item[1]['bindata']

    def test_events(self):
        """Without ``embedbin`` no '<text>' payload is attached."""
        events = list(self.docinfo.events())
        self.assertEqual(136, len(events))

        self.assertNotIn('<text>', self._bindata_attrs(events))

    def test_events_with_embedbin(self):
        """With ``embedbin`` the base64 '<text>' matches the stored file."""
        bindata = self.hwp5file_bin['BinData']
        events = list(self.docinfo.events(embedbin=bindata))

        embedded = self._bindata_attrs(events)
        self.assertIn('<text>', embedded)

        stored = bindata['BIN0002.jpg'].open().read()
        self.assertEqual(stored, base64.b64decode(embedded['<text>']))
|
|
|
|
|
|
|
|
class TestSection(TestBase):
    """Tests for the xmlmodel ``Section`` event stream."""

    def test_events(self):
        """Section events open with SectionDef/PageDef and close SectionDef."""
        binfile = self.hwp5file_bin
        section = Section(binfile['BodyText']['Section0'],
                          binfile.fileheader.version)
        events = list(section.events())

        def unpack(index):
            # Each event is (ev, (tag, attrs, ctx)); ctx is not inspected.
            ev, (tag, attrs, ctx) = events[index]
            return ev, tag, attrs

        ev, tag, attrs = unpack(0)
        self.assertEqual((STARTEVENT, SectionDef), (ev, tag))
        self.assertFalse('section-id' in attrs)

        remaining = (
            (1, (STARTEVENT, PageDef)),
            (2, (ENDEVENT, PageDef)),
            (-1, (ENDEVENT, SectionDef)),
        )
        for index, expected in remaining:
            ev, tag, attrs = unpack(index)
            self.assertEqual(expected, (ev, tag))
|
|
|
|
|
|
|
|
class TestHwp5File(TestBase):
    """Tests for the xmlmodel ``Hwp5File`` wrapper."""

    def test_docinfo_class(self):
        """``docinfo`` is exposed as the xmlmodel ``DocInfo`` type."""
        docinfo = self.hwp5file.docinfo
        self.assertTrue(isinstance(docinfo, DocInfo))

    def test_events(self):
        """The whole event stream can be consumed without raising."""
        for _ in self.hwp5file.events():
            pass

    def test_events_embedbin_without_bindata(self):
        """``embedbin=True`` works on a document that has no BinData."""
        # Select the fixture before the file object is first accessed.
        self.hwp5file_name = 'parashape.hwp'
        document = self.hwp5file
        self.assertTrue('BinData' not in document)
        for _ in document.events(embedbin=True):
            pass

    def test_xmlevents(self):
        """The first XML event opens ``HwpDoc`` with the file version."""
        stream = iter(self.hwp5file.xmlevents())
        first = next(stream)
        expected = (STARTEVENT,
                    ('HwpDoc', dict(version='5.0.1.7')))
        self.assertEqual(expected, first)
        # Drain the remainder so the rest of the stream is exercised too.
        for _ in stream:
            pass

    def test_xmlevents_dump(self):
        """The dumped XML parses and has ``HwpDoc`` as its root element."""
        # The output file is named after the test id and is left in the
        # current working directory.
        filename = self.id() + '.xml'
        with io.open(filename, 'wb+') as outfile:
            self.hwp5file.xmlevents().dump(outfile)

            # Rewind so the parser reads what was just written.
            outfile.seek(0)
            doc = ElementTree.parse(outfile)

        root = doc.getroot()
        self.assertEqual('HwpDoc', root.tag)
|
|
|
|
|
|
|
|
class TestShapedText(TestCase):
    """Tests for ``make_ranged_shapes`` and ``split_and_shape``."""

    def test_make_shape_range(self):
        """(position, shape) pairs become ((start, end), shape) ranges."""
        charshapes = [(0, 'A'), (4, 'B'), (6, 'C'), (10, 'D')]
        expected = [
            ((0, 4), 'A'),
            ((4, 6), 'B'),
            ((6, 10), 'C'),
            # The final range is closed with 0x7fffffff.
            ((10, 0x7fffffff), 'D'),
        ]
        actual = list(make_ranged_shapes(charshapes))
        self.assertEqual(expected, actual)

    def test_split(self):
        """Chunks are cut at shape boundaries and tagged with the shape."""
        chunks = [((0, 3), None, 'aaa'), ((3, 6), None, 'bbb'),
                  ((6, 9), None, 'ccc'), ((9, 12), None, 'ddd')]
        charshapes = [(0, 'A'), (4, 'B'), (6, 'C'), (10, 'D')]
        expected = [
            ((0, 3), ('A', None), 'aaa'),
            ((3, 4), ('A', None), 'b'),
            ((4, 6), ('B', None), 'bb'),
            ((6, 9), ('C', None), 'ccc'),
            ((9, 10), ('C', None), 'd'),
            ((10, 12), ('D', None), 'dd'),
        ]
        shaped_chunks = list(split_and_shape(iter(chunks),
                                             make_ranged_shapes(charshapes)))
        self.assertEqual(expected, shaped_chunks)

        # Second scenario: shape a single long chunk by character shape,
        # then split the result again by line segment.
        chunks = [((0, 112), None, 'x' * 112)]
        charshapes = [(0, 'a'), (3, 'b'), (5, 'c')]
        linesegs = [(0, 'A'), (51, 'B'), (103, 'C')]

        expected_shaped = [
            ((0, 3), ('a', None), 'xxx'),
            ((3, 5), ('b', None), 'xx'),
            ((5, 112), ('c', None), 'x' * 107),
        ]
        shaped = list(split_and_shape(iter(chunks),
                                      make_ranged_shapes(charshapes)))
        self.assertEqual(expected_shaped, shaped)

        expected_lines = [
            ((0, 3), ('A', ('a', None)), 'xxx'),
            ((3, 5), ('A', ('b', None)), 'xx'),
            ((5, 51), ('A', ('c', None)), 'x' * (51 - 5)),
            ((51, 103), ('B', ('c', None)), 'x' * (103 - 51)),
            ((103, 112), ('C', ('c', None)), 'x' * (112 - 103)),
        ]
        lines = list(split_and_shape(iter(shaped),
                                     make_ranged_shapes(linesegs)))
        self.assertEqual(expected_lines, lines)
|
|
|
|
|
|
|
|
class TestLineSeg(TestCase):
    """Tests for ``line_segmented``."""

    def test_line_segmented(self):
        """Chunks are grouped per line segment, split at its boundaries."""
        chunks = [((0, 3), None, 'aaa'), ((3, 6), None, 'bbb'),
                  ((6, 9), None, 'ccc'), ((9, 12), None, 'ddd')]
        linesegs = [(0, 'A'), (4, 'B'), (6, 'C'), (10, 'D')]

        expected = [
            ('A', [((0, 3), None, 'aaa'),
                   ((3, 4), None, 'b')]),
            ('B', [((4, 6), None, 'bb')]),
            ('C', [((6, 9), None, 'ccc'),
                   ((9, 10), None, 'd')]),
            ('D', [((10, 12), None, 'dd')]),
        ]

        ranged = make_ranged_shapes(linesegs)
        lines = list(line_segmented(iter(chunks), ranged))
        self.assertEqual(expected, lines)
|
|
|
|
|
|
|
|
class TestDistributionBodyText(TestBase):
    """Regression test for a section that has no line segment records.

    Uses the ``viewtext.hwp`` fixture, whose first section is expected to
    contain no ``HWPTAG_PARA_LINE_SEG`` / ``ParaLineSeg`` models.
    """

    hwp5file_name = 'viewtext.hwp'

    def test_issue33_missing_paralineseg(self):
        """Merging text and char shapes works when the lineseg is None."""
        section0 = self.hwp5file_bin.bodytext.section(0)

        # Collect both sets in a single pass so the section's models are
        # parsed only once.
        tagids = set()
        types = set()
        for model in section0.models():
            tagids.add(model['tagid'])
            types.add(model['type'])
        self.assertNotIn(HWPTAG_PARA_LINE_SEG, tagids)
        self.assertNotIn(ParaLineSeg, types)

        paratext = self.hwp5file_bin.bodytext.section(0).model(1)
        self.assertEqual(ParaText, paratext['type'])

        # NOTE(review): this lookup goes through the inherited
        # ``self.bodytext`` rather than ``self.hwp5file_bin.bodytext`` as
        # above -- confirm that both refer to the same stream.
        paracharshape = self.bodytext.section(0).model(2)
        self.assertEqual(ParaCharShape, paracharshape['type'])

        evs = merge_paragraph_text_charshape_lineseg(
            (paratext['type'], paratext['content'], dict()),
            (paracharshape['type'], paracharshape['content'], dict()),
            None
        )

        # Consume the generator; the test passes if no exception is raised.
        list(evs)
|
|
|
|
|
|
|
|
class TestMatchFieldStartEnd(TestCase):
    """Tests for pairing field start/end controls in the event stream."""

    def test_match_field_start_end(self):
        """Smoke test: the event pipeline runs over hand-built records.

        Nothing is asserted about the resulting events; the test passes if
        the whole pipeline can be consumed without raising.
        """
        # NOTE(review): the third record's 'tagname' is a bytes literal
        # while the others are text -- kept as found; confirm intent.
        records = [
            {'level': 2,
             'payload': b'\x1c\x00\x00\x00\x18\x00\x00\x00\x04\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x00\x80',  # noqa: E501
             'seqno': 196,
             'size': 22,
             'tagid': 66,
             'tagname': 'HWPTAG_PARA_HEADER'},
            {'level': 3,
             'payload': b'\x04\x00umf\x08\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x03\x00umf%\x00\x00\x00\x00\x00\x00\x00\x00\x03\x002\x009\x009\x00\x04\x00umf\x08\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\r\x00',  # noqa: E501
             'seqno': 197,
             'size': 56,
             'tagid': 67,
             'tagname': 'HWPTAG_PARA_TEXT'},
            {'level': 3,
             'payload': b'\x00\x00\x00\x00\x13\x00\x00\x00',
             'seqno': 198,
             'size': 8,
             'tagid': 68,
             'tagname': b'HWPTAG_PARA_CHAR_SHAPE'},
            {'level': 3,
             'payload': b'\x00\x00\x00\x00\x00\x00\x00\x00\x14\x05\x00\x00\x14\x05\x00\x00Q\x04\x00\x00\xfc\xfe\xff\xff\x00\x00\x00\x00X\x0b\x00\x00\x00\x00\x06\x00',  # noqa: E501
             'seqno': 199,
             'size': 36,
             'tagid': 69,
             'tagname': 'HWPTAG_PARA_LINE_SEG'},
            {'level': 3,
             'payload': b'umf%\x00\x00\x00\x00\x08\x15\x00=\x00S\x00U\x00M\x00(\x00R\x00I\x00G\x00H\x00T\x00)\x00?\x00?\x00%\x00g\x00,\x00;\x00;\x002\x009\x009\x00P\x89\xa0z\x00\x00\x00\x00',  # noqa: E501
             'seqno': 200,
             'size': 61,
             'tagid': 71,
             'tagname': 'HWPTAG_CTRL_HEADER'},
        ]

        models = binmodel.parse_models(dict(), records)
        events = xmlmodel.prefix_binmodels_with_event(dict(), models)
        events = xmlmodel.make_texts_linesegmented_and_charshaped(events)
        events = xmlmodel.make_extended_controls_inline(events)
        events = xmlmodel.match_field_start_end(events)
        # Force evaluation of the lazy pipeline.
        events = list(events)

    def test_issue144_fields_crossing_lineseg_boundary(self):
        """No field may be open when a LineSeg starts or ends."""
        name = 'issue144-fields-crossing-lineseg-boundary.hwp'
        path = get_fixture_path(name)
        hwp5file = xmlmodel.Hwp5File(path)
        xmlevents = hwp5file.bodytext.xmlevents()

        # Fields opened so far and not yet closed.
        stack_fields = []
        for ev, model in xmlevents:
            starting = ev is treeop.STARTEVENT
            # Start events carry a sequence whose first item is the tag;
            # other events carry the tag itself.
            tag = model[0] if starting else model

            if tag.startswith('Field'):
                if starting:
                    stack_fields.append(model)
                else:
                    stack_fields.pop()
            elif tag == 'LineSeg':
                # The same invariant holds at both the start and the end of
                # a line segment.  A unittest assertion is used instead of
                # a bare ``assert`` so the check survives ``python -O``.
                self.assertEqual(
                    [], stack_fields,
                    'field(s) left open across a LineSeg boundary')
|
|
|
|
|
|
|
|
class TestEmbedBinData(TestBase):
    """Tests for ``embed_bindata``."""

    def test_embed_bindata(self):
        """An EMBEDDING BinData gets a '<text>' entry after embedding."""
        flags = BinData.Flags(BinData.StorageType.EMBEDDING)
        storage_info = dict(storage_id=2, ext='jpg')
        bindata = dict(flags=flags,
                       bindata=storage_info)

        # Both events share the same attribute dict; each gets its own
        # fresh context dict.
        events = [(event, (BinData, bindata, dict()))
                  for event in (STARTEVENT, ENDEVENT)]

        # The generator must be consumed for the embedding to take place.
        events = list(embed_bindata(events, self.hwp5file_bin['BinData']))
        self.assertTrue('<text>' in bindata['bindata'])
|
|
|