| | |
| | from __future__ import absolute_import |
| | from __future__ import print_function |
| | from __future__ import unicode_literals |
| | from io import BytesIO |
| | from unittest import TestCase |
| | from xml.etree import ElementTree |
| | import base64 |
| | import io |
| |
|
| | from hwp5 import binmodel |
| | from hwp5 import xmlmodel |
| | from hwp5 import treeop |
| | from hwp5.binmodel import BinData |
| | from hwp5.binmodel import ControlChar |
| | from hwp5.binmodel import PageDef |
| | from hwp5.binmodel import ParaCharShape |
| | from hwp5.binmodel import ParaLineSeg |
| | from hwp5.binmodel import ParaText |
| | from hwp5.binmodel import SectionDef |
| | from hwp5.tagids import HWPTAG_PARA_LINE_SEG |
| | from hwp5.treeop import STARTEVENT, ENDEVENT |
| | from hwp5.utils import cached_property |
| | from hwp5.xmlmodel import DocInfo |
| | from hwp5.xmlmodel import Hwp5File |
| | from hwp5.xmlmodel import ModelEventStream |
| | from hwp5.xmlmodel import Section |
| | from hwp5.xmlmodel import XmlEvents |
| | from hwp5.xmlmodel import embed_bindata |
| | from hwp5.xmlmodel import line_segmented |
| | from hwp5.xmlmodel import make_ranged_shapes |
| | from hwp5.xmlmodel import merge_paragraph_text_charshape_lineseg |
| | from hwp5.xmlmodel import split_and_shape |
| |
|
| | from . import test_binmodel |
| | from .fixtures import get_fixture_path |
| |
|
| |
|
class TestBase(test_binmodel.TestBase):
    """Shared base for the xmlmodel tests in this module.

    Extends the binmodel test base and adds access to the fixture document
    through ``xmlmodel.Hwp5File``.
    """

    @cached_property
    def hwp5file_xml(self):
        """Return an ``Hwp5File`` constructed from ``self.olestg``."""
        # NOTE(review): ``olestg`` is not defined in this module; presumably
        # it is provided by a parent test base -- confirm.
        xml_file = Hwp5File(self.olestg)
        return xml_file

    # ``self.hwp5file`` in this class hierarchy refers to the xmlmodel file.
    hwp5file = hwp5file_xml
| |
|
| |
|
class TestXmlEvents(TestBase):
    """Checks that ``XmlEvents`` quotes a carriage return in attributes."""

    # Character reference that must stand in for a carriage return inside a
    # serialized attribute value.  Asserting on a plain space would be
    # vacuous: any element that carries an attribute contains one.
    # NOTE(review): the test names refer to quoteattr, and
    # xml.sax.saxutils.quoteattr() renders CR as this reference -- confirm
    # that XmlEvents quotes attribute values the same way.
    ESCAPED_CR = b'&#13;'

    def _control_char_events(self):
        """Return a start/end event pair for a ControlChar whose char is CR."""
        context = dict()
        attrs = dict(char='\r')
        item = (ControlChar, attrs, context)
        return [(STARTEVENT, item), (ENDEVENT, item)]

    def test_dump_quoteattr_cr(self):
        """``dump()`` must write the CR attribute value as ``&#13;``."""
        sio = BytesIO()

        xmlevents = XmlEvents(iter(self._control_char_events()))
        xmlevents.dump(sio)

        data = sio.getvalue()
        self.assertIn(self.ESCAPED_CR, data)

    def test_bytechunks_quoteattr_cr(self):
        """``bytechunks()`` must yield the CR attribute value as ``&#13;``."""
        xmlevents = XmlEvents(iter(self._control_char_events()))
        xml = b''.join(xmlevents.bytechunks())

        self.assertIn(self.ESCAPED_CR, xml)
| |
|
| |
|
class TestModelEventStream(TestBase):
    """Checks for ``ModelEventStream`` over the fixture's DocInfo stream."""

    @cached_property
    def docinfo(self):
        """Return a ``ModelEventStream`` wrapping the binary 'DocInfo' item."""
        stream = self.hwp5file_bin['DocInfo']
        version = self.hwp5file_bin.header.version
        return ModelEventStream(stream, version)

    def test_modelevents(self):
        """There must be exactly two model events per model."""
        model_count = len(list(self.docinfo.models()))
        event_count = len(list(self.docinfo.modelevents()))
        self.assertEqual(model_count * 2, event_count)
| | |
| |
|
| |
|
class TestDocInfo(TestBase):
    """Checks for ``DocInfo.events()`` with and without embedded bindata."""

    @cached_property
    def docinfo(self):
        """Return a ``DocInfo`` wrapping the binary 'DocInfo' item."""
        stream = self.hwp5file_bin['DocInfo']
        version = self.hwp5file_bin.header.version
        return DocInfo(stream, version)

    def test_events(self):
        """Without ``embedbin`` the bindata attributes carry no '<text>'."""
        events = list(self.docinfo.events())
        self.assertEqual(136, len(events))

        # events[4] is (event, model); model[1] holds the attribute dict.
        bindata_attrs = events[4][1][1]['bindata']
        self.assertNotIn('<text>', bindata_attrs)

    def test_events_with_embedbin(self):
        """With ``embedbin`` the '<text>' entry is the base64 of the file."""
        storage = self.hwp5file_bin['BinData']
        events = list(self.docinfo.events(embedbin=storage))

        bindata_attrs = events[4][1][1]['bindata']
        self.assertIn('<text>', bindata_attrs)

        expected = storage['BIN0002.jpg'].open().read()
        decoded = base64.b64decode(events[4][1][1]['bindata']['<text>'])
        self.assertEqual(expected, decoded)
| |
|
| |
|
class TestSection(TestBase):
    """Checks the event sequence produced by ``Section.events()``."""

    def test_events(self):
        """Section0 opens with SectionDef/PageDef and closes with SectionDef."""
        stream = self.hwp5file_bin['BodyText']['Section0']
        version = self.hwp5file_bin.fileheader.version
        events = list(Section(stream, version).events())

        def kind_and_tag(event):
            # Every event is (kind, (tag, attrs, context)).
            kind, (tag, _attrs, _ctx) = event
            return kind, tag

        # The first event opens the section definition, without an id.
        first_kind, (first_tag, first_attrs, _ctx) = events[0]
        self.assertEqual((STARTEVENT, SectionDef), (first_kind, first_tag))
        self.assertNotIn('section-id', first_attrs)

        # The page definition opens and closes immediately afterwards.
        self.assertEqual((STARTEVENT, PageDef), kind_and_tag(events[1]))
        self.assertEqual((ENDEVENT, PageDef), kind_and_tag(events[2]))

        # The last event closes the section definition.
        self.assertEqual((ENDEVENT, SectionDef), kind_and_tag(events[-1]))
| |
|
| |
|
class TestHwp5File(TestBase):
    """Checks for the xmlmodel ``Hwp5File`` facade."""

    def test_docinfo_class(self):
        """``docinfo`` must be an instance of the xmlmodel ``DocInfo``."""
        docinfo = self.hwp5file.docinfo
        self.assertIsInstance(docinfo, DocInfo)

    def test_events(self):
        """Consuming every event of the document must not raise."""
        for _event in self.hwp5file.events():
            pass

    def test_events_embedbin_without_bindata(self):
        """``embedbin=True`` must work for a file lacking 'BinData'."""
        # Select the fixture before ``self.hwp5file`` is first accessed.
        self.hwp5file_name = 'parashape.hwp'
        hwp5file = self.hwp5file
        self.assertNotIn('BinData', hwp5file)
        for _event in hwp5file.events(embedbin=True):
            pass

    def test_xmlevents(self):
        """The first XML event opens 'HwpDoc' with the file version."""
        events = iter(self.hwp5file.xmlevents())
        expected_first = (STARTEVENT, ('HwpDoc', dict(version='5.0.1.7')))
        self.assertEqual(expected_first, next(events))
        # Drain the remaining events to make sure none of them raises.
        for _event in events:
            pass

    def test_xmlevents_dump(self):
        """The dumped XML must parse and have 'HwpDoc' as its root tag."""
        xml_path = self.id() + '.xml'
        with io.open(xml_path, 'wb+') as outfile:
            self.hwp5file.xmlevents().dump(outfile)

            # Rewind so the freshly written document can be parsed back.
            outfile.seek(0)
            doc = ElementTree.parse(outfile)

        root = doc.getroot()
        self.assertEqual('HwpDoc', root.tag)
| |
|
| |
|
class TestShapedText(TestCase):
    """Checks for ``make_ranged_shapes`` and ``split_and_shape``."""

    def test_make_shape_range(self):
        """Each shape ends where the next starts; the last at 0x7fffffff."""
        starts = [(0, 'A'), (4, 'B'), (6, 'C'), (10, 'D')]
        expected = [
            ((0, 4), 'A'),
            ((4, 6), 'B'),
            ((6, 10), 'C'),
            ((10, 0x7fffffff), 'D'),
        ]
        self.assertEqual(expected, list(make_ranged_shapes(starts)))

    def test_split(self):
        """Chunks are cut at shape boundaries and tagged with their shape."""
        text_chunks = [
            ((0, 3), None, 'aaa'),
            ((3, 6), None, 'bbb'),
            ((6, 9), None, 'ccc'),
            ((9, 12), None, 'ddd'),
        ]
        shape_starts = [(0, 'A'), (4, 'B'), (6, 'C'), (10, 'D')]
        result = list(split_and_shape(iter(text_chunks),
                                      make_ranged_shapes(shape_starts)))
        expected = [
            ((0, 3), ('A', None), 'aaa'),
            ((3, 4), ('A', None), 'b'),
            ((4, 6), ('B', None), 'bb'),
            ((6, 9), ('C', None), 'ccc'),
            ((9, 10), ('C', None), 'd'),
            ((10, 12), ('D', None), 'dd'),
        ]
        self.assertEqual(expected, result)

        # Apply the split twice: first by character shape, then by line
        # segment, so that the attributes nest as (lineseg, (charshape, None)).
        long_chunk = [((0, 112), None, 'x' * 112)]
        char_starts = [(0, 'a'), (3, 'b'), (5, 'c')]
        line_starts = [(0, 'A'), (51, 'B'), (103, 'C')]

        by_charshape = list(split_and_shape(iter(long_chunk),
                                            make_ranged_shapes(char_starts)))
        self.assertEqual(
            [
                ((0, 3), ('a', None), 'xxx'),
                ((3, 5), ('b', None), 'xx'),
                ((5, 112), ('c', None), 'x' * 107),
            ],
            by_charshape)

        by_line = list(split_and_shape(iter(by_charshape),
                                       make_ranged_shapes(line_starts)))
        self.assertEqual(
            [
                ((0, 3), ('A', ('a', None)), 'xxx'),
                ((3, 5), ('A', ('b', None)), 'xx'),
                ((5, 51), ('A', ('c', None)), 'x' * (51 - 5)),
                ((51, 103), ('B', ('c', None)), 'x' * (103 - 51)),
                ((103, 112), ('C', ('c', None)), 'x' * (112 - 103)),
            ],
            by_line)
| |
|
| |
|
class TestLineSeg(TestCase):
    """Checks for ``line_segmented``."""

    def test_line_segmented(self):
        """Chunks are grouped per line segment and cut at its boundaries."""
        text_chunks = [
            ((0, 3), None, 'aaa'),
            ((3, 6), None, 'bbb'),
            ((6, 9), None, 'ccc'),
            ((9, 12), None, 'ddd'),
        ]
        lineseg_starts = [(0, 'A'), (4, 'B'), (6, 'C'), (10, 'D')]

        grouped = list(line_segmented(iter(text_chunks),
                                      make_ranged_shapes(lineseg_starts)))

        expected = [
            ('A', [((0, 3), None, 'aaa'), ((3, 4), None, 'b')]),
            ('B', [((4, 6), None, 'bb')]),
            ('C', [((6, 9), None, 'ccc'), ((9, 10), None, 'd')]),
            ('D', [((10, 12), None, 'dd')]),
        ]
        self.assertEqual(expected, grouped)
| |
|
| |
|
class TestDistributionBodyText(TestBase):
    """Regression checks run against the 'viewtext.hwp' fixture."""

    hwp5file_name = 'viewtext.hwp'

    def test_issue33_missing_paralineseg(self):
        """Merging paragraph text must work when no ParaLineSeg exists."""
        section0 = self.hwp5file_bin.bodytext.section(0)
        seen_tagids = {model['tagid'] for model in section0.models()}
        seen_types = {model['type'] for model in section0.models()}

        # Precondition: this fixture has no line-segment records at all.
        self.assertNotIn(HWPTAG_PARA_LINE_SEG, seen_tagids)
        self.assertNotIn(ParaLineSeg, seen_types)

        paratext = self.hwp5file_bin.bodytext.section(0).model(1)
        self.assertEqual(ParaText, paratext['type'])

        # NOTE(review): this lookup goes through ``self.bodytext`` rather
        # than ``self.hwp5file_bin.bodytext`` -- confirm that is intended.
        paracharshape = self.bodytext.section(0).model(2)
        self.assertEqual(ParaCharShape, paracharshape['type'])

        text_item = (paratext['type'], paratext['content'], dict())
        charshape_item = (paracharshape['type'], paracharshape['content'],
                          dict())
        # The line-segment argument is None because the record is missing.
        merged = merge_paragraph_text_charshape_lineseg(text_item,
                                                        charshape_item,
                                                        None)

        # Consuming the merged events must not raise.
        list(merged)
| |
|
| |
|
class TestMatchFieldStartEnd(TestCase):
    """Checks for field start/end matching in the xmlmodel pipeline."""

    def test_match_field_start_end(self):
        """The pipeline up to ``match_field_start_end`` must not raise.

        The records below describe one paragraph (header, text, char shape,
        line segment) followed by a control header; the test only consumes
        the resulting events and makes no assertion on their content.
        """
        # NOTE(review): the third record's 'tagname' is a bytes literal while
        # the others are text -- kept as found; confirm whether intended.
        records = \
            [{'level': 2,
              'payload': b'\x1c\x00\x00\x00\x18\x00\x00\x00\x04\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x00\x80',
              'seqno': 196,
              'size': 22,
              'tagid': 66,
              'tagname': 'HWPTAG_PARA_HEADER'},
             {'level': 3,
              'payload': b'\x04\x00umf\x08\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x03\x00umf%\x00\x00\x00\x00\x00\x00\x00\x00\x03\x002\x009\x009\x00\x04\x00umf\x08\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\r\x00',
              'seqno': 197,
              'size': 56,
              'tagid': 67,
              'tagname': 'HWPTAG_PARA_TEXT'},
             {'level': 3,
              'payload': b'\x00\x00\x00\x00\x13\x00\x00\x00',
              'seqno': 198,
              'size': 8,
              'tagid': 68,
              'tagname': b'HWPTAG_PARA_CHAR_SHAPE'},
             {'level': 3,
              'payload': b'\x00\x00\x00\x00\x00\x00\x00\x00\x14\x05\x00\x00\x14\x05\x00\x00Q\x04\x00\x00\xfc\xfe\xff\xff\x00\x00\x00\x00X\x0b\x00\x00\x00\x00\x06\x00',
              'seqno': 199,
              'size': 36,
              'tagid': 69,
              'tagname': 'HWPTAG_PARA_LINE_SEG'},
             {'level': 3,
              'payload': b'umf%\x00\x00\x00\x00\x08\x15\x00=\x00S\x00U\x00M\x00(\x00R\x00I\x00G\x00H\x00T\x00)\x00?\x00?\x00%\x00g\x00,\x00;\x00;\x002\x009\x009\x00P\x89\xa0z\x00\x00\x00\x00',
              'seqno': 200,
              'size': 61,
              'tagid': 71,
              'tagname': 'HWPTAG_CTRL_HEADER'}]

        models = binmodel.parse_models(dict(), records)
        events = xmlmodel.prefix_binmodels_with_event(dict(), models)
        events = xmlmodel.make_texts_linesegmented_and_charshaped(events)
        events = xmlmodel.make_extended_controls_inline(events)
        events = xmlmodel.match_field_start_end(events)
        # Exhaust the lazy pipeline so any error inside it surfaces here.
        list(events)

    def test_issue144_fields_crossing_lineseg_boundary(self):
        """No Field element may be open when a LineSeg starts or ends."""
        name = 'issue144-fields-crossing-lineseg-boundary.hwp'
        path = get_fixture_path(name)
        hwp5file = xmlmodel.Hwp5File(path)
        xmlevents = hwp5file.bodytext.xmlevents()

        # Field start events that have not been closed yet.
        stack_fields = []
        for ev, model in xmlevents:
            is_start = ev is treeop.STARTEVENT

            # Start events carry a sequence whose first item is the tag;
            # other events carry the tag itself.
            tag = model[0] if is_start else model

            if tag.startswith('Field'):
                if is_start:
                    stack_fields.append(model)
                else:
                    stack_fields.pop()
            elif tag == 'LineSeg':
                # The same condition applies to LineSeg start and end. A
                # unittest assertion is used because a bare ``assert`` is
                # stripped when Python runs with -O.
                self.assertEqual(
                    0, len(stack_fields),
                    'Field left open across a LineSeg boundary')
| |
|
| |
|
class TestEmbedBinData(TestBase):
    """Checks for ``embed_bindata``."""

    def test_embed_bindata(self):
        """Embedding must add a '<text>' entry to the bindata attributes."""
        storage_flags = BinData.Flags(BinData.StorageType.EMBEDDING)
        bindata = dict(flags=storage_flags,
                       bindata=dict(storage_id=2, ext='jpg'))

        start = (STARTEVENT, (BinData, bindata, dict()))
        end = (ENDEVENT, (BinData, bindata, dict()))
        storage = self.hwp5file_bin['BinData']

        # Consume the generator; the attribute dict is updated in place,
        # which is why the original ``bindata`` object is inspected below.
        list(embed_bindata([start, end], storage))

        self.assertIn('<text>', bindata['bindata'])
| |
|