|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import absolute_import |
|
|
from __future__ import print_function |
|
|
from __future__ import unicode_literals |
|
|
from io import BytesIO |
|
|
from itertools import takewhile |
|
|
import json |
|
|
import logging |
|
|
import inspect |
|
|
|
|
|
from .. import recordstream |
|
|
from ..bintype import ERROREVENT |
|
|
from ..bintype import resolve_type_events |
|
|
from ..bintype import resolve_values_from_stream |
|
|
from ..dataio import ParseError |
|
|
from ..dataio import dumpbytes |
|
|
from ..recordstream import nth |
|
|
from ..tagids import tagnames |
|
|
from ..treeop import STARTEVENT |
|
|
from ..treeop import ENDEVENT |
|
|
from ..treeop import prefix_ancestors_from_level |
|
|
from ..utils import JsonObjects |
|
|
|
|
|
from ._shared import tag_models |
|
|
from ._shared import RecordModel |
|
|
from ._shared import BinStorageId |
|
|
from ._shared import COLORREF |
|
|
from ._shared import Margin |
|
|
from .controlchar import CHID |
|
|
from .controlchar import ControlChar |
|
|
from .tagid16_document_properties import DocumentProperties |
|
|
from .tagid17_id_mappings import IdMappings |
|
|
from .tagid18_bin_data import BinData |
|
|
from .tagid20_border_fill import BorderFill |
|
|
from .tagid19_face_name import FaceName |
|
|
from .tagid21_char_shape import CharShape |
|
|
from .tagid21_char_shape import LanguageStruct |
|
|
from .tagid22_tab_def import TabDef |
|
|
from .tagid23_numbering import Numbering |
|
|
from .tagid24_bullet import Bullet |
|
|
from .tagid25_para_shape import ParaShape |
|
|
from .tagid26_style import Style |
|
|
from .tagid27_doc_data import DocData |
|
|
from .tagid28_distribute_doc_data import DistributeDocData |
|
|
from .tagid30_compatible_document import CompatibleDocument |
|
|
from .tagid31_layout_compatibility import LayoutCompatibility |
|
|
from .tagid32_unknown import TagModel32 |
|
|
from .tagid50_para_header import Paragraph |
|
|
from .tagid51_para_text import ParaText |
|
|
from .tagid51_para_text import ParaTextChunks |
|
|
from .tagid52_para_char_shape import ParaCharShape |
|
|
from .tagid53_para_line_seg import ParaLineSeg |
|
|
from .tagid53_para_line_seg import ParaLineSegList |
|
|
from .tagid53_para_line_seg import LineSeg |
|
|
from .tagid54_para_range_tag import ParaRangeTag |
|
|
from .tagid55_ctrl_header import Control |
|
|
from .controls.bookmark_control import BookmarkControl |
|
|
from .controls.columns_def import ColumnsDef |
|
|
from .controls.common_controls import CommonControl |
|
|
from .controls.dutmal import Dutmal |
|
|
from .controls.field import Field |
|
|
from .controls.field import FieldUnknown |
|
|
from .controls.field import FieldDate |
|
|
from .controls.field import FieldDocDate |
|
|
from .controls.field import FieldPath |
|
|
from .controls.field import FieldBookmark |
|
|
from .controls.field import FieldMailMerge |
|
|
from .controls.field import FieldCrossRef |
|
|
from .controls.field import FieldFormula |
|
|
from .controls.field import FieldClickHere |
|
|
from .controls.field import FieldClickHereData |
|
|
from .controls.field import FieldSummary |
|
|
from .controls.field import FieldUserInfo |
|
|
from .controls.field import FieldHyperLink |
|
|
from .controls.field import FieldMemo |
|
|
from .controls.field import FieldPrivateInfoSecurity |
|
|
from .controls.gshape_object_control import GShapeObjectControl |
|
|
from .controls.header_footer import HeaderFooter |
|
|
from .controls.header_footer import Header |
|
|
from .controls.header_footer import Footer |
|
|
from .controls.hidden_comment import HiddenComment |
|
|
from .controls.index_marker import IndexMarker |
|
|
from .controls.note import Note |
|
|
from .controls.note import FootNote |
|
|
from .controls.note import EndNote |
|
|
from .controls.numbering import AutoNumbering |
|
|
from .controls.numbering import NewNumbering |
|
|
from .controls.page_hide import PageHide |
|
|
from .controls.page_number_position import PageNumberPosition |
|
|
from .controls.page_odd_even import PageOddEven |
|
|
from .controls.section_def import SectionDef |
|
|
from .controls.table_control import TableControl |
|
|
from .controls.tcps_control import TCPSControl |
|
|
from .tagid56_list_header import ListHeader |
|
|
from .tagid56_list_header import TableCaption |
|
|
from .tagid56_list_header import TableCell |
|
|
from .tagid56_list_header import TextboxParagraphList |
|
|
from .tagid56_list_header import HeaderParagraphList |
|
|
from .tagid56_list_header import FooterParagraphList |
|
|
from .tagid57_page_def import PageDef |
|
|
from .tagid58_footnote_shape import FootnoteShape |
|
|
from .tagid59_page_border_fill import PageBorderFill |
|
|
from .tagid60_shape_component import ShapeComponent |
|
|
from .tagid61_table import TableBody |
|
|
from .tagid62_shape_component_line import ShapeLine |
|
|
from .tagid63_shape_component_rectangle import ShapeRectangle |
|
|
from .tagid64_shape_component_ellipse import ShapeEllipse |
|
|
from .tagid65_shape_component_arc import ShapeArc |
|
|
from .tagid66_shape_component_polygon import ShapePolygon |
|
|
from .tagid67_shape_component_curve import ShapeCurve |
|
|
from .tagid68_shape_component_ole import ShapeOLE |
|
|
from .tagid69_shape_component_picture import ShapePicture |
|
|
from .tagid70_shape_component_container import ShapeContainer |
|
|
from .tagid71_ctrl_data import ControlData |
|
|
from .tagid72_ctrl_eqedit import EqEdit |
|
|
from .tagid74_shape_component_textart import ShapeTextArt |
|
|
from .tagid75_form_object import FormObject |
|
|
from .tagid76_memo_shape import MemoShape |
|
|
from .tagid77_memo_list import MemoList |
|
|
from .tagid78_forbidden_char import ForbiddenChar |
|
|
from .tagid79_chart_data import ChartData |
|
|
from .tagid99_shape_component_unknown import ShapeUnknown |
|
|
|
|
|
|
|
|
# Bare references to names imported above. Each line is an expression
# statement with no runtime effect beyond evaluating the name.
# NOTE(review): presumably these exist so that linters do not flag the
# imports as unused while the names stay importable from this package;
# confirm before removing any of them.
RecordModel
BinStorageId
COLORREF
Margin
DocumentProperties
BinData
BorderFill
CharShape
LanguageStruct
TabDef
Numbering
Bullet
ParaShape
Style
DocData
DistributeDocData
CompatibleDocument
LayoutCompatibility
TagModel32
Paragraph
ParaText
ParaTextChunks
ParaCharShape
ParaLineSeg
ParaLineSegList
LineSeg
ParaRangeTag
Control
ListHeader
TableCaption
TableCell
TextboxParagraphList
PageDef
FootnoteShape
PageBorderFill
ShapeComponent
TableBody
ShapeLine
ShapeRectangle
ShapeEllipse
ShapeArc
ShapePolygon
ShapeCurve
ShapeOLE
ShapePicture
ShapeContainer
ControlData
EqEdit
ShapeTextArt
FormObject
MemoShape
MemoList
ForbiddenChar
ChartData
ShapeUnknown
CHID
ControlChar
BookmarkControl
ColumnsDef
CommonControl
Dutmal
Field
FieldUnknown
FieldDate
FieldDocDate
FieldPath
FieldBookmark
FieldMailMerge
FieldCrossRef
FieldFormula
FieldClickHere
FieldClickHereData
FieldSummary
FieldUserInfo
FieldHyperLink
FieldMemo
FieldPrivateInfoSecurity
GShapeObjectControl
HeaderFooter
Header
HeaderParagraphList
Footer
FooterParagraphList
HiddenComment
IndexMarker
Note
FootNote
EndNote
AutoNumbering
NewNumbering
PageHide
PageNumberPosition
PageOddEven
SectionDef
TableControl
TCPSControl
|
|
|
|
|
|
|
|
# Module-level logger; parse_model() writes its debug output to it.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
class UnknownTagModel(RecordModel):
    ''' Fallback model type for tag ids missing from ``tag_models``.

    resolve_model_events() uses this class when a record's ``tagid`` has no
    registered model.
    '''
|
|
|
|
|
|
|
|
class Text(object):
    ''' Empty marker class; it defines no attributes or behaviour here. '''
|
|
|
|
|
|
|
|
def _check_tag_models():
    ''' Assert that every tag id in ``tagnames`` has an entry in
    ``tag_models``.

    :raises AssertionError: naming the first tag whose model is missing
    '''
    for tagid, tagname in tagnames.items():
        registered = tagid in tag_models
        assert registered, 'RecordModel for %s is missing!' % tagname


# Import-time sanity check of the model registry.
_check_tag_models()
|
|
|
|
|
|
|
|
def init_record_parsing_context(base, record):
    ''' Create the context used to parse a single record.

    The new context is a shallow copy of ``base`` with two extra entries:

    - ``context['record']``: the record itself
    - ``context['stream']``: a BytesIO over ``record['payload']``

    :param base: the base context; it is copied, not modified
    :param record: the record to be parsed
    :returns: the new context
    '''
    payload_stream = BytesIO(record['payload'])
    context = dict(base)
    context['record'] = record
    context['stream'] = payload_stream
    return context
|
|
|
|
|
|
|
|
def parse_models(context, records):
    ''' Parse records into models, dropping the per-record contexts.

    :param context: the base parsing context
    :param records: an iterable of records
    :returns: a generator of the models yielded by parse_models_intern()
    '''
    for _record_context, parsed in parse_models_intern(context, records):
        yield parsed
|
|
|
|
|
|
|
|
def parse_models_intern(context, records):
    ''' Parse records into ``(record context, model)`` pairs.

    Each record gets its own context from init_record_parsing_context() and
    is parsed by parse_models_with_parent(). Payload bytes still unread
    after parsing are stored in ``model['unparsed']``.

    :param context: the base parsing context, shared by all records
    :param records: an iterable of records
    :returns: a generator of ``(context, model)`` pairs
    '''
    # The generator expression below reads `context` lazily, so the loop
    # must not rebind that name. If it did, every record after the first
    # would be initialised from the previous record's context instead of
    # the caller's base context.
    context_models = ((init_record_parsing_context(context, record), record)
                      for record in records)
    context_models = parse_models_with_parent(context_models)
    for record_context, model in context_models:
        unparsed = record_context['stream'].read()
        if unparsed:
            model['unparsed'] = unparsed
        yield record_context, model
|
|
|
|
|
|
|
|
def parse_models_with_parent(context_models):
    ''' Attach a parent to each context, then parse its model.

    :param context_models: an iterable of ``(context, model)`` pairs; each
        model must have a ``level`` entry
    :returns: a generator of the same pairs, each parsed by parse_model()
        after ``context['parent']`` has been set
    '''
    keyed_by_level = ((mdl['level'], (ctx, mdl))
                      for ctx, mdl in context_models)
    # The root item is an empty (context, model) pair.
    virtual_root = ({}, {})
    lineages = prefix_ancestors_from_level(keyed_by_level, virtual_root)
    for lineage, pair in lineages:
        ctx, mdl = pair
        # The last entry of the ancestors sequence is used as the parent.
        ctx['parent'] = lineage[-1]
        parse_model(ctx, mdl)
        yield ctx, mdl
|
|
|
|
|
|
|
|
def parse_model(context, model):
    ''' Determine the model type from the HWPTAG, then do the basic parsing.

    Stores a value resolver for ``context['stream']`` in
    ``context['resolve_values']``, runs resolve_model_events() through
    raise_on_errorevent() and keeps the resulting events in
    ``model['binevents']``.

    :param context: the record parsing context; must contain ``stream``
    :param model: the model dict, updated in place
    '''
    context['resolve_values'] = resolve_values_from_stream(context['stream'])
    checked_events = raise_on_errorevent(
        context, resolve_model_events(context, model))
    # Exhausting the events is what performs the parsing.
    model['binevents'] = list(checked_events)

    logger.debug('model: %s', model['type'].__name__)
    logger.debug('%s', model['content'])
|
|
|
|
|
|
|
|
def raise_on_errorevent(context, events):
    ''' Pass events through, raising ParseError after an ERROREVENT.

    :param context: the parsing context; its ``path``, ``treegroup`` and
        ``record`` entries (if any) are copied onto the raised error
    :param events: an iterable of ``(event, item)`` pairs
    :returns: a generator of the same ``(event, item)`` pairs
    :raises ParseError: once an ERROREVENT has been yielded; the error
        carries ``cause``, ``path``, ``treegroup``, ``record``, ``offset``
        and ``binevents`` (all events seen so far)
    '''
    seen = []
    for ev, item in events:
        # The event is handed to the consumer before it is inspected, so
        # the ERROREVENT itself is also yielded before the raise.
        yield ev, item
        seen.append((ev, item))
        if ev is not ERROREVENT:
            continue

        cause = item['exception']
        error = ParseError('can\'t parse %s' % item['type'])
        error.cause = cause
        error.path = context.get('path')
        error.treegroup = context.get('treegroup')
        error.record = context.get('record')
        error.offset = item.get('bin_offset')
        error.binevents = seen
        raise error
|
|
|
|
|
|
|
|
def resolve_models(context, records):
    ''' Generate parse events for records, one context per record.

    For each record a context is made from ``context`` plus ``record`` and
    ``model`` (a copy of the record). The generator then yields:

    - ``(STARTEVENT, record context)``
    - every event from resolve_model_events()
    - ``(ENDEVENT, record context)``, with ``value`` set to the item of the
      last resolved event

    resolve_model_events() reads ``resolve_values`` from the record context
    and this function does not set it. The consumer has to store it on the
    context it receives with STARTEVENT, as ModelStream.parse_model_events()
    does.

    :param context: the base context, shared by all records
    :param records: an iterable of records with ``level`` and ``tagname``
    :returns: a generator of ``(event, item)`` pairs
    '''
    # The generator expression reads `context` lazily, so the loop below
    # uses its own name. Rebinding `context` there would make every later
    # record inherit the previous record's context, stale keys included.
    model_contexts = (dict(context, record=record, model=dict(record))
                      for record in records)

    level_prefixed = ((model_context['model']['level'], model_context)
                      for model_context in model_contexts)
    root_item = {}
    ancestors_prefixed = prefix_ancestors_from_level(level_prefixed, root_item)
    for ancestors, model_context in ancestors_prefixed:
        parent = ancestors[-1]
        # Same (context, model) shape that resolve_model_events() unpacks.
        model_context['parent'] = parent, parent.get('model', {})

        record_frame = model_context['record']
        model_context['type'] = RecordModel
        model_context['name'] = record_frame['tagname']
        yield STARTEVENT, model_context
        for x in resolve_model_events(model_context, model_context['model']):
            yield x
        # The item of the last resolved event becomes the record's value.
        _event, item = x
        model_context['value'] = item
        yield ENDEVENT, model_context
|
|
|
|
|
|
|
|
def resolve_model_events(context, model):
    ''' Resolve a model's type and content, yielding the parse events.

    The type is looked up in ``tag_models`` by ``model['tagid']``, falling
    back to UnknownTagModel. If that type has ``extension_types`` and its
    ``get_extension_key()`` selects one, the extension's own classes are
    resolved too, their values are merged into ``model['content']`` and
    ``model['type']`` becomes the extension. Finally the parent type's
    ``on_child`` hook, if any, is called.

    :param context: the record context; must contain ``resolve_values``
    :param model: the model dict; ``type`` and ``content`` are set in place
    :returns: a generator of ``(event, item)`` pairs
    '''
    resolve_values = context['resolve_values']

    base_type = tag_models.get(model['tagid'], UnknownTagModel)
    model['type'] = base_type

    for ev, item in resolve_type_events(base_type, context, resolve_values):
        yield ev, item

    # The value of the last event is the content of the base type.
    model['content'] = item['value']

    extension = None
    extension_types = getattr(model['type'], 'extension_types', None)
    if extension_types:
        key = model['type'].get_extension_key(context, model)
        extension = extension_types.get(key)

    if extension is not None:
        # The base type has been read already, so only the classes between
        # it and the extension are resolved, base-most first.
        for cls in get_extension_mro(extension, model['type']):
            for ev, item in resolve_type_events(cls, context, resolve_values):
                yield ev, item
            extra = item['value']
            model['content'].update(extra)
        model['type'] = extension

    if 'parent' in context:
        parent_context, parent_model = context['parent']
        parent_type = parent_model.get('type')
        parent_content = parent_model.get('content')

        on_child = getattr(parent_type, 'on_child', None)
        if on_child:
            on_child(parent_content, parent_context, (context, model))
|
|
|
|
|
|
|
|
def get_extension_mro(cls, up_to_cls=None):
    ''' List the classes of ``cls``'s MRO that define ``attributes``.

    The MRO is walked from ``cls`` until ``up_to_cls`` is reached
    (exclusive). Only classes with ``attributes`` in their own ``__dict__``
    are kept.

    :param cls: the class whose MRO is inspected
    :param up_to_cls: stop before this class; None walks the whole MRO
    :returns: an iterator over the kept classes, base-most first
    '''
    selected = []
    for klass in inspect.getmro(cls):
        if klass is up_to_cls:
            break
        if 'attributes' in klass.__dict__:
            selected.append(klass)
    return reversed(selected)
|
|
|
|
|
|
|
|
class ModelJsonEncoder(json.JSONEncoder):
    ''' JSON encoder that serializes ``bytes`` as latin1-decoded text. '''

    def default(self, obj):
        ''' Decode bytes as latin1; defer anything else to JSONEncoder. '''
        if not isinstance(obj, bytes):
            return json.JSONEncoder.default(self, obj)
        return obj.decode('latin1')
|
|
|
|
|
|
|
|
def model_to_json(model, *args, **kwargs):
    ''' Convert a model to a JSON string.

    A shallow copy of the model is serialized: ``type`` is replaced by the
    type's name, ``payload`` and ``unparsed`` (if present) by their
    dumpbytes() listing, and ``binevents`` is left out.

    :param model: the model dict; it is not modified
    :param args: extra positional arguments for ``json.dumps``
    :param kwargs: extra keyword arguments for ``json.dumps``; ``cls`` is
        always set to ModelJsonEncoder
    :returns: the JSON text
    '''
    kwargs['cls'] = ModelJsonEncoder
    serializable = dict(model)
    serializable['type'] = serializable['type'].__name__
    serializable['payload'] = list(dumpbytes(serializable['payload']))
    if 'unparsed' in serializable:
        serializable['unparsed'] = list(dumpbytes(serializable['unparsed']))
    serializable.pop('binevents', None)
    return json.dumps(serializable, *args, **kwargs)
|
|
|
|
|
|
|
|
def chain_iterables(iterables):
    ''' Lazily yield the items of each iterable in turn.

    :param iterables: an iterable of iterables
    :returns: a generator over all items, in order
    '''
    for group in iterables:
        for element in group:
            yield element
|
|
|
|
|
|
|
|
class ModelStream(recordstream.RecordStream):
    ''' A record stream whose records can be parsed into models. '''

    def models(self, **kwargs):
        ''' Parse the records of this stream into models.

        :param kwargs: entries of the base parsing context. ``version``
            defaults to ``self.version`` and ``path`` to ``self.path`` when
            the stream has that attribute. If ``treegroup`` is given and is
            not None, only that tree group is parsed.
        :returns: an iterable of models
        '''
        kwargs.setdefault('version', self.version)
        try:
            kwargs.setdefault('path', self.path)
        except AttributeError:
            # Without a path attribute the context simply has no 'path'.
            pass
        treegroup = kwargs.get('treegroup', None)
        if treegroup is not None:
            records = self.records_treegroup(treegroup)
            models = parse_models(kwargs, records)
        else:
            groups = self.models_treegrouped(**kwargs)
            models = chain_iterables(groups)
        return models

    def models_treegrouped(self, **kwargs):
        ''' iterable of iterable of the models, grouped by the top-level tree

        :param kwargs: entries of the base parsing context; ``version``
            defaults to ``self.version``. ``treegroup`` is set per group.
        '''
        kwargs.setdefault('version', self.version)
        for group_idx, records in enumerate(self.records_treegrouped()):
            # parse_models() is lazy, so each group gets its own context.
            # With one shared dict, a group consumed after the outer
            # generator advanced would see a later group's index.
            group_context = dict(kwargs, treegroup=group_idx)
            yield parse_models(group_context, records)

    def model(self, idx):
        ''' Return the model at position ``idx`` of models(), via nth(). '''
        return nth(self.models(), idx)

    def models_json(self, **kwargs):
        ''' Wrap models() in a JsonObjects that uses model_to_json().

        :param kwargs: passed on to models()
        '''
        models = self.models(**kwargs)
        return JsonObjects(models, model_to_json)

    def other_formats(self):
        ''' Extend the base class's formats with a ``.models`` entry.

        :returns: the dict from the base class, with ``'.models'`` mapped
            to the ``open`` method of models_json()
        '''
        d = super(ModelStream, self).other_formats()
        d['.models'] = self.models_json().open
        return d

    def parse_model_events(self):
        ''' Generate the resolve_models() events of every tree group.

        For items whose ``type`` is RecordModel, a STARTEVENT item gets a
        payload ``stream`` and its ``resolve_values``. An ENDEVENT item gets
        ``leftover``: the stream offset and the bytes left unread.

        :returns: a generator of ``(event, item)`` pairs
        '''
        context = dict(version=self.version)

        for group_idx, records in enumerate(self.records_treegrouped()):
            group_context = dict(context, treegroup=group_idx)
            for x in resolve_models(group_context, records):
                event, item = x
                if item['type'] is RecordModel:
                    if event is STARTEVENT:
                        # resolve_model_events() reads 'resolve_values'
                        # from this item once the generator is resumed.
                        record_frame = item['record']
                        stream = BytesIO(record_frame['payload'])
                        resolve_values = resolve_values_from_stream(stream)
                        item['stream'] = stream
                        item['resolve_values'] = resolve_values
                    elif event is ENDEVENT:
                        stream = item['stream']
                        item['leftover'] = {
                            'offset': stream.tell(),
                            'bytes': stream.read()
                        }
                yield x
|
|
|
|
|
|
|
|
class DocInfo(ModelStream):
    ''' Model stream with lookups for id mappings, face names and
    character shapes.

    Every property below runs ``self.models()`` again on each access.
    '''

    @property
    def idmappings(self):
        ''' The first model whose type is IdMappings, or None. '''
        for model in self.models():
            if model['type'] is IdMappings:
                return model
        return None

    @property
    def facenames_by_lang(self):
        ''' FaceName models grouped by language.

        The FaceName models are split, in stream order, into consecutive
        runs whose lengths are the ``<lang>_fonts`` counts of the
        IdMappings content.

        :returns: a dict mapping each language key to its FaceName models
        '''
        facenames = [m for m in self.models() if m['type'] is FaceName]
        # Looked up once: each access to self.idmappings runs models()
        # again, which the original did once per language.
        font_counts = self.idmappings['content']
        languages = 'ko', 'en', 'cn', 'jp', 'other', 'symbol', 'user'
        facenames_by_lang = dict()
        offset = 0
        for lang in languages:
            n_fonts = font_counts[lang + '_fonts']
            facenames_by_lang[lang] = facenames[offset:offset + n_fonts]
            offset += n_fonts
        return facenames_by_lang

    @property
    def charshapes(self):
        ''' A generator of the models whose type is CharShape. '''
        return (m for m in self.models()
                if m['type'] is CharShape)

    def get_charshape(self, charshape_id):
        ''' Return the CharShape model at position ``charshape_id``. '''
        return nth(self.charshapes, charshape_id)

    def charshape_lang_facename(self, charshape_id, lang):
        ''' Return the FaceName model a character shape uses for ``lang``.

        :param charshape_id: position of the CharShape model
        :param lang: a language key of facenames_by_lang
        '''
        charshape = self.get_charshape(charshape_id)
        lang_facename_offset = charshape['content']['font_face'][lang]
        return self.facenames_by_lang[lang][lang_facename_offset]
|
|
|
|
|
|
|
|
class Sections(recordstream.Sections):
    ''' recordstream.Sections with ModelStream as its section class. '''

    # NOTE(review): presumably the class the base uses to wrap each section
    # stream; confirm in recordstream.Sections.
    section_class = ModelStream
|
|
|
|
|
|
|
|
class Hwp5File(recordstream.Hwp5File):
    ''' recordstream.Hwp5File using the model-aware stream classes. '''

    # NOTE(review): presumably the classes the base uses for the DocInfo
    # and BodyText parts of the file; confirm in recordstream.Hwp5File.
    docinfo_class = DocInfo
    bodytext_class = Sections
|
|
|
|
|
|
|
|
def create_context(file=None, **context):
    ''' Build a parsing context, taking ``version`` from a file if given.

    :param file: optional object exposing ``fileheader.version``; when
        given, its version replaces any ``version`` passed in ``context``
    :param context: initial context entries
    :returns: the context dict, which always holds a ``version`` entry
    :raises AssertionError: if neither ``file`` nor ``version`` is given
    '''
    if file is not None:
        context['version'] = file.fileheader.version
    # An explicit raise keeps this check alive under `python -O`, where an
    # `assert` statement is removed. AssertionError is kept so existing
    # callers see the same exception type.
    if 'version' not in context:
        raise AssertionError('no version: pass file= or version=')
    return context
|
|
|