Spaces:

seawolf2357
/

pyhwp1

Configuration error

App Files Files Community

pyhwp1 / src /hwp5 /xmlmodel.py

seawolf2357

Upload folder using huggingface_hub

619197a verified about 1 month ago

raw

history blame contribute delete

27.2 kB

	# -- coding: utf-8 --
	#
	# pyhwp : hwp file format parser in python
	# Copyright (C) 2010-2023 mete0r <https://github.com/mete0r>
	#
	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU Affero General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU Affero General Public License for more details.
	#
	# You should have received a copy of the GNU Affero General Public License
	# along with this program. If not, see <http://www.gnu.org/licenses/>.
	#
	from __future__ import absolute_import
	from __future__ import print_function
	from __future__ import unicode_literals
	from collections import deque
	from itertools import chain
	from pprint import pformat
	from tempfile import TemporaryFile
	import base64
	import logging
	import sys

	from . import binmodel
	from . import filestructure
	from .binmodel.controls import SectionDef
	from .binmodel.controls import TableControl
	from .binmodel.controls import GShapeObjectControl
	from .binmodel import BinData
	from .binmodel import ListHeader
	from .binmodel import Paragraph
	from .binmodel import Text
	from .binmodel import ShapeComponent
	from .binmodel import TableBody
	from .binmodel import TableCell
	from .binmodel import ParaText
	from .binmodel import ParaLineSeg
	from .binmodel import ParaCharShape
	from .binmodel import LineSeg
	from .binmodel import ParaRangeTag
	from .binmodel import Field
	from .binmodel import ControlChar
	from .binmodel import Control
	from .charsets import tokenize_unicode_by_lang
	from .dataio import Struct
	from .filestructure import VERSION
	from .treeop import STARTEVENT, ENDEVENT
	from .treeop import prefix_event
	from .treeop import build_subtree
	from .treeop import tree_events
	from .treeop import tree_events_multi
	from .xmlformat import startelement
	from .xmlformat import xmlevents_to_bytechunks


	PY3 = sys.version_info.major == 3
	if PY3:
	basestring = str
	unichr = chr


	logger = logging.getLogger(__name__)


	def give_elements_unique_id(event_prefixed_mac):
	paragraph_id = 0
	table_id = 0
	gshape_id = 0
	shape_id = 0
	for event, item in event_prefixed_mac:
	(model, attributes, context) = item
	if event == STARTEVENT:
	if model == Paragraph:
	attributes['paragraph_id'] = paragraph_id
	paragraph_id += 1
	elif model == TableControl:
	attributes['table_id'] = table_id
	table_id += 1
	elif model == GShapeObjectControl:
	attributes['gshape_id'] = gshape_id
	gshape_id += 1
	elif model == ShapeComponent:
	attributes['shape_id'] = shape_id
	shape_id += 1
	yield event, item


	def make_ranged_shapes(shapes):
	last = None
	for item in shapes:
	if last is not None:
	yield (last[0], item[0]), last[1]
	last = item
	yield (item[0], 0x7fffffff), item[1]


	def split_and_shape(chunks, ranged_shapes):
	try:
	(chunk_start, chunk_end), chunk_attr, chunk = next(chunks)
	except StopIteration:
	return
	for (shape_start, shape_end), shape in ranged_shapes:
	while True:
	# case 0: chunk has left intersection
	# vvvv
	# ----...
	if chunk_start < shape_start:
	assert False

	# case 1: chunk is far right: get next shape
	# vvvv
	# ----
	if shape_end <= chunk_start: # (1)
	break

	assert chunk_start < shape_end # by (1)
	assert shape_start <= chunk_start
	# case 2: chunk has left intersection
	# vvvv
	# ..----
	if shape_end < chunk_end: # (2)
	prev = ((chunk_start, shape_end),
	chunk[:shape_end - chunk_start])
	nexT = ((shape_end, chunk_end),
	chunk[shape_end - chunk_start:])
	(chunk_start, chunk_end), chunk = prev
	else:
	nexT = None

	assert chunk_end <= shape_end # by (2)
	yield (chunk_start, chunk_end), (shape, chunk_attr), chunk

	if nexT is not None:
	(chunk_start, chunk_end), chunk = nexT
	continue

	try:
	(chunk_start, chunk_end), chunk_attr, chunk = next(chunks)
	except StopIteration:
	return


	def line_segmented(chunks, ranged_linesegs):
	prev_lineseg = None
	line = None
	for ((chunk_start, chunk_end),
	(lineseg, chunk_attr),
	chunk) in split_and_shape(chunks, ranged_linesegs):
	if lineseg is not prev_lineseg:
	if line is not None:
	yield prev_lineseg, line
	line = []
	line.append(((chunk_start, chunk_end), chunk_attr, chunk))
	prev_lineseg = lineseg
	if line is not None:
	yield prev_lineseg, line


	def make_texts_linesegmented_and_charshaped(event_prefixed_mac):
	''' lineseg/charshaped text chunks '''

	stack = [] # stack of ancestor Paragraphs
	for event, item in event_prefixed_mac:
	model, attributes, context = item
	if model is Paragraph:
	if event == STARTEVENT:
	stack.append(dict())
	yield STARTEVENT, item
	else:
	paratext = stack[-1].get(ParaText)
	paracharshape = stack[-1].get(ParaCharShape)
	paralineseg = stack[-1].get(ParaLineSeg)
	# TODO: RangeTags are not used for now
	# pararangetag = stack[-1].get(ParaRangeTag)
	if paratext is None:
	paratext = (ParaText,
	dict(chunks=[((0, 0), '')]),
	dict(context))
	for x in merge_paragraph_text_charshape_lineseg(paratext,
	paracharshape,
	paralineseg):
	yield x

	yield ENDEVENT, (model, attributes, context)
	stack.pop()
	elif model in (ParaText, ParaCharShape, ParaLineSeg, ParaRangeTag):
	if event == STARTEVENT:
	stack[-1][model] = model, attributes, context
	else:
	yield event, (model, attributes, context)


	def merge_paragraph_text_charshape_lineseg(paratext, paracharshape,
	paralineseg):

	paratext_model, paratext_attributes, paratext_context = paratext

	chunks = ((range, None, chunk)
	for range, chunk in paratext_attributes['chunks'])
	charshapes = paracharshape[1]['charshapes']
	shaped_chunks = split_and_shape(chunks, make_ranged_shapes(charshapes))

	if paralineseg:
	paralineseg_content = paralineseg[1]
	paralineseg_context = paralineseg[2]
	else:
	# 배포용 문서의 더미 BodyText 에는 LineSeg 정보가 없음
	# (see https://github.com/mete0r/pyhwp/issues/33)
	# 더미 LineSeg를 만들어 준다
	lineseg = dict(chpos=0, y=0, height=0, height2=0, height85=0,
	space_below=0, x=0, width=0, a8=0, flags=0)
	paralineseg_content = dict(linesegs=[lineseg])
	paralineseg_context = dict()
	linesegs = ((lineseg['chpos'], lineseg)
	for lineseg in paralineseg_content['linesegs'])
	lined_shaped_chunks = line_segmented(shaped_chunks,
	make_ranged_shapes(linesegs))
	for lineseg_content, shaped_chunks in lined_shaped_chunks:
	lineseg = (LineSeg, lineseg_content, paralineseg_context)
	chunk_events = range_shaped_textchunk_events(paratext_context,
	shaped_chunks)
	for x in wrap_modelevents(lineseg, chunk_events):
	yield x


	def range_shaped_textchunk_events(paratext_context, range_shaped_textchunks):
	for (startpos, endpos), (shape, none), chunk in range_shaped_textchunks:
	if isinstance(chunk, basestring):
	textitem = (Text,
	dict(text=chunk, charshape_id=shape),
	paratext_context)
	yield STARTEVENT, textitem
	yield ENDEVENT, textitem
	elif isinstance(chunk, dict):
	code = chunk['code']
	uch = unichr(code)
	name = ControlChar.get_name_by_code(code)
	kind = ControlChar.kinds[uch]
	chunk_attributes = dict(name=name,
	code=code,
	kind=kind,
	charshape_id=shape)
	if code in (0x9, 0xa, 0xd): # http://www.w3.org/TR/xml/#NT-Char
	chunk_attributes['char'] = uch
	ctrlch = (ControlChar, chunk_attributes, paratext_context)
	yield STARTEVENT, ctrlch
	yield ENDEVENT, ctrlch


	def wrap_section(event_prefixed_mac, sect_id=None):
	''' wrap a section with SectionDef '''
	starting_buffer = list()
	started = False
	sectiondef = None
	for event, item in event_prefixed_mac:
	if started:
	yield event, item
	else:
	model, attributes, context = item
	if model is SectionDef and event is STARTEVENT:
	sectiondef, sectdef_child = build_subtree(event_prefixed_mac)
	if sect_id is not None:
	attributes['section_id'] = sect_id
	yield STARTEVENT, sectiondef
	for k in tree_events_multi(sectdef_child):
	yield k
	for evented_item in starting_buffer:
	yield evented_item
	started = True
	else:
	starting_buffer.append((event, item))
	yield ENDEVENT, sectiondef


	class ColumnSet:
	pass


	def wrap_columns(event_prefixed_mac):

	stack = []

	for event, item in event_prefixed_mac:
	model, attributes, context = item

	if model is Paragraph:
	if event is STARTEVENT:

	split = attributes['split']
	split = Paragraph.SplitFlags(split)

	if split.new_columnsdef:
	if stack[-1][0] is ColumnSet:
	yield ENDEVENT, stack.pop()

	columns = (ColumnSet, {}, {})
	stack.append(columns)
	yield STARTEVENT, columns

	else:
	if event is STARTEVENT:
	stack.append(item)
	else:
	if model != stack[-1][0]:
	assert stack[-1][0] is ColumnSet
	yield ENDEVENT, stack.pop()
	stack.pop()

	yield event, item


	def make_extended_controls_inline(event_prefixed_mac, stack=None):
	''' inline extended-controls into paragraph texts '''
	if stack is None:
	stack = [] # stack of ancestor Paragraphs
	for event, item in event_prefixed_mac:
	model, attributes, context = item
	if model is Paragraph:
	for x in meci_paragraph(event, stack, item):
	yield x
	elif model is ControlChar:
	for x in meci_controlchar(event, stack, item, attributes):
	yield x
	elif issubclass(model, Control) and event == STARTEVENT:
	control_subtree = build_subtree(event_prefixed_mac)
	paragraph = stack[-1]
	paragraph_controls = paragraph.setdefault(Control, [])
	paragraph_controls.append(control_subtree)
	else:
	yield event, item


	def meci_paragraph(event, stack, item):
	if event == STARTEVENT:
	stack.append(dict())
	yield STARTEVENT, item
	else:
	yield ENDEVENT, item
	stack.pop()


	def meci_controlchar(event, stack, item, attributes):
	if event is STARTEVENT:
	if attributes['kind'] is ControlChar.EXTENDED:
	paragraph = stack[-1]
	paragraph_controls = paragraph.get(Control)
	control_subtree = paragraph_controls.pop(0)
	tev = tree_events(*control_subtree)
	# to evade the Control/STARTEVENT trigger
	# in parse_models_pass3()
	yield next(tev)

	for k in make_extended_controls_inline(tev, stack):
	yield k
	else:
	yield STARTEVENT, item
	yield ENDEVENT, item


	def make_paragraphs_children_of_listheader(event_prefixed_mac,
	parentmodel=ListHeader,
	childmodel=Paragraph):
	''' make paragraphs children of the listheader '''
	stack = []
	level = 0
	for event, item in event_prefixed_mac:
	model, attributes, context = item
	if event is STARTEVENT:
	level += 1
	if len(stack) > 0 and ((event is STARTEVENT
	and stack[-1][0] == level
	and model is not childmodel) or
	(event is ENDEVENT
	and stack[-1][0] - 1 == level)):
	lh_level, lh_item = stack.pop()
	yield ENDEVENT, lh_item

	if issubclass(model, parentmodel):
	if event is STARTEVENT:
	stack.append((level, item))
	yield event, item
	else:
	pass
	else:
	yield event, item

	if event is ENDEVENT:
	level -= 1


	def match_field_start_end(event_prefixed_mac):
	stack = []
	for event, item in event_prefixed_mac:
	(model, attributes, context) = item
	if issubclass(model, Field):
	for x in mfse_field(event, stack, item):
	yield x
	elif model is LineSeg:
	for x in mfse_lineseg(event, stack, item):
	yield x
	elif model is ControlChar and attributes['name'] == 'FIELD_END':
	for x in mfse_field_end(event, stack, item):
	yield x
	else:
	yield event, item


	def mfse_field(event, stack, item):
	if event is STARTEVENT:
	stack.append(item)
	yield event, item
	else:
	pass


	def mfse_lineseg(event, stack, item):
	if event is ENDEVENT:
	# fields still not closed; temporarily close them
	for field_item in reversed(stack):
	yield ENDEVENT, field_item
	yield event, item
	elif event is STARTEVENT:
	yield event, item
	# fields temporarily closed; open them again
	for field_item in stack:
	yield STARTEVENT, field_item


	def mfse_field_end(event, stack, item):
	if event is ENDEVENT:
	if len(stack) > 0:
	yield event, stack.pop()
	else:
	logger.warning('unmatched field end')


	class TableRow:
	pass


	ROW_OPEN = 1
	ROW_CLOSE = 2


	def restructure_tablebody(event_prefixed_mac):
	''' Group table columns in each rows and wrap them with TableRow. '''
	stack = []
	for event, item in event_prefixed_mac:
	(model, attributes, context) = item
	if model is TableBody:
	for x in rstbody_tablebody(event, stack, item, attributes,
	context):
	yield x
	elif model is TableCell:
	for x in rstbody_tablecell(event, stack, item):
	yield x
	else:
	yield event, item


	def rstbody_tablebody(event, stack, item, attributes, context):
	if event is STARTEVENT:
	rowcols = deque()
	for cols in attributes.pop('rowcols'):
	if cols == 1:
	rowcols.append(ROW_OPEN \| ROW_CLOSE)
	else:
	rowcols.append(ROW_OPEN)
	for i in range(0, cols - 2):
	rowcols.append(0)
	rowcols.append(ROW_CLOSE)
	stack.append((context, rowcols))
	yield event, item
	else:
	yield event, item
	stack.pop()


	def rstbody_tablecell(event, stack, item):
	table_context, rowcols = stack[-1]
	row_context = dict(table_context)
	if event is STARTEVENT:
	how = rowcols[0]
	if how & ROW_OPEN:
	yield STARTEVENT, (TableRow, dict(), row_context)
	yield event, item
	if event is ENDEVENT:
	how = rowcols.popleft()
	if how & ROW_CLOSE:
	yield ENDEVENT, (TableRow, dict(), row_context)


	def tokenize_text_by_lang(event_prefixed_mac):
	''' Group table columns in each rows and wrap them with TableRow. '''
	for event, item in event_prefixed_mac:
	(model, attributes, context) = item
	if model is Text:
	if event is STARTEVENT:
	charshape_id = attributes['charshape_id']
	for lang, text in tokenize_unicode_by_lang(attributes['text']):
	token = (Text, {
	'charshape_id': charshape_id,
	'lang': lang,
	'text': text,
	}, context)
	yield STARTEVENT, token
	yield ENDEVENT, token
	else:
	yield event, item


	def embed_bindata(event_prefixed_mac, bindata):
	for event, item in event_prefixed_mac:
	(model, attributes, context) = item
	if event is STARTEVENT and model is BinData:
	if attributes['flags'].storage is BinData.StorageType.EMBEDDING:
	name = ('BIN%04X' % attributes['bindata']['storage_id']
	+ '.'
	+ attributes['bindata']['ext'])
	bin_stream = bindata[name].open()
	try:
	binary = bin_stream.read()
	finally:
	bin_stream.close()
	b64 = base64.b64encode(binary)
	b64 = b64.decode('ascii')
	truncated = []
	while b64:
	if len(b64) > 64:
	truncated.append(b64[:64])
	b64 = b64[64:]
	else:
	truncated.append(b64)
	b64 = ''
	b64 = '\n'.join(truncated)
	b64 = '\n' + b64 + '\n'
	attributes['bindata']['<text>'] = b64
	attributes['bindata']['inline'] = 'true'
	yield event, item


	def prefix_binmodels_with_event(context, models):
	level_prefixed = ((model['level'],
	(model['type'], model['content'], context))
	for model in models)
	return prefix_event(level_prefixed)


	def wrap_modelevents(wrapper_model, modelevents):
	yield STARTEVENT, wrapper_model
	for mev in modelevents:
	yield mev
	yield ENDEVENT, wrapper_model


	def modelevents_to_xmlevents(modelevents):
	for event, (model, attributes, context) in modelevents:
	try:
	if event is STARTEVENT:
	for x in startelement(context, (model, attributes)):
	yield x
	elif event is ENDEVENT:
	yield ENDEVENT, model.__name__
	except:
	logger.error('model: %s', pformat({
	'event': event,
	'model': model,
	'attributes': attributes,
	'context': context
	}))
	raise


	class XmlEvents(object):

	def __init__(self, events):
	self.events = events

	def __iter__(self):
	return modelevents_to_xmlevents(self.events)

	def bytechunks(self, xml_declaration=True, **kwargs):
	encoding = kwargs.get('xml_encoding', 'utf-8')
	if xml_declaration:
	yield '<?xml version="1.0" encoding="{}"?>\n'.format(
	encoding
	).encode(
	encoding
	)
	bytechunks = xmlevents_to_bytechunks(self, encoding)
	for chunk in bytechunks:
	yield chunk

	def dump(self, outfile, **kwargs):
	bytechunks = self.bytechunks(**kwargs)
	for chunk in bytechunks:
	outfile.write(chunk)
	if hasattr(outfile, 'flush'):
	outfile.flush()

	def open(self, **kwargs):
	tmpfile = TemporaryFile()
	try:
	self.dump(tmpfile, **kwargs)
	except:
	tmpfile.close()
	raise

	tmpfile.seek(0)
	return tmpfile


	class XmlEventsMixin(object):

	def xmlevents(self, **kwargs):
	return XmlEvents(self.events(**kwargs))


	class ModelEventStream(binmodel.ModelStream, XmlEventsMixin):

	def modelevents(self, **kwargs):
	models = self.models(**kwargs)

	# prepare modelevents context
	kwargs.setdefault('version', self.version)
	return prefix_binmodels_with_event(kwargs, models)

	def other_formats(self):
	d = super(ModelEventStream, self).other_formats()
	d['.xml'] = self.xmlevents().open
	return d


	class HwpSummaryInfo(filestructure.HwpSummaryInfo, XmlEventsMixin):

	def events(self, **context):
	generator = PropertySetStreamModelEventsGenerator(context)
	events = generator.generateModelEvents(self.propertySetStream)
	element = HwpSummaryInfo, {}, context
	return wrap_modelevents(element, events)


	class PropertySetStreamModelEventsGenerator(object):

	def __init__(self, context):
	self.context = context

	def generateModelEvents(self, stream):
	return self.getPropertySetStreamEvents(stream)

	def getPropertySetStreamEvents(self, stream):
	from .msoleprops import PropertySetStream
	sectionEvents = [
	self.getPropertySetEvents(propertyset)
	for propertyset in stream.propertysets
	]
	events = chain(*sectionEvents)

	content = dict(
	byte_order='{:04x}'.format(
	stream.byteOrder,
	),
	version=str(stream.version),
	system_identifier='{:08x}'.format(
	stream.systemIdentifier,
	),
	clsid=str(stream.clsid)
	)
	element = PropertySetStream, content, self.context
	return wrap_modelevents(element, events)

	def getPropertySetEvents(self, propertyset):
	from .msoleprops import PropertySet
	propertyEvents = [
	self.getPropertyEvents(property)
	for property in sorted(
	propertyset.properties,
	key=lambda property: property.desc.offset
	)
	]
	events = chain(*propertyEvents)

	content = dict(
	fmtid=propertyset.fmtid,
	offset=propertyset.desc.offset,
	)
	element = PropertySet, content, self.context
	return wrap_modelevents(element, events)

	def getPropertyEvents(self, property):
	from .msoleprops import PID_DICTIONARY
	from .msoleprops import Property
	content = dict(
	id=property.desc.id,
	offset=property.desc.offset,
	)
	if property.idLabel is not None:
	content['id_label'] = property.idLabel
	if property.type is not None:
	content['type'] = str(property.type.vt_type.__name__)
	content['type_code'] = '0x{:04x}'.format(property.type.code)
	if property.id == PID_DICTIONARY.id:
	events = self.getDictionaryEvents(property.value)
	else:
	events = ()
	content['value'] = property.value
	element = Property, content, self.context
	return wrap_modelevents(element, events)

	def getDictionaryEvents(self, dictionary):
	events = list(self.getDictionaryEntryEvents(entry)
	for entry in dictionary.entries)
	return chain(*events)

	def getDictionaryEntryEvents(self, entry):
	from .msoleprops import DictionaryEntry
	content = dict(
	id=entry.id,
	name=entry.name,
	)
	element = DictionaryEntry, content, self.context
	return wrap_modelevents(element, ())


	class DocInfo(ModelEventStream):

	def events(self, **kwargs):
	docinfo = DocInfo, dict(), dict()
	events = self.modelevents(**kwargs)
	if 'embedbin' in kwargs:
	events = embed_bindata(events, kwargs['embedbin'])
	events = wrap_modelevents(docinfo, events)
	return events


	class Section(ModelEventStream):

	def events(self, **kwargs):
	events = self.modelevents(**kwargs)

	events = make_texts_linesegmented_and_charshaped(events)
	events = make_extended_controls_inline(events)
	events = match_field_start_end(events)
	events = make_paragraphs_children_of_listheader(events)
	events = make_paragraphs_children_of_listheader(events, TableBody,
	TableCell)
	events = restructure_tablebody(events)
	events = tokenize_text_by_lang(events)

	section_idx = kwargs.get('section_idx')
	events = wrap_section(events, section_idx)
	events = wrap_columns(events)

	return events


	class Sections(binmodel.Sections, XmlEventsMixin):

	section_class = Section

	def events(self, **kwargs):
	bodytext_events = []
	for idx in self.section_indexes():
	kwargs['section_idx'] = idx
	section = self.section(idx)
	events = section.events(**kwargs)
	bodytext_events.append(events)

	class BodyText(object):
	pass
	bodytext_events = chain(*bodytext_events)
	bodytext = BodyText, dict(), dict()
	return wrap_modelevents(bodytext, bodytext_events)

	def other_formats(self):
	d = super(Sections, self).other_formats()
	d['.xml'] = self.xmlevents().open
	return d


	class HwpDoc(Struct):

	def attributes():
	yield VERSION, 'version'
	attributes = staticmethod(attributes)


	class Hwp5File(binmodel.Hwp5File, XmlEventsMixin):

	summaryinfo_class = HwpSummaryInfo
	docinfo_class = DocInfo
	bodytext_class = Sections

	def events(self, **kwargs):
	if 'embedbin' in kwargs and kwargs['embedbin'] and 'BinData' in self:
	kwargs['embedbin'] = self['BinData']
	else:
	kwargs.pop('embedbin', None)

	events = chain(self.summaryinfo.events(**kwargs),
	self.docinfo.events(**kwargs),
	self.text.events(**kwargs))

	hwpdoc = HwpDoc, dict(version=self.header.version), dict()
	events = wrap_modelevents(hwpdoc, events)

	# for easy references in styles
	events = give_elements_unique_id(events)

	return events