cobiz / src /hwp5 /msoleprops.py
seawolf2357's picture
Add src
d94b56e verified
# -*- coding: utf-8 -*-
#
# pyhwp : hwp file format parser in python
# Copyright (C) 2010-2023 mete0r <https://github.com/mete0r>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
from collections import namedtuple
from datetime import datetime
from datetime import timedelta
from uuid import UUID
import logging
import struct
from hwp5.dataio import Struct
from hwp5.dataio import Flags
from hwp5.dataio import N_ARRAY
from hwp5.dataio import ARRAY
from hwp5.dataio import BYTE
from hwp5.dataio import UINT16
from hwp5.dataio import UINT32
from hwp5.dataio import INT32
from hwp5.bintype import read_type
logger = logging.getLogger(__name__)
vt_types = dict()
def PropertyType(code):
def decorator(cls):
cls.code = code
vt_types[code] = cls
return cls
return decorator
@PropertyType(code=0x0003)
class VT_I4(object):
@classmethod
def read_value(cls, context, f):
return read_type(INT32, context, f)
@PropertyType(code=0x001F)
class VT_LPWSTR(object):
@classmethod
def read_value(cls, context, f):
length = read_type(UINT32, context, f)
data = f.read(length * 2)
return data.decode('utf-16le')[:-1] # remove null character
@PropertyType(code=0x0040)
class VT_FILETIME(object):
@classmethod
def read_value(cls, context, f):
lword = read_type(UINT32, context, f)
hword = read_type(UINT32, context, f)
value = hword << 32 | lword
value = FILETIME(value)
return value
class FILETIME(object):
__slots__ = ('value', )
def __init__(self, value):
self.value = value
def __str__(self):
return str(self.datetime)
@property
def datetime(self):
return (
datetime(1601, 1, 1, 0, 0, 0) +
timedelta(microseconds=self.value / 10)
)
PropertyIdentifier = namedtuple('PropertyIdentifier', [
'id',
'label',
])
PID_DICTIONARY = PropertyIdentifier(
id=0x00000000,
label='PID_DICTIONARY',
)
PID_CODEPAGE = PropertyIdentifier(
id=0x00000001,
label='PID_CODEPAGE',
)
PID_LOCALE = PropertyIdentifier(
id=0x80000000,
label='PID_LOCALE',
)
PID_BEHAVIOR = PropertyIdentifier(
id=0x80000003,
label='PID_BEHAVIOR',
)
PIDSI_TITLE = PropertyIdentifier(
id=0x02,
label='PIDSI_TITLE'
)
PIDSI_SUBJECT = PropertyIdentifier(
id=0x03,
label='PIDSI_SUBJECT'
)
PIDSI_AUTHOR = PropertyIdentifier(
id=0x04,
label='PIDSI_AUTHOR'
)
PIDSI_KEYWORDS = PropertyIdentifier(
id=0x05,
label='PIDSI_KEYWORDS'
)
PIDSI_COMMENTS = PropertyIdentifier(
id=0x06,
label='PIDSI_COMMENTS'
)
PIDSI_TEMPLATE = PropertyIdentifier(
id=0x07,
label='PIDSI_TEMPLATE'
)
PIDSI_LASTAUTHOR = PropertyIdentifier(
id=0x08,
label='PIDSI_LASTAUTHOR'
)
PIDSI_REVNUMBER = PropertyIdentifier(
id=0x09,
label='PIDSI_REVNUMBER'
)
PIDSI_EDITTIME = PropertyIdentifier(
id=0x0a,
label='PIDSI_EDITTIME'
)
PIDSI_LASTPRINTED = PropertyIdentifier(
id=0x0b,
label='PIDSI_LASTPRINTED'
)
PIDSI_CREATE_DTM = PropertyIdentifier(
id=0x0c,
label='PIDSI_CREATE_DTM'
)
PIDSI_LASTSAVE_DTM = PropertyIdentifier(
id=0x0d,
label='PIDSI_LASTSAVE_DTM'
)
PIDSI_PAGECOUNT = PropertyIdentifier(
id=0x0e,
label='PIDSI_PAGECOUNT'
)
PIDSI_WORDCOUNT = PropertyIdentifier(
id=0x0f,
label='PIDSI_WORDCOUNT'
)
PIDSI_CHARCOUNT = PropertyIdentifier(
id=0x10,
label='PIDSI_CHARCOUNT'
)
PIDSI_THUMBNAIL = PropertyIdentifier(
id=0x11,
label='PIDSI_THUMBNAIL'
)
PIDSI_APPNAME = PropertyIdentifier(
id=0x12,
label='PIDSI_APPNAME'
)
PIDSI_SECURITY = PropertyIdentifier(
id=0x13,
label='PIDSI_SECURITY'
)
RESERVED_PROPERTIES = (
PID_DICTIONARY,
PID_CODEPAGE,
PID_LOCALE,
PID_BEHAVIOR,
)
SUMMARY_INFORMATION_PROPERTIES = (
PIDSI_TITLE,
PIDSI_SUBJECT,
PIDSI_AUTHOR,
PIDSI_KEYWORDS,
PIDSI_COMMENTS,
PIDSI_TEMPLATE,
PIDSI_LASTAUTHOR,
PIDSI_REVNUMBER,
PIDSI_EDITTIME,
PIDSI_LASTPRINTED,
PIDSI_CREATE_DTM,
PIDSI_LASTSAVE_DTM,
PIDSI_PAGECOUNT,
PIDSI_WORDCOUNT,
PIDSI_CHARCOUNT,
PIDSI_THUMBNAIL,
PIDSI_APPNAME,
PIDSI_SECURITY,
)
class Property(object):
def __init__(self, desc, idLabel, type, value):
self.desc = desc
self.idLabel = idLabel
self.type = type
self.value = value
@property
def id(self):
return self.desc.id
class PropertyDesc(Struct):
def __init__(self, id, offset):
self.id = id
self.offset = offset
@classmethod
def fromDict(cls, d):
return cls(id=d['id'], offset=d['offset'])
def attributes():
yield UINT32, 'id'
yield UINT32, 'offset' # offset from section start
attributes = staticmethod(attributes)
class PropertyReader(object):
def __init__(self, propsetDesc, propDesc, idLabel, codepage,
displayName=None):
self.propsetDesc = propsetDesc
self.propDesc = propDesc
self.idLabel = idLabel
self.codepage = codepage
self.displayName = displayName
def read(self, f):
f.seek(self.propsetDesc.offset + self.propDesc.offset)
context = {}
propType = read_type(TypedPropertyValue, context, f)
propType = TypedPropertyValue.fromDict(propType)
vt_type = vt_types[propType.code]
propValue = vt_type.read_value(context, f)
return Property(
desc=self.propDesc,
idLabel=self.idLabel,
type=propType,
value=propValue,
)
class TypedPropertyValue(Struct):
'''
[MS-OLEPS] 2.15 TypedPropertyValue
'''
def __init__(self, code):
self.code = code
@classmethod
def fromDict(cls, d):
return cls(code=d['type'].code)
TypeFlags = Flags(UINT32,
0, 16, 'code')
def attributes(cls):
yield cls.TypeFlags, 'type'
attributes = classmethod(attributes)
@property
def vt_type(self):
try:
return vt_types[self.code]
except KeyError:
return None
class DictionaryEntry(Struct):
'''
[MS-OLEPS] 2.16 DictionaryEntry
'''
def __init__(self, id, name):
self.id = id
self.name = name
@classmethod
def fromDict(cls, d):
return cls(
id=d['id'],
name=nullterminated_string(d['name']),
)
def attributes():
from hwp5.dataio import N_ARRAY
from hwp5.dataio import BYTE
yield UINT32, 'id'
yield N_ARRAY(UINT32, BYTE), 'name'
attributes = staticmethod(attributes)
class Dictionary(Struct):
'''
[MS-OLEPS] 2.17 Dictionary
'''
def __init__(self, entries):
self.entries = entries
@classmethod
def fromDict(cls, d):
entries = tuple(
DictionaryEntry.fromDict(entry)
for entry in d['entries']
)
return cls(entries=entries)
def attributes():
from hwp5.dataio import N_ARRAY
yield N_ARRAY(UINT32, DictionaryEntry), 'entries'
attributes = staticmethod(attributes)
def get(self, id, defvalue=None):
for entry in self.entries:
if id == entry.id:
return entry.name
return defvalue
class DictionaryReader(object):
def __init__(self, propsetDesc, propDesc, idLabel, codepage):
self.propsetDesc = propsetDesc
self.propDesc = propDesc
self.idLabel = idLabel
self.codepage = codepage
def read(self, f):
propsetDesc = self.propsetDesc
propDesc = self.propDesc
idLabel = self.idLabel
f.seek(propsetDesc.offset + propDesc.offset)
context = {}
propType = None
propValue = read_type(Dictionary, context, f)
propValue = Dictionary.fromDict(propValue)
return Property(
desc=propDesc,
idLabel=idLabel,
type=propType,
value=propValue,
)
class PropertySet(object):
'''
[MS-OLEPS] 2.20 PropertySet
'''
def __init__(self, desc, header, properties):
self.desc = desc
self.header = header
self.properties = properties
@property
def fmtid(self):
return self.desc.fmtid
def __getitem__(self, propertyIdentifier):
for property in self.properties:
if property.id == propertyIdentifier.id:
return property.value
raise KeyError(propertyIdentifier)
class PropertySetHeader(Struct):
def __init__(self, bytesize, propDescList):
self.bytesize = bytesize,
self.propDescList = propDescList
@classmethod
def fromDict(cls, d):
return cls(
bytesize=d['bytesize'],
propDescList=tuple(
PropertyDesc.fromDict(
propDesc
)
for propDesc in d['propDescList']
),
)
def attributes():
from hwp5.dataio import N_ARRAY
yield UINT32, 'bytesize'
yield N_ARRAY(UINT32, PropertyDesc), 'propDescList'
attributes = staticmethod(attributes)
class PropertySetDesc(Struct):
def __init__(self, fmtid, offset):
self.fmtid = fmtid
self.offset = offset
def attributes():
yield ARRAY(BYTE, 16), 'fmtid'
yield UINT32, 'offset'
attributes = staticmethod(attributes)
@classmethod
def fromDict(cls, d):
return cls(
fmtid=uuid_from_bytes_tuple(d['fmtid']),
offset=d['offset'],
)
class PropertySetStreamHeader(Struct):
def __init__(self, byteOrder, version, systemIdentifier, clsid,
propsetDescList):
self.byteOrder = byteOrder
self.version = version
self.systemIdentifier = systemIdentifier
self.clsid = clsid
self.propsetDescList = propsetDescList
@classmethod
def fromDict(cls, d):
return cls(
byteOrder=d['byteOrder'],
version=d['version'],
systemIdentifier=d['systemIdentifier'],
clsid=uuid_from_bytes_tuple(d['clsid']),
propsetDescList=tuple(
PropertySetDesc.fromDict(
propsetDesc
)
for propsetDesc in d['propsetDescList']
)
)
def attributes():
yield UINT16, 'byteOrder'
yield UINT16, 'version'
yield UINT32, 'systemIdentifier'
yield ARRAY(BYTE, 16), 'clsid'
yield N_ARRAY(UINT32, PropertySetDesc), 'propsetDescList'
attributes = staticmethod(attributes)
class PropertySetStream(object):
'''
[MS-OLEPS] 2.21 PropertySetStream
'''
def __init__(self, header, propertysets):
self.header = header
self.propertysets = propertysets
@property
def byteOrder(self):
return self.header.byteOrder
@property
def version(self):
return self.header.version
@property
def systemIdentifier(self):
return self.header.systemIdentifier
@property
def clsid(self):
return self.header.clsid
class PropertySetFormat(object):
def __init__(self, fmtid, propertyIdentifiers):
self.fmtid = fmtid
self.propertyIdentifiers = propertyIdentifiers
@property
def idLabels(self):
return {
p.id: p.label
for p in self.propertyIdentifiers
}
class PropertySetStreamReader(object):
def __init__(self, propertySetFormats):
self.propertySetFormats = {
propsetFormat.fmtid: propsetFormat
for propsetFormat in propertySetFormats
}
def read(self, f):
context = {}
streamHeader = read_type(PropertySetStreamHeader, context, f)
streamHeader = PropertySetStreamHeader.fromDict(streamHeader)
propertysetList = list()
for propsetDesc in streamHeader.propsetDescList:
f.seek(propsetDesc.offset)
propsetHeader = read_type(PropertySetHeader, context, f)
propsetHeader = PropertySetHeader.fromDict(
propsetHeader,
)
try:
propsetFormat = self.propertySetFormats[propsetDesc.fmtid]
except KeyError:
idLabels = {}
else:
idLabels = propsetFormat.idLabels
properties = []
propDescMap = {
propDesc.id: propDesc
for propDesc in propsetHeader.propDescList
}
propDesc = propDescMap.pop(PID_CODEPAGE.id, None)
if propDesc is not None:
idLabel = idLabels.get(propDesc.id)
propReader = PropertyReader(
propsetDesc=propsetDesc,
propDesc=propDesc,
idLabel=idLabel,
codepage=None,
displayName=None,
)
prop = propReader.read(f)
properties.append(prop)
codepage = prop.value
else:
codepage = None
propDesc = propDescMap.pop(PID_DICTIONARY.id, None)
if propDesc is not None:
idLabel = idLabels.get(propDesc.id)
propReader = DictionaryReader(
propsetDesc,
propDesc,
idLabel,
codepage,
)
prop = propReader.read(f)
properties.append(prop)
dictionary = prop.value
else:
dictionary = None
for propDesc in propDescMap.values():
idLabel = idLabels.get(propDesc.id)
displayName = dictionary.get(propDesc.id, None)
propReader = PropertyReader(
propsetDesc=propsetDesc,
propDesc=propDesc,
idLabel=idLabel,
codepage=codepage,
displayName=displayName,
)
prop = propReader.read(f)
properties.append(prop)
propertyset = PropertySet(
desc=propsetDesc,
header=propsetHeader,
properties=properties,
)
propertysetList.append(propertyset)
return PropertySetStream(
header=streamHeader,
propertysets=propertysetList,
)
class PropertySetStreamTextFormatter(object):
def formatTextLines(self, stream):
yield '- ByteOrder: 0x%x' % stream.byteOrder
yield '- Version: %d' % stream.version
yield '- SystemIdentifier: 0x%08x' % stream.systemIdentifier
yield '- CLSID: %s' % stream.clsid
yield ''
for propertyset in stream.propertysets:
title = 'Property Set {}'.format(
propertyset.fmtid,
)
yield '- {:08x}: {}'.format(
propertyset.desc.offset,
title,
)
yield ' {}'.format(
'-' * len(title)
)
properties = sorted(
propertyset.properties,
key=lambda property: property.desc.offset,
)
for property in properties:
if property.id == PID_DICTIONARY.id:
yield '- {:08x}: {}(=0x{:08x}):'.format(
propertyset.desc.offset + property.desc.offset,
property.idLabel if property.idLabel is not None
else '',
property.id,
)
for entry in property.value.entries:
yield ' - {}: {}'.format(
entry.id,
entry.name,
)
else:
yield '- {:08x}: {}(=0x{:08x}): {}'.format(
propertyset.desc.offset + property.desc.offset,
property.idLabel if property.idLabel is not None
else '',
property.id,
property.value
)
def uuid_from_bytes_tuple(t):
fmt = 'B' * len(t)
fmt = '<' + fmt
bytes_le = struct.pack(fmt, *t)
return UUID(bytes_le=bytes_le)
def nullterminated_string(bs):
return ''.join(chr(x) for x in bs)[:-1]