|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
''' Find record models with specified predicates. |
|
|
|
|
|
Usage:: |
|
|
|
|
|
hwp5proc find [--model=<model-name> | --tag=<hwptag>] |
|
|
[--incomplete] [--dump] [--format=<format>] |
|
|
[--loglevel=<loglevel>] [--logfile=<logfile>] |
|
|
(--from-stdin | <hwp5files>...) |
|
|
hwp5proc find --help |
|
|
|
|
|
Options:: |
|
|
|
|
|
-h --help Show this screen |
|
|
--loglevel=<level> Set log level. |
|
|
--logfile=<file> Set log file. |
|
|
|
|
|
--from-stdin get filenames from stdin |
|
|
|
|
|
--model=<model-name> filter with record model name |
|
|
--tag=<hwptag> filter with record HWPTAG |
|
|
--incomplete filter with incompletely parsed content |
|
|
|
|
|
--format=<format> record output format |
|
|
%(filename)s %(stream)s %(seqno)s %(type)s |
|
|
--dump dump record |
|
|
|
|
|
<hwp5files>... HWPv5 files (*.hwp) |
|
|
|
|
|
Example: Find paragraphs:: |
|
|
|
|
|
$ hwp5proc find --model=Paragraph samples/*.hwp |
|
|
$ hwp5proc find --tag=HWPTAG_PARA_TEXT samples/*.hwp |
|
|
$ hwp5proc find --tag=66 samples/*.hwp |
|
|
|
|
|
Example: Find and dump records of ``HWPTAG_LIST_HEADER`` which is parsed |
|
|
incompletely:: |
|
|
|
|
|
$ hwp5proc find --tag=HWPTAG_LIST_HEADER --incomplete --dump samples/*.hwp |
|
|
|
|
|
''' |
|
|
from __future__ import absolute_import |
|
|
from __future__ import print_function |
|
|
from __future__ import unicode_literals |
|
|
from functools import partial |
|
|
import logging |
|
|
import itertools |
|
|
import sys |
|
|
|
|
|
from ..binmodel import Hwp5File |
|
|
from ..binmodel import model_to_json |
|
|
from ..bintype import log_events |
|
|
from ..dataio import ParseError |
|
|
from ..tagids import tagnames |
|
|
|
|
|
|
|
|
# True when the interpreter's major version is 2.
PY2 = sys.version_info[0] == 2

# Lazy filter/map helpers: taken from itertools on Python 2, and the
# builtins otherwise.
ifilter = itertools.ifilter if PY2 else filter
imap = itertools.imap if PY2 else map


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
def main(args):
    '''Print every record model that satisfies all requested conditions.

    The filenames, the filter conditions and the printer are all derived
    from ``args``.  Each file is processed in turn; a ``ParseError`` raised
    while processing a file is reported through the module logger and the
    loop moves on to the next file.

    :param args: namespace holding the parsed ``find`` options.
    '''
    filenames = filenames_from_args(args)
    predicates = list(conditions_from_args(args))

    def satisfies_all(model):
        # With no predicates at all, every model is accepted.
        return all(predicate(model) for predicate in predicates)

    emit = printer_from_args(args)

    for filename in filenames:
        try:
            for model in ifilter(satisfies_all, hwp5file_models(filename)):
                emit(model)
        except ParseError as e:
            logger.error('---- On processing %s:', filename)
            e.print_to_logger(logger)
|
|
|
|
|
|
|
|
def find_argparser(subparsers, _):
    '''Register the ``find`` subcommand and return its parser.

    :param subparsers: object providing ``add_parser()``, on which the
        ``find`` subcommand parser is created.
    :param _: callable applied to every help/description string
        (NOTE(review): presumably a gettext-style translator -- confirm).
    :returns: the parser created for ``find``; its ``func`` default is set
        to :func:`main`.
    '''
    find_parser = subparsers.add_parser(
        'find',
        help=_('Find record models with specified predicates.'),
        description=_('Find record models with specified predicates.'),
    )

    # Input selection: positional filenames and/or --from-stdin.
    find_parser.add_argument(
        'hwp5files', nargs='*', metavar='<hwp5files>',
        help=_('.hwp files to analyze'),
    )
    find_parser.add_argument(
        '--from-stdin', action='store_true',
        help=_('get filenames from stdin'),
    )

    # --model and --tag cannot be combined.
    exclusive_filters = find_parser.add_mutually_exclusive_group()
    exclusive_filters.add_argument(
        '--model', metavar='<model-name>',
        help=_('filter with record model name'),
    )
    exclusive_filters.add_argument(
        '--tag', metavar='<hwptag>',
        help=_('filter with record HWPTAG'),
    )

    find_parser.add_argument(
        '--incomplete', action='store_true',
        help=_('filter with incompletely parsed content'),
    )

    # Output control.
    find_parser.add_argument(
        '--format', metavar='<format>',
        help=_('record output format'),
    )
    find_parser.add_argument(
        '--dump', action='store_true',
        help=_('dump record'),
    )

    find_parser.set_defaults(func=main)
    return find_parser
|
|
|
|
|
|
|
|
def filenames_from_args(args):
    '''Return the filenames to process.

    :param args: namespace with ``from_stdin`` and ``hwp5files`` attributes.
    :returns: the result of :func:`filenames_from_stdin` when
        ``args.from_stdin`` is set, otherwise ``args.hwp5files`` itself.
    '''
    return filenames_from_stdin(args) if args.from_stdin else args.hwp5files
|
|
|
|
|
|
|
|
def filenames_from_stdin(args):
    '''Lazily yield one filename per line read from standard input.

    Only the trailing line terminator is removed from each line.  Dropping
    the last character unconditionally would truncate the final filename
    whenever the input does not end with a newline.

    :param args: unused.
    :returns: an iterator over the lines of ``sys.stdin`` with trailing
        carriage-return / newline characters stripped.
    '''
    # Build the strip set with the native ``str`` type: this module enables
    # ``unicode_literals``, and on Python 2 a unicode argument would force
    # an implicit ASCII decode of byte-string lines.
    line_endings = str('\r\n')
    return (line.rstrip(line_endings) for line in sys.stdin)
|
|
|
|
|
|
|
|
def conditions_from_args(args):
    '''Yield one predicate per filter option that is set on ``args``.

    Each predicate takes a model mapping and returns a boolean:

    * ``args.model`` -- the ``__name__`` of ``model['type']`` equals it;
    * ``args.tag`` -- ``model['tagname']`` equals the tag; a value that
      parses as an integer is first looked up in ``tagnames``;
    * ``args.incomplete`` -- the model has an ``'unparsed'`` key.

    :param args: namespace with ``model``, ``tag`` and ``incomplete``.
    '''
    if args.model:
        def model_name_matches(model):
            return model['type'].__name__ == args.model
        yield model_name_matches

    if args.tag:
        try:
            tag_id = int(args.tag)
        except ValueError:
            # Not numeric: the option is compared as given.
            wanted_tag = args.tag
        else:
            wanted_tag = tagnames[tag_id]

        def tag_matches(model):
            return wanted_tag == model['tagname']
        yield tag_matches

    if args.incomplete:
        def has_unparsed(model):
            return 'unparsed' in model
        yield has_unparsed
|
|
|
|
|
|
|
|
def hwp5file_models(filename):
    '''Yield every model of the given file, tagged with its filename.

    The file is opened with ``Hwp5File`` and its models are produced by
    :func:`flat_models`; each one gets a ``'filename'`` entry before being
    yielded.

    :param filename: value passed to ``Hwp5File`` and stored on each model.
    '''
    document = Hwp5File(filename)
    for record in flat_models(document):
        record['filename'] = filename
        yield record
|
|
|
|
|
|
|
|
def flat_models(hwp5file, **kwargs):
    '''Yield the models of ``DocInfo`` and then of every body text section.

    Each yielded model gets a ``'stream'`` entry: ``'DocInfo'`` for models
    from ``hwp5file.docinfo``, and ``'BodyText/<section>'`` for models from
    ``hwp5file.bodytext[<section>]``.

    :param hwp5file: object exposing ``docinfo`` and an iterable,
        indexable ``bodytext``.
    :param kwargs: forwarded unchanged to every ``models()`` call.
    '''
    def labelled(records, stream_name):
        # Stamp each record with the stream it was read from.
        for record in records:
            record['stream'] = stream_name
            yield record

    for record in labelled(hwp5file.docinfo.models(**kwargs), 'DocInfo'):
        yield record

    for section in hwp5file.bodytext:
        section_records = hwp5file.bodytext[section].models(**kwargs)
        for record in labelled(section_records, 'BodyText/' + section):
            yield record
|
|
|
|
|
|
|
|
def printer_from_args(args):
    '''Build the function that prints a single model.

    :param args: namespace with ``format`` (a ``%``-style template keyed by
        model fields, or a false value for the default template) and
        ``dump`` (whether to also dump the record).
    :returns: a one-argument callable that prints the formatted model and,
        when ``dump`` is set, its JSON form followed by the log lines
        produced from ``model['binevents']``.
    '''
    template = (
        args.format
        or '%(filename)s %(stream)s %(seqno)s %(tagname)s %(type)s'
    )
    dump = args.dump

    def print_model(model):
        # Show the type by name; the caller's mapping is left untouched.
        fields = dict(model)
        fields['type'] = model['type'].__name__
        print(template % fields)

        if not dump:
            return

        print(model_to_json(model, sort_keys=True, indent=2))

        def print_log(message, *values):
            print(message % values)

        # Drain the iterator so that every event gets logged.
        for _ in log_events(model['binevents'], print_log):
            pass

    return print_model
|
|
|