| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | ''' Find record models with specified predicates. |
| | |
| | Usage:: |
| | |
| | hwp5proc find [--model=<model-name> | --tag=<hwptag>] |
| | [--incomplete] [--dump] [--format=<format>] |
| | [--loglevel=<loglevel>] [--logfile=<logfile>] |
| | (--from-stdin | <hwp5files>...) |
| | hwp5proc find --help |
| | |
| | Options:: |
| | |
| | -h --help Show this screen |
| | --loglevel=<level> Set log level. |
| | --logfile=<file> Set log file. |
| | |
| | --from-stdin get filenames from stdin |
| | |
| | --model=<model-name> filter with record model name |
| | --tag=<hwptag> filter with record HWPTAG |
| | --incomplete filter with incompletely parsed content |
| | |
| | --format=<format> record output format |
| | %(filename)s %(stream)s %(seqno)s %(tagname)s %(type)s |
| | --dump dump record |
| | |
| | <hwp5files>... HWPv5 files (*.hwp) |
| | |
| | Example: Find paragraphs:: |
| | |
| | $ hwp5proc find --model=Paragraph samples/*.hwp |
| | $ hwp5proc find --tag=HWPTAG_PARA_TEXT samples/*.hwp |
| | $ hwp5proc find --tag=66 samples/*.hwp |
| | |
| | Example: Find and dump records of ``HWPTAG_LIST_HEADER`` which is parsed |
| | incompletely:: |
| | |
| | $ hwp5proc find --tag=HWPTAG_LIST_HEADER --incomplete --dump samples/*.hwp |
| | |
| | ''' |
| | from __future__ import absolute_import |
| | from __future__ import print_function |
| | from __future__ import unicode_literals |
| | from functools import partial |
| | import logging |
| | import itertools |
| | import sys |
| |
|
| | from ..binmodel import Hwp5File |
| | from ..binmodel import model_to_json |
| | from ..bintype import log_events |
| | from ..dataio import ParseError |
| | from ..tagids import tagnames |
| |
|
| |
|
# True when running under a Python 2 interpreter.
PY2 = sys.version_info.major == 2

# Iterator-returning filter/map: the itertools variants on Python 2,
# the builtins otherwise.
ifilter = itertools.ifilter if PY2 else filter
imap = itertools.imap if PY2 else map


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
| |
|
| |
|
def main(args):
    ''' Entry point of the ``find`` subcommand.

    Prints every record model that satisfies all predicates built from
    ``args``. A ``ParseError`` raised while processing a file is written to
    the module logger and processing moves on to the next file.

    :param args: parsed command line arguments
    '''
    filenames = filenames_from_args(args)
    predicates = list(conditions_from_args(args))
    print_model = printer_from_args(args)

    def matches(model):
        # A model is selected only if no predicate rejects it; with no
        # predicates at all, every model is selected.
        for predicate in predicates:
            if not predicate(model):
                return False
        return True

    for filename in filenames:
        try:
            for model in hwp5file_models(filename):
                if matches(model):
                    print_model(model)
        except ParseError as e:
            logger.error('---- On processing %s:', filename)
            e.print_to_logger(logger)
| |
|
| |
|
def find_argparser(subparsers, _):
    ''' Register the ``find`` subcommand and describe its arguments.

    :param subparsers: object whose ``add_parser()`` creates the subcommand
        parser
    :param _: callable applied to every help/description string
    :returns: the parser created for ``find``; its ``func`` default is
        :func:`main`
    '''
    find_parser = subparsers.add_parser(
        'find',
        help=_('Find record models with specified predicates.'),
        description=_('Find record models with specified predicates.'),
    )

    # Where the filenames come from.
    find_parser.add_argument(
        'hwp5files', nargs='*', metavar='<hwp5files>',
        help=_('.hwp files to analyze'),
    )
    find_parser.add_argument(
        '--from-stdin', action='store_true',
        help=_('get filenames from stdin'),
    )

    # --model and --tag cannot be given together.
    exclusive_filters = find_parser.add_mutually_exclusive_group()
    exclusive_filters.add_argument(
        '--model', metavar='<model-name>',
        help=_('filter with record model name'),
    )
    exclusive_filters.add_argument(
        '--tag', metavar='<hwptag>',
        help=_('filter with record HWPTAG'),
    )

    find_parser.add_argument(
        '--incomplete', action='store_true',
        help=_('filter with incompletely parsed content'),
    )

    # Output control.
    find_parser.add_argument(
        '--format', metavar='<format>',
        help=_('record output format'),
    )
    find_parser.add_argument(
        '--dump', action='store_true',
        help=_('dump record'),
    )

    find_parser.set_defaults(func=main)
    return find_parser
| |
|
| |
|
def filenames_from_args(args):
    ''' Select the source of filenames to process.

    :param args: parsed command line arguments
    :returns: ``args.hwp5files`` unless ``args.from_stdin`` is set, in which
        case the filenames are taken from standard input
    '''
    if not args.from_stdin:
        return args.hwp5files
    return filenames_from_stdin(args)
| |
|
| |
|
def filenames_from_stdin(args):
    ''' Lazily read filenames from standard input, one per line.

    Only the line terminator is removed from each line. Blindly dropping the
    last character would truncate a final line that has no trailing newline
    and would leave a stray carriage return on CRLF-terminated input.

    :param args: parsed command line arguments (unused)
    :returns: an iterator of filenames
    '''
    return (line.rstrip('\r\n') for line in sys.stdin)
| |
|
| |
|
def conditions_from_args(args):
    ''' Yield one predicate per filter option set in ``args``.

    Each predicate takes a model (a mapping) and returns a truth value:

    - ``args.model``: the ``__name__`` of ``model['type']`` equals it
    - ``args.tag``: ``model['tagname']`` equals the tag; a tag that parses
      as an integer is first translated through ``tagnames``
    - ``args.incomplete``: the model has an ``'unparsed'`` key

    :param args: parsed command line arguments
    '''
    if args.model:
        yield lambda model: args.model == model['type'].__name__

    if args.tag:
        try:
            tag_id = int(args.tag)
        except ValueError:
            # Not numeric: the option already holds the tag name.
            wanted_tagname = args.tag
        else:
            wanted_tagname = tagnames[tag_id]
        yield lambda model: model['tagname'] == wanted_tagname

    if args.incomplete:
        yield lambda model: 'unparsed' in model
| |
|
| |
|
def hwp5file_models(filename):
    ''' Generate the record models of one HWPv5 file.

    Every model produced by :func:`flat_models` gets a ``'filename'`` entry
    set to ``filename`` before it is yielded.

    :param filename: path handed to ``Hwp5File``
    '''
    document = Hwp5File(filename)
    for record in flat_models(document):
        record['filename'] = filename
        yield record
| |
|
| |
|
def flat_models(hwp5file, **kwargs):
    ''' Generate the models of all record streams as one flat sequence.

    Models of ``hwp5file.docinfo`` come first, tagged with
    ``model['stream'] = 'DocInfo'``. They are followed by the models of each
    section of ``hwp5file.bodytext``, tagged ``'BodyText/' + section``.

    :param hwp5file: object providing ``docinfo`` and ``bodytext``
    :param kwargs: forwarded unchanged to every ``models()`` call
    '''
    for record in hwp5file.docinfo.models(**kwargs):
        record['stream'] = 'DocInfo'
        yield record

    for section_name in hwp5file.bodytext:
        section_stream = hwp5file.bodytext[section_name]
        for record in section_stream.models(**kwargs):
            record['stream'] = 'BodyText/' + section_name
            yield record
| |
|
| |
|
def printer_from_args(args):
    ''' Build the function that prints one model.

    The returned ``print_model(model)`` prints a summary line rendered from
    ``args.format``, or from a default template when no format is given. In
    that line the ``type`` field is the ``__name__`` of ``model['type']``.
    With ``args.dump`` set it also prints the model as JSON, followed by the
    log lines produced from ``model['binevents']``.

    :param args: parsed command line arguments
    :returns: a one-argument callable printing a model
    '''
    fmt = (
        args.format
        or '%(filename)s %(stream)s %(seqno)s %(tagname)s %(type)s'
    )
    dump = args.dump

    def emit_log(template, *values):
        # Callback for log_events(): render one log line to stdout.
        print(template % values)

    def print_model(model):
        fields = dict(model)
        fields['type'] = model['type'].__name__
        print(fmt % fields)
        if not dump:
            return
        print(model_to_json(model, sort_keys=True, indent=2))
        # log_events() is consumed to completion only for the lines it
        # sends to emit_log; the yielded items themselves are discarded.
        for _event in log_events(model['binevents'], emit_log):
            pass

    return print_model
| |
|