| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| ''' Find record models with specified predicates. |
| |
| Usage:: |
| |
| hwp5proc find [--model=<model-name> | --tag=<hwptag>] |
| [--incomplete] [--dump] [--format=<format>] |
| [--loglevel=<loglevel>] [--logfile=<logfile>] |
| (--from-stdin | <hwp5files>...) |
| hwp5proc find --help |
| |
| Options:: |
| |
| -h --help Show this screen |
| --loglevel=<level> Set log level. |
| --logfile=<file> Set log file. |
| |
| --from-stdin get filenames from stdin |
| |
| --model=<model-name> filter with record model name |
| --tag=<hwptag> filter with record HWPTAG |
| --incomplete filter with incompletely parsed content |
| |
| --format=<format> record output format |
| %(filename)s %(stream)s %(seqno)s %(type)s |
| --dump dump record |
| |
| <hwp5files>... HWPv5 files (*.hwp) |
| |
| Example: Find paragraphs:: |
| |
| $ hwp5proc find --model=Paragraph samples/*.hwp |
| $ hwp5proc find --tag=HWPTAG_PARA_TEXT samples/*.hwp |
| $ hwp5proc find --tag=66 samples/*.hwp |
| |
| Example: Find and dump records of ``HWPTAG_LIST_HEADER`` which is parsed |
| incompletely:: |
| |
| $ hwp5proc find --tag=HWPTAG_LIST_HEADER --incomplete --dump samples/*.hwp |
| |
| ''' |
| from __future__ import absolute_import |
| from __future__ import print_function |
| from __future__ import unicode_literals |
| from functools import partial |
| import logging |
| import itertools |
| import sys |
|
|
| from ..binmodel import Hwp5File |
| from ..binmodel import model_to_json |
| from ..bintype import log_events |
| from ..dataio import ParseError |
| from ..tagids import tagnames |
|
|
|
|
# Python 2/3 compatibility shims: expose lazy ``filter``/``map`` under the
# same names on both major versions.
PY2 = sys.version_info[0] == 2
ifilter = itertools.ifilter if PY2 else filter
imap = itertools.imap if PY2 else map


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
def main(args):
    ''' Run the ``find`` subcommand.

    Every record model of every input file is tested against all the
    predicates derived from the command-line arguments; the ones which
    satisfy all of them are printed.

    A ``ParseError`` raised while processing a file is logged and the
    remaining files are still processed.

    :param args: parsed command-line arguments
    '''
    paths = filenames_from_args(args)

    predicates = list(conditions_from_args(args))

    def satisfies_all(model):
        # With no predicate at all, every model is selected.
        return all(predicate(model) for predicate in predicates)

    emit = printer_from_args(args)

    for path in paths:
        try:
            for found in ifilter(satisfies_all, hwp5file_models(path)):
                emit(found)
        except ParseError as e:
            logger.error('---- On processing %s:', path)
            e.print_to_logger(logger)
|
|
|
|
def find_argparser(subparsers, _):
    ''' Register the ``find`` subcommand on ``subparsers``.

    :param subparsers: the object on which ``add_parser()`` is called to
        create the subcommand parser
    :param _: callable applied to every help/description message
    :returns: the parser created for the ``find`` subcommand, with
        ``func`` defaulting to :func:`main`
    '''
    parser = subparsers.add_parser(
        'find',
        help=_('Find record models with specified predicates.'),
        description=_('Find record models with specified predicates.'),
    )
    add_option = parser.add_argument

    # Input files: given as positional arguments and/or read from stdin.
    add_option('hwp5files', nargs='*', metavar='<hwp5files>',
               help=_('.hwp files to analyze'))
    add_option('--from-stdin', action='store_true',
               help=_('get filenames from stdin'))

    # --model and --tag cannot be given together.
    exclusive = parser.add_mutually_exclusive_group()
    exclusive.add_argument('--model', metavar='<model-name>',
                           help=_('filter with record model name'))
    exclusive.add_argument('--tag', metavar='<hwptag>',
                           help=_('filter with record HWPTAG'))

    add_option('--incomplete', action='store_true',
               help=_('filter with incompletely parsed content'))

    # Output options.
    add_option('--format', metavar='<format>',
               help=_('record output format'))
    add_option('--dump', action='store_true', help=_('dump record'))

    parser.set_defaults(func=main)
    return parser
|
|
|
|
def filenames_from_args(args):
    ''' Select the source of input file names.

    :param args: parsed command-line arguments
    :returns: the file names read from stdin when ``args.from_stdin`` is
        set, otherwise ``args.hwp5files``
    '''
    return filenames_from_stdin(args) if args.from_stdin else args.hwp5files
|
|
|
|
def filenames_from_stdin(args):
    ''' Lazily read file names from standard input, one per line.

    Only the line terminator is removed from each line. A last line
    which is not terminated by a newline is returned intact instead of
    losing its final character.

    :param args: parsed command-line arguments (unused)
    :returns: an iterator of file names
    '''
    # This module enables ``unicode_literals``; ``str()`` keeps the newline a
    # native string, so that on Python 2 byte-string lines holding non-ASCII
    # file names are not implicitly decoded by ``endswith``.
    newline = str('\n')
    return (
        line[:-1] if line.endswith(newline) else line
        for line in sys.stdin
    )
|
|
|
|
def conditions_from_args(args):
    ''' Generate the predicates selected by the command-line arguments.

    Each predicate takes a record model and returns whether the model
    matches:

    - ``args.model``: the name of the model's ``type`` equals the value
    - ``args.tag``: the model's ``tagname`` equals the value; a value
      which parses as an integer is first translated through ``tagnames``
    - ``args.incomplete``: the model has an ``unparsed`` entry

    :param args: parsed command-line arguments
    :returns: a generator of predicate functions
    '''
    if args.model:
        def with_model_name(model):
            return model['type'].__name__ == args.model
        yield with_model_name

    if args.tag:
        try:
            tag_number = int(args.tag)
        except ValueError:
            # Not a number: the tag was given by its name.
            wanted_tag = args.tag
        else:
            wanted_tag = tagnames[tag_number]

        def with_tag(model):
            return wanted_tag == model['tagname']
        yield with_tag

    if args.incomplete:
        def with_incomplete(model):
            return 'unparsed' in model
        yield with_incomplete
|
|
|
|
def hwp5file_models(filename):
    ''' Generate every record model of a file, labelled with its origin.

    :param filename: path of the file, opened with ``Hwp5File``
    :returns: a generator of the models produced by :func:`flat_models`,
        each with its ``filename`` entry set to ``filename``
    '''
    for record in flat_models(Hwp5File(filename)):
        record['filename'] = filename
        yield record
|
|
|
|
def flat_models(hwp5file, **kwargs):
    ''' Generate the models of ``DocInfo``, then of each ``BodyText`` section.

    Each model gets a ``stream`` entry naming the stream it comes from:
    ``DocInfo`` or ``BodyText/<section>``.

    :param hwp5file: object providing ``docinfo`` and ``bodytext``
    :param kwargs: passed through to each stream's ``models()``
    :returns: a generator of models
    '''
    def labelled(models, stream):
        # Stamp each model with its stream name as it passes through.
        for model in models:
            model['stream'] = stream
            yield model

    for model in labelled(hwp5file.docinfo.models(**kwargs), 'DocInfo'):
        yield model

    for section in hwp5file.bodytext:
        section_models = hwp5file.bodytext[section].models(**kwargs)
        for model in labelled(section_models, 'BodyText/' + section):
            yield model
|
|
|
|
def printer_from_args(args):
    ''' Build the function which prints a found record model.

    The returned function prints one summary line per model, formatted
    with ``args.format`` when given and with a default format otherwise.
    The model's ``type`` entry is replaced by the type's name for
    formatting. When ``args.dump`` is set, the model is additionally
    printed as JSON, followed by the log of its ``binevents``.

    :param args: parsed command-line arguments
    :returns: a function taking a model and printing it
    '''
    line_format = (
        args.format
        or '%(filename)s %(stream)s %(seqno)s %(tagname)s %(type)s'
    )
    dump = args.dump

    def print_log(message, *params):
        print(message % params)

    def print_model(model):
        summary = dict(model)
        summary['type'] = model['type'].__name__
        print(line_format % summary)
        if not dump:
            return
        print(model_to_json(model, sort_keys=True, indent=2))
        # log_events() is consumed only for the lines it prints.
        list(log_events(model['binevents'], print_log))

    return print_model
|
|