Initial commit with adapted deliverables from Clarin: http://hdl.handle.net/20.500.12537/301
4f09c24
from combo.predict import COMBO
from allennlp.data import tokenizers
from argparse import ArgumentParser

# If your data is pre-tokenized, you can add the --pretokenized flag
# If you have a GPU available, you can add cuda_device=<your-device> to COMBO.from_pretrained
# The parser expects input in the same format as test_file.txt, i.e. one sentence per line
parser = ArgumentParser()
parser.add_argument('--parser', default='combo-is-combined-v211',
                    help='Name or path of the pretrained COMBO model '
                         '(default: combo-is-combined-v211).')
parser.add_argument('--infile',
                    help='Input file with one sentence per line.')
parser.add_argument('--pretokenized', action='store_true',
                    help='Treat input as pre-tokenized; tokens are split on spaces.')
args = parser.parse_args()

if args.pretokenized:
    from Tokenizer.src.tokenizer import split_into_sentences
    # Fix: honour --parser here too — the model name used to be hard-coded to
    # 'combo-is-combined-v211' (now the argparse default), so an explicit
    # --parser was silently ignored in the pretokenized path.
    nlp = COMBO.from_pretrained(
        args.parser,
        tokenizer=tokenizers.SpacyTokenizer(split_on_spaces=True),
    )
else:
    nlp = COMBO.from_pretrained(args.parser)
def read_test_file(file, pretokenized=None):
    """Yield one input sentence per line of *file*.

    Args:
        file: Path to a UTF-8 text file with one sentence per line.
        pretokenized: If True, run each line through ``split_into_sentences``
            and join the resulting tokens with single spaces; if False, yield
            the line with trailing whitespace stripped. Defaults to ``None``,
            which falls back to the module-level ``args.pretokenized`` flag
            (backward-compatible with the original global lookup).

    Yields:
        str: One sentence per input line.
    """
    if pretokenized is None:
        # Preserve the original behaviour of reading the CLI flag directly.
        pretokenized = args.pretokenized
    with open(file, 'r', encoding='utf-8') as infile:
        for line in infile:
            if pretokenized:
                yield ' '.join(split_into_sentences(line))
            else:
                yield line.rstrip()
test_file = read_test_file(args.infile)
# Print each parsed sentence in CoNLL-U-like tab-separated columns,
# with a blank line between sentences.
for sent in test_file:
    sentence = nlp(sent)
    # The printed index comes from token.id, so the previous
    # `enumerate(sentence.tokens, 1)` bound an unused `index` — dropped.
    for token in sentence.tokens:
        print(f'{token.id}\t{token.token}\t{token.lemma}\t{token.upostag}\t{token.xpostag}\t{token.feats}\t{token.head}\t{token.deprel}\t{token.deps}\t{token.misc}')
    print()