#!/usr/bin/env python
# -*- coding: utf-8 -*-
| """ |
| (Hacky) wrapper around Stanford CoreNLP to produce CoNLL dependency format. |
| Assumes tokenized and sentence-split text. |
| |
| To get Moses XML format, first projectivize the trees, then use |
| conll2mosesxml.py. |
| """ |

from __future__ import print_function, unicode_literals
import os
import sys
import codecs
import argparse

from subprocess import Popen, PIPE

# hack for python2/3 compatibility
from io import open
argparse.open = open


def create_parser():
    parser = argparse.ArgumentParser(
        description=(
            """Wrapper around Stanford CoreNLP to produce CoNLL dependency format.
            Assumes that text is tokenized and has one sentence per line."""))

    parser.add_argument(
        '--stanford', type=str,
        metavar='PATH', required=True,
        help='path to Stanford CoreNLP')

    parser.add_argument(
        '--java', type=str, default='java',
        metavar='PATH',
        help='path to java executable')

    parser.add_argument(
        '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
        metavar='PATH',
        help="Input text (default: standard input).")
    parser.add_argument(
        '--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
        metavar='PATH',
        help="Output text (default: standard output).")

    return parser


def process_stanford(infile, javacmd, stanfordpath):
    # build the CoreNLP classpath and launch the pipeline as a subprocess,
    # streaming the tokenized input in and the analysis back out
    corenlp_jar = os.path.join(stanfordpath, 'stanford-corenlp-3.5.0.jar')
    corenlp_models_jar = os.path.join(
        stanfordpath, 'stanford-corenlp-3.5.0-models.jar')
    stanford = Popen(
        [
            javacmd,
            '-cp', "%s:%s" % (corenlp_jar, corenlp_models_jar),
            'edu.stanford.nlp.pipeline.StanfordCoreNLP',
            '-annotators', 'tokenize, ssplit, pos, depparse, lemma',
            '-ssplit.eolonly', 'true',
            '-tokenize.whitespace', 'true',
            '-numThreads', '8',
            '-textFile', '-',
            '-outFile', '-',
        ],
        stdin=infile, stdout=PIPE)
    return stanford.stdout
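
# Sketch of the CoreNLP plain-text output that get_sentences() below parses
# (reconstructed from the parsing logic; the exact token fields may vary
# with the CoreNLP version and annotator set):
#
#   Sentence #1 (4 tokens):
#   the cat sat .
#   [Text=the ... PartOfSpeech=DT Lemma=the] [Text=cat ... PartOfSpeech=NN Lemma=cat] ...
#   root(ROOT-0, sat-3)
#   det(cat-2, the-1)
#   nsubj(sat-3, cat-2)
#   punct(sat-3, .-4)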


def get_sentences(instream):
    sentence = []
    # expect tracks which part of the CoreNLP output we are in:
    #   0: waiting for the next "Sentence #" header
    #   1: the next line is the whitespace-separated token line
    #   2: the next line is the bracketed per-token annotation line
    #   3: reading dependency lines until a blank line
    expect = 0

    for line in instream:
        if expect == 0 and line.startswith('Sentence #'):
            if sentence:
                yield sentence
            sentence = []
            expect = 1

        elif line == '\n':
            expect = 0

        elif expect == 3:
            # dependency lines look like "label(head-i, dependent-j)"
            rel, remainder = line.split('(')
            head, dep = remainder.split()
            head_int = int(head.split('-')[-1][:-1])
            dep_int = int(dep.split('-')[-1][:-1])
            sentence[dep_int - 1]['head'] = head_int
            sentence[dep_int - 1]['label'] = rel

        elif expect == 2:
            linesplit = line.split('[', 1)[1].rsplit(']', 1)[0].split('] [')
            if len(linesplit) != len(sentence):
                sys.stderr.write(
                    "Warning: mismatch in number of words in sentence\n")
                sys.stderr.write(' '.join(w['word'] for w in sentence) + '\n')
                for i in range(len(sentence)):
                    sentence[i]['pos'] = '-'
                    sentence[i]['lemma'] = '-'
                    sentence[i]['head'] = 0
                    sentence[i]['label'] = '-'
                expect = 0
                continue
            for i, w in enumerate(linesplit):
                sentence[i]['pos'] = w.split(' PartOfSpeech=')[-1].split()[0]
                # the lemma is assumed to be the last field in the annotation
                sentence[i]['lemma'] = w.split(' Lemma=')[-1]
            expect = 3

        elif expect == 1:
            for w in line.split():
                sentence.append({'word': w})
            expect = 2

    if sentence:
        yield sentence
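
# write() emits one token per line using the first eight columns of the
# CoNLL-X format (ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS, HEAD, DEPREL);
# FEATS is left as '-' and the same POS tag fills both tag columns.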


def write(sentence, outstream):
    for i, w in enumerate(sentence):
        outstream.write(
            '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n'.format(
                i + 1, w['word'], w['lemma'], w['pos'], w['pos'], '-',
                w['head'], w['label']))


if __name__ == '__main__':
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    parser = create_parser()
    options = parser.parse_args()

    stanford = process_stanford(options.input, options.java, options.stanford)
    for sentence in get_sentences(codecs.getreader('UTF-8')(stanford)):
        write(sentence, options.output)
        options.output.write('\n')