import os
import tempfile

from nltk.parse.dependencygraph import DependencyGraph
from nltk.parse.malt import MaltParser

from .stemmer import FindStems
from .postagger import POSTagger
from .tokenizer import Tokenizer
from .normalizer import Normalizer


class MyMaltParser(MaltParser):
    def __init__(self, parser_dirname, model_filename, tagger, stemmer):
        """
        An interface for parsing with the MaltParser.

        :param parser_dirname: The path to the maltparser directory that
            contains the maltparser-1.x.jar
        :type parser_dirname: str
        :param model_filename: The name of the pre-trained model with the .mco
            file extension. If provided, training will not be required
            (see http://www.maltparser.org/mco/mco.html and
            http://www.patful.com/chalk/node/185).
        :type model_filename: str
        :param tagger: The tagger used to POS tag the raw string before
            formatting it to CONLL format. It should behave like `nltk.pos_tag`.
        :type tagger: function
        :param stemmer: A stemmer object whose `convert_to_stem` method returns
            the stem of a word.
        :type stemmer: FindStems
        """
        self.working_dir = parser_dirname
        self.mco = model_filename
        self.pos_tagger = tagger
        # The jar version is hardcoded; it must match the file shipped in
        # parser_dirname.
        self._malt_bin = os.path.join(parser_dirname, 'maltparser-1.9.2.jar')
        # Fall back to the CoNLL '_' placeholder for the LEMMA column when no
        # stemmer is supplied.
        self.stemmer = stemmer.convert_to_stem if stemmer else (lambda w, t: '_')

    def parse_tagged_sent(self, sentences, verbose=False, top_relation_label='null'):
        tmp_dir = tempfile.gettempdir()
        input_file = tempfile.NamedTemporaryFile(prefix='malt_input.conll',
                                                 dir=tmp_dir, delete=False)
        output_file = tempfile.NamedTemporaryFile(prefix='malt_output.conll',
                                                  dir=tmp_dir, delete=False)
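        # Each token becomes one CoNLL-X row with ten tab-separated columns:
        # ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS, HEAD, DEPREL, PHEAD,
        # PDEPREL. FEATS/PHEAD/PDEPREL stay as the '_' placeholder, and the
        # dummy HEAD/DEPREL values ('0'/'ROOT') are overwritten by MaltParser.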
        for sentence in sentences:
            for i, (word, tag) in enumerate(sentence, start=1):
                word = word.strip()
                if not word:
                    word = '_'
                fields = [str(i), word.replace(' ', '_'),
                          self.stemmer(word, tag).replace(' ', '_'),
                          tag, tag, '_', '0', 'ROOT', '_', '_']
                input_file.write(('\t'.join(fields) + '\n').encode('utf8'))
            # A blank line terminates each sentence in CoNLL format.
            input_file.write('\n'.encode('utf8'))
        input_file.close()
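        # Run MaltParser in parse mode: -w points at the working directory
        # holding the .mco model, -c names the model, and -i/-o are the
        # input and output CoNLL files.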
        cmd = ['java', '-jar', self._malt_bin, '-w', self.working_dir,
               '-c', self.mco, '-i', input_file.name, '-o', output_file.name,
               '-m', 'parse']
        if self._execute(cmd, verbose) != 0:
            raise Exception("MaltParser parsing failed: %s" % (' '.join(cmd)))
        dependency_graphs = []
        with open(output_file.name, encoding='utf-8') as infile:
            content = infile.read().strip().split('\n\n')
            for sent in content:
                dependency_graphs.append(
                    DependencyGraph(sent, top_relation_label=top_relation_label))
        # input_file was closed above; only the output handle remains open.
        output_file.close()
        os.remove(input_file.name)
        os.remove(output_file.name)
        return dependency_graphs
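
    # Each element of the returned list is an nltk DependencyGraph:
    # graph.tree() renders the dependency tree and graph.to_conll(10)
    # serializes it back to the ten-column CoNLL format written above.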


class DependencyParser:
    def __init__(self, _normalizer=None, _tokenizer=None, _stemmer=None, _tagger=None):
        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/"
        self.my_normalizer = _normalizer if _normalizer is not None else Normalizer()
        self.my_tokenizer = _tokenizer if _tokenizer is not None else Tokenizer()
        self.my_stemmer = _stemmer if _stemmer is not None else FindStems()
        self.my_tagger = _tagger if _tagger is not None else POSTagger(tagging_model="wapiti").parse
        self.parser = MyMaltParser(parser_dirname=self.dir_path + 'resource/dependency_parser',
                                   model_filename='total_dep_parser.mco',
                                   tagger=self.my_tagger,
                                   stemmer=self.my_stemmer)

    def make_trainable_corpus(self, in_file, out_file):
        """Re-tag a CoNLL corpus with this package's POS tagger so that a
        MaltParser model can be trained on tags consistent with parse time."""
        tagger = self.my_tagger
        with open(in_file, 'r', encoding='utf-8') as infile:
            content = infile.read().strip().split('\n\n')
        for i, sent in enumerate(content):
            if len(sent) == 0:
                continue
            lines = sent.split('\n')
            sent_tokens = [x.split('\t')[1] for x in lines]
            tagged_sent = tagger(sent_tokens)
            tags = [x[1] for x in tagged_sent]
            for j, line in enumerate(lines):
                fields = line.split('\t')
                # Overwrite the CPOSTAG and POSTAG columns with the new tags.
                fields[3] = tags[j]
                fields[4] = tags[j]
                lines[j] = '\t'.join(fields)
            content[i] = '\n'.join(lines)
        content = '\n\n'.join(content)
        with open(out_file, 'w', encoding='utf-8') as outfile:
            outfile.write(content)
        return content
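
    # Example (hypothetical file names; the input must be a CoNLL treebank
    # whose second column holds the word forms):
    #     my_parser = DependencyParser()
    #     my_parser.make_trainable_corpus('train.conll', 'train_retagged.conll')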

    def parse_sents(self, sents, verbose=False):
        tagger = self.my_tagger
        # Normalize, tokenize, then POS tag each raw sentence before handing
        # it to MaltParser.
        tagged_sents = [tagger(self.my_tokenizer.tokenize_words(self.my_normalizer.normalize(sent)))
                        for sent in sents]
        return self.parser.parse_tagged_sent(tagged_sents, verbose)
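

# Usage sketch (assumptions: this module is imported as part of its package,
# Java is on the PATH, and resource/dependency_parser contains
# maltparser-1.9.2.jar together with the total_dep_parser.mco model):
#
#     from parsivar import DependencyParser  # hypothetical package name
#
#     my_parser = DependencyParser()
#     graphs = my_parser.parse_sents([u'...a raw Persian sentence...'])
#     print(graphs[0].tree())        # dependency tree as an nltk.Tree
#     print(graphs[0].to_conll(10))  # ten-column CoNLL serialization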