| | """ |
| | A module for interfacing with ``split-sentences.perl`` from Moses toolkit. |
| | |
| | Copyright © 2016-2017, Luís Gomes <luismsgomes@gmail.com> |
| | """ |
| |
|
# Command-line help text.  main() hands this string to docopt, which derives
# the argument parser from it, so the layout is significant: each option and
# its description must be separated by at least two spaces, otherwise the
# first word of the description is read as the option's argument.
usage = """
Usage:
    moses-sentence-splitter [options] <lang> [<inputfile> [<outputfile>]]
    moses-sentence-splitter --selftest [--verbose]

Options:
    --selftest, -t  Run selftests.
    --verbose, -v   Be more verbose.
    --unwrap, -u    Assume that the text is wrapped and try to unwrap it.
                    Note that this option will cause all consecutive non-empty
                    lines to be buffered in memory. If you give this option
                    make sure that you have empty lines separating paragraphs.
                    When this option is not given, each line is assumed to be
                    an independent paragraph or sentence and thus will not be
                    joined with other lines.
    --more          Also split on colons and semi-colons.

2016, Luís Gomes <luismsgomes@gmail.com>
"""
| |
|
| |
|
| | from docopt import docopt |
| | from openfile import openfile |
| | from os import path |
| | from toolwrapper import ToolWrapper |
| | import sys |
| |
|
| |
|
class MosesSentenceSplitter(ToolWrapper):
    """
    A class for interfacing with ``split-sentences.perl`` from Moses toolkit.

    This class communicates with split-sentences.perl process via pipes. When
    the MosesSentenceSplitter object is no longer needed, the close() method
    should be called to free system resources. The class supports the context
    manager interface. If used in a with statement, the close() method is
    invoked automatically.

    When attribute ``more`` is True, colons and semi-colons are considered
    sentence separators.

    >>> split_sents = MosesSentenceSplitter('en')
    >>> split_sents(['Hello World! Hello', 'again.'])
    ['Hello World!', 'Hello again.']

    """

    # Line written after every paragraph and read back to detect where the
    # tool's output for that paragraph ends.
    _PARAGRAPH_MARK = "<P>"

    def __init__(self, lang="en", more=True):
        """Start a ``split-sentences.perl`` process for language *lang*.

        The script is looked up in the directory that contains this module.
        When *more* is true the ``-m`` flag is appended to the command line.
        Both arguments are kept as the attributes ``lang`` and ``more``.
        """
        self.lang = lang
        self.more = more
        program = path.join(path.dirname(__file__), "split-sentences.perl")
        argv = ["perl", program, "-q", "-b", "-l", self.lang]
        if more:
            argv.append("-m")
        super().__init__(argv)

    def __str__(self):
        """Return a short description that names the configured language."""
        return 'MosesSentenceSplitter(lang="{lang}")'.format(lang=self.lang)

    def __call__(self, paragraph):
        """Splits sentences within a paragraph.

        The paragraph is a list (or tuple) of non-empty lines. XML-like tags
        are not allowed; in particular a line consisting solely of ``<P>`` is
        rejected, because that exact line is the marker this method uses to
        delimit paragraphs on the pipe.

        Returns the list of sentences read back from the tool, or an empty
        list when the paragraph itself is empty (nothing is sent in that
        case).

        Raises AssertionError if the paragraph is not a list/tuple of
        strings, if a line is blank after stripping, or if a line equals the
        paragraph marker.
        """
        assert isinstance(paragraph, (list, tuple))
        if not paragraph:
            return []
        assert all(isinstance(line, str) for line in paragraph)
        paragraph = [line.strip() for line in paragraph]
        assert all(paragraph), "blank lines are not allowed"
        mark = self._PARAGRAPH_MARK
        # A marker inside the paragraph would make the read loop below stop
        # early and leave unread output in the pipe for the next call.
        assert mark not in paragraph, (
            "lines consisting of the paragraph marker <P> are not allowed"
        )
        for line in paragraph:
            self.writeline(line)
        self.writeline(mark)
        sentences = []
        # NOTE(review): this loop ends only when the marker is echoed back.
        # If readline() returns an empty string once the child process has
        # exited, the loop would never terminate -- confirm against
        # ToolWrapper.readline.
        while True:
            sentence = self.readline().strip()
            if sentence == mark:
                break
            sentences.append(sentence)
        return sentences
| |
|
| |
|
def read_paragraphs(inputfile, wrapped=True):
    """Yield paragraphs, each a list of stripped lines, from *inputfile*.

    With *wrapped* true, consecutive non-empty lines are collected into one
    paragraph and runs of empty lines act as separators (they produce no
    output of their own).  With *wrapped* false every input line becomes its
    own paragraph: a one-element list, or an empty list for a blank line.
    """
    stripped = (str.strip(raw) for raw in inputfile)
    if not wrapped:
        for text in stripped:
            if text:
                yield [text]
            else:
                yield []
        return
    pending = []
    for text in stripped:
        if text:
            pending.append(text)
            continue
        # Blank line: flush whatever has been collected so far.
        if pending:
            yield pending
            pending = []
    # Input may end without a trailing blank line.
    if pending:
        yield pending
| |
|
| |
|
def write_paragraphs(paragraphs, outputfile, blank_sep=True):
    """Print every sentence of every paragraph to *outputfile*, one per line.

    A blank line follows each paragraph when *blank_sep* is true.  An empty
    paragraph always produces a blank line, whatever *blank_sep* is.
    """
    for sentences in paragraphs:
        for text in sentences:
            print(text, file=outputfile)
        if sentences and not blank_sep:
            # Non-empty paragraph and no separator requested.
            continue
        print(file=outputfile)
| |
|
| |
|
def main():
    """Command-line entry point.

    Parses the command line against ``usage``.  With ``--selftest`` the
    doctests of ``mosestokenizer.sentsplitter`` are run, and the program
    exits afterwards unless a language was also given.  Otherwise paragraphs
    are read from the input file, split into sentences and written to the
    output file.  ``--unwrap`` controls both how input lines are grouped
    into paragraphs and whether a blank line is written after each one.
    """
    args = docopt(usage)
    if args["--selftest"]:
        import doctest
        import mosestokenizer.sentsplitter
        # Forward the flag explicitly: left to its default, testmod only
        # looks for a literal "-v" in sys.argv, so --verbose was ignored.
        doctest.testmod(
            mosestokenizer.sentsplitter,
            verbose=args["--verbose"],
        )
        if not args["<lang>"]:
            sys.exit(0)
    split_sents = MosesSentenceSplitter(args["<lang>"], more=args["--more"])
    # The splitter owns a child process; the with statement makes sure
    # close() is invoked even when reading or writing fails.
    with split_sents:
        inputfile = openfile(args["<inputfile>"])
        outputfile = openfile(args["<outputfile>"], "wt")
        with inputfile, outputfile:
            paragraphs = read_paragraphs(inputfile, wrapped=args["--unwrap"])
            # map() is lazy, so paragraphs are split one at a time as
            # write_paragraphs consumes them.
            paragraphs = map(split_sents, paragraphs)
            write_paragraphs(
                paragraphs, outputfile, blank_sep=args["--unwrap"]
            )
| |
|
| |
|
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
| |
|