""" A module for interfacing with ``split-sentences.perl`` from Moses toolkit. Copyright ® 2016-2017, Luís Gomes """ usage = """ Usage: moses-sentence-splitter [options] [ []] moses-sentence-splitter --selftest [--verbose] Options: --selftest, -t Run selftests. --verbose, -v Be more verbose. --unwrap, -u Assume that the text is wrapped and try to unwrap it. Note that this option will cause all consecutive non-empty lines to be buffered in memory. If you give this option make sure that you have empty lines separating paragraphs. When this option is not given, each line is assumed to be an independent paragraph or sentence and thus will not be joined with other lines. --more Also split on colons and semi-colons. 2016, Luís Gomes """ from docopt import docopt from openfile import openfile from os import path from toolwrapper import ToolWrapper import sys class MosesSentenceSplitter(ToolWrapper): """ A class for interfacing with ``split-sentences.perl`` from Moses toolkit. This class communicates with split-sentences.perl process via pipes. When the MosesSentenceSplitter object is no longer needed, the close() method should be called to free system resources. The class supports the context manager interface. If used in a with statement, the close() method is invoked automatically. When attribute ``more`` is True, colons and semi-colons are considered sentence separators. >>> split_sents = MosesSentenceSplitter('en') >>> split_sents(['Hello World! Hello', 'again.']) ['Hello World!', 'Hello again.'] """ def __init__(self, lang="en", more=True): self.lang = lang program = path.join( path.dirname(__file__), "split-sentences.perl" ) argv = ["perl", program, "-q", "-b", "-l", self.lang] if more: argv.append("-m") super().__init__(argv) def __str__(self): return "MosesSentenceSplitter(lang=\"{lang}\")".format(lang=self.lang) def __call__(self, paragraph): """Splits sentences within a paragraph. The paragraph is a list of non-empty lines. XML-like tags are not allowed. """ assert isinstance(paragraph, (list, tuple)) if not paragraph: # empty paragraph is OK return [] assert all(isinstance(line, str) for line in paragraph) paragraph = [line.strip() for line in paragraph] assert all(paragraph), "blank lines are not allowed" for line in paragraph: self.writeline(line) self.writeline("

") sentences = [] while True: sentence = self.readline().strip() if sentence == "

": break sentences.append(sentence) return sentences def read_paragraphs(inputfile, wrapped=True): lines = map(str.strip, inputfile) if wrapped: paragraph = [] for line in lines: if line: paragraph.append(line) elif paragraph: yield paragraph paragraph = [] if paragraph: yield paragraph else: for line in lines: yield [line] if line else [] def write_paragraphs(paragraphs, outputfile, blank_sep=True): for paragraph in paragraphs: for sentence in paragraph: print(sentence, file=outputfile) if blank_sep or not paragraph: print(file=outputfile) # paragraph separator def main(): args = docopt(usage) if args["--selftest"]: import doctest import mosestokenizer.sentsplitter doctest.testmod(mosestokenizer.sentsplitter) if not args[""]: sys.exit(0) split_sents = MosesSentenceSplitter(args[""], more=args["--more"]) inputfile = openfile(args[""]) outputfile = openfile(args[""], "wt") with inputfile, outputfile: paragraphs = read_paragraphs(inputfile, wrapped=args["--unwrap"]) paragraphs = map(split_sents, paragraphs) write_paragraphs(paragraphs, outputfile, blank_sep=args["--unwrap"]) if __name__ == "__main__": main()