"""Create a test corpus, using a previously pruned vocabulary."""
| |
|
| |
|
| | import logging |
| | import optparse |
| | import os |
| | import os.path |
| | import sys |
| |
|
| | import extract |
| |
|
| |
|
def read_vocab(filename, offset=0):
    """Read a vocabulary file (one token per line) into a token->id map.

    Ids are assigned by line order: the token on line ``i`` (0-based) gets
    id ``i + offset``.

    Args:
        filename: Path to the vocabulary file.
        offset: Integer added to each line index to form the token id.

    Returns:
        Tuple ``(vocab, last_id)`` where ``vocab`` maps token -> id and
        ``last_id`` is the id assigned to the final token.

    Raises:
        ValueError: If the vocabulary file is empty.  (The original code
            raised an opaque ``UnboundLocalError`` in that case.)
    """
    vocab = {}
    last_id = None
    # `with` guarantees the handle is closed; the original leaked it.
    with open(filename) as fh:
        for i, line in enumerate(fh):
            last_id = i + offset
            vocab[line.strip()] = last_id
    if last_id is None:
        raise ValueError("Vocabulary file '%s' is empty" % filename)
    return vocab, last_id
| |
|
| |
|
def main():
    """Build numberized n-gram test data from a pruned vocabulary.

    Reads the ``info`` file (n-gram orders ``m`` and ``n``) and the pruned
    source/target vocabularies from the working directory, extracts n-grams
    from the test corpus into ``<stem>.ngrams``, then numberizes each line
    into ``<stem>.numberized``.

    Raises:
        Exception: If the working directory does not exist or the info
            file lacks ``m`` or ``n``.
    """
    logging.basicConfig(
        format='%(asctime)s %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
    parser = optparse.OptionParser("%prog [options]")
    parser.add_option(
        "-e", "--target-language", type="string", dest="target_language")
    parser.add_option(
        "-f", "--source-language", type="string", dest="source_language")
    parser.add_option(
        "-c", "--corpus", type="string", dest="corpus_stem")
    parser.add_option(
        "-t", "--tagged-corpus", type="string", dest="tagged_stem")
    parser.add_option(
        "-a", "--align", type="string", dest="align_file")
    parser.add_option(
        "-w", "--working-dir", type="string", dest="working_dir")

    parser.set_defaults(
        target_language="en",
        source_language="de",
        corpus_stem="test",
        align_file="test.align",
        working_dir="working")
    options, args = parser.parse_args(sys.argv)
    if not os.path.exists(options.working_dir):
        raise Exception(
            "Working directory '%s' not found" % options.working_dir)

    # Recover the n-gram orders written by the earlier pruning step.
    m, n = None, None
    with open(os.path.join(options.working_dir, "info")) as info_fh:
        for line in info_fh:
            name, value = line[:-1].split()
            if name == "m":
                m = int(value)
            if name == "n":
                n = int(value)
    if m is None or n is None:
        raise Exception("Info file is incomplete.")

    # Target ids come first; source ids continue after the last target id.
    tvocab, offset = read_vocab(
        os.path.join(options.working_dir, "vocab.target"))
    svocab, offset = read_vocab(
        os.path.join(options.working_dir, "vocab.source"), offset + 1)

    file_stem = os.path.basename(options.corpus_stem)
    ngrams_path = os.path.join(options.working_dir, file_stem + ".ngrams")
    # Close (and flush) the handle before re-reading the file below.  The
    # original never closed it, so buffered output could be missing when
    # the same file was reopened for the numberizing pass.
    with open(ngrams_path, "w") as ofh:
        extract.get_ngrams(
            options.corpus_stem,
            options.align_file,
            options.tagged_stem,
            svocab,
            tvocab,
            options.source_language,
            options.target_language,
            m,
            n,
            ofh)

    numberized_path = os.path.join(
        options.working_dir, file_stem + ".numberized")
    with open(ngrams_path) as ngrams_fh, \
            open(numberized_path, "w") as numberized_fh:
        for line in ngrams_fh:
            numberized_fh.write(extract.numberize(
                line, m, n, svocab, tvocab))


if __name__ == "__main__":
    main()
| |
|