| |
| |
| |
| |
|
|
| """Create a test corpus, using a previously pruned vocabulary.""" |
|
|
|
|
| import logging |
| import optparse |
| import os |
| import os.path |
| import sys |
|
|
| import extract |
|
|
|
|
def read_vocab(filename, offset=0):
    """Load a vocabulary file into a word-to-id mapping.

    Each line of ``filename`` holds one vocabulary entry; surrounding
    whitespace is stripped.  The entry on (0-based) line ``i`` is assigned
    the id ``i + offset``.  If an entry appears more than once, the id of
    its last occurrence wins.

    Args:
        filename: Path of the vocabulary file to read.
        offset: Id assigned to the first line of the file.

    Returns:
        A ``(vocab, last_id)`` tuple.  ``vocab`` maps each stripped line to
        its id and ``last_id`` is the id given to the final line.  For an
        empty file ``vocab`` is empty and ``last_id`` is ``offset - 1``, so
        ``last_id + 1`` is always the next unused id.
    """
    vocab = {}
    # Pre-seed the result so an empty file does not leave it unbound.
    last_id = offset - 1
    with open(filename) as vocab_file:
        for last_id, line in enumerate(vocab_file, offset):
            vocab[line.strip()] = last_id
    return vocab, last_id
|
|
|
|
def main():
    """Extract and numberize the n-grams of a test corpus.

    Command-line options select the source/target languages, the corpus
    stem, an optional tagged-corpus stem, the alignment file and the working
    directory.  The working directory must already exist and contain:

    * ``info``: whitespace-separated ``name value`` lines, including
      integer entries named ``m`` and ``n``;
    * ``vocab.target`` and ``vocab.source``: one vocabulary entry per line.

    Two files are written into the working directory, both named after the
    basename of the corpus stem: ``<stem>.ngrams`` (the handle passed to
    ``extract.get_ngrams``) and ``<stem>.numberized`` (one
    ``extract.numberize`` result per line of the ``.ngrams`` file).

    Raises:
        Exception: If the working directory does not exist, or if the info
            file does not define both ``m`` and ``n``.
    """
    logging.basicConfig(
        format='%(asctime)s %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)

    parser = optparse.OptionParser("%prog [options]")
    parser.add_option(
        "-e", "--target-language", type="string", dest="target_language")
    parser.add_option(
        "-f", "--source-language", type="string", dest="source_language")
    parser.add_option(
        "-c", "--corpus", type="string", dest="corpus_stem")
    parser.add_option(
        "-t", "--tagged-corpus", type="string", dest="tagged_stem")
    parser.add_option(
        "-a", "--align", type="string", dest="align_file")
    parser.add_option(
        "-w", "--working-dir", type="string", dest="working_dir")
    parser.set_defaults(
        target_language="en",
        source_language="de",
        corpus_stem="test",
        align_file="test.align",
        working_dir="working")
    options, _ = parser.parse_args(sys.argv)

    working_dir = options.working_dir
    if not os.path.exists(working_dir):
        raise Exception(
            "Working directory '%s' not found" % working_dir)

    # Read the values named "m" and "n" from the info file.
    m, n = None, None
    with open(os.path.join(working_dir, "info")) as info_file:
        for line in info_file:
            # split() already discards the trailing newline; slicing the
            # last character off would corrupt a final line that has none.
            fields = line.split()
            if not fields:
                continue
            name, value = fields
            if name == "m":
                m = int(value)
            if name == "n":
                n = int(value)
    if m is None or n is None:
        raise Exception("Info file is incomplete.")

    # Source ids start right after the last target id.
    tvocab, offset = read_vocab(os.path.join(working_dir, "vocab.target"))
    svocab, offset = read_vocab(
        os.path.join(working_dir, "vocab.source"), offset + 1)

    file_stem = os.path.basename(options.corpus_stem)
    ngrams_path = os.path.join(working_dir, file_stem + ".ngrams")
    numberized_path = os.path.join(working_dir, file_stem + ".numberized")

    # The output handle must be closed (and so flushed) before the same
    # file is reopened for reading below; otherwise buffered output may be
    # missing from the numberized result.
    with open(ngrams_path, "w") as ofh:
        extract.get_ngrams(
            options.corpus_stem,
            options.align_file,
            options.tagged_stem,
            svocab,
            tvocab,
            options.source_language,
            options.target_language,
            m,
            n,
            ofh)

    # Numberize the file line by line.
    with open(ngrams_path, 'r') as ngrams_file_handle, \
            open(numberized_path, 'w') as numberized_file_handle:
        for line in ngrams_file_handle:
            numberized_file_handle.write(extract.numberize(
                line, m, n, svocab, tvocab))
|
|
|
|
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|