diff --git a/mosesdecoder/contrib/arrow-pipelines/README b/mosesdecoder/contrib/arrow-pipelines/README new file mode 100644 index 0000000000000000000000000000000000000000..86f2128748d4b4392d71638772e32adaec7a3bcc --- /dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/README @@ -0,0 +1,58 @@ +Arrow Based Moses Training Pipeline +=================================== + +This demonstration implements a training pipeline that is shown in the Dia diagram in documentation/training-pipeline/moses-pypeline.dia. + +The demo has been tested with: + + - Moses v1.0 + - Giza++ v1.0.7 + - IRSTLM v5.70.04 + + +Setup +----- + +To use the demonstration you must first initialise the git submodules for this clone. Return to the top level directory and issue the following command: + +$ git submodule update --init --recursive + +This will clone PCL, available at Github (git://github.com/ianj-als/pcl.git), and Pypeline submodules, available at GitHub (git://github.com/ianj-als/pypeline.git). + +Return to the arrow-pipelines contrib directory: + +$ cd contrib/arrow-pipelines + +To use the PCL compiler and run-time set the following environment variables (assuming Bash shell): + +$ export PATH=$PATH:`pwd`/python/pcl/src/pclc:`pwd`/python/pcl/src/pcl-run +$ export PYTHONPATH=$PYTHONPATH:`pwd`/python/pcl/libs/pypeline/src +$ export PCL_IMPORT_PATH=`pwd`/python/pcl/src/runtime:`pwd`/pcl + +Three environment variables need to be set before the pipeline can be run, they are: + + - MOSES_HOME : The directory where Moses has been cloned, or installed, + - IRSTLM : The installation directory of your IRSTLM, and + - GIZA_HOME : The installation directory of GIZA++. 
+ + +Building the example training pipeline +-------------------------------------- + +$ cd pcl +$ make + + +Running the example training pipeline +------------------------------------- + +To execute the training pipeline run the following command: + +$ pcl-run.py training_pipeline + +Once complete the output of the pipeline can be found in the directories: + + - training/tokenisation + - training/model + - training/lm + - training/mert diff --git a/mosesdecoder/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia b/mosesdecoder/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia new file mode 100644 index 0000000000000000000000000000000000000000..1d35a1dea7ad04f671561738f682da9aac844ec5 Binary files /dev/null and b/mosesdecoder/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia differ diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/Makefile b/mosesdecoder/contrib/arrow-pipelines/pcl/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..43839a3e2b6873f6eae0ef4a88b208e08c97c962 --- /dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/pcl/Makefile @@ -0,0 +1,23 @@ +CC = pclc.py +CFLAGS=-i +SOURCES = training_pipeline.pcl +OBJS = $(SOURCES:.pcl=.py) +SUBDIRS = components + +all: subdirs build + +build: $(OBJS) + +%.py: %.pcl + $(CC) $(CFLAGS) $< + +clean: + for dir in $(SUBDIRS); do \ + $(MAKE) -C $$dir clean; \ + done + rm -f *.py *.pyc *.log *~ + +subdirs: + for dir in $(SUBDIRS); do \ + $(MAKE) -C $$dir ; \ + done diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/Makefile b/mosesdecoder/contrib/arrow-pipelines/pcl/components/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..c0aa0f886bc327babe9a36e0ba22753f9547c54b --- /dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/pcl/components/Makefile @@ -0,0 +1,24 @@ +CC = pclc.py +CFLAGS = -i +SOURCES = src_trg_tokeniser.pcl translation_model_training.pcl +OBJS = 
$(SOURCES:.pcl=.py) +SUBDIRS = wrappers + +all: subdirs build + +build: $(OBJS) + +%.py: %.pcl + $(CC) $(CFLAGS) $< + +clean: + for dir in $(SUBDIRS); do \ + $(MAKE) -C $$dir clean; \ + done + rm -f *.py *.pyc *.log *~ + +subdirs: + for dir in $(SUBDIRS); do \ + $(MAKE) -C $$dir ; \ + done + diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/src_trg_tokeniser.cfg b/mosesdecoder/contrib/arrow-pipelines/pcl/components/src_trg_tokeniser.cfg new file mode 100644 index 0000000000000000000000000000000000000000..1f31d984e7742e9a4180b2b4a8833c0300e8709d --- /dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/pcl/components/src_trg_tokeniser.cfg @@ -0,0 +1,10 @@ +[Configuration] +tokeniser.src.language = en +tokeniser.src.tokenisation_dir = test_data/src_trg_tokenizer/tokenised +tokeniser.trg.language = lt +tokeniser.trg.tokenisation_dir = test_data/src_trg_tokenizer/tokenised +tokeniser.moses.installation = /opt/moses + +[Inputs] +src_filename = test_data/src_trg_tokenizer/cleantrain.en +trg_filename = test_data/src_trg_tokenizer/cleantrain.lt diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/src_trg_tokeniser.pcl b/mosesdecoder/contrib/arrow-pipelines/pcl/components/src_trg_tokeniser.pcl new file mode 100644 index 0000000000000000000000000000000000000000..462b43d2a453c0e66e2e4bf3539f0792c3e219e3 --- /dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/pcl/components/src_trg_tokeniser.pcl @@ -0,0 +1,40 @@ +# +# Import all of the components to be composed +# +import wrappers.tokenizer.tokenizer as tokeniser + +# +# Component definition +# +# +---------+ +---------+ +---------+ +---------+ +# src_filename -->+ +--> filename -->+-- src --+--> tokenised_filename -->+---------+--> tokenised_filename -->+ +--> tokenised_src_filename +# | | | | | | | | +# trg_filename -->+ +--> filename -->+---------+-------> filename ------->+-- trg --+--> tokenised_filename -->+ +--> tokenised_trg_filename +# +---------+ +---------+ +---------+ +---------+ +# 
Config: {language::String, Config: {language::String, +# tokenisation_dir::String, tokenisation_dir::String, +# moses_installation_dir::String} moses_installation_dir::String} +# +component src_trg_tokeniser + inputs (src_filename), (trg_filename) + outputs (tokenised_src_filename), (tokenised_trg_filename) + configuration tokeniser.src.language, + tokeniser.src.tokenisation_dir, + tokeniser.trg.language, + tokeniser.trg.tokenisation_dir, + tokeniser.moses.installation + declare + src_tokeniser := new tokeniser with + tokeniser.src.language -> corpus.language, + tokeniser.src.tokenisation_dir -> working.directory.root, + tokeniser.moses.installation -> moses.installation + trg_tokeniser := new tokeniser with + tokeniser.trg.language -> corpus.language, + tokeniser.trg.tokenisation_dir -> working.directory.root, + tokeniser.moses.installation -> moses.installation + as + wire (src_filename -> corpus.filename), + (trg_filename -> corpus.filename) >>> + (src_tokeniser *** trg_tokeniser) >>> + wire (corpus.tokenised.filename -> tokenised_src_filename), + (corpus.tokenised.filename -> tokenised_trg_filename) diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/translation_model_training.cfg b/mosesdecoder/contrib/arrow-pipelines/pcl/components/translation_model_training.cfg new file mode 100644 index 0000000000000000000000000000000000000000..2950fcf7560473702be6ad6a18d04dbf47d92307 --- /dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/pcl/components/translation_model_training.cfg @@ -0,0 +1,15 @@ +[Configuration] +model_training.max_segment_length = 20 +model_training.corpus.development_size = 4500 +model_training.corpus.evaluation_size = 5000 +model_training.src.language = en +model_training.trg.language = lt +model_training.method.alignment = grow-diag-final-and +model_training.method.reordering = msd-bidirectional-fe +model_training.moses.installation = /opt/moses +model_training.giza.installation = /opt/moses/giza++-v1.0.7 
+model_training.translation_model.dir = test_data/translation_model_training/translation_model + +[Inputs] +src_filename = test_data/translation_model_training/cleantrain.en +trg_filename = test_data/translation_model_training/cleantrain.lt diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/translation_model_training.pcl b/mosesdecoder/contrib/arrow-pipelines/pcl/components/translation_model_training.pcl new file mode 100644 index 0000000000000000000000000000000000000000..a185a8d7904657d23f1491593c694436f5c1715f --- /dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/pcl/components/translation_model_training.pcl @@ -0,0 +1,70 @@ +# +# Import all of the components to be composed +# +import wrappers.cleanup.cleanup as cleanup +import wrappers.data_split.data_split as data_split +import wrappers.model_training.model_training as model_training + +# +# Component definition +# +# {cleaned_src_filename, {src_filename, {[devel|eval|train]_src_filename, {src_filename, {moses_ini_file, +# cleaned_trg_filename} trg_filename} [devel|eval|train]_trg_filename} trg_filename} evaluation_data_filename} +# | | | | +-------+ | +# +-------+ | | +-------+ | +-------+ V | Model | {moses_ini_file} +-------+ V +# | Clean | V V | Data | V | +---------------->+ Train +----------------->+ Merge +-----> +# {src_filename, -->+ +----->+ +------------->+ Split | +-------+ +---+---+ +# trg_filename} | Up | | Split | | +---\ Config: {[src|trg]_language::String, ^ +# +-------+ +-------+ +-------+ | alignment_method::String, | +# Config: {segment_length::Int} Config: {development_size::Int, | reordering_method::String, | +# evaluation_size::Int} | giza_installation_dir::String, | +# | model_directory::String} | +# \--------------------------------------------/ +# +component translation_model_training + inputs src_filename, trg_filename + outputs evaluation_data_filename, moses_ini_filename + configuration model_training.max_segment_length, + model_training.corpus.development_size, + 
model_training.corpus.evaluation_size, + model_training.src.language, + model_training.trg.language, + model_training.method.alignment, + model_training.method.reordering, + model_training.moses.installation, + model_training.giza.installation, + model_training.translation_model.dir + declare + cleanup := new cleanup with + model_training.max_segment_length -> segment_length_limit + data_split := new data_split with + model_training.corpus.development_size -> development_data_size, + model_training.corpus.evaluation_size -> evaluation_data_size + model_training := new model_training with + model_training.src.language -> source_language, + model_training.trg.language -> target_language, + model_training.method.alignment -> alignment_method, + model_training.method.reordering -> reordering_method, + model_training.moses.installation -> moses_installation_dir, + model_training.giza.installation -> giza_installation_dir, + model_training.translation_model.dir -> translation_model_directory + as + cleanup >>> + wire cleaned_src_filename -> src_filename, + cleaned_trg_filename -> trg_filename >>> + data_split >>> + wire devel_src_filename -> devel_src_filename, + eval_src_filename -> evaluation_data_filename, + train_trg_filename -> _, + train_src_filename -> _, + eval_trg_filename -> _, + devel_trg_filename -> devel_trg_filename >>> + ((wire devel_src_filename -> src_filename, + devel_trg_filename -> trg_filename, + evaluation_data_filename -> _ >>> + model_training) &&& + wire evaluation_data_filename -> evaluation_data_filename, + devel_src_filename -> _, + devel_trg_filename -> _) >>> + merge top[moses_ini_filename] -> moses_ini_filename, + bottom[evaluation_data_filename] -> evaluation_data_filename diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/Makefile b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..16c8744b6ea8b1f386afa2dbaeee03b0fdc11291 --- 
/dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/Makefile @@ -0,0 +1,14 @@ +SUBDIRS = tokenizer + +all: subdirs + +clean: + for dir in $(SUBDIRS); do \ + $(MAKE) -C $$dir clean; \ + done + +subdirs: + for dir in $(SUBDIRS); do \ + $(MAKE) -C $$dir ; \ + done + diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/__init__.py b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/cleanup/__init__.py b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/cleanup/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/cleanup/cleanup.py b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/cleanup/cleanup.py new file mode 100644 index 0000000000000000000000000000000000000000..44b7e98727d5c6f58ca041d17cf261722035bd73 --- /dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/cleanup/cleanup.py @@ -0,0 +1,129 @@ +def get_name(): + return 'cleanup' + +def get_inputs(): + return ['src_filename', 'trg_filename'] + +def get_outputs(): + return ['cleaned_src_filename', 'cleaned_trg_filename'] + +def get_configuration(): + return ['segment_length_limit'] + +def configure(args): + return {'segment_length' : args['segment_length_limit']} + +def initialise(config): + def _filter(limit, ifh1, ofh1, ifh2, ofh2): + def _short(line): + n = 0 + for c in line: + if c == " ": + n += 1 + return n < limit + + for (l1, l2) in zip(ifh1, ifh2): + if _short(l1) and _short(l2): + print >>ofh1, l1, + print >>ofh2, l2, + + def _make_cleaned_filename(filename): + bits = filename.split(".") + bits.insert(-1, "clean") + return ".".join(bits) + + def _filter_main(a, s): + 
limit = config['segment_length'] + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + input_src_filename = a['src_filename'] + input_trg_filename = a['trg_filename'] + + print "Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename) + + ifh1 = open(input_src_filename, "r") + ifh2 = open(input_trg_filename, "r") + + cleaned_src_filename = _make_cleaned_filename(input_src_filename) + cleaned_trg_filename = _make_cleaned_filename(input_trg_filename) + ofh1 = open(cleaned_src_filename, "w") + ofh2 = open(cleaned_trg_filename, "w") + + _filter(limit, ifh1, ofh1, ifh2, ofh2) + + return {'cleaned_src_filename': cleaned_src_filename, + 'cleaned_trg_filename': cleaned_trg_filename} + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return _filter_main + + +if __name__ == '__main__': + import os + import tempfile + import test.test as thelp + + from pypeline.helpers.helpers import eval_pipeline + + + def _test_main(): + configuration = {'segment_length_limit': 20} + + src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'cleaned_src_file_expected': src_filename[1] + ".expected", + 'cleaned_trg_file_expected': trg_filename[1] + ".expected"} + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + box_config = configure(configuration) + box = initialise(box_config) + + output = eval_pipeline(box, box_eval, box_config) + try: + thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename']) + thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename']) + finally: + os.unlink(output['cleaned_src_filename']) + os.unlink(output['cleaned_trg_filename']) 
+ + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21])) + thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21])) + thelp.cat(box_eval['cleaned_src_file_expected'], _line([17])) + thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20])) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/__init__.py b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/data_split.cfg b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/data_split.cfg new file mode 100644 index 0000000000000000000000000000000000000000..730dcd19a9cc256a81850c1965274e60a38b605c --- /dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/data_split.cfg @@ -0,0 +1,7 @@ +[Configuration] +evaluation_data_size = 7 +development_data_size = 13 + +[Inputs] +src_filename = test_data/data.en +trg_filename = test_data/data.de diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/data_split.py b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/data_split.py new file mode 100644 index 0000000000000000000000000000000000000000..d4b5ddacdd5bfb249fb895b2dfdaa7601056c25f --- /dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/data_split.py @@ -0,0 +1,144 @@ +def get_name(): + return 'data_split' + +def get_inputs(): + return ['src_filename', 'trg_filename'] + +def 
get_outputs(): + return ['devel_src_filename', 'devel_trg_filename', + 'eval_src_filename', 'eval_trg_filename', + 'train_src_filename', 'train_trg_filename'] + +def get_configuration(): + return ['evaluation_data_size', 'development_data_size'] + +def configure(args): + result = {} + result['evaluate_size'] = args['evaluation_data_size'] + result['development_size'] = args['development_data_size'] + return result + +def initialise(config): + def _copy(size, inp, ofh1, ofh2): + try: + while size != 0: + (l1, l2) = inp.next() + print >>ofh1, l1, + print >>ofh2, l2, + size -= 1 + except StopIteration: + pass + + def _make_split_filename(filename, data_set): + bits = filename.split(".") + bits.insert(-1, data_set) + + new_filename = ".".join(bits) + return new_filename + + def _splitter_main(a, s): + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + input_src_filename = a['src_filename'] + input_trg_filename = a['trg_filename'] + + ifh1 = open(input_src_filename, "r") + ifh2 = open(input_trg_filename, "r") + inp = iter(zip(ifh1, ifh2)) + + result = {} + for (data_set, size) in [('devel', config['development_size']), + ('eval', config['evaluate_size']), + ('train', -1)]: + output_src_filename = _make_split_filename(input_src_filename, data_set) + output_trg_filename = _make_split_filename(input_trg_filename, data_set) + ofh1 = open(output_src_filename, "w") + ofh2 = open(output_trg_filename, "w") + + _copy(size, inp, ofh1, ofh2) + result[data_set + '_src_filename'] = output_src_filename + result[data_set + '_trg_filename'] = output_trg_filename + + return result + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return _splitter_main + + +if __name__ == '__main__': + import os + import tempfile + import test.test as thelp + + from pypeline.helpers.helpers import eval_pipeline + + + def _test_main(): + configuration = {'evaluation_data_size': 7, + 
'development_data_size': 13} + + src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp") + + box_eval = {'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'devel_src_expected': src_filename[1] + ".devel.expected", + 'devel_trg_expected': trg_filename[1] + ".devel.expected", + 'eval_src_expected': src_filename[1] + ".eval.expected", + 'eval_trg_expected': trg_filename[1] + ".eval.expected", + 'train_src_expected': src_filename[1] + ".train.expected", + 'train_trg_expected': trg_filename[1] + ".train.expected"} + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + box_config = configure(configuration) + box = initialise(box_config) + + output = eval_pipeline(box, box_eval, box_config) + for data_set in ['devel', 'eval', 'train']: + for lang in ['src', 'trg']: + filename = output[data_set + '_' + lang + '_filename'] + filename_expected = box_eval[data_set + '_' + lang + '_expected'] + thelp.diff(filename_expected, filename) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line(range(50))) + thelp.cat(box_eval['trg_filename'], _line(range(50))) + #expected output: + thelp.cat(box_eval['devel_src_expected'], _line(range(0,13))) + thelp.cat(box_eval['devel_trg_expected'], _line(range(0,13))) + thelp.cat(box_eval['eval_src_expected'], _line(range(13,20))) + thelp.cat(box_eval['eval_trg_expected'], _line(range(13,20))) + thelp.cat(box_eval['train_src_expected'], _line(range(20,50))) + thelp.cat(box_eval['train_trg_expected'], _line(range(20,50))) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() diff --git 
a/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/test_data/data.de b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/test_data/data.de new file mode 100644 index 0000000000000000000000000000000000000000..9ff58226d2bae217eb73fb937ea45325f9482f6a --- /dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/test_data/data.de @@ -0,0 +1,50 @@ + +tok0 +tok0 tok1 +tok0 tok1 tok2 +tok0 tok1 tok2 tok3 +tok0 tok1 tok2 tok3 tok4 +tok0 tok1 tok2 tok3 tok4 tok5 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 
tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 
tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 
tok41 tok42 tok43 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47 tok48 diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/test_data/data.en b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/test_data/data.en new file mode 100644 index 0000000000000000000000000000000000000000..9ff58226d2bae217eb73fb937ea45325f9482f6a --- /dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/test_data/data.en @@ -0,0 +1,50 @@ + +tok0 +tok0 tok1 +tok0 tok1 tok2 +tok0 tok1 tok2 tok3 +tok0 tok1 tok2 tok3 tok4 +tok0 tok1 tok2 tok3 tok4 tok5 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 
tok7 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 
tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 +tok0 tok1 tok2 tok3 tok4 tok5 
tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 
tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47 +tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47 tok48 diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/irstlm_build/__init__.py b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/irstlm_build/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/irstlm_build/irstlm_build.py b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/irstlm_build/irstlm_build.py new file mode 100644 index 0000000000000000000000000000000000000000..c1f7856db8c3b1606a382e652b073e3e1202f671 --- /dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/irstlm_build/irstlm_build.py @@ -0,0 +1,117 @@ +import os +import shutil +import subprocess +import tempfile + + +def get_name(): + return 'irstlm_build' + +def get_inputs(): + return ['input_filename'] + +def get_outputs(): + return ['add_start_end_filename', 'lm_filename', 'compiled_lm_filename'] + +def get_configuration(): + return ['irstlm_installation_dir', 'irstlm_smoothing_method', 'language_model_directory'] + +def configure(args): + config = dict() + config['irstlm_install_directory'] = args['irstlm_installation_dir'] + 
config['smoothing_method'] = args['irstlm_smoothing_method'] + config['lm_directory'] = args['language_model_directory'] + return config + +def initialise(config): + def process(a, s): + # Create the LM directory if we need to + if os.path.exists(config['lm_directory']) is False: + os.makedirs(config['lm_directory']) + + # The filename of the file to chew through + start_end_input_filename = a['input_filename'] + if os.path.exists(start_end_input_filename) is False: + raise Exception("IRSTLM Build: Input file could not be found at [%s]" % start_end_input_filename) + + # Derive the output file name for the add start-end marker processor + filename_bits = os.path.basename(start_end_input_filename).split(".") + filename_bits[2] = "sb"; + start_end_output_filename = os.path.join(config['lm_directory'], ".".join(filename_bits)) + + # Derive the output file name of the LM build + filename_bits[2] = "lm" + lm_filename = os.path.join(config['lm_directory'], ".".join(filename_bits)) + + # Derive the compiled LM file name + filename_bits[2] = "arpa" + compiled_lm_filename = os.path.join(config['lm_directory'], ".".join(filename_bits)) + + # First thing to do is add start and end markers + start_end_cmdline = [os.path.join(config['irstlm_install_directory'], "bin", "add-start-end.sh")] + infile = open(start_end_input_filename, 'r') + outfile = open(start_end_output_filename, 'w') + print "IRSTLM Build: Invoking [%s]..." 
% " ".join(start_end_cmdline) + return_code = subprocess.check_call(start_end_cmdline, stdin = infile, stdout = outfile) + if return_code: + raise Exception("IRSTLM add start and end markers failed: input file = [%s], output file = [%s], return code = [%d]" % \ + start_end_input_filename, start_end_output_filename, return_code) + + # Next build the language model + tmp_dir = tempfile.mkdtemp(dir = "/tmp") + try: + build_lm_cmdline = [os.path.join(config['irstlm_install_directory'], "bin", "build-lm.sh"), + "-i", start_end_output_filename, + "-t", tmp_dir, + "-p", + "-s", config['smoothing_method'], + "-o", lm_filename] + print "IRSTLM Build: Invoking [%s]..." % " ".join(build_lm_cmdline) + return_code = subprocess.check_call(build_lm_cmdline) + if return_code: + raise Exception("IRST language model failed to build: return code = [%d]" % return_code) + finally: + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + + # Compile the LM + lm_filename = lm_filename + ".gz" + compile_lm_cmdline = [os.path.join(config['irstlm_install_directory'], "bin", "compile-lm"), + "--text", "yes", + lm_filename, + compiled_lm_filename] + print "IRSTLM Build: Invoking [%s]..." 
% " ".join(compile_lm_cmdline) + return_code = subprocess.check_call(compile_lm_cmdline) + if return_code: + raise Exception("IRST language model compilation failed: return code = [%d]" % return_code) + + output = {'add_start_end_filename': start_end_output_filename, + 'lm_filename': lm_filename, + 'compiled_lm_filename': compiled_lm_filename} + + print "IRSTLM Build: Output = %s" % output + + return output + + return process + + +if __name__ == '__main__': + from pypeline.helpers.helpers import eval_pipeline, cons_function_component + + lm_dir = os.environ["PWD"] + configuration = {'irstlm_root': os.environ["IRSTLM"], + 'irstlm_smoothing_method': 'improved-kneser-ney', + 'language_model_directory': lm_dir} + component_config = configure(configuration) + component = initialise(component_config) + + value = eval_pipeline(cons_function_component(component), + {'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'}, + component_config) + target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'), + 'lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.lm.en.gz'), + 'compiled_lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.arpa.en')} + print "Target: %s" % target + if value != target: + raise Exception("Massive fail!") diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/mert/__init__.py b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/mert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/mert/mert.py b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/mert/mert.py new file mode 100644 index 0000000000000000000000000000000000000000..b30bbf5958d6b6c8a45ab6f1bd4fbcc6f314a875 --- /dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/mert/mert.py @@ 
-0,0 +1,98 @@ +import os +import shutil +import subprocess + +def get_name(): + return 'mert' + +def get_inputs(): + return ['evaluation_data_filename', 'trg_language_model_filename', + 'trg_language_model_order', 'trg_language_model_type', + 'moses_ini_filename'] + +def get_outputs(): + return ['moses_ini_filename'] + +def get_configuration(): + return ['source_language', 'target_language', + 'moses_installation_dir', 'mert_working_directory', + 'mert_max_no_iterations'] + +def configure(args): + result = {} + result['src_lang'] = args['source_language'] + result['trg_lang'] = args['target_language'] + result['moses_installation_dir'] = args['moses_installation_dir'] + result['mert_working_dir'] = args['mert_working_directory'] + result['max_no_iterations'] = args['mert_max_no_iterations'] + return result + +def initialise(config): + def process(a, s): + infilename = os.path.abspath(a['evaluation_data_filename']) + infilename = ".".join(infilename.split(".")[:-1]) + lm_file = os.path.abspath(a['trg_language_model_filename']) + lm_order = int(a['trg_language_model_order']) + lm_type = int(a['trg_language_model_type']) + max_no_iters = int(config['max_no_iterations']) + orig_moses_ini = os.path.abspath(a['moses_ini_filename']) + + if not os.path.exists(orig_moses_ini): + raise Exception, "Error: Input moses.ini does not exist" + + workdir = os.path.abspath(config['mert_working_dir']) + #simply call the training perl script + #remove the workdir if it is already there + if os.path.exists(workdir): + shutil.rmtree(workdir) + os.makedirs(workdir) + + #local vars + moses_install_dir = os.path.abspath(config['moses_installation_dir']) + mert_perl = os.path.join(moses_install_dir, 'scripts', 'training', 'mert-moses.pl') + bin_dir = os.path.join(moses_install_dir, 'bin') + moses_bin = os.path.join(moses_install_dir, 'bin', 'moses') + src_file = infilename + '.' + config['src_lang'] + ref_file = infilename + '.' 
+ config['trg_lang'] + logfile = os.path.join(workdir, 'log') + #change lm configuration in moses ini + moses_ini = os.path.join(workdir, 'trained-moses.ini') + cmd = r"cat %(orig_moses_ini)s | sed '/\[lmodel-file\]/,/^[[:space:]]*$/c\[lmodel-file\]\n%(lm_type)s 0 %(lm_order)s %(lm_file)s\n' > %(moses_ini)s" + cmd = cmd % locals() + os.system(cmd) + + #the command + cmd = '%(mert_perl)s --maximum-iterations %(max_no_iters)d --mertdir %(bin_dir)s --working-dir %(workdir)s %(src_file)s %(ref_file)s %(moses_bin)s %(moses_ini)s 2> %(logfile)s' + cmd = cmd % locals() + + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + + #check the moses ini + new_mosesini = os.path.join(workdir, 'moses.ini') + if not os.path.exists(new_mosesini): + raise Exception, 'Failed MERT' + + return {'moses_ini_filename' : new_mosesini} + + return process + + +if __name__ == '__main__': + def __test(): + configuration = {'src_lang':'en', + 'trg_lang':'lt', + 'moses_installation_dir':os.path.abspath('../../../../'), + 'mert_working_dir':'../../../../../tuning'} + values = {'development_data_filename':'../../../../../corpus/tune', + 'moses_ini_file':'../../../../../model/model/moses.ini', + 'trg_language_model_filename':'../../../../../corpus/train.lt.lm', + 'trg_language_model_type':9, + 'trg_language_model_order':4} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/model_training/__init__.py b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/model_training/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/model_training/model_training.py 
b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/model_training/model_training.py new file mode 100644 index 0000000000000000000000000000000000000000..56c7f6a70718af00a85fe0196a5c5df284f00694 --- /dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/model_training/model_training.py @@ -0,0 +1,103 @@ +import os +import shutil +import subprocess + + +def get_name(): + return 'model_training' + +def get_inputs(): + return ['src_filename', 'trg_filename'] + +def get_outputs(): + return ['moses_ini_filename'] + +def get_configuration(): + return ['source_language', 'target_language', + 'moses_installation_dir', 'giza_installation_dir', + 'translation_model_directory', 'alignment_method', + 'reordering_method'] + +# Alignment = grow-diag-final-and +# Reordering = msd-bidirectional-fe +def configure(args): + result = {} + result['src_lang'] = args['source_language'] + result['trg_lang'] = args['target_language'] + result['moses_installation_dir'] = args['moses_installation_dir'] + result['external_bin_dir'] = args['giza_installation_dir'] + result['model_directory'] = args['translation_model_directory'] + result['alignment'] = args['alignment_method'] + result['reordering'] = args['reordering_method'] + return result + +def initialise(config): + def process(a, s): + get_corpora_name_fn = lambda fn: ".".join(os.path.basename(fn).split('.')[:-1]) + src_filename = os.path.abspath(a['src_filename']) + trg_filename = os.path.abspath(a['trg_filename']) + src_corpora_name = get_corpora_name_fn(src_filename) + trg_corpora_name = get_corpora_name_fn(trg_filename) + if src_corpora_name != trg_corpora_name: + raise Exception, "Mismatch of source [%s] and target [%s] filename" % (src_filename, trg_filename) + + infilename = os.path.abspath(os.path.join(os.path.dirname(src_filename), src_corpora_name)) + workdir = os.path.abspath(config['model_directory']) + #simply call the training perl script + #remove the workdir if it is already there + if 
os.path.exists(workdir): + shutil.rmtree(workdir) + os.makedirs(workdir) + + #local vars + train_model_perl = os.path.abspath(os.path.join(config['moses_installation_dir'], + 'scripts', + 'training', + 'train-model.perl')) + src_lang = config['src_lang'].lower() + trg_lang = config['trg_lang'].lower() + external_bin = os.path.abspath(config['external_bin_dir']) + #create a dummy lm file + dummy_lmfile = os.path.join(workdir, 'dummy.lm') + f = open(dummy_lmfile, 'w') + print >> f, "dummy lm file" + f.close() + logfile = os.path.join(workdir, 'log') + + #the command + alignment_method = config['alignment'] + reordering_method = config['reordering'] + cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s ' \ + '-f %(src_lang)s -e %(trg_lang)s -alignment %(alignment_method)s ' \ + '-reordering %(reordering_method)s -lm 0:5:%(dummy_lmfile)s:0 ' \ + '-external-bin-dir %(external_bin)s 2> %(logfile)s' + cmd = cmd % locals() + + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + + # check the moses ini + mosesini = os.path.join(workdir, 'model', 'moses.ini') + if not os.path.exists(mosesini): + raise Exception, 'Failed training model' + + return {'moses_ini_filename' : mosesini} + + return process + + +if __name__ == '__main__': + def __test(): + configuration = {'src_lang' : 'en', + 'trg_lang' : 'lt', + 'moses_installation_dir' : os.environ['MOSES_HOME'], + 'giza_installation_dir' : os.environ['GIZA_HOME'], + 'translation_model_directory' : 'model-dir'} + values = {'training_data_filename' : '/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(box_config) + print run_pipeline(box, values, None) + + #do some test + __test() diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/training_pipeline.cfg b/mosesdecoder/contrib/arrow-pipelines/pcl/training_pipeline.cfg new file mode 
100644 index 0000000000000000000000000000000000000000..9e1570cbc86fa7e395528de00c6808e165771184 --- /dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/pcl/training_pipeline.cfg @@ -0,0 +1,21 @@ +[Configuration] +source_language = en +target_language = lt +max_segment_length = 20 +corpus_development_size = 1000 +corpus_evaluation_size = 500 +alignment_method = grow-diag-final-and +reordering_method = msd-bidirectional-fe +smoothing_method = improved-kneser-ney +tokenisation_directory = training/tokenisation +translation_model_directory = training/model +language_model_directory = training/lm +mert_directory = training/mert +mert_max_no_iterations = 10 +moses_installation_directory = $(MOSES_HOME) +giza_installation_directory = $(GIZA_HOME) +irstlm_installation_directory = $(IRSTLM) + +[Inputs] +src_filename = ../test_data/cleantrain.en +trg_filename = ../test_data/cleantrain.lt diff --git a/mosesdecoder/contrib/arrow-pipelines/pcl/training_pipeline.pcl b/mosesdecoder/contrib/arrow-pipelines/pcl/training_pipeline.pcl new file mode 100644 index 0000000000000000000000000000000000000000..f8361b3535cea4b10d7a35e096d44e4739f94aaa --- /dev/null +++ b/mosesdecoder/contrib/arrow-pipelines/pcl/training_pipeline.pcl @@ -0,0 +1,117 @@ +# +# Import all of the components to be composed +# +import components.src_trg_tokeniser as tokeniser +import components.translation_model_training as model_training +import components.wrappers.irstlm_build.irstlm_build as lang_model +import components.wrappers.mert.mert as mert + +# +# Component definition +# +# Config: {model_training.max_segment_length, +# model_training.corpus.[development_size|evaluation_size], +# model_training.[src|trg].language, +# model_training.method.[alignment|reordering], {moses_ini_filename, +# model_training.giza.installation, evaluation_data_filename} +# {src_filename, {tokenised_src_filename, model_training.translation_model.dir} | +# trg_filename} tokenised_trg_filename} 
+-----------------------------------------+ +-------+ | {moses_ini_filename} +# | +-------+ +-------+ +-------+ | +-------+ | tokenised_src_filename -> src_filename, | | Model | V +-------+ | +# V | +--->+ Src/ +--->+ | V | +-->+ tokenised_trg_filename -> trg_filename +-->+ Train +------>+ | +------+ V +# --->+ Split | | Trg | | Merge +--->+ Split | +-----------------------------------------+ +-------+ | Merge +----->+ MERT +---> +# | +--->+ Token +--->+ | | +--\ +------------------------------------------+ +--------+ | | ^ +------+ +# +-------+ +-------+ +-------+ +-------+ \->+ tokenised_trg_filename -> input_filename +-->+ IRSTLM +-->+ | | +# Config: {tokeniser.[src|trg].language, +------------------------------------------+ +--------+ ^ +-------+ | +# tokeniser.[src|trg].tokeniser_dir Config: {irstlm_installation_dir::String, | | +# tokeniser.moses.installation} irstlm_smoothing_method::String, | | +# language_model_directory} | | +# | | +# {lm_filename, compiled_lm_filename, add_start_end_filename} | +# | +# {moses_ini_file, evaluation_data_filename, trg_language_model_filename, +# trg_language_model_order, trg_language_model_type} +# +component training_pipeline + inputs src_filename, trg_filename + output moses_ini_filename + configuration source_language, + target_language, + max_segment_length, + corpus_development_size, + corpus_evaluation_size, + alignment_method, + reordering_method, + smoothing_method, + tokenisation_directory, + translation_model_directory, + language_model_directory, + mert_directory, + mert_max_no_iterations, + moses_installation_directory, + giza_installation_directory, + irstlm_installation_directory + declare + tokeniser := new tokeniser with + source_language -> tokeniser.src.language, + target_language -> tokeniser.trg.language, + tokenisation_directory -> tokeniser.src.tokenisation_dir, + tokenisation_directory -> tokeniser.trg.tokenisation_dir, + moses_installation_directory -> tokeniser.moses.installation + model_training := 
new model_training with + max_segment_length -> model_training.max_segment_length, + corpus_development_size -> model_training.corpus.development_size, + corpus_evaluation_size -> model_training.corpus.evaluation_size, + translation_model_directory -> model_training.translation_model.dir, + alignment_method -> model_training.method.alignment, + reordering_method -> model_training.method.reordering, + source_language -> model_training.src.language, + moses_installation_directory -> model_training.moses.installation, + giza_installation_directory -> model_training.giza.installation, + target_language -> model_training.trg.language + irstlm := new lang_model with + irstlm_installation_directory -> irstlm_installation_dir, + smoothing_method -> irstlm_smoothing_method, + language_model_directory -> language_model_directory + mert := new mert with + source_language -> source_language, + target_language -> target_language, + moses_installation_directory -> moses_installation_dir, + mert_directory -> mert_working_directory, + mert_max_no_iterations -> mert_max_no_iterations + as + # Split and transform the input to the tokeniser component + # Inputs: src_filename, trg_filename + # Outputs: (tokenised_src_filename), (tokenised_trg_filename) + (wire src_filename -> src_filename, + trg_filename -> _ &&& + wire trg_filename -> trg_filename, + src_filename -> _) >>> + tokeniser >>> + + # Merge output from tokeniser + # Inputs: (tokenised_src_filename), (tokenised_trg_filename) + # Outputs: tokenised_src_filename, tokenised_trg_filename + merge top[tokenised_src_filename] -> tokenised_src_filename, + bottom[tokenised_trg_filename] -> tokenised_trg_filename >>> + + # Train the translation table and target language model + # Inputs: tokenised_src_filename, tokenised_trg_filename + # Outputs: (moses_ini_filename), ('add_start_end_filename', 'lm_filename', 'compiled_lm_filename') + ((wire tokenised_src_filename -> src_filename, + tokenised_trg_filename -> trg_filename >>> 
model_training) &&& + (wire tokenised_trg_filename -> input_filename, + tokenised_src_filename -> _ >>> irstlm)) >>> + + # Merge the output from the TT and LM training component + # Inputs: (moses_ini_filename, evaluation_data_filename), + # (compiled_lm_filename, add_start_end_filename, lm_filename) + # Outputs: moses_ini_filename, evaluation_data_filename, evaluation_data_filename, + # trg_language_model_filename, trg_language_model_order, trg_language_model_type + merge top[moses_ini_filename] -> moses_ini_filename, + top[evaluation_data_filename] -> evaluation_data_filename, + bottom[compiled_lm_filename] -> trg_language_model_filename, + bottom[add_start_end_filename] -> _, + bottom[lm_filename] -> _, + 3 -> trg_language_model_order, + 9 -> trg_language_model_type >>> + mert diff --git a/mosesdecoder/contrib/checkplf/Makefile b/mosesdecoder/contrib/checkplf/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..9d6111cd615633c7e4d687c8ddbdd976d32ba5fb --- /dev/null +++ b/mosesdecoder/contrib/checkplf/Makefile @@ -0,0 +1,2 @@ +checkplf: checkplf.cpp + g++ checkplf.cpp -I../../moses/ ../../lib/libmoses.a -o checkplf diff --git a/mosesdecoder/contrib/lmserver/Makefile.in b/mosesdecoder/contrib/lmserver/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..3d62eb5fd7cdf5eee551de1d61439321d3fc8248 --- /dev/null +++ b/mosesdecoder/contrib/lmserver/Makefile.in @@ -0,0 +1,645 @@ +# Makefile.in generated by automake 1.9.2 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, +# 2003, 2004 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +SOURCES = $(lmserver_SOURCES) $(lmserver_debug_SOURCES) + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +top_builddir = . +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +INSTALL = @INSTALL@ +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +target_triplet = @target@ +bin_PROGRAMS = lmserver$(EXEEXT) lmserver-debug$(EXEEXT) +DIST_COMMON = README $(am__configure_deps) $(srcdir)/Makefile.am \ + $(srcdir)/Makefile.in $(srcdir)/config.h.in \ + $(top_srcdir)/configure AUTHORS COPYING ChangeLog INSTALL NEWS \ + compile config.guess config.sub depcomp install-sh missing +subdir = . 
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \ + configure.lineno configure.status.lineno +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = config.h +CONFIG_CLEAN_FILES = +am__installdirs = "$(DESTDIR)$(bindir)" +binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) +PROGRAMS = $(bin_PROGRAMS) +am_lmserver_OBJECTS = lmserver-lmserver.$(OBJEXT) \ + lmserver-thread.$(OBJEXT) lmserver-srilm.$(OBJEXT) +lmserver_OBJECTS = $(am_lmserver_OBJECTS) +am__objects_1 = lmserver.$(OBJEXT) thread.$(OBJEXT) srilm.$(OBJEXT) +am_lmserver_debug_OBJECTS = $(am__objects_1) +lmserver_debug_OBJECTS = $(am_lmserver_debug_OBJECTS) +DEFAULT_INCLUDES = -I. -I$(srcdir) -I. +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__depfiles_maybe = depfiles +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +CCLD = $(CC) +LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +CXXLD = $(CXX) +CXXLINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \ + -o $@ +SOURCES = $(lmserver_SOURCES) $(lmserver_debug_SOURCES) +DIST_SOURCES = $(lmserver_SOURCES) $(lmserver_debug_SOURCES) +ETAGS = etags +CTAGS = ctags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +distdir = $(PACKAGE)-$(VERSION) +top_distdir = $(distdir) +am__remove_distdir = \ + { test ! -d $(distdir) \ + || { find $(distdir) -type d ! -perm -200 -exec chmod u+w {} ';' \ + && rm -fr $(distdir); }; } +DIST_ARCHIVES = $(distdir).tar.gz +GZIP_ENV = --best +distuninstallcheck_listfiles = find . -type f -print +distcleancheck_listfiles = find . 
-type f -print +ACLOCAL = @ACLOCAL@ +AMDEP_FALSE = @AMDEP_FALSE@ +AMDEP_TRUE = @AMDEP_TRUE@ +AMTAR = @AMTAR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DAEMON_OBJ = @DAEMON_OBJ@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LDFLAGS = @LDFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LTLIBOBJS = @LTLIBOBJS@ +MAKEINFO = @MAKEINFO@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +SRI_LM_FALSE = @SRI_LM_FALSE@ +SRI_LM_TRUE = @SRI_LM_TRUE@ +STRIP = @STRIP@ +VERSION = @VERSION@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_STRIP = @ac_ct_STRIP@ +am__fastdepCC_FALSE = @am__fastdepCC_FALSE@ +am__fastdepCC_TRUE = @am__fastdepCC_TRUE@ +am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@ +am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +datadir = @datadir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = 
@libexecdir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +sysconfdir = @sysconfdir@ +target = @target@ +target_alias = @target_alias@ +target_cpu = @target_cpu@ +target_os = @target_os@ +target_vendor = @target_vendor@ +lmserver_SOURCES = lmserver.c lmserver.h thread.c srilm.cc +lmserver_debug_SOURCES = $(lmserver_SOURCES) +lmserver_CPPFLAGS = -DNDEBUG +lmserver_LDADD = @DAEMON_OBJ@ +lmserver_debug_LDADD = @DAEMON_OBJ@ +lmserver_DEPENDENCIES = @DAEMON_OBJ@ +lmserver_debug_DEPENDENCIES = @DAEMON_OBJ@ +DIST_DIRS = examples +EXTRA_DIST = examples daemon.c +all: config.h + $(MAKE) $(AM_MAKEFLAGS) all-am + +.SUFFIXES: +.SUFFIXES: .c .cc .o .obj +am--refresh: + @: +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + echo ' cd $(srcdir) && $(AUTOMAKE) --gnu '; \ + cd $(srcdir) && $(AUTOMAKE) --gnu \ + && exit 0; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile'; \ + cd $(top_srcdir) && \ + $(AUTOMAKE) --gnu Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + echo ' $(SHELL) ./config.status'; \ + $(SHELL) ./config.status;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + $(SHELL) ./config.status --recheck + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(srcdir) && $(AUTOCONF) +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS) + +config.h: stamp-h1 + @if test ! 
-f $@; then \ + rm -f stamp-h1; \ + $(MAKE) stamp-h1; \ + else :; fi + +stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status + @rm -f stamp-h1 + cd $(top_builddir) && $(SHELL) ./config.status config.h +$(srcdir)/config.h.in: $(am__configure_deps) + cd $(top_srcdir) && $(AUTOHEADER) + rm -f stamp-h1 + touch $@ + +distclean-hdr: + -rm -f config.h stamp-h1 +install-binPROGRAMS: $(bin_PROGRAMS) + @$(NORMAL_INSTALL) + test -z "$(bindir)" || $(mkdir_p) "$(DESTDIR)$(bindir)" + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + p1=`echo $$p|sed 's/$(EXEEXT)$$//'`; \ + if test -f $$p \ + ; then \ + f=`echo "$$p1" | sed 's,^.*/,,;$(transform);s/$$/$(EXEEXT)/'`; \ + echo " $(INSTALL_PROGRAM_ENV) $(binPROGRAMS_INSTALL) '$$p' '$(DESTDIR)$(bindir)/$$f'"; \ + $(INSTALL_PROGRAM_ENV) $(binPROGRAMS_INSTALL) "$$p" "$(DESTDIR)$(bindir)/$$f" || exit 1; \ + else :; fi; \ + done + +uninstall-binPROGRAMS: + @$(NORMAL_UNINSTALL) + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + f=`echo "$$p" | sed 's,^.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/'`; \ + echo " rm -f '$(DESTDIR)$(bindir)/$$f'"; \ + rm -f "$(DESTDIR)$(bindir)/$$f"; \ + done + +clean-binPROGRAMS: + -test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS) +lmserver$(EXEEXT): $(lmserver_OBJECTS) $(lmserver_DEPENDENCIES) + @rm -f lmserver$(EXEEXT) + $(CXXLINK) $(lmserver_LDFLAGS) $(lmserver_OBJECTS) $(lmserver_LDADD) $(LIBS) +lmserver-debug$(EXEEXT): $(lmserver_debug_OBJECTS) $(lmserver_debug_DEPENDENCIES) + @rm -f lmserver-debug$(EXEEXT) + $(CXXLINK) $(lmserver_debug_LDFLAGS) $(lmserver_debug_OBJECTS) $(lmserver_debug_LDADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lmserver-lmserver.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lmserver-srilm.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lmserver-thread.Po@am__quote@ +@AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/lmserver.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/srilm.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/thread.Po@am__quote@ + +.c.o: +@am__fastdepCC_TRUE@ if $(COMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ $<; \ +@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c $< + +.c.obj: +@am__fastdepCC_TRUE@ if $(COMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ `$(CYGPATH_W) '$<'`; \ +@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` + +lmserver-lmserver.o: lmserver.c +@am__fastdepCC_TRUE@ if $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(lmserver_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT lmserver-lmserver.o -MD -MP -MF "$(DEPDIR)/lmserver-lmserver.Tpo" -c -o lmserver-lmserver.o `test -f 'lmserver.c' || echo '$(srcdir)/'`lmserver.c; \ +@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/lmserver-lmserver.Tpo" "$(DEPDIR)/lmserver-lmserver.Po"; else rm -f "$(DEPDIR)/lmserver-lmserver.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='lmserver.c' object='lmserver-lmserver.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(lmserver_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o lmserver-lmserver.o `test -f 'lmserver.c' || echo '$(srcdir)/'`lmserver.c + +lmserver-lmserver.obj: 
lmserver.c +@am__fastdepCC_TRUE@ if $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(lmserver_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT lmserver-lmserver.obj -MD -MP -MF "$(DEPDIR)/lmserver-lmserver.Tpo" -c -o lmserver-lmserver.obj `if test -f 'lmserver.c'; then $(CYGPATH_W) 'lmserver.c'; else $(CYGPATH_W) '$(srcdir)/lmserver.c'; fi`; \ +@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/lmserver-lmserver.Tpo" "$(DEPDIR)/lmserver-lmserver.Po"; else rm -f "$(DEPDIR)/lmserver-lmserver.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='lmserver.c' object='lmserver-lmserver.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(lmserver_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o lmserver-lmserver.obj `if test -f 'lmserver.c'; then $(CYGPATH_W) 'lmserver.c'; else $(CYGPATH_W) '$(srcdir)/lmserver.c'; fi` + +lmserver-thread.o: thread.c +@am__fastdepCC_TRUE@ if $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(lmserver_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT lmserver-thread.o -MD -MP -MF "$(DEPDIR)/lmserver-thread.Tpo" -c -o lmserver-thread.o `test -f 'thread.c' || echo '$(srcdir)/'`thread.c; \ +@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/lmserver-thread.Tpo" "$(DEPDIR)/lmserver-thread.Po"; else rm -f "$(DEPDIR)/lmserver-thread.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='thread.c' object='lmserver-thread.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(lmserver_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o lmserver-thread.o `test -f 'thread.c' || echo '$(srcdir)/'`thread.c + +lmserver-thread.obj: thread.c +@am__fastdepCC_TRUE@ if $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(lmserver_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT 
lmserver-thread.obj -MD -MP -MF "$(DEPDIR)/lmserver-thread.Tpo" -c -o lmserver-thread.obj `if test -f 'thread.c'; then $(CYGPATH_W) 'thread.c'; else $(CYGPATH_W) '$(srcdir)/thread.c'; fi`; \ +@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/lmserver-thread.Tpo" "$(DEPDIR)/lmserver-thread.Po"; else rm -f "$(DEPDIR)/lmserver-thread.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='thread.c' object='lmserver-thread.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(lmserver_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o lmserver-thread.obj `if test -f 'thread.c'; then $(CYGPATH_W) 'thread.c'; else $(CYGPATH_W) '$(srcdir)/thread.c'; fi` + +.cc.o: +@am__fastdepCXX_TRUE@ if $(CXXCOMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ $<; \ +@am__fastdepCXX_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ $< + +.cc.obj: +@am__fastdepCXX_TRUE@ if $(CXXCOMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ `$(CYGPATH_W) '$<'`; \ +@am__fastdepCXX_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +lmserver-srilm.o: srilm.cc +@am__fastdepCXX_TRUE@ if $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(lmserver_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT lmserver-srilm.o -MD -MP -MF "$(DEPDIR)/lmserver-srilm.Tpo" -c -o lmserver-srilm.o `test -f 
'srilm.cc' || echo '$(srcdir)/'`srilm.cc; \ +@am__fastdepCXX_TRUE@ then mv -f "$(DEPDIR)/lmserver-srilm.Tpo" "$(DEPDIR)/lmserver-srilm.Po"; else rm -f "$(DEPDIR)/lmserver-srilm.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='srilm.cc' object='lmserver-srilm.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(lmserver_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o lmserver-srilm.o `test -f 'srilm.cc' || echo '$(srcdir)/'`srilm.cc + +lmserver-srilm.obj: srilm.cc +@am__fastdepCXX_TRUE@ if $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(lmserver_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT lmserver-srilm.obj -MD -MP -MF "$(DEPDIR)/lmserver-srilm.Tpo" -c -o lmserver-srilm.obj `if test -f 'srilm.cc'; then $(CYGPATH_W) 'srilm.cc'; else $(CYGPATH_W) '$(srcdir)/srilm.cc'; fi`; \ +@am__fastdepCXX_TRUE@ then mv -f "$(DEPDIR)/lmserver-srilm.Tpo" "$(DEPDIR)/lmserver-srilm.Po"; else rm -f "$(DEPDIR)/lmserver-srilm.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='srilm.cc' object='lmserver-srilm.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(lmserver_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o lmserver-srilm.obj `if test -f 'srilm.cc'; then $(CYGPATH_W) 'srilm.cc'; else $(CYGPATH_W) '$(srcdir)/srilm.cc'; fi` +uninstall-info-am: + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + mkid -fID $$unique +tags: TAGS + +TAGS: $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) 
+ tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$tags $$unique; \ + fi +ctags: CTAGS +CTAGS: $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(CTAGS_ARGS)$$tags$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$tags $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && cd $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) $$here + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + $(am__remove_distdir) + mkdir $(distdir) + @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \ + list='$(DISTFILES)'; for file in $$list; do \ + case $$file in \ + $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \ + $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \ + esac; \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test "$$dir" != "$$file" && test "$$dir" != "."; then \ + dir="/$$dir"; \ + $(mkdir_p) "$(distdir)$$dir"; \ + else \ + dir=''; \ + fi; \ + if test -d $$d/$$file; then \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ + fi; \ + cp -pR 
$$d/$$file $(distdir)$$dir || exit 1; \ + else \ + test -f $(distdir)/$$file \ + || cp -p $$d/$$file $(distdir)/$$file \ + || exit 1; \ + fi; \ + done + $(MAKE) $(AM_MAKEFLAGS) \ + top_distdir="$(top_distdir)" distdir="$(distdir)" \ + dist-hook + -find $(distdir) -type d ! -perm -777 -exec chmod a+rwx {} \; -o \ + ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \ + ! -type d ! -perm -400 -exec chmod a+r {} \; -o \ + ! -type d ! -perm -444 -exec $(SHELL) $(install_sh) -c -m a+r {} {} \; \ + || chmod -R a+r $(distdir) +dist-gzip: distdir + tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz + $(am__remove_distdir) + +dist-bzip2: distdir + tardir=$(distdir) && $(am__tar) | bzip2 -9 -c >$(distdir).tar.bz2 + $(am__remove_distdir) + +dist-tarZ: distdir + tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z + $(am__remove_distdir) + +dist-shar: distdir + shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz + $(am__remove_distdir) + +dist-zip: distdir + -rm -f $(distdir).zip + zip -rq $(distdir).zip $(distdir) + $(am__remove_distdir) + +dist dist-all: distdir + tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz + $(am__remove_distdir) + +# This target untars the dist file and tries a VPATH configuration. Then +# it guarantees that the distribution is self-contained by making another +# tarfile. 
+distcheck: dist + case '$(DIST_ARCHIVES)' in \ + *.tar.gz*) \ + GZIP=$(GZIP_ENV) gunzip -c $(distdir).tar.gz | $(am__untar) ;;\ + *.tar.bz2*) \ + bunzip2 -c $(distdir).tar.bz2 | $(am__untar) ;;\ + *.tar.Z*) \ + uncompress -c $(distdir).tar.Z | $(am__untar) ;;\ + *.shar.gz*) \ + GZIP=$(GZIP_ENV) gunzip -c $(distdir).shar.gz | unshar ;;\ + *.zip*) \ + unzip $(distdir).zip ;;\ + esac + chmod -R a-w $(distdir); chmod a+w $(distdir) + mkdir $(distdir)/_build + mkdir $(distdir)/_inst + chmod a-w $(distdir) + dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \ + && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \ + && cd $(distdir)/_build \ + && ../configure --srcdir=.. --prefix="$$dc_install_base" \ + $(DISTCHECK_CONFIGURE_FLAGS) \ + && $(MAKE) $(AM_MAKEFLAGS) \ + && $(MAKE) $(AM_MAKEFLAGS) dvi \ + && $(MAKE) $(AM_MAKEFLAGS) check \ + && $(MAKE) $(AM_MAKEFLAGS) install \ + && $(MAKE) $(AM_MAKEFLAGS) installcheck \ + && $(MAKE) $(AM_MAKEFLAGS) uninstall \ + && $(MAKE) $(AM_MAKEFLAGS) distuninstallcheck_dir="$$dc_install_base" \ + distuninstallcheck \ + && chmod -R a-w "$$dc_install_base" \ + && ({ \ + (cd ../.. 
&& umask 077 && mkdir "$$dc_destdir") \ + && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" install \ + && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" uninstall \ + && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" \ + distuninstallcheck_dir="$$dc_destdir" distuninstallcheck; \ + } || { rm -rf "$$dc_destdir"; exit 1; }) \ + && rm -rf "$$dc_destdir" \ + && $(MAKE) $(AM_MAKEFLAGS) dist \ + && rm -rf $(DIST_ARCHIVES) \ + && $(MAKE) $(AM_MAKEFLAGS) distcleancheck + $(am__remove_distdir) + @(echo "$(distdir) archives ready for distribution: "; \ + list='$(DIST_ARCHIVES)'; for i in $$list; do echo $$i; done) | \ + sed -e '1{h;s/./=/g;p;x;}' -e '$${p;x;}' +distuninstallcheck: + @cd $(distuninstallcheck_dir) \ + && test `$(distuninstallcheck_listfiles) | wc -l` -le 1 \ + || { echo "ERROR: files left after uninstall:" ; \ + if test -n "$(DESTDIR)"; then \ + echo " (check DESTDIR support)"; \ + fi ; \ + $(distuninstallcheck_listfiles) ; \ + exit 1; } >&2 +distcleancheck: distclean + @if test '$(srcdir)' = . 
; then \ + echo "ERROR: distcleancheck can only run from a VPATH build" ; \ + exit 1 ; \ + fi + @test `$(distcleancheck_listfiles) | wc -l` -eq 0 \ + || { echo "ERROR: files left in build directory after distclean:" ; \ + $(distcleancheck_listfiles) ; \ + exit 1; } >&2 +check-am: all-am +check: check-am +all-am: Makefile $(PROGRAMS) config.h +installdirs: + for dir in "$(DESTDIR)$(bindir)"; do \ + test -z "$$dir" || $(mkdir_p) "$$dir"; \ + done +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + `test -z '$(STRIP)' || \ + echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-binPROGRAMS clean-generic mostlyclean-am + +distclean: distclean-am + -rm -f $(am__CONFIG_DISTCLEAN_FILES) + -rm -rf ./$(DEPDIR) + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-hdr distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +info: info-am + +info-am: + +install-data-am: + +install-exec-am: install-binPROGRAMS + +install-info: install-info-am + +install-man: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f $(am__CONFIG_DISTCLEAN_FILES) + -rm -rf $(top_srcdir)/autom4te.cache + -rm -rf ./$(DEPDIR) + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-binPROGRAMS uninstall-info-am + +.PHONY: CTAGS GTAGS all all-am am--refresh check check-am clean \ + clean-binPROGRAMS clean-generic ctags dist dist-all dist-bzip2 \ + dist-gzip dist-hook dist-shar dist-tarZ dist-zip distcheck \ + distclean distclean-compile distclean-generic distclean-hdr \ + distclean-tags distcleancheck distdir distuninstallcheck dvi \ + dvi-am html html-am info info-am install install-am \ + install-binPROGRAMS install-data install-data-am install-exec \ + install-exec-am install-info install-info-am install-man \ + install-strip installcheck installcheck-am installdirs \ + maintainer-clean maintainer-clean-generic mostlyclean \ + mostlyclean-compile mostlyclean-generic pdf pdf-am ps ps-am \ + tags uninstall uninstall-am uninstall-binPROGRAMS \ + uninstall-info-am + + +dist-hook: + rm -rf $(distdir)/doc/.svn/ + rm -rf $(distdir)/scripts/.svn/ + rm -rf $(distdir)/t/.svn/ + rm -rf $(distdir)/t/lib/.svn/ +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. 
+.NOEXPORT: diff --git a/mosesdecoder/contrib/lmserver/daemon.c b/mosesdecoder/contrib/lmserver/daemon.c new file mode 100644 index 0000000000000000000000000000000000000000..9cb7884d0d43932bd3052b16d6477ce1be551dee --- /dev/null +++ b/mosesdecoder/contrib/lmserver/daemon.c @@ -0,0 +1,68 @@ +/* $Header: /cvsroot/wikipedia/willow/src/bin/willow/daemon.c,v 1.1 2005/05/02 19:15:21 kateturner Exp $ */ +/* $NetBSD: daemon.c,v 1.9 2003/08/07 16:42:46 agc Exp $ */ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined __SUNPRO_C || defined __DECC || defined __HP_cc +# pragma ident "@(#)$Header: /cvsroot/wikipedia/willow/src/bin/willow/daemon.c,v 1.1 2005/05/02 19:15:21 kateturner Exp $" +# pragma ident "$NetBSD: daemon.c,v 1.9 2003/08/07 16:42:46 agc Exp $" +#endif + +#include +#include +#include + +int daemon(int nochdir, int noclose) +{ + int fd; + + switch (fork()) { + case -1: + return (-1); + case 0: + break; + default: + _exit(EXIT_SUCCESS); + } + + if (setsid() == -1) + return (-1); + + if (nochdir == 0) + (void)chdir("/"); + + if (noclose == 0 && (fd = open("/dev/null", O_RDWR, 0)) != -1) { + (void)dup2(fd, STDIN_FILENO); + (void)dup2(fd, STDOUT_FILENO); + (void)dup2(fd, STDERR_FILENO); + if (fd > STDERR_FILENO) + (void)close(fd); + } + return (0); +} diff --git a/mosesdecoder/contrib/lmserver/lmserver.c b/mosesdecoder/contrib/lmserver/lmserver.c new file mode 100644 index 0000000000000000000000000000000000000000..d3aa685cc794fc78a583b3b13b5a30e0d143ae2d --- /dev/null +++ b/mosesdecoder/contrib/lmserver/lmserver.c @@ -0,0 +1,2140 @@ +/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +/* + * memcached - memory caching daemon + * + * http://www.danga.com/memcached/ + * + * Copyright 2003 Danga Interactive, Inc. All rights reserved. + * + * Use and distribution licensed under the BSD license. See + * the LICENSE file for full text. 
+ * + * Authors: + * Anatoly Vorobey + * Brad Fitzpatrick +std * + * $Id$ + */ +#include "lmserver.h" +#include "srilm.h" +#include +#include +#include +#include +#include +#include + +/* some POSIX systems need the following definition + * to get mlockall flags out of sys/mman.h. */ +#ifndef _P1003_1B_VISIBLE +#define _P1003_1B_VISIBLE +#endif +/* need this to get IOV_MAX on some platforms. */ +#ifndef __need_IOV_MAX +#define __need_IOV_MAX +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_MALLOC_H +/* OpenBSD has a malloc.h, but warns to use stdlib.h instead */ +#ifndef __OpenBSD__ +#include +#endif +#endif + +/* FreeBSD 4.x doesn't have IOV_MAX exposed. */ +#ifndef IOV_MAX +#if defined(__FreeBSD__) || defined(__APPLE__) +# define IOV_MAX 1024 +#endif +#endif + +/* + * forward declarations + */ +static void drive_machine(conn *c); +static int new_socket(struct addrinfo *ai); +static int server_socket(const int port, const bool is_udp); +static int try_read_command(conn *c); +static int try_read_network(conn *c); +static int try_read_udp(conn *c); + +/* stats */ +static void stats_reset(void); +static void stats_init(void); + +/* defaults */ +static void settings_init(void); + +/* event handling, network IO */ +static void event_handler(const int fd, const short which, void *arg); +static void conn_close(conn *c); +static void conn_init(void); +static void accept_new_conns(const bool do_accept); +static bool update_event(conn *c, const int new_flags); +static void complete_nread(conn *c); +static void process_command(conn *c, char *command); +static int transmit(conn *c); +static int ensure_iov_space(conn *c); +static int add_iov(conn *c, const void *buf, int len); +static int add_msghdr(conn *c); + +/* time handling */ +static void set_current_time(void); /* update the global variable holding + global 32-bit seconds-since-start time + (to avoid 64 bit time_t) */ + +static 
void conn_free(conn *c); + +/** exported globals **/ +struct stats stats; +struct settings settings; + +/** file scope variables **/ +static item **todelete = NULL; +static int delcurr; +static int deltotal; +static conn *listen_conn = NULL; +static struct event_base *main_base; + +#define TRANSMIT_COMPLETE 0 +#define TRANSMIT_INCOMPLETE 1 +#define TRANSMIT_SOFT_ERROR 2 +#define TRANSMIT_HARD_ERROR 3 + +static int *buckets = 0; /* bucket->generation array for a managed instance */ + +#define REALTIME_MAXDELTA 60*60*24*30 +/* + * given time value that's either unix time or delta from current unix time, return + * unix time. Use the fact that delta can't exceed one month (and real time value can't + * be that low). + */ +static rel_time_t realtime(const time_t exptime) { + /* no. of seconds in 30 days - largest possible delta exptime */ + + if (exptime == 0) return 0; /* 0 means never expire */ + + if (exptime > REALTIME_MAXDELTA) { + /* if item expiration is at/before the server started, give it an + expiration time of 1 second after the server started. + (because 0 means don't expire). without this, we'd + underflow and wrap around to some large value way in the + future, effectively making items expiring in the past + really expiring never */ + if (exptime <= stats.started) + return (rel_time_t)1; + return (rel_time_t)(exptime - stats.started); + } else { + return (rel_time_t)(exptime + current_time); + } +} + +static void stats_init(void) { + stats.curr_items = stats.total_items = stats.curr_conns = stats.total_conns = stats.conn_structs = 0; + stats.get_cmds = stats.set_cmds = stats.get_hits = stats.get_misses = stats.evictions = 0; + stats.curr_bytes = stats.bytes_read = stats.bytes_written = 0; + + /* make the time we started always be 2 seconds before we really + did, so time(0) - time.started is never zero. if so, things + like 'settings.oldest_live' which act as booleans as well as + values are now false in boolean context... 
*/ + stats.started = time(0) - 2; +} + +static void stats_reset(void) { + STATS_LOCK(); + stats.total_items = stats.total_conns = 0; + stats.get_cmds = stats.set_cmds = stats.get_hits = stats.get_misses = stats.evictions = 0; + stats.bytes_read = stats.bytes_written = 0; + STATS_UNLOCK(); +} + +static void settings_init(void) { + settings.srilm = NULL; + settings.srilm_order = 3; + settings.access=0700; + settings.port = 11211; + settings.udpport = 0; + /* By default this string should be NULL for getaddrinfo() */ + settings.inter = NULL; + settings.maxbytes = 64 * 1024 * 1024; /* default is 64MB */ + settings.maxconns = 1024; /* to limit connections-related memory to about 5MB */ + settings.verbose = 0; + settings.oldest_live = 0; + settings.evict_to_free = 1; /* push old items out of cache when memory runs out */ + settings.socketpath = NULL; /* by default, not using a unix socket */ + settings.managed = false; + settings.factor = 1.25; + settings.chunk_size = 48; /* space for a modest key and value */ +#ifdef USE_THREADS + settings.num_threads = 4; +#else + settings.num_threads = 1; +#endif + settings.detail_enabled = 0; +} + +/* returns true if a deleted item's delete-locked-time is over, and it + should be removed from the namespace */ +static bool item_delete_lock_over (item *it) { + assert(it->it_flags & ITEM_DELETED); + return (current_time >= it->exptime); +} + +/* + * Adds a message header to a connection. + * + * Returns 0 on success, -1 on out-of-memory. + */ +static int add_msghdr(conn *c) +{ + struct msghdr *msg; + + assert(c != NULL); + + if (c->msgsize == c->msgused) { + msg = realloc(c->msglist, c->msgsize * 2 * sizeof(struct msghdr)); + if (! 
msg) + return -1; + c->msglist = msg; + c->msgsize *= 2; + } + + msg = c->msglist + c->msgused; + + /* this wipes msg_iovlen, msg_control, msg_controllen, and + msg_flags, the last 3 of which aren't defined on solaris: */ + memset(msg, 0, sizeof(struct msghdr)); + + msg->msg_iov = &c->iov[c->iovused]; + + if (c->request_addr_size > 0) { + msg->msg_name = &c->request_addr; + msg->msg_namelen = c->request_addr_size; + } + + c->msgbytes = 0; + c->msgused++; + + if (c->udp) { + /* Leave room for the UDP header, which we'll fill in later. */ + return add_iov(c, NULL, UDP_HEADER_SIZE); + } + + return 0; +} + + +/* + * Free list management for connections. + */ + +static conn **freeconns; +static int freetotal; +static int freecurr; + + +static void conn_init(void) { + freetotal = 200; + freecurr = 0; + if ((freeconns = (conn **)malloc(sizeof(conn *) * freetotal)) == NULL) { + fprintf(stderr, "malloc()\n"); + } + return; +} + +/* + * Returns a connection from the freelist, if any. Should call this using + * conn_from_freelist() for thread safety. + */ +conn *do_conn_from_freelist() { + conn *c; + + if (freecurr > 0) { + c = freeconns[--freecurr]; + } else { + c = NULL; + } + + return c; +} + +/* + * Adds a connection to the freelist. 0 = success. Should call this using + * conn_add_to_freelist() for thread safety. 
+ */ +bool do_conn_add_to_freelist(conn *c) { + if (freecurr < freetotal) { + freeconns[freecurr++] = c; + return false; + } else { + /* try to enlarge free connections array */ + conn **new_freeconns = realloc(freeconns, sizeof(conn *) * freetotal * 2); + if (new_freeconns) { + freetotal *= 2; + freeconns = new_freeconns; + freeconns[freecurr++] = c; + return false; + } + } + return true; +} + +conn *conn_new(const int sfd, const int init_state, const int event_flags, + const int read_buffer_size, const bool is_udp, struct event_base *base) { + conn *c = conn_from_freelist(); + + if (NULL == c) { + if (!(c = (conn *)calloc(1, sizeof(conn)))) { + fprintf(stderr, "calloc()\n"); + return NULL; + } + + c->rbuf = c->wbuf = 0; + c->ilist = 0; + c->suffixlist = 0; + c->iov = 0; + c->msglist = 0; + c->hdrbuf = 0; + + c->rsize = read_buffer_size; + c->wsize = DATA_BUFFER_SIZE; + c->isize = ITEM_LIST_INITIAL; + c->suffixsize = SUFFIX_LIST_INITIAL; + c->iovsize = IOV_LIST_INITIAL; + c->msgsize = MSG_LIST_INITIAL; + c->hdrsize = 0; + + c->rbuf = (char *)malloc((size_t)c->rsize); + c->wbuf = (char *)malloc((size_t)c->wsize); + c->ilist = (item **)malloc(sizeof(item *) * c->isize); + c->suffixlist = (char **)malloc(sizeof(char *) * c->suffixsize); + c->iov = (struct iovec *)malloc(sizeof(struct iovec) * c->iovsize); + c->msglist = (struct msghdr *)malloc(sizeof(struct msghdr) * c->msgsize); + + if (c->rbuf == 0 || c->wbuf == 0 || c->ilist == 0 || c->iov == 0 || + c->msglist == 0 || c->suffixlist == 0) { + conn_free(c); + fprintf(stderr, "malloc()\n"); + return NULL; + } + + STATS_LOCK(); + stats.conn_structs++; + STATS_UNLOCK(); + } + + if (settings.verbose > 1) { + if (init_state == conn_listening) + fprintf(stderr, "<%d server listening\n", sfd); + else if (is_udp) + fprintf(stderr, "<%d server listening (udp)\n", sfd); + else + fprintf(stderr, "<%d new client connection\n", sfd); + } + + c->sfd = sfd; + c->udp = is_udp; + c->state = init_state; + c->rlbytes = 0; + c->rbytes 
= c->wbytes = 0; + c->wcurr = c->wbuf; + c->rcurr = c->rbuf; + c->ritem = 0; + c->icurr = c->ilist; + c->suffixcurr = c->suffixlist; + c->ileft = 0; + c->suffixleft = 0; + c->iovused = 0; + c->msgcurr = 0; + c->msgused = 0; + + c->write_and_go = conn_read; + c->write_and_free = 0; + c->item = 0; + c->bucket = -1; + c->gen = 0; + + c->noreply = false; + + event_set(&c->event, sfd, event_flags, event_handler, (void *)c); + event_base_set(base, &c->event); + c->ev_flags = event_flags; + + if (event_add(&c->event, 0) == -1) { + if (conn_add_to_freelist(c)) { + conn_free(c); + } + perror("event_add"); + return NULL; + } + + STATS_LOCK(); + stats.curr_conns++; + stats.total_conns++; + STATS_UNLOCK(); + + return c; +} + +static void conn_cleanup(conn *c) { + assert(c != NULL); + + if (c->write_and_free) { + free(c->write_and_free); + c->write_and_free = 0; + } +} + +/* + * Frees a connection. + */ +void conn_free(conn *c) { + if (c) { + if (c->hdrbuf) + free(c->hdrbuf); + if (c->msglist) + free(c->msglist); + if (c->rbuf) + free(c->rbuf); + if (c->wbuf) + free(c->wbuf); + if (c->ilist) + free(c->ilist); + if (c->suffixlist) + free(c->suffixlist); + if (c->iov) + free(c->iov); + free(c); + } +} + +static void conn_close(conn *c) { + assert(c != NULL); + + /* delete the event, the socket and the conn */ + event_del(&c->event); + + if (settings.verbose > 1) + fprintf(stderr, "<%d connection closed.\n", c->sfd); + + close(c->sfd); + accept_new_conns(true); + conn_cleanup(c); + + /* if the connection has big buffers, just free it */ + if (c->rsize > READ_BUFFER_HIGHWAT || conn_add_to_freelist(c)) { + conn_free(c); + } + + STATS_LOCK(); + stats.curr_conns--; + STATS_UNLOCK(); + + return; +} + + +/* + * Shrinks a connection's buffers if they're too big. This prevents + * periodic large "get" requests from permanently chewing lots of server + * memory. + * + * This should only be called in between requests since it can wipe output + * buffers! 
+ */ +static void conn_shrink(conn *c) { + assert(c != NULL); + + if (c->udp) + return; + + if (c->rsize > READ_BUFFER_HIGHWAT && c->rbytes < DATA_BUFFER_SIZE) { + char *newbuf; + + if (c->rcurr != c->rbuf) + memmove(c->rbuf, c->rcurr, (size_t)c->rbytes); + + newbuf = (char *)realloc((void *)c->rbuf, DATA_BUFFER_SIZE); + + if (newbuf) { + c->rbuf = newbuf; + c->rsize = DATA_BUFFER_SIZE; + } + /* TODO check other branch... */ + c->rcurr = c->rbuf; + } + + if (c->isize > ITEM_LIST_HIGHWAT) { + item **newbuf = (item**) realloc((void *)c->ilist, ITEM_LIST_INITIAL * sizeof(c->ilist[0])); + if (newbuf) { + c->ilist = newbuf; + c->isize = ITEM_LIST_INITIAL; + } + /* TODO check error condition? */ + } + + if (c->msgsize > MSG_LIST_HIGHWAT) { + struct msghdr *newbuf = (struct msghdr *) realloc((void *)c->msglist, MSG_LIST_INITIAL * sizeof(c->msglist[0])); + if (newbuf) { + c->msglist = newbuf; + c->msgsize = MSG_LIST_INITIAL; + } + /* TODO check error condition? */ + } + + if (c->iovsize > IOV_LIST_HIGHWAT) { + struct iovec *newbuf = (struct iovec *) realloc((void *)c->iov, IOV_LIST_INITIAL * sizeof(c->iov[0])); + if (newbuf) { + c->iov = newbuf; + c->iovsize = IOV_LIST_INITIAL; + } + /* TODO check return value */ + } +} + +/* + * Sets a connection's current state in the state machine. Any special + * processing that needs to happen on certain state transitions can + * happen here. + */ +static void conn_set_state(conn *c, int state) { + assert(c != NULL); + + if (state != c->state) { + if (state == conn_read) { + conn_shrink(c); + //assoc_move_next_bucket(); + } + c->state = state; + } +} + +/* + * Ensures that there is room for another struct iovec in a connection's + * iov list. + * + * Returns 0 on success, -1 on out-of-memory. + */ +static int ensure_iov_space(conn *c) { + assert(c != NULL); + + if (c->iovused >= c->iovsize) { + int i, iovnum; + struct iovec *new_iov = (struct iovec *)realloc(c->iov, + (c->iovsize * 2) * sizeof(struct iovec)); + if (! 
new_iov) + return -1; + c->iov = new_iov; + c->iovsize *= 2; + + /* Point all the msghdr structures at the new list. */ + for (i = 0, iovnum = 0; i < c->msgused; i++) { + c->msglist[i].msg_iov = &c->iov[iovnum]; + iovnum += c->msglist[i].msg_iovlen; + } + } + + return 0; +} + + +/* + * Adds data to the list of pending data that will be written out to a + * connection. + * + * Returns 0 on success, -1 on out-of-memory. + */ + +static int add_iov(conn *c, const void *buf, int len) { + struct msghdr *m; + int leftover; + bool limit_to_mtu; + + assert(c != NULL); + + do { + m = &c->msglist[c->msgused - 1]; + + /* + * Limit UDP packets, and the first payloads of TCP replies, to + * UDP_MAX_PAYLOAD_SIZE bytes. + */ + limit_to_mtu = c->udp || (1 == c->msgused); + + /* We may need to start a new msghdr if this one is full. */ + if (m->msg_iovlen == IOV_MAX || + (limit_to_mtu && c->msgbytes >= UDP_MAX_PAYLOAD_SIZE)) { + add_msghdr(c); + m = &c->msglist[c->msgused - 1]; + } + + if (ensure_iov_space(c) != 0) + return -1; + + /* If the fragment is too big to fit in the datagram, split it up */ + if (limit_to_mtu && len + c->msgbytes > UDP_MAX_PAYLOAD_SIZE) { + leftover = len + c->msgbytes - UDP_MAX_PAYLOAD_SIZE; + len -= leftover; + } else { + leftover = 0; + } + + m = &c->msglist[c->msgused - 1]; + m->msg_iov[m->msg_iovlen].iov_base = (void *)buf; + m->msg_iov[m->msg_iovlen].iov_len = len; + + c->msgbytes += len; + c->iovused++; + m->msg_iovlen++; + + buf = ((char *)buf) + len; + len = leftover; + } while (leftover > 0); + + return 0; +} + + +/* + * Constructs a set of UDP headers and attaches them to the outgoing messages. + */ +static int build_udp_headers(conn *c) { + int i; + unsigned char *hdr; + + assert(c != NULL); + + if (c->msgused > c->hdrsize) { + void *new_hdrbuf; + if (c->hdrbuf) + new_hdrbuf = realloc(c->hdrbuf, c->msgused * 2 * UDP_HEADER_SIZE); + else + new_hdrbuf = malloc(c->msgused * 2 * UDP_HEADER_SIZE); + if (! 
new_hdrbuf) + return -1; + c->hdrbuf = (unsigned char *)new_hdrbuf; + c->hdrsize = c->msgused * 2; + } + + hdr = c->hdrbuf; + for (i = 0; i < c->msgused; i++) { + c->msglist[i].msg_iov[0].iov_base = hdr; + c->msglist[i].msg_iov[0].iov_len = UDP_HEADER_SIZE; + *hdr++ = c->request_id / 256; + *hdr++ = c->request_id % 256; + *hdr++ = i / 256; + *hdr++ = i % 256; + *hdr++ = c->msgused / 256; + *hdr++ = c->msgused % 256; + *hdr++ = 0; + *hdr++ = 0; + assert((void *) hdr == (void *)c->msglist[i].msg_iov[0].iov_base + UDP_HEADER_SIZE); + } + + return 0; +} + + +static void out_string(conn *c, const char *str) { + size_t len; + + assert(c != NULL); + + if (c->noreply) { + if (settings.verbose > 1) + fprintf(stderr, ">%d NOREPLY %s\n", c->sfd, str); + c->noreply = false; + conn_set_state(c, conn_read); + return; + } + + if (settings.verbose > 1) + fprintf(stderr, ">%d %s\n", c->sfd, str); + + len = strlen(str); + if ((len + 2) > c->wsize) { + /* ought to be always enough. just fail for simplicity */ + str = "SERVER_ERROR output line too long"; + len = strlen(str); + } + + memcpy(c->wbuf, str, len); + memcpy(c->wbuf + len, "\r\n", 2); + c->wbytes = len + 2; + c->wcurr = c->wbuf; + + conn_set_state(c, conn_write); + c->write_and_go = conn_read; + return; +} + +typedef struct token_s { + char *value; + size_t length; +} token_t; + +#define COMMAND_TOKEN 0 +#define SUBCOMMAND_TOKEN 1 +#define KEY_TOKEN 1 +#define KEY_MAX_LENGTH 250 + +#define MAX_TOKENS 8 + +/* + * Tokenize the command string by replacing whitespace with '\0' and update + * the token array tokens with pointer to start of each token and length. + * Returns total number of tokens. The last valid token is the terminal + * token (value points to the first unprocessed character of the string and + * length zero). + * + * Usage example: + * + * while(tokenize_command(command, ncommand, tokens, max_tokens) > 0) { + * for(int ix = 0; tokens[ix].length != 0; ix++) { + * ... 
+ * } + * ncommand = tokens[ix].value - command; + * command = tokens[ix].value; + * } + */ +static size_t tokenize_command(char *command, token_t *tokens, const size_t max_tokens) { + char *s, *e; + size_t ntokens = 0; + + assert(command != NULL && tokens != NULL && max_tokens > 1); + + for (s = e = command; ntokens < max_tokens - 1; ++e) { + if (*e == ' ') { + if (s != e) { + tokens[ntokens].value = s; + tokens[ntokens].length = e - s; + ntokens++; + *e = '\0'; + } + s = e + 1; + } + else if (*e == '\0') { + if (s != e) { + tokens[ntokens].value = s; + tokens[ntokens].length = e - s; + ntokens++; + } + + break; /* string end */ + } + } + + /* + * If we scanned the whole string, the terminal value pointer is null, + * otherwise it is the first unprocessed character. + */ + tokens[ntokens].value = *e == '\0' ? NULL : e; + tokens[ntokens].length = 0; + ntokens++; + + return ntokens; +} + +/* set up a connection to write a buffer then free it, used for stats */ +static void write_and_free(conn *c, char *buf, int bytes) { + if (buf) { + c->write_and_free = buf; + c->wcurr = buf; + c->wbytes = bytes; + conn_set_state(c, conn_write); + c->write_and_go = conn_read; + } else { + out_string(c, "SERVER_ERROR out of memory writing stats"); + } +} + +static inline void set_noreply_maybe(conn *c, token_t *tokens, size_t ntokens) +{ + int noreply_index = ntokens - 2; + + /* + NOTE: this function is not the first place where we are going to + send the reply. We could send it instead from process_command() + if the request line has wrong number of tokens. However parsing + malformed line for "noreply" option is not reliable anyway, so + it can't be helped. 
+ */ + if (tokens[noreply_index].value + && strcmp(tokens[noreply_index].value, "noreply") == 0) { + c->noreply = true; + } +} + +inline static void process_stats_detail(conn *c, const char *command) { + assert(c != NULL); + + if (strcmp(command, "on") == 0) { + settings.detail_enabled = 1; + out_string(c, "OK"); + } + else if (strcmp(command, "off") == 0) { + settings.detail_enabled = 0; + out_string(c, "OK"); + } else { + out_string(c, "CLIENT_ERROR usage: stats detail on|off|dump"); + } +} + +static void process_stat(conn *c, token_t *tokens, const size_t ntokens) { + rel_time_t now = current_time; + char *command; + char *subcommand; + + assert(c != NULL); + + if(ntokens < 2) { + out_string(c, "CLIENT_ERROR bad command line"); + return; + } + + command = tokens[COMMAND_TOKEN].value; + + if (ntokens == 2 && strcmp(command, "stats") == 0) { + char temp[1024]; + pid_t pid = getpid(); + char *pos = temp; + +#ifndef WIN32 + struct rusage usage; + getrusage(RUSAGE_SELF, &usage); +#endif /* !WIN32 */ + + STATS_LOCK(); + pos += sprintf(pos, "STAT pid %u\r\n", pid); + pos += sprintf(pos, "STAT uptime %u\r\n", now); + pos += sprintf(pos, "STAT time %ld\r\n", now + stats.started); + pos += sprintf(pos, "STAT version " VERSION "\r\n"); + pos += sprintf(pos, "STAT pointer_size %d\r\n", 8 * sizeof(void *)); +#ifndef WIN32 + pos += sprintf(pos, "STAT rusage_user %ld.%06ld\r\n", usage.ru_utime.tv_sec, usage.ru_utime.tv_usec); + pos += sprintf(pos, "STAT rusage_system %ld.%06ld\r\n", usage.ru_stime.tv_sec, usage.ru_stime.tv_usec); +#endif /* !WIN32 */ + pos += sprintf(pos, "STAT curr_items %u\r\n", stats.curr_items); + pos += sprintf(pos, "STAT total_items %u\r\n", stats.total_items); + pos += sprintf(pos, "STAT bytes %llu\r\n", stats.curr_bytes); + pos += sprintf(pos, "STAT curr_connections %u\r\n", stats.curr_conns - 1); /* ignore listening conn */ + pos += sprintf(pos, "STAT total_connections %u\r\n", stats.total_conns); + pos += sprintf(pos, "STAT connection_structures 
%u\r\n", stats.conn_structs); + pos += sprintf(pos, "STAT cmd_get %llu\r\n", stats.get_cmds); + pos += sprintf(pos, "STAT cmd_set %llu\r\n", stats.set_cmds); + pos += sprintf(pos, "STAT get_hits %llu\r\n", stats.get_hits); + pos += sprintf(pos, "STAT get_misses %llu\r\n", stats.get_misses); + pos += sprintf(pos, "STAT evictions %llu\r\n", stats.evictions); + pos += sprintf(pos, "STAT bytes_read %llu\r\n", stats.bytes_read); + pos += sprintf(pos, "STAT bytes_written %llu\r\n", stats.bytes_written); + pos += sprintf(pos, "STAT limit_maxbytes %llu\r\n", (uint64_t) settings.maxbytes); + pos += sprintf(pos, "STAT threads %u\r\n", settings.num_threads); + pos += sprintf(pos, "END"); + STATS_UNLOCK(); + out_string(c, temp); + return; + } + + subcommand = tokens[SUBCOMMAND_TOKEN].value; + + if (strcmp(subcommand, "reset") == 0) { + stats_reset(); + out_string(c, "RESET"); + return; + } + +#ifdef HAVE_MALLOC_H +#ifdef HAVE_STRUCT_MALLINFO + if (strcmp(subcommand, "malloc") == 0) { + char temp[512]; + struct mallinfo info; + char *pos = temp; + + info = mallinfo(); + pos += sprintf(pos, "STAT arena_size %d\r\n", info.arena); + pos += sprintf(pos, "STAT free_chunks %d\r\n", info.ordblks); + pos += sprintf(pos, "STAT fastbin_blocks %d\r\n", info.smblks); + pos += sprintf(pos, "STAT mmapped_regions %d\r\n", info.hblks); + pos += sprintf(pos, "STAT mmapped_space %d\r\n", info.hblkhd); + pos += sprintf(pos, "STAT max_total_alloc %d\r\n", info.usmblks); + pos += sprintf(pos, "STAT fastbin_space %d\r\n", info.fsmblks); + pos += sprintf(pos, "STAT total_alloc %d\r\n", info.uordblks); + pos += sprintf(pos, "STAT total_free %d\r\n", info.fordblks); + pos += sprintf(pos, "STAT releasable_space %d\r\nEND", info.keepcost); + out_string(c, temp); + return; + } +#endif /* HAVE_STRUCT_MALLINFO */ +#endif /* HAVE_MALLOC_H */ + out_string(c, "ERROR"); +} + +static inline void process_srilm_command(conn *c, token_t *tokens, size_t ntokens) { + int context[6]; + int i = 1; + int j = ntokens - 
3; + while (tokens[i].length) { + context[i-1] = srilm_getvoc(tokens[i].value); + ++i; + } + float p = -999.0f; + if (context[0] != -1) { + context[i-1] = -1; + p = srilm_wordprob(context[0], &context[1]); + } + + memcpy(c->wbuf, &p, sizeof(float)); + memcpy(c->wbuf + sizeof(float), "\r\n", 2); + c->wbytes = sizeof(float) + 2; + c->wcurr = c->wbuf; + + conn_set_state(c, conn_write); + c->write_and_go = conn_read; +} + +static void process_command(conn *c, char *command) { + + token_t tokens[MAX_TOKENS]; + size_t ntokens; + int comm; + + assert(c != NULL); + + if (settings.verbose > 1) + fprintf(stderr, "<%d %s\n", c->sfd, command); + + /* + * for commands set/add/replace, we build an item and read the data + * directly into it, then continue in nread_complete(). + */ + + c->msgcurr = 0; + c->msgused = 0; + c->iovused = 0; + if (add_msghdr(c) != 0) { + out_string(c, "SERVER_ERROR out of memory preparing response"); + return; + } + + ntokens = tokenize_command(command, tokens, MAX_TOKENS); + if (ntokens >1 && + strcmp(tokens[COMMAND_TOKEN].value, "prob") == 0) { + process_srilm_command(c, tokens, ntokens); + } else if (ntokens >= 2 && (strcmp(tokens[COMMAND_TOKEN].value, "stats") == 0)) { + + process_stat(c, tokens, ntokens); + + } else if (ntokens == 2 && (strcmp(tokens[COMMAND_TOKEN].value, "version") == 0)) { + + out_string(c, "VERSION " VERSION); + + } else if (ntokens == 2 && (strcmp(tokens[COMMAND_TOKEN].value, "quit") == 0)) { + + conn_set_state(c, conn_closing); + + } else { + out_string(c, "ERROR"); + } + return; +} + +/* + * if we have a complete line in the buffer, process it. 
+ */ +static int try_read_command(conn *c) { + char *el, *cont; + + assert(c != NULL); + assert(c->rcurr <= (c->rbuf + c->rsize)); + + if (c->rbytes == 0) + return 0; + el = memchr(c->rcurr, '\n', c->rbytes); + if (!el) + return 0; + cont = el + 1; + if ((el - c->rcurr) > 1 && *(el - 1) == '\r') { + el--; + } + *el = '\0'; + + assert(cont <= (c->rcurr + c->rbytes)); + + process_command(c, c->rcurr); + + c->rbytes -= (cont - c->rcurr); + c->rcurr = cont; + + assert(c->rcurr <= (c->rbuf + c->rsize)); + + return 1; +} + +/* + * read a UDP request. + * return 0 if there's nothing to read. + */ +static int try_read_udp(conn *c) { + int res; + + assert(c != NULL); + + c->request_addr_size = sizeof(c->request_addr); + res = recvfrom(c->sfd, c->rbuf, c->rsize, + 0, &c->request_addr, &c->request_addr_size); + if (res > 8) { + unsigned char *buf = (unsigned char *)c->rbuf; + STATS_LOCK(); + stats.bytes_read += res; + STATS_UNLOCK(); + + /* Beginning of UDP packet is the request ID; save it. */ + c->request_id = buf[0] * 256 + buf[1]; + + /* If this is a multi-packet request, drop it. */ + if (buf[4] != 0 || buf[5] != 1) { + out_string(c, "SERVER_ERROR multi-packet request not supported"); + return 0; + } + + /* Don't care about any of the rest of the header. */ + res -= 8; + memmove(c->rbuf, c->rbuf + 8, res); + + c->rbytes += res; + c->rcurr = c->rbuf; + return 1; + } + return 0; +} + +/* + * read from network as much as we can, handle buffer overflow and connection + * close. + * before reading, move the remaining incomplete fragment of a command + * (if any) to the beginning of the buffer. + * return 0 if there's nothing to read on the first read. 
+ */ +static int try_read_network(conn *c) { + int gotdata = 0; + int res; + + assert(c != NULL); + + if (c->rcurr != c->rbuf) { + if (c->rbytes != 0) /* otherwise there's nothing to copy */ + memmove(c->rbuf, c->rcurr, c->rbytes); + c->rcurr = c->rbuf; + } + + while (1) { + if (c->rbytes >= c->rsize) { + char *new_rbuf = realloc(c->rbuf, c->rsize * 2); + if (!new_rbuf) { + if (settings.verbose > 0) + fprintf(stderr, "Couldn't realloc input buffer\n"); + c->rbytes = 0; /* ignore what we read */ + out_string(c, "SERVER_ERROR out of memory reading request"); + c->write_and_go = conn_closing; + return 1; + } + c->rcurr = c->rbuf = new_rbuf; + c->rsize *= 2; + } + + /* unix socket mode doesn't need this, so zeroed out. but why + * is this done for every command? presumably for UDP + * mode. */ + if (!settings.socketpath) { + c->request_addr_size = sizeof(c->request_addr); + } else { + c->request_addr_size = 0; + } + + int avail = c->rsize - c->rbytes; + res = read(c->sfd, c->rbuf + c->rbytes, avail); + if (res > 0) { + STATS_LOCK(); + stats.bytes_read += res; + STATS_UNLOCK(); + gotdata = 1; + c->rbytes += res; + if (res == avail) { + continue; + } else { + break; + } + } + if (res == 0) { + /* connection closed */ + conn_set_state(c, conn_closing); + return 1; + } + if (res == -1) { + if (errno == EAGAIN || errno == EWOULDBLOCK) break; + /* Should close on unhandled errors. */ + conn_set_state(c, conn_closing); + return 1; + } + } + return gotdata; +} + +static bool update_event(conn *c, const int new_flags) { + assert(c != NULL); + + struct event_base *base = c->event.ev_base; + if (c->ev_flags == new_flags) + return true; + if (event_del(&c->event) == -1) return false; + event_set(&c->event, c->sfd, new_flags, event_handler, (void *)c); + event_base_set(base, &c->event); + c->ev_flags = new_flags; + if (event_add(&c->event, 0) == -1) return false; + return true; +} + +/* + * Sets whether we are listening for new connections or not. 
+ */ +void accept_new_conns(const bool do_accept) { + conn *next; + + if (! is_listen_thread()) + return; + + for (next = listen_conn; next; next = next->next) { + if (do_accept) { + update_event(next, EV_READ | EV_PERSIST); + if (listen(next->sfd, 1024) != 0) { + perror("listen"); + } + } + else { + update_event(next, 0); + if (listen(next->sfd, 0) != 0) { + perror("listen"); + } + } + } +} + + +/* + * Transmit the next chunk of data from our list of msgbuf structures. + * + * Returns: + * TRANSMIT_COMPLETE All done writing. + * TRANSMIT_INCOMPLETE More data remaining to write. + * TRANSMIT_SOFT_ERROR Can't write any more right now. + * TRANSMIT_HARD_ERROR Can't write (c->state is set to conn_closing) + */ +static int transmit(conn *c) { + assert(c != NULL); + + if (c->msgcurr < c->msgused && + c->msglist[c->msgcurr].msg_iovlen == 0) { + /* Finished writing the current msg; advance to the next. */ + c->msgcurr++; + } + if (c->msgcurr < c->msgused) { + ssize_t res; + struct msghdr *m = &c->msglist[c->msgcurr]; + + res = sendmsg(c->sfd, m, 0); + if (res > 0) { + STATS_LOCK(); + stats.bytes_written += res; + STATS_UNLOCK(); + + /* We've written some of the data. Remove the completed + iovec entries from the list of pending writes. */ + while (m->msg_iovlen > 0 && res >= m->msg_iov->iov_len) { + res -= m->msg_iov->iov_len; + m->msg_iovlen--; + m->msg_iov++; + } + + /* Might have written just part of the last iovec entry; + adjust it so the next write will do the rest. 
*/ + if (res > 0) { + m->msg_iov->iov_base += res; + m->msg_iov->iov_len -= res; + } + return TRANSMIT_INCOMPLETE; + } + if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + if (!update_event(c, EV_WRITE | EV_PERSIST)) { + if (settings.verbose > 0) + fprintf(stderr, "Couldn't update event\n"); + conn_set_state(c, conn_closing); + return TRANSMIT_HARD_ERROR; + } + return TRANSMIT_SOFT_ERROR; + } + /* if res==0 or res==-1 and error is not EAGAIN or EWOULDBLOCK, + we have a real error, on which we close the connection */ + if (settings.verbose > 0) + perror("Failed to write, and not due to blocking"); + + if (c->udp) + conn_set_state(c, conn_read); + else + conn_set_state(c, conn_closing); + return TRANSMIT_HARD_ERROR; + } else { + return TRANSMIT_COMPLETE; + } +} + +static void drive_machine(conn *c) { + bool stop = false; + int sfd, flags = 1; + socklen_t addrlen; + struct sockaddr_storage addr; + int res; + + assert(c != NULL); + + while (!stop) { + + switch(c->state) { + case conn_listening: + addrlen = sizeof(addr); + if ((sfd = accept(c->sfd, (struct sockaddr *)&addr, &addrlen)) == -1) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + /* these are transient, so don't log anything */ + stop = true; + } else if (errno == EMFILE) { + if (settings.verbose > 0) + fprintf(stderr, "Too many open connections\n"); + accept_new_conns(false); + stop = true; + } else { + perror("accept()"); + stop = true; + } + break; + } + if ((flags = fcntl(sfd, F_GETFL, 0)) < 0 || + fcntl(sfd, F_SETFL, flags | O_NONBLOCK) < 0) { + perror("setting O_NONBLOCK"); + close(sfd); + break; + } + dispatch_conn_new(sfd, conn_read, EV_READ | EV_PERSIST, + DATA_BUFFER_SIZE, false); + break; + + case conn_read: + if (try_read_command(c) != 0) { + continue; + } + if ((c->udp ? 
try_read_udp(c) : try_read_network(c)) != 0) { + continue; + } + /* we have no command line and no data to read from network */ + if (!update_event(c, EV_READ | EV_PERSIST)) { + if (settings.verbose > 0) + fprintf(stderr, "Couldn't update event\n"); + conn_set_state(c, conn_closing); + break; + } + stop = true; + break; + + case conn_nread: + assert(!"nread should not be possible"); + break; + + case conn_swallow: + /* we are reading sbytes and throwing them away */ + if (c->sbytes == 0) { + conn_set_state(c, conn_read); + break; + } + + /* first check if we have leftovers in the conn_read buffer */ + if (c->rbytes > 0) { + int tocopy = c->rbytes > c->sbytes ? c->sbytes : c->rbytes; + c->sbytes -= tocopy; + c->rcurr += tocopy; + c->rbytes -= tocopy; + break; + } + + /* now try reading from the socket */ + res = read(c->sfd, c->rbuf, c->rsize > c->sbytes ? c->sbytes : c->rsize); + if (res > 0) { + STATS_LOCK(); + stats.bytes_read += res; + STATS_UNLOCK(); + c->sbytes -= res; + break; + } + if (res == 0) { /* end of stream */ + conn_set_state(c, conn_closing); + break; + } + if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + if (!update_event(c, EV_READ | EV_PERSIST)) { + if (settings.verbose > 0) + fprintf(stderr, "Couldn't update event\n"); + conn_set_state(c, conn_closing); + break; + } + stop = true; + break; + } + /* otherwise we have a real error, on which we close the connection */ + if (settings.verbose > 0) + fprintf(stderr, "Failed to read, and not due to blocking\n"); + conn_set_state(c, conn_closing); + break; + + case conn_write: + /* + * We want to write out a simple response. If we haven't already, + * assemble it into a msgbuf list (this will be a single-entry + * list for TCP or a two-entry list for UDP). 
+ */ + if (c->iovused == 0 || (c->udp && c->iovused == 1)) { + if (add_iov(c, c->wcurr, c->wbytes) != 0 || + (c->udp && build_udp_headers(c) != 0)) { + if (settings.verbose > 0) + fprintf(stderr, "Couldn't build response\n"); + conn_set_state(c, conn_closing); + break; + } + } + + /* fall through... */ + + case conn_mwrite: + switch (transmit(c)) { + case TRANSMIT_COMPLETE: + if (c->state == conn_write) { + if (c->write_and_free) { + free(c->write_and_free); + c->write_and_free = 0; + } + conn_set_state(c, c->write_and_go); + } else { + if (settings.verbose > 0) + fprintf(stderr, "Unexpected state %d\n", c->state); + conn_set_state(c, conn_closing); + } + break; + + case TRANSMIT_INCOMPLETE: + case TRANSMIT_HARD_ERROR: + break; /* Continue in state machine. */ + + case TRANSMIT_SOFT_ERROR: + stop = true; + break; + } + break; + + case conn_closing: + if (c->udp) + conn_cleanup(c); + else + conn_close(c); + stop = true; + break; + } + } + + return; +} + +void event_handler(const int fd, const short which, void *arg) { + conn *c; + + c = (conn *)arg; + assert(c != NULL); + + c->which = which; + + /* sanity */ + if (fd != c->sfd) { + if (settings.verbose > 0) + fprintf(stderr, "Catastrophic: event fd doesn't match conn fd!\n"); + conn_close(c); + return; + } + + drive_machine(c); + + /* wait for next event */ + return; +} + +static int new_socket(struct addrinfo *ai) { + int sfd; + int flags; + + if ((sfd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol)) == -1) { + perror("socket()"); + return -1; + } + + if ((flags = fcntl(sfd, F_GETFL, 0)) < 0 || + fcntl(sfd, F_SETFL, flags | O_NONBLOCK) < 0) { + perror("setting O_NONBLOCK"); + close(sfd); + return -1; + } + return sfd; +} + + +/* + * Sets a socket's send buffer size to the maximum allowed by the system. + */ +static void maximize_sndbuf(const int sfd) { + socklen_t intsize = sizeof(int); + int last_good = 0; + int min, max, avg; + int old_size; + + /* Start with the default size. 
*/ + if (getsockopt(sfd, SOL_SOCKET, SO_SNDBUF, &old_size, &intsize) != 0) { + if (settings.verbose > 0) + perror("getsockopt(SO_SNDBUF)"); + return; + } + + /* Binary-search for the real maximum. */ + min = old_size; + max = MAX_SENDBUF_SIZE; + + while (min <= max) { + avg = ((unsigned int)(min + max)) / 2; + if (setsockopt(sfd, SOL_SOCKET, SO_SNDBUF, (void *)&avg, intsize) == 0) { + last_good = avg; + min = avg + 1; + } else { + max = avg - 1; + } + } + + if (settings.verbose > 1) + fprintf(stderr, "<%d send buffer was %d, now %d\n", sfd, old_size, last_good); +} + +static int server_socket(const int port, const bool is_udp) { + int sfd; + struct linger ling = {0, 0}; + struct addrinfo *ai; + struct addrinfo *next; + struct addrinfo hints; + char port_buf[NI_MAXSERV]; + int error; + int success = 0; + + int flags =1; + + /* + * the memset call clears nonstandard fields in some impementations + * that otherwise mess things up. + */ + memset(&hints, 0, sizeof (hints)); + hints.ai_flags = AI_PASSIVE|AI_ADDRCONFIG; + if (is_udp) + { + hints.ai_protocol = IPPROTO_UDP; + hints.ai_socktype = SOCK_DGRAM; + hints.ai_family = AF_INET; /* This left here because of issues with OSX 10.5 */ + } else { + hints.ai_family = AF_UNSPEC; + hints.ai_protocol = IPPROTO_TCP; + hints.ai_socktype = SOCK_STREAM; + } + + snprintf(port_buf, NI_MAXSERV, "%d", port); + error= getaddrinfo(settings.inter, port_buf, &hints, &ai); + if (error != 0) { + if (error != EAI_SYSTEM) + fprintf(stderr, "getaddrinfo(): %s\n", gai_strerror(error)); + else + perror("getaddrinfo()"); + + return 1; + } + + for (next= ai; next; next= next->ai_next) { + conn *listen_conn_add; + if ((sfd = new_socket(next)) == -1) { + freeaddrinfo(ai); + return 1; + } + + setsockopt(sfd, SOL_SOCKET, SO_REUSEADDR, (void *)&flags, sizeof(flags)); + if (is_udp) { + maximize_sndbuf(sfd); + } else { + setsockopt(sfd, SOL_SOCKET, SO_KEEPALIVE, (void *)&flags, sizeof(flags)); + setsockopt(sfd, SOL_SOCKET, SO_LINGER, (void *)&ling, 
sizeof(ling)); + setsockopt(sfd, IPPROTO_TCP, TCP_NODELAY, (void *)&flags, sizeof(flags)); + } + + if (bind(sfd, next->ai_addr, next->ai_addrlen) == -1) { + if (errno != EADDRINUSE) { + perror("bind()"); + close(sfd); + freeaddrinfo(ai); + return 1; + } + close(sfd); + continue; + } else { + success++; + if (!is_udp && listen(sfd, 1024) == -1) { + perror("listen()"); + close(sfd); + freeaddrinfo(ai); + return 1; + } + } + + if (is_udp) + { + int c; + + for (c = 0; c < settings.num_threads; c++) { + /* this is guaranteed to hit all threads because we round-robin */ + dispatch_conn_new(sfd, conn_read, EV_READ | EV_PERSIST, + UDP_READ_BUFFER_SIZE, 1); + } + } else { + if (!(listen_conn_add = conn_new(sfd, conn_listening, + EV_READ | EV_PERSIST, 1, false, main_base))) { + fprintf(stderr, "failed to create listening connection\n"); + exit(EXIT_FAILURE); + } + + listen_conn_add->next = listen_conn; + listen_conn = listen_conn_add; + } + } + + freeaddrinfo(ai); + + /* Return zero iff we detected no errors in starting up connections */ + return success == 0; +} + +static int new_socket_unix(void) { + int sfd; + int flags; + + if ((sfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + perror("socket()"); + return -1; + } + + if ((flags = fcntl(sfd, F_GETFL, 0)) < 0 || + fcntl(sfd, F_SETFL, flags | O_NONBLOCK) < 0) { + perror("setting O_NONBLOCK"); + close(sfd); + return -1; + } + return sfd; +} + +static int server_socket_unix(const char *path, int access_mask) { + int sfd; + struct linger ling = {0, 0}; + struct sockaddr_un addr; + struct stat tstat; + int flags =1; + int old_umask; + + if (!path) { + return 1; + } + + if ((sfd = new_socket_unix()) == -1) { + return 1; + } + + /* + * Clean up a previous socket file if we left it around + */ + if (lstat(path, &tstat) == 0) { + if (S_ISSOCK(tstat.st_mode)) + unlink(path); + } + + setsockopt(sfd, SOL_SOCKET, SO_REUSEADDR, (void *)&flags, sizeof(flags)); + setsockopt(sfd, SOL_SOCKET, SO_KEEPALIVE, (void *)&flags, sizeof(flags)); + 
setsockopt(sfd, SOL_SOCKET, SO_LINGER, (void *)&ling, sizeof(ling)); + + /* + * the memset call clears nonstandard fields in some impementations + * that otherwise mess things up. + */ + memset(&addr, 0, sizeof(addr)); + + addr.sun_family = AF_UNIX; + strcpy(addr.sun_path, path); + old_umask=umask( ~(access_mask&0777)); + if (bind(sfd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + perror("bind()"); + close(sfd); + umask(old_umask); + return 1; + } + umask(old_umask); + if (listen(sfd, 1024) == -1) { + perror("listen()"); + close(sfd); + return 1; + } + if (!(listen_conn = conn_new(sfd, conn_listening, + EV_READ | EV_PERSIST, 1, false, main_base))) { + fprintf(stderr, "failed to create listening connection\n"); + exit(EXIT_FAILURE); + } + + return 0; +} + +/* + * We keep the current time of day in a global variable that's updated by a + * timer event. This saves us a bunch of time() system calls (we really only + * need to get the time once a second, whereas there can be tens of thousands + * of requests a second) and allows us to use server-start-relative timestamps + * rather than absolute UNIX timestamps, a space savings on systems where + * sizeof(time_t) > sizeof(unsigned int). + */ +volatile rel_time_t current_time; +static struct event clockevent; + +/* time-sensitive callers can call it by hand with this, outside the normal ever-1-second timer */ +static void set_current_time(void) { + struct timeval timer; + + gettimeofday(&timer, NULL); + current_time = (rel_time_t) (timer.tv_sec - stats.started); +} + +static void clock_handler(const int fd, const short which, void *arg) { + struct timeval t = {.tv_sec = 1, .tv_usec = 0}; + static bool initialized = false; + + if (initialized) { + /* only delete the event if it's actually there. 
*/ + evtimer_del(&clockevent); + } else { + initialized = true; + } + + evtimer_set(&clockevent, clock_handler, 0); + event_base_set(main_base, &clockevent); + evtimer_add(&clockevent, &t); + + set_current_time(); +} + +static void usage(void) { + printf(PACKAGE " " VERSION "\n"); + printf("-p TCP port number to listen on (default: 11211)\n" + "-U UDP port number to listen on (default: 0, off)\n" + "-s unix socket path to listen on (disables network support)\n" + "-a access mask for unix socket, in octal (default 0700)\n" + "-l interface to listen on, default is INDRR_ANY\n" + "-d run as a daemon\n" + "-r maximize core file limit\n" + "-u assume identity of (only when run as root)\n" + "-m max memory to use for items in megabytes, default is 64 MB\n" + "-M return error on memory exhausted (rather than removing items)\n" + "-c max simultaneous connections, default is 1024\n" + "-k lock down all paged memory. Note that there is a\n" + " limit on how much memory you may lock. Trying to\n" + " allocate more than that would fail, so be sure you\n" + " set the limit correctly for the user you started\n" + " the daemon with (not for -u user;\n" + " under sh this is done with 'ulimit -S -l NUM_KB').\n" + "-v verbose (print errors/warnings while in event loop)\n" + "-vv very verbose (also print client commands/reponses)\n" + "-h print this help and exit\n" + "-i print memcached and libevent license\n" + "-b run a managed instanced (mnemonic: buckets)\n" + "-P save PID in , only used with -d option\n" + "-f chunk size growth factor, default 1.25\n" + "-n minimum space allocated for key+value+flags, default 48\n" + +#if defined(HAVE_GETPAGESIZES) && defined(HAVE_MEMCNTL) + "-L Try to use large memory pages (if available). Increasing\n" + " the memory page size could reduce the number of TLB misses\n" + " and improve the performance. 
In order to get large pages\n" + " from the OS, memcached will allocate the total item-cache\n" + " in one large chunk.\n" +#endif + ); + +#ifdef USE_THREADS + printf("-t number of threads to use, default 4\n"); +#endif + return; +} + +static void usage_license(void) { + printf(PACKAGE " " VERSION "\n\n"); + printf( + "Copyright (c) 2003, Danga Interactive, Inc. \n" + "All rights reserved.\n" + "\n" + "Redistribution and use in source and binary forms, with or without\n" + "modification, are permitted provided that the following conditions are\n" + "met:\n" + "\n" + " * Redistributions of source code must retain the above copyright\n" + "notice, this list of conditions and the following disclaimer.\n" + "\n" + " * Redistributions in binary form must reproduce the above\n" + "copyright notice, this list of conditions and the following disclaimer\n" + "in the documentation and/or other materials provided with the\n" + "distribution.\n" + "\n" + " * Neither the name of the Danga Interactive nor the names of its\n" + "contributors may be used to endorse or promote products derived from\n" + "this software without specific prior written permission.\n" + "\n" + "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n" + "\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n" + "LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n" + "A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\n" + "OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n" + "SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n" + "LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n" + "DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n" + "THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n" + "(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n" + "OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n" + "\n" + "\n" + "This product includes software developed by Niels Provos.\n" + "\n" + "[ libevent ]\n" + "\n" + "Copyright 2000-2003 Niels Provos \n" + "All rights reserved.\n" + "\n" + "Redistribution and use in source and binary forms, with or without\n" + "modification, are permitted provided that the following conditions\n" + "are met:\n" + "1. Redistributions of source code must retain the above copyright\n" + " notice, this list of conditions and the following disclaimer.\n" + "2. Redistributions in binary form must reproduce the above copyright\n" + " notice, this list of conditions and the following disclaimer in the\n" + " documentation and/or other materials provided with the distribution.\n" + "3. All advertising materials mentioning features or use of this software\n" + " must display the following acknowledgement:\n" + " This product includes software developed by Niels Provos.\n" + "4. 
The name of the author may not be used to endorse or promote products\n" + " derived from this software without specific prior written permission.\n" + "\n" + "THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR\n" + "IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES\n" + "OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.\n" + "IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,\n" + "INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT\n" + "NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n" + "DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n" + "THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n" + "(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF\n" + "THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n" + ); + + return; +} + +static void save_pid(const pid_t pid, const char *pid_file) { + FILE *fp; + if (pid_file == NULL) + return; + + if ((fp = fopen(pid_file, "w")) == NULL) { + fprintf(stderr, "Could not open the pid file %s for writing\n", pid_file); + return; + } + + fprintf(fp,"%ld\n", (long)pid); + if (fclose(fp) == -1) { + fprintf(stderr, "Could not close the pid file %s.\n", pid_file); + return; + } +} + +static void remove_pidfile(const char *pid_file) { + if (pid_file == NULL) + return; + + if (unlink(pid_file) != 0) { + fprintf(stderr, "Could not remove the pid file %s.\n", pid_file); + } + +} + + +static void sig_handler(const int sig) { + printf("SIGINT handled.\n"); + exit(EXIT_SUCCESS); +} + +#if defined(HAVE_GETPAGESIZES) && defined(HAVE_MEMCNTL) +/* + * On systems that supports multiple page sizes we may reduce the + * number of TLB-misses by using the biggest available page size + */ +int enable_large_pages(void) { + int ret = -1; + size_t sizes[32]; + int avail = getpagesizes(sizes, 32); + if (avail != -1) { + size_t max = sizes[0]; + struct 
memcntl_mha arg = {0}; + int ii; + + for (ii = 1; ii < avail; ++ii) { + if (max < sizes[ii]) { + max = sizes[ii]; + } + } + + arg.mha_flags = 0; + arg.mha_pagesize = max; + arg.mha_cmd = MHA_MAPSIZE_BSSBRK; + + if (memcntl(0, 0, MC_HAT_ADVISE, (caddr_t)&arg, 0, 0) == -1) { + fprintf(stderr, "Failed to set large pages: %s\n", + strerror(errno)); + fprintf(stderr, "Will use default page size\n"); + } else { + ret = 0; + } + } else { + fprintf(stderr, "Failed to get supported pagesizes: %s\n", + strerror(errno)); + fprintf(stderr, "Will use default page size\n"); + } + + return ret; +} +#endif + +int main (int argc, char **argv) { + int c; + int x; + bool lock_memory = false; + bool daemonize = false; + bool preallocate = false; + int maxcore = 0; + char *username = NULL; + char *pid_file = NULL; + struct passwd *pw; + struct sigaction sa; + struct rlimit rlim; + /* listening socket */ + static int *l_socket = NULL; + + /* udp socket */ + static int *u_socket = NULL; + static int u_socket_count = 0; + + /* handle SIGINT */ + signal(SIGINT, sig_handler); + + /* init settings */ + settings_init(); + + /* set stderr non-buffering (for running under, say, daemontools) */ + setbuf(stderr, NULL); + + /* process arguments */ + while ((c = getopt(argc, argv, "x:o:a:bp:s:U:m:Mc:khirvdl:u:P:f:s:n:t:L")) != -1) { + switch (c) { + case 'x': + settings.srilm = optarg; + break; + case 'o': + settings.srilm_order = atoi(optarg); + break; + case 'a': + /* access for unix domain socket, as octal mask (like chmod)*/ + settings.access= strtol(optarg,NULL,8); + break; + + case 'U': + settings.udpport = atoi(optarg); + break; + case 'b': + settings.managed = true; + break; + case 'p': + settings.port = atoi(optarg); + break; + case 's': + settings.socketpath = optarg; + break; + case 'm': + settings.maxbytes = ((size_t)atoi(optarg)) * 1024 * 1024; + break; + case 'M': + settings.evict_to_free = 0; + break; + case 'c': + settings.maxconns = atoi(optarg); + break; + case 'h': + usage(); + 
exit(EXIT_SUCCESS); + case 'i': + usage_license(); + exit(EXIT_SUCCESS); + case 'k': + lock_memory = true; + break; + case 'v': + settings.verbose++; + break; + case 'l': + settings.inter= strdup(optarg); + break; + case 'd': + daemonize = true; + break; + case 'r': + maxcore = 1; + break; + case 'u': + username = optarg; + break; + case 'P': + pid_file = optarg; + break; + case 'f': + settings.factor = atof(optarg); + if (settings.factor <= 1.0) { + fprintf(stderr, "Factor must be greater than 1\n"); + return 1; + } + break; + case 'n': + settings.chunk_size = atoi(optarg); + if (settings.chunk_size == 0) { + fprintf(stderr, "Chunk size must be greater than 0\n"); + return 1; + } + break; + case 't': + settings.num_threads = atoi(optarg); + if (settings.num_threads == 0) { + fprintf(stderr, "Number of threads must be greater than 0\n"); + return 1; + } + break; +#if defined(HAVE_GETPAGESIZES) && defined(HAVE_MEMCNTL) + case 'L' : + if (enable_large_pages() == 0) { + preallocate = true; + } + break; +#endif + default: + fprintf(stderr, "Illegal argument \"%c\"\n", c); + return 1; + } + } + + if (maxcore != 0) { + struct rlimit rlim_new; + /* + * First try raising to infinity; if that fails, try bringing + * the soft limit to the hard. + */ + if (getrlimit(RLIMIT_CORE, &rlim) == 0) { + rlim_new.rlim_cur = rlim_new.rlim_max = RLIM_INFINITY; + if (setrlimit(RLIMIT_CORE, &rlim_new)!= 0) { + /* failed. try raising just to the old max */ + rlim_new.rlim_cur = rlim_new.rlim_max = rlim.rlim_max; + (void)setrlimit(RLIMIT_CORE, &rlim_new); + } + } + /* + * getrlimit again to see what we ended up with. Only fail if + * the soft limit ends up 0, because then no core files will be + * created at all. + */ + + if ((getrlimit(RLIMIT_CORE, &rlim) != 0) || rlim.rlim_cur == 0) { + fprintf(stderr, "failed to ensure corefile creation\n"); + exit(EXIT_FAILURE); + } + } + + /* + * If needed, increase rlimits to allow as many connections + * as needed. 
+ */ + + if (getrlimit(RLIMIT_NOFILE, &rlim) != 0) { + fprintf(stderr, "failed to getrlimit number of files\n"); + exit(EXIT_FAILURE); + } else { + int maxfiles = settings.maxconns; + if (rlim.rlim_cur < maxfiles) + rlim.rlim_cur = maxfiles + 3; + if (rlim.rlim_max < rlim.rlim_cur) + rlim.rlim_max = rlim.rlim_cur; + if (setrlimit(RLIMIT_NOFILE, &rlim) != 0) { + fprintf(stderr, "failed to set rlimit for open files. Try running as root or requesting smaller maxconns value.\n"); + exit(EXIT_FAILURE); + } + } + + /* daemonize if requested */ + /* if we want to ensure our ability to dump core, don't chdir to / */ + if (daemonize) { + int res; + res = daemon(maxcore, settings.verbose); + if (res == -1) { + fprintf(stderr, "failed to daemon() in order to daemonize\n"); + return 1; + } + } + + /* lock paged memory if needed */ + if (lock_memory) { +#ifdef HAVE_MLOCKALL + int res = mlockall(MCL_CURRENT | MCL_FUTURE); + if (res != 0) { + fprintf(stderr, "warning: -k invalid, mlockall() failed: %s\n", + strerror(errno)); + } +#else + fprintf(stderr, "warning: -k invalid, mlockall() not supported on this platform. proceeding without.\n"); +#endif + } + + /* lose root privileges if we have them */ + if (getuid() == 0 || geteuid() == 0) { + if (username == 0 || *username == '\0') { + fprintf(stderr, "can't run as root without the -u switch\n"); + return 1; + } + if ((pw = getpwnam(username)) == 0) { + fprintf(stderr, "can't find the user %s to switch to\n", username); + return 1; + } + if (setgid(pw->pw_gid) < 0 || setuid(pw->pw_uid) < 0) { + fprintf(stderr, "failed to assume identity of user %s\n", username); + return 1; + } + } + + /* initialize main thread libevent instance */ + main_base = event_init(); + + /* initialize other stuff */ + stats_init(); + conn_init(); + if (!settings.srilm) { + fprintf(stderr, "please specify a LM file with -x\n"); + exit(EXIT_FAILURE); + } + srilm_init(settings.srilm, settings.srilm_order); + + /* managed instance? 
alloc and zero a bucket array */ + if (settings.managed) { + buckets = malloc(sizeof(int) * MAX_BUCKETS); + if (buckets == 0) { + fprintf(stderr, "failed to allocate the bucket array"); + exit(EXIT_FAILURE); + } + memset(buckets, 0, sizeof(int) * MAX_BUCKETS); + } + + /* + * ignore SIGPIPE signals; we can use errno==EPIPE if we + * need that information + */ + sa.sa_handler = SIG_IGN; + sa.sa_flags = 0; + if (sigemptyset(&sa.sa_mask) == -1 || + sigaction(SIGPIPE, &sa, 0) == -1) { + perror("failed to ignore SIGPIPE; sigaction"); + exit(EXIT_FAILURE); + } + /* start up worker threads if MT mode */ + thread_init(settings.num_threads, main_base); + /* save the PID in if we're a daemon, do this after thread_init due to + a file descriptor handling bug somewhere in libevent */ + if (daemonize) + save_pid(getpid(), pid_file); + /* initialise clock event */ + clock_handler(0, 0, 0); + + /* create unix mode sockets after dropping privileges */ + if (settings.socketpath != NULL) { + if (server_socket_unix(settings.socketpath,settings.access)) { + fprintf(stderr, "failed to listen\n"); + exit(EXIT_FAILURE); + } + } + + /* create the listening socket, bind it, and init */ + if (settings.socketpath == NULL) { + int udp_port; + + if (server_socket(settings.port, 0)) { + fprintf(stderr, "failed to listen\n"); + exit(EXIT_FAILURE); + } + /* + * initialization order: first create the listening sockets + * (may need root on low ports), then drop root if needed, + * then daemonise if needed, then init libevent (in some cases + * descriptors created by libevent wouldn't survive forking). + */ + udp_port = settings.udpport ? 
settings.udpport : settings.port; + + /* create the UDP listening socket and bind it */ + if (server_socket(udp_port, 1)) { + fprintf(stderr, "failed to listen on UDP port %d\n", settings.udpport); + exit(EXIT_FAILURE); + } + } + + /* enter the event loop */ + event_base_loop(main_base, 0); + /* remove the PID file if we're a daemon */ + if (daemonize) + remove_pidfile(pid_file); + /* Clean up strdup() call for bind() address */ + if (settings.inter) + free(settings.inter); + if (l_socket) + free(l_socket); + if (u_socket) + free(u_socket); + + return 0; +} diff --git a/mosesdecoder/contrib/lmserver/missing b/mosesdecoder/contrib/lmserver/missing new file mode 100644 index 0000000000000000000000000000000000000000..1c8ff7049d8f3aaa9741c53e7f3145d9b76a77d8 --- /dev/null +++ b/mosesdecoder/contrib/lmserver/missing @@ -0,0 +1,367 @@ +#! /bin/sh +# Common stub for a few missing GNU programs while installing. + +scriptversion=2006-05-10.23 + +# Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005, 2006 +# Free Software Foundation, Inc. +# Originally by Fran,cois Pinard , 1996. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +# 02110-1301, USA. 
+ +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +if test $# -eq 0; then + echo 1>&2 "Try \`$0 --help' for more information" + exit 1 +fi + +run=: +sed_output='s/.* --output[ =]\([^ ]*\).*/\1/p' +sed_minuso='s/.* -o \([^ ]*\).*/\1/p' + +# In the cases where this matters, `missing' is being run in the +# srcdir already. +if test -f configure.ac; then + configure_ac=configure.ac +else + configure_ac=configure.in +fi + +msg="missing on your system" + +case $1 in +--run) + # Try to run requested program, and just exit if it succeeds. + run= + shift + "$@" && exit 0 + # Exit code 63 means version mismatch. This often happens + # when the user try to use an ancient version of a tool on + # a file that requires a minimum version. In this case we + # we should proceed has if the program had been absent, or + # if --run hadn't been passed. + if test $? = 63; then + run=: + msg="probably too old" + fi + ;; + + -h|--h|--he|--hel|--help) + echo "\ +$0 [OPTION]... PROGRAM [ARGUMENT]... + +Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an +error status if there is no known handling for PROGRAM. 
+ +Options: + -h, --help display this help and exit + -v, --version output version information and exit + --run try to run the given command, and emulate it if it fails + +Supported PROGRAM values: + aclocal touch file \`aclocal.m4' + autoconf touch file \`configure' + autoheader touch file \`config.h.in' + autom4te touch the output file, or create a stub one + automake touch all \`Makefile.in' files + bison create \`y.tab.[ch]', if possible, from existing .[ch] + flex create \`lex.yy.c', if possible, from existing .c + help2man touch the output file + lex create \`lex.yy.c', if possible, from existing .c + makeinfo touch the output file + tar try tar, gnutar, gtar, then tar without non-portable flags + yacc create \`y.tab.[ch]', if possible, from existing .[ch] + +Send bug reports to ." + exit $? + ;; + + -v|--v|--ve|--ver|--vers|--versi|--versio|--version) + echo "missing $scriptversion (GNU Automake)" + exit $? + ;; + + -*) + echo 1>&2 "$0: Unknown \`$1' option" + echo 1>&2 "Try \`$0 --help' for more information" + exit 1 + ;; + +esac + +# Now exit if we have it, but it failed. Also exit now if we +# don't have it and --version was passed (most likely to detect +# the program). +case $1 in + lex|yacc) + # Not GNU programs, they don't have --version. + ;; + + tar) + if test -n "$run"; then + echo 1>&2 "ERROR: \`tar' requires --run" + exit 1 + elif test "x$2" = "x--version" || test "x$2" = "x--help"; then + exit 1 + fi + ;; + + *) + if test -z "$run" && ($1 --version) > /dev/null 2>&1; then + # We have it, but it failed. + exit 1 + elif test "x$2" = "x--version" || test "x$2" = "x--help"; then + # Could not run --version or --help. This is probably someone + # running `$TOOL --version' or `$TOOL --help' to check whether + # $TOOL exists and not knowing $TOOL uses missing. + exit 1 + fi + ;; +esac + +# If it does not exist, or fails to run (possibly an outdated version), +# try to emulate it. +case $1 in + aclocal*) + echo 1>&2 "\ +WARNING: \`$1' is $msg. 
You should only need it if + you modified \`acinclude.m4' or \`${configure_ac}'. You might want + to install the \`Automake' and \`Perl' packages. Grab them from + any GNU archive site." + touch aclocal.m4 + ;; + + autoconf) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified \`${configure_ac}'. You might want to install the + \`Autoconf' and \`GNU m4' packages. Grab them from any GNU + archive site." + touch configure + ;; + + autoheader) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified \`acconfig.h' or \`${configure_ac}'. You might want + to install the \`Autoconf' and \`GNU m4' packages. Grab them + from any GNU archive site." + files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}` + test -z "$files" && files="config.h" + touch_files= + for f in $files; do + case $f in + *:*) touch_files="$touch_files "`echo "$f" | + sed -e 's/^[^:]*://' -e 's/:.*//'`;; + *) touch_files="$touch_files $f.in";; + esac + done + touch $touch_files + ;; + + automake*) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'. + You might want to install the \`Automake' and \`Perl' packages. + Grab them from any GNU archive site." + find . -type f -name Makefile.am -print | + sed 's/\.am$/.in/' | + while read f; do touch "$f"; done + ;; + + autom4te) + echo 1>&2 "\ +WARNING: \`$1' is needed, but is $msg. + You might have modified some files without having the + proper tools for further handling them. + You can get \`$1' as part of \`Autoconf' from any GNU + archive site." + + file=`echo "$*" | sed -n "$sed_output"` + test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` + if test -f "$file"; then + touch $file + else + test -z "$file" || exec >$file + echo "#! 
/bin/sh" + echo "# Created by GNU Automake missing as a replacement of" + echo "# $ $@" + echo "exit 0" + chmod +x $file + exit 1 + fi + ;; + + bison|yacc) + echo 1>&2 "\ +WARNING: \`$1' $msg. You should only need it if + you modified a \`.y' file. You may need the \`Bison' package + in order for those modifications to take effect. You can get + \`Bison' from any GNU archive site." + rm -f y.tab.c y.tab.h + if test $# -ne 1; then + eval LASTARG="\${$#}" + case $LASTARG in + *.y) + SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'` + if test -f "$SRCFILE"; then + cp "$SRCFILE" y.tab.c + fi + SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'` + if test -f "$SRCFILE"; then + cp "$SRCFILE" y.tab.h + fi + ;; + esac + fi + if test ! -f y.tab.h; then + echo >y.tab.h + fi + if test ! -f y.tab.c; then + echo 'main() { return 0; }' >y.tab.c + fi + ;; + + lex|flex) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified a \`.l' file. You may need the \`Flex' package + in order for those modifications to take effect. You can get + \`Flex' from any GNU archive site." + rm -f lex.yy.c + if test $# -ne 1; then + eval LASTARG="\${$#}" + case $LASTARG in + *.l) + SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'` + if test -f "$SRCFILE"; then + cp "$SRCFILE" lex.yy.c + fi + ;; + esac + fi + if test ! -f lex.yy.c; then + echo 'main() { return 0; }' >lex.yy.c + fi + ;; + + help2man) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified a dependency of a manual page. You may need the + \`Help2man' package in order for those modifications to take + effect. You can get \`Help2man' from any GNU archive site." + + file=`echo "$*" | sed -n "$sed_output"` + test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` + if test -f "$file"; then + touch $file + else + test -z "$file" || exec >$file + echo ".ab help2man is required to generate this page" + exit 1 + fi + ;; + + makeinfo) + echo 1>&2 "\ +WARNING: \`$1' is $msg. 
You should only need it if + you modified a \`.texi' or \`.texinfo' file, or any other file + indirectly affecting the aspect of the manual. The spurious + call might also be the consequence of using a buggy \`make' (AIX, + DU, IRIX). You might want to install the \`Texinfo' package or + the \`GNU make' package. Grab either from any GNU archive site." + # The file to touch is that specified with -o ... + file=`echo "$*" | sed -n "$sed_output"` + test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` + if test -z "$file"; then + # ... or it is the one specified with @setfilename ... + infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'` + file=`sed -n ' + /^@setfilename/{ + s/.* \([^ ]*\) *$/\1/ + p + q + }' $infile` + # ... or it is derived from the source name (dir/f.texi becomes f.info) + test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info + fi + # If the file does not exist, the user really needs makeinfo; + # let's fail without touching anything. + test -f $file || exit 1 + touch $file + ;; + + tar) + shift + + # We have already tried tar in the generic part. + # Look for gnutar/gtar before invocation to avoid ugly error + # messages. + if (gnutar --version > /dev/null 2>&1); then + gnutar "$@" && exit 0 + fi + if (gtar --version > /dev/null 2>&1); then + gtar "$@" && exit 0 + fi + firstarg="$1" + if shift; then + case $firstarg in + *o*) + firstarg=`echo "$firstarg" | sed s/o//` + tar "$firstarg" "$@" && exit 0 + ;; + esac + case $firstarg in + *h*) + firstarg=`echo "$firstarg" | sed s/h//` + tar "$firstarg" "$@" && exit 0 + ;; + esac + fi + + echo 1>&2 "\ +WARNING: I can't seem to be able to run \`tar' with the given arguments. + You may want to install GNU tar or Free paxutils, or check the + command line arguments." + exit 1 + ;; + + *) + echo 1>&2 "\ +WARNING: \`$1' is needed, and is $msg. + You might have modified some files without having the + proper tools for further handling them. 
Check the \`README' file, + it often tells you about the needed prerequisites for installing + this package. You may also peek at any GNU archive site, in case + some other package would contain this missing \`$1' program." + exit 1 + ;; +esac + +exit 0 + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-end: "$" +# End: diff --git a/mosesdecoder/contrib/lmserver/srilm.h b/mosesdecoder/contrib/lmserver/srilm.h new file mode 100644 index 0000000000000000000000000000000000000000..d9b00ef928d831e51342b9b57dac6ccb392676dd --- /dev/null +++ b/mosesdecoder/contrib/lmserver/srilm.h @@ -0,0 +1,8 @@ +#ifndef lmserver_srilm_h +#define lmserver_srilm_h + +void srilm_init(const char* fname, int order); +int srilm_getvoc(const char* word); +float srilm_wordprob(int, int*); + +#endif diff --git a/mosesdecoder/contrib/tmcombine/README.md b/mosesdecoder/contrib/tmcombine/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7b8ebd45e79a378563c4f4131584b83afa0649e1 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/README.md @@ -0,0 +1,88 @@ +tmcombine - a tool for Moses translation model combination + +Author: Rico Sennrich + +ABOUT +----- + +This program handles the combination of Moses phrase tables, either through +linear interpolation of the phrase translation probabilities/lexical weights, +or through a recomputation based on the (weighted) combined counts. + +It also supports an automatic search for weights that minimize the cross-entropy +between the model and a tuning set of word/phrase alignments. + + +REQUIREMENTS +------------ + +The script requires Python >= 2.6. +SciPy is recommended. If it is missing, an ad-hoc hill-climbing optimizer will be used (which may be slower, but is actually recommended for PyPy and/or a high number of models). 
+On Debian-based systems, you can install SciPy from the repository: + sudo apt-get install python-scipy + + +USAGE +----- + +for usage information, run + ./tmcombine.py -h + +Two basic command line examples: + +linearly interpolate two translation models with fixed weights: + ./tmcombine.py combine_given_weights test/model1 test/model2 -w "0.1,0.9;0.1,1;0.2,0.8;0.5,0.5" -o test/phrase-table_test2 + +do a count-based combination of two translation models with weights that minimize perplexity on a set of reference phrase pairs. + ./tmcombine.py combine_given_tuning_set test/model1 test/model2 -o test/phrase-table_test5 -m counts -r test/extract + +Typically, you have to specify one action out of the following: + + - `combine_given_weights`: write a new phrase table with defined weights + + - `combine_given_tuning_set`: write a new phrase table, using the weights that minimize cross-entropy on a tuning set + + - `compare_cross_entropies`: print cross-entropies for each model/feature, using the intersection of phrase pairs. + + - `compute_cross_entropy`: return cross-entropy for a tuning set, a set of models and a set of weights. + + - `return_best_cross_entropy`: return the set of weights and cross-entropy that is optimal for a tuning set and a set of models. + +You can check the docstrings of `Combine_TMs()` for more information and find some example commands in the function `test()`. +Some configuration options (i.e. normalization of linear interpolation) are not accessible from the command line. +You can gain a bit more flexibility by writing/modifying python code that initializes `Combine_TMs()` with your desired arguments, or by just fiddling with the default values in the script. + +Regression tests (check if the output files (`test/phrase-table_testN`) differ from the files in the repository): + ./tmcombine.py test + +FURTHER NOTES +------------- + + - Different combination algorithms require different statistics. 
To be on the safe side, use the option `-write-lexical-counts` when training models. + + - The script assumes that phrase tables are sorted (to allow incremental, more memory-friendly processing). Sort the tables with `LC_ALL=C`. Phrase tables produced by Moses are sorted correctly. + + - Some configurations require additional statistics that are loaded in memory (lexical tables; complete list of target phrases). + If memory consumption is a problem, use the option --lowmem (slightly slower and writes temporary files to disk), or consider pruning your phrase table before combining (e.g. using Johnson et al. 2007). + + - The script can read/write gzipped files, but the Python implementation is slow. You're better off unzipping the files on the command line and working with the unzipped files. The script will automatically search for the unzipped file first, and for the gzipped file if the former doesn't exist. + + - The cross-entropy estimation assumes that phrase tables contain true probability distributions (i.e. a probability mass of 1 for each conditional probability distribution). If this is not true, the results may be skewed. + + - Unknown phrase pairs are not considered for the cross-entropy estimation. A comparison of models with different vocabularies may be misleading. + + - Don't directly compare cross-entropies obtained from a combination with different modes. Depending on how some corner cases are treated, linear interpolation does not distribute the full probability mass and thus shows higher (i.e. worse) cross-entropies. + + +REFERENCES +---------- + +The algorithms are described in + +Sennrich, Rico (2012). Perplexity Minimization for Translation Model Domain Adaptation in Statistical Machine Translation. In: Proceedings of EACL 2012. 
+ +The evaluated algorithms are: + + - linear interpolation (naive): default + - linear interpolation (modified): use options `--normalized` and `--recompute_lexweights` + - weighted counts: use option `-m counts` diff --git a/mosesdecoder/contrib/tmcombine/argparse.py b/mosesdecoder/contrib/tmcombine/argparse.py new file mode 100644 index 0000000000000000000000000000000000000000..87d0cef35c61b363ffff6efab158b19124c7d812 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/argparse.py @@ -0,0 +1,2382 @@ +# Author: Steven J. Bethard . + +"""Command-line parsing library + +This module is an optparse-inspired command-line parsing library that: + + - handles both optional and positional arguments + - produces highly informative usage messages + - supports parsers that dispatch to sub-parsers + +The following is a simple usage example that sums integers from the +command-line and writes the result to a file:: + + parser = argparse.ArgumentParser( + description='sum the integers at the command line') + parser.add_argument( + 'integers', metavar='int', nargs='+', type=int, + help='an integer to be summed') + parser.add_argument( + '--log', default=sys.stdout, type=argparse.FileType('w'), + help='the file where the sum should be written') + args = parser.parse_args() + args.log.write('%s' % sum(args.integers)) + args.log.close() + +The module contains the following public classes: + + - ArgumentParser -- The main entry point for command-line parsing. As the + example above shows, the add_argument() method is used to populate + the parser with actions for optional and positional arguments. Then + the parse_args() method is invoked to convert the args at the + command-line into an object with attributes. + + - ArgumentError -- The exception raised by ArgumentParser objects when + there are errors with the parser's actions. Errors raised while + parsing the command-line are caught by ArgumentParser and emitted + as command-line messages. 
+ + - FileType -- A factory for defining types of files to be created. As the + example above shows, instances of FileType are typically passed as + the type= argument of add_argument() calls. + + - Action -- The base class for parser actions. Typically actions are + selected by passing strings like 'store_true' or 'append_const' to + the action= argument of add_argument(). However, for greater + customization of ArgumentParser actions, subclasses of Action may + be defined and passed as the action= argument. + + - HelpFormatter, RawDescriptionHelpFormatter, RawTextHelpFormatter, + ArgumentDefaultsHelpFormatter -- Formatter classes which + may be passed as the formatter_class= argument to the + ArgumentParser constructor. HelpFormatter is the default, + RawDescriptionHelpFormatter and RawTextHelpFormatter tell the parser + not to change the formatting for help text, and + ArgumentDefaultsHelpFormatter adds information about argument defaults + to the help. + +All other classes in this module are considered implementation details. +(Also note that HelpFormatter and RawDescriptionHelpFormatter are only +considered public as object names -- the API of the formatter objects is +still considered an implementation detail.) +""" + +__version__ = '1.1' +__all__ = [ + 'ArgumentParser', + 'ArgumentError', + 'ArgumentTypeError', + 'FileType', + 'HelpFormatter', + 'ArgumentDefaultsHelpFormatter', + 'RawDescriptionHelpFormatter', + 'RawTextHelpFormatter', + 'MetavarTypeHelpFormatter', + 'Namespace', + 'Action', + 'ONE_OR_MORE', + 'OPTIONAL', + 'PARSER', + 'REMAINDER', + 'SUPPRESS', + 'ZERO_OR_MORE', +] + + +import collections as _collections +import copy as _copy +import os as _os +import re as _re +import sys as _sys +import textwrap as _textwrap + +from gettext import gettext as _, ngettext + + +SUPPRESS = '==SUPPRESS==' + +OPTIONAL = '?' +ZERO_OR_MORE = '*' +ONE_OR_MORE = '+' +PARSER = 'A...' +REMAINDER = '...' 
+_UNRECOGNIZED_ARGS_ATTR = '_unrecognized_args' + +# ============================= +# Utility functions and classes +# ============================= + +class _AttributeHolder(object): + """Abstract base class that provides __repr__. + + The __repr__ method returns a string in the format:: + ClassName(attr=name, attr=name, ...) + The attributes are determined either by a class-level attribute, + '_kwarg_names', or by inspecting the instance __dict__. + """ + + def __repr__(self): + type_name = type(self).__name__ + arg_strings = [] + for arg in self._get_args(): + arg_strings.append(repr(arg)) + for name, value in self._get_kwargs(): + arg_strings.append('%s=%r' % (name, value)) + return '%s(%s)' % (type_name, ', '.join(arg_strings)) + + def _get_kwargs(self): + return sorted(self.__dict__.items()) + + def _get_args(self): + return [] + + +def _ensure_value(namespace, name, value): + if getattr(namespace, name, None) is None: + setattr(namespace, name, value) + return getattr(namespace, name) + + +# =============== +# Formatting Help +# =============== + +class HelpFormatter(object): + """Formatter for generating usage messages and argument help strings. + + Only the name of this class is considered a public API. All the methods + provided by the class are considered an implementation detail. 
+ """ + + def __init__(self, + prog, + indent_increment=2, + max_help_position=24, + width=None): + + # default setting for width + if width is None: + try: + width = int(_os.environ['COLUMNS']) + except (KeyError, ValueError): + width = 80 + width -= 2 + + self._prog = prog + self._indent_increment = indent_increment + self._max_help_position = max_help_position + self._width = width + + self._current_indent = 0 + self._level = 0 + self._action_max_length = 0 + + self._root_section = self._Section(self, None) + self._current_section = self._root_section + + self._whitespace_matcher = _re.compile(r'\s+') + self._long_break_matcher = _re.compile(r'\n\n\n+') + + # =============================== + # Section and indentation methods + # =============================== + def _indent(self): + self._current_indent += self._indent_increment + self._level += 1 + + def _dedent(self): + self._current_indent -= self._indent_increment + assert self._current_indent >= 0, 'Indent decreased below 0.' + self._level -= 1 + + class _Section(object): + + def __init__(self, formatter, parent, heading=None): + self.formatter = formatter + self.parent = parent + self.heading = heading + self.items = [] + + def format_help(self): + # format the indented section + if self.parent is not None: + self.formatter._indent() + join = self.formatter._join_parts + for func, args in self.items: + func(*args) + item_help = join([func(*args) for func, args in self.items]) + if self.parent is not None: + self.formatter._dedent() + + # return nothing if the section was empty + if not item_help: + return '' + + # add the heading if the section was non-empty + if self.heading is not SUPPRESS and self.heading is not None: + current_indent = self.formatter._current_indent + heading = '%*s%s:\n' % (current_indent, '', self.heading) + else: + heading = '' + + # join the section-initial newline, the heading and the help + return join(['\n', heading, item_help, '\n']) + + def _add_item(self, func, args): + 
self._current_section.items.append((func, args)) + + # ======================== + # Message building methods + # ======================== + def start_section(self, heading): + self._indent() + section = self._Section(self, self._current_section, heading) + self._add_item(section.format_help, []) + self._current_section = section + + def end_section(self): + self._current_section = self._current_section.parent + self._dedent() + + def add_text(self, text): + if text is not SUPPRESS and text is not None: + self._add_item(self._format_text, [text]) + + def add_usage(self, usage, actions, groups, prefix=None): + if usage is not SUPPRESS: + args = usage, actions, groups, prefix + self._add_item(self._format_usage, args) + + def add_argument(self, action): + if action.help is not SUPPRESS: + + # find all invocations + get_invocation = self._format_action_invocation + invocations = [get_invocation(action)] + for subaction in self._iter_indented_subactions(action): + invocations.append(get_invocation(subaction)) + + # update the maximum item length + invocation_length = max([len(s) for s in invocations]) + action_length = invocation_length + self._current_indent + self._action_max_length = max(self._action_max_length, + action_length) + + # add the item to the list + self._add_item(self._format_action, [action]) + + def add_arguments(self, actions): + for action in actions: + self.add_argument(action) + + # ======================= + # Help-formatting methods + # ======================= + def format_help(self): + help = self._root_section.format_help() + if help: + help = self._long_break_matcher.sub('\n\n', help) + help = help.strip('\n') + '\n' + return help + + def _join_parts(self, part_strings): + return ''.join([part + for part in part_strings + if part and part is not SUPPRESS]) + + def _format_usage(self, usage, actions, groups, prefix): + if prefix is None: + prefix = _('usage: ') + + # if usage is specified, use that + if usage is not None: + usage = usage % 
dict(prog=self._prog) + + # if no optionals or positionals are available, usage is just prog + elif usage is None and not actions: + usage = '%(prog)s' % dict(prog=self._prog) + + # if optionals and positionals are available, calculate usage + elif usage is None: + prog = '%(prog)s' % dict(prog=self._prog) + + # split optionals from positionals + optionals = [] + positionals = [] + for action in actions: + if action.option_strings: + optionals.append(action) + else: + positionals.append(action) + + # build full usage string + format = self._format_actions_usage + action_usage = format(optionals + positionals, groups) + usage = ' '.join([s for s in [prog, action_usage] if s]) + + # wrap the usage parts if it's too long + text_width = self._width - self._current_indent + if len(prefix) + len(usage) > text_width: + + # break usage into wrappable parts + part_regexp = r'\(.*?\)+|\[.*?\]+|\S+' + opt_usage = format(optionals, groups) + pos_usage = format(positionals, groups) + opt_parts = _re.findall(part_regexp, opt_usage) + pos_parts = _re.findall(part_regexp, pos_usage) + assert ' '.join(opt_parts) == opt_usage + assert ' '.join(pos_parts) == pos_usage + + # helper for wrapping lines + def get_lines(parts, indent, prefix=None): + lines = [] + line = [] + if prefix is not None: + line_len = len(prefix) - 1 + else: + line_len = len(indent) - 1 + for part in parts: + if line_len + 1 + len(part) > text_width: + lines.append(indent + ' '.join(line)) + line = [] + line_len = len(indent) - 1 + line.append(part) + line_len += len(part) + 1 + if line: + lines.append(indent + ' '.join(line)) + if prefix is not None: + lines[0] = lines[0][len(indent):] + return lines + + # if prog is short, follow it with optionals or positionals + if len(prefix) + len(prog) <= 0.75 * text_width: + indent = ' ' * (len(prefix) + len(prog) + 1) + if opt_parts: + lines = get_lines([prog] + opt_parts, indent, prefix) + lines.extend(get_lines(pos_parts, indent)) + elif pos_parts: + lines = 
get_lines([prog] + pos_parts, indent, prefix) + else: + lines = [prog] + + # if prog is long, put it on its own line + else: + indent = ' ' * len(prefix) + parts = opt_parts + pos_parts + lines = get_lines(parts, indent) + if len(lines) > 1: + lines = [] + lines.extend(get_lines(opt_parts, indent)) + lines.extend(get_lines(pos_parts, indent)) + lines = [prog] + lines + + # join lines into usage + usage = '\n'.join(lines) + + # prefix with 'usage:' + return '%s%s\n\n' % (prefix, usage) + + def _format_actions_usage(self, actions, groups): + # find group indices and identify actions in groups + group_actions = set() + inserts = {} + for group in groups: + try: + start = actions.index(group._group_actions[0]) + except ValueError: + continue + else: + end = start + len(group._group_actions) + if actions[start:end] == group._group_actions: + for action in group._group_actions: + group_actions.add(action) + if not group.required: + if start in inserts: + inserts[start] += ' [' + else: + inserts[start] = '[' + inserts[end] = ']' + else: + if start in inserts: + inserts[start] += ' (' + else: + inserts[start] = '(' + inserts[end] = ')' + for i in range(start + 1, end): + inserts[i] = '|' + + # collect all actions format strings + parts = [] + for i, action in enumerate(actions): + + # suppressed arguments are marked with None + # remove | separators for suppressed arguments + if action.help is SUPPRESS: + parts.append(None) + if inserts.get(i) == '|': + inserts.pop(i) + elif inserts.get(i + 1) == '|': + inserts.pop(i + 1) + + # produce all arg strings + elif not action.option_strings: + default = self._get_default_metavar_for_positional(action) + part = self._format_args(action, default) + + # if it's in a group, strip the outer [] + if action in group_actions: + if part[0] == '[' and part[-1] == ']': + part = part[1:-1] + + # add the action string to the list + parts.append(part) + + # produce the first way to invoke the option in brackets + else: + option_string = 
action.option_strings[0]

                # if the Optional doesn't take a value, format is:
                #    -s or --long
                if action.nargs == 0:
                    part = '%s' % option_string

                # if the Optional takes a value, format is:
                #    -s ARGS or --long ARGS
                else:
                    default = self._get_default_metavar_for_optional(action)
                    args_string = self._format_args(action, default)
                    part = '%s %s' % (option_string, args_string)

                # make it look optional if it's not required or in a group
                if not action.required and action not in group_actions:
                    part = '[%s]' % part

                # add the action string to the list
                parts.append(part)

        # insert things at the necessary indices
        # (reverse order so earlier insertion points are not shifted)
        for i in sorted(inserts, reverse=True):
            parts[i:i] = [inserts[i]]

        # join all the action items with spaces
        text = ' '.join([item for item in parts if item is not None])

        # clean up separators for mutually exclusive groups:
        # drop spaces just inside brackets/parens, remove empty pairs,
        # and unwrap single-choice (no '|') parenthesized groups
        open = r'[\[(]'
        close = r'[\])]'
        text = _re.sub(r'(%s) ' % open, r'\1', text)
        text = _re.sub(r' (%s)' % close, r'\1', text)
        text = _re.sub(r'%s *%s' % (open, close), r'', text)
        text = _re.sub(r'\(([^|]*)\)', r'\1', text)
        text = text.strip()

        # return the text
        return text

    def _format_text(self, text):
        # Interpolate %(prog)s only when present, then fill to the
        # current width at the current indent.
        if '%(prog)' in text:
            text = text % dict(prog=self._prog)
        text_width = self._width - self._current_indent
        indent = ' ' * self._current_indent
        return self._fill_text(text, text_width, indent) + '\n\n'

    def _format_action(self, action):
        # determine the required width and the entry label
        help_position = min(self._action_max_length + 2,
                            self._max_help_position)
        help_width = self._width - help_position
        action_width = help_position - self._current_indent - 2
        action_header = self._format_action_invocation(action)

        # no help; start on same line and add a final newline
        if not action.help:
            tup = self._current_indent, '', action_header
            action_header = '%*s%s\n' % tup

        # short action name; start on the same line and pad two spaces
        elif len(action_header) <=
action_width: + tup = self._current_indent, '', action_width, action_header + action_header = '%*s%-*s ' % tup + indent_first = 0 + + # long action name; start on the next line + else: + tup = self._current_indent, '', action_header + action_header = '%*s%s\n' % tup + indent_first = help_position + + # collect the pieces of the action help + parts = [action_header] + + # if there was help for the action, add lines of help text + if action.help: + help_text = self._expand_help(action) + help_lines = self._split_lines(help_text, help_width) + parts.append('%*s%s\n' % (indent_first, '', help_lines[0])) + for line in help_lines[1:]: + parts.append('%*s%s\n' % (help_position, '', line)) + + # or add a newline if the description doesn't end with one + elif not action_header.endswith('\n'): + parts.append('\n') + + # if there are any sub-actions, add their help as well + for subaction in self._iter_indented_subactions(action): + parts.append(self._format_action(subaction)) + + # return a single string + return self._join_parts(parts) + + def _format_action_invocation(self, action): + if not action.option_strings: + default = self._get_default_metavar_for_positional(action) + metavar, = self._metavar_formatter(action, default)(1) + return metavar + + else: + parts = [] + + # if the Optional doesn't take a value, format is: + # -s, --long + if action.nargs == 0: + parts.extend(action.option_strings) + + # if the Optional takes a value, format is: + # -s ARGS, --long ARGS + else: + default = self._get_default_metavar_for_optional(action) + args_string = self._format_args(action, default) + for option_string in action.option_strings: + parts.append('%s %s' % (option_string, args_string)) + + return ', '.join(parts) + + def _metavar_formatter(self, action, default_metavar): + if action.metavar is not None: + result = action.metavar + elif action.choices is not None: + choice_strs = [str(choice) for choice in action.choices] + result = '{%s}' % ','.join(choice_strs) + else: + 
result = default_metavar

        # Return a callable that yields `tuple_size` copies of the metavar
        # (or the metavar itself when it is already a tuple).
        def format(tuple_size):
            if isinstance(result, tuple):
                return result
            else:
                return (result, ) * tuple_size
        return format

    def _format_args(self, action, default_metavar):
        # Render the metavar(s) for an action according to its nargs value.
        get_metavar = self._metavar_formatter(action, default_metavar)
        if action.nargs is None:
            result = '%s' % get_metavar(1)
        elif action.nargs == OPTIONAL:
            result = '[%s]' % get_metavar(1)
        elif action.nargs == ZERO_OR_MORE:
            result = '[%s [%s ...]]' % get_metavar(2)
        elif action.nargs == ONE_OR_MORE:
            result = '%s [%s ...]' % get_metavar(2)
        elif action.nargs == REMAINDER:
            result = '...'
        elif action.nargs == PARSER:
            result = '%s ...' % get_metavar(1)
        else:
            # integer nargs: one metavar slot per argument
            formats = ['%s' for _ in range(action.nargs)]
            result = ' '.join(formats) % get_metavar(action.nargs)
        return result

    def _expand_help(self, action):
        # Build the %-interpolation dict for the help string from the
        # action's attributes plus prog, dropping SUPPRESS-ed entries and
        # substituting readable names for callables (e.g. type functions).
        params = dict(vars(action), prog=self._prog)
        for name in list(params):
            if params[name] is SUPPRESS:
                del params[name]
        for name in list(params):
            if hasattr(params[name], '__name__'):
                params[name] = params[name].__name__
        if params.get('choices') is not None:
            choices_str = ', '.join([str(c) for c in params['choices']])
            params['choices'] = choices_str
        return self._get_help_string(action) % params

    def _iter_indented_subactions(self, action):
        # Yield sub-actions (e.g. subparser choices) at one extra indent
        # level; actions without _get_subactions yield nothing.
        try:
            get_subactions = action._get_subactions
        except AttributeError:
            pass
        else:
            self._indent()
            for subaction in get_subactions():
                yield subaction
            self._dedent()

    def _split_lines(self, text, width):
        # Collapse internal whitespace, then wrap to the given width.
        text = self._whitespace_matcher.sub(' ', text).strip()
        return _textwrap.wrap(text, width)

    def _fill_text(self, text, width, indent):
        # Collapse internal whitespace, then fill with a uniform indent.
        text = self._whitespace_matcher.sub(' ', text).strip()
        return _textwrap.fill(text, width, initial_indent=indent,
                              subsequent_indent=indent)

    def _get_help_string(self, action):
        # Hook for subclasses to augment the raw help string.
        return action.help

    def _get_default_metavar_for_optional(self, action):
        # Optionals are labelled with their upper-cased dest by default.
        return action.dest.upper()

    def
_get_default_metavar_for_positional(self, action): + return action.dest + + +class RawDescriptionHelpFormatter(HelpFormatter): + """Help message formatter which retains any formatting in descriptions. + + Only the name of this class is considered a public API. All the methods + provided by the class are considered an implementation detail. + """ + + def _fill_text(self, text, width, indent): + return ''.join(indent + line for line in text.splitlines(keepends=True)) + + +class RawTextHelpFormatter(RawDescriptionHelpFormatter): + """Help message formatter which retains formatting of all help text. + + Only the name of this class is considered a public API. All the methods + provided by the class are considered an implementation detail. + """ + + def _split_lines(self, text, width): + return text.splitlines() + + +class ArgumentDefaultsHelpFormatter(HelpFormatter): + """Help message formatter which adds default values to argument help. + + Only the name of this class is considered a public API. All the methods + provided by the class are considered an implementation detail. + """ + + def _get_help_string(self, action): + help = action.help + if '%(default)' not in action.help: + if action.default is not SUPPRESS: + defaulting_nargs = [OPTIONAL, ZERO_OR_MORE] + if action.option_strings or action.nargs in defaulting_nargs: + help += ' (default: %(default)s)' + return help + + +class MetavarTypeHelpFormatter(HelpFormatter): + """Help message formatter which uses the argument 'type' as the default + metavar value (instead of the argument 'dest') + + Only the name of this class is considered a public API. All the methods + provided by the class are considered an implementation detail. 
+ """ + + def _get_default_metavar_for_optional(self, action): + return action.type.__name__ + + def _get_default_metavar_for_positional(self, action): + return action.type.__name__ + + + +# ===================== +# Options and Arguments +# ===================== + +def _get_action_name(argument): + if argument is None: + return None + elif argument.option_strings: + return '/'.join(argument.option_strings) + elif argument.metavar not in (None, SUPPRESS): + return argument.metavar + elif argument.dest not in (None, SUPPRESS): + return argument.dest + else: + return None + + +class ArgumentError(Exception): + """An error from creating or using an argument (optional or positional). + + The string value of this exception is the message, augmented with + information about the argument that caused it. + """ + + def __init__(self, argument, message): + self.argument_name = _get_action_name(argument) + self.message = message + + def __str__(self): + if self.argument_name is None: + format = '%(message)s' + else: + format = 'argument %(argument_name)s: %(message)s' + return format % dict(message=self.message, + argument_name=self.argument_name) + + +class ArgumentTypeError(Exception): + """An error from trying to convert a command line string to a type.""" + pass + + +# ============== +# Action classes +# ============== + +class Action(_AttributeHolder): + """Information about how to convert command line strings to Python objects. + + Action objects are used by an ArgumentParser to represent the information + needed to parse a single argument from one or more strings from the + command line. The keyword arguments to the Action constructor are also + all attributes of Action instances. + + Keyword Arguments: + + - option_strings -- A list of command-line option strings which + should be associated with this action. + + - dest -- The name of the attribute to hold the created object(s) + + - nargs -- The number of command-line arguments that should be + consumed. 
By default, one argument will be consumed and a single + value will be produced. Other values include: + - N (an integer) consumes N arguments (and produces a list) + - '?' consumes zero or one arguments + - '*' consumes zero or more arguments (and produces a list) + - '+' consumes one or more arguments (and produces a list) + Note that the difference between the default and nargs=1 is that + with the default, a single value will be produced, while with + nargs=1, a list containing a single value will be produced. + + - const -- The value to be produced if the option is specified and the + option uses an action that takes no values. + + - default -- The value to be produced if the option is not specified. + + - type -- The type which the command-line arguments should be converted + to, should be one of 'string', 'int', 'float', 'complex' or a + callable object that accepts a single string argument. If None, + 'string' is assumed. + + - choices -- A container of values that should be allowed. If not None, + after a command-line argument has been converted to the appropriate + type, an exception will be raised if it is not a member of this + collection. + + - required -- True if the action must always be specified at the + command line. This is only meaningful for optional command-line + arguments. + + - help -- The help string describing the argument. + + - metavar -- The name to be used for the option's argument with the + help string. If None, the 'dest' value will be used as the name. 
+ """ + + def __init__(self, + option_strings, + dest, + nargs=None, + const=None, + default=None, + type=None, + choices=None, + required=False, + help=None, + metavar=None): + self.option_strings = option_strings + self.dest = dest + self.nargs = nargs + self.const = const + self.default = default + self.type = type + self.choices = choices + self.required = required + self.help = help + self.metavar = metavar + + def _get_kwargs(self): + names = [ + 'option_strings', + 'dest', + 'nargs', + 'const', + 'default', + 'type', + 'choices', + 'help', + 'metavar', + ] + return [(name, getattr(self, name)) for name in names] + + def __call__(self, parser, namespace, values, option_string=None): + raise NotImplementedError(_('.__call__() not defined')) + + +class _StoreAction(Action): + + def __init__(self, + option_strings, + dest, + nargs=None, + const=None, + default=None, + type=None, + choices=None, + required=False, + help=None, + metavar=None): + if nargs == 0: + raise ValueError('nargs for store actions must be > 0; if you ' + 'have nothing to store, actions such as store ' + 'true or store const may be more appropriate') + if const is not None and nargs != OPTIONAL: + raise ValueError('nargs must be %r to supply const' % OPTIONAL) + super(_StoreAction, self).__init__( + option_strings=option_strings, + dest=dest, + nargs=nargs, + const=const, + default=default, + type=type, + choices=choices, + required=required, + help=help, + metavar=metavar) + + def __call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, values) + + +class _StoreConstAction(Action): + + def __init__(self, + option_strings, + dest, + const, + default=None, + required=False, + help=None, + metavar=None): + super(_StoreConstAction, self).__init__( + option_strings=option_strings, + dest=dest, + nargs=0, + const=const, + default=default, + required=required, + help=help) + + def __call__(self, parser, namespace, values, option_string=None): + 
setattr(namespace, self.dest, self.const)


class _StoreTrueAction(_StoreConstAction):
    # Flag action: stores True when the option appears (default False).

    def __init__(self,
                 option_strings,
                 dest,
                 default=False,
                 required=False,
                 help=None):
        super(_StoreTrueAction, self).__init__(
            option_strings=option_strings,
            dest=dest,
            const=True,
            default=default,
            required=required,
            help=help)


class _StoreFalseAction(_StoreConstAction):
    # Flag action: stores False when the option appears (default True).

    def __init__(self,
                 option_strings,
                 dest,
                 default=True,
                 required=False,
                 help=None):
        super(_StoreFalseAction, self).__init__(
            option_strings=option_strings,
            dest=dest,
            const=False,
            default=default,
            required=required,
            help=help)


class _AppendAction(Action):
    # Accumulates each occurrence's value into a list on the namespace.

    def __init__(self,
                 option_strings,
                 dest,
                 nargs=None,
                 const=None,
                 default=None,
                 type=None,
                 choices=None,
                 required=False,
                 help=None,
                 metavar=None):
        if nargs == 0:
            raise ValueError('nargs for append actions must be > 0; if arg '
                             'strings are not supplying the value to append, '
                             'the append const action may be more appropriate')
        if const is not None and nargs != OPTIONAL:
            raise ValueError('nargs must be %r to supply const' % OPTIONAL)
        super(_AppendAction, self).__init__(
            option_strings=option_strings,
            dest=dest,
            nargs=nargs,
            const=const,
            default=default,
            type=type,
            choices=choices,
            required=required,
            help=help,
            metavar=metavar)

    def __call__(self, parser, namespace, values, option_string=None):
        # Copy before appending so a shared default list is never mutated.
        items = _copy.copy(_ensure_value(namespace, self.dest, []))
        items.append(values)
        setattr(namespace, self.dest, items)


class _AppendConstAction(Action):
    # Appends a fixed const value each time the option appears.

    def __init__(self,
                 option_strings,
                 dest,
                 const,
                 default=None,
                 required=False,
                 help=None,
                 metavar=None):
        super(_AppendConstAction, self).__init__(
            option_strings=option_strings,
            dest=dest,
            nargs=0,
            const=const,
            default=default,
            required=required,
            help=help,
            metavar=metavar)

    def __call__(self, parser, namespace, values, option_string=None):
        # Copy before appending so a shared default list is never mutated.
        items =
_copy.copy(_ensure_value(namespace, self.dest, []))
        items.append(self.const)
        setattr(namespace, self.dest, items)


class _CountAction(Action):
    # Counts the number of times the option appears (e.g. -vvv -> 3).

    def __init__(self,
                 option_strings,
                 dest,
                 default=None,
                 required=False,
                 help=None):
        super(_CountAction, self).__init__(
            option_strings=option_strings,
            dest=dest,
            nargs=0,
            default=default,
            required=required,
            help=help)

    def __call__(self, parser, namespace, values, option_string=None):
        # Missing attribute starts at 0, then increments per occurrence.
        new_count = _ensure_value(namespace, self.dest, 0) + 1
        setattr(namespace, self.dest, new_count)


class _HelpAction(Action):
    # Prints the parser's help and exits the program.

    def __init__(self,
                 option_strings,
                 dest=SUPPRESS,
                 default=SUPPRESS,
                 help=None):
        super(_HelpAction, self).__init__(
            option_strings=option_strings,
            dest=dest,
            default=default,
            nargs=0,
            help=help)

    def __call__(self, parser, namespace, values, option_string=None):
        parser.print_help()
        parser.exit()


class _VersionAction(Action):
    # Prints version information and exits the program.

    def __init__(self,
                 option_strings,
                 version=None,
                 dest=SUPPRESS,
                 default=SUPPRESS,
                 help="show program's version number and exit"):
        super(_VersionAction, self).__init__(
            option_strings=option_strings,
            dest=dest,
            default=default,
            nargs=0,
            help=help)
        self.version = version

    def __call__(self, parser, namespace, values, option_string=None):
        # Prefer the action's own version string; fall back to the parser's.
        version = self.version
        if version is None:
            version = parser.version
        formatter = parser._get_formatter()
        formatter.add_text(version)
        parser.exit(message=formatter.format_help())


class _SubParsersAction(Action):
    # Dispatches remaining arguments to a named sub-parser (sub-command).

    class _ChoicesPseudoAction(Action):
        # Placeholder action used only to render a sub-command's help line.

        def __init__(self, name, aliases, help):
            metavar = dest = name
            if aliases:
                metavar += ' (%s)' % ', '.join(aliases)
            sup = super(_SubParsersAction._ChoicesPseudoAction, self)
            sup.__init__(option_strings=[], dest=dest, help=help,
                         metavar=metavar)

    def __init__(self,
                 option_strings,
                 prog,
                 parser_class,
                 dest=SUPPRESS,
                 help=None,
                 metavar=None):

        self._prog_prefix = prog
self._parser_class = parser_class + self._name_parser_map = _collections.OrderedDict() + self._choices_actions = [] + + super(_SubParsersAction, self).__init__( + option_strings=option_strings, + dest=dest, + nargs=PARSER, + choices=self._name_parser_map, + help=help, + metavar=metavar) + + def add_parser(self, name, **kwargs): + # set prog from the existing prefix + if kwargs.get('prog') is None: + kwargs['prog'] = '%s %s' % (self._prog_prefix, name) + + aliases = kwargs.pop('aliases', ()) + + # create a pseudo-action to hold the choice help + if 'help' in kwargs: + help = kwargs.pop('help') + choice_action = self._ChoicesPseudoAction(name, aliases, help) + self._choices_actions.append(choice_action) + + # create the parser and add it to the map + parser = self._parser_class(**kwargs) + self._name_parser_map[name] = parser + + # make parser available under aliases also + for alias in aliases: + self._name_parser_map[alias] = parser + + return parser + + def _get_subactions(self): + return self._choices_actions + + def __call__(self, parser, namespace, values, option_string=None): + parser_name = values[0] + arg_strings = values[1:] + + # set the parser name if requested + if self.dest is not SUPPRESS: + setattr(namespace, self.dest, parser_name) + + # select the parser + try: + parser = self._name_parser_map[parser_name] + except KeyError: + args = {'parser_name': parser_name, + 'choices': ', '.join(self._name_parser_map)} + msg = _('unknown parser %(parser_name)r (choices: %(choices)s)') % args + raise ArgumentError(self, msg) + + # parse all the remaining options into the namespace + # store any unrecognized options on the object, so that the top + # level parser can decide what to do with them + namespace, arg_strings = parser.parse_known_args(arg_strings, namespace) + if arg_strings: + vars(namespace).setdefault(_UNRECOGNIZED_ARGS_ATTR, []) + getattr(namespace, _UNRECOGNIZED_ARGS_ATTR).extend(arg_strings) + + +# ============== +# Type classes +# ============== 
+ +class FileType(object): + """Factory for creating file object types + + Instances of FileType are typically passed as type= arguments to the + ArgumentParser add_argument() method. + + Keyword Arguments: + - mode -- A string indicating how the file is to be opened. Accepts the + same values as the builtin open() function. + - bufsize -- The file's desired buffer size. Accepts the same values as + the builtin open() function. + """ + + def __init__(self, mode='r', bufsize=-1): + self._mode = mode + self._bufsize = bufsize + + def __call__(self, string): + # the special argument "-" means sys.std{in,out} + if string == '-': + if 'r' in self._mode: + return _sys.stdin + elif 'w' in self._mode: + return _sys.stdout + else: + msg = _('argument "-" with mode %r') % self._mode + raise ValueError(msg) + + # all other arguments are used as file names + try: + return open(string, self._mode, self._bufsize) + except IOError as e: + message = _("can't open '%s': %s") + raise ArgumentTypeError(message % (string, e)) + + def __repr__(self): + args = self._mode, self._bufsize + args_str = ', '.join(repr(arg) for arg in args if arg != -1) + return '%s(%s)' % (type(self).__name__, args_str) + +# =========================== +# Optional and Positional Parsing +# =========================== + +class Namespace(_AttributeHolder): + """Simple object for storing attributes. + + Implements equality by attribute names and values, and provides a simple + string representation. 
+ """ + + def __init__(self, **kwargs): + for name in kwargs: + setattr(self, name, kwargs[name]) + + def __eq__(self, other): + return vars(self) == vars(other) + + def __ne__(self, other): + return not (self == other) + + def __contains__(self, key): + return key in self.__dict__ + + +class _ActionsContainer(object): + + def __init__(self, + description, + prefix_chars, + argument_default, + conflict_handler): + super(_ActionsContainer, self).__init__() + + self.description = description + self.argument_default = argument_default + self.prefix_chars = prefix_chars + self.conflict_handler = conflict_handler + + # set up registries + self._registries = {} + + # register actions + self.register('action', None, _StoreAction) + self.register('action', 'store', _StoreAction) + self.register('action', 'store_const', _StoreConstAction) + self.register('action', 'store_true', _StoreTrueAction) + self.register('action', 'store_false', _StoreFalseAction) + self.register('action', 'append', _AppendAction) + self.register('action', 'append_const', _AppendConstAction) + self.register('action', 'count', _CountAction) + self.register('action', 'help', _HelpAction) + self.register('action', 'version', _VersionAction) + self.register('action', 'parsers', _SubParsersAction) + + # raise an exception if the conflict handler is invalid + self._get_handler() + + # action storage + self._actions = [] + self._option_string_actions = {} + + # groups + self._action_groups = [] + self._mutually_exclusive_groups = [] + + # defaults storage + self._defaults = {} + + # determines whether an "option" looks like a negative number + self._negative_number_matcher = _re.compile(r'^-\d+$|^-\d*\.\d+$') + + # whether or not there are any optionals that look like negative + # numbers -- uses a list so it can be shared and edited + self._has_negative_number_optionals = [] + + # ==================== + # Registration methods + # ==================== + def register(self, registry_name, value, object): + 
registry = self._registries.setdefault(registry_name, {}) + registry[value] = object + + def _registry_get(self, registry_name, value, default=None): + return self._registries[registry_name].get(value, default) + + # ================================== + # Namespace default accessor methods + # ================================== + def set_defaults(self, **kwargs): + self._defaults.update(kwargs) + + # if these defaults match any existing arguments, replace + # the previous default on the object with the new one + for action in self._actions: + if action.dest in kwargs: + action.default = kwargs[action.dest] + + def get_default(self, dest): + for action in self._actions: + if action.dest == dest and action.default is not None: + return action.default + return self._defaults.get(dest, None) + + + # ======================= + # Adding argument actions + # ======================= + def add_argument(self, *args, **kwargs): + """ + add_argument(dest, ..., name=value, ...) + add_argument(option_string, option_string, ..., name=value, ...) 
+ """ + + # if no positional args are supplied or only one is supplied and + # it doesn't look like an option string, parse a positional + # argument + chars = self.prefix_chars + if not args or len(args) == 1 and args[0][0] not in chars: + if args and 'dest' in kwargs: + raise ValueError('dest supplied twice for positional argument') + kwargs = self._get_positional_kwargs(*args, **kwargs) + + # otherwise, we're adding an optional argument + else: + kwargs = self._get_optional_kwargs(*args, **kwargs) + + # if no default was supplied, use the parser-level default + if 'default' not in kwargs: + dest = kwargs['dest'] + if dest in self._defaults: + kwargs['default'] = self._defaults[dest] + elif self.argument_default is not None: + kwargs['default'] = self.argument_default + + # create the action object, and add it to the parser + action_class = self._pop_action_class(kwargs) + if not callable(action_class): + raise ValueError('unknown action "%s"' % (action_class,)) + action = action_class(**kwargs) + + # raise an error if the action type is not callable + type_func = self._registry_get('type', action.type, action.type) + if not callable(type_func): + raise ValueError('%r is not callable' % (type_func,)) + + # raise an error if the metavar does not match the type + if hasattr(self, "_get_formatter"): + try: + self._get_formatter()._format_args(action, None) + except TypeError: + raise ValueError("length of metavar tuple does not match nargs") + + return self._add_action(action) + + def add_argument_group(self, *args, **kwargs): + group = _ArgumentGroup(self, *args, **kwargs) + self._action_groups.append(group) + return group + + def add_mutually_exclusive_group(self, **kwargs): + group = _MutuallyExclusiveGroup(self, **kwargs) + self._mutually_exclusive_groups.append(group) + return group + + def _add_action(self, action): + # resolve any conflicts + self._check_conflict(action) + + # add to actions list + self._actions.append(action) + action.container = self + + # 
index the action by any option strings it has + for option_string in action.option_strings: + self._option_string_actions[option_string] = action + + # set the flag if any option strings look like negative numbers + for option_string in action.option_strings: + if self._negative_number_matcher.match(option_string): + if not self._has_negative_number_optionals: + self._has_negative_number_optionals.append(True) + + # return the created action + return action + + def _remove_action(self, action): + self._actions.remove(action) + + def _add_container_actions(self, container): + # collect groups by titles + title_group_map = {} + for group in self._action_groups: + if group.title in title_group_map: + msg = _('cannot merge actions - two groups are named %r') + raise ValueError(msg % (group.title)) + title_group_map[group.title] = group + + # map each action to its group + group_map = {} + for group in container._action_groups: + + # if a group with the title exists, use that, otherwise + # create a new group matching the container's group + if group.title not in title_group_map: + title_group_map[group.title] = self.add_argument_group( + title=group.title, + description=group.description, + conflict_handler=group.conflict_handler) + + # map the actions to their new group + for action in group._group_actions: + group_map[action] = title_group_map[group.title] + + # add container's mutually exclusive groups + # NOTE: if add_mutually_exclusive_group ever gains title= and + # description= then this code will need to be expanded as above + for group in container._mutually_exclusive_groups: + mutex_group = self.add_mutually_exclusive_group( + required=group.required) + + # map the actions to their new mutex group + for action in group._group_actions: + group_map[action] = mutex_group + + # add all actions to this container or their group + for action in container._actions: + group_map.get(action, self)._add_action(action) + + def _get_positional_kwargs(self, dest, **kwargs): 
+ # make sure required is not specified + if 'required' in kwargs: + msg = _("'required' is an invalid argument for positionals") + raise TypeError(msg) + + # mark positional arguments as required if at least one is + # always required + if kwargs.get('nargs') not in [OPTIONAL, ZERO_OR_MORE]: + kwargs['required'] = True + if kwargs.get('nargs') == ZERO_OR_MORE and 'default' not in kwargs: + kwargs['required'] = True + + # return the keyword arguments with no option strings + return dict(kwargs, dest=dest, option_strings=[]) + + def _get_optional_kwargs(self, *args, **kwargs): + # determine short and long option strings + option_strings = [] + long_option_strings = [] + for option_string in args: + # error on strings that don't start with an appropriate prefix + if not option_string[0] in self.prefix_chars: + args = {'option': option_string, + 'prefix_chars': self.prefix_chars} + msg = _('invalid option string %(option)r: ' + 'must start with a character %(prefix_chars)r') + raise ValueError(msg % args) + + # strings starting with two prefix characters are long options + option_strings.append(option_string) + if option_string[0] in self.prefix_chars: + if len(option_string) > 1: + if option_string[1] in self.prefix_chars: + long_option_strings.append(option_string) + + # infer destination, '--foo-bar' -> 'foo_bar' and '-x' -> 'x' + dest = kwargs.pop('dest', None) + if dest is None: + if long_option_strings: + dest_option_string = long_option_strings[0] + else: + dest_option_string = option_strings[0] + dest = dest_option_string.lstrip(self.prefix_chars) + if not dest: + msg = _('dest= is required for options like %r') + raise ValueError(msg % option_string) + dest = dest.replace('-', '_') + + # return the updated keyword arguments + return dict(kwargs, dest=dest, option_strings=option_strings) + + def _pop_action_class(self, kwargs, default=None): + action = kwargs.pop('action', default) + return self._registry_get('action', action, action) + + def 
_get_handler(self): + # determine function from conflict handler string + handler_func_name = '_handle_conflict_%s' % self.conflict_handler + try: + return getattr(self, handler_func_name) + except AttributeError: + msg = _('invalid conflict_resolution value: %r') + raise ValueError(msg % self.conflict_handler) + + def _check_conflict(self, action): + + # find all options that conflict with this option + confl_optionals = [] + for option_string in action.option_strings: + if option_string in self._option_string_actions: + confl_optional = self._option_string_actions[option_string] + confl_optionals.append((option_string, confl_optional)) + + # resolve any conflicts + if confl_optionals: + conflict_handler = self._get_handler() + conflict_handler(action, confl_optionals) + + def _handle_conflict_error(self, action, conflicting_actions): + message = ngettext('conflicting option string: %s', + 'conflicting option strings: %s', + len(conflicting_actions)) + conflict_string = ', '.join([option_string + for option_string, action + in conflicting_actions]) + raise ArgumentError(action, message % conflict_string) + + def _handle_conflict_resolve(self, action, conflicting_actions): + + # remove all conflicting options + for option_string, action in conflicting_actions: + + # remove the conflicting option + action.option_strings.remove(option_string) + self._option_string_actions.pop(option_string, None) + + # if the option now has no option string, remove it from the + # container holding it + if not action.option_strings: + action.container._remove_action(action) + + +class _ArgumentGroup(_ActionsContainer): + + def __init__(self, container, title=None, description=None, **kwargs): + # add any missing keyword arguments by checking the container + update = kwargs.setdefault + update('conflict_handler', container.conflict_handler) + update('prefix_chars', container.prefix_chars) + update('argument_default', container.argument_default) + super_init = super(_ArgumentGroup, 
self).__init__ + super_init(description=description, **kwargs) + + # group attributes + self.title = title + self._group_actions = [] + + # share most attributes with the container + self._registries = container._registries + self._actions = container._actions + self._option_string_actions = container._option_string_actions + self._defaults = container._defaults + self._has_negative_number_optionals = \ + container._has_negative_number_optionals + self._mutually_exclusive_groups = container._mutually_exclusive_groups + + def _add_action(self, action): + action = super(_ArgumentGroup, self)._add_action(action) + self._group_actions.append(action) + return action + + def _remove_action(self, action): + super(_ArgumentGroup, self)._remove_action(action) + self._group_actions.remove(action) + + +class _MutuallyExclusiveGroup(_ArgumentGroup): + + def __init__(self, container, required=False): + super(_MutuallyExclusiveGroup, self).__init__(container) + self.required = required + self._container = container + + def _add_action(self, action): + if action.required: + msg = _('mutually exclusive arguments must be optional') + raise ValueError(msg) + action = self._container._add_action(action) + self._group_actions.append(action) + return action + + def _remove_action(self, action): + self._container._remove_action(action) + self._group_actions.remove(action) + + +class ArgumentParser(_AttributeHolder, _ActionsContainer): + """Object for parsing command line strings into Python objects. 
+ + Keyword Arguments: + - prog -- The name of the program (default: sys.argv[0]) + - usage -- A usage message (default: auto-generated from arguments) + - description -- A description of what the program does + - epilog -- Text following the argument descriptions + - parents -- Parsers whose arguments should be copied into this one + - formatter_class -- HelpFormatter class for printing help messages + - prefix_chars -- Characters that prefix optional arguments + - fromfile_prefix_chars -- Characters that prefix files containing + additional arguments + - argument_default -- The default value for all arguments + - conflict_handler -- String indicating how to handle conflicts + - add_help -- Add a -h/-help option + """ + + def __init__(self, + prog=None, + usage=None, + description=None, + epilog=None, + version=None, + parents=[], + formatter_class=HelpFormatter, + prefix_chars='-', + fromfile_prefix_chars=None, + argument_default=None, + conflict_handler='error', + add_help=True): + + if version is not None: + import warnings + warnings.warn( + """The "version" argument to ArgumentParser is deprecated. 
""" + """Please use """ + """"add_argument(..., action='version', version="N", ...)" """ + """instead""", DeprecationWarning) + + superinit = super(ArgumentParser, self).__init__ + superinit(description=description, + prefix_chars=prefix_chars, + argument_default=argument_default, + conflict_handler=conflict_handler) + + # default setting for prog + if prog is None: + prog = _os.path.basename(_sys.argv[0]) + + self.prog = prog + self.usage = usage + self.epilog = epilog + self.version = version + self.formatter_class = formatter_class + self.fromfile_prefix_chars = fromfile_prefix_chars + self.add_help = add_help + + add_group = self.add_argument_group + self._positionals = add_group(_('positional arguments')) + self._optionals = add_group(_('optional arguments')) + self._subparsers = None + + # register types + def identity(string): + return string + self.register('type', None, identity) + + # add help and version arguments if necessary + # (using explicit default to override global argument_default) + default_prefix = '-' if '-' in prefix_chars else prefix_chars[0] + if self.add_help: + self.add_argument( + default_prefix+'h', default_prefix*2+'help', + action='help', default=SUPPRESS, + help=_('show this help message and exit')) + if self.version: + self.add_argument( + default_prefix+'v', default_prefix*2+'version', + action='version', default=SUPPRESS, + version=self.version, + help=_("show program's version number and exit")) + + # add parent arguments and defaults + for parent in parents: + self._add_container_actions(parent) + try: + defaults = parent._defaults + except AttributeError: + pass + else: + self._defaults.update(defaults) + + # ======================= + # Pretty __repr__ methods + # ======================= + def _get_kwargs(self): + names = [ + 'prog', + 'usage', + 'description', + 'version', + 'formatter_class', + 'conflict_handler', + 'add_help', + ] + return [(name, getattr(self, name)) for name in names] + + # 
================================== + # Optional/Positional adding methods + # ================================== + def add_subparsers(self, **kwargs): + if self._subparsers is not None: + self.error(_('cannot have multiple subparser arguments')) + + # add the parser class to the arguments if it's not present + kwargs.setdefault('parser_class', type(self)) + + if 'title' in kwargs or 'description' in kwargs: + title = _(kwargs.pop('title', 'subcommands')) + description = _(kwargs.pop('description', None)) + self._subparsers = self.add_argument_group(title, description) + else: + self._subparsers = self._positionals + + # prog defaults to the usage message of this parser, skipping + # optional arguments and with no "usage:" prefix + if kwargs.get('prog') is None: + formatter = self._get_formatter() + positionals = self._get_positional_actions() + groups = self._mutually_exclusive_groups + formatter.add_usage(self.usage, positionals, groups, '') + kwargs['prog'] = formatter.format_help().strip() + + # create the parsers action and add it to the positionals list + parsers_class = self._pop_action_class(kwargs, 'parsers') + action = parsers_class(option_strings=[], **kwargs) + self._subparsers._add_action(action) + + # return the created parsers action + return action + + def _add_action(self, action): + if action.option_strings: + self._optionals._add_action(action) + else: + self._positionals._add_action(action) + return action + + def _get_optional_actions(self): + return [action + for action in self._actions + if action.option_strings] + + def _get_positional_actions(self): + return [action + for action in self._actions + if not action.option_strings] + + # ===================================== + # Command line argument parsing methods + # ===================================== + def parse_args(self, args=None, namespace=None): + args, argv = self.parse_known_args(args, namespace) + if argv: + msg = _('unrecognized arguments: %s') + self.error(msg % ' '.join(argv)) + 
return args + + def parse_known_args(self, args=None, namespace=None): + # args default to the system args + if args is None: + args = _sys.argv[1:] + + # default Namespace built from parser defaults + if namespace is None: + namespace = Namespace() + + # add any action defaults that aren't present + for action in self._actions: + if action.dest is not SUPPRESS: + if not hasattr(namespace, action.dest): + if action.default is not SUPPRESS: + default = action.default + if isinstance(action.default, str): + default = self._get_value(action, default) + setattr(namespace, action.dest, default) + + # add any parser defaults that aren't present + for dest in self._defaults: + if not hasattr(namespace, dest): + setattr(namespace, dest, self._defaults[dest]) + + # parse the arguments and exit if there are any errors + try: + namespace, args = self._parse_known_args(args, namespace) + if hasattr(namespace, _UNRECOGNIZED_ARGS_ATTR): + args.extend(getattr(namespace, _UNRECOGNIZED_ARGS_ATTR)) + delattr(namespace, _UNRECOGNIZED_ARGS_ATTR) + return namespace, args + except ArgumentError: + err = _sys.exc_info()[1] + self.error(str(err)) + + def _parse_known_args(self, arg_strings, namespace): + # replace arg strings that are file references + if self.fromfile_prefix_chars is not None: + arg_strings = self._read_args_from_files(arg_strings) + + # map all mutually exclusive arguments to the other arguments + # they can't occur with + action_conflicts = {} + for mutex_group in self._mutually_exclusive_groups: + group_actions = mutex_group._group_actions + for i, mutex_action in enumerate(mutex_group._group_actions): + conflicts = action_conflicts.setdefault(mutex_action, []) + conflicts.extend(group_actions[:i]) + conflicts.extend(group_actions[i + 1:]) + + # find all option indices, and determine the arg_string_pattern + # which has an 'O' if there is an option at an index, + # an 'A' if there is an argument, or a '-' if there is a '--' + option_string_indices = {} + 
arg_string_pattern_parts = [] + arg_strings_iter = iter(arg_strings) + for i, arg_string in enumerate(arg_strings_iter): + + # all args after -- are non-options + if arg_string == '--': + arg_string_pattern_parts.append('-') + for arg_string in arg_strings_iter: + arg_string_pattern_parts.append('A') + + # otherwise, add the arg to the arg strings + # and note the index if it was an option + else: + option_tuple = self._parse_optional(arg_string) + if option_tuple is None: + pattern = 'A' + else: + option_string_indices[i] = option_tuple + pattern = 'O' + arg_string_pattern_parts.append(pattern) + + # join the pieces together to form the pattern + arg_strings_pattern = ''.join(arg_string_pattern_parts) + + # converts arg strings to the appropriate and then takes the action + seen_actions = set() + seen_non_default_actions = set() + + def take_action(action, argument_strings, option_string=None): + seen_actions.add(action) + argument_values = self._get_values(action, argument_strings) + + # error if this argument is not allowed with other previously + # seen arguments, assuming that actions that use the default + # value don't really count as "present" + if argument_values is not action.default: + seen_non_default_actions.add(action) + for conflict_action in action_conflicts.get(action, []): + if conflict_action in seen_non_default_actions: + msg = _('not allowed with argument %s') + action_name = _get_action_name(conflict_action) + raise ArgumentError(action, msg % action_name) + + # take the action if we didn't receive a SUPPRESS value + # (e.g. 
from a default) + if argument_values is not SUPPRESS: + action(self, namespace, argument_values, option_string) + + # function to convert arg_strings into an optional action + def consume_optional(start_index): + + # get the optional identified at this index + option_tuple = option_string_indices[start_index] + action, option_string, explicit_arg = option_tuple + + # identify additional optionals in the same arg string + # (e.g. -xyz is the same as -x -y -z if no args are required) + match_argument = self._match_argument + action_tuples = [] + while True: + + # if we found no optional action, skip it + if action is None: + extras.append(arg_strings[start_index]) + return start_index + 1 + + # if there is an explicit argument, try to match the + # optional's string arguments to only this + if explicit_arg is not None: + arg_count = match_argument(action, 'A') + + # if the action is a single-dash option and takes no + # arguments, try to parse more single-dash options out + # of the tail of the option string + chars = self.prefix_chars + if arg_count == 0 and option_string[1] not in chars: + action_tuples.append((action, [], option_string)) + char = option_string[0] + option_string = char + explicit_arg[0] + new_explicit_arg = explicit_arg[1:] or None + optionals_map = self._option_string_actions + if option_string in optionals_map: + action = optionals_map[option_string] + explicit_arg = new_explicit_arg + else: + msg = _('ignored explicit argument %r') + raise ArgumentError(action, msg % explicit_arg) + + # if the action expect exactly one argument, we've + # successfully matched the option; exit the loop + elif arg_count == 1: + stop = start_index + 1 + args = [explicit_arg] + action_tuples.append((action, args, option_string)) + break + + # error if a double-dash option did not use the + # explicit argument + else: + msg = _('ignored explicit argument %r') + raise ArgumentError(action, msg % explicit_arg) + + # if there is no explicit argument, try to match the + 
# optional's string arguments with the following strings + # if successful, exit the loop + else: + start = start_index + 1 + selected_patterns = arg_strings_pattern[start:] + arg_count = match_argument(action, selected_patterns) + stop = start + arg_count + args = arg_strings[start:stop] + action_tuples.append((action, args, option_string)) + break + + # add the Optional to the list and return the index at which + # the Optional's string args stopped + assert action_tuples + for action, args, option_string in action_tuples: + take_action(action, args, option_string) + return stop + + # the list of Positionals left to be parsed; this is modified + # by consume_positionals() + positionals = self._get_positional_actions() + + # function to convert arg_strings into positional actions + def consume_positionals(start_index): + # match as many Positionals as possible + match_partial = self._match_arguments_partial + selected_pattern = arg_strings_pattern[start_index:] + arg_counts = match_partial(positionals, selected_pattern) + + # slice off the appropriate arg strings for each Positional + # and add the Positional and its args to the list + for action, arg_count in zip(positionals, arg_counts): + args = arg_strings[start_index: start_index + arg_count] + start_index += arg_count + take_action(action, args) + + # slice off the Positionals that we just parsed and return the + # index at which the Positionals' string args stopped + positionals[:] = positionals[len(arg_counts):] + return start_index + + # consume Positionals and Optionals alternately, until we have + # passed the last option string + extras = [] + start_index = 0 + if option_string_indices: + max_option_string_index = max(option_string_indices) + else: + max_option_string_index = -1 + while start_index <= max_option_string_index: + + # consume any Positionals preceding the next option + next_option_string_index = min([ + index + for index in option_string_indices + if index >= start_index]) + if 
start_index != next_option_string_index: + positionals_end_index = consume_positionals(start_index) + + # only try to parse the next optional if we didn't consume + # the option string during the positionals parsing + if positionals_end_index > start_index: + start_index = positionals_end_index + continue + else: + start_index = positionals_end_index + + # if we consumed all the positionals we could and we're not + # at the index of an option string, there were extra arguments + if start_index not in option_string_indices: + strings = arg_strings[start_index:next_option_string_index] + extras.extend(strings) + start_index = next_option_string_index + + # consume the next optional and any arguments for it + start_index = consume_optional(start_index) + + # consume any positionals following the last Optional + stop_index = consume_positionals(start_index) + + # if we didn't consume all the argument strings, there were extras + extras.extend(arg_strings[stop_index:]) + + # make sure all required actions were present + required_actions = [_get_action_name(action) for action in self._actions + if action.required and action not in seen_actions] + if required_actions: + self.error(_('the following arguments are required: %s') % + ', '.join(required_actions)) + + # make sure all required groups had one option present + for group in self._mutually_exclusive_groups: + if group.required: + for action in group._group_actions: + if action in seen_non_default_actions: + break + + # if no actions were used, report the error + else: + names = [_get_action_name(action) + for action in group._group_actions + if action.help is not SUPPRESS] + msg = _('one of the arguments %s is required') + self.error(msg % ' '.join(names)) + + # return the updated namespace and the extra arguments + return namespace, extras + + def _read_args_from_files(self, arg_strings): + # expand arguments referencing files + new_arg_strings = [] + for arg_string in arg_strings: + + # for regular arguments, just 
add them back into the list + if arg_string[0] not in self.fromfile_prefix_chars: + new_arg_strings.append(arg_string) + + # replace arguments referencing files with the file content + else: + try: + args_file = open(arg_string[1:]) + try: + arg_strings = [] + for arg_line in args_file.read().splitlines(): + for arg in self.convert_arg_line_to_args(arg_line): + arg_strings.append(arg) + arg_strings = self._read_args_from_files(arg_strings) + new_arg_strings.extend(arg_strings) + finally: + args_file.close() + except IOError: + err = _sys.exc_info()[1] + self.error(str(err)) + + # return the modified argument list + return new_arg_strings + + def convert_arg_line_to_args(self, arg_line): + return [arg_line] + + def _match_argument(self, action, arg_strings_pattern): + # match the pattern for this action to the arg strings + nargs_pattern = self._get_nargs_pattern(action) + match = _re.match(nargs_pattern, arg_strings_pattern) + + # raise an exception if we weren't able to find a match + if match is None: + nargs_errors = { + None: _('expected one argument'), + OPTIONAL: _('expected at most one argument'), + ONE_OR_MORE: _('expected at least one argument'), + } + default = ngettext('expected %s argument', + 'expected %s arguments', + action.nargs) % action.nargs + msg = nargs_errors.get(action.nargs, default) + raise ArgumentError(action, msg) + + # return the number of arguments matched + return len(match.group(1)) + + def _match_arguments_partial(self, actions, arg_strings_pattern): + # progressively shorten the actions list by slicing off the + # final actions until we find a match + result = [] + for i in range(len(actions), 0, -1): + actions_slice = actions[:i] + pattern = ''.join([self._get_nargs_pattern(action) + for action in actions_slice]) + match = _re.match(pattern, arg_strings_pattern) + if match is not None: + result.extend([len(string) for string in match.groups()]) + break + + # return the list of arg string counts + return result + + def 
_parse_optional(self, arg_string): + # if it's an empty string, it was meant to be a positional + if not arg_string: + return None + + # if it doesn't start with a prefix, it was meant to be positional + if not arg_string[0] in self.prefix_chars: + return None + + # if the option string is present in the parser, return the action + if arg_string in self._option_string_actions: + action = self._option_string_actions[arg_string] + return action, arg_string, None + + # if it's just a single character, it was meant to be positional + if len(arg_string) == 1: + return None + + # if the option string before the "=" is present, return the action + if '=' in arg_string: + option_string, explicit_arg = arg_string.split('=', 1) + if option_string in self._option_string_actions: + action = self._option_string_actions[option_string] + return action, option_string, explicit_arg + + # search through all possible prefixes of the option string + # and all actions in the parser for possible interpretations + option_tuples = self._get_option_tuples(arg_string) + + # if multiple actions match, the option string was ambiguous + if len(option_tuples) > 1: + options = ', '.join([option_string + for action, option_string, explicit_arg in option_tuples]) + args = {'option': arg_string, 'matches': options} + msg = _('ambiguous option: %(option)s could match %(matches)s') + self.error(msg % args) + + # if exactly one action matched, this segmentation is good, + # so return the parsed action + elif len(option_tuples) == 1: + option_tuple, = option_tuples + return option_tuple + + # if it was not found as an option, but it looks like a negative + # number, it was meant to be positional + # unless there are negative-number-like options + if self._negative_number_matcher.match(arg_string): + if not self._has_negative_number_optionals: + return None + + # if it contains a space, it was meant to be a positional + if ' ' in arg_string: + return None + + # it was meant to be an optional but there 
is no such option + # in this parser (though it might be a valid option in a subparser) + return None, arg_string, None + + def _get_option_tuples(self, option_string): + result = [] + + # option strings starting with two prefix characters are only + # split at the '=' + chars = self.prefix_chars + if option_string[0] in chars and option_string[1] in chars: + if '=' in option_string: + option_prefix, explicit_arg = option_string.split('=', 1) + else: + option_prefix = option_string + explicit_arg = None + for option_string in self._option_string_actions: + if option_string.startswith(option_prefix): + action = self._option_string_actions[option_string] + tup = action, option_string, explicit_arg + result.append(tup) + + # single character options can be concatenated with their arguments + # but multiple character options always have to have their argument + # separate + elif option_string[0] in chars and option_string[1] not in chars: + option_prefix = option_string + explicit_arg = None + short_option_prefix = option_string[:2] + short_explicit_arg = option_string[2:] + + for option_string in self._option_string_actions: + if option_string == short_option_prefix: + action = self._option_string_actions[option_string] + tup = action, option_string, short_explicit_arg + result.append(tup) + elif option_string.startswith(option_prefix): + action = self._option_string_actions[option_string] + tup = action, option_string, explicit_arg + result.append(tup) + + # shouldn't ever get here + else: + self.error(_('unexpected option string: %s') % option_string) + + # return the collected option tuples + return result + + def _get_nargs_pattern(self, action): + # in all examples below, we have to allow for '--' args + # which are represented as '-' in the pattern + nargs = action.nargs + + # the default (None) is assumed to be a single argument + if nargs is None: + nargs_pattern = '(-*A-*)' + + # allow zero or one arguments + elif nargs == OPTIONAL: + nargs_pattern = 
'(-*A?-*)' + + # allow zero or more arguments + elif nargs == ZERO_OR_MORE: + nargs_pattern = '(-*[A-]*)' + + # allow one or more arguments + elif nargs == ONE_OR_MORE: + nargs_pattern = '(-*A[A-]*)' + + # allow any number of options or arguments + elif nargs == REMAINDER: + nargs_pattern = '([-AO]*)' + + # allow one argument followed by any number of options or arguments + elif nargs == PARSER: + nargs_pattern = '(-*A[-AO]*)' + + # all others should be integers + else: + nargs_pattern = '(-*%s-*)' % '-*'.join('A' * nargs) + + # if this is an optional action, -- is not allowed + if action.option_strings: + nargs_pattern = nargs_pattern.replace('-*', '') + nargs_pattern = nargs_pattern.replace('-', '') + + # return the pattern + return nargs_pattern + + # ======================== + # Value conversion methods + # ======================== + def _get_values(self, action, arg_strings): + # for everything but PARSER args, strip out '--' + if action.nargs not in [PARSER, REMAINDER]: + arg_strings = [s for s in arg_strings if s != '--'] + + # optional argument produces a default when not present + if not arg_strings and action.nargs == OPTIONAL: + if action.option_strings: + value = action.const + else: + value = action.default + if isinstance(value, str): + value = self._get_value(action, value) + self._check_value(action, value) + + # when nargs='*' on a positional, if there were no command-line + # args, use the default if it is anything other than None + elif (not arg_strings and action.nargs == ZERO_OR_MORE and + not action.option_strings): + if action.default is not None: + value = action.default + else: + value = arg_strings + self._check_value(action, value) + + # single argument or optional argument produces a single value + elif len(arg_strings) == 1 and action.nargs in [None, OPTIONAL]: + arg_string, = arg_strings + value = self._get_value(action, arg_string) + self._check_value(action, value) + + # REMAINDER arguments convert all values, checking none + elif 
action.nargs == REMAINDER: + value = [self._get_value(action, v) for v in arg_strings] + + # PARSER arguments convert all values, but check only the first + elif action.nargs == PARSER: + value = [self._get_value(action, v) for v in arg_strings] + self._check_value(action, value[0]) + + # all other types of nargs produce a list + else: + value = [self._get_value(action, v) for v in arg_strings] + for v in value: + self._check_value(action, v) + + # return the converted value + return value + + def _get_value(self, action, arg_string): + type_func = self._registry_get('type', action.type, action.type) + if not callable(type_func): + msg = _('%r is not callable') + raise ArgumentError(action, msg % type_func) + + # convert the value to the appropriate type + try: + result = type_func(arg_string) + + # ArgumentTypeErrors indicate errors + except ArgumentTypeError: + name = getattr(action.type, '__name__', repr(action.type)) + msg = str(_sys.exc_info()[1]) + raise ArgumentError(action, msg) + + # TypeErrors or ValueErrors also indicate errors + except (TypeError, ValueError): + name = getattr(action.type, '__name__', repr(action.type)) + args = {'type': name, 'value': arg_string} + msg = _('invalid %(type)s value: %(value)r') + raise ArgumentError(action, msg % args) + + # return the converted value + return result + + def _check_value(self, action, value): + # converted value must be one of the choices (if specified) + if action.choices is not None and value not in action.choices: + args = {'value': value, + 'choices': ', '.join(map(repr, action.choices))} + msg = _('invalid choice: %(value)r (choose from %(choices)s)') + raise ArgumentError(action, msg % args) + + # ======================= + # Help-formatting methods + # ======================= + def format_usage(self): + formatter = self._get_formatter() + formatter.add_usage(self.usage, self._actions, + self._mutually_exclusive_groups) + return formatter.format_help() + + def format_help(self): + formatter = 
self._get_formatter() + + # usage + formatter.add_usage(self.usage, self._actions, + self._mutually_exclusive_groups) + + # description + formatter.add_text(self.description) + + # positionals, optionals and user-defined groups + for action_group in self._action_groups: + formatter.start_section(action_group.title) + formatter.add_text(action_group.description) + formatter.add_arguments(action_group._group_actions) + formatter.end_section() + + # epilog + formatter.add_text(self.epilog) + + # determine help from format above + return formatter.format_help() + + def format_version(self): + import warnings + warnings.warn( + 'The format_version method is deprecated -- the "version" ' + 'argument to ArgumentParser is no longer supported.', + DeprecationWarning) + formatter = self._get_formatter() + formatter.add_text(self.version) + return formatter.format_help() + + def _get_formatter(self): + return self.formatter_class(prog=self.prog) + + # ===================== + # Help-printing methods + # ===================== + def print_usage(self, file=None): + if file is None: + file = _sys.stdout + self._print_message(self.format_usage(), file) + + def print_help(self, file=None): + if file is None: + file = _sys.stdout + self._print_message(self.format_help(), file) + + def print_version(self, file=None): + import warnings + warnings.warn( + 'The print_version method is deprecated -- the "version" ' + 'argument to ArgumentParser is no longer supported.', + DeprecationWarning) + self._print_message(self.format_version(), file) + + def _print_message(self, message, file=None): + if message: + if file is None: + file = _sys.stderr + file.write(message) + + # =============== + # Exiting methods + # =============== + def exit(self, status=0, message=None): + if message: + self._print_message(message, _sys.stderr) + _sys.exit(status) + + def error(self, message): + """error(message: string) + + Prints a usage message incorporating the message to stderr and + exits. 
+ + If you override this in a subclass, it should not return -- it + should either exit or raise an exception. + """ + self.print_usage(_sys.stderr) + args = {'prog': self.prog, 'message': message} + self.exit(2, _('%(prog)s: error: %(message)s\n') % args) diff --git a/mosesdecoder/contrib/tmcombine/test/model3/model/lex.counts.e2f b/mosesdecoder/contrib/tmcombine/test/model3/model/lex.counts.e2f new file mode 100644 index 0000000000000000000000000000000000000000..ed05c0b7d3b14213f09367b21c31d4aede88e22e --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/model3/model/lex.counts.e2f @@ -0,0 +1,8 @@ +ad af 500 1000 +bd bf 5 10 +der le 20285 102586 +der NULL 12926 704917 +gipfel sommet 3485 7322 +pass col 419 2911 +pass passeport 7 28 +sitzung séance 14 59 \ No newline at end of file diff --git a/mosesdecoder/contrib/tmcombine/test/model3/model/lex.counts.f2e b/mosesdecoder/contrib/tmcombine/test/model3/model/lex.counts.f2e new file mode 100644 index 0000000000000000000000000000000000000000..ea31f690d10b817afc83023a14b4ee9e75647db2 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/model3/model/lex.counts.f2e @@ -0,0 +1,8 @@ +af ad 500 1000 +bf bd 5 10 +col pass 419 615 +le der 20285 113635 +passeport pass 7 615 +retrouvé NULL 34 1016136 +séance sitzung 14 33 +sommet gipfel 3485 5700 \ No newline at end of file diff --git a/mosesdecoder/contrib/tmcombine/test/model3/model/lex.e2f b/mosesdecoder/contrib/tmcombine/test/model3/model/lex.e2f new file mode 100644 index 0000000000000000000000000000000000000000..f9263ffe5d38675247595c9aa50078d10611d5f0 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/model3/model/lex.e2f @@ -0,0 +1,8 @@ +ad af 0.5 +bd bf 0.5 +der le 0.1977365 +der NULL 0.0183369 +gipfel sommet 0.4759629 +pass col 0.1439368 +pass passeport 0.2500000 +sitzung séance 0.2372881 \ No newline at end of file diff --git a/mosesdecoder/contrib/tmcombine/test/model3/model/lex.f2e b/mosesdecoder/contrib/tmcombine/test/model3/model/lex.f2e new file mode 
100644 index 0000000000000000000000000000000000000000..2bba51f0131e1729df7a2860b052b1889d7a6e3b --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/model3/model/lex.f2e @@ -0,0 +1,8 @@ +af ad 0.5 +bf bd 0.5 +col pass 0.6813008 +le der 0.1785101 +passeport pass 0.0113821 +retrouvé NULL 0.0000335 +séance sitzung 0.4242424 +sommet gipfel 0.6114035 \ No newline at end of file diff --git a/mosesdecoder/contrib/tmcombine/test/model3/model/phrase-table b/mosesdecoder/contrib/tmcombine/test/model3/model/phrase-table new file mode 100644 index 0000000000000000000000000000000000000000..737157e69061a901fc15d9e9f61dc6a63909fe40 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/model3/model/phrase-table @@ -0,0 +1,8 @@ +ad ||| af ||| 0.3 0.3 0.3 0.3 0.5 0.5 0.5 0.5 2.718 ||| 0-0 ||| 1000 1000 ||| sparse_feature 1 +bd ||| bf ||| 0.3 0.3 0.3 0.3 0.5 0.5 0.5 0.5 2.718 ||| 0-0 ||| 10 10 ||| +der gipfel ||| sommet ||| 0.3 0.3 0.3 0.3 0.00327135 0.00872768 0.0366795 0.611403 2.718 ||| 1-0 ||| 5808 518 +der pass ||| le col ||| 0.3 0.3 0.3 0.3 0.0173565 0.0284616 0.288889 0.121619 2.718 ||| 0-0 1-1 ||| 749 45 +pass ||| col ||| 0.3 0.3 0.3 0.3 0.1952 0.143937 0.628866 0.681301 2.718 ||| 0-0 ||| 1875 582 +pass ||| passeport retrouvé ||| 0.3 0.3 0.3 0.3 0.5 0.25 0.00171821 3.813e-07 2.718 ||| 0-0 ||| 2 582 +pass ||| passeport ||| 0.3 0.3 0.3 0.3 0.266667 0.25 0.00687285 0.0113821 2.718 ||| 0-0 ||| 15 582 +sitzung ||| séance ||| 0.3 0.3 0.3 0.3 0.272727 0.237288 0.352941 0.424242 2.718 ||| 0-0 ||| 22 17 \ No newline at end of file diff --git a/mosesdecoder/contrib/tmcombine/test/model5/model/lex.counts.e2f b/mosesdecoder/contrib/tmcombine/test/model5/model/lex.counts.e2f new file mode 100644 index 0000000000000000000000000000000000000000..ed05c0b7d3b14213f09367b21c31d4aede88e22e --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/model5/model/lex.counts.e2f @@ -0,0 +1,8 @@ +ad af 500 1000 +bd bf 5 10 +der le 20285 102586 +der NULL 12926 704917 +gipfel sommet 3485 7322 +pass 
col 419 2911 +pass passeport 7 28 +sitzung séance 14 59 \ No newline at end of file diff --git a/mosesdecoder/contrib/tmcombine/test/model5/model/lex.counts.f2e b/mosesdecoder/contrib/tmcombine/test/model5/model/lex.counts.f2e new file mode 100644 index 0000000000000000000000000000000000000000..ea31f690d10b817afc83023a14b4ee9e75647db2 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/model5/model/lex.counts.f2e @@ -0,0 +1,8 @@ +af ad 500 1000 +bf bd 5 10 +col pass 419 615 +le der 20285 113635 +passeport pass 7 615 +retrouvé NULL 34 1016136 +séance sitzung 14 33 +sommet gipfel 3485 5700 \ No newline at end of file diff --git a/mosesdecoder/contrib/tmcombine/test/model5/model/lex.e2f b/mosesdecoder/contrib/tmcombine/test/model5/model/lex.e2f new file mode 100644 index 0000000000000000000000000000000000000000..f9263ffe5d38675247595c9aa50078d10611d5f0 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/model5/model/lex.e2f @@ -0,0 +1,8 @@ +ad af 0.5 +bd bf 0.5 +der le 0.1977365 +der NULL 0.0183369 +gipfel sommet 0.4759629 +pass col 0.1439368 +pass passeport 0.2500000 +sitzung séance 0.2372881 \ No newline at end of file diff --git a/mosesdecoder/contrib/tmcombine/test/model5/model/lex.f2e b/mosesdecoder/contrib/tmcombine/test/model5/model/lex.f2e new file mode 100644 index 0000000000000000000000000000000000000000..2bba51f0131e1729df7a2860b052b1889d7a6e3b --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/model5/model/lex.f2e @@ -0,0 +1,8 @@ +af ad 0.5 +bf bd 0.5 +col pass 0.6813008 +le der 0.1785101 +passeport pass 0.0113821 +retrouvé NULL 0.0000335 +séance sitzung 0.4242424 +sommet gipfel 0.6114035 \ No newline at end of file diff --git a/mosesdecoder/contrib/tmcombine/test/model5/model/phrase-table b/mosesdecoder/contrib/tmcombine/test/model5/model/phrase-table new file mode 100644 index 0000000000000000000000000000000000000000..5621b5acf24ffde73d3097995a7ab664848afed7 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/model5/model/phrase-table @@ 
-0,0 +1,8 @@ +ad [X][X] [X] ||| af [X][X] [X] ||| 0.5 0.5 0.5 0.5 2.718 ||| 0-0 1-1 ||| 1000 1000 +bd [X] ||| bf [X] ||| 0.5 0.5 0.5 0.5 2.718 ||| 0-0 ||| 10 10 +der gipfel [X] ||| sommet [X] ||| 0.00327135 0.00872768 0.0366795 0.611403 2.718 ||| 1-0 ||| 5808 518 +der [X][X] pass [X] ||| le [X][X] col [X] ||| 0.0173565 0.0284616 0.288889 0.121619 2.718 ||| 0-0 1-1 2-2 ||| 749 45 +pass [X] ||| col [X] ||| 0.1952 0.143937 0.628866 0.681301 2.718 ||| 0-0 ||| 1875 582 +pass [X] ||| passeport retrouvé [X] ||| 0.5 0.25 0.00171821 3.813e-07 2.718 ||| 0-0 ||| 2 582 +pass [X] ||| passeport [X] ||| 0.266667 0.25 0.00687285 0.0113821 2.718 ||| 0-0 ||| 15 582 +[X][X] sitzung [X] ||| [X][X] séance [X] ||| 0.272727 0.237288 0.352941 0.424242 2.718 ||| 0-0 1-1 ||| 22 17 \ No newline at end of file diff --git a/mosesdecoder/contrib/tmcombine/test/model6/model/lex.counts.e2f b/mosesdecoder/contrib/tmcombine/test/model6/model/lex.counts.e2f new file mode 100644 index 0000000000000000000000000000000000000000..8475fcdf9215cfbf74585c32cd984d567b57b552 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/model6/model/lex.counts.e2f @@ -0,0 +1,8 @@ +ad af 100 1000 +bd bf 1 10 +der le 150181 944391 +der NULL 54483 3595140 +gipfel sommet 3421 9342 +pass col 2 70 +pass passeport 73 379 +sitzung séance 3441 5753 \ No newline at end of file diff --git a/mosesdecoder/contrib/tmcombine/test/model6/model/lex.counts.f2e b/mosesdecoder/contrib/tmcombine/test/model6/model/lex.counts.f2e new file mode 100644 index 0000000000000000000000000000000000000000..b0913088a0369dc9237f5dbf303cd672918ab4a4 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/model6/model/lex.counts.f2e @@ -0,0 +1,8 @@ +af ad 100 1000 +bf bd 1 10 +col pass 2 108 +le der 150181 1356104 +passeport pass 73 108 +retrouvé NULL 43 6276240 +séance sitzung 3441 6142 +sommet gipfel 3421 4908 \ No newline at end of file diff --git a/mosesdecoder/contrib/tmcombine/test/model6/model/lex.e2f 
b/mosesdecoder/contrib/tmcombine/test/model6/model/lex.e2f new file mode 100644 index 0000000000000000000000000000000000000000..b1ce3a613435b5d40250d17d7ede63c94a1684a3 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/model6/model/lex.e2f @@ -0,0 +1,8 @@ +ad af 0.1 +bd bf 0.1 +der le 0.1590242 +der NULL 0.0151546 +gipfel sommet 0.366195 +pass col 0.0285714 +pass passeport 0.1926121 +sitzung séance 0.5981227 \ No newline at end of file diff --git a/mosesdecoder/contrib/tmcombine/test/model6/model/lex.f2e b/mosesdecoder/contrib/tmcombine/test/model6/model/lex.f2e new file mode 100644 index 0000000000000000000000000000000000000000..d931dcb722bfa1c857c2efc90deea28b13ec7b63 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/model6/model/lex.f2e @@ -0,0 +1,8 @@ +af ad 0.1 +bf bd 0.1 +col pass 0.0185185 +le der 0.1107445 +passeport pass 0.6759259 +retrouvé NULL 0.0000069 +séance sitzung 0.5602410 +sommet gipfel 0.6970253 \ No newline at end of file diff --git a/mosesdecoder/contrib/tmcombine/test/model6/model/phrase-table b/mosesdecoder/contrib/tmcombine/test/model6/model/phrase-table new file mode 100644 index 0000000000000000000000000000000000000000..9c260f171ddc398430d131ce5d94bbfb8fcd2b76 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/model6/model/phrase-table @@ -0,0 +1,5 @@ +ad [X][X] [X] ||| af [X][X] [X] ||| 0.1 0.1 0.1 0.1 2.718 ||| 0-0 1-1 ||| 1000 1000 +bd [X] ||| bf [X] ||| 0.1 0.1 0.1 0.1 2.718 ||| 0-0 ||| 10 10 +der [X][X] pass [X] ||| le [X][X] passeport [X] ||| 0.16 0.03063 0.4 0.0748551 2.718 ||| 0-0 1-1 2-2 ||| 25 10 +pass [X] ||| passeport [X] ||| 0.28022 0.192612 0.607143 0.675926 2.718 ||| 0-0 ||| 182 84 +[X][X] sitzung [X] ||| [X][X] séance [X] ||| 0.784521 0.598123 0.516654 0.560241 2.718 ||| 0-0 1-1 ||| 4251 6455 \ No newline at end of file diff --git a/mosesdecoder/contrib/tmcombine/test/phrase-table_test1 b/mosesdecoder/contrib/tmcombine/test/phrase-table_test1 new file mode 100644 index 
0000000000000000000000000000000000000000..1309b711d5afc6b65679a3ca999f1653e798d087 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/phrase-table_test1 @@ -0,0 +1,8 @@ +ad ||| af ||| 0.3 0.3 0.3 0.3 ||| 0-0 ||| 1000 1000 +bd ||| bf ||| 0.3 0.3 0.3 0.3 ||| 0-0 ||| 10 10 +der gipfel ||| sommet ||| 0.00163568 0.00436384 0.0183397 0.305702 ||| 1-0 ||| 5808 518 +der pass ||| le col ||| 0.00867825 0.0142308 0.144445 0.0608095 ||| 0-0 1-1 ||| 749 45 +pass ||| col ||| 0.0976 0.0719685 0.314433 0.340651 ||| 0-0 ||| 1875 582 +pass ||| passeport retrouvé ||| 0.25 0.125 0.000859105 1.9065e-07 ||| 0-0 ||| 2 582 +pass ||| passeport ||| 0.273444 0.221306 0.307008 0.343654 ||| 0-0 ||| 15 582 +sitzung ||| séance ||| 0.528624 0.417705 0.434797 0.492241 ||| 0-0 ||| 22 17 diff --git a/mosesdecoder/contrib/tmcombine/test/phrase-table_test10 b/mosesdecoder/contrib/tmcombine/test/phrase-table_test10 new file mode 100644 index 0000000000000000000000000000000000000000..594bb428f45ae1bfc84c1a228c1d314a7b492548 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/phrase-table_test10 @@ -0,0 +1,9 @@ +ad ||| af ||| 0.3 0.3 0.3 0.3 0.115771 0.35574 0.472359 0.469238 ||| 0-0 ||| 25362.6029089 1074.23173673 ||| sparse_feature 1 +bd ||| bf ||| 0.3 0.3 0.3 0.3 0.115771 0.35574 0.472359 0.469238 ||| 0-0 ||| 253.626029089 10.7423173673 ||| +der gipfel ||| sommet ||| 0.3 0.3 0.3 0.3 0.00327135 0.00686984 0.0366795 0.617135 ||| 1-0 ||| 5808.0 518.0 +der pass ||| le col ||| 0.3 0.3 0.3 0.3 0.0173565 0.023534 0.284201 0.0972183 ||| 0-0 1-1 ||| 749.0 45.7423173673 +der pass ||| le passeport ||| 6e-10 6e-10 6e-10 6e-10 0.16 0.0329324 0.0064913 0.00303408 ||| 0-0 1-1 ||| 609.065072723 45.7423173673 +pass ||| col ||| 0.3 0.3 0.3 0.3 0.1952 0.142393 0.6222 0.671744 ||| 0-0 ||| 1875.0 588.235465885 +pass ||| passeport retrouvé ||| 0.3 0.3 0.3 0.3 0.5 0.199258 0.0017 5.11945e-07 ||| 0-0 ||| 2.0 588.235465885 +pass ||| passeport ||| 0.3 0.3 0.3 0.3 0.280174 0.199258 0.0132359 0.0209644 ||| 0-0 ||| 
4448.99372942 588.235465885 +sitzung ||| séance ||| 0.3 0.3 0.3 0.3 0.784412 0.59168 0.511045 0.552002 ||| 0-0 ||| 103587.424966 496.165860589 diff --git a/mosesdecoder/contrib/tmcombine/test/phrase-table_test2 b/mosesdecoder/contrib/tmcombine/test/phrase-table_test2 new file mode 100644 index 0000000000000000000000000000000000000000..4cd3b40b57315f0c5448410418036307586caf76 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/phrase-table_test2 @@ -0,0 +1,9 @@ +ad ||| af ||| 0.14 0.136364 0.18 0.3 ||| 0-0 ||| 1000 1000 +bd ||| bf ||| 0.14 0.136364 0.18 0.3 ||| 0-0 ||| 10 10 +der gipfel ||| sommet ||| 0.000327135 0.000793425 0.0073359 0.305702 ||| 1-0 ||| 5808 518 +der pass ||| le col ||| 0.00173565 0.00258742 0.0577778 0.0608095 ||| 0-0 1-1 ||| 749 45 +der pass ||| le passeport ||| 0.144 0.0278455 0.32 0.0374275 ||| 0-0 1-1 ||| 25 10 +pass ||| col ||| 0.01952 0.0130852 0.125773 0.340651 ||| 0-0 ||| 1875 582 +pass ||| passeport retrouvé ||| 0.05 0.0227273 0.000343642 1.9065e-07 ||| 0-0 ||| 2 582 +pass ||| passeport ||| 0.278865 0.197829 0.487089 0.343654 ||| 0-0 ||| 15 582 +sitzung ||| séance ||| 0.733342 0.56532 0.483911 0.492241 ||| 0-0 ||| 22 17 diff --git a/mosesdecoder/contrib/tmcombine/test/phrase-table_test4 b/mosesdecoder/contrib/tmcombine/test/phrase-table_test4 new file mode 100644 index 0000000000000000000000000000000000000000..18773ad6770a51afb82e53bd7b00afda93897ff0 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/phrase-table_test4 @@ -0,0 +1,8 @@ +ad ||| af ||| 0.5 0.5 0.5 0.5 ||| 0-0 ||| 1000.0 1000.0 +bd ||| bf ||| 0.5 0.5 0.5 0.5 ||| 0-0 ||| 10.0 10.0 +der gipfel ||| sommet ||| 0.00327135 0.00872769 0.0366795 0.611404 ||| 1-0 ||| 5808.0 518.0 +der pass ||| le col ||| 0.0173565 0.0284616 0.288889 0.121619 ||| 0-0 1-1 ||| 749.0 45.0 +pass ||| col ||| 0.1952 0.143937 0.628866 0.681301 ||| 0-0 ||| 1875.0 582.0 +pass ||| passeport retrouvé ||| 0.5 0.25 0.00171821 3.80847e-07 ||| 0-0 ||| 2.0 582.0 +pass ||| passeport ||| 0.266667 0.25 
0.00687285 0.0113821 ||| 0-0 ||| 15.0 582.0 +sitzung ||| séance ||| 0.272727 0.237288 0.352941 0.424242 ||| 0-0 ||| 22.0 17.0 diff --git a/mosesdecoder/contrib/tmcombine/test/phrase-table_test6 b/mosesdecoder/contrib/tmcombine/test/phrase-table_test6 new file mode 100644 index 0000000000000000000000000000000000000000..57374f148c8837bffdce1c751560d569340f4ba5 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/phrase-table_test6 @@ -0,0 +1,4 @@ +ad ||| af ||| 0.117462 0.117462 0.117462 0.117462 ||| 0-0 ||| 1000 1000 +bd ||| bf ||| 0.117462 0.117462 0.117462 0.117462 ||| 0-0 ||| 10 10 +pass ||| passeport ||| 0.278834 0.197701 0.387861 0.449295 ||| 0-0 ||| 15 582 +sitzung ||| séance ||| 0.705857 0.545304 0.497336 0.544877 ||| 0-0 ||| 22 17 diff --git a/mosesdecoder/contrib/tmcombine/test/phrase-table_test8 b/mosesdecoder/contrib/tmcombine/test/phrase-table_test8 new file mode 100644 index 0000000000000000000000000000000000000000..1974a53df32df11edca0ed8880a9437c54baa712 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/phrase-table_test8 @@ -0,0 +1,9 @@ +ad ||| af ||| 0.242882 0.39808 0.483231 0.482813 ||| 0-0 ||| 2799.50876845 1043.75589858 +bd ||| bf ||| 0.102211 0.111366 0.17441 0.172864 ||| 0-0 ||| 1809.50876845 53.7558985771 +der gipfel ||| sommet ||| 0.00327135 0.00863716 0.0366795 0.612073 ||| 1-0 ||| 5808.0 518.0 +der pass ||| le col ||| 0.0173565 0.0260468 0.146469 0.113553 ||| 0-0 1-1 ||| 749.0 88.7558985771 +der pass ||| le passeport ||| 0.16 0.03892 0.197197 0.0101013 ||| 0-0 1-1 ||| 1799.50876845 88.7558985771 +pass ||| col ||| 0.1952 0.13181 0.584893 0.636208 ||| 0-0 ||| 1875.0 625.755898577 +pass ||| passeport retrouvé ||| 0.5 0.196956 0.00159806 1.89361e-06 ||| 0-0 ||| 2.0 625.755898577 +pass ||| passeport ||| 0.280108 0.196956 0.0488467 0.056595 ||| 0-0 ||| 1814.50876845 625.755898577 +sitzung ||| séance ||| 0.77834 0.545022 0.470846 0.502627 ||| 0-0 ||| 1821.50876845 60.7558985771 diff --git 
a/mosesdecoder/contrib/tmcombine/test/phrase-table_test9 b/mosesdecoder/contrib/tmcombine/test/phrase-table_test9 new file mode 100644 index 0000000000000000000000000000000000000000..3e640d14bd535b491b94e2f4039a310a156c2420 --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/test/phrase-table_test9 @@ -0,0 +1,9 @@ +ad ||| af ||| 0.45 0.45 0.45 0.45 0.14 0.136364 0.18 0.3 ||| 0-0 ||| 10000.0 5000.0 ||| sparse_feature 1 +bd ||| bf ||| 0.45 0.45 0.45 0.45 0.14 0.136364 0.18 0.3 ||| 0-0 ||| 100.0 50.0 ||| +der gipfel ||| sommet ||| 0.15 0.15 0.15 0.15 0.00327135 0.00569336 0.0366795 0.651018 ||| 1-0 ||| 5808.0 518.0 +der pass ||| le col ||| 0.15 0.15 0.15 0.15 0.0173565 0.0193836 0.152941 0.0675369 ||| 0-0 1-1 ||| 749.0 85.0 +der pass ||| le passeport ||| 0.3 0.3 0.3 0.3 0.16 0.0307772 0.188235 0.0128336 ||| 0-0 1-1 ||| 225.0 85.0 +pass ||| col ||| 0.15 0.15 0.15 0.15 0.1952 0.121573 0.398693 0.582296 ||| 0-0 ||| 1875.0 918.0 +pass ||| passeport retrouvé ||| 0.15 0.15 0.15 0.15 0.5 0.193033 0.00108932 1.16835e-06 ||| 0-0 ||| 2.0 918.0 +pass ||| passeport ||| 0.45 0.45 0.45 0.45 0.280097 0.193033 0.22658 0.11065 ||| 0-0 ||| 1653.0 918.0 +sitzung ||| séance ||| 0.45 0.45 0.45 0.45 0.784227 0.597753 0.516546 0.559514 ||| 0-0 ||| 38281.0 25837.0 diff --git a/mosesdecoder/contrib/tmcombine/tmcombine.py b/mosesdecoder/contrib/tmcombine/tmcombine.py new file mode 100644 index 0000000000000000000000000000000000000000..467a24e1993db9533536893d6b0968c161e25acf --- /dev/null +++ b/mosesdecoder/contrib/tmcombine/tmcombine.py @@ -0,0 +1,1993 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Author: Rico Sennrich + +# This program handles the combination of Moses phrase tables, either through +# linear interpolation of the phrase translation probabilities/lexical weights, +# or through a recomputation based on the (weighted) combined counts. 
+# +# It also supports an automatic search for weights that minimize the cross-entropy +# between the model and a tuning set of word/phrase alignments. + +# for usage information, run +# python tmcombine.py -h +# you can also check the docstrings of Combine_TMs() for more information and find some example commands in the function test() + + +# Some general things to note: +# - Different combination algorithms require different statistics. To be on the safe side, use the option `-write-lexical-counts` when training models. +# - The script assumes that phrase tables are sorted (to allow incremental, more memory-friendly processing). sort with LC_ALL=C. +# - Some configurations require additional statistics that are loaded in memory (lexical tables; complete list of target phrases). If memory consumption is a problem, use the option --lowmem (slightly slower and writes temporary files to disk), or consider pruning your phrase table before combining (e.g. using Johnson et al. 2007). +# - The script can read/write gzipped files, but the Python implementation is slow. You're better off unzipping the files on the command line and working with the unzipped files. +# - The cross-entropy estimation assumes that phrase tables contain true probability distributions (i.e. a probability mass of 1 for each conditional probability distribution). If this is not true, the results are skewed. +# - Unknown phrase pairs are not considered for the cross-entropy estimation. A comparison of models with different vocabularies may be misleading. +# - Don't directly compare cross-entropies obtained from a combination with different modes. Depending on how some corner cases are treated, linear interpolation does not distribute full probability mass and thus shows higher (i.e. worse) cross-entropies. 
+ + +from __future__ import division, unicode_literals +import sys +import os +import gzip +import argparse +import copy +import re +from math import log, exp +from collections import defaultdict +from operator import mul +from tempfile import NamedTemporaryFile +from subprocess import Popen +try: + from itertools import izip +except: + izip = zip + +try: + from lxml import etree as ET +except: + import xml.etree.cElementTree as ET + +try: + from scipy.optimize.lbfgsb import fmin_l_bfgs_b + optimizer = 'l-bfgs' +except: + optimizer = 'hillclimb' + +class Moses(): + """Moses interface for loading/writing models + to support other phrase table formats, subclass this and overwrite the relevant functions + """ + + def __init__(self,models,number_of_features): + + self.number_of_features = number_of_features + self.models = models + + #example item (assuming mode=='counts' and one feature): phrase_pairs['the house']['das haus'] = [[[10,100]],['0-0 1-1']] + self.phrase_pairs = defaultdict(lambda: defaultdict(lambda: [[[0]*len(self.models) for i in range(self.number_of_features)],[]])) + self.phrase_source = defaultdict(lambda: [0]*len(self.models)) + self.phrase_target = defaultdict(lambda: [0]*len(self.models)) + + self.reordering_pairs = defaultdict(lambda: defaultdict(lambda: [[0]*len(self.models) for i in range(self.number_of_features)])) + + self.word_pairs_e2f = defaultdict(lambda: defaultdict(lambda: [0]*len(self.models))) + self.word_pairs_f2e = defaultdict(lambda: defaultdict(lambda: [0]*len(self.models))) + self.word_source = defaultdict(lambda: [0]*len(self.models)) + self.word_target = defaultdict(lambda: [0]*len(self.models)) + + self.require_alignment = False + + + def open_table(self,model,table,mode='r'): + """define which paths to open for lexical tables and phrase tables. 
+ we assume canonical Moses structure, but feel free to overwrite this + """ + + if table == 'reordering-table': + table = 'reordering-table.wbe-msd-bidirectional-fe' + + filename = os.path.join(model,'model',table) + fileobj = handle_file(filename,'open',mode) + return fileobj + + + def load_phrase_features(self,line,priority,i,mode='interpolate',store='pairs',filter_by=None,filter_by_src=None,filter_by_target=None,inverted=False,flags=None): + """take single phrase table line and store probablities in internal data structure""" + + src = line[0] + target = line[1] + + if inverted: + src,target = target,src + + if (store == 'all' or store == 'pairs') and (priority < 10 or (src in self.phrase_pairs and target in self.phrase_pairs[src])) and not (filter_by and not (src in filter_by and target in filter_by[src])): + + self.store_info(src,target,line) + + scores = line[2].split() + if len(scores) = 5: + if not self.phrase_pairs[src][target][1]: + self.phrase_pairs[src][target][1] = line[3:] + + # assuming that alignment is empty + elif len(line) == 4: + if self.require_alignment: + sys.stderr.write('Error: unexpected phrase table format. Your current configuration requires alignment information. Make sure you trained your model with -phrase-word-alignment (default in newer Moses versions)\n') + exit(1) + + self.phrase_pairs[src][target][1] = [b'',line[3].lstrip(b'| ')] + + else: + sys.stderr.write('Error: unexpected phrase table format. 
Are you using a very old/new version of Moses with different formatting?\n') + exit(1) + + + def get_word_alignments(self,src,target,cache=False,mycache={}): + """from the Moses phrase table alignment info in the form "0-0 1-0", + get the aligned word pairs / NULL alignments + """ + + if cache: + if (src,target) in mycache: + return mycache[(src,target)] + + try: + alignment = self.phrase_pairs[src][target][1][0] + except: + return None,None + + src_list = src.split(b' ') + target_list = target.split(b' ') + + textual_e2f = [[s,[]] for s in src_list] + textual_f2e = [[t,[]] for t in target_list] + + for pair in alignment.split(b' '): + s,t = pair.split(b'-') + s,t = int(s),int(t) + + textual_e2f[s][1].append(target_list[t]) + textual_f2e[t][1].append(src_list[s]) + + for s,t in textual_e2f: + if not t: + t.append(b'NULL') + + for s,t in textual_f2e: + if not t: + t.append(b'NULL') + + #tupelize so we can use the value as dictionary keys + for i in range(len(textual_e2f)): + textual_e2f[i][1] = tuple(textual_e2f[i][1]) + + for i in range(len(textual_f2e)): + textual_f2e[i][1] = tuple(textual_f2e[i][1]) + + if cache: + mycache[(src,target)] = textual_e2f,textual_f2e + + return textual_e2f,textual_f2e + + + def write_phrase_table(self,src,target,weights,features,mode,flags): + """convert data to string in Moses phrase table format""" + + # if one feature value is 0 (either because of loglinear interpolation or rounding to 0), don't write it to phrasetable + # (phrase pair will end up with probability zero in log-linear model anyway) + if 0 in features: + return b'' + + # information specific to Moses model: alignment info and comment section with target and source counts + additional_entries = self.phrase_pairs[src][target][1] + alignment = additional_entries[0] + if alignment: + extra_space = b' ' + else: + extra_space = b'' + + if mode == 'counts': + i_e2f = flags['i_e2f'] + i_f2e = flags['i_f2e'] + srccount = dot_product(self.phrase_source[src],weights[i_f2e]) + 
targetcount = dot_product(self.phrase_target[target],weights[i_e2f]) + additional_entries[1] = b"%s %s" %(targetcount,srccount) + + features = b' '.join([b'%.6g' %(f) for f in features]) + + if flags['add_origin_features']: + origin_features = list(map(lambda x: 2.718**bool(x),self.phrase_pairs[src][target][0][0])) # 1 if phrase pair doesn't occur in model, 2.718 if it does + origin_features = b' '.join([b'%.4f' %(f) for f in origin_features]) + ' ' + else: + origin_features = b'' + if flags['write_phrase_penalty']: + phrase_penalty = b' 2.718' + else: + phrase_penalty = b'' + line = b"%s ||| %s ||| %s%s %s||| %s%s||| %s\n" %(src,target,features,origin_features,phrase_penalty,alignment,extra_space,b' ||| '.join(additional_entries[1:])) + return line + + + + def write_lexical_file(self,direction, path, weights,mode): + + if mode == 'counts': + bridge = '.counts' + else: + bridge = '' + + fobj = handle_file("{0}{1}.{2}".format(path,bridge,direction),'open',mode='w') + sys.stderr.write('Writing {0}{1}.{2}\n'.format(path,bridge,direction)) + + if direction == 'e2f': + word_pairs = self.word_pairs_e2f + marginal = self.word_target + + elif direction == 'f2e': + word_pairs = self.word_pairs_f2e + marginal = self.word_source + + for x in sorted(word_pairs): + for y in sorted(word_pairs[x]): + xy = dot_product(word_pairs[x][y],weights) + fobj.write(b"%s %s %s" %(x,y,xy)) + + if mode == 'counts': + fobj.write(b" %s\n" %(dot_product(marginal[y],weights))) + else: + fobj.write(b'\n') + + handle_file("{0}{1}.{2}".format(path,bridge,direction),'close',fobj,mode='w') + + + + def write_reordering_table(self,src,target,features): + """convert data to string in Moses reordering table format""" + + # if one feature value is 0 (either because of loglinear interpolation or rounding to 0), don't write it to reordering table + # (phrase pair will end up with probability zero in log-linear model anyway) + if 0 in features: + return b'' + + features = b' '.join([b'%.6g' %(f) for f in 
features]) + + line = b"%s ||| %s ||| %s\n" %(src,target,features) + return line + + + def create_inverse(self,fobj,tempdir=None): + """swap source and target phrase in the phrase table, and then sort (by target phrase)""" + + inverse = NamedTemporaryFile(prefix='inv_unsorted',delete=False,dir=tempdir) + swap = re.compile(b'(.+?) \|\|\| (.+?) \|\|\|') + + # just swap source and target phrase, and leave order of scores etc. intact. + # For better compatibility with existing codebase, we swap the order of the phrases back for processing + for line in fobj: + inverse.write(swap.sub(b'\\2 ||| \\1 |||',line,1)) + inverse.close() + + inverse_sorted = sort_file(inverse.name,tempdir=tempdir) + os.remove(inverse.name) + + return inverse_sorted + + + def merge(self,pt_normal, pt_inverse, pt_out, mode='interpolate'): + """merge two phrasetables (the latter having been inverted to calculate p(s|t) and lex(s|t) in sorted order) + Assumes that p(s|t) and lex(s|t) are in first table half, p(t|s) and lex(t|s) in second""" + + for line,line2 in izip(pt_normal,pt_inverse): + + line = line.split(b' ||| ') + if line[-1].endswith(b' |||'): + line[-1] = line[-1][:-4] + line.append('') + + line2 = line2.split(b' ||| ') + if line2[-1].endswith(b' |||'): + line2[-1] = line2[-1][:-4] + line2.append('') + + #scores + mid = int(self.number_of_features/2) + scores1 = line[2].split() + scores2 = line2[2].split() + line[2] = b' '.join(scores2[:mid]+scores1[mid:]) + + # marginal counts + if mode == 'counts': + src_count = line[4].split()[1] + target_count = line2[-1].split()[0] + line[4] = b' '.join([target_count,src_count]) + + pt_out.write(b' ||| '.join(line)+ b'\n') + + pt_normal.close() + pt_inverse.close() + pt_out.close() + + + +class TigerXML(): + """interface to load reference word alignments from TigerXML corpus. 
    Tested on SMULTRON (http://kitt.cl.uzh.ch/kitt/smultron/)
    """

    def __init__(self,alignment_xml):
        """only argument is TigerXML file
        """

        # map: treebank id -> parsed XML tree ('aligned' is the alignment file itself)
        self.treebanks = self._open_treebanks(alignment_xml)
        # word_pairs[src_text][target_text] -> frequency of that aligned pair
        self.word_pairs = defaultdict(lambda: defaultdict(int))
        self.word_source = defaultdict(int)   # marginal frequency of source phrases
        self.word_target = defaultdict(int)   # marginal frequency of target phrases


    def load_word_pairs(self,src,target):
        """load word pairs. src and target are the identifiers of the source and target language in the XML"""

        if not src or not target:
            sys.stderr.write('Error: Source and/or target language not specified. Required for TigerXML extraction.\n')
            exit(1)

        # two passes: first resolve which node ids are aligned, then map ids to surface words
        alignments = self._get_aligned_ids(src,target)
        self._textualize_alignments(src,target,alignments)


    def _open_treebanks(self,alignment_xml):
        """Parallel XML format references monolingual files. Open all."""

        alignment_path = os.path.dirname(alignment_xml)
        align_xml = ET.parse(alignment_xml)

        treebanks = {}
        treebanks['aligned'] = align_xml

        # NOTE(review): ElementTree in newer Python versions rejects XPath starting
        # with '//' — this presumably targeted lxml or an older ET; verify at runtime.
        for treebank in align_xml.findall('//treebank'):
            treebank_id = treebank.get('id')
            filename = treebank.get('filename')

            # monolingual file paths are resolved relative to the alignment file
            if not os.path.isabs(filename):
                filename = os.path.join(alignment_path,filename)

            treebanks[treebank_id] = ET.parse(filename)

        return treebanks


    def _get_aligned_ids(self,src,target):
        """first step: find which nodes are aligned."""


        alignments = []            # list of ([src_node_ids],[target_node_ids]) pairs
        ids = defaultdict(dict)    # per language: node_id -> index into alignments

        for alignment in self.treebanks['aligned'].findall('//align'):

            newpair = {}

            # an <align> element is expected to hold exactly one node per language
            if len(alignment) != 2:
                sys.stderr.write('Error: alignment with ' + str(len(alignment)) + ' children. Expected 2. Skipping.\n')
                continue

            for node in alignment:
                lang = node.get('treebank_id')
                node_id = node.get('node_id')
                newpair[lang] = node_id

            if not (src in newpair and target in newpair):
                sys.stderr.write('Error: source and target languages don\'t match. Skipping.\n')
                continue

            # every token may only appear in one alignment pair;
            # if it occurs in multiple, we interpret them as one 1-to-many or many-to-1 alignment
            if newpair[src] in ids[src]:
                idx = ids[src][newpair[src]]
                alignments[idx][1].append(newpair[target])

            elif newpair[target] in ids[target]:
                idx = ids[target][newpair[target]]
                alignments[idx][0].append(newpair[src])

            else:
                idx = len(alignments)
                alignments.append(([newpair[src]],[newpair[target]]))
                ids[src][newpair[src]] = idx
                ids[target][newpair[target]] = idx

        alignments = self._discard_discontinuous(alignments)

        return alignments


    def _discard_discontinuous(self,alignments):
        """discard discontinuous word sequences (which we can't use for phrase-based SMT systems)
        and make sure that sequence is in correct order.
        """

        new_alignments = []

        for alignment in alignments:
            new_pair = []

            for sequence in alignment:

                # node ids look like '<sentence>_<token>'; split for the checks below
                sequence_split = [t_id.split('_') for t_id in sequence]

                #check if all words come from the same sentence
                sentences = [item[0] for item in sequence_split]
                if not len(set(sentences)) == 1:
                    #sys.stderr.write('Warning. Word sequence crossing sentence boundary. Discarding.\n')
                    #sys.stderr.write(str(sequence)+'\n')
                    continue


                #sort words and check for discontinuities.
                try:
                    tokens = sorted([int(item[1]) for item in sequence_split])
                except ValueError:
                    #sys.stderr.write('Warning. Not valid word IDs. Discarding.\n')
                    #sys.stderr.write(str(sequence)+'\n')
                    continue

                # contiguous iff max-min spans exactly len-1 positions
                if not tokens[-1]-tokens[0] == len(tokens)-1:
                    #sys.stderr.write('Warning. Discontinuous word sequence(?). Discarding.\n')
                    #sys.stderr.write(str(sequence)+'\n')
                    continue

                out_sequence = [sentences[0]+'_'+str(token) for token in tokens]
                new_pair.append(out_sequence)

            # keep the pair only if both sides survived the filters
            if len(new_pair) == 2:
                new_alignments.append(new_pair)

        return new_alignments


    def _textualize_alignments(self,src,target,alignments):
        """Knowing which nodes are aligned, get actual words that are aligned."""

        words = defaultdict(dict)   # per language: terminal id -> lowercased surface word

        for text in [text for text in self.treebanks if not text == 'aligned']:

            #TODO: Make lowercasing optional
            for terminal in self.treebanks[text].findall('//t'):
                words[text][terminal.get('id')] = terminal.get('word').lower()


        for (src_ids, target_ids) in alignments:

            try:
                src_text = ' '.join((words[src][src_id] for src_id in src_ids))
            except KeyError:
                #sys.stderr.write('Warning. ID not found: '+ str(src_ids) +'\n')
                continue

            try:
                target_text = ' '.join((words[target][target_id] for target_id in target_ids))
            except KeyError:
                #sys.stderr.write('Warning. ID not found: '+ str(target_ids) +'\n')
                continue

            self.word_pairs[src_text][target_text] += 1
            self.word_source[src_text] += 1
            self.word_target[target_text] += 1



class Moses_Alignment():
    """interface to load reference phrase alignment from corpus aligned with Giza++
    and with extraction heuristics as applied by the Moses toolkit.

    """

    def __init__(self,alignment_file):

        self.alignment_file = alignment_file
        # same structure as TigerXML: pair counts plus per-side marginals
        self.word_pairs = defaultdict(lambda: defaultdict(int))
        self.word_source = defaultdict(int)
        self.word_target = defaultdict(int)


    def load_word_pairs(self,src_lang,target_lang):
        """main function. 
        overwrite this to import data in different format."""

        fileobj = handle_file(self.alignment_file,'open','r')

        for line in fileobj:

            # file is read in binary mode; fields are ' ||| '-separated (Moses extract format)
            line = line.split(b' ||| ')
            if line[-1].endswith(b' |||'):
                line[-1] = line[-1][:-4]
                line.append(b'')

            src = line[0]
            target = line[1]

            self.word_pairs[src][target] += 1
            self.word_source[src] += 1
            self.word_target[target] += 1


def dot_product(a,b):
    """calculate dot product from two lists"""

    # optimized for PyPy (much faster than enumerate/map)
    # NOTE: assumes len(b) >= len(a); no bounds check on purpose (hot path)
    s = 0
    i = 0
    for x in a:
        s += x * b[i]
        i += 1

    return s


def priority_sort_models(models):
    """primary models should have priority before supplementary models.
       zipped with index to know which weight model belongs to

       Returns a list of (model, priority, original_index) triples, sorted by priority.
    """

    return [(model,priority,i) for (i,(model,priority)) in sorted(zip(range(len(models)),models),key=lambda x: x[1][1])]


def cross_entropy(model_interface,reference_interface,weights,score,mode,flags):
    """calculate cross entropy given all necessary information.
       don't call this directly, but use one of the Combine_TMs methods.

       Collects full statistics (OOV counts, ignored pairs, etc.); see
       cross_entropy_light for the fast path used inside optimization loops.
    """

    weights = normalize_weights(weights,mode,flags)

    # with 'compare_cross-entropies', one result column per model; otherwise a single column
    if 'compare_cross-entropies' in flags and flags['compare_cross-entropies']:
        num_results = len(model_interface.models)
    else:
        num_results = 1

    cross_entropies = [[0]*num_results for i in range(model_interface.number_of_features)]
    oov = [0]*num_results                  # phrase pairs unknown to model i
    oov2 = 0                               # source phrase unknown to all models
    other_translations = [0]*num_results   # source known, but this target unseen
    ignored = [0]*num_results              # excluded from the intersection
    n = [0]*num_results                    # pairs actually scored per model
    total_pairs = 0

    for src in reference_interface.word_pairs:
        for target in reference_interface.word_pairs[src]:

            c = reference_interface.word_pairs[src][target]

            for i in range(num_results):
                if src in model_interface.phrase_pairs and target in model_interface.phrase_pairs[src]:

                    if ('compare_cross-entropies' in flags and flags['compare_cross-entropies']) or ('intersected_cross-entropies' in flags and flags['intersected_cross-entropies']):

                        if 0 in model_interface.phrase_pairs[src][target][0][0]: #only use intersection of models for comparability

                            # update unknown words statistics
                            if model_interface.phrase_pairs[src][target][0][0][i]:
                                ignored[i] += c
                            elif src in model_interface.phrase_source and model_interface.phrase_source[src][i]:
                                other_translations[i] += c
                            else:
                                oov[i] += c

                            continue

                        # per-model one-hot weights when comparing models against each other
                        if ('compare_cross-entropies' in flags and flags['compare_cross-entropies']):
                            tmp_weights = [[0]*i+[1]+[0]*(num_results-i-1)]*model_interface.number_of_features
                        elif ('intersected_cross-entropies' in flags and flags['intersected_cross-entropies']):
                            tmp_weights = weights

                        features = score(tmp_weights,src,target,model_interface,flags)

                    else:
                        features = score(weights,src,target,model_interface,flags)

                    #if weight is so low that feature gets probability zero
                    if 0 in features:
                        #sys.stderr.write('Warning: 0 probability in model {0}: source phrase: {1!r}; target phrase: {2!r}\n'.format(i,src,target))
                        #sys.stderr.write('Possible reasons: 0 probability in phrase table; very low (or 0) weight; recompute lexweight and different alignments\n')
                        #sys.stderr.write('Phrase pair is ignored for cross_entropy calculation\n\n')
                        continue

                    n[i] += c
                    for j in range(model_interface.number_of_features):
                        cross_entropies[j][i] -= log(features[j],2)*c

                elif src in model_interface.phrase_source and not ('compare_cross-entropies' in flags and flags['compare_cross-entropies']):
                    other_translations[i] += c

                else:
                    oov2 += c

            total_pairs += c


    # oov2 was incremented once per result column; scale back to a single count
    oov2 = int(oov2/num_results)

    for i in range(num_results):
        try:
            for j in range(model_interface.number_of_features):
                cross_entropies[j][i] /= n[i]
        except ZeroDivisionError:
            sys.stderr.write('Warning: no matching phrase pairs between reference set and model\n')
            for j in range(model_interface.number_of_features):
                cross_entropies[j][i] = 0


    if 'compare_cross-entropies' in flags and flags['compare_cross-entropies']:
        return [tuple([ce[i] for ce in cross_entropies]) + (other_translations[i],oov[i],ignored[i],n[i],total_pairs) for i in range(num_results)], (n[0],total_pairs,oov2)
    else:
        return tuple([ce[0] for ce in cross_entropies]) + (other_translations[0],oov2,total_pairs)


def cross_entropy_light(model_interface,reference_interface,weights,score,mode,flags,cache):
    """calculate cross entropy given all necessary information.
    don't call this directly, but use one of the Combine_TMs methods.
    Same as cross_entropy, but optimized for speed: it doesn't generate all of the statistics,
    doesn't normalize, and uses caching. 
    """
    weights = normalize_weights(weights,mode,flags)
    cross_entropies = [0]*model_interface.number_of_features

    # cache is the (src,target,count) list produced by _get_reference_cache
    for (src,target,c) in cache:
        features = score(weights,src,target,model_interface,flags,cache=True)

        # a zero feature would make log() blow up; skip the pair (see cross_entropy)
        if 0 in features:
            #sys.stderr.write('Warning: 0 probability in model {0}: source phrase: {1!r}; target phrase: {2!r}\n'.format(i,src,target))
            #sys.stderr.write('Possible reasons: 0 probability in phrase table; very low (or 0) weight; recompute lexweight and different alignments\n')
            #sys.stderr.write('Phrase pair is ignored for cross_entropy calculation\n\n')
            continue

        for i in range(model_interface.number_of_features):
            cross_entropies[i] -= log(features[i],2)*c

    return cross_entropies


def _get_reference_cache(reference_interface,model_interface):
    """creates a data structure that allows for a quick access
    to all relevant reference set phrase/word pairs and their frequencies.

    Returns (cache, n): cache is a list of (src, target, count) triples restricted
    to pairs known to the model; n is the total count over those triples.
    """
    cache = []
    n = 0

    for src in reference_interface.word_pairs:
        for target in reference_interface.word_pairs[src]:
            if src in model_interface.phrase_pairs and target in model_interface.phrase_pairs[src]:
                c = reference_interface.word_pairs[src][target]
                cache.append((src,target,c))
                n += c

    return cache,n


def _get_lexical_filter(reference_interface,model_interface):
    """returns dictionaries that store the words and word pairs needed
    for perplexity optimization. We can use these dicts to load fewer data into memory for optimization.

    Returns (e2f_filter, f2e_filter): word -> set of aligned words, in each direction.
    """

    e2f_filter = defaultdict(set)
    f2e_filter = defaultdict(set)

    for src in reference_interface.word_pairs:
        for target in reference_interface.word_pairs[src]:
            if src in model_interface.phrase_pairs and target in model_interface.phrase_pairs[src]:
                e2f_alignment,f2e_alignment = model_interface.get_word_alignments(src,target)

                for s,t_list in e2f_alignment:
                    for t in t_list:
                        e2f_filter[s].add(t)

                for t,s_list in f2e_alignment:
                    for s in s_list:
                        f2e_filter[t].add(s)

    return e2f_filter,f2e_filter


def _hillclimb_move(weights,stepsize,mode,flags):
    """Move function for hillclimb algorithm. Updates each weight by stepsize.

    Yields one normalized neighbour per single-weight perturbation:
    first each weight increased by stepsize, then each decreased (if it stays positive).
    """

    for i,w in enumerate(weights):
        yield normalize_weights(weights[:i]+[w+stepsize]+weights[i+1:],mode,flags)

    for i,w in enumerate(weights):
        new = w-stepsize
        # keep weights strictly positive (avoids zero-probability models)
        if new >= 1e-10:
            yield normalize_weights(weights[:i]+[new]+weights[i+1:],mode,flags)

def _hillclimb(scores,best_weights,objective,model_interface,reference_interface,score_function,mode,flags,precision,cache,n):
    """first (deprecated) implementation of iterative weight optimization."""

    best = objective(best_weights)

    i = 0 #counts number of iterations with same stepsize: if greater than 10, it is doubled
    stepsize = 512 # initial stepsize
    move = 1 #whether we found a better set of weights in the current iteration. 
    # if not, it is halved
    sys.stderr.write('Hillclimb: step size: ' + str(stepsize))
    while stepsize > 0.0078:

        # no improvement in the last sweep: shrink the step and try again
        if not move:
            stepsize /= 2
            sys.stderr.write(' ' + str(stepsize))
            i = 0
            move = 1
            continue

        move = 0

        for w in _hillclimb_move(list(best_weights),stepsize,mode,flags):
            weights_tuple = tuple(w)

            # memoized: each weight vector is evaluated at most once
            if weights_tuple in scores:
                continue

            scores[weights_tuple] = cross_entropy_light(model_interface,reference_interface,[w for m in range(model_interface.number_of_features)],score_function,mode,flags,cache)

            if objective(weights_tuple)+precision < best:
                best = objective(weights_tuple)
                best_weights = weights_tuple
                move = 1

        # ten improving sweeps at the same stepsize: double it to speed up progress
        if i and not i % 10:
            sys.stderr.write('\nIteration '+ str(i) + ' with stepsize ' + str(stepsize) + '. current cross-entropy: ' + str(best) + '- weights: ' + str(best_weights) + ' ')
            stepsize *= 2
            sys.stderr.write('\nIncreasing stepsize: '+ str(stepsize))
            i = 0

        i += 1

    return best_weights


def optimize_cross_entropy_hillclimb(model_interface,reference_interface,initial_weights,score_function,mode,flags,precision=0.000001):
    """find weights that minimize cross-entropy on a tuning set
    deprecated (default is now L-BFGS (optimize_cross_entropy)), but left in for people without SciPy
    """

    scores = {}

    best_weights = tuple(initial_weights[0])

    cache,n = _get_reference_cache(reference_interface,model_interface)

    # each objective is a triple: which score to minimize from cross_entropy(), which weights to update accordingly, and a comment that is printed
    # NOTE(review): the lambdas close over the loop variable i (late binding); this
    # matches how they are consumed below (i is re-bound by the enumerate loop) — verify before refactoring.
    objectives = [(lambda x: scores[x][i]/n,[i],'minimize cross-entropy for feature {0}'.format(i)) for i in range(model_interface.number_of_features)]

    scores[best_weights] = cross_entropy_light(model_interface,reference_interface,initial_weights,score_function,mode,flags,cache)
    final_weights = initial_weights[:]
    final_cross_entropy = [0]*model_interface.number_of_features

    for i,(objective, features, comment) in enumerate(objectives):
        best_weights = min(scores,key=objective)
        sys.stderr.write('Optimizing objective "' + comment +'"\n')
        best_weights = _hillclimb(scores,best_weights,objective,model_interface,reference_interface,score_function,feature_specific_mode(mode,i,flags),flags,precision,cache,n)

        sys.stderr.write('\nCross-entropy:' + str(objective(best_weights)) + ' - weights: ' + str(best_weights)+'\n\n')

        for j in features:
            final_weights[j] = list(best_weights)
            final_cross_entropy[j] = objective(best_weights)

    return final_weights,final_cross_entropy


def optimize_cross_entropy(model_interface,reference_interface,initial_weights,score_function,mode,flags):
    """find weights that minimize cross-entropy on a tuning set
    Uses L-BFGS optimization and requires SciPy
    """

    # 'optimizer' is presumably a module-level name set at import time
    # ('l-bfgs' when SciPy was importable) — TODO confirm, it is defined outside this chunk
    if not optimizer == 'l-bfgs':
        sys.stderr.write('SciPy is not installed. Falling back to naive hillclimb optimization (instead of L-BFGS)\n')
        return optimize_cross_entropy_hillclimb(model_interface,reference_interface,initial_weights,score_function,mode,flags)

    cache,n = _get_reference_cache(reference_interface,model_interface)

    # each objective is a triple: which score to minimize from cross_entropy(), which weights to update accordingly, and a comment that is printed
    objectives = [(lambda w: cross_entropy_light(model_interface,reference_interface,[[1]+list(w) for m in range(model_interface.number_of_features)],score_function,feature_specific_mode(mode,i,flags),flags,cache)[i],[i],'minimize cross-entropy for feature {0}'.format(i)) for i in range(model_interface.number_of_features)] #optimize cross-entropy for p(s|t)

    final_weights = initial_weights[:]
    final_cross_entropy = [0]*model_interface.number_of_features

    for i,(objective, features, comment) in enumerate(objectives):
        sys.stderr.write('Optimizing objective "' + comment +'"\n')
        initial_values = [1]*(len(model_interface.models)-1) # we leave value of first model at 1 and optimize all others (normalized of course)
        best_weights, best_point, data = fmin_l_bfgs_b(objective,initial_values,approx_grad=True,bounds=[(0.000000001,None)]*len(initial_values))
        best_weights = normalize_weights([1]+list(best_weights),feature_specific_mode(mode,i,flags),flags)
        sys.stderr.write('Cross-entropy after L-BFGS optimization: ' + str(best_point/n) + ' - weights: ' + str(best_weights)+'\n')

        for j in features:
            final_weights[j] = list(best_weights)
            final_cross_entropy[j] = best_point/n

    return final_weights,final_cross_entropy


def feature_specific_mode(mode,i,flags):
    """in mode 'counts', only the default Moses features can be recomputed from raw frequencies;
    all other features are interpolated by default.
    This function mostly serves optical purposes (i.e. normalizing a single weight vector for logging),
    since normalize_weights also handles a mix of interpolated and recomputed features.
    """

    if mode == 'counts' and i not in [flags['i_e2f'],flags['i_e2f_lex'],flags['i_f2e'],flags['i_f2e_lex']]:
        return 'interpolate'
    else:
        return mode


def redistribute_probability_mass(weights,src,target,interface,flags,mode='interpolate'):
    """the conditional probability p(x|y) is undefined for cases where p(y) = 0
    this function redistributes the probability mass to only consider models for which p(y) > 0
    """

    i_e2f = flags['i_e2f']
    i_e2f_lex = flags['i_e2f_lex']
    i_f2e = flags['i_f2e']
    i_f2e_lex = flags['i_f2e_lex']

    new_weights = weights[:]

    if flags['normalize_s_given_t'] == 's':

        # set weight to 0 for all models where target phrase is unseen (p(s|t)
        new_weights[i_e2f] = list(map(mul,interface.phrase_source[src],weights[i_e2f]))
        if flags['normalize-lexical_weights']:
            new_weights[i_e2f_lex] = list(map(mul,interface.phrase_source[src],weights[i_e2f_lex]))

    elif flags['normalize_s_given_t'] == 't':

        # set weight to 0 for all models where target phrase is unseen (p(s|t)
        new_weights[i_e2f] = 
list(map(mul,interface.phrase_target[target],weights[i_e2f]))
        if flags['normalize-lexical_weights']:
            new_weights[i_e2f_lex] = list(map(mul,interface.phrase_target[target],weights[i_e2f_lex]))

    # set weight to 0 for all models where source phrase is unseen (p(t|s)
    new_weights[i_f2e] = list(map(mul,interface.phrase_source[src],weights[i_f2e]))
    if flags['normalize-lexical_weights']:
        new_weights[i_f2e_lex] = list(map(mul,interface.phrase_source[src],weights[i_f2e_lex]))


    return normalize_weights(new_weights,mode,flags)


def score_interpolate(weights,src,target,interface,flags,cache=False):
    """linear interpolation of probabilites (and other feature values)
    if normalized is True, the probability mass for p(x|y) is redistributed to models with p(y) > 0
    """

    model_values = interface.phrase_pairs[src][target][0]

    scores = [0]*len(model_values)

    if 'normalized' in flags and flags['normalized']:
        normalized_weights = redistribute_probability_mass(weights,src,target,interface,flags)
    else:
        normalized_weights = weights

    if 'recompute_lexweights' in flags and flags['recompute_lexweights']:
        e2f_alignment,f2e_alignment = interface.get_word_alignments(src,target,cache=cache)

        if not e2f_alignment or not f2e_alignment:
            sys.stderr.write('Error: no word alignments found, but necessary for lexical weight computation.\n')
            # NOTE(review): lst/lts are assigned but never read — the lexical-weight
            # scores silently stay 0 in this error path; presumably dead code, verify.
            lst = 0
            lts = 0

        else:
            scores[flags['i_e2f_lex']] = compute_lexicalweight(normalized_weights[flags['i_e2f_lex']],e2f_alignment,interface.word_pairs_e2f,None,mode='interpolate')
            scores[flags['i_f2e_lex']] = compute_lexicalweight(normalized_weights[flags['i_f2e_lex']],f2e_alignment,interface.word_pairs_f2e,None,mode='interpolate')


    # interpolate everything else; lexical-weight slots are skipped when recomputed above
    for idx,prob in enumerate(model_values):
        if not ('recompute_lexweights' in flags and flags['recompute_lexweights'] and (idx == flags['i_e2f_lex'] or idx == flags['i_f2e_lex'])):
            scores[idx] = dot_product(prob,normalized_weights[idx])

    return scores


def score_loglinear(weights,src,target,interface,flags,cache=False):
    """loglinear interpolation of probabilites
    warning: if phrase pair does not occur in all models, resulting probability is 0
    this is usually not what you want - loglinear scoring is only included for completeness' sake
    """

    scores = []
    model_values = interface.phrase_pairs[src][target][0]

    for idx,prob in enumerate(model_values):
        try:
            # weighted geometric mean: exp(sum_i w_i * log p_i)
            scores.append(exp(dot_product(list(map(log,prob)),weights[idx])))
        except ValueError:
            # log(0): phrase pair missing from at least one model
            scores.append(0)

    return scores


def score_counts(weights,src,target,interface,flags,cache=False):
    """count-based re-estimation of probabilites and lexical weights
    each count is multiplied by its weight; trivial case is weight 1 for each model, which corresponds to a concatentation
    """

    i_e2f = flags['i_e2f']
    i_e2f_lex = flags['i_e2f_lex']
    i_f2e = flags['i_f2e']
    i_f2e_lex = flags['i_f2e_lex']

    # if we have non-default number of weights, assume that we might have to do a mix of count-based and interpolated scores.
    if len(weights) == 4:
        scores = [0]*len(weights)
    else:
        scores = score_interpolate(weights,src,target,interface,flags,cache=cache)

    # p(s|t) from weighted counts: c(s,t)/c(t)
    try:
        joined_count = dot_product(interface.phrase_pairs[src][target][0][i_e2f],weights[i_e2f])
        target_count = dot_product(interface.phrase_target[target],weights[i_e2f])
        scores[i_e2f] = joined_count / target_count
    except ZeroDivisionError:
        scores[i_e2f] = 0

    # p(t|s) from weighted counts: c(s,t)/c(s)
    try:
        joined_count = dot_product(interface.phrase_pairs[src][target][0][i_f2e],weights[i_f2e])
        source_count = dot_product(interface.phrase_source[src],weights[i_f2e])
        scores[i_f2e] = joined_count / source_count
    except ZeroDivisionError:
        scores[i_f2e] = 0

    e2f_alignment,f2e_alignment = interface.get_word_alignments(src,target,cache=cache)

    if not e2f_alignment or not f2e_alignment:
        sys.stderr.write('Error: no word alignments found, but necessary for lexical weight computation.\n')
        scores[i_e2f_lex] = 0
        scores[i_f2e_lex] = 0

    else:
        scores[i_e2f_lex] = compute_lexicalweight(weights[i_e2f_lex],e2f_alignment,interface.word_pairs_e2f,interface.word_target,mode='counts',cache=cache)
        scores[i_f2e_lex] = compute_lexicalweight(weights[i_f2e_lex],f2e_alignment,interface.word_pairs_f2e,interface.word_source,mode='counts',cache=cache)

    return scores


def score_interpolate_reordering(weights,src,target,interface):
    """linear interpolation of reordering model probabilities
    also normalizes model so that
    """

    model_values = interface.reordering_pairs[src][target]

    scores = [0]*len(model_values)

    for idx,prob in enumerate(model_values):
        scores[idx] = dot_product(prob,weights[idx])

    #normalizes first half and last half probabilities (so that each half sums to one).
    #only makes sense for bidirectional configuration in Moses. 
    # Remove/change this if you want a different (or no) normalization
    scores = normalize_weights(scores[:int(interface.number_of_features/2)],'interpolate') + normalize_weights(scores[int(interface.number_of_features/2):],'interpolate')

    return scores


def compute_lexicalweight(weights,alignment,word_pairs,marginal,mode='counts',cache=False,mycache=[0,defaultdict(dict)]):
    """compute the lexical weights as implemented in Moses toolkit

    mycache is an intentional mutable default argument: it persists across calls
    as a [weights, {word: {translations: lex_step}}] memo, flushed whenever the
    weight vector changes.
    """

    lex = 1

    # new weights: empty cache
    if cache and mycache[0] != weights:
        mycache[0] = weights
        mycache[1] = defaultdict(dict)

    for x,translations in alignment:
        # skip nonterminals
        if x.startswith(b'['):
            continue

        if cache and translations in mycache[1][x]:
            lex_step = mycache[1][x][translations]

        else:
            # average the (weighted) translation probabilities of all aligned words
            lex_step = 0
            for y in translations:

                if mode == 'counts':
                    lex_step += dot_product(word_pairs[x][y],weights) / dot_product(marginal[y],weights)
                elif mode == 'interpolate':
                    lex_step += dot_product(word_pairs[x][y],weights)

            lex_step /= len(translations)

            if cache:
                mycache[1][x][translations] = lex_step

        lex *= lex_step

    return lex


def normalize_weights(weights,mode,flags=None):
    """make sure that probability mass in linear interpolation is 1
    for weighted counts, weight of first model is set to 1
    """

    if mode == 'interpolate' or mode == 'loglinear':

        # weights may be a flat vector or one vector per feature
        if type(weights[0]) == list:

            new_weights = []

            for weight_list in weights:
                total = sum(weight_list)

                try:
                    weight_list = [weight/total for weight in weight_list]
                except ZeroDivisionError:
                    sys.stderr.write('Error: Zero division in weight normalization. Are some of your weights zero? This might lead to undefined behaviour if a phrase pair is only seen in model with weight 0\n')

                new_weights.append(weight_list)

        else:
            total = sum(weights)

            try:
                new_weights = [weight/total for weight in weights]
            except ZeroDivisionError:
                sys.stderr.write('Error: Zero division in weight normalization. 
Are some of your weights zero? This might lead to undefined behaviour if a phrase pair is only seen in model with weight 0\n') + + elif mode == 'counts_pure': + + if type(weights[0]) == list: + + new_weights = [] + + for weight_list in weights: + ratio = 1/weight_list[0] + new_weights.append([weight * ratio for weight in weight_list]) + + else: + ratio = 1/weights[0] + new_weights = [weight * ratio for weight in weights] + + # make sure that features other than the standard Moses features are always interpolated (since no count-based computation is defined) + elif mode == 'counts': + + if type(weights[0]) == list: + norm_counts = normalize_weights(weights,'counts_pure') + new_weights = normalize_weights(weights,'interpolate') + for i in [flags['i_e2f'],flags['i_e2f_lex'],flags['i_f2e'],flags['i_f2e_lex']]: + new_weights[i] = norm_counts[i] + return new_weights + + else: + return normalize_weights(weights,'counts_pure') + + return new_weights + + +def handle_file(filename,action,fileobj=None,mode='r'): + """support reading/writing either from/to file, stdout or gzipped file""" + + if action == 'open': + + if mode == 'r': + mode = 'rb' + elif mode == 'w': + mode = 'wb' + + if mode == 'rb' and not filename == '-' and not os.path.exists(filename): + if os.path.exists(filename+'.gz'): + filename = filename+'.gz' + else: + sys.stderr.write('Error: unable to open file. 
' + filename + ' - aborting.\n') + + if 'counts' in filename and os.path.exists(os.path.dirname(filename)): + sys.stderr.write('For a weighted counts combination, we need statistics that Moses doesn\'t write to disk by default.\n') + sys.stderr.write('Repeat step 4 of Moses training for all models with the option -write-lexical-counts.\n') + + exit(1) + + if filename.endswith('.gz'): + fileobj = gzip.open(filename,mode) + + elif filename == '-' and mode == 'wb': + fileobj = sys.stdout + + else: + fileobj = open(filename,mode) + + return fileobj + + elif action == 'close' and filename != '-': + fileobj.close() + + +def sort_file(filename,tempdir=None): + """Sort a file and return temporary file""" + + cmd = ['sort', filename] + env = {} + env['LC_ALL'] = 'C' + if tempdir: + cmd.extend(['-T',tempdir]) + + outfile = NamedTemporaryFile(delete=False,dir=tempdir) + sys.stderr.write('LC_ALL=C ' + ' '.join(cmd) + ' > ' + outfile.name + '\n') + p = Popen(cmd,env=env,stdout=outfile.file) + p.wait() + + outfile.seek(0) + + return outfile + + +class Combine_TMs(): + + """This class handles the various options, checks them for sanity and has methods that define what models to load and what functions to call for the different tasks. + Typically, you only need to interact with this class and its attributes. + + """ + + #some flags that change the behaviour during scoring. See init docstring for more info + flags = {'normalized':False, + 'recompute_lexweights':False, + 'intersected_cross-entropies':False, + 'normalize_s_given_t':None, + 'normalize-lexical_weights':True, + 'add_origin_features':False, + 'write_phrase_penalty':False, + 'lowmem': False, + 'i_e2f':0, + 'i_e2f_lex':1, + 'i_f2e':2, + 'i_f2e_lex':3 + } + + # each model needs a priority. 
See init docstring for more info + _priorities = {'primary':1, + 'map':2, + 'supplementary':10} + + def __init__(self,models,weights=None, + output_file=None, + mode='interpolate', + number_of_features=4, + model_interface=Moses, + reference_interface=Moses_Alignment, + reference_file=None, + lang_src=None, + lang_target=None, + output_lexical=None, + **flags): + """The whole configuration of the task is done during intialization. Afterwards, you only need to call your intended method(s). + You can change some of the class attributes afterwards (such as the weights, or the output file), but you should never change the models or mode after initialization. + See unit_test function for example configurations + + models: list of tuples (path,priority) that defines which models to process. Path is usually the top directory of a Moses model. There are three priorities: + 'primary': phrase pairs with this priority will always be included in output model. For most purposes, you'll want to define all models as primary. + 'map': for maximum a-posteriori combination (Bacchiani et al. 2004; Foster et al. 2010). for use with mode 'counts'. stores c(t) = 1 and c(s,t) = p(s|t) + 'supplementary': phrase pairs are considered for probability computation, but not included in output model (unless they also occur in at least one primary model) + useful for rescoring a model without changing its vocabulary. + + weights: accept two types of weight declarations: one weight per model, and one weight per model and feature + type one is internally converted to type two. For 2 models with four features, this looks like: [0.1,0.9] -> [[0.1,0.9],[0.1,0.9],[0.1,0.9],[0.1,0.9]] + default: uniform weights (None) + + output_file: filepath of output phrase table. If it ends with .gz, file is automatically zipped. + + output_lexical: If defined, also writes combined lexical tables. Writes to output_lexical.e2f and output_lexical.f2e, or output_lexical.counts.e2f in mode 'counts'. 
+ + mode: declares the basic mixture-model algorithm. there are currently three options: + 'counts': weighted counts (requires some statistics that Moses doesn't produce. Repeat step 4 of Moses training with the option -write-lexical-counts to obtain them.) + Only the standard Moses features are recomputed from weighted counts; additional features are linearly interpolated + (see number_of_features to allow more features, and i_e2f etc. if the standard features are in a non-standard position) + 'interpolate': linear interpolation + 'loglinear': loglinear interpolation (careful: this creates the intersection of phrase tables and is often of little use) + + number_of_features: could be used to interpolate models with non-default Moses features. 4 features is currently still hardcoded in various places + (e.g. cross_entropy calculations, mode 'counts') + + i_e2f,i_e2f_lex,i_f2e,i_f2e_lex: Index of the (Moses) phrase table features p(s|t), lex(s|t), p(t|s) and lex(t|s). + Relevant for mode 'counts', and if 'recompute_lexweights' is True in mode 'interpolate'. In mode 'counts', any additional features are combined through linear interpolation. + + model_interface: class that handles reading phrase tables and lexical tables, and writing phrase tables. Currently only Moses is implemented. + default: Moses + + reference_interace: class that deals with reading in reference phrase pairs for cross-entropy computation + Moses_Alignment: Word/phrase pairs as computed by Giza++ and extracted through Moses heuristics. This corresponds to the file model/extract.gz if you train a Moses model on your tuning set. + TigerXML: TigerXML data format + + default: Moses_Alignment + + reference_file: path to reference file. Required for every operation except combination of models with given weights. + + lang_src: source language. Only required if reference_interface is TigerXML. Identifies which language in XML file we should treat as source language. + + lang_target: target language. 
Only required if reference_interface is TigerXML. Identifies which language in XML file we should treat as target language. + + intersected_cross-entropies: compute cross-entropies of intersection of phrase pairs, ignoring phrase pairs that do not occur in all models. + If False, algorithm operates on union of phrase pairs + default: False + + add_origin_features: For each model that is being combined, add a binary feature to the final phrase table, with values of 1 (phrase pair doesn't occur in model) and 2.718 (it does). + This indicates which model(s) a phrase pair comes from and can be used during MERT to additionally reward/penalize translation models + + lowmem: low memory mode: instead of loading target phrase counts / probability (when required), process the original table and its inversion (source and target swapped) incrementally, then merge the two halves. + + tempdir: temporary directory (for low memory mode). + + there are a number of further configuration options that you can define, which modify the algorithm for linear interpolation. They have no effect in mode 'counts' + + recompute_lexweights: don't directly interpolate lexical weights, but interpolate word translation probabilities instead and recompute the lexical weights. + default: False + + normalized: for interpolation of p(x|y): if True, models with p(y)=0 will be ignored, and probability mass will be distributed among models with p(y)>0. + If False, missing entries (x,y) are always interpreted as p(x|y)=0. + default: False + + normalize_s_given_t: How to we normalize p(s|t) if 'normalized' is True? 
Three options: + None: don't normalize p(s|t) and lex(s|t) (only p(t|s) and lex(t|s)) + t: check if p(t)==0 : advantage: theoretically sound; disadvantage: slower (we need to know if t occcurs in model); favours rare target phrases (relative to default choice) + s: check if p(s)==0 : advantage: relevant for task; disadvantage: no true probability distributions + + default: None + + normalize-lexical_weights: also normalize lex(s|t) and lex(t|s) if 'normalized' ist True: + reason why you might want to disable this: lexical weights suffer less from data sparseness than probabilities. + default: True + + """ + + + self.mode = mode + self.output_file = output_file + self.lang_src = lang_src + self.lang_target = lang_target + self.loaded = defaultdict(int) + self.output_lexical = output_lexical + + self.flags = copy.copy(self.flags) + self.flags.update(flags) + + self.flags['i_e2f'] = int(self.flags['i_e2f']) + self.flags['i_e2f_lex'] = int(self.flags['i_e2f_lex']) + self.flags['i_f2e'] = int(self.flags['i_f2e']) + self.flags['i_f2e_lex'] = int(self.flags['i_f2e_lex']) + + if reference_interface: + self.reference_interface = reference_interface(reference_file) + + if mode not in ['interpolate','loglinear','counts']: + sys.stderr.write('Error: mode must be either "interpolate", "loglinear" or "counts"\n') + sys.exit(1) + + models,number_of_features,weights = self._sanity_checks(models,number_of_features,weights) + + self.weights = weights + self.models = models + + self.model_interface = model_interface(models,number_of_features) + + if mode == 'interpolate': + self.score = score_interpolate + elif mode == 'loglinear': + self.score = score_loglinear + elif mode == 'counts': + self.score = score_counts + + + def _sanity_checks(self,models,number_of_features,weights): + """check if input arguments make sense (correct number of weights, valid model priorities etc.) + is only called on initialization. If you change weights afterwards, better know what you're doing. 
+ """ + + number_of_features = int(number_of_features) + + for (model,priority) in models: + assert(priority in self._priorities) + models = [(model,self._priorities[p]) for (model,p) in models] + + + # accept two types of weight declarations: one weight per model, and one weight per model and feature + # type one is internally converted to type two: [0.1,0.9] -> [[0.1,0.9],[0.1,0.9],[0.1,0.9],[0.1,0.9]] + if weights: + if type(weights[0]) == list: + assert(len(weights)==number_of_features) + for sublist in weights: + assert(len(sublist)==len(models)) + + else: + assert(len(models) == len(weights)) + weights = [weights for i in range(number_of_features)] + + else: + if self.mode == 'loglinear' or self.mode == 'interpolate': + weights = [[1/len(models)]*len(models) for i in range(number_of_features)] + elif self.mode == 'counts': + weights = [[1]*len(models) for i in range(number_of_features)] + sys.stderr.write('Warning: No weights defined: initializing with uniform weights\n') + + + new_weights = normalize_weights(weights,self.mode,self.flags) + if weights != new_weights: + if self.mode == 'interpolate' or self.mode == 'loglinear': + sys.stderr.write('Warning: weights should sum to 1 - ') + elif self.mode == 'counts': + sys.stderr.write('Warning: normalizing weights so that first model has weight 1 (for features that are recomputed from counts) - ') + sys.stderr.write('normalizing to: '+ str(new_weights) +'\n') + weights = new_weights + + return models,number_of_features,weights + + + def _ensure_loaded(self,data): + """load data (lexical tables; reference alignment; phrase table), if it isn't already in memory""" + + if 'lexical' in data: + self.model_interface.require_alignment = True + + if 'reference' in data and not self.loaded['reference']: + + sys.stderr.write('Loading word pairs from reference set...') + self.reference_interface.load_word_pairs(self.lang_src,self.lang_target) + sys.stderr.write('done\n') + self.loaded['reference'] = 1 + + if 'lexical' in 
data and not self.loaded['lexical']: + + sys.stderr.write('Loading lexical tables...') + self.model_interface.load_lexical_tables(self.models,self.mode) + sys.stderr.write('done\n') + self.loaded['lexical'] = 1 + + if 'pt-filtered' in data and not self.loaded['pt-filtered']: + + models_prioritized = [(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.models)] + + for model,priority,i in models_prioritized: + sys.stderr.write('Loading phrase table ' + str(i) + ' (only data relevant for reference set)') + j = 0 + for line in model: + if not j % 1000000: + sys.stderr.write('...'+str(j)) + j += 1 + line = line.rstrip().split(b' ||| ') + if line[-1].endswith(b' |||'): + line[-1] = line[-1][:-4] + line.append('') + self.model_interface.load_phrase_features(line,priority,i,store='all',mode=self.mode,filter_by=self.reference_interface.word_pairs,filter_by_src=self.reference_interface.word_source,filter_by_target=self.reference_interface.word_target,flags=self.flags) + sys.stderr.write(' done\n') + + self.loaded['pt-filtered'] = 1 + + if 'lexical-filtered' in data and not self.loaded['lexical-filtered']: + e2f_filter, f2e_filter = _get_lexical_filter(self.reference_interface,self.model_interface) + + sys.stderr.write('Loading lexical tables (only data relevant for reference set)...') + self.model_interface.load_lexical_tables(self.models,self.mode,e2f_filter=e2f_filter,f2e_filter=f2e_filter) + sys.stderr.write('done\n') + self.loaded['lexical-filtered'] = 1 + + if 'pt-target' in data and not self.loaded['pt-target']: + + models_prioritized = [(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.models)] + + for model,priority,i in models_prioritized: + sys.stderr.write('Loading target information from phrase table ' + str(i)) + j = 0 + for line in model: + if not j % 1000000: + sys.stderr.write('...'+str(j)) + j += 1 + line = 
line.rstrip().split(b' ||| ') + if line[-1].endswith(b' |||'): + line[-1] = line[-1][:-4] + line.append('') + self.model_interface.load_phrase_features(line,priority,i,mode=self.mode,store='target',flags=self.flags) + sys.stderr.write(' done\n') + + self.loaded['pt-target'] = 1 + + + def _inverse_wrapper(self,weights,tempdir=None): + """if we want to invert the phrase table to better calcualte p(s|t) and lex(s|t), manage creation, sorting and merging of inverted phrase tables""" + + sys.stderr.write('Processing first table half\n') + models = [(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)] + pt_half1 = NamedTemporaryFile(prefix='half1',delete=False,dir=tempdir) + self._write_phrasetable(models,pt_half1,weights) + pt_half1.seek(0) + + sys.stderr.write('Inverting tables\n') + models = [(self.model_interface.create_inverse(self.model_interface.open_table(model,'phrase-table'),tempdir=tempdir),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)] + sys.stderr.write('Processing second table half\n') + pt_half2_inverted = NamedTemporaryFile(prefix='half2',delete=False,dir=tempdir) + self._write_phrasetable(models,pt_half2_inverted,weights,inverted=True) + pt_half2_inverted.close() + for model,priority,i in models: + model.close() + os.remove(model.name) + pt_half2 = sort_file(pt_half2_inverted.name,tempdir=tempdir) + os.remove(pt_half2_inverted.name) + + sys.stderr.write('Merging tables: first half: {0} ; second half: {1} ; final table: {2}\n'.format(pt_half1.name,pt_half2.name,self.output_file)) + output_object = handle_file(self.output_file,'open',mode='w') + self.model_interface.merge(pt_half1,pt_half2,output_object,self.mode) + os.remove(pt_half1.name) + os.remove(pt_half2.name) + + handle_file(self.output_file,'close',output_object,mode='w') + + + def _write_phrasetable(self,models,output_object,weights,inverted=False): + 
"""Incrementally load phrase tables, calculate score for increment and write it to output_object""" + + # define which information we need to store from the phrase table + # possible flags: 'all', 'target', 'source' and 'pairs' + # interpolated models without re-normalization only need 'pairs', otherwise 'all' is the correct choice + store_flag = 'all' + if self.mode == 'interpolate' and not self.flags['normalized']: + store_flag = 'pairs' + + i = 0 + sys.stderr.write('Incrementally loading and processing phrase tables...') + + for block in self.model_interface.traverse_incrementally('phrase-table',models,self.model_interface.load_phrase_features,store_flag,mode=self.mode,inverted=inverted,lowmem=self.flags['lowmem'],flags=self.flags): + for src in sorted(self.model_interface.phrase_pairs, key = lambda x: x + b' |'): + for target in sorted(self.model_interface.phrase_pairs[src], key = lambda x: x + b' |'): + + if not i % 1000000: + sys.stderr.write(str(i) + '...') + i += 1 + + features = self.score(weights,src,target,self.model_interface,self.flags) + outline = self.model_interface.write_phrase_table(src,target,weights,features,self.mode, self.flags) + output_object.write(outline) + sys.stderr.write('done\n') + + + def combine_given_weights(self,weights=None): + """write a new phrase table, based on existing weights""" + + if not weights: + weights = self.weights + + data = [] + + if self.mode == 'counts': + data.append('lexical') + if not self.flags['lowmem']: + data.append('pt-target') + + elif self.mode == 'interpolate': + if self.flags['recompute_lexweights']: + data.append('lexical') + if self.flags['normalized'] and self.flags['normalize_s_given_t'] == 't' and not self.flags['lowmem']: + data.append('pt-target') + + self._ensure_loaded(data) + + if self.flags['lowmem'] and (self.mode == 'counts' or self.flags['normalized'] and self.flags['normalize_s_given_t'] == 't'): + self._inverse_wrapper(weights,tempdir=self.flags['tempdir']) + else: + models = 
[(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)] + output_object = handle_file(self.output_file,'open',mode='w') + self._write_phrasetable(models,output_object,weights) + handle_file(self.output_file,'close',output_object,mode='w') + + if self.output_lexical: + sys.stderr.write('Writing lexical tables\n') + self._ensure_loaded(['lexical']) + self.model_interface.write_lexical_file('e2f',self.output_lexical,weights[1],self.mode) + self.model_interface.write_lexical_file('f2e',self.output_lexical,weights[3],self.mode) + + + def combine_given_tuning_set(self): + """write a new phrase table, using the weights that minimize cross-entropy on a tuning set""" + + data = ['reference','pt-filtered'] + + if self.mode == 'counts' or (self.mode == 'interpolate' and self.flags['recompute_lexweights']): + data.append('lexical-filtered') + + self._ensure_loaded(data) + + best_weights,best_cross_entropy = optimize_cross_entropy(self.model_interface,self.reference_interface,self.weights,self.score,self.mode,self.flags) + sys.stderr.write('Best weights: ' + str(best_weights) + '\n') + sys.stderr.write('Cross entropies: ' + str(best_cross_entropy) + '\n') + sys.stderr.write('Executing action combine_given_weights with -w "{0}"\n'.format('; '.join([', '.join(str(w) for w in item) for item in best_weights]))) + + self.loaded['pt-filtered'] = False # phrase table will be overwritten + self.combine_given_weights(weights=best_weights) + + + + def combine_reordering_tables(self,weights=None): + """write a new reordering table, based on existing weights.""" + + if not weights: + weights = self.weights + + data = [] + + if self.mode != 'interpolate': + sys.stderr.write('Error: only linear interpolation is supported for reordering model combination') + + output_object = handle_file(self.output_file,'open',mode='w') + models = [(self.model_interface.open_table(model,'reordering-table'),priority,i) 
for (model,priority,i) in priority_sort_models(self.models)] + + i = 0 + + sys.stderr.write('Incrementally loading and processing phrase tables...') + + for block in self.model_interface.traverse_incrementally('reordering-table',models,self.model_interface.load_reordering_probabilities,'pairs',mode=self.mode,lowmem=self.flags['lowmem'],flags=self.flags): + for src in sorted(self.model_interface.reordering_pairs): + for target in sorted(self.model_interface.reordering_pairs[src]): + if not i % 1000000: + sys.stderr.write(str(i) + '...') + i += 1 + + features = score_interpolate_reordering(weights,src,target,self.model_interface) + outline = self.model_interface.write_reordering_table(src,target,features) + output_object.write(outline) + sys.stderr.write('done\n') + + + handle_file(self.output_file,'close',output_object,mode='w') + + + def compare_cross_entropies(self): + """print cross-entropies for each model/feature, using the intersection of phrase pairs. + analysis tool. + """ + + self.flags['compare_cross-entropies'] = True + + data = ['reference','pt-filtered'] + + if self.mode == 'counts' or (self.mode == 'interpolate' and self.flags['recompute_lexweights']): + data.append('lexical-filtered') + + self._ensure_loaded(data) + + results, (intersection,total_pairs,oov2) = cross_entropy(self.model_interface,self.reference_interface,self.weights,self.score,self.mode,self.flags) + + padding = 90 + num_features = self.model_interface.number_of_features + + print('\nResults of model comparison\n') + print('{0:<{padding}}: {1}'.format('phrase pairs in reference (tokens)',total_pairs, padding=padding)) + print('{0:<{padding}}: {1}'.format('phrase pairs in model intersection (tokens)',intersection, padding=padding)) + print('{0:<{padding}}: {1}\n'.format('phrase pairs in model union (tokens)',total_pairs-oov2, padding=padding)) + + for i,data in enumerate(results): + + cross_entropies = data[:num_features] + (other_translations,oov,ignored,n,total_pairs) = 
data[num_features:] + + print('model ' +str(i)) + for j in range(num_features): + print('{0:<{padding}}: {1}'.format('cross-entropy for feature {0}'.format(j), cross_entropies[j], padding=padding)) + print('{0:<{padding}}: {1}'.format('phrase pairs in model (tokens)', n+ignored, padding=padding)) + print('{0:<{padding}}: {1}'.format('phrase pairs in model, but not in intersection (tokens)', ignored, padding=padding)) + print('{0:<{padding}}: {1}'.format('phrase pairs in union, but not in model (but source phrase is) (tokens)', other_translations, padding=padding)) + print('{0:<{padding}}: {1}\n'.format('phrase pairs in union, but source phrase not in model (tokens)', oov, padding=padding)) + + self.flags['compare_cross-entropies'] = False + + return results, (intersection,total_pairs,oov2) + + + def compute_cross_entropy(self): + """return cross-entropy for a tuning set, a set of models and a set of weights. + analysis tool. + """ + + data = ['reference','pt-filtered'] + + if self.mode == 'counts' or (self.mode == 'interpolate' and self.flags['recompute_lexweights']): + data.append('lexical-filtered') + + self._ensure_loaded(data) + + current_cross_entropy = cross_entropy(self.model_interface,self.reference_interface,self.weights,self.score,self.mode,self.flags) + sys.stderr.write('Cross entropy: ' + str(current_cross_entropy) + '\n') + return current_cross_entropy + + + def return_best_cross_entropy(self): + """return the set of weights and cross-entropy that is optimal for a tuning set and a set of models.""" + + data = ['reference','pt-filtered'] + + if self.mode == 'counts' or (self.mode == 'interpolate' and self.flags['recompute_lexweights']): + data.append('lexical-filtered') + + self._ensure_loaded(data) + + best_weights,best_cross_entropy = optimize_cross_entropy(self.model_interface,self.reference_interface,self.weights,self.score,self.mode,self.flags) + + sys.stderr.write('Best weights: ' + str(best_weights) + '\n') + sys.stderr.write('Cross entropies: ' 
+ str(best_cross_entropy) + '\n') + sys.stderr.write('You can apply these weights with the action combine_given_weights and the option -w "{0}"\n'.format('; '.join([', '.join(str(w) for w in item) for item in best_weights]))) + return best_weights,best_cross_entropy + + +def test(): + """test (and illustrate) the functionality of the program based on two test phrase tables and a small reference set,""" + + # linear interpolation of two models, with fixed weights. Output uses vocabulary of model1 (since model2 is supplementary) + # command line: (currently not possible to define supplementary models through command line) + sys.stderr.write('Regression test 1\n') + Combiner = Combine_TMs([[os.path.join('test','model1'),'primary'],[os.path.join('test','model2'),'supplementary']],[0.5,0.5],os.path.join('test','phrase-table_test1')) + Combiner.combine_given_weights() + + # linear interpolation of two models, with fixed weights (but different for each feature). + # command line: python tmcombine.py combine_given_weights test/model1 test/model2 -w "0.1,0.9;0.1,1;0.2,0.8;0.5,0.5" -o test/phrase-table_test2 + sys.stderr.write('Regression test 2\n') + Combiner = Combine_TMs([[os.path.join('test','model1'),'primary'],[os.path.join('test','model2'),'primary']],[[0.1,0.9],[0.1,1],[0.2,0.8],[0.5,0.5]],os.path.join('test','phrase-table_test2')) + Combiner.combine_given_weights() + + # count-based combination of two models, with fixed weights + # command line: python tmcombine.py combine_given_weights test/model1 test/model2 -w "0.1,0.9;0.1,1;0.2,0.8;0.5,0.5" -o test/phrase-table_test3 -m counts + sys.stderr.write('Regression test 3\n') + Combiner = Combine_TMs([[os.path.join('test','model1'),'primary'],[os.path.join('test','model2'),'primary']],[[0.1,0.9],[0.1,1],[0.2,0.8],[0.5,0.5]],os.path.join('test','phrase-table_test3'),mode='counts') + Combiner.combine_given_weights() + + # output phrase table should be identical to model1 + # command line: python tmcombine.py 
combine_given_weights test/model1 -w 1 -o test/phrase-table_test4 -m counts + sys.stderr.write('Regression test 4\n') + Combiner = Combine_TMs([[os.path.join('test','model1'),'primary']],[1],os.path.join('test','phrase-table_test4'),mode='counts') + Combiner.combine_given_weights() + + # count-based combination of two models with weights set through perplexity minimization + # command line: python tmcombine.py combine_given_tuning_set test/model1 test/model2 -o test/phrase-table_test5 -m counts -r test/extract + sys.stderr.write('Regression test 5\n') + Combiner = Combine_TMs([[os.path.join('test','model1'),'primary'],[os.path.join('test','model2'),'primary']],output_file=os.path.join('test','phrase-table_test5'),mode='counts',reference_file='test/extract') + Combiner.combine_given_tuning_set() + + # loglinear combination of two models with fixed weights + # command line: python tmcombine.py combine_given_weights test/model1 test/model2 -w 0.1,0.9 -o test/phrase-table_test6 -m loglinear + sys.stderr.write('Regression test 6\n') + Combiner = Combine_TMs([[os.path.join('test','model1'),'primary'],[os.path.join('test','model2'),'primary']],weights=[0.1,0.9],output_file=os.path.join('test','phrase-table_test6'),mode='loglinear') + Combiner.combine_given_weights() + + # cross-entropy analysis of two models through a reference set + # command line: python tmcombine.py compare_cross_entropies test/model1 test/model2 -m counts -r test/extract + sys.stderr.write('Regression test 7\n') + Combiner = Combine_TMs([[os.path.join('test','model1'),'primary'],[os.path.join('test','model2'),'primary']],mode='counts',reference_file='test/extract') + f = open(os.path.join('test','phrase-table_test7'),'w') + f.write(str(Combiner.compare_cross_entropies())) + f.close() + + # maximum a posteriori combination of two models (Bacchiani et al. 2004; Foster et al. 
2010) with weights set through cross-entropy minimization + # command line: (currently not possible through command line) + sys.stderr.write('Regression test 8\n') + Combiner = Combine_TMs([[os.path.join('test','model1'),'primary'],[os.path.join('test','model2'),'map']],output_file=os.path.join('test','phrase-table_test8'),mode='counts',reference_file='test/extract') + Combiner.combine_given_tuning_set() + + # count-based combination of two non-default models, with fixed weights. Same as test 3, but with the standard features moved back + # command line: python tmcombine.py combine_given_weights test/model3 test/model4 -w "0.5,0.5;0.5,0.5;0.5,0.5;0.5,0.5;0.1,0.9;0.1,1;0.2,0.8;0.5,0.5" -o test/phrase-table_test9 -m counts --number_of_features 8 --i_e2f 4 --i_e2f_lex 5 --i_f2e 6 --i_f2e_lex 7 -r test/extract + sys.stderr.write('Regression test 9\n') + Combiner = Combine_TMs([[os.path.join('test','model3'),'primary'],[os.path.join('test','model4'),'primary']],[[0.5,0.5],[0.5,0.5],[0.5,0.5],[0.5,0.5],[0.1,0.9],[0.1,1],[0.2,0.8],[0.5,0.5]],os.path.join('test','phrase-table_test9'),mode='counts',number_of_features=8,i_e2f=4,i_e2f_lex=5,i_f2e=6,i_f2e_lex=7) + Combiner.combine_given_weights() + + # count-based combination of two non-default models, with fixed weights. Same as test 5, but with the standard features moved back + # command line: python tmcombine.py combine_given_tuning_set test/model3 test/model4 -o test/phrase-table_test10 -m counts --number_of_features 8 --i_e2f 4 --i_e2f_lex 5 --i_f2e 6 --i_f2e_lex 7 -r test/extract + sys.stderr.write('Regression test 10\n') + Combiner = Combine_TMs([[os.path.join('test','model3'),'primary'],[os.path.join('test','model4'),'primary']],output_file=os.path.join('test','phrase-table_test10'),mode='counts',number_of_features=8,i_e2f=4,i_e2f_lex=5,i_f2e=6,i_f2e_lex=7,reference_file='test/extract') + Combiner.combine_given_tuning_set() + + # count-based combination of two hierarchical models, with fixed weights. 
Same as test 3, but with hierarchical models + # command line: python tmcombine.py combine_given_weights test/model5 test/model6 -w "0.1,0.9;0.1,1;0.2,0.8;0.5,0.5" -o test/phrase-table_test11 -m counts + sys.stderr.write('Regression test 11\n') + Combiner = Combine_TMs([[os.path.join('test','model5'),'primary'],[os.path.join('test','model6'),'primary']],[[0.1,0.9],[0.1,1],[0.2,0.8],[0.5,0.5]],os.path.join('test','phrase-table_test11'),mode='counts') + Combiner.combine_given_weights() + +#convert weight vector passed as a command line argument +class to_list(argparse.Action): + def __call__(self, parser, namespace, weights, option_string=None): + if ';' in weights: + values = [[float(x) for x in vector.split(',')] for vector in weights.split(';')] + else: + values = [float(x) for x in weights.split(',')] + setattr(namespace, self.dest, values) + + +def parse_command_line(): + parser = argparse.ArgumentParser(description='Combine translation models. Check DOCSTRING of the class Combine_TMs() and its methods for a more in-depth documentation and additional configuration options not available through the command line. The function test() shows examples.') + + group1 = parser.add_argument_group('Main options') + group2 = parser.add_argument_group('More model combination options') + + group1.add_argument('action', metavar='ACTION', choices=["combine_given_weights","combine_given_tuning_set","combine_reordering_tables","compute_cross_entropy","return_best_cross_entropy","compare_cross_entropies"], + help='What you want to do with the models. One of %(choices)s.') + + group1.add_argument('model', metavar='DIRECTORY', nargs='+', + help='Model directory. Assumes default Moses structure (i.e. path to phrase table and lexical tables).') + + group1.add_argument('-w', '--weights', dest='weights', action=to_list, + default=None, + help='weight vector. Format 1: single vector, one weight per model. 
Example: \"0.1,0.9\" ; format 2: one vector per feature, one weight per model: \"0.1,0.9;0.5,0.5;0.4,0.6;0.2,0.8\"') + + group1.add_argument('-m', '--mode', type=str, + default="interpolate", + choices=["counts","interpolate","loglinear"], + help='basic mixture-model algorithm. Default: %(default)s. Note: depending on mode and additional configuration, additional statistics are needed. Check docstring documentation of Combine_TMs() for more info.') + + group1.add_argument('-r', '--reference', type=str, + default=None, + help='File containing reference phrase pairs for cross-entropy calculation. Default interface expects \'path/model/extract.gz\' that is produced by training a model on the reference (i.e. development) corpus.') + + group1.add_argument('-o', '--output', type=str, + default="-", + help='Output file (phrase table). If not specified, model is written to standard output.') + + group1.add_argument('--output-lexical', type=str, + default=None, + help=('Not only create a combined phrase table, but also combined lexical tables. Writes to OUTPUT_LEXICAL.e2f and OUTPUT_LEXICAL.f2e, or OUTPUT_LEXICAL.counts.e2f in mode \'counts\'.')) + + group1.add_argument('--lowmem', action="store_true", + help=('Low memory mode: requires two passes (and sorting in between) to combine a phrase table, but loads less data into memory. Only relevant for mode "counts" and some configurations of mode "interpolate".')) + + group1.add_argument('--tempdir', type=str, + default=None, + help=('Temporary directory in --lowmem mode.')) + + group2.add_argument('--i_e2f', type=int, + default=0, metavar='N', + help=('Index of p(f|e) (relevant for mode counts if phrase table has custom feature order). (default: %(default)s)')) + + group2.add_argument('--i_e2f_lex', type=int, + default=1, metavar='N', + help=('Index of lex(f|e) (relevant for mode counts or with option recompute_lexweights if phrase table has custom feature order). 
(default: %(default)s)')) + + group2.add_argument('--i_f2e', type=int, + default=2, metavar='N', + help=('Index of p(e|f) (relevant for mode counts if phrase table has custom feature order). (default: %(default)s)')) + + group2.add_argument('--i_f2e_lex', type=int, + default=3, metavar='N', + help=('Index of lex(e|f) (relevant for mode counts or with option recompute_lexweights if phrase table has custom feature order). (default: %(default)s)')) + + group2.add_argument('--number_of_features', type=int, + default=4, metavar='N', + help=('Combine models with N + 1 features (last feature is constant phrase penalty). (default: %(default)s)')) + + group2.add_argument('--normalized', action="store_true", + help=('for each phrase pair x,y: ignore models with p(y)=0, and distribute probability mass among models with p(y)>0. (default: missing entries (x,y) are always interpreted as p(x|y)=0). Only relevant in mode "interpolate".')) + + group2.add_argument('--write-phrase-penalty', action="store_true", + help=("Include phrase penalty in phrase table")) + + group2.add_argument('--recompute_lexweights', action="store_true", + help=('don\'t directly interpolate lexical weights, but interpolate word translation probabilities instead and recompute the lexical weights. Only relevant in mode "interpolate".')) + + return parser.parse_args() + +if __name__ == "__main__": + + if len(sys.argv) < 2: + sys.stderr.write("no command specified. 
use option -h for usage instructions\n") + + elif sys.argv[1] == "test": + test() + + else: + args = parse_command_line() + #initialize + combiner = Combine_TMs([(m,'primary') for m in args.model], + weights=args.weights, + mode=args.mode, + output_file=args.output, + reference_file=args.reference, + output_lexical=args.output_lexical, + lowmem=args.lowmem, + normalized=args.normalized, + recompute_lexweights=args.recompute_lexweights, + tempdir=args.tempdir, + number_of_features=args.number_of_features, + i_e2f=args.i_e2f, + i_e2f_lex=args.i_e2f_lex, + i_f2e=args.i_f2e, + i_f2e_lex=args.i_f2e_lex, + write_phrase_penalty=args.write_phrase_penalty) + # execute right method + f_string = "combiner."+args.action+'()' + exec(f_string) diff --git a/mosesdecoder/moses/TranslationModel/CompactPT/BlockHashIndex.h b/mosesdecoder/moses/TranslationModel/CompactPT/BlockHashIndex.h new file mode 100644 index 0000000000000000000000000000000000000000..3de46272aa405f872aba56b77285715c69738e30 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/BlockHashIndex.h @@ -0,0 +1,192 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#ifndef moses_BlockHashIndex_h +#define moses_BlockHashIndex_h + +#include +#include +#include +#include +#include +#include +#include + +#include "MurmurHash3.h" +#include "StringVector.h" +#include "PackedArray.h" +#include "util/exception.hh" +#include "util/string_stream.hh" + +#ifdef WITH_THREADS +#include "moses/ThreadPool.h" +#else +#include +#endif + +#include + +namespace Moses +{ + +class BlockHashIndex +{ +private: + std::priority_queue m_queue; + + size_t m_orderBits; + size_t m_fingerPrintBits; + + std::FILE* m_fileHandle; + size_t m_fileHandleStart; + + StringVector m_landmarks; + + std::vector m_hashes; + std::vector m_clocks; + std::vector*> m_arrays; + + std::vector m_seekIndex; + + size_t m_size; + int m_lastSaved; + int m_lastDropped; + size_t m_numLoadedRanges; + +#ifdef WITH_THREADS + ThreadPool m_threadPool; + boost::mutex m_mutex; + + template + class HashTask : public Task + { + public: + HashTask(int id, BlockHashIndex& hash, Keys& keys) + : m_id(id), m_hash(hash), m_keys(new Keys(keys)) {} + + virtual void Run() { + m_hash.CalcHash(m_id, *m_keys); + } + + virtual ~HashTask() { + delete m_keys; + } + + private: + int m_id; + BlockHashIndex& m_hash; + Keys* m_keys; + }; +#endif + + size_t GetFprint(const char* key) const; + size_t GetHash(size_t i, const char* key); + +public: +#ifdef WITH_THREADS + BlockHashIndex(size_t orderBits, size_t fingerPrintBits, + size_t threadsNum = 2); +#else + BlockHashIndex(size_t orderBits, size_t fingerPrintBits); +#endif + + ~BlockHashIndex(); + + size_t GetHash(const char* key); + size_t GetHash(std::string key); + + size_t operator[](std::string key); + size_t operator[](char* key); + + void BeginSave(std::FILE* mphf); 
+ void SaveRange(size_t i); + void SaveLastRange(); + size_t FinalizeSave(); + +#ifdef WITH_THREADS + void WaitAll(); +#endif + + void DropRange(size_t i); + void DropLastRange(); + + size_t LoadIndex(std::FILE* mphf); + void LoadRange(size_t i); + + size_t Save(std::string filename); + size_t Save(std::FILE * mphf); + + size_t Load(std::string filename); + size_t Load(std::FILE * mphf); + + size_t GetSize() const; + + void KeepNLastRanges(float ratio = 0.1, float tolerance = 0.1); + + template + void AddRange(Keys &keys) { + size_t current = m_landmarks.size(); + + if(m_landmarks.size() && m_landmarks.back().str() >= keys[0]) { + util::StringStream strme; + strme << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort\n"; + strme << "1: " << m_landmarks.back().str() << "\n"; + strme << "2: " << keys[0] << "\n"; + UTIL_THROW2(strme.str()); + } + + m_landmarks.push_back(keys[0]); + m_size += keys.size(); + + if(keys.size() == 1) { + // add dummy key to avoid null hash + keys.push_back("###DUMMY_KEY###"); + } + +#ifdef WITH_THREADS + + boost::shared_ptr > + ht(new HashTask(current, *this, keys)); + m_threadPool.Submit(ht); +#else + CalcHash(current, keys); +#endif + } + + template + void CalcHash(size_t current, Keys &keys) { +#ifdef HAVE_CMPH + void* source = vectorAdapter(keys); + CalcHash(current, source); +#endif + } + + void CalcHash(size_t current, void* source); + +#ifdef HAVE_CMPH + void* vectorAdapter(std::vector& v); + void* vectorAdapter(StringVector& sv); + void* vectorAdapter(StringVector& sv); +#endif +}; + +} +#endif diff --git a/mosesdecoder/moses/TranslationModel/CompactPT/MurmurHash3.cpp b/mosesdecoder/moses/TranslationModel/CompactPT/MurmurHash3.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dfde887085950ddf7b07e136e84bf81d5005ad75 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/MurmurHash3.cpp @@ -0,0 +1,425 @@ +//----------------------------------------------------------------------------- 
+// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. + +#include "MurmurHash3.h" + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#define FORCE_INLINE inline __attribute__((always_inline)) + +inline uint32_t rotl32 ( uint32_t x, int8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +inline uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) +{ + return p[i]; +} + +FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i ) +{ + return p[i]; +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +FORCE_INLINE uint32_t fmix ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +FORCE_INLINE uint64_t fmix ( uint64_t k ) +{ + k ^= k >> 33; + k *= 
BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + uint32_t c1 = 0xcc9e2d51; + uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(int i = -nblocks; i; i++) { + uint32_t k1 = getblock(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) { + case 3: + k1 ^= tail[2] << 16; + case 2: + k1 ^= tail[1] << 8; + case 1: + k1 ^= tail[0]; + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix(h1); + + *(uint32_t*)out = h1; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + uint32_t c1 = 0x239b961b; + uint32_t c2 = 0xab0e9789; + uint32_t c3 = 0x38b34ae5; + uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + for(int i = -nblocks; i; i++) { + uint32_t k1 = getblock(blocks,i*4+0); + uint32_t k2 = getblock(blocks,i*4+1); + uint32_t k3 = getblock(blocks,i*4+2); + uint32_t k4 = getblock(blocks,i*4+3); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + h1 ^= k1; + + h1 = ROTL32(h1,19); + h1 += h2; + h1 = h1*5+0x561ccd1b; + + k2 
*= c2; + k2 = ROTL32(k2,16); + k2 *= c3; + h2 ^= k2; + + h2 = ROTL32(h2,17); + h2 += h3; + h2 = h2*5+0x0bcaa747; + + k3 *= c3; + k3 = ROTL32(k3,17); + k3 *= c4; + h3 ^= k3; + + h3 = ROTL32(h3,15); + h3 += h4; + h3 = h3*5+0x96cd1c35; + + k4 *= c4; + k4 = ROTL32(k4,18); + k4 *= c1; + h4 ^= k4; + + h4 = ROTL32(h4,13); + h4 += h1; + h4 = h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) { + case 15: + k4 ^= tail[14] << 16; + case 14: + k4 ^= tail[13] << 8; + case 13: + k4 ^= tail[12] << 0; + k4 *= c4; + k4 = ROTL32(k4,18); + k4 *= c1; + h4 ^= k4; + + case 12: + k3 ^= tail[11] << 24; + case 11: + k3 ^= tail[10] << 16; + case 10: + k3 ^= tail[ 9] << 8; + case 9: + k3 ^= tail[ 8] << 0; + k3 *= c3; + k3 = ROTL32(k3,17); + k3 *= c4; + h3 ^= k3; + + case 8: + k2 ^= tail[ 7] << 24; + case 7: + k2 ^= tail[ 6] << 16; + case 6: + k2 ^= tail[ 5] << 8; + case 5: + k2 ^= tail[ 4] << 0; + k2 *= c2; + k2 = ROTL32(k2,16); + k2 *= c3; + h2 ^= k2; + + case 4: + k1 ^= tail[ 3] << 24; + case 3: + k1 ^= tail[ 2] << 16; + case 2: + k1 ^= tail[ 1] << 8; + case 1: + k1 ^= tail[ 0] << 0; + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + h2 ^= len; + h3 ^= len; + h4 ^= len; + + h1 += h2; + h1 += h3; + h1 += h4; + h2 += h1; + h3 += h1; + h4 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + h3 = fmix(h3); + h4 = fmix(h4); + + h1 += h2; + h1 += h3; + h1 += h4; + h2 += h1; + h3 += h1; + h4 += h1; + + ((uint32_t*)out)[0] = h1; + ((uint32_t*)out)[1] = h2; + ((uint32_t*)out)[2] = h3; + ((uint32_t*)out)[3] = h4; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const int len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint64_t 
h1 = seed; + uint64_t h2 = seed; + + uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + for(int i = 0; i < nblocks; i++) { + uint64_t k1 = getblock(blocks,i*2+0); + uint64_t k2 = getblock(blocks,i*2+1); + + k1 *= c1; + k1 = ROTL64(k1,31); + k1 *= c2; + h1 ^= k1; + + h1 = ROTL64(h1,27); + h1 += h2; + h1 = h1*5+0x52dce729; + + k2 *= c2; + k2 = ROTL64(k2,33); + k2 *= c1; + h2 ^= k2; + + h2 = ROTL64(h2,31); + h2 += h1; + h2 = h2*5+0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) { + case 15: + k2 ^= uint64_t(tail[14]) << 48; + case 14: + k2 ^= uint64_t(tail[13]) << 40; + case 13: + k2 ^= uint64_t(tail[12]) << 32; + case 12: + k2 ^= uint64_t(tail[11]) << 24; + case 11: + k2 ^= uint64_t(tail[10]) << 16; + case 10: + k2 ^= uint64_t(tail[ 9]) << 8; + case 9: + k2 ^= uint64_t(tail[ 8]) << 0; + k2 *= c2; + k2 = ROTL64(k2,33); + k2 *= c1; + h2 ^= k2; + + case 8: + k1 ^= uint64_t(tail[ 7]) << 56; + case 7: + k1 ^= uint64_t(tail[ 6]) << 48; + case 6: + k1 ^= uint64_t(tail[ 5]) << 40; + case 5: + k1 ^= uint64_t(tail[ 4]) << 32; + case 4: + k1 ^= uint64_t(tail[ 3]) << 24; + case 3: + k1 ^= uint64_t(tail[ 2]) << 16; + case 2: + k1 ^= uint64_t(tail[ 1]) << 8; + case 1: + k1 ^= uint64_t(tail[ 0]) << 0; + k1 *= c1; + k1 = ROTL64(k1,31); + k1 *= c2; + h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*)out)[0] = h1; + ((uint64_t*)out)[1] = h2; +} + +//----------------------------------------------------------------------------- + diff --git a/mosesdecoder/moses/TranslationModel/CompactPT/MurmurHash3.h b/mosesdecoder/moses/TranslationModel/CompactPT/MurmurHash3.h new file mode 100644 index 
0000000000000000000000000000000000000000..54e9d3f9e3609d222eb5d0dd9b3e8ebc2bbed498 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/MurmurHash3.h @@ -0,0 +1,37 @@
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.

#ifndef _MURMURHASH3_H_
#define _MURMURHASH3_H_

//-----------------------------------------------------------------------------
// Platform-specific functions and macros

// Microsoft Visual Studio

#if defined(_MSC_VER)

// Old MSVC ships no <stdint.h>; provide the fixed-width types used below.
typedef unsigned char uint8_t;
typedef unsigned long uint32_t;
typedef unsigned __int64 uint64_t;

// Other compilers

#else // defined(_MSC_VER)

#include  // NOTE(review): header name lost in extraction — presumably <stdint.h>

#endif // !defined(_MSC_VER)

//-----------------------------------------------------------------------------

// 32-bit hash, x86-optimised; writes 4 bytes to `out`.
void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out );

// 128-bit hash, x86-optimised; writes 16 bytes to `out`.
void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );

// 128-bit hash, x64-optimised; writes 16 bytes to `out`.
// Note: x86 and x64 variants produce different results for the same input.
void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );

//-----------------------------------------------------------------------------

#endif // _MURMURHASH3_H_
diff --git a/mosesdecoder/moses/TranslationModel/CompactPT/PhraseDecoder.cpp b/mosesdecoder/moses/TranslationModel/CompactPT/PhraseDecoder.cpp new file mode 100644 index 0000000000000000000000000000000000000000..739959d3a3110e2e0e9ed7e61cffcaaa61de808d --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/PhraseDecoder.cpp @@ -0,0 +1,462 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free
Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/

// NOTE(review): throughout this file, template argument lists (<...>) and
// #include targets were lost in extraction; restore from upstream Moses
// before compiling.

#include  // NOTE(review): header name lost in extraction — presumably <deque>

#include "PhraseDecoder.h"
#include "moses/StaticData.h"

using namespace std;

namespace Moses
{

// Decoder for the compact phrase table: reconstructs TargetPhrase
// collections from the Huffman-compressed bit streams produced by
// PhraseTableCreator. Supports three codings (None/REnc/PREnc).
PhraseDecoder::PhraseDecoder(
  PhraseDictionaryCompact &phraseDictionary,
  const std::vector* input,
  const std::vector* output,
  size_t numScoreComponent
  // , const std::vector* weight
)
  : m_coding(None), m_numScoreComponent(numScoreComponent),
    m_containsAlignmentInfo(true), m_maxRank(0),
    m_symbolTree(0), m_multipleScoreTrees(false),
    m_scoreTrees(1), m_alignTree(0),
    m_phraseDictionary(phraseDictionary), m_input(input), m_output(output),
    // m_weight(weight),
    m_separator(" ||| ")
{ }

// Frees the Huffman trees owned by this decoder (allocated in Load()).
PhraseDecoder::~PhraseDecoder()
{
  if(m_symbolTree)
    delete m_symbolTree;

  for(size_t i = 0; i < m_scoreTrees.size(); i++)
    if(m_scoreTrees[i])
      delete m_scoreTrees[i];

  if(m_alignTree)
    delete m_alignTree;
}

// Map a source-side surface string to its REnc symbol id, caching the
// lookup into m_sourceSymbols in m_sourceSymbolsMap.
inline unsigned PhraseDecoder::GetSourceSymbolId(std::string& symbol)
{
  boost::unordered_map::iterator it
    = m_sourceSymbolsMap.find(symbol);
  if(it != m_sourceSymbolsMap.end())
    return it->second;

  size_t idx = m_sourceSymbols.find(symbol);
  m_sourceSymbolsMap[symbol] = idx;
  return idx;
}

// Inverse mapping: target symbol id -> surface string. Out-of-range ids
// yield the sentinel "##ERROR##".
inline std::string PhraseDecoder::GetTargetSymbol(unsigned idx) const
{
  if(idx < m_targetSymbols.size())
    return m_targetSymbols[idx];
  return std::string("##ERROR##");
}

// REnc symbol type is stored in the top 2 bits (result 1..3).
inline size_t PhraseDecoder::GetREncType(unsigned encodedSymbol)
{
  return (encodedSymbol >> 30) + 1;
}

// PREnc symbol type is stored in the top bit (result 1..2).
inline size_t PhraseDecoder::GetPREncType(unsigned encodedSymbol)
{
  return (encodedSymbol >> 31) + 1;
}

// rank-th translation candidate of source symbol srcIdx from the lexical
// table (table laid out as contiguous runs indexed by m_lexicalTableIndex).
inline unsigned PhraseDecoder::GetTranslation(unsigned srcIdx, size_t rank)
{
  size_t srcTrgIdx = m_lexicalTableIndex[srcIdx];
  return m_lexicalTable[srcTrgIdx + rank].second;
}

size_t PhraseDecoder::GetMaxSourcePhraseLength()
{
  return m_maxPhraseLength;
}

// --- Bit-level decoders for the REnc/PREnc symbol layouts. These must
// mirror the encodings written by PhraseTableCreator exactly. ---

inline unsigned PhraseDecoder::DecodeREncSymbol1(unsigned encodedSymbol)
{
  return encodedSymbol &= ~(3 << 30);
}

inline unsigned PhraseDecoder::DecodeREncSymbol2Rank(unsigned encodedSymbol)
{
  return encodedSymbol &= ~(255 << 24);
}

inline unsigned PhraseDecoder::DecodeREncSymbol2Position(unsigned encodedSymbol)
{
  encodedSymbol &= ~(3 << 30);
  encodedSymbol >>= 24;
  return encodedSymbol;
}

inline unsigned PhraseDecoder::DecodeREncSymbol3(unsigned encodedSymbol)
{
  return encodedSymbol &= ~(3 << 30);
}

inline unsigned PhraseDecoder::DecodePREncSymbol1(unsigned encodedSymbol)
{
  return encodedSymbol &= ~(1 << 31);
}

// Left/right offsets are 6-bit fields biased by 32 (range -32..31).
inline int PhraseDecoder::DecodePREncSymbol2Left(unsigned encodedSymbol)
{
  return ((encodedSymbol >> 25) & 63) - 32;
}

inline int PhraseDecoder::DecodePREncSymbol2Right(unsigned encodedSymbol)
{
  return ((encodedSymbol >> 19) & 63) - 32;
}

// Rank occupies the low 19 bits (524287 == 2^19 - 1).
inline unsigned PhraseDecoder::DecodePREncSymbol2Rank(unsigned encodedSymbol)
{
  return (encodedSymbol & 524287);
}

// Read the decoder configuration and all Huffman code trees from `in`
// (must match the layout written by PhraseTableCreator::Save). Returns
// the number of bytes consumed.
size_t PhraseDecoder::Load(std::FILE* in)
{
  size_t start = std::ftell(in);
  size_t read = 0;

  read += std::fread(&m_coding, sizeof(m_coding), 1, in);
  read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, in);
  read += std::fread(&m_containsAlignmentInfo, sizeof(m_containsAlignmentInfo), 1, in);
  read += std::fread(&m_maxRank, sizeof(m_maxRank), 1, in);
  read += std::fread(&m_maxPhraseLength, sizeof(m_maxPhraseLength), 1, in);

  if(m_coding == REnc) {
    // REnc additionally stores the source symbol table and the lexical
    // translation table used for rank-based encoding.
    m_sourceSymbols.load(in);

    size_t size;
    read += std::fread(&size, sizeof(size_t), 1, in);
    m_lexicalTableIndex.resize(size);
    read += std::fread(&m_lexicalTableIndex[0], sizeof(size_t), size, in);

    read += std::fread(&size, sizeof(size_t), 1, in);
    m_lexicalTable.resize(size);
    read += std::fread(&m_lexicalTable[0], sizeof(SrcTrg), size, in);
  }

  m_targetSymbols.load(in);

  m_symbolTree = new CanonicalHuffman(in);

  // Either one shared score tree or one per score component.
  read += std::fread(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, in);
  if(m_multipleScoreTrees) {
    m_scoreTrees.resize(m_numScoreComponent);
    for(size_t i = 0; i < m_numScoreComponent; i++)
      m_scoreTrees[i] = new CanonicalHuffman(in);
  } else {
    m_scoreTrees.resize(1);
    m_scoreTrees[0] = new CanonicalHuffman(in);
  }

  if(m_containsAlignmentInfo)
    m_alignTree = new CanonicalHuffman(in);

  size_t end = std::ftell(in);
  return end - start;
}

// Lookup key for the source-phrase hash: phrase text plus " ||| ".
std::string PhraseDecoder::MakeSourceKey(std::string &source)
{
  return source + m_separator;
}

// Look up `sourcePhrase` in the compact table and decode its target
// phrase collection. Returns a null pointer when the phrase is unknown.
// With PREnc, partially decoded collections are cached and may be
// resumed (bitsLeft) when a deeper decode is needed later.
TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &sourcePhrase, bool topLevel, bool eval)
{

  // Not using TargetPhraseCollection avoiding "new" operator
  // which can introduce heavy locking with multiple threads
  TargetPhraseVectorPtr tpv(new TargetPhraseVector());
  size_t bitsLeft = 0;

  if(m_coding == PREnc) {
    std::pair cachedPhraseColl
      = m_decodingCache.Retrieve(sourcePhrase);

    // Has been cached and is complete or does not need to be completed
    if(cachedPhraseColl.first != NULL && (!topLevel || cachedPhraseColl.second == 0))
      return cachedPhraseColl.first;

    // Has been cached, but is incomplete
    else if(cachedPhraseColl.first != NULL) {
      bitsLeft = cachedPhraseColl.second;
      tpv->resize(cachedPhraseColl.first->size());
      std::copy(cachedPhraseColl.first->begin(),
                cachedPhraseColl.first->end(),
                tpv->begin());
    }
  }

  // Retrieve source phrase identifier
  std::string sourcePhraseString = sourcePhrase.GetStringRep(*m_input);
  size_t sourcePhraseId = m_phraseDictionary.m_hash[MakeSourceKey(sourcePhraseString)];
  /*
  cerr << "sourcePhraseString=" << sourcePhraseString << " "
       << sourcePhraseId
       << endl;
  */
  if(sourcePhraseId != m_phraseDictionary.m_hash.GetSize()) {
    // Retrieve compressed and encoded target phrase collection
    std::string encodedPhraseCollection;
    if(m_phraseDictionary.m_inMemory)
      encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMemory[sourcePhraseId].str();
    else
      encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMapped[sourcePhraseId].str();

    BitWrapper<> encodedBitStream(encodedPhraseCollection);
    if(m_coding == PREnc && bitsLeft)
      encodedBitStream.SeekFromEnd(bitsLeft);

    // Decompress and decode target phrase collection
    TargetPhraseVectorPtr decodedPhraseColl =
      DecodeCollection(tpv, encodedBitStream, sourcePhrase, topLevel, eval);

    return decodedPhraseColl;
  } else
    return TargetPhraseVectorPtr();
}

// State machine over the Huffman bit stream. Each target phrase is decoded
// as: symbols (words / REnc pointers / PREnc sub-phrase pointers) until the
// stop symbol, then scores, then (optionally) alignment points until the
// alignment stop symbol. Returns a null TargetPhraseVectorPtr on any
// consistency-check failure (false positive of the perfect hash).
TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
  TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream,
  const Phrase &sourcePhrase, bool topLevel, bool eval)
{

  bool extending = tpv->size();
  size_t bitsLeft = encodedBitStream.TellFromEnd();

  typedef std::pair AlignPointSizeT;

  // For REnc, pre-resolve each source word to its symbol id once.
  std::vector sourceWords;
  if(m_coding == REnc) {
    for(size_t i = 0; i < sourcePhrase.GetSize(); i++) {
      std::string sourceWord
        = sourcePhrase.GetWord(i).GetString(*m_input, false);
      unsigned idx = GetSourceSymbolId(sourceWord);
      sourceWords.push_back(idx);
    }
  }

  unsigned phraseStopSymbol = 0;
  AlignPoint alignStopSymbol(-1, -1);

  std::vector scores;
  std::set alignment;

  enum DecodeState { New, Symbol, Score, Alignment, Add } state = New;

  size_t srcSize = sourcePhrase.GetSize();

  TargetPhrase* targetPhrase = NULL;
  while(encodedBitStream.TellFromEnd()) {

    if(state == New) {
      // Creating new TargetPhrase on the heap
      tpv->push_back(TargetPhrase());
      targetPhrase = &tpv->back();

      alignment.clear();
      scores.clear();

      state = Symbol;
    }

    if(state == Symbol) {
      unsigned symbol = m_symbolTree->Read(encodedBitStream);
      if(symbol == phraseStopSymbol) {
        state = Score;
      } else {
        if(m_coding == REnc) {
          std::string wordString;
          size_t type = GetREncType(symbol);

          if(type == 1) {
            // Type 1: plain target symbol id.
            unsigned decodedSymbol = DecodeREncSymbol1(symbol);
            wordString = GetTargetSymbol(decodedSymbol);
          } else if (type == 2) {
            // Type 2: rank-encoded translation of an explicit source position.
            size_t rank = DecodeREncSymbol2Rank(symbol);
            size_t srcPos = DecodeREncSymbol2Position(symbol);

            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();

            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo) {
              size_t trgPos = targetPhrase->GetSize();
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          } else if(type == 3) {
            // Type 3: rank-encoded, source position implied by target position.
            size_t rank = DecodeREncSymbol3(symbol);
            size_t srcPos = targetPhrase->GetSize();

            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();

            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo) {
              size_t trgPos = srcPos;
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          }

          Word word;
          word.CreateFromString(Output, *m_output, wordString, false);
          targetPhrase->AddWord(word);
        } else if(m_coding == PREnc) {
          // if the symbol is just a word
          if(GetPREncType(symbol) == 1) {
            unsigned decodedSymbol = DecodePREncSymbol1(symbol);

            Word word;
            word.CreateFromString(Output, *m_output,
                                  GetTargetSymbol(decodedSymbol), false);
            targetPhrase->AddWord(word);
          }
          // if the symbol is a subphrase pointer
          else {
            int left = DecodePREncSymbol2Left(symbol);
            int right = DecodePREncSymbol2Right(symbol);
            unsigned rank = DecodePREncSymbol2Rank(symbol);

            int srcStart = left + targetPhrase->GetSize();
            int srcEnd = srcSize - right - 1;

            // false positive consistency check
            if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize)
              return TargetPhraseVectorPtr();

            // false positive consistency check
            if(m_maxRank && rank > m_maxRank)
              return TargetPhraseVectorPtr();

            // set subphrase by default to itself
            TargetPhraseVectorPtr subTpv = tpv;

            // if range smaller than source phrase retrieve subphrase
            if(unsigned(srcEnd - srcStart + 1) != srcSize) {
              Phrase subPhrase = sourcePhrase.GetSubString(Range(srcStart, srcEnd));
              subTpv = CreateTargetPhraseCollection(subPhrase, false);
            } else {
              // false positive consistency check
              if(rank >= tpv->size()-1)
                return TargetPhraseVectorPtr();
            }

            // false positive consistency check
            if(subTpv != NULL && rank < subTpv->size()) {
              // insert the subphrase into the main target phrase
              TargetPhrase& subTp = subTpv->at(rank);
              if(m_phraseDictionary.m_useAlignmentInfo) {
                // reconstruct the alignment data based on the alignment of the subphrase
                for(AlignmentInfo::const_iterator it = subTp.GetAlignTerm().begin();
                    it != subTp.GetAlignTerm().end(); it++) {
                  alignment.insert(AlignPointSizeT(srcStart + it->first,
                                                   targetPhrase->GetSize() + it->second));
                }
              }
              targetPhrase->Append(subTp);
            } else
              return TargetPhraseVectorPtr();
          }
        } else {
          // Coding == None: symbol is a plain target symbol id.
          Word word;
          word.CreateFromString(Output, *m_output,
                                GetTargetSymbol(symbol), false);
          targetPhrase->AddWord(word);
        }
      }
    } else if(state == Score) {
      // One score per component, either from a shared tree (idx 0) or from
      // the component-specific tree.
      size_t idx = m_multipleScoreTrees ? scores.size() : 0;
      float score = m_scoreTrees[idx]->Read(encodedBitStream);
      scores.push_back(score);

      if(scores.size() == m_numScoreComponent) {
        targetPhrase->GetScoreBreakdown().Assign(&m_phraseDictionary, scores);

        if(m_containsAlignmentInfo)
          state = Alignment;
        else
          state = Add;
      }
    } else if(state == Alignment) {
      AlignPoint alignPoint = m_alignTree->Read(encodedBitStream);
      if(alignPoint == alignStopSymbol) {
        state = Add;
      } else {
        if(m_phraseDictionary.m_useAlignmentInfo)
          alignment.insert(AlignPointSizeT(alignPoint));
      }
    }

    if(state == Add) {
      // One complete target phrase decoded: validate alignment bounds,
      // optionally evaluate features, and decide whether to continue.
      if(m_phraseDictionary.m_useAlignmentInfo) {
        size_t sourceSize = sourcePhrase.GetSize();
        size_t targetSize = targetPhrase->GetSize();
        for(std::set::iterator it = alignment.begin(); it != alignment.end(); it++) {
          if(it->first >= sourceSize || it->second >= targetSize)
            return TargetPhraseVectorPtr();
        }
        targetPhrase->SetAlignTerm(alignment);
      }

      if(eval) {
        targetPhrase->EvaluateInIsolation(sourcePhrase, m_phraseDictionary.GetFeaturesToApply());
      }

      if(m_coding == PREnc) {
        if(!m_maxRank || tpv->size() <= m_maxRank)
          bitsLeft = encodedBitStream.TellFromEnd();

        // Non-top-level decodes only need the first m_maxRank phrases.
        if(!topLevel && m_maxRank && tpv->size() >= m_maxRank)
          break;
      }

      // Fewer than 8 bits left means only padding remains.
      if(encodedBitStream.TellFromEnd() <= 8)
        break;

      state = New;
    }
  }

  if(m_coding == PREnc && !extending) {
    bitsLeft = bitsLeft > 8 ? bitsLeft : 0;
    m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank);
  }

  return tpv;
}

void PhraseDecoder::PruneCache()
{
  m_decodingCache.Prune();
}

}
diff --git a/mosesdecoder/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h b/mosesdecoder/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h new file mode 100644 index 0000000000000000000000000000000000000000..8d984063c14f764965bfd90198856dee658058a7 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h @@ -0,0 +1,94 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/

#ifndef moses_PhraseDictionaryCompact_h
#define moses_PhraseDictionaryCompact_h

// NOTE(review): several #include targets and template argument lists (<...>)
// in this header were lost in extraction; restore from upstream Moses.
#include  // presumably <boost/unordered_map.hpp>

#ifdef WITH_THREADS
#ifdef BOOST_HAS_PTHREADS
#include  // presumably <boost/thread/mutex.hpp>
#endif
#endif

#include "moses/TranslationModel/PhraseDictionary.h"
#include "moses/ThreadPool.h"

#include "BlockHashIndex.h"
#include "StringVector.h"
#include "PhraseDecoder.h"
#include "TargetPhraseCollectionCache.h"

namespace Moses
{

class PhraseDecoder;

// Phrase table backed by the compact (Huffman-compressed, perfect-hashed)
// binary format. The actual bit-stream decoding is delegated to
// PhraseDecoder, which is granted friend access to the members below.
class PhraseDictionaryCompact : public PhraseDictionary
{
protected:
  friend class PhraseDecoder;

  static bool s_inMemoryByDefault;
  // Whether target phrase data is held in memory (vs. memory-mapped).
  bool m_inMemory;
  bool m_useAlignmentInfo;

  typedef std::vector PhraseCache; // NOTE(review): element type lost in extraction
  // Per-thread cache of decoded collections, cleaned up per sentence.
  typedef boost::thread_specific_ptr SentenceCache;
  static SentenceCache m_sentenceCache;

  // Perfect-hash index from source phrase key to collection id.
  BlockHashIndex m_hash;
  PhraseDecoder* m_phraseDecoder;

  // Encoded target phrase collections; exactly one of these is populated,
  // depending on m_inMemory.
  StringVector m_targetPhrasesMapped;
  StringVector m_targetPhrasesMemory;

public:
  PhraseDictionaryCompact(const std::string &line);

  ~PhraseDictionaryCompact();

  void Load(AllOptions::ptr const& opts);

  TargetPhraseCollection::shared_ptr GetTargetPhraseCollectionNonCacheLEGACY(const Phrase &source) const;
  TargetPhraseVectorPtr GetTargetPhraseCollectionRaw(const Phrase &source) const;

  void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase);

  void CacheForCleanup(TargetPhraseCollection::shared_ptr tpc);
  void CleanUpAfterSentenceProcessing(const InputType &source);
  static void SetStaticDefaultParameters(Parameter const& param);

  // Chart-based lookup is not supported by the compact format.
  virtual ChartRuleLookupManager *CreateRuleLookupManager(
    const ChartParser &,
    const ChartCellCollectionBase &,
    std::size_t) {
    assert(false);
    return 0;
  }

  TO_STRING();

};

}
#endif
diff --git
a/mosesdecoder/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp b/mosesdecoder/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8a4bf7d7e9834fa489151f29db2532458e6f477f --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp @@ -0,0 +1,1286 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/

// NOTE(review): throughout this file, template argument lists (<...>) and
// #include targets were lost in extraction; restore from upstream Moses
// before compiling.

#include  // NOTE(review): header name lost in extraction

#include "PhraseTableCreator.h"
#include "ConsistentPhrases.h"
#include "ThrowingFwrite.h"
#include "util/file.hh"
#include "util/exception.hh"

namespace Moses
{

// Orders PackedItem so that larger line numbers come first.
// NOTE(review): this returns true for equal line numbers, so it is NOT a
// strict weak ordering (comp(a,a) must be false) — undefined behaviour if
// used with std::priority_queue/std::sort. The intent appears to be
// `return pi1.GetLine() > pi2.GetLine();` — confirm against callers.
bool operator<(const PackedItem &pi1, const PackedItem &pi2)
{
  if(pi1.GetLine() < pi2.GetLine())
    return false;
  return true;
}

// Sentinel marking the end of a target phrase in the encoded stream, and
// the field separator used in the textual phrase-table format.
std::string PhraseTableCreator::m_phraseStopSymbol = "__SPECIAL_STOP_SYMBOL__";
std::string PhraseTableCreator::m_separator = "|||";

// Drives the whole conversion of a textual phrase table into the compact
// binary format: builds hashes, encodes and Huffman-compresses target
// phrase collections, and writes the result to outPath. Runs 2 passes
// (3 for PREnc) over the input.
PhraseTableCreator::PhraseTableCreator(std::string inPath,
                                       std::string outPath,
                                       std::string tempfilePath,
                                       size_t numScoreComponent,
                                       size_t sortScoreIndex,
                                       Coding coding,
                                       size_t orderBits,
                                       size_t fingerPrintBits,
                                       bool useAlignmentInfo,
                                       bool multipleScoreTrees,
                                       size_t quantize,
                                       size_t maxRank,
                                       bool warnMe
#ifdef WITH_THREADS
                                       , size_t threads
#endif
                                      )
  : m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath),
    m_outFile(std::fopen(m_outPath.c_str(), "w")), m_numScoreComponent(numScoreComponent),
    m_sortScoreIndex(sortScoreIndex), m_warnMe(warnMe),
    m_coding(coding), m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
    m_useAlignmentInfo(useAlignmentInfo),
    m_multipleScoreTrees(multipleScoreTrees),
    m_quantize(quantize), m_maxRank(maxRank),
#ifdef WITH_THREADS
    m_threads(threads),
    m_srcHash(m_orderBits, m_fingerPrintBits, 1),
    m_rnkHash(10, 24, m_threads),
#else
    m_srcHash(m_orderBits, m_fingerPrintBits),
    m_rnkHash(m_orderBits, m_fingerPrintBits),
#endif
    m_maxPhraseLength(0),
    m_lastFlushedLine(-1), m_lastFlushedSourceNum(0),
    m_lastFlushedSourcePhrase("")
{
  PrintInfo();

  AddTargetSymbolId(m_phraseStopSymbol);

  size_t cur_pass = 1;
  size_t all_passes = 2;
  if(m_coding == PREnc)
    all_passes = 3;

  // One score counter/tree per component, or a single shared one.
  m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
  for(std::vector::iterator it = m_scoreCounters.begin();
      it != m_scoreCounters.end(); it++)
    *it = new ScoreCounter();
  m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);

  // 0th pass
  if(m_coding == REnc) {
    // REnc needs the lexical translation table (lex.f2e) next to the input.
    size_t found = inPath.find_last_of("/\\");
    std::string path;
    if(found != std::string::npos)
      path = inPath.substr(0, found);
    else
      path = ".";
    LoadLexicalTable(path + "/lex.f2e");
  } else if(m_coding == PREnc) {
    std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating hash function for rank assignment" << std::endl;
    cur_pass++;
    CreateRankHash();
  }

  // 1st pass
  std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating source phrase index + Encoding target phrases" << std::endl;
  m_srcHash.BeginSave(m_outFile);

  if(tempfilePath.size()) {
    MmapAllocator allocEncoded(util::FMakeTemp(tempfilePath));
    m_encodedTargetPhrases = new StringVectorTemp(allocEncoded);
  } else {
    m_encodedTargetPhrases = new StringVectorTemp();
  }
  EncodeTargetPhrases();

  cur_pass++;

  std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl;
  CalcHuffmanCodes();

  // 2nd pass
  std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Compressing target phrases" << std::endl;

  if(tempfilePath.size()) {
    MmapAllocator allocCompressed(util::FMakeTemp(tempfilePath));
    m_compressedTargetPhrases = new StringVector(allocCompressed);
  } else {
    m_compressedTargetPhrases = new StringVector(true);
  }
  CompressTargetPhrases();

  std::cerr << "Saving to " << m_outPath << std::endl;
  Save();
  std::cerr << "Done" << std::endl;
  std::fclose(m_outFile);
}

// Frees the Huffman trees, counters, and encoded phrase stores owned by
// the creator.
PhraseTableCreator::~PhraseTableCreator()
{
  delete m_symbolTree;
  if(m_useAlignmentInfo)
    delete m_alignTree;
  for(size_t i = 0; i < m_scoreTrees.size(); i++) {
    delete m_scoreTrees[i];
    delete m_scoreCounters[i];
  }

  delete m_encodedTargetPhrases;
  delete m_compressedTargetPhrases;
}

// Report the selected configuration on stderr before processing starts.
void PhraseTableCreator::PrintInfo()
{
  std::string encodings[3] = {"Huffman", "Huffman + REnc", "Huffman + PREnc"};

  std::cerr << "Used options:" << std::endl;
  std::cerr << "\tText phrase table will be read from: " << m_inPath << std::endl;
  std::cerr << "\tOutput phrase table will be written to: " << m_outPath << std::endl;
  std::cerr << "\tStep size for source landmark phrases: 2^" << m_orderBits << "=" << (1ul << m_orderBits) << std::endl;
  std::cerr << "\tSource phrase fingerprint size: " << m_fingerPrintBits << " bits / P(fp)=" << (float(1)/(1ul << m_fingerPrintBits)) << std::endl;
  std::cerr << "\tSelected target phrase encoding: " << encodings[m_coding] << std::endl;
  if(m_coding == PREnc) {
    // "Maxiumum" [sic] — runtime string kept byte-identical.
    std::cerr << "\tMaxiumum allowed rank for PREnc: ";
    if(!m_maxRank)
      std::cerr << "unlimited" << std::endl;
    else
      std::cerr << m_maxRank << std::endl;
  }
  std::cerr << "\tNumber of score components in phrase table: " << m_numScoreComponent << std::endl;
  std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl;
  std::cerr << "\tUsing score quantization: ";
  if(m_quantize)
    std::cerr << m_quantize << " best" << std::endl;
  else
    std::cerr << "no" << std::endl;
  std::cerr << "\tExplicitly included alignment information: " << (m_useAlignmentInfo ? "yes" : "no") << std::endl;

#ifdef WITH_THREADS
  std::cerr << "\tRunning with " << m_threads << " threads" << std::endl;
#endif
  std::cerr << std::endl;
}

// Write configuration, symbol tables, Huffman trees and the compressed
// phrase collections to m_outFile. Must stay in sync with
// PhraseDecoder::Load.
void PhraseTableCreator::Save()
{
  // Save type of encoding
  ThrowingFwrite(&m_coding, sizeof(m_coding), 1, m_outFile);
  ThrowingFwrite(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, m_outFile);
  ThrowingFwrite(&m_useAlignmentInfo, sizeof(m_useAlignmentInfo), 1, m_outFile);
  ThrowingFwrite(&m_maxRank, sizeof(m_maxRank), 1, m_outFile);
  ThrowingFwrite(&m_maxPhraseLength, sizeof(m_maxPhraseLength), 1, m_outFile);

  if(m_coding == REnc) {
    // Save source language symbols for REnc
    std::vector temp1;
    temp1.resize(m_sourceSymbolsMap.size());
    for(boost::unordered_map::iterator it
        = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++)
      temp1[it->second] = it->first;
    std::sort(temp1.begin(), temp1.end());
    StringVector sourceSymbols(true);
    for(std::vector::iterator it = temp1.begin();
        it != temp1.end(); it++)
      sourceSymbols.push_back(*it);
    sourceSymbols.save(m_outFile);

    // Save lexical translation table for REnc
    size_t size = m_lexicalTableIndex.size();
    ThrowingFwrite(&size, sizeof(size_t), 1, m_outFile);
    ThrowingFwrite(&m_lexicalTableIndex[0], sizeof(size_t), size, m_outFile);
    size = m_lexicalTable.size();
    ThrowingFwrite(&size, sizeof(size_t), 1, m_outFile);
    ThrowingFwrite(&m_lexicalTable[0], sizeof(SrcTrg), size, m_outFile);
  }

  // Save target language symbols
  std::vector temp2;
  temp2.resize(m_targetSymbolsMap.size());
  for(boost::unordered_map::iterator it
      = m_targetSymbolsMap.begin(); it != m_targetSymbolsMap.end(); it++)
    temp2[it->second] = it->first;
  StringVector targetSymbols(true);
  for(std::vector::iterator it = temp2.begin();
      it != temp2.end(); it++)
    targetSymbols.push_back(*it);
  targetSymbols.save(m_outFile);

  // Save Huffman codes for target language symbols
  m_symbolTree->Save(m_outFile);

  // Save number of Huffman code sets for scores and
  // save Huffman code sets
  ThrowingFwrite(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, m_outFile);
  size_t numScoreTrees = m_scoreTrees.size();
  for(size_t i = 0; i < numScoreTrees; i++)
    m_scoreTrees[i]->Save(m_outFile);

  // Save Huffman codes for alignments
  if(m_useAlignmentInfo)
    m_alignTree->Save(m_outFile);

  // Save compressed target phrase collections
  m_compressedTargetPhrases->save(m_outFile);
}

// Read the lex.f2e translation lexicon, assign symbol ids, and build
// m_lexicalTable / m_lexicalTableIndex: per source word, its candidate
// target words ordered by decreasing translation probability (rank order).
void PhraseTableCreator::LoadLexicalTable(std::string filePath)
{
  std::vector t_lexTable;

  std::cerr << "Reading in lexical table for Rank Encoding" << std::endl;
  std::ifstream lexIn(filePath.c_str(), std::ifstream::in);
  std::string src, trg;
  float prob;

  // Reading in the translation probability lexicon

  std::cerr << "\tLoading from " << filePath << std::endl;
  while(lexIn >> trg >> src >> prob) {
    t_lexTable.push_back(SrcTrgProb(SrcTrgString(src, trg), prob));
    AddSourceSymbolId(src);
    AddTargetSymbolId(trg);
  }

  // Sorting lexicon by source words by lexicographical order, corresponding
  // target words by decreasing probability.

  std::cerr << "\tSorting according to translation rank" << std::endl;
  std::sort(t_lexTable.begin(), t_lexTable.end(), SrcTrgProbSorter());

  // Re-assigning source word ids in lexicographical order

  std::vector temp1;
  temp1.resize(m_sourceSymbolsMap.size());
  for(boost::unordered_map::iterator it
      = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++)
    temp1[it->second] = it->first;

  std::sort(temp1.begin(), temp1.end());

  for(size_t i = 0; i < temp1.size(); i++)
    m_sourceSymbolsMap[temp1[i]] = i;

  // Building the lexicon based on source and target word ids

  std::string srcWord = "";
  size_t srcIdx = 0;
  for(std::vector::iterator it = t_lexTable.begin();
      it != t_lexTable.end(); it++) {
    // If we encounter a new source word
    if(it->first.first != srcWord) {
      srcIdx = GetSourceSymbolId(it->first.first);

      // Store position of first translation
      if(srcIdx >= m_lexicalTableIndex.size())
        m_lexicalTableIndex.resize(srcIdx + 1);
      m_lexicalTableIndex[srcIdx] = m_lexicalTable.size();
    }

    // Store pair of source word and target word
    size_t trgIdx = GetTargetSymbolId(it->first.second);
    m_lexicalTable.push_back(SrcTrg(srcIdx, trgIdx));

    srcWord = it->first.first;
  }
  std::cerr << "\tLoaded " << m_lexicalTable.size() << " lexical pairs" << std::endl;
  std::cerr << std::endl;
}

// PREnc pass 1: scan the whole input once to build the rank hash.
// NOTE(review): in the threaded branch each RankingTask* is never deleted
// after join_all() — apparent leak; the non-threaded branch does delete it.
void PhraseTableCreator::CreateRankHash()
{
  InputFileStream inFile(m_inPath);

#ifdef WITH_THREADS
  boost::thread_group threads;
  for (size_t i = 0; i < m_threads; ++i) {
    RankingTask* rt = new RankingTask(inFile, *this);
    threads.create_thread(*rt);
  }
  threads.join_all();
#else
  RankingTask* rt = new RankingTask(inFile, *this);
  (*rt)();
  delete rt;
#endif
  FlushRankedQueue(true);
}

// Hash key for a source phrase: "<phrase> ||| " (trailing separator).
inline std::string PhraseTableCreator::MakeSourceKey(std::string &source)
{
  return source + " " + m_separator + " ";
}

inline std::string PhraseTableCreator::MakeSourceTargetKey(std::string &source, std::string &target)
{
  // NOTE(review): the remainder of this function lies beyond this chunk.
return source + " " + m_separator + " " + target + " " + m_separator + " "; +} + +void PhraseTableCreator::EncodeTargetPhrases() +{ + InputFileStream inFile(m_inPath); + +#ifdef WITH_THREADS + boost::thread_group threads; + for (size_t i = 0; i < m_threads; ++i) { + EncodingTask* et = new EncodingTask(inFile, *this); + threads.create_thread(*et); + } + threads.join_all(); +#else + EncodingTask* et = new EncodingTask(inFile, *this); + (*et)(); + delete et; +#endif + FlushEncodedQueue(true); +} + + +void PhraseTableCreator::CompressTargetPhrases() +{ +#ifdef WITH_THREADS + boost::thread_group threads; + for (size_t i = 0; i < m_threads; ++i) { + CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this); + threads.create_thread(*ct); + } + threads.join_all(); +#else + CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this); + (*ct)(); + delete ct; +#endif + FlushCompressedQueue(true); +} + +void PhraseTableCreator::CalcHuffmanCodes() +{ + std::cerr << "\tCreating Huffman codes for " << m_symbolCounter.Size() + << " target phrase symbols" << std::endl; + + m_symbolTree = new SymbolTree(m_symbolCounter.Begin(), + m_symbolCounter.End()); + + std::vector::iterator treeIt = m_scoreTrees.begin(); + for(std::vector::iterator it = m_scoreCounters.begin(); + it != m_scoreCounters.end(); it++) { + if(m_quantize) + (*it)->Quantize(m_quantize); + + std::cerr << "\tCreating Huffman codes for " << (*it)->Size() + << " scores" << std::endl; + + *treeIt = new ScoreTree((*it)->Begin(), (*it)->End()); + treeIt++; + } + + if(m_useAlignmentInfo) { + std::cerr << "\tCreating Huffman codes for " << m_alignCounter.Size() + << " alignment points" << std::endl; + m_alignTree = new AlignTree(m_alignCounter.Begin(), m_alignCounter.End()); + } + std::cerr << std::endl; +} + + +void PhraseTableCreator::AddSourceSymbolId(std::string& symbol) +{ +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + + if(m_sourceSymbolsMap.count(symbol) == 0) { + 
unsigned value = m_sourceSymbolsMap.size(); + m_sourceSymbolsMap[symbol] = value; + } +} + +void PhraseTableCreator::AddTargetSymbolId(std::string& symbol) +{ +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + if(m_targetSymbolsMap.count(symbol) == 0) { + unsigned value = m_targetSymbolsMap.size(); + m_targetSymbolsMap[symbol] = value; + } +} + +unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol) +{ +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + boost::unordered_map::iterator it + = m_sourceSymbolsMap.find(symbol); + + if(it != m_sourceSymbolsMap.end()) + return it->second; + else + return m_sourceSymbolsMap.size(); +} + +unsigned PhraseTableCreator::GetTargetSymbolId(std::string& symbol) +{ +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + boost::unordered_map::iterator it + = m_targetSymbolsMap.find(symbol); + + UTIL_THROW_IF2(it == m_targetSymbolsMap.end(), "No id found for target symbol: " << symbol); + return it->second; +} + +unsigned PhraseTableCreator::GetOrAddTargetSymbolId(std::string& symbol) +{ +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + boost::unordered_map::iterator it + = m_targetSymbolsMap.find(symbol); + + if(it != m_targetSymbolsMap.end()) + return it->second; + else { + unsigned value = m_targetSymbolsMap.size(); + m_targetSymbolsMap[symbol] = value; + return value; + } +} + +unsigned PhraseTableCreator::GetRank(unsigned srcIdx, unsigned trgIdx) +{ + size_t srcTrgIdx = m_lexicalTableIndex[srcIdx]; + while(srcTrgIdx < m_lexicalTable.size() + && srcIdx == m_lexicalTable[srcTrgIdx].first + && m_lexicalTable[srcTrgIdx].second != trgIdx) + srcTrgIdx++; + + if(srcTrgIdx < m_lexicalTable.size() + && m_lexicalTable[srcTrgIdx].second == trgIdx) + return srcTrgIdx - m_lexicalTableIndex[srcIdx]; + else + return m_lexicalTable.size(); +} + +unsigned PhraseTableCreator::EncodeREncSymbol1(unsigned trgIdx) +{ + assert((~(1 << 31)) > trgIdx); + 
return trgIdx; +} + +unsigned PhraseTableCreator::EncodeREncSymbol2(unsigned pos, unsigned rank) +{ + unsigned symbol = rank; + symbol |= 1 << 30; + symbol |= pos << 24; + return symbol; +} + +unsigned PhraseTableCreator::EncodeREncSymbol3(unsigned rank) +{ + unsigned symbol = rank; + symbol |= 2 << 30; + return symbol; +} + +unsigned PhraseTableCreator::EncodePREncSymbol1(unsigned trgIdx) +{ + assert((~(1 << 31)) > trgIdx); + return trgIdx; +} + +unsigned PhraseTableCreator::EncodePREncSymbol2(int left, int right, unsigned rank) +{ + // "left" and "right" must be smaller than 2^5 + // "rank" must be smaller than 2^19 + left = left + 32; + right = right + 32; + + assert(64 > left); + assert(64 > right); + assert(524288 > rank); + + unsigned symbol = 0; + symbol |= 1 << 31; + symbol |= left << 25; + symbol |= right << 19; + symbol |= rank; + return symbol; +} + +void PhraseTableCreator::EncodeTargetPhraseNone(std::vector& t, + std::ostream& os) +{ + std::stringstream encodedTargetPhrase; + size_t j = 0; + while(j < t.size()) { + unsigned targetSymbolId = GetOrAddTargetSymbolId(t[j]); + + m_symbolCounter.Increase(targetSymbolId); + os.write((char*)&targetSymbolId, sizeof(targetSymbolId)); + j++; + } + + unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol); + os.write((char*)&stopSymbolId, sizeof(stopSymbolId)); + m_symbolCounter.Increase(stopSymbolId); +} + +void PhraseTableCreator::EncodeTargetPhraseREnc(std::vector& s, + std::vector& t, + std::set& a, + std::ostream& os) +{ + std::stringstream encodedTargetPhrase; + + std::vector > a2(t.size()); + for(std::set::iterator it = a.begin(); it != a.end(); it++) + a2[it->second].push_back(it->first); + + for(size_t i = 0; i < t.size(); i++) { + unsigned idxTarget = GetOrAddTargetSymbolId(t[i]); + unsigned encodedSymbol = -1; + + unsigned bestSrcPos = s.size(); + unsigned bestDiff = s.size(); + unsigned bestRank = m_lexicalTable.size(); + unsigned badRank = m_lexicalTable.size(); + + for(std::vector::iterator it 
= a2[i].begin(); it != a2[i].end(); it++) { + unsigned idxSource = GetSourceSymbolId(s[*it]); + size_t r = GetRank(idxSource, idxTarget); + if(r != badRank) { + if(r < bestRank) { + bestRank = r; + bestSrcPos = *it; + bestDiff = abs((long)*it-(long)i); + } else if(r == bestRank && unsigned(abs((long)*it-(long)i)) < bestDiff) { + bestSrcPos = *it; + bestDiff = abs((long)*it-(long)i); + } + } + } + + if(bestRank != badRank && bestSrcPos < s.size()) { + if(bestSrcPos == i) + encodedSymbol = EncodeREncSymbol3(bestRank); + else + encodedSymbol = EncodeREncSymbol2(bestSrcPos, bestRank); + a.erase(AlignPoint(bestSrcPos, i)); + } else { + encodedSymbol = EncodeREncSymbol1(idxTarget); + } + + os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); + m_symbolCounter.Increase(encodedSymbol); + } + + unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol); + unsigned encodedSymbol = EncodeREncSymbol1(stopSymbolId); + os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); + m_symbolCounter.Increase(encodedSymbol); +} + +void PhraseTableCreator::EncodeTargetPhrasePREnc(std::vector& s, + std::vector& t, + std::set& a, + size_t ownRank, + std::ostream& os) +{ + std::vector encodedSymbols(t.size()); + std::vector encodedSymbolsLengths(t.size(), 0); + + ConsistentPhrases cp(s.size(), t.size(), a); + while(!cp.Empty()) { + ConsistentPhrases::Phrase p = cp.Pop(); + + std::stringstream key1; + key1 << s[p.i]; + for(int i = p.i+1; i < p.i+p.m; i++) + key1 << " " << s[i]; + + std::stringstream key2; + key2 << t[p.j]; + for(int i = p.j+1; i < p.j+p.n; i++) + key2 << " " << t[i]; + + int rank = -1; + std::string key1Str = key1.str(), key2Str = key2.str(); + size_t idx = m_rnkHash[MakeSourceTargetKey(key1Str, key2Str)]; + if(idx != m_rnkHash.GetSize()) + rank = m_ranks[idx]; + + if(rank >= 0 && (m_maxRank == 0 || unsigned(rank) < m_maxRank)) { + if(unsigned(p.m) != s.size() || unsigned(rank) < ownRank) { + std::stringstream encodedSymbol; + encodedSymbols[p.j] = 
EncodePREncSymbol2(p.i-p.j, s.size()-(p.i+p.m), rank); + encodedSymbolsLengths[p.j] = p.n; + + std::set tAlignment; + for(std::set::iterator it = a.begin(); + it != a.end(); it++) + if(it->first < p.i || it->first >= p.i + p.m + || it->second < p.j || it->second >= p.j + p.n) + tAlignment.insert(*it); + a = tAlignment; + cp.RemoveOverlap(p); + } + } + } + + std::stringstream encodedTargetPhrase; + + size_t j = 0; + while(j < t.size()) { + if(encodedSymbolsLengths[j] > 0) { + unsigned encodedSymbol = encodedSymbols[j]; + m_symbolCounter.Increase(encodedSymbol); + os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); + j += encodedSymbolsLengths[j]; + } else { + unsigned targetSymbolId = GetOrAddTargetSymbolId(t[j]); + unsigned encodedSymbol = EncodePREncSymbol1(targetSymbolId); + m_symbolCounter.Increase(encodedSymbol); + os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); + j++; + } + } + + unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol); + unsigned encodedSymbol = EncodePREncSymbol1(stopSymbolId); + os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); + m_symbolCounter.Increase(encodedSymbol); +} + +void PhraseTableCreator::EncodeScores(std::vector& scores, std::ostream& os) +{ + size_t c = 0; + float score; + + while(c < scores.size()) { + score = scores[c]; + score = FloorScore(TransformScore(score)); + os.write((char*)&score, sizeof(score)); + m_scoreCounters[m_multipleScoreTrees ? 
c : 0]->Increase(score); + c++; + } +} + +void PhraseTableCreator::EncodeAlignment(std::set& alignment, + std::ostream& os) +{ + for(std::set::iterator it = alignment.begin(); + it != alignment.end(); it++) { + os.write((char*)&(*it), sizeof(AlignPoint)); + m_alignCounter.Increase(*it); + } + AlignPoint stop(-1, -1); + os.write((char*) &stop, sizeof(AlignPoint)); + m_alignCounter.Increase(stop); +} + +std::string PhraseTableCreator::EncodeLine(std::vector& tokens, size_t ownRank) +{ + std::string sourcePhraseStr = tokens[0]; + std::string targetPhraseStr = tokens[1]; + std::string scoresStr = tokens[2]; + + std::string alignmentStr = ""; + if(tokens.size() > 3) + alignmentStr = tokens[3]; + + std::vector s = Tokenize(sourcePhraseStr); + + size_t phraseLength = s.size(); + if(m_maxPhraseLength < phraseLength) + m_maxPhraseLength = phraseLength; + + std::vector t = Tokenize(targetPhraseStr); + std::vector scores = Tokenize(scoresStr); + + if(scores.size() != m_numScoreComponent) { + std::stringstream strme; + strme << "Error: Wrong number of scores detected (" + << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl; + strme << "Line: " << tokens[0] << " ||| " << tokens[1] << " ||| " << tokens[2] << " ..." 
<< std::endl; + UTIL_THROW2(strme.str()); + } + + std::set a; + if(m_coding != None || m_useAlignmentInfo) { + std::vector positions = Tokenize(alignmentStr, " \t-"); + for(size_t i = 0; i < positions.size(); i += 2) { + a.insert(AlignPoint(positions[i], positions[i+1])); + } + } + + std::stringstream encodedTargetPhrase; + + if(m_coding == PREnc) { + EncodeTargetPhrasePREnc(s, t, a, ownRank, encodedTargetPhrase); + } else if(m_coding == REnc) { + EncodeTargetPhraseREnc(s, t, a, encodedTargetPhrase); + } else { + EncodeTargetPhraseNone(t, encodedTargetPhrase); + } + + EncodeScores(scores, encodedTargetPhrase); + + if(m_useAlignmentInfo) + EncodeAlignment(a, encodedTargetPhrase); + + return encodedTargetPhrase.str(); +} + +std::string PhraseTableCreator::CompressEncodedCollection(std::string encodedCollection) +{ + enum EncodeState { + ReadSymbol, ReadScore, ReadAlignment, + EncodeSymbol, EncodeScore, EncodeAlignment + }; + EncodeState state = ReadSymbol; + + unsigned phraseStopSymbolId; + if(m_coding == REnc) + phraseStopSymbolId = EncodeREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol)); + else if(m_coding == PREnc) + phraseStopSymbolId = EncodePREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol)); + else + phraseStopSymbolId = GetTargetSymbolId(m_phraseStopSymbol); + AlignPoint alignStopSymbol(-1, -1); + + std::stringstream encodedStream(encodedCollection); + encodedStream.unsetf(std::ios::skipws); + + std::string compressedEncodedCollection; + BitWrapper<> bitStream(compressedEncodedCollection); + + unsigned symbol; + float score; + size_t currScore = 0; + AlignPoint alignPoint; + + while(encodedStream) { + switch(state) { + case ReadSymbol: + encodedStream.read((char*) &symbol, sizeof(unsigned)); + state = EncodeSymbol; + break; + case ReadScore: + if(currScore == m_numScoreComponent) { + currScore = 0; + if(m_useAlignmentInfo) + state = ReadAlignment; + else + state = ReadSymbol; + } else { + encodedStream.read((char*) &score, sizeof(float)); + currScore++; + 
state = EncodeScore; + } + break; + case ReadAlignment: + encodedStream.read((char*) &alignPoint, sizeof(AlignPoint)); + state = EncodeAlignment; + break; + + case EncodeSymbol: + state = (symbol == phraseStopSymbolId) ? ReadScore : ReadSymbol; + m_symbolTree->Put(bitStream, symbol); + break; + case EncodeScore: { + state = ReadScore; + size_t idx = m_multipleScoreTrees ? currScore-1 : 0; + if(m_quantize) + score = m_scoreCounters[idx]->LowerBound(score); + m_scoreTrees[idx]->Put(bitStream, score); + } + break; + case EncodeAlignment: + state = (alignPoint == alignStopSymbol) ? ReadSymbol : ReadAlignment; + m_alignTree->Put(bitStream, alignPoint); + break; + } + } + + return compressedEncodedCollection; +} + +void PhraseTableCreator::AddRankedLine(PackedItem& pi) +{ + m_queue.push(pi); +} + +void PhraseTableCreator::FlushRankedQueue(bool force) +{ + size_t step = 1ul << 10; + + while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) { + m_lastFlushedLine++; + + PackedItem pi = m_queue.top(); + m_queue.pop(); + + if(m_lastSourceRange.size() == step) { + m_rnkHash.AddRange(m_lastSourceRange); + m_lastSourceRange.clear(); + } + + if(m_lastFlushedSourcePhrase != pi.GetSrc()) { + if(m_rankQueue.size()) { + m_lastFlushedSourceNum++; + if(m_lastFlushedSourceNum % 100000 == 0) { + std::cerr << "."; + } + if(m_lastFlushedSourceNum % 5000000 == 0) { + std::cerr << "[" << m_lastFlushedSourceNum << "]" << std::endl; + } + + m_ranks.resize(m_lastFlushedLine + 1); + int r = 0; + while(!m_rankQueue.empty()) { + m_ranks[m_rankQueue.top().second] = r++; + m_rankQueue.pop(); + } + } + } + + m_lastSourceRange.push_back(pi.GetTrg()); + + m_rankQueue.push(std::make_pair(pi.GetScore(), pi.GetLine())); + m_lastFlushedSourcePhrase = pi.GetSrc(); + } + + if(force) { + if(!m_lastSourceRange.empty()) { + m_rnkHash.AddRange(m_lastSourceRange); + m_lastSourceRange.clear(); + } + +#ifdef WITH_THREADS + m_rnkHash.WaitAll(); +#endif + + m_ranks.resize(m_lastFlushedLine + 1); + 
int r = 0; + while(!m_rankQueue.empty()) { + m_ranks[m_rankQueue.top().second] = r++; + m_rankQueue.pop(); + } + + m_lastFlushedLine = -1; + m_lastFlushedSourceNum = 0; + + std::cerr << std::endl << std::endl; + } +} + + +void PhraseTableCreator::AddEncodedLine(PackedItem& pi) +{ + m_queue.push(pi); +} + +void PhraseTableCreator::FlushEncodedQueue(bool force) +{ + while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) { + PackedItem pi = m_queue.top(); + m_queue.pop(); + m_lastFlushedLine++; + + if(m_lastFlushedSourcePhrase != pi.GetSrc()) { + if(m_lastCollection.size()) { + std::stringstream targetPhraseCollection; + for(std::vector::iterator it = + m_lastCollection.begin(); it != m_lastCollection.end(); it++) + targetPhraseCollection << *it; + + m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase)); + m_encodedTargetPhrases->push_back(targetPhraseCollection.str()); + + m_lastFlushedSourceNum++; + if(m_lastFlushedSourceNum % 100000 == 0) + std::cerr << "."; + if(m_lastFlushedSourceNum % 5000000 == 0) + std::cerr << "[" << m_lastFlushedSourceNum << "]" << std::endl; + + m_lastCollection.clear(); + } + } + + if(m_lastSourceRange.size() == (1ul << m_orderBits)) { + m_srcHash.AddRange(m_lastSourceRange); + m_srcHash.SaveLastRange(); + m_srcHash.DropLastRange(); + m_lastSourceRange.clear(); + } + + m_lastFlushedSourcePhrase = pi.GetSrc(); + if(m_coding == PREnc) { + if(m_lastCollection.size() <= pi.GetRank()) + m_lastCollection.resize(pi.GetRank() + 1); + m_lastCollection[pi.GetRank()] = pi.GetTrg(); + } else { + m_lastCollection.push_back(pi.GetTrg()); + } + } + + if(force) { + if(!m_lastSourceRange.size() || m_lastSourceRange.back() != m_lastFlushedSourcePhrase) + m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase)); + + if(m_lastCollection.size()) { + std::stringstream targetPhraseCollection; + for(std::vector::iterator it = + m_lastCollection.begin(); it != m_lastCollection.end(); it++) + targetPhraseCollection 
<< *it; + + m_encodedTargetPhrases->push_back(targetPhraseCollection.str()); + m_lastCollection.clear(); + } + + if(!m_lastSourceRange.empty()) { + m_srcHash.AddRange(m_lastSourceRange); + m_lastSourceRange.clear(); + } + +#ifdef WITH_THREADS + m_srcHash.WaitAll(); +#endif + + m_srcHash.SaveLastRange(); + m_srcHash.DropLastRange(); + m_srcHash.FinalizeSave(); + + m_lastFlushedLine = -1; + m_lastFlushedSourceNum = 0; + + std::cerr << std::endl << std::endl; + } +} + +void PhraseTableCreator::AddCompressedCollection(PackedItem& pi) +{ + m_queue.push(pi); +} + +void PhraseTableCreator::FlushCompressedQueue(bool force) +{ + if(force || m_queue.size() > 10000) { + while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) { + PackedItem pi = m_queue.top(); + m_queue.pop(); + m_lastFlushedLine++; + + m_compressedTargetPhrases->push_back(pi.GetTrg()); + + if((pi.GetLine()+1) % 100000 == 0) + std::cerr << "."; + if((pi.GetLine()+1) % 5000000 == 0) + std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl; + } + } + + if(force) { + m_lastFlushedLine = -1; + std::cerr << std::endl << std::endl; + } +} + +//****************************************************************************// + +size_t RankingTask::m_lineNum = 0; +#ifdef WITH_THREADS +boost::mutex RankingTask::m_mutex; +boost::mutex RankingTask::m_fileMutex; +#endif + +RankingTask::RankingTask(InputFileStream& inFile, PhraseTableCreator& creator) + : m_inFile(inFile), m_creator(creator) {} + +void RankingTask::operator()() +{ + size_t lineNum = 0; + + std::vector lines; + size_t max_lines = 1000; + lines.reserve(max_lines); + + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_fileMutex); +#endif + std::string line; + while(lines.size() < max_lines && std::getline(m_inFile, line)) + lines.push_back(line); + lineNum = m_lineNum; + m_lineNum += lines.size(); + } + + std::vector result; + result.reserve(max_lines); + + while(lines.size()) { + for(size_t i = 0; i < lines.size(); i++) { + 
std::vector tokens; + Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator); + + for(std::vector::iterator it = tokens.begin(); it != tokens.end(); it++) + *it = Moses::Trim(*it); + + if(tokens.size() < 4) { + std::stringstream strme; + strme << "Error: It seems the following line has a wrong format:" << std::endl; + strme << "Line " << i << ": " << lines[i] << std::endl; + UTIL_THROW2(strme.str()); + } + + if(tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) { + std::stringstream strme; + strme << "Error: It seems the following line contains no alignment information, " << std::endl; + strme << "but you are using "; + strme << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc"); + strme << " encoding which makes use of alignment data. " << std::endl; + strme << "Use -encoding None" << std::endl; + strme << "Line " << i << ": " << lines[i] << std::endl; + UTIL_THROW2(strme.str()); + } + + std::vector scores = Tokenize(tokens[2]); + if(scores.size() != m_creator.m_numScoreComponent) { + std::stringstream strme; + strme << "Error: It seems the following line has a wrong number of scores (" + << scores.size() << " != " << m_creator.m_numScoreComponent << ") :" << std::endl; + strme << "Line " << i << ": " << lines[i] << std::endl; + UTIL_THROW2(strme.str()); + } + + float sortScore = scores[m_creator.m_sortScoreIndex]; + + std::string key1 = m_creator.MakeSourceKey(tokens[0]); + std::string key2 = m_creator.MakeSourceTargetKey(tokens[0], tokens[1]); + + PackedItem packedItem(lineNum + i, key1, key2, 0, sortScore); + result.push_back(packedItem); + } + lines.clear(); + + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + for(size_t i = 0; i < result.size(); i++) + m_creator.AddRankedLine(result[i]); + m_creator.FlushRankedQueue(); + } + + result.clear(); + lines.reserve(max_lines); + result.reserve(max_lines); + +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_fileMutex); 
+#endif + std::string line; + while(lines.size() < max_lines && std::getline(m_inFile, line)) + lines.push_back(line); + lineNum = m_lineNum; + m_lineNum += lines.size(); + } +} + +size_t EncodingTask::m_lineNum = 0; +#ifdef WITH_THREADS +boost::mutex EncodingTask::m_mutex; +boost::mutex EncodingTask::m_fileMutex; +#endif + +EncodingTask::EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator) + : m_inFile(inFile), m_creator(creator) {} + +void EncodingTask::operator()() +{ + size_t lineNum = 0; + + std::vector lines; + size_t max_lines = 1000; + lines.reserve(max_lines); + + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_fileMutex); +#endif + std::string line; + while(lines.size() < max_lines && std::getline(m_inFile, line)) + lines.push_back(line); + lineNum = m_lineNum; + m_lineNum += lines.size(); + } + + std::vector result; + result.reserve(max_lines); + + while(lines.size()) { + for(size_t i = 0; i < lines.size(); i++) { + std::vector tokens; + Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator); + + for(std::vector::iterator it = tokens.begin(); it != tokens.end(); it++) + *it = Moses::Trim(*it); + + if(tokens.size() < 3) { + std::stringstream strme; + strme << "Error: It seems the following line has a wrong format:" << std::endl; + strme << "Line " << i << ": " << lines[i] << std::endl; + UTIL_THROW2(strme.str()); + } + + if(tokens.size() > 3 && tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) { + std::stringstream strme; + strme << "Error: It seems the following line contains no alignment information, " << std::endl; + strme << "but you are using "; + strme << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc"); + strme << " encoding which makes use of alignment data. 
" << std::endl; + strme << "Use -encoding None" << std::endl; + strme << "Line " << i << ": " << lines[i] << std::endl; + UTIL_THROW2(strme.str()); + } + + size_t ownRank = 0; + if(m_creator.m_coding == PhraseTableCreator::PREnc) + ownRank = m_creator.m_ranks[lineNum + i]; + + std::string encodedLine = m_creator.EncodeLine(tokens, ownRank); + + PackedItem packedItem(lineNum + i, tokens[0], encodedLine, ownRank); + result.push_back(packedItem); + } + lines.clear(); + + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + for(size_t i = 0; i < result.size(); i++) + m_creator.AddEncodedLine(result[i]); + m_creator.FlushEncodedQueue(); + } + + result.clear(); + lines.reserve(max_lines); + result.reserve(max_lines); + +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_fileMutex); +#endif + std::string line; + while(lines.size() < max_lines && std::getline(m_inFile, line)) + lines.push_back(line); + lineNum = m_lineNum; + m_lineNum += lines.size(); + } +} + +//****************************************************************************// + +size_t CompressionTask::m_collectionNum = 0; +#ifdef WITH_THREADS +boost::mutex CompressionTask::m_mutex; +#endif + +CompressionTask::CompressionTask(StringVectorTemp& encodedCollections, + PhraseTableCreator& creator) + : m_encodedCollections(encodedCollections), m_creator(creator) {} + +void CompressionTask::operator()() +{ + size_t collectionNum; + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + collectionNum = m_collectionNum; + m_collectionNum++; + } + + while(collectionNum < m_encodedCollections.size()) { + std::string collection = m_encodedCollections[collectionNum]; + std::string compressedCollection + = m_creator.CompressEncodedCollection(collection); + + std::string dummy; + PackedItem packedItem(collectionNum, dummy, compressedCollection, 0); + +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + m_creator.AddCompressedCollection(packedItem); + 
m_creator.FlushCompressedQueue(); + + collectionNum = m_collectionNum; + m_collectionNum++; + } +} + +//****************************************************************************// + +PackedItem::PackedItem(long line, std::string sourcePhrase, + std::string packedTargetPhrase, size_t rank, + float score) + : m_line(line), m_sourcePhrase(sourcePhrase), + m_packedTargetPhrase(packedTargetPhrase), m_rank(rank), + m_score(score) {} + +long PackedItem::GetLine() const +{ + return m_line; +} + +const std::string& PackedItem::GetSrc() const +{ + return m_sourcePhrase; +} + +const std::string& PackedItem::GetTrg() const +{ + return m_packedTargetPhrase; +} + +size_t PackedItem::GetRank() const +{ + return m_rank; +} + +float PackedItem::GetScore() const +{ + return m_score; +} + +} diff --git a/mosesdecoder/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.cpp b/mosesdecoder/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e837a6610b7fa58aafe3eb8d59fb62e1dc55047d --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.cpp @@ -0,0 +1,32 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "TargetPhraseCollectionCache.h" + +namespace Moses +{ + + +boost::thread_specific_ptr +TargetPhraseCollectionCache::m_phraseCache; + +} + diff --git a/mosesdecoder/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.cpp b/mosesdecoder/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b635dc05049e706628c99f3c699368e6c714f17c --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.cpp @@ -0,0 +1,60 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "ApplicableRuleTrie.h" + +namespace Moses +{ + +void ApplicableRuleTrie::Extend(const UTrieNode &root, int minPos, + const SentenceMap &sentMap, bool followsGap) +{ + const UTrieNode::TerminalMap &termMap = root.GetTerminalMap(); + for (UTrieNode::TerminalMap::const_iterator p = termMap.begin(); + p != termMap.end(); ++p) { + const Word &word = p->first; + const UTrieNode &child = p->second; + SentenceMap::const_iterator q = sentMap.find(word); + if (q == sentMap.end()) { + continue; + } + for (std::vector::const_iterator r = q->second.begin(); + r != q->second.end(); ++r) { + size_t index = *r; + if (index == (size_t)minPos || (followsGap && index > (size_t)minPos) || minPos == -1) { + ApplicableRuleTrie *subTrie = new ApplicableRuleTrie(index, index, + child); + subTrie->Extend(child, index+1, sentMap, false); + m_children.push_back(subTrie); + } + } + } + + const UTrieNode *child = root.GetNonTerminalChild(); + if (!child) { + return; + } + int start = followsGap ? -1 : minPos; + ApplicableRuleTrie *subTrie = new ApplicableRuleTrie(start, -1, *child); + int newMinPos = (minPos == -1 ? 
1 : minPos+1); + subTrie->Extend(*child, newMinPos, sentMap, true); + m_children.push_back(subTrie); +} + +} // namespace Moses diff --git a/mosesdecoder/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.h b/mosesdecoder/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.h new file mode 100644 index 0000000000000000000000000000000000000000..9d2f2cda97aee47b81e66d9354f8a98415bb359d --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.h @@ -0,0 +1,58 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include "SentenceMap.h" +#include "VarSpanNode.h" +#include "moses/TranslationModel/RuleTable/UTrieNode.h" +#include "moses/Util.h" + +#include + +namespace Moses +{ + +struct VarSpanNode; + +/** @todo what is this? 
+ */ +struct ApplicableRuleTrie { +public: + ApplicableRuleTrie(int start, int end, const UTrieNode &node) + : m_start(start) + , m_end(end) + , m_node(&node) + , m_vstNode(NULL) {} + + ~ApplicableRuleTrie() { + RemoveAllInColl(m_children); + } + + void Extend(const UTrieNode &root, int minPos, const SentenceMap &sentMap, + bool followsGap); + + int m_start; + int m_end; + const UTrieNode *m_node; + const VarSpanNode *m_vstNode; + std::vector m_children; +}; + +} diff --git a/mosesdecoder/moses/TranslationModel/Scope3Parser/IntermediateVarSpanNode.h b/mosesdecoder/moses/TranslationModel/Scope3Parser/IntermediateVarSpanNode.h new file mode 100644 index 0000000000000000000000000000000000000000..4990851272b88e307d1b49f9035e4fcef5db273f --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/Scope3Parser/IntermediateVarSpanNode.h @@ -0,0 +1,55 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include + +namespace Moses +{ + +/** @todo what is this? 
+ */ +struct IntermediateVarSpanNode { +public: + typedef std::pair Range; + + IntermediateVarSpanNode() + : m_start(Range(-1, -1)) + , m_end(Range(-1, -1)) + , m_numSplitPoints(0) {} + + IntermediateVarSpanNode(const Range &start, const Range &end) + : m_start(start) + , m_end(end) + , m_numSplitPoints(0) {} + + bool isOpen() { + return m_end.second == -1; + } + bool isClosed() { + return !isOpen(); + } + + Range m_start; + Range m_end; + int m_numSplitPoints; +}; + +} diff --git a/mosesdecoder/moses/TranslationModel/Scope3Parser/Parser.h b/mosesdecoder/moses/TranslationModel/Scope3Parser/Parser.h new file mode 100644 index 0000000000000000000000000000000000000000..c5a71c16fbf83dd5d8b8413f9b27a3b65ad15b8f --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/Scope3Parser/Parser.h @@ -0,0 +1,99 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once + +#include "moses/ChartRuleLookupManager.h" +#include "moses/ChartTranslationOptionList.h" +#include "moses/NonTerminal.h" +#include "moses/TranslationModel/RuleTable/UTrieNode.h" +#include "moses/TranslationModel/RuleTable/UTrie.h" +#include "moses/StaticData.h" +#include "ApplicableRuleTrie.h" +#include "StackLattice.h" +#include "StackLatticeBuilder.h" +#include "StackLatticeSearcher.h" +#include "VarSpanTrieBuilder.h" + +#include +#include + +namespace Moses +{ + +class InputType; +class ChartCellCollectionBase; +class ChartHypothesisCollection; +class Range; + +/** @todo what is this? + */ +class Scope3Parser : public ChartRuleLookupManager +{ +public: + Scope3Parser(const ChartParser &parser, + const ChartCellCollectionBase &cellColl, + const RuleTableUTrie &ruleTable, + size_t maxChartSpan) + : ChartRuleLookupManager(parser, cellColl) + , m_ruleTable(ruleTable) + , m_maxChartSpan(maxChartSpan) { + Init(); + } + + void GetChartRuleCollection( + const InputPath &inputPath, + size_t last, + ChartParserCallback &outColl); + +private: + // Define a callback type for use by StackLatticeSearcher. 
+ struct MatchCallback { + public: + MatchCallback(const Range &range, ChartParserCallback &out) + : m_range(range) , m_out(out) // , m_tpc(NULL) + { } + + void operator()(const StackVec &stackVec) { + m_out.Add(*m_tpc, stackVec, m_range); + } + const Range &m_range; + ChartParserCallback &m_out; + TargetPhraseCollection::shared_ptr m_tpc; + }; + + void Init(); + void InitRuleApplicationVector(); + void FillSentenceMap(SentenceMap &); + void AddRulesToCells(const ApplicableRuleTrie &, std::pair, int, + int); + + const RuleTableUTrie &m_ruleTable; + std::vector > > > m_ruleApplications; + std::auto_ptr m_varSpanTrie; + StackVec m_emptyStackVec; + const size_t m_maxChartSpan; + StackLattice m_lattice; + StackLatticeBuilder m_latticeBuilder; + std::vector m_ranges; + std::vector > m_quickCheckTable; +}; + +} // namespace Moses diff --git a/mosesdecoder/moses/TranslationModel/Scope3Parser/SentenceMap.h b/mosesdecoder/moses/TranslationModel/Scope3Parser/SentenceMap.h new file mode 100644 index 0000000000000000000000000000000000000000..a7a1fdad98117d05d9f80a16aaf270b3f7f22795 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/Scope3Parser/SentenceMap.h @@ -0,0 +1,35 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include "moses/Terminal.h" +#include "moses/Word.h" + +#include + +#include + +namespace Moses +{ +typedef boost::unordered_map, + TerminalHasher, + TerminalEqualityPred> SentenceMap; +} diff --git a/mosesdecoder/moses/TranslationModel/Scope3Parser/StackLattice.h b/mosesdecoder/moses/TranslationModel/Scope3Parser/StackLattice.h new file mode 100644 index 0000000000000000000000000000000000000000..0c90721d0b3b8da69d2d96c2bef2dad23bc76565 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/Scope3Parser/StackLattice.h @@ -0,0 +1,38 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include "moses/StackVec.h" + +#include +#include + +namespace Moses +{ + +/** For an entry, lattice[i][j][k][l]: + * i = offset from span start + * j = NT index (zero-based, from left of rule) + * k = span + * l = label index (as in UTrieNode) + */ +typedef std::vector > > StackLattice; + +} diff --git a/mosesdecoder/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.cpp b/mosesdecoder/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.cpp new file mode 100644 index 0000000000000000000000000000000000000000..26e4e6aca7d8541e72db4bff79ea759b9389226e --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.cpp @@ -0,0 +1,90 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include "StackLatticeBuilder.h" + +#include "moses/ChartRuleLookupManager.h" +#include "moses/TranslationModel/RuleTable/UTrieNode.h" +#include "StackLattice.h" +#include "VarSpanNode.h" + +namespace Moses +{ + +void StackLatticeBuilder::Build( + int start, + int end, + const UTrieNode &ruleNode, + const VarSpanNode &varSpanNode, + const std::vector &ranges, + const ChartRuleLookupManager &manager, + StackLattice &lattice, + std::vector > &checkTable) +{ + // Extend the lattice if necessary. Do not shrink it. + const size_t span = end - start + 1; + if (lattice.size() < span) { + lattice.resize(span); + } + + // Extend the quick-check table if necessary. Do not shrink it. + if (checkTable.size() < varSpanNode.m_rank) { + checkTable.resize(varSpanNode.m_rank); + } + + const UTrieNode::LabelTable &labelTable = ruleNode.GetLabelTable(); + + for (size_t index = 0; index < ranges.size(); ++index) { + const VarSpanNode::NonTermRange &range = ranges[index]; + const std::vector &labelVec = labelTable[index]; + checkTable[index].clear(); + checkTable[index].resize(labelVec.size(), false); + // Note: values in range are offsets not absolute positions. + for (size_t offset = range.s1; offset <= range.s2; ++offset) { + // Allocate additional space if required. 
+ if (lattice[offset].size() < index+1) { + lattice[offset].resize(index+1); + } + size_t e1 = std::max(offset, range.e1); + const size_t maxSpan = range.e2-offset+1; + if (lattice[offset][index].size() < maxSpan+1) { + lattice[offset][index].resize(maxSpan+1); + } + for (size_t end = e1; end <= range.e2; ++end) { + const size_t span = end-offset+1; + // Fill the StackVec at lattice[offset][index][span] by iterating over + // labelTable[index] and looking up each label over the span + // [start, end] + StackVec &stackVec = lattice[offset][index][span]; + stackVec.clear(); + stackVec.reserve(labelVec.size()); + std::vector::iterator q = checkTable[index].begin(); + for (std::vector::const_iterator p = labelVec.begin(); + p != labelVec.end(); ++p) { + const Word &label = *p; + const ChartCellLabel *stack = manager.GetTargetLabelSet(start+offset, start+offset+span-1).Find(label); + stackVec.push_back(stack); + *q++ = *q || static_cast(stack); + } + } + } + } +} + +} diff --git a/mosesdecoder/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.h b/mosesdecoder/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.h new file mode 100644 index 0000000000000000000000000000000000000000..551655e30f1061cec3806e49e7d6ccc0415bb95e --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.h @@ -0,0 +1,44 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include "StackLattice.h" +#include "VarSpanNode.h" +#include "moses/TranslationModel/RuleTable/UTrieNode.h" + +namespace Moses +{ + +class ChartCellCollection; + +/** @todo what is this? + */ +class StackLatticeBuilder +{ +public: + StackLatticeBuilder() {} + + void Build(int, int, const UTrieNode &, const VarSpanNode &, + const std::vector &, + const ChartRuleLookupManager &, StackLattice &, + std::vector > &); +}; + +} diff --git a/mosesdecoder/moses/TranslationModel/Scope3Parser/StackLatticeSearcher.h b/mosesdecoder/moses/TranslationModel/Scope3Parser/StackLatticeSearcher.h new file mode 100644 index 0000000000000000000000000000000000000000..4deac31f8b5d3f7f6c3764c9e12831c470c09f91 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/Scope3Parser/StackLatticeSearcher.h @@ -0,0 +1,84 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once + +#include "StackLattice.h" +#include "VarSpanNode.h" +#include "moses/StackVec.h" + +#include + +namespace Moses +{ + +class ChartHypothesisCollection; + +template +class StackLatticeSearcher +{ +public: + StackLatticeSearcher(const StackLattice &lattice, + const std::vector &ranges) + : m_lattice(lattice) + , m_ranges(ranges) {} + + void Search(const std::vector &labels, MatchCallBackType &callback) { + m_labels = &labels; + m_matchCB = &callback; + SearchInner(0, 0); + } + +private: + void SearchInner(int start, size_t index) { + assert(m_stackVec.size() == index); + + const VarSpanNode::NonTermRange &range = m_ranges[index]; + + const size_t offset = (range.s1 == range.s2) ? range.s1 : start; + + const size_t minSpan = std::max(offset, range.e1) - offset + 1; + const size_t maxSpan = range.e2 - offset + 1; + + // Loop over all possible spans for this offset and index. 
+ const std::vector &spanVec = m_lattice[offset][index]; + + for (size_t j = minSpan; j <= maxSpan; ++j) { + const ChartCellLabel *stack = spanVec[j][(*m_labels)[index]]; + if (!stack) { + continue; + } + m_stackVec.push_back(stack); + if (index+1 == m_labels->size()) { + (*m_matchCB)(m_stackVec); + } else { + SearchInner(offset+j, index+1); + } + m_stackVec.pop_back(); + } + } + + const StackLattice &m_lattice; + const std::vector &m_ranges; + const std::vector *m_labels; + MatchCallBackType *m_matchCB; + StackVec m_stackVec; +}; + +} // namespace Moses diff --git a/mosesdecoder/moses/TranslationModel/Scope3Parser/VarSpanNode.h b/mosesdecoder/moses/TranslationModel/Scope3Parser/VarSpanNode.h new file mode 100644 index 0000000000000000000000000000000000000000..21f74325131f11a6da4c6bdde93fb024a29be3be --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/Scope3Parser/VarSpanNode.h @@ -0,0 +1,130 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include "IntermediateVarSpanNode.h" +#include "moses/Range.h" + +#include + +#include + +#include + +namespace Moses +{ + +/** @todo what is this? + */ +struct VarSpanNode { +public: + struct NonTermRange { + size_t s1; + size_t s2; + size_t e1; + size_t e2; + }; + typedef std::vector NodeVec; + typedef boost::array KeyType; + typedef std::map MapType; + + VarSpanNode() : m_parent(0), m_label(0), m_rank(0) {} + + VarSpanNode &Insert(const NodeVec &vec) { + if (vec.empty()) { + return *this; + } + return Insert(vec.begin(), vec.end()); + } + + // Given a span, determine the ranges of possible start and end offsets + // for each non-terminal. + void CalculateRanges(int start, int end, + std::vector &ranges) const { + ranges.resize(m_rank); + const VarSpanNode *n = this; + size_t firstIndex = m_rank; + while (n->m_parent) { + const KeyType &key = *(n->m_label); + assert(key[0] == 0 || key[0] == key[1]); + assert(key[3] == -1 || key[2] == key[3]); + const int numSplitPoints = key[4]; + firstIndex -= numSplitPoints+1; + const int vsn_start = key[0] == 0 ? start : key[0]; + const int vsn_end = key[3] == -1 ? end : key[3]; + // The start position of the first non-terminal is known. + ranges[firstIndex].s1 = ranges[firstIndex].s2 = vsn_start - start; + // The end range depends on the number of split points. If there are + // no split points then the end position is fixed. 
+ if (numSplitPoints) { + ranges[firstIndex].e1 = vsn_start - start; + ranges[firstIndex].e2 = vsn_end - start - numSplitPoints; + } else { + ranges[firstIndex].e1 = ranges[firstIndex].e2 = vsn_end - start; + } + // For the remaining non-terminals, the start and end boundaries shift + // by one position with each split point. + for (int i = 1; i <= numSplitPoints; ++i) { + ranges[firstIndex+i].s1 = ranges[firstIndex].s1+i; + ranges[firstIndex+i].s2 = ranges[firstIndex].e2+i; + ranges[firstIndex+i].e1 = ranges[firstIndex].s1+i; + ranges[firstIndex+i].e2 = ranges[firstIndex].e2+i; + } + // Except that the end point of the final non-terminal is fixed. + ranges[firstIndex+numSplitPoints].e1 = vsn_end - start; + ranges[firstIndex+numSplitPoints].e2 = vsn_end - start; + n = n->m_parent; + } + assert(firstIndex == 0); + } + + const VarSpanNode *m_parent; + const KeyType *m_label; + size_t m_rank; + MapType m_children; + +private: + VarSpanNode &Insert(NodeVec::const_iterator first, + NodeVec::const_iterator last) { + assert(first != last); + + KeyType key; + key[0] = first->m_start.first; + key[1] = first->m_start.second; + key[2] = first->m_end.first; + key[3] = first->m_end.second; + key[4] = first->m_numSplitPoints; + + std::pair result = m_children.insert( + std::make_pair(key, VarSpanNode())); + VarSpanNode &child = result.first->second; + if (result.second) { + child.m_parent = this; + child.m_label = &(result.first->first); + child.m_rank = m_rank + first->m_numSplitPoints + 1; + } + if (++first == last) { + return child; + } + return child.Insert(first, last); + } +}; + +} diff --git a/mosesdecoder/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.cpp b/mosesdecoder/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.cpp new file mode 100644 index 0000000000000000000000000000000000000000..35e66978b87cd8d1726f37f02ec3d168a8b22436 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.cpp @@ -0,0 +1,109 @@ 
+/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include "VarSpanTrieBuilder.h" + +#include "ApplicableRuleTrie.h" +#include "IntermediateVarSpanNode.h" +#include "VarSpanNode.h" + +#include +#include + +namespace Moses +{ + +std::auto_ptr VarSpanTrieBuilder::Build( + ApplicableRuleTrie &root) +{ + std::auto_ptr vstRoot(new VarSpanNode()); + NodeVec vec; + const std::vector &children = root.m_children; + for (std::vector::const_iterator p = children.begin(); + p != children.end(); ++p) { + Build(**p, vec, *(vstRoot.get())); + } + return vstRoot; +} + +void VarSpanTrieBuilder::Build(ApplicableRuleTrie &artNode, + NodeVec &vec, + VarSpanNode &vstRoot) +{ + typedef IntermediateVarSpanNode::Range Range; + + // Record enough information about vec that any changes made during this + // function call can be undone at the end. 
+ NodeVecState state; + RecordState(vec, state); + + if (artNode.m_end == -1) { + if (!vec.empty() && vec.back().isOpen()) { + ++(vec.back().m_numSplitPoints); + ++(vec.back().m_end.first); + } else if (artNode.m_start == -1) { + Range start(0, -1); + Range end(0, -1); + vec.push_back(IntermediateVarSpanNode(start, end)); + } else { + Range start(artNode.m_start, artNode.m_start); + Range end(artNode.m_start, -1); + vec.push_back(IntermediateVarSpanNode(start, end)); + } + } else if (!vec.empty() && vec.back().isOpen()) { + vec.back().m_end = Range(artNode.m_start-1, artNode.m_start-1); + if (vec.back().m_start.second == -1) { + size_t s = artNode.m_start - (vec.back().m_numSplitPoints + 1); + vec.back().m_start.second = s; + } + } + + if (artNode.m_node->HasRules()) { + artNode.m_vstNode = &(vstRoot.Insert(vec)); + } + + const std::vector &children = artNode.m_children; + for (std::vector::const_iterator p = children.begin(); + p != children.end(); ++p) { + Build(**p, vec, vstRoot); + } + + // Return vec to its original value. 
+ RestoreState(state, vec); +} + +void VarSpanTrieBuilder::RecordState(const NodeVec &vec, NodeVecState &state) +{ + state.m_size = vec.size(); + if (!vec.empty()) { + state.m_lastNode = vec.back(); + } +} + +void VarSpanTrieBuilder::RestoreState(const NodeVecState &state, NodeVec &vec) +{ + assert(state.m_size == vec.size() || state.m_size+1 == vec.size()); + if (state.m_size < vec.size()) { + vec.resize(state.m_size); + } else if (!vec.empty()) { + vec.back() = state.m_lastNode; + } +} + +} diff --git a/mosesdecoder/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h b/mosesdecoder/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h new file mode 100644 index 0000000000000000000000000000000000000000..2513a2878f9d9bf82674783fb7f06dff44bc63b0 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h @@ -0,0 +1,51 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include "IntermediateVarSpanNode.h" + +#include +#include + +namespace Moses +{ + +struct ApplicableRuleTrie; +struct VarSpanNode; + +/** @todo what is this? + */ +class VarSpanTrieBuilder +{ +public: + std::auto_ptr Build(ApplicableRuleTrie &); + +private: + typedef std::vector NodeVec; + struct NodeVecState { + std::size_t m_size; + IntermediateVarSpanNode m_lastNode; + }; + void Build(ApplicableRuleTrie &, NodeVec &, VarSpanNode &); + void RecordState(const NodeVec &, NodeVecState &); + void RestoreState(const NodeVecState &, NodeVec &); +}; + +}