| ################################################ |
| ### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### |
| ################################################ |
|
|
| [GENERAL] |
|
| # Settings defined in this section are referenced elsewhere in the file |
| # with $name substitution (e.g. $moses-script-dir, $wmt12-data). |
|
| ### directory in which the experiment is run |
| # NOTE: absolute, user-specific path - adjust for your own machine |
| # |
| working-dir = /home/hieu/workspace/experiment/data/issues/toy |
|
|
| # specification of the language pair |
| # (input-extension is used for the input-side tokenizer and input sgm files, |
| # output-extension for the output side and the language model corpus) |
| input-extension = fr |
| output-extension = en |
| pair-extension = fr-en |
|
|
| ### directories that contain tools and data |
| # NOTE: moses-src-dir and external-bin-dir are absolute, user-specific paths |
| # |
| # moses source checkout; the bin, scripts, srilm and irstlm paths below |
| # are all derived from it |
| moses-src-dir = /home/hieu/workspace/github/mosesdecoder |
| # |
| # moses binaries |
| moses-bin-dir = $moses-src-dir/bin |
| # |
| # moses scripts |
| moses-script-dir = $moses-src-dir/scripts |
| # |
| # directory where the GIZA++/MGIZA programs reside |
| external-bin-dir = /home/hieu/workspace/bin/training-tools |
| # |
| # srilm binaries (the path includes the machine-type subdirectory i686-m64) |
| srilm-dir = $moses-src-dir/srilm/bin/i686-m64 |
| # |
| # irstlm binaries |
| irstlm-dir = $moses-src-dir/irstlm/bin |
|
|
| # data directory holding the corpus, tuning and test files used below |
| wmt12-data = $working-dir/data |
|
|
| ### basic tools |
| # |
| # moses decoder (the moses_chart binary) |
| decoder = $moses-bin-dir/moses_chart |
|
|
| # conversion of phrase table into binary on-disk format |
| # (disabled: the CreateOnDiskPt setting below is used instead) |
| #ttable-binarizer = $moses-bin-dir/processPhraseTable |
|
|
| # conversion of rule table into binary on-disk format |
| # NOTE(review): the numeric arguments "1 1 4 100 2" are passed to |
| # CreateOnDiskPt verbatim; confirm their meaning against the tool's usage |
| ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2" |
|
|
| # tokenizers - comment out if all your data is already tokenized |
| input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension" |
| output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension" |
|
|
| # truecasers - comment out if you do not use the truecaser |
| # (the same truecase.perl script is used for both sides) |
| input-truecaser = $moses-script-dir/recaser/truecase.perl |
| output-truecaser = $moses-script-dir/recaser/truecase.perl |
| detruecaser = $moses-script-dir/recaser/detruecase.perl |
|
|
|
|
| ### multi-core settings |
| # when the generic parallelizer is used, the number of cores |
| # is specified here |
| cores = 8 |
|
|
| ################################################################# |
| # PARALLEL CORPUS PREPARATION: |
| # create a tokenized, sentence-aligned corpus, ready for training |
|
|
| [CORPUS] |
|
|
| ### long sentences are filtered out, since they slow down GIZA++ |
| # and are a less reliable source of data. set the maximum |
| # sentence length here |
| # (presumably counted in tokens - TODO confirm) |
| # |
| max-sentence-length = 80 |
|
|
| [CORPUS:nc] |
| # file-name stem of the raw parallel corpus "nc" |
| # (presumably the language extensions are appended to the stem, |
| # cf. nc-5k.$output-extension in [LM:nc] - TODO confirm) |
| raw-stem = $wmt12-data/nc-5k |
|
|
| ################################################################# |
| # LANGUAGE MODEL TRAINING |
|
|
| [LM] |
|
|
| ### tool to be used for language model training |
| # srilm (active); settings holds the options given to the training tool |
| lm-training = $srilm-dir/ngram-count |
| settings = "-interpolate -kndiscount -unk" |
|
|
| # irstlm training (disabled alternative to the srilm settings above; |
| # enable only one lm-training/settings pair) |
| # msb = modified kneser ney; p=0 no singleton pruning |
| #lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp" |
| #settings = "-s msb -p 0" |
|
|
| # order of the language model (n-gram length) |
| order = 5 |
|
|
| # binarize the language model with kenlm's build_binary; |
| # when kenlm is used, type must also be set to 8 |
| lm-binarizer = $moses-bin-dir/build_binary |
| type = 8 |
|
|
| ### each language model to be used has its own section here |
|
|
| [LM:nc] |
| # raw text for the "nc" language model: the output-language |
| # ($output-extension) side of the nc-5k data |
| raw-corpus = $wmt12-data/nc-5k.$output-extension |
|
|
| ################################################################# |
| # TRANSLATION MODEL TRAINING |
|
|
| [TRAINING] |
|
|
| ### training script to be used: either a legacy script or |
| # the current moses training script (default) |
| # |
| script = $moses-script-dir/training/train-model.perl |
|
|
| ### parallelization of data preparation step |
| # the two directions of the data preparation can be run in parallel |
| # comment out if not needed |
| # |
| parallel = yes |
|
|
| ### symmetrization method to obtain word alignments from giza output |
| # (commonly used: grow-diag-final-and) |
| # |
| alignment-symmetrization-method = grow-diag-final-and |
|
|
| ### hierarchical rule set |
| # NOTE(review): this flag is spelled "true" while "parallel" above is |
| # spelled "yes"; left unchanged because the accepted spellings depend |
| # on the consuming script - confirm before normalizing |
| # |
| hierarchical-rule-set = true |
|
|
| ### settings for rule scoring |
| # |
| score-settings = "--GoodTuring" |
|
|
| ##################################################### |
| ### TUNING: finding good weights for model components |
|
|
| [TUNING] |
|
|
| ### tuning script to be used |
| # tuning-settings holds the options given to the tuning script |
| # |
| tuning-script = $moses-script-dir/training/mert-moses.pl |
| tuning-settings = "-mertdir $moses-bin-dir" |
|
|
| ### specify the corpus used for tuning |
| # it should contain thousands of sentences |
| # NOTE(review): these are the same test-src/test-ref files that |
| # [EVALUATION:newstest2011] uses as its test set; tuning and testing on |
| # the same data is only acceptable for a toy setup |
| # |
| input-sgm = $wmt12-data/test-src.$input-extension.sgm |
| # alternative ways of supplying the tuning input (unused here) |
| #raw-input = |
| #tokenized-input = |
| #factorized-input = |
| #input = |
| # |
| reference-sgm = $wmt12-data/test-ref.$output-extension.sgm |
| # alternative ways of supplying the tuning reference (unused here) |
| #raw-reference = |
| #tokenized-reference = |
| #factorized-reference = |
| #reference = |
|
|
| ### size of the n-best list used (typically 100) |
| # |
| nbest = 100 |
|
|
| ####################################################### |
| ## TRUECASER: train model to truecase corpora and input |
|
|
| [TRUECASER] |
|
|
| ### script used to train the truecaser models |
| # |
| trainer = $moses-script-dir/recaser/train-truecaser.perl |
|
|
| ###################################################################### |
| ## EVALUATION: translate a test set using the tuned system and score it |
|
|
| [EVALUATION] |
|
|
| ### prepare system output for scoring |
| # this may include detokenization and wrapping output in sgm |
| # (needed for nist-bleu, ter, meteor) |
| # |
| detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension" |
| # recaser is disabled here |
| #recaser = $moses-script-dir/recaser/recase.perl |
| wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" |
| #output-sgm = |
|
|
| ### BLEU |
| # nist-bleu and nist-bleu-c run the same mteval-v13a.pl script; the -c |
| # variant presumably scores case-sensitively - TODO confirm |
| # |
| nist-bleu = $moses-script-dir/generic/mteval-v13a.pl |
| nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" |
| #multi-bleu = $moses-script-dir/generic/multi-bleu.perl |
| #ibm-bleu = |
|
|
| ### Analysis: carry out various forms of analysis on the output |
| # |
| analysis = $moses-script-dir/ems/support/analysis.perl |
| # |
| # also report on input coverage |
| analyze-coverage = yes |
| # |
| # also report on phrase mappings used |
| report-segmentation = yes |
|
|
| [EVALUATION:newstest2011] |
|
| # NOTE(review): despite the section name, the files referenced here are |
| # test-src/test-ref from $wmt12-data, the same files used in [TUNING] |
|
| ### input data |
| # |
| input-sgm = $wmt12-data/test-src.$input-extension.sgm |
|
|
| ### reference data |
| # |
| reference-sgm = $wmt12-data/test-ref.$output-extension.sgm |
|
|
| ### wrapping frame |
| # for nist-bleu and other scoring scripts, the output needs to be wrapped |
| # in sgm markup (typically like the input sgm); here the input sgm of this |
| # section is reused as the frame |
| # |
| wrapping-frame = $input-sgm |
|
|
| ########################################## |
| ### REPORTING: summarize evaluation scores |
|
|
| [REPORTING] |
|
|
| ### currently there are no parameters for the reporting section |
|
|
|
|