#!/bin/sh
#
# Preprocess a parallel corpus: tokenization, corpus cleaning, truecasing and
# joint BPE subword segmentation, followed by dictionary construction.
#
# Inputs  (read):    data/corpus.$SRC, data/corpus.$TRG,
#                    data/newsdev2016.$SRC, data/newsdev2016.$TRG
# Outputs (written): data/<prefix>.tok.*, data/corpus.tok.clean.*,
#                    data/<prefix>.tc.*, data/<prefix>.bpe.*,
#                    model/truecase-model.*, model/$SRC$TRG.bpe,
#                    plus whatever build_dictionary.py emits for the BPE corpus.
#
# All data/, model/ and ../preprocess/ paths are relative, so the script must
# be started from the directory that contains data/ (and whose parent contains
# preprocess/).
#
# To adapt to another language pair: change SRC/TRG, optionally
# bpe_operations, the file prefixes in the loops, and drop the
# Romanian-specific steps from the source-side tokenization pipeline.

# Abort on the first failing command and on use of an unset variable, so later
# steps never run on missing or partial files. Note that for a pipeline plain
# sh only checks the exit status of the LAST stage.
set -eu

# --- configuration -----------------------------------------------------------

# Suffix of the source-language files.
SRC=ro

# Suffix of the target-language files.
TRG=en

# Value passed to learn_bpe.py -s (number of BPE merge operations to learn).
bpe_operations=89500

# Checkout locations of the external toolkits; edit before running.
mosesdecoder=/path/to/mosesdecoder
subword_nmt=/path/to/subword-nmt
nematus=/path/to/nematus

# The truecasing models and BPE codes are written here; make sure it exists.
mkdir -p model

# --- tokenize ----------------------------------------------------------------

for prefix in corpus newsdev2016
do
    # Source side: punctuation normalization, Romanian-specific normalization
    # and diacritic removal, then tokenization.
    "$mosesdecoder/scripts/tokenizer/normalize-punctuation.perl" -l "$SRC" < "data/$prefix.$SRC" |
        ../preprocess/normalise-romanian.py |
        ../preprocess/remove-diacritics.py |
        "$mosesdecoder/scripts/tokenizer/tokenizer.perl" -a -l "$SRC" > "data/$prefix.tok.$SRC"

    # Target side: punctuation normalization and tokenization only.
    "$mosesdecoder/scripts/tokenizer/normalize-punctuation.perl" -l "$TRG" < "data/$prefix.$TRG" |
        "$mosesdecoder/scripts/tokenizer/tokenizer.perl" -a -l "$TRG" > "data/$prefix.tok.$TRG"
done

# --- clean the training corpus -----------------------------------------------

# Filters data/corpus.tok.{SRC,TRG} into data/corpus.tok.clean.{SRC,TRG}.
# The trailing "1 80" are the length limits handed to clean-corpus-n.perl
# (minimum and maximum sentence length per the Moses documentation).
"$mosesdecoder/scripts/training/clean-corpus-n.perl" data/corpus.tok "$SRC" "$TRG" data/corpus.tok.clean 1 80

# --- truecasing --------------------------------------------------------------

# Train one truecasing model per language on the cleaned training corpus.
"$mosesdecoder/scripts/recaser/train-truecaser.perl" -corpus "data/corpus.tok.clean.$SRC" -model "model/truecase-model.$SRC"
"$mosesdecoder/scripts/recaser/train-truecaser.perl" -corpus "data/corpus.tok.clean.$TRG" -model "model/truecase-model.$TRG"

# Apply truecasing to the training corpus (input is the *cleaned* files).
for prefix in corpus
do
    "$mosesdecoder/scripts/recaser/truecase.perl" -model "model/truecase-model.$SRC" < "data/$prefix.tok.clean.$SRC" > "data/$prefix.tc.$SRC"
    "$mosesdecoder/scripts/recaser/truecase.perl" -model "model/truecase-model.$TRG" < "data/$prefix.tok.clean.$TRG" > "data/$prefix.tc.$TRG"
done

# Apply truecasing to the dev set. It was never passed through
# clean-corpus-n.perl, so the plain tokenized files are the input here.
for prefix in newsdev2016
do
    "$mosesdecoder/scripts/recaser/truecase.perl" -model "model/truecase-model.$SRC" < "data/$prefix.tok.$SRC" > "data/$prefix.tc.$SRC"
    "$mosesdecoder/scripts/recaser/truecase.perl" -model "model/truecase-model.$TRG" < "data/$prefix.tok.$TRG" > "data/$prefix.tc.$TRG"
done

# --- BPE ---------------------------------------------------------------------

# Learn a single joint BPE model from the concatenation of both truecased
# training sides (cat is required here: two files feed one stdin).
cat "data/corpus.tc.$SRC" "data/corpus.tc.$TRG" | "$subword_nmt/learn_bpe.py" -s "$bpe_operations" > "model/$SRC$TRG.bpe"

# Segment training and dev data on both sides with the joint BPE codes.
for prefix in corpus newsdev2016
do
    "$subword_nmt/apply_bpe.py" -c "model/$SRC$TRG.bpe" < "data/$prefix.tc.$SRC" > "data/$prefix.bpe.$SRC"
    "$subword_nmt/apply_bpe.py" -c "model/$SRC$TRG.bpe" < "data/$prefix.tc.$TRG" > "data/$prefix.bpe.$TRG"
done

# --- network dictionaries ----------------------------------------------------

# Build the vocabulary files from the BPE-segmented training corpus.
"$nematus/data/build_dictionary.py" "data/corpus.bpe.$SRC" "data/corpus.bpe.$TRG"