Training `quickmt` Models

Environment setup

# Install system dependencies
# (GNU parallel is used by the data-cleaning pipeline further down)
sudo apt install  libhunspell-dev parallel

## Install eole
# Clone the eole sources and install them in editable mode from the local checkout
git clone https://github.com/eole-nlp/eole.git
pip install -e ./eole

## Install ctranslate2
# Build and install the C++ library from source, then install the Python
# bindings from the same checkout.
git clone --recursive https://github.com/OpenNMT/CTranslate2.git
cd CTranslate2
mkdir build && cd build
cmake -DOPENMP_RUNTIME=COMP -DWITH_MKL=OFF ..
make -j8
sudo make install
sudo ldconfig
# The Python package lives in CTranslate2/python, not under build/, so step
# back up to the repository root before installing it.
cd ..
pip install -e ./python/
# Return to the directory we started from so that later relative paths
# (e.g. `pip install -e ./` for quickmt) resolve as expected.
cd ..

# Install kenlm
# The --max_order=7 build option is forwarded to kenlm's build (presumably the
# maximum n-gram order the compiled library supports — see the kenlm README)
pip install --config-settings="--build-option=--max_order=7" https://github.com/kpu/kenlm/archive/master.zip

# Install quickmt
# Editable install of the package in the current directory, so run this from
# the root of the quickmt checkout
python -m pip install -e ./

Download Data

# Optional: relocate the ~/.mtdata directory to a larger disk and leave a
# symlink behind so that $HOME/.mtdata keeps resolving.
# NOTE: /path/to/large/disk must not already exist as a directory; otherwise
# mv places .mtdata *inside* it and the symlink would point at the parent.
mv "$HOME/.mtdata" /path/to/large/disk
ln -s /path/to/large/disk "$HOME/.mtdata"

# Create the folder that will hold this experiment's data
mkdir id-en

# Save the first column of the ind-eng corpus listing (the corpus IDs) to corpora.txt
mtdata list -l ind-eng | cut -f1 > corpora.txt

# Pick the corpora you want from that list, then download and merge them.
# One ID per line keeps the selection easy to read and to diff.
mtdata get -l ind-eng --merge --out id-en --no-fail -j 1 \
  --test \
    Flores-flores200_devtest-1-eng-ind \
    Microsoft-ntrex-128-eng-ind \
  --dev \
    Neulab-tedtalks_test-1-eng-ind \
    Flores-flores200_dev-1-eng-ind \
  --train \
    Statmt-news_commentary-14-eng-ind \
    Statmt-news_commentary-15-eng-ind \
    Statmt-news_commentary-16-eng-ind \
    Statmt-news_commentary-17-eng-ind \
    Statmt-news_commentary-18-eng-ind \
    Statmt-news_commentary-18.1-eng-ind \
    Statmt-ccaligned-1-eng-ind_ID \
    Facebook-wikimatrix-1-eng-ind \
    Neulab-tedtalks_train-1-eng-ind \
    Neulab-tedtalks_dev-1-eng-ind \
    ELRC-wikipedia_health-1-eng-ind \
    ELRC-hrw_dataset_v1-1-eng-ind \
    OPUS-ccaligned-v1-eng-ind \
    OPUS-ccmatrix-v1-eng-ind \
    OPUS-elrc_3049_wikipedia_health-v1-eng-ind \
    OPUS-elrc_wikipedia_health-v1-eng-ind \
    OPUS-elrc_2922-v1-eng-ind \
    OPUS-gnome-v1-eng-ind \
    OPUS-globalvoices-v2015-eng-ind \
    OPUS-globalvoices-v2017q3-eng-ind \
    OPUS-globalvoices-v2018q4-eng-ind \
    OPUS-kde4-v2-eng-ind \
    OPUS-multiccaligned-v1-eng-ind \
    OPUS-nllb-v1-eng-ind \
    OPUS-neulab_tedtalks-v1-eng-ind \
    OPUS-news_commentary-v14-eng-ind \
    OPUS-news_commentary-v16-eng-ind \
    OPUS-opensubtitles-v2016-eng-ind \
    OPUS-opensubtitles-v2018-eng-ind \
    OPUS-opensubtitles-v2024-eng-ind \
    OPUS-paracrawl_bonus-v9-eng-ind \
    OPUS-qed-v2.0a-eng-ind \
    OPUS-ted2020-v1-eng-ind \
    OPUS-tanzil-v1-eng-ind \
    OPUS-tatoeba-v2-eng-ind \
    OPUS-tatoeba-v20190709-eng-ind \
    OPUS-tatoeba-v20200531-eng-ind \
    OPUS-tatoeba-v20201109-eng-ind \
    OPUS-tatoeba-v20210310-eng-ind \
    OPUS-tatoeba-v20210722-eng-ind \
    OPUS-tatoeba-v20220303-eng-ind \
    OPUS-tatoeba-v20230412-eng-ind \
    OPUS-ubuntu-v14.10-eng-ind \
    OPUS-wikimatrix-v1-eng-ind \
    OPUS-xlent-v1-eng-ind \
    OPUS-xlent-v1.1-eng-ind \
    OPUS-xlent-v1.2-eng-ind \
    OPUS-bible_uedin-v1-eng-ind \
    OPUS-tico_19-v20201028-eng-ind \
    OPUS-tldr_pages-v20230829-eng-ind \
    OPUS-wikimedia-v20210402-eng-ind \
    OPUS-wikimedia-v20230407-eng-ind \
    Google-wmt24pp-1-eng-ind_ID


# Rename the merged mtdata output. The dev set takes language-code suffixes
# directly; the training set is staged as train.src/train.tgt for cleaning and
# only receives its language-code names after the tokenizers are trained.
cd id-en
mv dev.ind dev.id
mv dev.eng dev.en
mv train.ind train.src
mv train.eng train.tgt

# Deduplicate, filter and shuffle the training pairs:
#   1. paste     - join source and target into one tab-separated line per pair
#   2. sort -u   - drop exact duplicate pairs; LC_ALL=C compares raw bytes,
#                  which is faster and independent of the user's locale
#   3. parallel  - run quickmt-clean over chunks of stdin (-k keeps the chunk
#                  output in input order)
#   4. awk | sort | awk - shuffle: prefix every line with a random key, sort
#                  numerically on it (C locale so "0.123" always parses with a
#                  '.' decimal point), then turn the key into a tab so the pair
#                  ends up in fields 2 and 3
#   5. awk       - write field 2 to train.cleaned.src and field 3 to
#                  train.cleaned.tgt
# quickmt-clean is pointed at a language-ID model in ../lid.176.bin (the
# fastText lid.176 model, going by the flag and file name); none of the steps
# shown here download it, so fetch it beforehand.
paste -d '\t' train.src train.tgt \
    | LC_ALL=C sort -u \
    | parallel --block 70M -j 6 --pipe -k -l 200000 quickmt-clean --src_lang id --tgt_lang en --ft_model_path ../lid.176.bin --length_ratio 3 --src_min_langid_score 0.5 --tgt_min_langid_score 0.5 \
    | awk 'BEGIN{srand()}{print rand(), $0}' | LC_ALL=C sort -n -k 1 | awk 'sub(/\S* /,"\t")' \
    | awk -v FS="\t" '{ print $2 > "train.cleaned.src" ; print $3 > "train.cleaned.tgt" }'

Upload Data to Huggingface

You must be authenticated with Hugging Face and have write permission for the target dataset repository (replace quickmt/quickmt-train.id-en with your_username/your_dataset_name).

# Log in to Hugging Face so the upload below is authorized
huggingface-cli login
# Upload the cleaned training files; the first argument is the destination
# dataset repository (swap in your own your_username/your_dataset_name)
quickmt-corpus-upload quickmt/quickmt-train.id-en --src_in train.cleaned.src --tgt_in train.cleaned.tgt --src_lang id --tgt_lang en

Train Tokenizers

# Both tokenizers are unigram SentencePiece models with a 20k vocabulary and
# byte fallback. Shuffling is disabled because the cleaned training files were
# already shuffled by the cleaning pipeline, so the trainer can simply load up
# to 10M sentences as they come.
# All options use the --flag=value form: it is consistent across the command
# and is also accepted by flag parsers that do not take a space-separated
# value for boolean flags.

# Train target tokenizer
spm_train --input=train.cleaned.tgt --model_prefix=en.spm \
    --input_sentence_size=10000000 --shuffle_input_sentence=false \
    --vocab_size=20000 --character_coverage=0.9999 --model_type=unigram \
    --byte_fallback --train_extremely_large_corpus=true --num_threads=4

# Train source tokenizer
spm_train --input=train.cleaned.src --model_prefix=id.spm \
    --input_sentence_size=10000000 --shuffle_input_sentence=false \
    --vocab_size=20000 --character_coverage=0.9999 --model_type=unigram \
    --byte_fallback --train_extremely_large_corpus=true --num_threads=4

# Train joint tokenizer
# spm_train --input_sentence_size 10000000 --shuffle_input_sentence true \
#     --input=tok.txt --num_threads 6 --model_prefix=joint.spm \
#     --vocab_size=50000 --character_coverage=0.9999 --model_type=unigram


# Convert spm vocab to eole vocab
# (the converter reads stdin, so redirect the file directly instead of
# spawning an extra `cat` process)
eole tools spm_to_vocab < en.spm.vocab > en.eole.vocab
eole tools spm_to_vocab < id.spm.vocab > id.eole.vocab
#cat fr-en/joint.spm.vocab | eole tools spm_to_vocab > fr-en/joint.eole.vocab

# Give the cleaned training files their final language-code names
mv train.cleaned.src train.id
mv train.cleaned.tgt train.en

Train Model

# One training run per config file (presumably id->en and en->id, going by the
# file names; the configs themselves are not shown here)
eole train --config eole-config-iden.yaml
eole train --config eole-config-enid.yaml

Inference with eole

# Translate input.txt into output.txt with the trained eole model on GPU 0.
# -model_path must point at the model directory produced by training; the path
# below matches the one used in the CTranslate2 conversion step, so adjust it
# if your training config saves the model elsewhere.
eole predict -model_path ./quickmt-id-en-eole-model/ -src input.txt  -output output.txt  --batch_size 16 --gpu_ranks 0

Convert to ctranslate2

# Convert the eole model directory into a CTranslate2 model in ct2-iden
# (--force: proceed even if the output directory already exists)
python -m ctranslate2.converters.eole_ct2 --model_path quickmt-id-en-eole-model/ --output_dir ct2-iden --force

# Copy over src and tgt tokenizers
# (for this id->en model, id.spm.model is the source side and en.spm.model the target side)
cp en.spm.model ct2-iden/tgt.spm.model
cp id.spm.model ct2-iden/src.spm.model

# Copy over the config too
cp eole-config-iden.yaml ct2-iden/eole-config.yaml

Evaluate

Evaluate on the flores-devtest dataset

# Evaluate the converted model in ct2-iden on CPU; languages are given as
# FLORES-style codes (ind_Latn -> eng_Latn). Output goes to quickmt.iden.mt
# (presumably the translations; confirm against quickmt-eval's help).
quickmt-eval --model_path ct2-iden --tgt_lang eng_Latn --src_lang ind_Latn --output_file quickmt.iden.mt --device cpu

Statmt-commoncrawl_wmt13-1-rus-eng
Statmt-news_commentary_wmt18-13-rus-eng
Statmt-news_commentary-14-eng-rus
Statmt-news_commentary-15-eng-rus
Statmt-news_commentary-16-eng-rus
Statmt-news_commentary-17-eng-rus
Statmt-news_commentary-18-eng-rus
Statmt-news_commentary-18.1-eng-rus
Statmt-newstest_ruen-2014-rus-eng
Statmt-newstest_enru-2015-eng-rus
Statmt-newstest_ruen-2015-rus-eng
Statmt-newstest_ruen-2016-rus-eng
Statmt-newstest_enru-2016-eng-rus
Statmt-newstest_ruen-2017-rus-eng
Statmt-newstest_enru-2017-eng-rus
Statmt-newstest_ruen-2018-rus-eng
Statmt-newstest_enru-2018-eng-rus
Statmt-newstest_ruen-2019-rus-eng
Statmt-newstest_enru-2019-eng-rus
Statmt-newstest-2012-eng-rus
Statmt-newstest-2013-eng-rus
Statmt-newstest_enru-2020-eng-rus
Statmt-newstest_ruen-2020-rus-eng
Statmt-newstestb_ruen-2020-rus-eng
Statmt-newstest_enru-2021-eng-rus
Statmt-newstest_ruen-2021-rus-eng
Statmt-backtrans_ruen-wmt20-rus-eng
Statmt-yandex-wmt22-eng-rus
Tilde-airbaltic-1-eng-rus
Tilde-czechtourism-1-eng-rus
Tilde-worldbank-1-eng-rus
Neulab-tedtalks_train-1-eng-rus
Neulab-tedtalks_test-1-eng-rus
Neulab-tedtalks_dev-1-eng-rus
ELRC-wikipedia_health-1-eng-rus
ELRC-swps_university_social_sciences_humanities-1-eng-rus
ELRC-scipar-1-eng-rus
ELRC-web_acquired_data_related_to_scientific_research-1-eng-rus
ELRC-hrw_dataset_v1-1-eng-rus
OPUS-books-v1-eng-rus
OPUS-ccaligned-v1-eng-rus
OPUS-ccmatrix-v1-eng-rus
OPUS-elrc_3075_wikipedia_health-v1-eng-rus
OPUS-elrc_3855_swps_university_soci-v1-eng-rus
OPUS-elrc_5067_scipar-v1-eng-rus
OPUS-elrc_5183_scipar_ukraine-v1-eng-rus
OPUS-elrc_wikipedia_health-v1-eng-rus
OPUS-elrc_2922-v1-eng-rus
OPUS-eubookshop-v2-eng-rus
OPUS-gnome-v1-eng-rus
OPUS-globalvoices-v2015-eng-rus
OPUS-globalvoices-v2017q3-eng-rus
OPUS-globalvoices-v2018q4-eng-rus
OPUS-kde4-v2-eng-rus
OPUS-kdedoc-v1-eng_GB-rus
OPUS-linguatools_wikititles-v2014-eng-rus
OPUS-mdn_web_docs-v20230925-eng-rus
OPUS-multiun-v1-eng-rus
OPUS-neulab_tedtalks-v1-eng-rus
OPUS-news_commentary-v11-eng-rus
OPUS-news_commentary-v14-eng-rus
OPUS-news_commentary-v16-eng-rus
OPUS-news_commentary-v9.0-eng-rus
OPUS-news_commentary-v9.1-eng-rus
OPUS-openoffice-v3-eng_GB-rus
OPUS-opensubtitles-v2024-eng-rus
OPUS-php-v1-eng-rus
OPUS-paracrawl-v9-eng-rus
OPUS-qed-v2.0a-eng-rus
OPUS-ted2013-v1.1-eng-rus
OPUS-ted2020-v1-eng-rus
OPUS-tanzil-v1-eng-rus
OPUS-tatoeba-v2-eng-rus
OPUS-tatoeba-v20190709-eng-rus
OPUS-tatoeba-v20200531-eng-rus
OPUS-tatoeba-v20201109-eng-rus
OPUS-tatoeba-v20210310-eng-rus
OPUS-tatoeba-v20210722-eng-rus
OPUS-tatoeba-v20220303-eng-rus
OPUS-tatoeba-v20230412-eng-rus
OPUS-tildemodel-v2018-eng-rus
OPUS-unpc-v1.0-eng-rus
OPUS-ubuntu-v14.10-eng-rus
OPUS-wmt_news-v2014-eng-rus
OPUS-wmt_news-v2019-eng-rus
OPUS-wikimatrix-v1-eng-rus
OPUS-wikipedia-v1.0-eng-rus
OPUS-ada83-v1-eng-rus
OPUS-bible_uedin-v1-eng-rus
OPUS-infopankki-v1-eng-rus
OPUS-tico_19-v20201028-eng-rus
OPUS-tldr_pages-v20230829-eng-rus
OPUS-wikimedia-v20230407-eng-rus
Google-wmt24pp-1-eng-rus_RU

Training quickmt Models