quickmt-id-en / training-id.md

Upload folder using huggingface_hub

af1488c verified 6 months ago

8.81 kB


	## Training `quickmt` Models

	### Environment setup

	```bash
	# Install system dependencies
	sudo apt install libhunspell-dev parallel

	## Install eole
	git clone https://github.com/eole-nlp/eole.git
	pip install -e ./eole

	## Install ctranslate2
	git clone --recursive https://github.com/OpenNMT/CTranslate2.git
	cd CTranslate2
	mkdir build && cd build
	cmake -DOPENMP_RUNTIME=COMP -DWITH_MKL=OFF ..
	make -j8
	sudo make install
	sudo ldconfig
	pip install -e ../python/

	# Install kenlm
	pip install --config-settings="--build-option=--max_order=7" https://github.com/kpu/kenlm/archive/master.zip

	# Install quickmt
	python -m pip install -e ./
	```

	### Download Data

	```bash
	mv $HOME/.mtdata /path/to/large/disk
	ln -s /path/to/large/disk $HOME/.mtdata

	# Create the experiment data folder
	mkdir id-en

	# List corpora
	mtdata list -l ind-eng | cut -f1 > corpora.txt

	# Download corpora
	# Select some, then fetch
	mtdata get -l ind-eng --merge --out id-en --no-fail -j 1 --test Flores-flores200_devtest-1-eng-ind Microsoft-ntrex-128-eng-ind \
	--dev Neulab-tedtalks_test-1-eng-ind Flores-flores200_dev-1-eng-ind \
	--train Statmt-news_commentary-14-eng-ind Statmt-news_commentary-15-eng-ind Statmt-news_commentary-16-eng-ind Statmt-news_commentary-17-eng-ind Statmt-news_commentary-18-eng-ind Statmt-news_commentary-18.1-eng-ind Statmt-ccaligned-1-eng-ind_ID Facebook-wikimatrix-1-eng-ind Neulab-tedtalks_train-1-eng-ind Neulab-tedtalks_dev-1-eng-ind ELRC-wikipedia_health-1-eng-ind ELRC-hrw_dataset_v1-1-eng-ind OPUS-ccaligned-v1-eng-ind OPUS-ccmatrix-v1-eng-ind OPUS-elrc_3049_wikipedia_health-v1-eng-ind OPUS-elrc_wikipedia_health-v1-eng-ind OPUS-elrc_2922-v1-eng-ind OPUS-gnome-v1-eng-ind OPUS-globalvoices-v2015-eng-ind OPUS-globalvoices-v2017q3-eng-ind OPUS-globalvoices-v2018q4-eng-ind OPUS-kde4-v2-eng-ind OPUS-multiccaligned-v1-eng-ind OPUS-nllb-v1-eng-ind OPUS-neulab_tedtalks-v1-eng-ind OPUS-news_commentary-v14-eng-ind OPUS-news_commentary-v16-eng-ind OPUS-opensubtitles-v2016-eng-ind OPUS-opensubtitles-v2018-eng-ind OPUS-opensubtitles-v2024-eng-ind OPUS-paracrawl_bonus-v9-eng-ind OPUS-qed-v2.0a-eng-ind OPUS-ted2020-v1-eng-ind OPUS-tanzil-v1-eng-ind OPUS-tatoeba-v2-eng-ind OPUS-tatoeba-v20190709-eng-ind OPUS-tatoeba-v20200531-eng-ind OPUS-tatoeba-v20201109-eng-ind OPUS-tatoeba-v20210310-eng-ind OPUS-tatoeba-v20210722-eng-ind OPUS-tatoeba-v20220303-eng-ind OPUS-tatoeba-v20230412-eng-ind OPUS-ubuntu-v14.10-eng-ind OPUS-wikimatrix-v1-eng-ind OPUS-xlent-v1-eng-ind OPUS-xlent-v1.1-eng-ind OPUS-xlent-v1.2-eng-ind OPUS-bible_uedin-v1-eng-ind OPUS-tico_19-v20201028-eng-ind OPUS-tldr_pages-v20230829-eng-ind OPUS-wikimedia-v20210402-eng-ind OPUS-wikimedia-v20230407-eng-ind Google-wmt24pp-1-eng-ind_ID


	# Move files to standardized src/tgt names
	cd id-en
	mv dev.ind dev.id
	mv dev.eng dev.en
	mv train.ind train.src
	mv train.eng train.tgt

	paste -d '\t' train.src train.tgt \
	| sort | uniq \
	| parallel --block 70M -j 6 --pipe -k -l 200000 quickmt-clean --src_lang id --tgt_lang en --ft_model_path ../lid.176.bin --length_ratio 3 --src_min_langid_score 0.5 --tgt_min_langid_score 0.5 \
	| awk 'BEGIN{srand()}{print rand(), $0}' | sort -n -k 1 | awk 'sub(/\S* /,"\t")' \
	| awk -v FS="\t" '{ print $2 > "train.cleaned.src" ; print $3 > "train.cleaned.tgt" }'
	```

	### Upload Data to Huggingface

	You must be authenticated with Hugging Face and have write access to the target dataset repository (replace `quickmt/quickmt-train.id-en` with `your_username/your_dataset_name`).

	```bash
	huggingface-cli login
	quickmt-corpus-upload quickmt/quickmt-train.id-en --src_in train.cleaned.src --tgt_in train.cleaned.tgt --src_lang id --tgt_lang en
	```

	### Train Tokenizers

	```bash
	# Train target tokenizer
	spm_train --input_sentence_size 10000000 --shuffle_input_sentence false \
	--input=train.cleaned.tgt --num_threads 4 --model_prefix=en.spm \
	--vocab_size=20000 --character_coverage=0.9999 --model_type=unigram \
	--byte_fallback --train_extremely_large_corpus true

	# Train source tokenizer
	spm_train --input_sentence_size 10000000 --shuffle_input_sentence false \
	--input=train.cleaned.src --num_threads 4 --model_prefix=id.spm \
	--vocab_size=20000 --character_coverage=0.9999 --model_type=unigram \
	--byte_fallback --train_extremely_large_corpus true

	# Train joint tokenizer
	# spm_train --input_sentence_size 10000000 --shuffle_input_sentence true \
	# --input=tok.txt --num_threads 6 --model_prefix=joint.spm \
	# --vocab_size=50000 --character_coverage=0.9999 --model_type=unigram


	# Convert spm vocab to eole vocab
	cat en.spm.vocab | eole tools spm_to_vocab > en.eole.vocab
	cat id.spm.vocab | eole tools spm_to_vocab > id.eole.vocab
	#cat fr-en/joint.spm.vocab | eole tools spm_to_vocab > fr-en/joint.eole.vocab

	mv train.cleaned.src train.id
	mv train.cleaned.tgt train.en
	```

	### Train Model

	```bash
	eole train --config eole-config-iden.yaml
	eole train --config eole-config-enid.yaml


	```


	### Inference with eole

	```bash
	eole predict -model_path ./quickmt-id-en-eole-model/ -src input.txt -output output.txt --batch_size 16 --gpu_ranks 0
	```


	### Convert to ctranslate2

	```bash
	python -m ctranslate2.converters.eole_ct2 --model_path quickmt-id-en-eole-model/ --output_dir ct2-iden --force

	# Copy over src and tgt tokenizers
	cp en.spm.model ct2-iden/tgt.spm.model
	cp id.spm.model ct2-iden/src.spm.model

	# Copy over the config too
	cp eole-config-iden.yaml ct2-iden/eole-config.yaml
	```

	### Evaluate

	Evaluate on the `flores-devtest` dataset

	```bash
	quickmt-eval --model_path ct2-iden --tgt_lang eng_Latn --src_lang ind_Latn --output_file quickmt.iden.mt --device cpu

	```


	* Statmt-commoncrawl_wmt13-1-rus-eng
	* Statmt-news_commentary_wmt18-13-rus-eng
	* Statmt-news_commentary-14-eng-rus
	* Statmt-news_commentary-15-eng-rus
	* Statmt-news_commentary-16-eng-rus
	* Statmt-news_commentary-17-eng-rus
	* Statmt-news_commentary-18-eng-rus
	* Statmt-news_commentary-18.1-eng-rus
	* Statmt-newstest_ruen-2014-rus-eng
	* Statmt-newstest_enru-2015-eng-rus
	* Statmt-newstest_ruen-2015-rus-eng
	* Statmt-newstest_ruen-2016-rus-eng
	* Statmt-newstest_enru-2016-eng-rus
	* Statmt-newstest_ruen-2017-rus-eng
	* Statmt-newstest_enru-2017-eng-rus
	* Statmt-newstest_ruen-2018-rus-eng
	* Statmt-newstest_enru-2018-eng-rus
	* Statmt-newstest_ruen-2019-rus-eng
	* Statmt-newstest_enru-2019-eng-rus
	* Statmt-newstest-2012-eng-rus
	* Statmt-newstest-2013-eng-rus
	* Statmt-newstest_enru-2020-eng-rus
	* Statmt-newstest_ruen-2020-rus-eng
	* Statmt-newstestb_ruen-2020-rus-eng
	* Statmt-newstest_enru-2021-eng-rus
	* Statmt-newstest_ruen-2021-rus-eng
	* Statmt-backtrans_ruen-wmt20-rus-eng
	* Statmt-yandex-wmt22-eng-rus
	* Tilde-airbaltic-1-eng-rus
	* Tilde-czechtourism-1-eng-rus
	* Tilde-worldbank-1-eng-rus
	* Neulab-tedtalks_train-1-eng-rus
	* Neulab-tedtalks_test-1-eng-rus
	* Neulab-tedtalks_dev-1-eng-rus
	* ELRC-wikipedia_health-1-eng-rus
	* ELRC-swps_university_social_sciences_humanities-1-eng-rus
	* ELRC-scipar-1-eng-rus
	* ELRC-web_acquired_data_related_to_scientific_research-1-eng-rus
	* ELRC-hrw_dataset_v1-1-eng-rus
	* OPUS-books-v1-eng-rus
	* OPUS-ccaligned-v1-eng-rus
	* OPUS-ccmatrix-v1-eng-rus
	* OPUS-elrc_3075_wikipedia_health-v1-eng-rus
	* OPUS-elrc_3855_swps_university_soci-v1-eng-rus
	* OPUS-elrc_5067_scipar-v1-eng-rus
	* OPUS-elrc_5183_scipar_ukraine-v1-eng-rus
	* OPUS-elrc_wikipedia_health-v1-eng-rus
	* OPUS-elrc_2922-v1-eng-rus
	* OPUS-eubookshop-v2-eng-rus
	* OPUS-gnome-v1-eng-rus
	* OPUS-globalvoices-v2015-eng-rus
	* OPUS-globalvoices-v2017q3-eng-rus
	* OPUS-globalvoices-v2018q4-eng-rus
	* OPUS-kde4-v2-eng-rus
	* OPUS-kdedoc-v1-eng_GB-rus
	* OPUS-linguatools_wikititles-v2014-eng-rus
	* OPUS-mdn_web_docs-v20230925-eng-rus
	* OPUS-multiun-v1-eng-rus
	* OPUS-neulab_tedtalks-v1-eng-rus
	* OPUS-news_commentary-v11-eng-rus
	* OPUS-news_commentary-v14-eng-rus
	* OPUS-news_commentary-v16-eng-rus
	* OPUS-news_commentary-v9.0-eng-rus
	* OPUS-news_commentary-v9.1-eng-rus
	* OPUS-openoffice-v3-eng_GB-rus
	* OPUS-opensubtitles-v2024-eng-rus
	* OPUS-php-v1-eng-rus
	* OPUS-paracrawl-v9-eng-rus
	* OPUS-qed-v2.0a-eng-rus
	* OPUS-ted2013-v1.1-eng-rus
	* OPUS-ted2020-v1-eng-rus
	* OPUS-tanzil-v1-eng-rus
	* OPUS-tatoeba-v2-eng-rus
	* OPUS-tatoeba-v20190709-eng-rus
	* OPUS-tatoeba-v20200531-eng-rus
	* OPUS-tatoeba-v20201109-eng-rus
	* OPUS-tatoeba-v20210310-eng-rus
	* OPUS-tatoeba-v20210722-eng-rus
	* OPUS-tatoeba-v20220303-eng-rus
	* OPUS-tatoeba-v20230412-eng-rus
	* OPUS-tildemodel-v2018-eng-rus
	* OPUS-unpc-v1.0-eng-rus
	* OPUS-ubuntu-v14.10-eng-rus
	* OPUS-wmt_news-v2014-eng-rus
	* OPUS-wmt_news-v2019-eng-rus
	* OPUS-wikimatrix-v1-eng-rus
	* OPUS-wikipedia-v1.0-eng-rus
	* OPUS-ada83-v1-eng-rus
	* OPUS-bible_uedin-v1-eng-rus
	* OPUS-infopankki-v1-eng-rus
	* OPUS-tico_19-v20201028-eng-rus
	* OPUS-tldr_pages-v20230829-eng-rus
	* OPUS-wikimedia-v20230407-eng-rus
	* Google-wmt24pp-1-eng-rus_RU