#!/bin/bash
set -e
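
# Build SentencePiece tokenizers (and, when SKIP_KENLM=False, KenLM 5-gram
# language models) from Wikipedia and OSCAR document openings, for every
# combination of language, vocabulary size, and SentencePiece model type
# configured below.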
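
# Languages to process. Hyphen-joined entries (e.g. "no-da-en-sv-nn-is") denote
# combined corpora: each sub-language is processed on its own first, then their
# openings are concatenated and shuffled into one training file.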
LANGUAGES_WIKIPEDIA=( "es" "no" "nn" "is" "da" "en" "fr" "de" "sv" "no-da-en-sv-nn-is" "no-nn" )
LANGUAGES_OSCAR=( "es" "no" "nn" "is" "da" "en" "fr" "de" "sv" "no-da-en-sv-nn-is" "no-nn" )
MODEL_TYPES=("bpe" "unigram")
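
# Corpus and vocabulary parameters: NDOC_FOR_LM caps the number of Wikipedia
# opening documents per language; VOCAB_SIZES is the sweep of tokenizer
# vocabulary sizes; SMALL_VOCAB_SIZE is the fallback when a corpus is too small
# for the requested size; EXTRA_IDS appends T5-style <extra_id_N> sentinel
# tokens to the vocabulary.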
NDOC_FOR_LM=1000000
VOCAB_SIZES=(16000 32000 64000)
SMALL_VOCAB_SIZE=16000
EXTRA_IDS=100
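
# Text-normalization switches forwarded to the Wikipedia opening extraction,
# plus a master switch that gates the KenLM stage at the end of the pipeline.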
SKIP_KENLM=True
REMOVE_ACCENTS=False
LOWER_CASE=False
NORMALIZE_NUMBERS=True
NORMALIZE_PUNCT=1
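
# Number of OSCAR documents to download per language.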
NDOC_FOR_LM_OSCAR=1000000
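
# Train a tokenizer (and optional KenLM models) for one language on one dataset.
# Arguments: lang, dataset (wikipedia|oscar), vocab_size,
#            vocab_ndoc (sentences sampled by spm_train), model_type (bpe|unigram).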
train_language_and_dataset () {
    local lang=$1
    local dataset=$2
    local vocab_size=$3
    local vocab_ndoc=$4
    local model_type=$5
    local model_extra_ids=""
    local extra_ids
    extra_ids=$(python -c "print('--user_defined_symbols='+','.join([f'<extra_id_{i}>' for i in range($EXTRA_IDS)]))")
    if [ "$EXTRA_IDS" -eq 0 ]; then
        extra_ids=""
        model_extra_ids=""
    else
        model_extra_ids=".${EXTRA_IDS}extra"
    fi
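
    # Hyphen-joined entries are multilingual sets: recurse into each
    # sub-language first, then concatenate and shuffle their openings into one
    # combined corpus. The empty .json.gz created by touch acts as a marker so
    # the Wikipedia download check below skips the combined "language".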
| if [[ "$lang" == *"-"* ]]; then |
| echo "Set of languages: ${lang}" |
| for sublang in $(echo $lang | tr "-" "\n") |
| do |
| train_language_and_dataset "$sublang" "$dataset" "$vocab_size" "$vocab_ndoc" "$model_type" |
| done |
| if [ -f "data/${dataset}/cirrus/gz/${lang}.opening.txt" ]; then |
| echo "${dataset} openings were alerady extracted for ${lang}" |
| else |
| touch "data/${dataset}/cirrus/gz/${lang}.json.gz" |
| touch "data/${dataset}/cirrus/gz/${lang}.opening.tmp" |
| echo "Combining and shuffling languages: ${lang}" |
| for sublang in $(echo $lang | tr "-" "\n") |
| do |
| cat "data/${dataset}/cirrus/gz/${sublang}.opening.txt" >> "data/${dataset}/cirrus/gz/${lang}.opening.tmp" |
| done |
| shuf "data/${dataset}/cirrus/gz/${lang}.opening.tmp" -o "data/${dataset}/cirrus/gz/${lang}.opening.txt" |
| rm "data/${dataset}/cirrus/gz/${lang}.opening.tmp" |
| fi |
| fi |
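
    # Build the opening corpus: download the Wikipedia cirrus dump and extract
    # document openings, or fetch openings directly from OSCAR below.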
| if [ "$dataset" = "wikipedia" ]; then |
| |
| if [ -f "data/${dataset}/cirrus/gz/${lang}.json.gz" ]; then |
| echo "${lang} Wikipedia cirrus was already downloaded." |
| else |
| echo "Downloading ${lang}" |
| mkdir -p "data/${dataset}/cirrus/gz/" |
| python cc_net/get_wiki_cirrus.py dl --lang "${lang}" --output_dir "data/${dataset}/cirrus/gz" --date 20220418 |
| echo "Downloaded Wikipedia cirrus for ${lang}" |
| fi |
|
|
| |
| if [ -f "data/${dataset}/cirrus/gz/${lang}.opening.txt" ]; then |
| echo "Wikipedia openings were already extracted for ${lang}" |
| else |
| echo "Extracting ${lang}" |
| python cc_net/get_wiki_cirrus.py opening \ |
| --n_docs ${NDOC_FOR_LM} \ |
| --file "data/${dataset}/cirrus/gz/${lang}.json.gz" \ |
| --output "data/${dataset}/cirrus/gz/${lang}.opening.txt" \ |
| --accent ${REMOVE_ACCENTS} \ |
| --case ${LOWER_CASE} \ |
| --numbers ${NORMALIZE_NUMBERS} \ |
| --punct ${NORMALIZE_PUNCT} |
| fi |
    else
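        # OSCAR: cc_net/get_hf_dataset.py downloads the HuggingFace split and
        # writes the openings file in one step, so no separate extraction is
        # needed.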
        if [ -f "data/${dataset}/cirrus/gz/${lang}.opening.txt" ]; then
            echo "OSCAR openings were already extracted for ${lang}"
        else
            echo "Downloading OSCAR ${lang}"
            mkdir -p "data/${dataset}/cirrus/gz/"
            python cc_net/get_hf_dataset.py dl \
                --dataset "${dataset}" \
                --output_file "data/${dataset}/cirrus/gz/${lang}.opening.txt" \
                --name "unshuffled_deduplicated_${lang}" \
                --split "train" \
                --max_docs ${NDOC_FOR_LM_OSCAR}
        fi
    fi
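
    # Train the SentencePiece tokenizer on the openings. If spm_train fails
    # because the corpus cannot support the requested vocabulary, retry with a
    # smaller one. SMALL_VOCAB_SIZE is otherwise unused, so it is assumed here
    # to be the intended fallback vocabulary; the retry mirrors the flags of
    # the main call.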
    local model_name="${lang}_${vocab_size}_${model_type}${model_extra_ids}"

    if [ -f "data/${dataset}/lm_sp/${model_name}.sp.model" ]; then
        echo "SentencePiece tokenizer was already trained for ${model_name}"
    else
        echo "Training SentencePiece tokenizer for ${model_name}"
        mkdir -p "data/${dataset}/lm_sp"
        ./bin/spm_train --input="data/${dataset}/cirrus/gz/${lang}.opening.txt" \
            --vocab_size=${vocab_size} --hard_vocab_limit \
            --character_coverage=1.0 \
            --model_type=${model_type} \
            --bos_id=-1 --eos_id=1 --unk_id=2 --pad_id=0 \
            --input_sentence_size=${vocab_ndoc} --shuffle_input_sentence=true \
            --model_prefix="data/${dataset}/lm_sp/${model_name}.sp" ${extra_ids} \
            || {
                echo "WARNING: Corpus is too small, will train smaller model"
                ./bin/spm_train --input="data/${dataset}/cirrus/gz/${lang}.opening.txt" \
                    --vocab_size=${SMALL_VOCAB_SIZE} \
                    --character_coverage=1.0 \
                    --model_type=${model_type} \
                    --bos_id=-1 --eos_id=1 --unk_id=2 --pad_id=0 \
                    --input_sentence_size=${vocab_ndoc} --shuffle_input_sentence=true \
                    --model_prefix="data/${dataset}/lm_sp/${model_name}.sp" ${extra_ids}
            }
        echo "Trained SentencePiece model with $(wc -l < "data/${dataset}/lm_sp/${model_name}.sp.vocab") pieces"
    fi
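
    # Optional KenLM stage: tokenize the openings with the tokenizer trained
    # above, train 5-gram models on both tokenized and raw text, and convert
    # the resulting ARPA files to KenLM's binary format.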
| if [ "$SKIP_KENLM" = "False" ]; then |
|
|
| |
| if [ -f "data/${dataset}/cirrus/sp/${lang}.opening.txt" ]; then |
| echo "Openings dataset already tokenized for ${model_name}" |
| else |
| mkdir -p "data/${dataset}/cirrus/sp" |
| echo "Tokenizing openings dataset for ${model_name}" |
| ./bin/spm_encode \ |
| --model="data/${dataset}/lm_sp/${model_name}.sp.model" \ |
| --output_format=piece \ |
| "data/${dataset}/cirrus/gz/${lang}.opening.txt" > "data/${dataset}/cirrus/sp/${lang}.opening.txt" |
| echo "Tokenized openings dataset for ${model_name}" |
| fi |
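
        # Train 5-gram KenLM models with an 8 GB memory budget. The
        # "untokenized" variant is trained on the raw openings; --skip_symbols
        # makes lmplz treat <s>, </s>, and <unk> in the input as whitespace
        # instead of failing.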
| if [ -f "data/${dataset}/lm_sp/${model_name}.arpa" ] || [ -f "data/${dataset}/lm_sp/${model_name}.arpa.bin" ]; then |
| echo "KenLM model already trained for ${model_name}" |
| else |
| echo "Training KenLM model for ${model_name}" |
| mkdir -p tmp |
| ./bin/lmplz -o 5 -S 8G -T tmp --vocab_estimate ${vocab_size} --discount_fallback \ |
| < "data/${dataset}/cirrus/sp/${lang}.opening.txt" > "data/${dataset}/lm_sp/${model_name}.arpa" |
| echo "Trained KenLM model for ${model_name}" |
| fi |
| if [ -f "data/${dataset}/lm_sp/${model_name}_untokenized.arpa" ] ; then |
| echo "KenLM model already trained for ${model_name}_untokenized" |
| else |
| echo "Training KenLM model for ${model_name}_untokenized" |
| mkdir -p tmp |
| ./bin/lmplz -o 5 -S 8G -T tmp --vocab_estimate ${vocab_size} --discount_fallback --skip_symbols \ |
| < "data/${dataset}/cirrus/gz/${lang}.opening.txt" > "data/${dataset}/lm_sp/${model_name}_untokenized.arpa" |
| echo "Trained KenLM model for ${model_name}_untokenized" |
| fi |
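
        # Binarize the ARPA models for faster loading, then delete the large
        # text-format originals.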
| if [ -f "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}.arpa.bin" ]; then |
| echo "KenLM model already converted to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}" |
| else |
| echo "Converting KenLM model to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}" |
| ./bin/build_binary "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}.arpa" "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}.arpa.bin" |
| echo "Converted KenLM model to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}" |
| rm "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}.arpa" |
| fi |
| if [ -f "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized.arpa.bin" ]; then |
| echo "KenLM model already converted to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized" |
| else |
| echo "Converting KenLM model to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized" |
| ./bin/build_binary "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized.arpa" "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized.arpa.bin" |
| echo "Converted KenLM model to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized" |
| |
| fi |
    fi
}
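
# Sweep every model type and vocabulary size over all Wikipedia and OSCAR
# languages.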
for model_type in "${MODEL_TYPES[@]}"
do
    for vocab_size in "${VOCAB_SIZES[@]}"
    do
        echo -e "\n--------------------\nVocab: ${vocab_size}. Model: ${model_type}\n--------------------\n"
        for lang in "${LANGUAGES_WIKIPEDIA[@]}"
        do
            train_language_and_dataset "$lang" wikipedia "$vocab_size" "$NDOC_FOR_LM" "$model_type"
        done

        for lang in "${LANGUAGES_OSCAR[@]}"
        do
            train_language_and_dataset "$lang" oscar "$vocab_size" "$NDOC_FOR_LM_OSCAR" "$model_type"
        done
    done
done