#!/bin/bash
set -e

# Languages to train on
#LANGUAGES_WIKIPEDIA=( "es" "af" "ar" "arz" "as" "bn" "fr" "sw" "eu" "ca" "zh" "en" "hi" "ur" "id" "pt" "vi" "gu" "kn" "ml" "mr" "ta" "te" "yo" )
#LANGUAGES_OSCAR=( "es" "af" "ar" "arz" "as" "bn" "fr" "sw" "eu" "ca" "zh" "en" "hi" "ur" "id" "pt" "vi" "gu" "kn" "ml" "mr" "te" )
LANGUAGES_WIKIPEDIA=( "es" "no" "nn" "is" "da" "en" "fr" "de" "sv" "no-da-en-sv-nn-is" "no-nn" )
LANGUAGES_OSCAR=( "es" "no" "nn" "is" "da" "en" "fr" "de" "sv" "no-da-en-sv-nn-is" "no-nn" )
MODEL_TYPES=("bpe" "unigram")
NDOC_FOR_LM=1000000
VOCAB_SIZES=(16000 32000 64000)  # 65536
SMALL_VOCAB_SIZE=16000
EXTRA_IDS=100

# Normalization parameters
SKIP_KENLM=True
REMOVE_ACCENTS=False
LOWER_CASE=False
NORMALIZE_NUMBERS=True
NORMALIZE_PUNCT=1

# OSCAR
NDOC_FOR_LM_OSCAR=1000000

train_language_and_dataset () {
    local lang=$1
    local dataset=$2
    local vocab_size=$3
    local vocab_ndoc=$4
    local model_type=$5

    local model_extra_ids=""
    # Build T5-style sentinel tokens (<extra_id_0> ... <extra_id_N-1>) to pass
    # to spm_train as user-defined symbols.
    local extra_ids=$(python -c "print('--user_defined_symbols='+','.join([f'<extra_id_{i}>' for i in range($EXTRA_IDS)]))")
    if [ "$EXTRA_IDS" -eq 0 ]; then
        model_extra_ids=""
    else
        model_extra_ids=".${EXTRA_IDS}extra"
    fi

    # Hyphenated "languages" are combinations: train each sublanguage first,
    # then build a shuffled concatenation of their openings for the combined model.
    if [[ "$lang" == *"-"* ]]; then
        echo "Set of languages: ${lang}"
        for sublang in $(echo $lang | tr "-" "\n"); do
            train_language_and_dataset "$sublang" "$dataset" "$vocab_size" "$vocab_ndoc" "$model_type"
        done
        if [ -f "data/${dataset}/cirrus/gz/${lang}.opening.txt" ]; then
            echo "${dataset} openings were already extracted for ${lang}"
        else
            touch "data/${dataset}/cirrus/gz/${lang}.json.gz"
            touch "data/${dataset}/cirrus/gz/${lang}.opening.tmp"
            echo "Combining and shuffling languages: ${lang}"
            for sublang in $(echo $lang | tr "-" "\n"); do
                cat "data/${dataset}/cirrus/gz/${sublang}.opening.txt" >> "data/${dataset}/cirrus/gz/${lang}.opening.tmp"
            done
            shuf "data/${dataset}/cirrus/gz/${lang}.opening.tmp" -o "data/${dataset}/cirrus/gz/${lang}.opening.txt"
            rm "data/${dataset}/cirrus/gz/${lang}.opening.tmp"
        fi
    fi

    if [ "$dataset" = "wikipedia" ]; then
        # 1 Download Wikipedia cirrus
        if [ -f "data/${dataset}/cirrus/gz/${lang}.json.gz" ]; then
            echo "${lang} Wikipedia cirrus was already downloaded."
        else
            echo "Downloading ${lang}"
            mkdir -p "data/${dataset}/cirrus/gz/"
            python cc_net/get_wiki_cirrus.py dl --lang "${lang}" --output_dir "data/${dataset}/cirrus/gz" --date 20220418
            echo "Downloaded Wikipedia cirrus for ${lang}"
        fi

        # 2 Extract opening text of each article
        if [ -f "data/${dataset}/cirrus/gz/${lang}.opening.txt" ]; then
            echo "Wikipedia openings were already extracted for ${lang}"
        else
            echo "Extracting ${lang}"
            python cc_net/get_wiki_cirrus.py opening \
                --n_docs ${NDOC_FOR_LM} \
                --file "data/${dataset}/cirrus/gz/${lang}.json.gz" \
                --output "data/${dataset}/cirrus/gz/${lang}.opening.txt" \
                --accent ${REMOVE_ACCENTS} \
                --case ${LOWER_CASE} \
                --numbers ${NORMALIZE_NUMBERS} \
                --punct ${NORMALIZE_PUNCT}
        fi
    else
        # 1 & 2 Download and preprocess dataset from HF hub
        if [ -f "data/${dataset}/cirrus/gz/${lang}.opening.txt" ]; then
            echo "OSCAR openings were already extracted for ${lang}"
        else
            echo "Downloading OSCAR ${lang}"
            mkdir -p "data/${dataset}/cirrus/gz/"
            python cc_net/get_hf_dataset.py dl \
                --dataset "${dataset}" \
                --output_file "data/${dataset}/cirrus/gz/${lang}.opening.txt" \
                --name "unshuffled_deduplicated_${lang}" \
                --split "train" \
                --max_docs $NDOC_FOR_LM_OSCAR
        fi
    fi
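    # Step 3 below trains the SentencePiece tokenizer. As a minimal sketch of
    # how the resulting model could be loaded afterwards, assuming the
    # `sentencepiece` Python package is installed (the path is illustrative;
    # it follows the naming scheme built below):
    #
    #   import sentencepiece as spm
    #   sp = spm.SentencePieceProcessor(model_file="data/wikipedia/lm_sp/en_32000_bpe.100extra.sp.model")
    #   print(sp.encode("This is a test", out_type=str))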
else echo "Downloading ${lang}" mkdir -p "data/${dataset}/cirrus/gz/" python cc_net/get_wiki_cirrus.py dl --lang "${lang}" --output_dir "data/${dataset}/cirrus/gz" --date 20220418 echo "Downloaded Wikipedia cirrus for ${lang}" fi # 2 Extract opening text of each article if [ -f "data/${dataset}/cirrus/gz/${lang}.opening.txt" ]; then echo "Wikipedia openings were already extracted for ${lang}" else echo "Extracting ${lang}" python cc_net/get_wiki_cirrus.py opening \ --n_docs ${NDOC_FOR_LM} \ --file "data/${dataset}/cirrus/gz/${lang}.json.gz" \ --output "data/${dataset}/cirrus/gz/${lang}.opening.txt" \ --accent ${REMOVE_ACCENTS} \ --case ${LOWER_CASE} \ --numbers ${NORMALIZE_NUMBERS} \ --punct ${NORMALIZE_PUNCT} fi else # 1 & 2 Download and preprocess dataset from HF hub if [ -f "data/${dataset}/cirrus/gz/${lang}.opening.txt" ]; then echo "OSCAR openings were already extracted for ${lang}" else echo "Downloading OSCAR ${lang}" mkdir -p "data/${dataset}/cirrus/gz/" python cc_net/get_hf_dataset.py dl \ --dataset "${dataset}" \ --output_file "data/${dataset}/cirrus/gz/${lang}.opening.txt" \ --name "unshuffled_deduplicated_${lang}" \ --split "train" \ --max_docs $NDOC_FOR_LM_OSCAR fi fi local model_name="${lang}_${vocab_size}_${model_type}${model_extra_ids}" # 3 Train sentence piece tokenizer if [ -f "data/${dataset}/lm_sp/${model_name}.sp.model" ]; then echo "Sentence piece tokenizer was already trained for ${model_name}" else echo "Training sentence piece tokenizer for ${lang}_${vocab_size}_${model_type}" mkdir -p "data/${dataset}/lm_sp" ./bin/spm_train --input="data/${dataset}/cirrus/gz/${lang}.opening.txt" \ --vocab_size=${vocab_size} --hard_vocab_limit \ --character_coverage=1.0 \ --model_type=${model_type} \ --bos_id=-1 --eos_id=1 --unk_id=2 --pad_id=0 \ --input_sentence_size=${vocab_ndoc} --shuffle_input_sentence=true \ --model_prefix="data/${dataset}/lm_sp/${model_name}.sp" ${extra_ids} \ || echo "WARNING: Corpus is too small, will train smaller model" #&& \ #./bin/spm_train --input="data/${dataset}/cirrus/gz/${lang}.opening.txt" \ # --vocab_size=${SMALL_VOCAB_SIZE} \ # --character_coverage=1.0 \ # --model_type=${model_type} \ # --bos_id=-1 --eos_id=1 --unk_id=2 --pad_id=0 \ # --model_prefix="data/${dataset}/lm_sp/${lang}_${vocab_size}.sp" echo "Trained SentencePiece model with $(wc -l data/"${dataset}"/lm_sp/"${lang}"_"${vocab_size}"_"${model_type}${model_extra_ids}".sp.vocab) pieces" fi if [ "$SKIP_KENLM" = "False" ]; then # 4 Tokenize openings dataset if [ -f "data/${dataset}/cirrus/sp/${lang}.opening.txt" ]; then echo "Openings dataset already tokenized for ${model_name}" else mkdir -p "data/${dataset}/cirrus/sp" echo "Tokenizing openings dataset for ${model_name}" ./bin/spm_encode \ --model="data/${dataset}/lm_sp/${model_name}.sp.model" \ --output_format=piece \ "data/${dataset}/cirrus/gz/${lang}.opening.txt" > "data/${dataset}/cirrus/sp/${lang}.opening.txt" echo "Tokenized openings dataset for ${model_name}" fi # 5 Train KenLM model on tokenized dataset if [ -f "data/${dataset}/lm_sp/${model_name}.arpa" ] || [ -f "data/${dataset}/lm_sp/${model_name}.arpa.bin" ]; then echo "KenLM model already trained for ${model_name}" else echo "Training KenLM model for ${model_name}" mkdir -p tmp ./bin/lmplz -o 5 -S 8G -T tmp --vocab_estimate ${vocab_size} --discount_fallback \ < "data/${dataset}/cirrus/sp/${lang}.opening.txt" > "data/${dataset}/lm_sp/${model_name}.arpa" echo "Trained KenLM model for ${model_name}" fi if [ -f "data/${dataset}/lm_sp/${model_name}_untokenized.arpa" ] ; then echo 
"KenLM model already trained for ${model_name}_untokenized" else echo "Training KenLM model for ${model_name}_untokenized" mkdir -p tmp ./bin/lmplz -o 5 -S 8G -T tmp --vocab_estimate ${vocab_size} --discount_fallback --skip_symbols \ < "data/${dataset}/cirrus/gz/${lang}.opening.txt" > "data/${dataset}/lm_sp/${model_name}_untokenized.arpa" echo "Trained KenLM model for ${model_name}_untokenized" fi # 6 Convert KenLM model to binary if [ -f "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}.arpa.bin" ]; then echo "KenLM model already converted to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}" else echo "Converting KenLM model to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}" ./bin/build_binary "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}.arpa" "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}.arpa.bin" echo "Converted KenLM model to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}" rm "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}.arpa" fi if [ -f "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized.arpa.bin" ]; then echo "KenLM model already converted to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized" else echo "Converting KenLM model to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized" ./bin/build_binary "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized.arpa" "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized.arpa.bin" echo "Converted KenLM model to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized" # rm "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}_untokenized.arpa" fi fi } for model_type in "${MODEL_TYPES[@]}" do for vocab_size in "${VOCAB_SIZES[@]}" do echo -e "\n--------------------\nVocab: ${vocab_size}. Model: ${model_type}\n--------------------\n" for lang in "${LANGUAGES_WIKIPEDIA[@]}" do train_language_and_dataset "$lang" wikipedia "$vocab_size" "$NDOC_FOR_LM" "$model_type" done for lang in "${LANGUAGES_OSCAR[@]}" do train_language_and_dataset "$lang" oscar "$vocab_size" "$NDOC_FOR_LM_OSCAR" "$model_type" done done done