#!/usr/bin/env bash

# Exported so that every child process started by this script inherits it.
# The value "python" asks the protobuf Python package to use its pure-Python
# backend (see the protobuf documentation).
# NOTE(review): presumably a workaround for a protobuf version clash - confirm.
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

# Strict mode: abort on command failure (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (-o pipefail).
set -euo pipefail

# Defaults set before shared/parse_options.sh is sourced.
# NOTE(review): presumably each can be overridden on the command line
# through that option parser - confirm.

# Passed as `-j` to `lhotse prepare librispeech` in stage 1.
nj=15

# A stage N runs only when stage <= N <= stop_stage.
stage=-1
stop_stage=100

# Token forwarded as --otc-token to the lang preparation/validation scripts.
otc_token="<star>"

# Feature flavour; stage 2 accepts exactly "fbank" or "ssl".
feature_type="fbank"

# Directory layout used by the stages below.
dl_dir="${PWD}/download"
manifests_dir="data/manifests"
# NOTE(review): expanded here, before options are parsed. If feature_type is
# overridden later, feature_dir presumably keeps this default - confirm.
feature_dir="data/${feature_type}"
lang_dir="data/lang"
lm_dir="data/lm"

# Forwarded as --perturb-speed to local/compute_fbank_librispeech.py.
perturb_speed=false

# Source the shared option parser into the current shell; stop with status 1
# if it cannot be loaded or returns a failure.
# NOTE(review): presumably maps --some-name flags onto the some_name
# variables defined above - confirm against shared/parse_options.sh.
. shared/parse_options.sh || exit 1

# BPE vocabulary sizes. Stages 4 and 7 iterate over this array and work in
# data/lang_bpe_<size> for each entry.
vocab_sizes=(
  200
)

# All generated artefacts live under ./data; create it up front.
mkdir -p data

#######################################
# Print a timestamped message tagged with the caller's location.
# Arguments: $* - message text (echo -e interprets backslash escapes in it)
# Outputs:   "<YYYY-mm-dd HH:MM:SS> (<file>:<line>:<function>) <message>"
#            on stdout
#######################################
log() {
  # BASH_SOURCE[1] is the file that called log(); keep only its basename.
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

# Record the download directory in use at the start of the run.
log "dl_dir: ${dl_dir}"

# Stage -1: fetch the language-model files into ${dl_dir}/lm.
# A ".done" marker makes the download a one-time step.
if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
  log "Stage -1: Download LM"
  # dl_dir defaults to a path under $PWD, so quote it: the working directory
  # may contain spaces.
  mkdir -p "${dl_dir}/lm"
  if [ ! -e "${dl_dir}/lm/.done" ]; then
    ./local/download_lm.py --out-dir="${dl_dir}/lm"
    touch "${dl_dir}/lm/.done"
  fi
fi

# Stage 0: download LibriSpeech into ${dl_dir} unless
# ${dl_dir}/LibriSpeech/train-clean-100 is already present (for example
# because a pre-downloaded copy was placed or linked there).
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
  log "Stage 0: Download data"

  if [ ! -d "${dl_dir}/LibriSpeech/train-clean-100" ]; then
    # NOTE(review): --full presumably fetches every subset; stage 4 also
    # reads train-clean-360 and train-other-500 - confirm with lhotse docs.
    lhotse download librispeech --full "${dl_dir}"
  fi
fi

# Stage 1: build lhotse manifests for the dev/test sets and train-clean-100,
# reading from ${dl_dir}/LibriSpeech and writing to ${manifests_dir}.
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  log "Stage 1: Prepare LibriSpeech manifest"

  # Create the directory and keep the marker in ${manifests_dir} itself, so
  # they follow the location lhotse writes to rather than a hard-coded
  # data/manifests (identical with the default setting).
  mkdir -p "${manifests_dir}"
  if [ ! -e "${manifests_dir}/.librispeech.done" ]; then
    lhotse prepare librispeech -j "${nj}" \
      -p dev-clean \
      -p dev-other \
      -p test-clean \
      -p test-other \
      -p train-clean-100 "${dl_dir}/LibriSpeech" "${manifests_dir}"
    touch "${manifests_dir}/.librispeech.done"
  fi
fi

# Stage 2: compute features of the selected type into ${feature_dir}, then
# validate the resulting cut manifests. Each half is guarded by its own
# marker file inside ${feature_dir}.
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  log "Stage 2: Compute ${feature_type} feature for librispeech (train-clean-100)"
  mkdir -p "${feature_dir}"
  if [ ! -e "${feature_dir}/.librispeech.done" ]; then
    if [ "${feature_type}" = ssl ]; then
      ./local/compute_ssl_librispeech.py
    elif [ "${feature_type}" = fbank ]; then
      ./local/compute_fbank_librispeech.py --perturb-speed "${perturb_speed}"
    else
      log "Error: not supported --feature-type '${feature_type}'"
      exit 2
    fi

    # The marker must be the same path the guard above tests. Without the
    # "/" it landed next to the directory and features were recomputed on
    # every run.
    touch "${feature_dir}/.librispeech.done"
  fi

  if [ ! -e "${feature_dir}/.librispeech-validated.done" ]; then
    # Report the directory actually validated, not a fixed "data/ssl".
    log "Validating ${feature_dir} for LibriSpeech"
    parts=(
      train-clean-100
      test-clean
      test-other
      dev-clean
      dev-other
    )
    for part in "${parts[@]}"; do
      python3 ./local/validate_manifest.py \
        "${feature_dir}/librispeech_cuts_${part}.jsonl.gz"
    done
    touch "${feature_dir}/.librispeech-validated.done"
  fi
fi

# Stage 3: build ${lang_dir}/lexicon.txt and derive the word list from it.
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  log "Stage 3: Prepare words.txt"
  mkdir -p "${lang_dir}"

  # Prepend three special entries to the downloaded lexicon, then sort and
  # drop duplicate lines.
  (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
    cat - "${dl_dir}/lm/librispeech-lexicon.txt" |
    sort | uniq > "${lang_dir}/lexicon.txt"

  # NOTE(review): presumably writes ${lang_dir}/words.txt, which stages 4
  # and 6 read - confirm against local/get_words_from_lexicon.py.
  local/get_words_from_lexicon.py \
    --lang-dir "${lang_dir}" \
    --otc-token "${otc_token}"
fi

# Stage 4: for every vocabulary size, populate data/lang_bpe_<size>:
# copy words.txt, collect transcripts, train the BPE model, build the OTC
# lang files and validate the resulting lexicon.
if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
  log "Stage 4: Prepare BPE based lang"

  for vocab_size in "${vocab_sizes[@]}"; do
    bpe_lang_dir="data/lang_bpe_${vocab_size}"
    mkdir -p "${bpe_lang_dir}"

    cp "${lang_dir}/words.txt" "${bpe_lang_dir}"

    if [ ! -f "${bpe_lang_dir}/transcript_words.txt" ]; then
      log "Generate data for BPE training"
      # One path per line. As before, only a failure of the last find makes
      # this assignment (and hence the script) fail.
      files=$(
        find "${dl_dir}/LibriSpeech/train-clean-100" -name "*.trans.txt"
        find "${dl_dir}/LibriSpeech/train-clean-360" -name "*.trans.txt"
        find "${dl_dir}/LibriSpeech/train-other-500" -name "*.trans.txt"
      )
      # Read line by line so paths containing spaces stay intact. Drop the
      # first space-separated field of every transcript line. Write to a
      # temporary file first so an interrupted run cannot leave a partial
      # transcript that the -f guard above would accept.
      while IFS= read -r f; do
        [ -n "${f}" ] || continue
        cut -d " " -f 2- "${f}"
      done <<< "${files}" > "${bpe_lang_dir}/transcript_words.txt.tmp"
      mv "${bpe_lang_dir}/transcript_words.txt.tmp" \
        "${bpe_lang_dir}/transcript_words.txt"
    fi

    if [ ! -f "${bpe_lang_dir}/bpe.model" ]; then
      ./local/train_bpe_model.py \
        --lang-dir "${bpe_lang_dir}" \
        --vocab-size "${vocab_size}" \
        --transcript "${bpe_lang_dir}/transcript_words.txt"
    fi

    if [ ! -f "${bpe_lang_dir}/L_disambig.pt" ]; then
      ./local/prepare_otc_lang_bpe.py \
        --lang-dir "${bpe_lang_dir}" \
        --otc-token "${otc_token}"

      log "Validating ${bpe_lang_dir}/lexicon.txt"
      ./local/validate_bpe_lexicon.py \
        --lexicon "${bpe_lang_dir}/lexicon.txt" \
        --bpe-model "${bpe_lang_dir}/bpe.model" \
        --otc-token "${otc_token}"
    fi
  done
fi

# Stage 5: build the phone-based lang directory data/lang_phone.
if [ "${stage}" -le 5 ] && [ "${stop_stage}" -ge 5 ]; then
  log "Stage 5: Prepare phone based lang"
  # This reassigns the script-wide lang_dir; any later stage that runs in
  # the same invocation sees data/lang_phone.
  lang_dir="data/lang_phone"
  mkdir -p "${lang_dir}"

  if [ ! -f "${lang_dir}/lexicon.txt" ]; then
    # Same recipe as stage 3: special entries + downloaded lexicon, sorted
    # and de-duplicated. Written via a temporary file so a failed pipeline
    # cannot leave a truncated lexicon that the guard above would accept.
    (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
      cat - "${dl_dir}/lm/librispeech-lexicon.txt" |
      sort | uniq > "${lang_dir}/lexicon.txt.tmp"
    mv "${lang_dir}/lexicon.txt.tmp" "${lang_dir}/lexicon.txt"
  fi

  if [ ! -f "${lang_dir}/L_disambig.pt" ]; then
    ./local/prepare_otc_lang.py --lang-dir "${lang_dir}"
  fi
fi

# Stage 6: convert the downloaded ARPA language models into text FSTs
# (${lm_dir}/G_3_gram.fst.txt and ${lm_dir}/G_4_gram.fst.txt) using the
# kaldilm Python module, which must be installed.
# NOTE(review): ${lang_dir} is data/lang_phone when stage 5 ran in this
# invocation and the configured default otherwise, so the symbol table used
# here depends on the starting stage - confirm this is intended.
if [ "${stage}" -le 6 ] && [ "${stop_stage}" -ge 6 ]; then
  log "Stage 6: Prepare G"

  mkdir -p "${lm_dir}"
  if [ ! -f "${lm_dir}/G_3_gram.fst.txt" ]; then
    # Redirect to a temporary file and rename on success: a failed kaldilm
    # run must not leave an empty G file that the guard would accept later.
    python3 -m kaldilm \
      --read-symbol-table="${lang_dir}/words.txt" \
      --disambig-symbol='#0' \
      --max-order=3 \
      "${dl_dir}/lm/3-gram.pruned.1e-7.arpa" > "${lm_dir}/G_3_gram.fst.txt.tmp"
    mv "${lm_dir}/G_3_gram.fst.txt.tmp" "${lm_dir}/G_3_gram.fst.txt"
  fi

  if [ ! -f "${lm_dir}/G_4_gram.fst.txt" ]; then
    python3 -m kaldilm \
      --read-symbol-table="${lang_dir}/words.txt" \
      --disambig-symbol='#0' \
      --max-order=4 \
      "${dl_dir}/lm/4-gram.arpa" > "${lm_dir}/G_4_gram.fst.txt.tmp"
    mv "${lm_dir}/G_4_gram.fst.txt.tmp" "${lm_dir}/G_4_gram.fst.txt"
  fi
fi

# Stage 7: run local/compile_hlg.py once per BPE lang directory.
if [ "${stage}" -le 7 ] && [ "${stop_stage}" -ge 7 ]; then
  log "Stage 7: Compile HLG"

  for vocab_size in "${vocab_sizes[@]}"; do
    lang_dir="data/lang_bpe_${vocab_size}"
    echo "LM DIR: ${lm_dir}"
    # Pass the directory assigned just above. bpe_lang_dir is only set
    # inside stage 4, so it is unbound (fatal under `set -u`) when the
    # script is started at this stage.
    ./local/compile_hlg.py \
      --lm-dir "${lm_dir}" \
      --lang-dir "${lang_dir}"
  done
fi

# Stage 8: run local/compile_hlg.py for the phone-based lang directory.
if [ "${stage}" -le 8 ] && [ "${stop_stage}" -ge 8 ]; then
  # The label names this stage; it used to repeat "Stage 7".
  log "Stage 8: Compile HLG for phone based lang"

  lang_dir="data/lang_phone"
  echo "LM DIR: ${lm_dir}"
  ./local/compile_hlg.py \
    --lm-dir "${lm_dir}" \
    --lang-dir "${lang_dir}"
fi