#!/usr/bin/env bash

# Select the pure-Python protobuf backend. The variable is exported, so every
# Python process started from this script inherits it.
# NOTE(review): presumably a workaround for a problem with the compiled
# protobuf backend -- confirm before removing.
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

# Abort on the first failing command (-e), on use of an unset variable (-u),
# and when any command of a pipeline fails (-o pipefail).
set -euo pipefail

# ---------------------------------------------------------------------------
# Default settings.
# NOTE(review): shared/parse_options.sh is sourced right after this section;
# presumably it lets "--name value" arguments override these variables --
# confirm against that file.
# ---------------------------------------------------------------------------

# Number of parallel jobs, forwarded as "-j" to "lhotse prepare aishell2".
nj=30

# Stage N is executed when stage <= N <= stop_stage.
stage=0
stop_stage=7

# Forwarded as "--perturb-speed" to local/compute_fbank_aishell2.py.
perturb_speed=true

# Directory holding the raw corpora:
#   - $dl_dir/musan     is downloaded by stage 0 when it does not exist;
#   - $dl_dir/aishell2  is read by stage 1.
dl_dir="$PWD/download"
|
|
# Source the shared option parser; give up if it is missing or fails.
# The path is relative, as are the ./local and ./shared helpers used further
# down, so the script has to be started from the directory containing them.
# NOTE(review): presumably this maps "--name value" command-line arguments
# onto the variables defined above -- confirm against shared/parse_options.sh.
. shared/parse_options.sh || exit 1

# Manifests, features, lang directories and LMs are all written below ./data.
mkdir -p data
|
|
#######################################
# Print a timestamped message tagged with the caller's location.
# Arguments: $* - message text (echo -e expands backslash escapes in it)
# Outputs:   one line on stdout, formatted as
#            "YYYY-mm-dd HH:MM:SS (file:line:function) message"
#######################################
log() {
  # Index 1 of BASH_SOURCE / FUNCNAME refers to the caller of log();
  # BASH_LINENO[0] is the line in that caller where log() was invoked.
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
|
|
log "dl_dir: $dl_dir"

# Stage 0: fetch the corpora that can be downloaded automatically.
if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ]; then
  log "stage 0: Download data"

  # Only musan is downloaded here. aishell2 is not fetched by this script:
  # stage 1 reads it from $dl_dir/aishell2, so it has to be placed there
  # beforehand.
  #
  # The download is skipped when $dl_dir/musan already exists.
  if [ ! -d "$dl_dir/musan" ]; then
    lhotse download musan "$dl_dir"
  fi
fi
|
|
# Stage 1: build the lhotse manifests for aishell2.
if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Prepare aishell2 manifest"
  # Reads the corpus from $dl_dir/aishell2 and writes to data/manifests.
  # The hidden .done marker makes the stage a no-op on later runs; delete it
  # to force the manifests to be rebuilt.
  if [ ! -f data/manifests/.aishell2_manifests.done ]; then
    mkdir -p data/manifests
    lhotse prepare aishell2 "$dl_dir/aishell2" data/manifests -j "$nj"
    touch data/manifests/.aishell2_manifests.done
  fi
fi
|
|
# Stage 2: build the lhotse manifests for musan.
if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  log "Stage 2: Prepare musan manifest"
  # Reads the corpus from $dl_dir/musan and writes to data/manifests.
  # Guarded by a .done marker so the work is done only once.
  if [ ! -f data/manifests/.musan_manifests.done ]; then
    log "It may take 6 minutes"
    mkdir -p data/manifests
    lhotse prepare musan "$dl_dir/musan" data/manifests
    touch data/manifests/.musan_manifests.done
  fi
fi
|
|
# Stage 3: compute fbank features for aishell2.
if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
  log "Stage 3: Compute fbank for aishell2"
  # Guarded by a .done marker under data/fbank so it runs only once.
  if [ ! -f data/fbank/.aishell2.done ]; then
    mkdir -p data/fbank
    ./local/compute_fbank_aishell2.py --perturb-speed "${perturb_speed}"
    touch data/fbank/.aishell2.done
  fi
fi
|
|
# Stage 30: compute whisper-style fbank features for aishell2.
# Because the stage number is 30, it only runs when stop_stage is at least 30.
# Number of mel bins forwarded as --num-mel-bins to the feature extractor.
whisper_mel_bins=80
if [ "$stage" -le 30 ] && [ "$stop_stage" -ge 30 ]; then
  log "Stage 30: Compute whisper fbank for aishell2"
  # Uses its own .done marker, independent of the one written by stage 3.
  if [ ! -f data/fbank/.aishell2.whisper.done ]; then
    mkdir -p data/fbank
    ./local/compute_fbank_aishell2.py \
      --perturb-speed "${perturb_speed}" \
      --num-mel-bins "${whisper_mel_bins}" \
      --whisper-fbank true
    touch data/fbank/.aishell2.whisper.done
  fi
fi
|
|
# Stage 4: compute fbank features for musan.
if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
  log "Stage 4: Compute fbank for musan"
  # NOTE(review): the marker is spelled ".msuan.done" (sic). The spelling is
  # kept unchanged so data directories prepared by earlier runs are still
  # recognised as done; renaming it would trigger a recomputation.
  if [ ! -f data/fbank/.msuan.done ]; then
    mkdir -p data/fbank
    ./local/compute_fbank_musan.py
    touch data/fbank/.msuan.done
  fi
fi
|
|
# Stage 5: build the character-based lang directory.
# lang_char_dir is assigned outside the stage guard because stages 6 and 7
# use it as well.
lang_char_dir=data/lang_char
if [ "$stage" -le 5 ] && [ "$stop_stage" -ge 5 ]; then
  log "Stage 5: Prepare char based lang"
  mkdir -p "$lang_char_dir"

  # Pull the "text" field out of the training supervisions, strip the double
  # quotes left by jq, and feed the result to local/text2token.py.
  # The output goes to a temporary file that is renamed only after the whole
  # pipeline succeeded (the script runs with "set -e" and "pipefail"), so an
  # interrupted run cannot leave a truncated "text" that later runs would
  # mistake for a finished one.
  if [ ! -f "$lang_char_dir/text" ]; then
    gunzip -c data/manifests/aishell2_supervisions_train.jsonl.gz \
      | jq '.text' | sed 's/"//g' \
      | ./local/text2token.py -t "char" > "$lang_char_dir/text.tmp"
    mv -- "$lang_char_dir/text.tmp" "$lang_char_dir/text"
  fi

  # Produce the word-segmented version of the text.
  if [ ! -f "$lang_char_dir/text_words_segmentation" ]; then
    python3 ./local/text2segments.py \
      --input-file "$lang_char_dir/text" \
      --output-file "$lang_char_dir/text_words_segmentation"
  fi

  # Unique word list: one token per line, sorted, empty lines removed.
  # This file is regenerated on every run of the stage (there is no guard).
  tr ' ' '\n' < "$lang_char_dir/text_words_segmentation" \
    | sort -u | sed '/^$/d' > "$lang_char_dir/words_no_ids.txt"

  # Build words.txt from the unique word list.
  if [ ! -f "$lang_char_dir/words.txt" ]; then
    python3 ./local/prepare_words.py \
      --input-file "$lang_char_dir/words_no_ids.txt" \
      --output-file "$lang_char_dir/words.txt"
  fi

  # NOTE(review): prepare_char.py is invoked without arguments, so it
  # presumably works on its own default lang directory -- confirm that it
  # matches $lang_char_dir, where L_disambig.pt is expected.
  if [ ! -f "$lang_char_dir/L_disambig.pt" ]; then
    python3 ./local/prepare_char.py
  fi
fi
|
|
# Stage 6: build the 3-gram language model G.
if [ "$stage" -le 6 ] && [ "$stop_stage" -ge 6 ]; then
  log "Stage 6: Prepare G"

  # Train an order-3 ARPA LM on the word-segmented text from stage 5.
  # The single-dash long options are what the original invocation uses.
  if [ ! -f "${lang_char_dir}/3-gram.unpruned.arpa" ]; then
    ./shared/make_kn_lm.py \
      -ngram-order 3 \
      -text "$lang_char_dir/text_words_segmentation" \
      -lm "$lang_char_dir/3-gram.unpruned.arpa"
  fi

  mkdir -p data/lm
  if [ ! -f data/lm/G_3_gram.fst.txt ]; then
    # Convert the ARPA LM to a text FST. kaldilm writes to stdout, so the
    # output is redirected to a temporary file and renamed only on success
    # (the script runs with "set -e"); a failed conversion therefore cannot
    # leave an empty or partial G_3_gram.fst.txt that later runs would skip.
    python3 -m kaldilm \
      --read-symbol-table="$lang_char_dir/words.txt" \
      --disambig-symbol='#0' \
      --max-order=3 \
      "$lang_char_dir/3-gram.unpruned.arpa" > data/lm/G_3_gram.fst.txt.tmp
    mv -- data/lm/G_3_gram.fst.txt.tmp data/lm/G_3_gram.fst.txt
  fi
fi
|
|
# Stage 7: compile LG for the char-based lang directory.
# There is no .done marker here, so the compilation is repeated every time
# the stage is selected.
if [ "$stage" -le 7 ] && [ "$stop_stage" -ge 7 ]; then
  log "Stage 7: Compile LG"
  ./local/compile_lg.py --lang-dir "$lang_char_dir"
fi
|
|