#!/usr/bin/env bash

# Strict mode: abort on a failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (-o pipefail).
set -euo pipefail

# Only the stages N with stage <= N <= stop_stage are executed; the defaults
# select every stage in this script.
stage=-1
stop_stage=100

# Not referenced by any stage visible in this part of the script.
# NOTE(review): presumably consumed by option parsing or later stages - confirm.
use_gss=false

# Root directory for the downloaded corpus.
dl_dir="${PWD}/download"

# Must come after the defaults above.
# NOTE(review): presumably lets command-line flags override them - confirm
# against shared/parse_options.sh.
source shared/parse_options.sh || exit 1

# Everything this script generates is written below data/.
mkdir -p data

# Defined after option parsing; used to name the BPE lang directory and passed
# to the BPE trainer.
vocab_size=500

#######################################
# Print a timestamped message tagged with the caller's location.
# Output format: "YYYY-mm-dd HH:MM:SS (file:line:function) message".
# Arguments: $* - message text; `echo -e` interprets backslash escapes in it.
# Outputs:   one line on stdout.
#######################################
log() {
  # The caller-frame entries are unset when there is no enclosing frame
  # (e.g. the script is fed to bash via -c or stdin). Default them to empty so
  # that `set -u` does not turn a log call into a fatal "unbound variable".
  local caller_file=${BASH_SOURCE[1]:-}
  local caller_func=${FUNCNAME[1]:-}
  local caller_line=${BASH_LINENO[0]:-}

  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${caller_file##*/}:${caller_line}:${caller_func}) $*"
}

log "dl_dir: $dl_dir"

# Stage 0: fetch the AMI corpus (IHM microphone setting) into
# $dl_dir/amicorpus using lhotse.
if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ]; then
  log "Stage 0: Download AMI IHM data (~70h)"

  # Paths are quoted because dl_dir defaults to a location under $PWD, which
  # may contain spaces.
  # NOTE(review): only the directory's existence is checked, so an interrupted
  # download would presumably be skipped on re-run - confirm how lhotse
  # handles partial downloads.
  if [ ! -d "$dl_dir/amicorpus" ]; then
    lhotse download ami --mic ihm "$dl_dir/amicorpus"
  fi
fi

# Stage 1: build lhotse manifests for the IHM recordings from the downloaded
# corpus and write them to data/manifests/.
if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Prepare AMI IHM manifests"

  mkdir -p data/manifests

  # The corpus path is quoted so that a dl_dir containing spaces is passed to
  # lhotse as a single argument.
  lhotse prepare ami \
    --mic ihm \
    --partition full-corpus-asr \
    --normalize-text kaldi \
    --max-words-per-segment 30 \
    "$dl_dir/amicorpus" data/manifests/
fi

# Stage 4: run the fbank feature extraction script, then publish the IHM
# training cuts under the name cuts_train_all.jsonl.gz.
if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
  log "Stage 4: Compute fbank features for AMI IHM"

  mkdir -p data/fbank
  # NOTE(review): presumably this script is what produces
  # data/manifests/cuts_train_ihm.jsonl.gz - confirm.
  python local/compute_fbank_ami.py

  log "Creating final IHM training cuts"
  # Decompress and recompress: the output holds the same cuts as the input.
  gzip -dc data/manifests/cuts_train_ihm.jsonl.gz \
    | gzip -c > data/manifests/cuts_train_all.jsonl.gz
fi

# Stage 6: extract the `text` field of every training supervision, one per
# line, into data/lm/transcript_words.txt.
if [ "$stage" -le 6 ] && [ "$stop_stage" -ge 6 ]; then
  log "Stage 6: Dump IHM transcripts for BPE"

  mkdir -p data/lm

  # -r makes jq emit the decoded string itself. Stripping the quotes from the
  # JSON-encoded form with sed left JSON backslash escapes (\" \\ \n ...) in
  # the transcript whenever the text contained such characters.
  gunzip -c data/manifests/ami-ihm_supervisions_train.jsonl.gz \
    | jq -r '.text' > data/lm/transcript_words.txt
fi

#######################################
# Write the word symbol table ("<word> <id>" per line).
# Layout: <eps>=0, !SIL=1, <UNK>=2, then every distinct word of the transcript
# in `sort -u` order starting at id 3, then <s>, </s> and #0 on the next three
# ids.
# Arguments: $1 - transcript file, one utterance per line
#            $2 - path of the words.txt file to (over)write
# Returns:   non-zero if the word extraction pipeline fails.
#######################################
write_words_table() {
  local transcript=$1
  local words_txt=$2
  local next_id

  printf '%s\n' '<eps> 0' '!SIL 1' '<UNK> 2' > "$words_txt"

  # A "word" is a maximal run of \w characters, so punctuation such as an
  # apostrophe splits a token in two.
  grep -o -E '\w+' "$transcript" \
    | sort -u \
    | awk '{print $0, NR+2}' >> "$words_txt" || return

  # Ids start at 0, so the next free id equals the current line count. The
  # arithmetic expansion also strips the padding that BSD/macOS `wc` prints.
  next_id=$(( $(wc -l < "$words_txt") ))
  printf '%s\n' \
    "<s> ${next_id}" \
    "</s> $((next_id + 1))" \
    "#0 $((next_id + 2))" >> "$words_txt"
}

# Stage 7: create data/lang_bpe_<vocab_size>: word table, BPE model, and the
# lang files produced by prepare_lang_bpe.py.
if [ "$stage" -le 7 ] && [ "$stop_stage" -ge 7 ]; then
  log "Stage 7: Prepare BPE-based lang"

  lang_dir=data/lang_bpe_${vocab_size}
  mkdir -p "$lang_dir"

  write_words_table data/lm/transcript_words.txt "$lang_dir/words.txt"

  ./local/train_bpe_model.py \
    --lang-dir "$lang_dir" \
    --vocab-size "$vocab_size" \
    --transcript data/lm/transcript_words.txt

  # Skipped when L_disambig.pt already exists in the lang directory.
  if [ ! -f "$lang_dir/L_disambig.pt" ]; then
    ./local/prepare_lang_bpe.py --lang-dir "$lang_dir"
  fi
fi