# NOTE(review): the three lines below are upload-page residue (author/commit
# info), not shell code; commented out so the script parses. Ideally remove
# them entirely so the shebang sits on line 1.
# odg123's picture
# Upload icefall experiment results and logs
# d596074 verified
#!/usr/bin/env bash

# Data preparation for an AMI IHM-only ASR recipe (icefall/lhotse style):
# download the corpus, build manifests, compute fbank features, and prepare
# BPE language resources. Stages are selected via --stage / --stop-stage.

# Canonical strict mode (the original `-eou pipefail` spelling works in bash
# but is easy to misread; `-euo pipefail` is the conventional form).
set -euo pipefail

stage=-1
stop_stage=100

# GSS enhancement is only useful for multi-microphone setups; this recipe is
# IHM-only, so it stays disabled.
use_gss=false

# Root directory where the corpus is downloaded.
dl_dir=$PWD/download

# Parse --stage / --stop-stage / --use-gss etc. overrides from the command line.
. shared/parse_options.sh || exit 1

mkdir -p data

# BPE vocabulary size used when preparing the lang directory in stage 7.
vocab_size=500
# Print a timestamped log line prefixed with the caller's file, line number,
# and enclosing function, followed by all arguments.
log() {
  # Basename of the file the caller lives in.
  local caller_file
  caller_file=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${caller_file}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"
# -----------------------------
# Stage 0: Download AMI (IHM only)
# -----------------------------
if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ]; then
  log "Stage 0: Download AMI IHM data (~70h)"
  # Skip the download when the corpus directory already exists so the
  # script can be re-run without re-fetching ~70h of audio.
  # Quoting "$dl_dir" protects against spaces in $PWD.
  if [ ! -d "$dl_dir/amicorpus" ]; then
    lhotse download ami --mic ihm "$dl_dir/amicorpus"
  fi
fi
# -----------------------------
# Stage 1: Prepare AMI IHM manifests
# -----------------------------
if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Prepare AMI IHM manifests"
  mkdir -p data/manifests
  # full-corpus-asr partitioning with kaldi-style text normalization; long
  # supervisions are split so no segment exceeds 30 words.
  lhotse prepare ami \
    --mic ihm \
    --partition full-corpus-asr \
    --normalize-text kaldi \
    --max-words-per-segment 30 \
    "$dl_dir/amicorpus" data/manifests/
fi
# -----------------------------
# Stage 4: Compute fbank features (AMI IHM only)
# -----------------------------
if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
  log "Stage 4: Compute fbank features for AMI IHM"
  mkdir -p data/fbank
  python local/compute_fbank_ami.py
  log "Creating final IHM training cuts"
  # With GSS disabled the combined training set is just the IHM cuts.
  # Re-compressing (rather than cp) also verifies the source archive is
  # not truncated — a corrupt .gz fails loudly here under pipefail.
  gunzip -c data/manifests/cuts_train_ihm.jsonl.gz | \
    gzip -c > data/manifests/cuts_train_all.jsonl.gz
fi
# -----------------------------
# Stage 6: Dump transcripts for BPE training (IHM only)
# -----------------------------
if [ "$stage" -le 6 ] && [ "$stop_stage" -ge 6 ]; then
  log "Stage 6: Dump IHM transcripts for BPE"
  mkdir -p data/lm
  # `jq -r` emits the raw string value. The previous
  # `jq '.text' | sed 's:"::g'` stripped EVERY double quote, which also
  # mangled quotes embedded in the transcript text and left stray
  # backslashes from JSON escapes.
  gunzip -c data/manifests/ami-ihm_supervisions_train.jsonl.gz | \
    jq -r '.text' > data/lm/transcript_words.txt
fi
# -----------------------------
# Stage 7: Prepare BPE-based language resources
# -----------------------------
if [ "$stage" -le 7 ] && [ "$stop_stage" -ge 7 ]; then
  log "Stage 7: Prepare BPE-based lang"
  lang_dir=data/lang_bpe_${vocab_size}
  mkdir -p "$lang_dir"

  # words.txt layout: fixed symbols first (<eps>=0, !SIL=1, <UNK>=2), then
  # the unique corpus words numbered consecutively from 3 (awk's NR+2).
  echo "<eps> 0" > "$lang_dir"/words.txt
  echo "!SIL 1" >> "$lang_dir"/words.txt
  echo "<UNK> 2" >> "$lang_dir"/words.txt
  grep -o -E '\w+' data/lm/transcript_words.txt | sort -u | \
    awk '{print $0, NR+2}' >> "$lang_dir"/words.txt

  # Append <s>, </s>, #0. Since ids start at 0 and are consecutive, the
  # current line count is exactly the next free id.
  num_words=$(wc -l < "$lang_dir"/words.txt)
  echo "<s> ${num_words}" >> "$lang_dir"/words.txt
  num_words=$(wc -l < "$lang_dir"/words.txt)
  echo "</s> ${num_words}" >> "$lang_dir"/words.txt
  num_words=$(wc -l < "$lang_dir"/words.txt)
  echo "#0 ${num_words}" >> "$lang_dir"/words.txt

  ./local/train_bpe_model.py \
    --lang-dir "$lang_dir" \
    --vocab-size "$vocab_size" \
    --transcript data/lm/transcript_words.txt

  # Only build the lexicon FSTs once; L_disambig.pt marks completion.
  if [ ! -f "$lang_dir"/L_disambig.pt ]; then
    ./local/prepare_lang_bpe.py --lang-dir "$lang_dir"
  fi
fi