Upload folder using huggingface_hub
Browse files
- 94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/ctc-4gram.tar.gz +3 -0
- 94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/ctc-decoding.tar.gz +3 -0
- 94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/epoch-48-avg-47.pt +3 -0
- 94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/log/log-train-2025-09-21-00-29-19-0 +0 -0
- 94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/log/log-train-2025-09-21-00-29-19-1 +0 -0
- 94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/log/log-train-2025-09-21-00-29-19-2 +0 -0
- 94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/log/log-train-2025-09-21-00-29-19-3 +0 -0
- 94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/tensorboard/events.out.tfevents.1758385759.TENCENT64.site.70629.0 +3 -0
- 94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/ctc-4gram.tar.gz +3 -0
- 94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/ctc-decoding.tar.gz +3 -0
- 94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/epoch-248-avg-245.pt +3 -0
- 94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/log/log-train-2025-09-23-00-07-42 +0 -0
- 94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/tensorboard/events.out.tfevents.1758557262.TENCENT64.site.86523.0 +3 -0
- 94m-uni-v2-dual-domain-mvq/finetune_ctc_94m_ls1.sh +93 -0
- 94m-uni-v2-dual-domain-mvq/finetune_ctc_94m_ls10.sh +93 -0
- 94m-uni-v2-dual-domain-mvq/finetune_ctc_94m_ls100.sh +95 -0
- 94m-uni-v2-dual-domain-mvq/finetune_rnnt_94m_ls100.sh +89 -0
- 94m-uni-v2-dual-domain-mvq/iter-400000-avg-4.pt +3 -0
94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/ctc-4gram.tar.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3235199ea2bea61e9597c0b5a7a9273f724081adf7798e4f54c8872eacfcb82d
|
| 3 |
+
size 2867343
|
94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/ctc-decoding.tar.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ef31089a56b123ff51ae9f3a1f65ef3c0fd0d245d4d12a96e4ccc3c72201b65
|
| 3 |
+
size 574481494
|
94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/epoch-48-avg-47.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:28cb04d3e72d7fb5991f18641032102d620ea5d8432e4029f301f885dd9a890c
|
| 3 |
+
size 373547808
|
94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/log/log-train-2025-09-21-00-29-19-0
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/log/log-train-2025-09-21-00-29-19-1
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/log/log-train-2025-09-21-00-29-19-2
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/log/log-train-2025-09-21-00-29-19-3
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/tensorboard/events.out.tfevents.1758385759.TENCENT64.site.70629.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a25907eec1641774c45f94c78dfc9c0870ce951834a274ed48dc16b89330553
|
| 3 |
+
size 111765
|
94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/ctc-4gram.tar.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fa432cc690b9a7827f9c5dc020d54ba15d828d545c6b040a77aaabd1138c11d1
|
| 3 |
+
size 14398592
|
94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/ctc-decoding.tar.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8a03bd894e0902ada9c5c0d33fcb99f6db4d25afaba3f8cc46863916feec8290
|
| 3 |
+
size 3348536190
|
94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/epoch-248-avg-245.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e9f4da7c66cc866dfe226b1ed35b74e08c5358600451c5649a5c3268e7e47b68
|
| 3 |
+
size 373548866
|
94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/log/log-train-2025-09-23-00-07-42
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/tensorboard/events.out.tfevents.1758557262.TENCENT64.site.86523.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8cdca1724641336a9d5a13f4c17654e384a6a18b547d1fd010a26f5574151246
|
| 3 |
+
size 439928
|
94m-uni-v2-dual-domain-mvq/finetune_ctc_94m_ls1.sh
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
#
# CTC fine-tuning followed by a ctc-decoding sweep, run with --full-libri ls1.
#
# Usage: finetune_ctc_94m_ls1.sh <value for CUDA_VISIBLE_DEVICES>
#
# Stage 1 launches zipformer_finetune/finetune_asr.py through torchrun
# (one process), initialising encoder_embed and encoder from finetune_ckpt.
# Stage 2 runs zipformer_finetune/decode_ctc.py for every (epoch, avg) pair
# of the sweep defined further down.

export PYTHONPATH="/root/icefall:${PYTHONPATH}"
export CUDA_VISIBLE_DEVICES="$1"

# ---- data ------------------------------------------------------------------
use_librispeech=1
full_libri="ls1"

# ---- model / optimisation --------------------------------------------------
causal=0
lr=0.002
md=500

# ---- checkpoint used to initialise the fine-tuning run ----------------------
do_finetune=1
finetune_ckpt=/private_data2/94m-uni-v2-dual-domain-mvq/iter-400000-avg-4.pt

# ---- heads and output rate --------------------------------------------------
use_ctc=1
use_transducer=0
output_ds=2
post_output_ds=1

# ---- encoder freezing -------------------------------------------------------
freeze_encoder=0
freeze_encoder_steps=-1
encoder_lr_scale=0.005

exp_dir=zipformer_finetune/94m-uni-v2-dual-domain-mvq-exp_ft_ls1_char_ctc_ws1_md500_lr1e-5_baselr0.002_bf16

echo "${exp_dir}"

# Encoder architecture flags, passed verbatim to both training and decoding.
encoder_arch_args=(
  --downsampling-factor 1,2,4,8,4,2,1
  --num-encoder-layers 1,2,3,3,1,1,1
  --feedforward-dim 1536,1536,1536,1536,1536,1536,1536
  --encoder-dim 512,512,512,512,512,512,512
  --encoder-unmasked-dim 256,256,256,256,256,256,256
  --num-heads 8,8,8,8,8,8,8
  --cnn-module-kernel 31,31,15,15,15,31,31
)

# Output-rate and feature-extraction flags, also shared by both stages.
output_rate_args=(
  --output-downsampling-factor "${output_ds}"
  --post-encoder-downsampling-factor "${post_output_ds}"
  --on-the-fly-feats 1
)

# Stage switches: set to 0 to skip a stage.
run_training=1
run_decoding=1

if (( run_training )); then
  torchrun --nproc_per_node=1 --master_port=19193 \
    zipformer_finetune/finetune_asr.py \
    --num-epochs 8000 \
    --use-fp16 0 \
    --use-bf16 1 \
    --start-epoch 1 \
    --use-librispeech "${use_librispeech}" --full-libri "${full_libri}" \
    --exp-dir "${exp_dir}" \
    --manifest-dir data/fbank \
    --lang-dir data/lang_char \
    --base-lr "${lr}" \
    --use-ctc "${use_ctc}" --use-transducer "${use_transducer}" \
    --do-finetune "${do_finetune}" --init-modules "encoder_embed,encoder" --finetune-ckpt "${finetune_ckpt}" \
    --freeze-encoder "${freeze_encoder}" --freeze-encoder-steps "${freeze_encoder_steps}" \
    --encoder-lr-scale "${encoder_lr_scale}" \
    --causal "${causal}" \
    "${encoder_arch_args[@]}" \
    "${output_rate_args[@]}" \
    --max-duration "${md}"
fi

# start=$2
if (( run_decoding )); then
  decoding_methods=(ctc-decoding)
  for m in "${decoding_methods[@]}"; do
    # Epochs 400, 380, ..., 100.
    for (( epoch = 400; epoch >= 100; epoch -= 20 )); do
      # avg starts at epoch-1 and drops by 20 while it stays >= 50.
      for (( avg = epoch - 1; avg >= 50; avg -= 20 )); do
        python zipformer_finetune/decode_ctc.py \
          --epoch "${epoch}" \
          --avg "${avg}" \
          --manifest-dir data/fbank \
          --lang-dir data/lang_char \
          --use-averaged-model 1 \
          "${encoder_arch_args[@]}" \
          --use-ctc "${use_ctc}" --use-transducer "${use_transducer}" \
          "${output_rate_args[@]}" \
          --exp-dir "${exp_dir}" \
          --decoding-method "${m}" \
          --max-duration 2000
      done
    done
  done
fi

echo "Done"
# Started in the background after both stages have finished.
# NOTE(review): the purpose of ~/busygpu/run.py is not visible from this
# script - presumably it keeps the GPU occupied; confirm before relying on it.
python ~/busygpu/run.py &
|
94m-uni-v2-dual-domain-mvq/finetune_ctc_94m_ls10.sh
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
#
# CTC fine-tuning (currently switched off) and a ctc-decoding sweep, run with
# --full-libri ls10.
#
# Usage: finetune_ctc_94m_ls10.sh <CUDA_VISIBLE_DEVICES value> <start-epoch>
#
#   $1  exported as CUDA_VISIBLE_DEVICES for every child process
#   $2  newest epoch to decode; the sweep walks back over up to 30 epochs
#       (start-epoch .. start-epoch-29) and, for each epoch, over up to 20
#       --avg values (epoch-1 .. epoch-20)
#
# Exits with status 2 when the arguments are missing or <start-epoch> is not
# a positive integer. Previously a missing <start-epoch> made the sweep
# silently decode nothing and still print "Done".

usage() {
  printf 'Usage: %s <cuda-visible-devices> <start-epoch>\n' "${0##*/}" >&2
  exit 2
}

(( $# >= 2 )) || usage
if [[ ! "$2" =~ ^[1-9][0-9]*$ ]]; then
  printf 'error: <start-epoch> must be a positive integer, got "%s"\n' "$2" >&2
  usage
fi

export PYTHONPATH="/root/icefall:${PYTHONPATH:-}"
export CUDA_VISIBLE_DEVICES="$1"

# data related
use_librispeech=1
full_libri="ls10"

causal=0
lr=0.045

# finetune checkpoint
do_finetune=1
finetune_ckpt=/private_data2/94m-uni-v2-dual-domain-mvq/iter-400000-avg-4.pt

use_ctc=1
use_transducer=0
output_ds=2
post_output_ds=1

freeze_encoder=0
freeze_encoder_steps=-1
encoder_lr_scale=0.02222

md=1000

exp_dir=zipformer_finetune/94m-uni-v2-dual-domain-mvq-exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16

echo "${exp_dir}"

# Encoder architecture flags, passed verbatim to both training and decoding.
encoder_arch_args=(
  --downsampling-factor 1,2,4,8,4,2,1
  --num-encoder-layers 1,2,3,3,1,1,1
  --feedforward-dim 1536,1536,1536,1536,1536,1536,1536
  --encoder-dim 512,512,512,512,512,512,512
  --encoder-unmasked-dim 256,256,256,256,256,256,256
  --num-heads 8,8,8,8,8,8,8
  --cnn-module-kernel 31,31,15,15,15,31,31
)

# Output-rate and feature-extraction flags, also shared by both stages.
output_rate_args=(
  --output-downsampling-factor "${output_ds}"
  --post-encoder-downsampling-factor "${post_output_ds}"
  --on-the-fly-feats 1
)

# Stage switches. Training is disabled, so only decoding runs.
run_training=0
run_decoding=1

if (( run_training )); then
  torchrun --nproc_per_node=1 --master_port=19293 \
    zipformer_finetune/finetune_asr.py \
    --num-epochs 500 \
    --use-fp16 0 \
    --use-bf16 1 \
    --start-epoch 1 \
    --use-librispeech "${use_librispeech}" --full-libri "${full_libri}" \
    --exp-dir "${exp_dir}" \
    --manifest-dir data/fbank \
    --lang-dir data/lang_char \
    --base-lr "${lr}" \
    --use-ctc "${use_ctc}" --use-transducer "${use_transducer}" \
    --do-finetune "${do_finetune}" --init-modules "encoder_embed,encoder" --finetune-ckpt "${finetune_ckpt}" \
    --freeze-encoder "${freeze_encoder}" --freeze-encoder-steps "${freeze_encoder_steps}" \
    --encoder-lr-scale "${encoder_lr_scale}" \
    --causal "${causal}" \
    "${encoder_arch_args[@]}" \
    "${output_rate_args[@]}" \
    --max-duration "${md}"
fi

start=$2
if (( run_decoding )); then
  # Training starts at epoch 1, so never sweep below it. Without the clamp a
  # start epoch under 30 would request epochs <= 0.
  last_epoch=$(( start - 29 ))
  if (( last_epoch < 1 )); then
    last_epoch=1
  fi

  for m in ctc-decoding; do
    for (( epoch = start; epoch >= last_epoch; epoch-- )); do
      # Keep --avg at 1 or above; for epochs <= 20 the unclamped range
      # epoch-1 .. epoch-20 would reach zero or negative values.
      min_avg=$(( epoch - 20 ))
      if (( min_avg < 1 )); then
        min_avg=1
      fi

      for (( avg = epoch - 1; avg >= min_avg; avg-- )); do
        # A failed decode is reported but does not stop the sweep, matching
        # the previous best-effort behaviour.
        python zipformer_finetune/decode_ctc.py \
          --epoch "${epoch}" \
          --avg "${avg}" \
          --manifest-dir data/fbank \
          --lang-dir data/lang_char \
          --use-averaged-model 1 \
          "${encoder_arch_args[@]}" \
          --use-ctc "${use_ctc}" --use-transducer "${use_transducer}" \
          "${output_rate_args[@]}" \
          --exp-dir "${exp_dir}" \
          --decoding-method "${m}" \
          --max-duration 2000 \
          || printf 'warning: decoding failed for epoch=%s avg=%s method=%s\n' \
            "${epoch}" "${avg}" "${m}" >&2
      done
    done
  done
fi

echo "Done"
# Started in the background after the sweep has finished.
# NOTE(review): the purpose of ~/busygpu/run.py is not visible from this
# script - presumably it keeps the GPU occupied; confirm before relying on it.
python ~/busygpu/run.py &
|
94m-uni-v2-dual-domain-mvq/finetune_ctc_94m_ls100.sh
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
#
# CTC fine-tuning (currently switched off) and a ctc-decoding sweep for the
# exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16 experiment.
#
# Usage: finetune_ctc_94m_ls100.sh <CUDA_VISIBLE_DEVICES value> <start-epoch>
#
#   $1  exported as CUDA_VISIBLE_DEVICES for every child process
#   $2  newest epoch to decode; the sweep covers start-epoch .. start-epoch-10
#       and, for each epoch, --avg values epoch-1 .. 10
#
# Exits with status 2 when the arguments are missing or <start-epoch> is not
# a positive integer. Previously a missing <start-epoch> made the sweep
# silently decode nothing and still print "Done".

usage() {
  printf 'Usage: %s <cuda-visible-devices> <start-epoch>\n' "${0##*/}" >&2
  exit 2
}

(( $# >= 2 )) || usage
if [[ ! "$2" =~ ^[1-9][0-9]*$ ]]; then
  printf 'error: <start-epoch> must be a positive integer, got "%s"\n' "$2" >&2
  usage
fi

export PYTHONPATH="/root/icefall:${PYTHONPATH:-}"
# export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES="$1"

# data related
use_librispeech=1
full_libri=0

causal=0
lr=0.045

# finetune checkpoint
do_finetune=1
finetune_ckpt=/private_data2/94m-uni-v2-dual-domain-mvq/iter-400000-avg-4.pt

use_ctc=1
use_transducer=0
output_ds=2
post_output_ds=1

freeze_encoder=0
freeze_encoder_steps=-1
encoder_lr_scale=0.02222

md=1000

exp_dir=/private_data2/94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16

echo "${exp_dir}"

# Encoder architecture flags, passed verbatim to both training and decoding.
encoder_arch_args=(
  --downsampling-factor 1,2,4,8,4,2,1
  --num-encoder-layers 1,2,3,3,1,1,1
  --feedforward-dim 1536,1536,1536,1536,1536,1536,1536
  --encoder-dim 512,512,512,512,512,512,512
  --encoder-unmasked-dim 256,256,256,256,256,256,256
  --num-heads 8,8,8,8,8,8,8
  --cnn-module-kernel 31,31,15,15,15,31,31
)

# Output-rate and feature-extraction flags, also shared by both stages.
output_rate_args=(
  --output-downsampling-factor "${output_ds}"
  --post-encoder-downsampling-factor "${post_output_ds}"
  --on-the-fly-feats 1
)

# Stage switches. Training is disabled, so only decoding runs.
run_training=0
run_decoding=1

if (( run_training )); then
  torchrun --nproc_per_node=4 --master_port=19291 \
    zipformer_finetune/finetune_asr.py \
    --num-epochs 50 \
    --use-fp16 0 \
    --use-bf16 1 \
    --start-epoch 1 \
    --use-librispeech "${use_librispeech}" --full-libri "${full_libri}" \
    --exp-dir "${exp_dir}" \
    --manifest-dir data/fbank \
    --lang-dir data/lang_char \
    --base-lr "${lr}" \
    --use-ctc "${use_ctc}" --use-transducer "${use_transducer}" \
    --do-finetune "${do_finetune}" --init-modules "encoder_embed,encoder" --finetune-ckpt "${finetune_ckpt}" \
    --freeze-encoder "${freeze_encoder}" --freeze-encoder-steps "${freeze_encoder_steps}" \
    --encoder-lr-scale "${encoder_lr_scale}" \
    --causal "${causal}" \
    "${encoder_arch_args[@]}" \
    "${output_rate_args[@]}" \
    --max-duration "${md}"
fi

start=$2
if (( run_decoding )); then
  # --avg never goes below 10 and is always below the epoch, so an epoch of
  # 10 or less yields no decode run at all. Say so instead of finishing
  # silently.
  if (( start <= 10 )); then
    printf 'warning: start epoch %s is <= 10; no (epoch, avg) pair will be decoded\n' \
      "${start}" >&2
  fi

  for m in ctc-decoding; do
    for (( epoch = start; epoch >= start - 10; epoch-- )); do
      for (( avg = epoch - 1; avg >= 10; avg-- )); do
        # A failed decode is reported but does not stop the sweep, matching
        # the previous best-effort behaviour.
        python zipformer_finetune/decode_ctc.py \
          --epoch "${epoch}" \
          --avg "${avg}" \
          --manifest-dir data/fbank \
          --lang-dir data/lang_char \
          --use-averaged-model 1 \
          "${encoder_arch_args[@]}" \
          --use-ctc "${use_ctc}" --use-transducer "${use_transducer}" \
          "${output_rate_args[@]}" \
          --exp-dir "${exp_dir}" \
          --decoding-method "${m}" \
          --max-duration 2000 \
          || printf 'warning: decoding failed for epoch=%s avg=%s method=%s\n' \
            "${epoch}" "${avg}" "${m}" >&2
      done
    done
  done
fi

echo "Done"
# for i in {0..3}; do CUDA_VISIBLE_DEVICES=$i python ~/busygpu/run.py & done
# Started in the background after the sweep has finished.
# NOTE(review): the purpose of ~/busygpu/run.py is not visible from this
# script - presumably it keeps the GPU occupied; confirm before relying on it.
python ~/busygpu/run.py &
|
94m-uni-v2-dual-domain-mvq/finetune_rnnt_94m_ls100.sh
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
#
# Transducer fine-tuning followed by a decoding sweep.
#
# Takes no arguments. Stage 1 launches zipformer_finetune/finetune_asr.py
# through torchrun (two processes), initialising encoder_embed and encoder
# from finetune_ckpt. Stage 2 runs zipformer_finetune/decode.py at epoch 30
# with greedy_search and modified_beam_search for --avg 15 down to 10.

export PYTHONPATH="./../../../:${PYTHONPATH}"

# ---- data ------------------------------------------------------------------
use_librispeech=1
full_libri=0

# ---- model / optimisation --------------------------------------------------
causal=0
lr=0.045
md=1000

# ---- checkpoint used to initialise the fine-tuning run ----------------------
do_finetune=1
finetune_ckpt=zipformer_audio_encoder/exp-96M-uniform-v2-zipformer-out-ds-2-lh-large-giga-xl-voxpopuli-1-as-full-x2-all-audio-w2v2-mask-p-0.65-l-10-cha-mask-p-0.25-l-20-musan-p-0.5-min-snr-10-multi-mvq-wavlm-all-wavlm-large-cb16-1.0-dasheng-cb8-0.1-md400/iter-400000-avg-4.pt

# ---- heads and output rate --------------------------------------------------
use_ctc=0
use_transducer=1
output_ds=2
post_output_ds=1

# ---- encoder freezing -------------------------------------------------------
freeze_encoder=0
freeze_encoder_steps=2000
# Alternative setting kept for reference:
# freeze_encoder=1
# freeze_encoder_steps=-1
encoder_lr_scale=0.05

exp_dir="zipformer_finetune/exp-finetune-rnnt-94m-out-ds-${output_ds}"

echo "${exp_dir}"

# Encoder architecture flags, passed verbatim to both training and decoding.
encoder_arch_args=(
  --downsampling-factor 1,2,4,8,4,2,1
  --num-encoder-layers 1,2,3,3,1,1,1
  --feedforward-dim 1536,1536,1536,1536,1536,1536,1536
  --encoder-dim 512,512,512,512,512,512,512
  --encoder-unmasked-dim 256,256,256,256,256,256,256
  --num-heads 8,8,8,8,8,8,8
  --cnn-module-kernel 31,31,15,15,15,31,31
)

# Output-rate and feature-extraction flags, also shared by both stages.
output_rate_args=(
  --output-downsampling-factor "${output_ds}"
  --post-encoder-downsampling-factor "${post_output_ds}"
  --on-the-fly-feats 1
)

# ---- stage 1: fine-tuning ---------------------------------------------------
torchrun --nproc_per_node=2 --master_port=19291 \
  zipformer_finetune/finetune_asr.py \
  --num-epochs 30 \
  --use-fp16 1 \
  --start-epoch 1 \
  --use-librispeech "${use_librispeech}" --full-libri "${full_libri}" \
  --exp-dir "${exp_dir}" \
  --manifest-dir data/fbank \
  --bpe-model data/lang_bpe_500/bpe.model \
  --base-lr "${lr}" \
  --use-ctc "${use_ctc}" --use-transducer "${use_transducer}" \
  --do-finetune "${do_finetune}" --init-modules "encoder_embed,encoder" --finetune-ckpt "${finetune_ckpt}" \
  --freeze-encoder "${freeze_encoder}" --freeze-encoder-steps "${freeze_encoder_steps}" \
  --encoder-lr-scale "${encoder_lr_scale}" \
  --causal "${causal}" \
  "${encoder_arch_args[@]}" \
  "${output_rate_args[@]}" \
  --max-duration "${md}"

# ---- stage 2: decoding sweep ------------------------------------------------
decoding_methods=(greedy_search modified_beam_search)
decode_epochs=(30)

for m in "${decoding_methods[@]}"; do
  for epoch in "${decode_epochs[@]}"; do
    # --avg values 15, 14, ..., 10.
    for (( avg = 15; avg >= 10; avg-- )); do
      python zipformer_finetune/decode.py \
        --epoch "${epoch}" \
        --avg "${avg}" \
        --manifest-dir data/fbank_librispeech \
        --bpe-model data/lang_bpe_500/bpe.model \
        --use-averaged-model 1 \
        "${encoder_arch_args[@]}" \
        --use-ctc "${use_ctc}" --use-transducer "${use_transducer}" \
        "${output_rate_args[@]}" \
        --exp-dir "${exp_dir}" \
        --decoding-method "${m}" \
        --max-duration 1000
    done
  done
done

# Checkpoint clean-up is intentionally left disabled:
# rm $exp_dir/*.pt

echo "Done"
|
94m-uni-v2-dual-domain-mvq/iter-400000-avg-4.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c96d858101d874a14ea0c2d2452243b0e9626f0a6f89de4309bd7edc95cc8965
|
| 3 |
+
size 374551106
|