yfyeung committed on
Commit
5596709
·
verified ·
1 Parent(s): 24234b0

Upload folder using huggingface_hub

Browse files
Files changed (18) hide show
  1. 94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/ctc-4gram.tar.gz +3 -0
  2. 94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/ctc-decoding.tar.gz +3 -0
  3. 94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/epoch-48-avg-47.pt +3 -0
  4. 94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/log/log-train-2025-09-21-00-29-19-0 +0 -0
  5. 94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/log/log-train-2025-09-21-00-29-19-1 +0 -0
  6. 94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/log/log-train-2025-09-21-00-29-19-2 +0 -0
  7. 94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/log/log-train-2025-09-21-00-29-19-3 +0 -0
  8. 94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/tensorboard/events.out.tfevents.1758385759.TENCENT64.site.70629.0 +3 -0
  9. 94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/ctc-4gram.tar.gz +3 -0
  10. 94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/ctc-decoding.tar.gz +3 -0
  11. 94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/epoch-248-avg-245.pt +3 -0
  12. 94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/log/log-train-2025-09-23-00-07-42 +0 -0
  13. 94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/tensorboard/events.out.tfevents.1758557262.TENCENT64.site.86523.0 +3 -0
  14. 94m-uni-v2-dual-domain-mvq/finetune_ctc_94m_ls1.sh +93 -0
  15. 94m-uni-v2-dual-domain-mvq/finetune_ctc_94m_ls10.sh +93 -0
  16. 94m-uni-v2-dual-domain-mvq/finetune_ctc_94m_ls100.sh +95 -0
  17. 94m-uni-v2-dual-domain-mvq/finetune_rnnt_94m_ls100.sh +89 -0
  18. 94m-uni-v2-dual-domain-mvq/iter-400000-avg-4.pt +3 -0
94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/ctc-4gram.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3235199ea2bea61e9597c0b5a7a9273f724081adf7798e4f54c8872eacfcb82d
3
+ size 2867343
94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/ctc-decoding.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ef31089a56b123ff51ae9f3a1f65ef3c0fd0d245d4d12a96e4ccc3c72201b65
3
+ size 574481494
94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/epoch-48-avg-47.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28cb04d3e72d7fb5991f18641032102d620ea5d8432e4029f301f885dd9a890c
3
+ size 373547808
94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/log/log-train-2025-09-21-00-29-19-0 ADDED
The diff for this file is too large to render. See raw diff
 
94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/log/log-train-2025-09-21-00-29-19-1 ADDED
The diff for this file is too large to render. See raw diff
 
94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/log/log-train-2025-09-21-00-29-19-2 ADDED
The diff for this file is too large to render. See raw diff
 
94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/log/log-train-2025-09-21-00-29-19-3 ADDED
The diff for this file is too large to render. See raw diff
 
94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16/tensorboard/events.out.tfevents.1758385759.TENCENT64.site.70629.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a25907eec1641774c45f94c78dfc9c0870ce951834a274ed48dc16b89330553
3
+ size 111765
94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/ctc-4gram.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa432cc690b9a7827f9c5dc020d54ba15d828d545c6b040a77aaabd1138c11d1
3
+ size 14398592
94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/ctc-decoding.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a03bd894e0902ada9c5c0d33fcb99f6db4d25afaba3f8cc46863916feec8290
3
+ size 3348536190
94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/epoch-248-avg-245.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9f4da7c66cc866dfe226b1ed35b74e08c5358600451c5649a5c3268e7e47b68
3
+ size 373548866
94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/log/log-train-2025-09-23-00-07-42 ADDED
The diff for this file is too large to render. See raw diff
 
94m-uni-v2-dual-domain-mvq/exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16/tensorboard/events.out.tfevents.1758557262.TENCENT64.site.86523.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cdca1724641336a9d5a13f4c17654e384a6a18b547d1fd010a26f5574151246
3
+ size 439928
94m-uni-v2-dual-domain-mvq/finetune_ctc_94m_ls1.sh ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
#
# Fine-tune the 94M Zipformer encoder (94m-uni-v2-dual-domain-mvq checkpoint)
# with a char-level CTC head on the LibriSpeech "ls1" (1h) subset, then sweep
# (epoch, avg) checkpoint-averaging combinations with CTC decoding.
#
# Usage: finetune_ctc_94m_ls1.sh <gpu_id>
#   <gpu_id> — value exported as CUDA_VISIBLE_DEVICES (required).
#
# Requires: icefall at /root/icefall, manifests under data/fbank,
# char lang dir under data/lang_char, and the pretrained checkpoint below.

set -euo pipefail

export PYTHONPATH=/root/icefall:${PYTHONPATH:-}
# Fail loudly if the GPU id is missing instead of exporting an empty value
# (an empty CUDA_VISIBLE_DEVICES would hide every GPU).
export CUDA_VISIBLE_DEVICES="${1:?usage: $0 <gpu_id>}"

# data related
use_librispeech=1
full_libri="ls1"

causal=0
lr=0.002

# finetune checkpoint (pretrained encoder to initialize from)
do_finetune=1
finetune_ckpt=/private_data2/94m-uni-v2-dual-domain-mvq/iter-400000-avg-4.pt

use_ctc=1
use_transducer=0
output_ds=2
post_output_ds=1

freeze_encoder=0
freeze_encoder_steps=-1
encoder_lr_scale=0.005

# max batch duration in seconds (small: only 1h of training data)
md=500

exp_dir=zipformer_finetune/94m-uni-v2-dual-domain-mvq-exp_ft_ls1_char_ctc_ws1_md500_lr1e-5_baselr0.002_bf16

echo "$exp_dir"

# ---- training ----
if true; then
  torchrun --nproc_per_node=1 --master_port=19193 \
    zipformer_finetune/finetune_asr.py \
    --num-epochs 8000 \
    --use-fp16 0 \
    --use-bf16 1 \
    --start-epoch 1 \
    --use-librispeech "$use_librispeech" --full-libri "$full_libri" \
    --exp-dir "$exp_dir" \
    --manifest-dir data/fbank \
    --lang-dir data/lang_char \
    --base-lr "$lr" \
    --use-ctc "$use_ctc" --use-transducer "$use_transducer" \
    --do-finetune "$do_finetune" --init-modules "encoder_embed,encoder" --finetune-ckpt "$finetune_ckpt" \
    --freeze-encoder "$freeze_encoder" --freeze-encoder-steps "$freeze_encoder_steps" \
    --encoder-lr-scale "$encoder_lr_scale" \
    --causal "$causal" \
    --downsampling-factor 1,2,4,8,4,2,1 \
    --num-encoder-layers 1,2,3,3,1,1,1 \
    --feedforward-dim 1536,1536,1536,1536,1536,1536,1536 \
    --encoder-dim 512,512,512,512,512,512,512 \
    --encoder-unmasked-dim 256,256,256,256,256,256,256 \
    --num-heads 8,8,8,8,8,8,8 \
    --cnn-module-kernel 31,31,15,15,15,31,31 \
    --output-downsampling-factor "$output_ds" \
    --post-encoder-downsampling-factor "$post_output_ds" \
    --on-the-fly-feats 1 \
    --max-duration "$md"
fi

# ---- decoding sweep over (epoch, avg) ----
# start=$2
if true; then
  for m in ctc-decoding; do
    for epoch in $(seq 400 -20 100); do
      for avg in $(seq $((epoch - 1)) -20 50); do
        # Keep sweeping even if one (epoch, avg) combination fails
        # (e.g. a missing checkpoint); report it instead of aborting.
        python zipformer_finetune/decode_ctc.py \
          --epoch "$epoch" \
          --avg "$avg" \
          --manifest-dir data/fbank \
          --lang-dir data/lang_char \
          --use-averaged-model 1 \
          --downsampling-factor 1,2,4,8,4,2,1 \
          --num-encoder-layers 1,2,3,3,1,1,1 \
          --feedforward-dim 1536,1536,1536,1536,1536,1536,1536 \
          --encoder-dim 512,512,512,512,512,512,512 \
          --encoder-unmasked-dim 256,256,256,256,256,256,256 \
          --num-heads 8,8,8,8,8,8,8 \
          --cnn-module-kernel 31,31,15,15,15,31,31 \
          --use-ctc "$use_ctc" --use-transducer "$use_transducer" \
          --output-downsampling-factor "$output_ds" \
          --post-encoder-downsampling-factor "$post_output_ds" \
          --on-the-fly-feats 1 \
          --exp-dir "$exp_dir" \
          --decoding-method "$m" \
          --max-duration 2000 \
          || echo "decode failed: epoch=$epoch avg=$avg method=$m" >&2
      done
    done
  done
fi

echo "Done"
python ~/busygpu/run.py &
94m-uni-v2-dual-domain-mvq/finetune_ctc_94m_ls10.sh ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
#
# Fine-tune the 94M Zipformer encoder (94m-uni-v2-dual-domain-mvq checkpoint)
# with a char-level CTC head on the LibriSpeech "ls10" (10h) subset, then
# sweep (epoch, avg) checkpoint-averaging combinations with CTC decoding.
#
# Usage: finetune_ctc_94m_ls10.sh <gpu_id> <start_epoch>
#   <gpu_id>      — value exported as CUDA_VISIBLE_DEVICES (required).
#   <start_epoch> — highest epoch for the decoding sweep (required; the sweep
#                   walks epochs start..start-29 and avg epoch-1..epoch-20).
#
# NOTE: the training stage is currently disabled (if false) — only the
# decoding sweep runs.

set -euo pipefail

export PYTHONPATH=/root/icefall:${PYTHONPATH:-}
# Fail loudly if the GPU id is missing instead of exporting an empty value.
export CUDA_VISIBLE_DEVICES="${1:?usage: $0 <gpu_id> <start_epoch>}"

# data related
use_librispeech=1
full_libri="ls10"

causal=0
lr=0.045

# finetune checkpoint (pretrained encoder to initialize from)
do_finetune=1
finetune_ckpt=/private_data2/94m-uni-v2-dual-domain-mvq/iter-400000-avg-4.pt

use_ctc=1
use_transducer=0
output_ds=2
post_output_ds=1

freeze_encoder=0
freeze_encoder_steps=-1
encoder_lr_scale=0.02222

# max batch duration in seconds
md=1000

exp_dir=zipformer_finetune/94m-uni-v2-dual-domain-mvq-exp_ft_ls10_char_ctc_ws1_md1000_lr1e-3_bf16

echo "$exp_dir"

# ---- training (disabled) ----
if false; then
  torchrun --nproc_per_node=1 --master_port=19293 \
    zipformer_finetune/finetune_asr.py \
    --num-epochs 500 \
    --use-fp16 0 \
    --use-bf16 1 \
    --start-epoch 1 \
    --use-librispeech "$use_librispeech" --full-libri "$full_libri" \
    --exp-dir "$exp_dir" \
    --manifest-dir data/fbank \
    --lang-dir data/lang_char \
    --base-lr "$lr" \
    --use-ctc "$use_ctc" --use-transducer "$use_transducer" \
    --do-finetune "$do_finetune" --init-modules "encoder_embed,encoder" --finetune-ckpt "$finetune_ckpt" \
    --freeze-encoder "$freeze_encoder" --freeze-encoder-steps "$freeze_encoder_steps" \
    --encoder-lr-scale "$encoder_lr_scale" \
    --causal "$causal" \
    --downsampling-factor 1,2,4,8,4,2,1 \
    --num-encoder-layers 1,2,3,3,1,1,1 \
    --feedforward-dim 1536,1536,1536,1536,1536,1536,1536 \
    --encoder-dim 512,512,512,512,512,512,512 \
    --encoder-unmasked-dim 256,256,256,256,256,256,256 \
    --num-heads 8,8,8,8,8,8,8 \
    --cnn-module-kernel 31,31,15,15,15,31,31 \
    --output-downsampling-factor "$output_ds" \
    --post-encoder-downsampling-factor "$post_output_ds" \
    --on-the-fly-feats 1 \
    --max-duration "$md"
fi

# ---- decoding sweep over (epoch, avg) ----
# Require the sweep anchor; an empty $2 would make seq emit garbage ranges.
start="${2:?usage: $0 <gpu_id> <start_epoch>}"
if true; then
  for m in ctc-decoding; do
    for epoch in $(seq "$start" -1 $((start - 29))); do
      for avg in $(seq $((epoch - 1)) -1 $((epoch - 20))); do
        # Keep sweeping even if one (epoch, avg) combination fails;
        # report it instead of aborting the whole sweep.
        python zipformer_finetune/decode_ctc.py \
          --epoch "$epoch" \
          --avg "$avg" \
          --manifest-dir data/fbank \
          --lang-dir data/lang_char \
          --use-averaged-model 1 \
          --downsampling-factor 1,2,4,8,4,2,1 \
          --num-encoder-layers 1,2,3,3,1,1,1 \
          --feedforward-dim 1536,1536,1536,1536,1536,1536,1536 \
          --encoder-dim 512,512,512,512,512,512,512 \
          --encoder-unmasked-dim 256,256,256,256,256,256,256 \
          --num-heads 8,8,8,8,8,8,8 \
          --cnn-module-kernel 31,31,15,15,15,31,31 \
          --use-ctc "$use_ctc" --use-transducer "$use_transducer" \
          --output-downsampling-factor "$output_ds" \
          --post-encoder-downsampling-factor "$post_output_ds" \
          --on-the-fly-feats 1 \
          --exp-dir "$exp_dir" \
          --decoding-method "$m" \
          --max-duration 2000 \
          || echo "decode failed: epoch=$epoch avg=$avg method=$m" >&2
      done
    done
  done
fi

echo "Done"
python ~/busygpu/run.py &
94m-uni-v2-dual-domain-mvq/finetune_ctc_94m_ls100.sh ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
#
# Fine-tune the 94M Zipformer encoder (94m-uni-v2-dual-domain-mvq checkpoint)
# with a char-level CTC head on LibriSpeech train-clean-100 (full_libri=0),
# then sweep (epoch, avg) checkpoint-averaging combinations with CTC decoding.
#
# Usage: finetune_ctc_94m_ls100.sh <gpu_ids> <start_epoch>
#   <gpu_ids>     — value exported as CUDA_VISIBLE_DEVICES, e.g. "0,1,2,3"
#                   (required; training stage launches 4 ranks).
#   <start_epoch> — highest epoch for the decoding sweep (required; the sweep
#                   walks epochs start..start-10 and avg epoch-1..10).
#
# NOTE: the training stage is currently disabled (if false) — only the
# decoding sweep runs.

set -euo pipefail

export PYTHONPATH=/root/icefall:${PYTHONPATH:-}
# export CUDA_VISIBLE_DEVICES=0,1,2,3
# Fail loudly if the GPU list is missing instead of exporting an empty value.
export CUDA_VISIBLE_DEVICES="${1:?usage: $0 <gpu_ids> <start_epoch>}"

# data related
use_librispeech=1
full_libri=0

causal=0
lr=0.045

# finetune checkpoint (pretrained encoder to initialize from)
do_finetune=1
finetune_ckpt=/private_data2/94m-uni-v2-dual-domain-mvq/iter-400000-avg-4.pt

use_ctc=1
use_transducer=0
output_ds=2
post_output_ds=1

freeze_encoder=0
freeze_encoder_steps=-1
encoder_lr_scale=0.02222

# max batch duration in seconds
md=1000

exp_dir=/private_data2/94m-uni-v2-dual-domain-mvq/exp_ft_ls100_char_ctc_ws4_md1000_lr1e-3_bf16

echo "$exp_dir"

# ---- training (disabled) ----
if false; then
  torchrun --nproc_per_node=4 --master_port=19291 \
    zipformer_finetune/finetune_asr.py \
    --num-epochs 50 \
    --use-fp16 0 \
    --use-bf16 1 \
    --start-epoch 1 \
    --use-librispeech "$use_librispeech" --full-libri "$full_libri" \
    --exp-dir "$exp_dir" \
    --manifest-dir data/fbank \
    --lang-dir data/lang_char \
    --base-lr "$lr" \
    --use-ctc "$use_ctc" --use-transducer "$use_transducer" \
    --do-finetune "$do_finetune" --init-modules "encoder_embed,encoder" --finetune-ckpt "$finetune_ckpt" \
    --freeze-encoder "$freeze_encoder" --freeze-encoder-steps "$freeze_encoder_steps" \
    --encoder-lr-scale "$encoder_lr_scale" \
    --causal "$causal" \
    --downsampling-factor 1,2,4,8,4,2,1 \
    --num-encoder-layers 1,2,3,3,1,1,1 \
    --feedforward-dim 1536,1536,1536,1536,1536,1536,1536 \
    --encoder-dim 512,512,512,512,512,512,512 \
    --encoder-unmasked-dim 256,256,256,256,256,256,256 \
    --num-heads 8,8,8,8,8,8,8 \
    --cnn-module-kernel 31,31,15,15,15,31,31 \
    --output-downsampling-factor "$output_ds" \
    --post-encoder-downsampling-factor "$post_output_ds" \
    --on-the-fly-feats 1 \
    --max-duration "$md"
fi

# ---- decoding sweep over (epoch, avg) ----
# Require the sweep anchor; an empty $2 would make seq emit garbage ranges.
start="${2:?usage: $0 <gpu_ids> <start_epoch>}"
if true; then
  for m in ctc-decoding; do
    for epoch in $(seq "$start" -1 $((start - 10))); do
      for avg in $(seq $((epoch - 1)) -1 10); do
        # Keep sweeping even if one (epoch, avg) combination fails;
        # report it instead of aborting the whole sweep.
        python zipformer_finetune/decode_ctc.py \
          --epoch "$epoch" \
          --avg "$avg" \
          --manifest-dir data/fbank \
          --lang-dir data/lang_char \
          --use-averaged-model 1 \
          --downsampling-factor 1,2,4,8,4,2,1 \
          --num-encoder-layers 1,2,3,3,1,1,1 \
          --feedforward-dim 1536,1536,1536,1536,1536,1536,1536 \
          --encoder-dim 512,512,512,512,512,512,512 \
          --encoder-unmasked-dim 256,256,256,256,256,256,256 \
          --num-heads 8,8,8,8,8,8,8 \
          --cnn-module-kernel 31,31,15,15,15,31,31 \
          --use-ctc "$use_ctc" --use-transducer "$use_transducer" \
          --output-downsampling-factor "$output_ds" \
          --post-encoder-downsampling-factor "$post_output_ds" \
          --on-the-fly-feats 1 \
          --exp-dir "$exp_dir" \
          --decoding-method "$m" \
          --max-duration 2000 \
          || echo "decode failed: epoch=$epoch avg=$avg method=$m" >&2
      done
    done
  done
fi

echo "Done"
# for i in {0..3}; do CUDA_VISIBLE_DEVICES=$i python ~/busygpu/run.py & done
python ~/busygpu/run.py &
94m-uni-v2-dual-domain-mvq/finetune_rnnt_94m_ls100.sh ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
#
# Fine-tune the 96M Zipformer encoder with an RNN-T (transducer) head on
# LibriSpeech train-clean-100 (full_libri=0), then decode epoch 30 with
# greedy search and modified beam search over several checkpoint averages.
#
# Usage: finetune_rnnt_94m_ls100.sh
#   No arguments. Expects icefall importable from ./../../../, manifests under
#   data/fbank (decoding uses data/fbank_librispeech), and a BPE-500 model at
#   data/lang_bpe_500/bpe.model.

set -euo pipefail

export PYTHONPATH=./../../../:${PYTHONPATH:-}

# data related
use_librispeech=1
full_libri=0

causal=0
lr=0.045

# finetune checkpoint (pretrained encoder to initialize from)
do_finetune=1
finetune_ckpt=zipformer_audio_encoder/exp-96M-uniform-v2-zipformer-out-ds-2-lh-large-giga-xl-voxpopuli-1-as-full-x2-all-audio-w2v2-mask-p-0.65-l-10-cha-mask-p-0.25-l-20-musan-p-0.5-min-snr-10-multi-mvq-wavlm-all-wavlm-large-cb16-1.0-dasheng-cb8-0.1-md400/iter-400000-avg-4.pt

use_ctc=0
use_transducer=1
output_ds=2
post_output_ds=1

# Encoder is trainable but kept frozen for the first 2000 steps.
freeze_encoder=0
freeze_encoder_steps=2000
# freeze_encoder=1
# freeze_encoder_steps=-1
encoder_lr_scale=0.05

# max batch duration in seconds
md=1000

exp_dir=zipformer_finetune/exp-finetune-rnnt-94m-out-ds-${output_ds}

echo "$exp_dir"

# ---- training ----
torchrun --nproc_per_node=2 --master_port=19291 \
  zipformer_finetune/finetune_asr.py \
  --num-epochs 30 \
  --use-fp16 1 \
  --start-epoch 1 \
  --use-librispeech "$use_librispeech" --full-libri "$full_libri" \
  --exp-dir "$exp_dir" \
  --manifest-dir data/fbank \
  --bpe-model data/lang_bpe_500/bpe.model \
  --base-lr "$lr" \
  --use-ctc "$use_ctc" --use-transducer "$use_transducer" \
  --do-finetune "$do_finetune" --init-modules "encoder_embed,encoder" --finetune-ckpt "$finetune_ckpt" \
  --freeze-encoder "$freeze_encoder" --freeze-encoder-steps "$freeze_encoder_steps" \
  --encoder-lr-scale "$encoder_lr_scale" \
  --causal "$causal" \
  --downsampling-factor 1,2,4,8,4,2,1 \
  --num-encoder-layers 1,2,3,3,1,1,1 \
  --feedforward-dim 1536,1536,1536,1536,1536,1536,1536 \
  --encoder-dim 512,512,512,512,512,512,512 \
  --encoder-unmasked-dim 256,256,256,256,256,256,256 \
  --num-heads 8,8,8,8,8,8,8 \
  --cnn-module-kernel 31,31,15,15,15,31,31 \
  --output-downsampling-factor "$output_ds" \
  --post-encoder-downsampling-factor "$post_output_ds" \
  --on-the-fly-feats 1 \
  --max-duration "$md"

# ---- decoding: epoch 30, avg 15..10, two search methods ----
for m in greedy_search modified_beam_search; do
  for epoch in 30; do
    for avg in $(seq 15 -1 10); do
      # Keep sweeping even if one (epoch, avg) combination fails;
      # report it instead of aborting the whole sweep.
      python zipformer_finetune/decode.py \
        --epoch "$epoch" \
        --avg "$avg" \
        --manifest-dir data/fbank_librispeech \
        --bpe-model data/lang_bpe_500/bpe.model \
        --use-averaged-model 1 \
        --downsampling-factor 1,2,4,8,4,2,1 \
        --num-encoder-layers 1,2,3,3,1,1,1 \
        --feedforward-dim 1536,1536,1536,1536,1536,1536,1536 \
        --encoder-dim 512,512,512,512,512,512,512 \
        --encoder-unmasked-dim 256,256,256,256,256,256,256 \
        --num-heads 8,8,8,8,8,8,8 \
        --cnn-module-kernel 31,31,15,15,15,31,31 \
        --use-ctc "$use_ctc" --use-transducer "$use_transducer" \
        --output-downsampling-factor "$output_ds" \
        --post-encoder-downsampling-factor "$post_output_ds" \
        --on-the-fly-feats 1 \
        --exp-dir "$exp_dir" \
        --decoding-method "$m" \
        --max-duration 1000 \
        || echo "decode failed: epoch=$epoch avg=$avg method=$m" >&2
    done
  done
done

# rm $exp_dir/*.pt

echo "Done"
94m-uni-v2-dual-domain-mvq/iter-400000-avg-4.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c96d858101d874a14ea0c2d2452243b0e9626f0a6f89de4309bd7edc95cc8965
3
+ size 374551106