marcoyang committed on
Commit ab12c11 · 1 Parent(s): 55c315c

update finetune script

327M-uni-v2-dual-domain-mvq/finetune_rnnt_300m.sh ADDED
@@ -0,0 +1,89 @@
#!/usr/bin/env bash

export PYTHONPATH=./../../../:$PYTHONPATH

# data related
use_librispeech=1
full_libri=0

causal=0
lr=0.045

# finetune checkpoint
do_finetune=1
finetune_ckpt=zipformer_audio_encoder/exp-316M-uniform-v2-zipformer-out-ds-2-lh-large-giga-xl-voxpopuli-1-as-full-x2-all-audio-w2v2-mask-p-0.65-l-10-cha-mask-p-0.25-l-20-musan-p-0.5-min-snr-10-multi-mvq-wavlm-all-wavlm-large-cb16-1.0-dasheng-cb8-0.1-md300/iter-496000-avg-4.pt

use_ctc=0
use_transducer=1
output_ds=2
post_output_ds=1

freeze_encoder=0
freeze_encoder_steps=2000
# freeze_encoder=1
# freeze_encoder_steps=-1
encoder_lr_scale=0.05
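# Encoder handling during fine-tuning (interpretation based on the flag names; the
# exact semantics live in finetune_asr.py): freeze_encoder / freeze_encoder_steps
# control whether and for how many steps the pretrained encoder stays frozen, and
# encoder_lr_scale scales the encoder learning rate relative to --base-lr. The
# commented-out pair above would presumably keep the encoder frozen for the whole run.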

md=1000

exp_dir=zipformer_finetune/exp-finetune-rnnt-327M-multi-mvq-out-ds-2

echo $exp_dir

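# Launch distributed fine-tuning with torchrun, one process per GPU (2 GPUs here);
# adjust --nproc_per_node and --master_port to the local setup. In icefall-style
# recipes, --max-duration is the maximum total audio duration (seconds) per batch.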
torchrun --nproc_per_node=2 --master_port=19291 \
  zipformer_finetune/finetune_asr.py \
    --num-epochs 30 \
    --use-fp16 1 \
    --start-epoch 1 \
    --use-librispeech $use_librispeech --full-libri $full_libri \
    --exp-dir $exp_dir \
    --manifest-dir data/fbank \
    --bpe-model data/lang_bpe_500/bpe.model \
    --base-lr $lr \
    --use-ctc $use_ctc --use-transducer $use_transducer \
    --do-finetune $do_finetune --init-modules "encoder_embed,encoder" --finetune-ckpt $finetune_ckpt \
    --freeze-encoder $freeze_encoder --freeze-encoder-steps $freeze_encoder_steps \
    --encoder-lr-scale $encoder_lr_scale \
    --causal $causal \
    --downsampling-factor 1,2,4,8,4,2,1 \
    --num-encoder-layers 1,2,2,3,1,1,1 \
    --feedforward-dim 3072,3072,3072,3072,3072,3072,3072 \
    --encoder-dim 1024,1024,1024,1024,1024,1024,1024 \
    --encoder-unmasked-dim 512,512,512,512,512,512,512 \
    --cnn-module-kernel 31,31,15,15,15,31,31 \
    --num-heads 8,8,8,8,8,8,8 \
    --output-downsampling-factor $output_ds \
    --post-encoder-downsampling-factor $post_output_ds \
    --on-the-fly-feats 1 \
    --max-duration $md

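# Decode the fine-tuned model: for each decoding method, decode.py presumably loads
# the epoch-23 checkpoint, averages the last 8 checkpoints (--avg 8 with
# --use-averaged-model 1), and writes the results under $exp_dir.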
for m in greedy_search modified_beam_search; do
  for epoch in 23; do
    for avg in 8; do
      python zipformer_finetune/decode.py \
        --epoch $epoch \
        --avg $avg \
        --manifest-dir data/fbank_librispeech \
        --bpe-model data/lang_bpe_500/bpe.model \
        --use-averaged-model 1 \
        --downsampling-factor 1,2,4,8,4,2,1 \
        --num-encoder-layers 1,2,2,3,1,1,1 \
        --feedforward-dim 3072,3072,3072,3072,3072,3072,3072 \
        --encoder-dim 1024,1024,1024,1024,1024,1024,1024 \
        --encoder-unmasked-dim 512,512,512,512,512,512,512 \
        --cnn-module-kernel 31,31,15,15,15,31,31 \
        --num-heads 8,8,8,8,8,8,8 \
        --use-ctc $use_ctc --use-transducer $use_transducer \
        --output-downsampling-factor $output_ds \
        --post-encoder-downsampling-factor $post_output_ds \
        --on-the-fly-feats 1 \
        --exp-dir $exp_dir \
        --decoding-method $m \
        --max-duration 1000
    done
  done
done
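
# Optional cleanup (disabled by default): uncommenting the line below would delete
# all saved checkpoints in $exp_dir once decoding has finished.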
# rm $exp_dir/*.pt

echo "Done"