File size: 3,140 Bytes
1b10e1d
 
 
 
81f7423
1b10e1d
81f7423
 
 
 
 
 
 
 
 
 
 
 
 
 
1b10e1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fb2aeef
1b10e1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7fe2a3e
1b10e1d
7fe2a3e
1b10e1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# Training configuration (FunASR-style) for a "FunASRNano" model: a frozen
# SenseVoice audio encoder feeds a frozen Qwen3 LLM through a Transformer
# audio adaptor, with an auxiliary trainable CTC decoder branch.
# Field semantics are defined by the consuming FunASR framework; comments
# below describe what this file configures and flag review assumptions.
# NOTE(review): ${...} values (e.g. ${llm_conf.use_lora}) are plain strings to
# YAML itself — they are resolved by the config loader's interpolation
# (OmegaConf-style); confirm the loader supports this syntax.
model: FunASRNano
model_conf:
  lsm_weight: 0.1  # presumably label-smoothing weight — confirm in FunASR docs
  length_normalized_loss: true

# Audio encoder: SenseVoice (small), fully frozen during training.
audio_encoder: SenseVoiceEncoderSmall
audio_encoder_conf:
  output_size: 512
  attention_heads: 4
  linear_units: 2048
  num_blocks: 50
  tp_blocks: 20
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  attention_dropout_rate: 0.1
  input_layer: pe
  pos_enc_class: SinusoidalPositionEncoder
  normalize_before: true
  kernel_size: 11
  # NOTE(review): "sanm_shfit" looks like a typo for "sanm_shift", but the key
  # must match what the consumer reads — do not rename without checking FunASR.
  sanm_shfit: 0
  selfattention_layer_type: sanm
  freeze: true  # encoder weights are not updated
  freeze_layer_num: -1  # -1 presumably means "freeze all layers" — verify
  feat_permute: true

# LLM backbone: Qwen3-0.6B from the HuggingFace hub, frozen, run in bf16.
# LoRA is configured below but disabled (use_lora: false).
llm: Qwen3-0.6b
llm_conf:
  hub: hf
  freeze: true
  llm_dtype: bf16
  init_param_path: Qwen3-0.6B
  use_lora: false
  lora_conf:  # only consulted when use_lora is true
    freeze_lora: true
    task_type: CAUSAL_LM
    r: 16
    lora_alpha: 32
    lora_dropout: 0.05
    bias: none  # parses as the string "none" (PEFT-style), not YAML null
    target_modules:
      - q_proj
      - v_proj
    init_param_path: ""

# Adaptor projecting encoder features (encoder_dim=512) into the LLM
# embedding space (llm_dim=1024); frozen.
audio_adaptor: Transformer
audio_adaptor_conf:
  downsample_rate: 1
  use_low_frame_rate: true
  ffn_dim: 2048
  llm_dim: 1024
  encoder_dim: 512
  n_layer: 2
  freeze: true

# Auxiliary CTC decoder branch — the only trainable module (freeze: false);
# detach_ctc_decoder presumably stops its gradients from reaching the
# encoder — confirm against the model implementation.
ctc_decoder: Transformer
detach_ctc_decoder: true
ctc_decoder_conf:
  downsample_rate: 1
  ffn_dim: 2048
  llm_dim: 512
  encoder_dim: 512
  n_layer: 5
  freeze: false
ctc_weight: 1.0
ctc_conf:
  dropout_rate: 0.0
  ctc_type: builtin
  reduce: true
  ignore_nan_grad: true

# Feature frontend: 80-mel filterbank at 16 kHz with low-frame-rate stacking
# (lfr_m=7 stacked frames, lfr_n=6 frame skip).
frontend: WavFrontend
frontend_conf:
  fs: 16000
  window: hamming
  n_mels: 80
  frame_length: 25  # presumably milliseconds — verify WavFrontend's units
  frame_shift: 10
  lfr_m: 7
  lfr_n: 6
  cmvn_file: null  # no CMVN statistics file supplied

# Trainer settings.
train_conf:
  use_lora: ${llm_conf.use_lora}  # kept in sync with llm_conf via interpolation
  accum_grad: 1
  grad_clip: 5
  max_epoch: 2
  keep_nbest_models: 200
  log_interval: 100
  # presumably parameter-name prefixes excluded from saved checkpoints
  # (frozen LLM weights are not re-saved) — verify against the trainer.
  effective_save_name_excludes:
    - llm.
  resume: true
  validate_interval: 2000
  save_checkpoint_interval: 2000
  avg_nbest_model: 100
  use_bf16: false
  use_deepspeed: true
  deepspeed_config: null  # expected to be provided at launch time
  save_init_model: false

# Optimizer and learning-rate schedule.
optim: adamw
optim_conf:
  lr: 5.0e-06
  weight_decay: 0.0
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 2500

# Dataset and batching (token-count based batching, per batch_type: token).
dataset: FunASR
dataset_conf:
  index_ds: FunASR
  batch_sampler: BatchSampler
  batch_type: token
  batch_size: 6000
  max_token_length: 3500
  shuffle: true
  sort_size: 1024
  batch_size_scale_ratio_max: 2
  num_workers: 4
  audio_adaptor_downsample_rate: ${audio_adaptor_conf.downsample_rate}
  # NOTE(review): 6 matches frontend_conf.lfr_n above — confirm whether these
  # two values must stay coupled.
  audio_encoder_downsample_rate: 6
  data_split_num: 256
  batch_size_sample_max: 10
  retry: 2000
  batch_size_token_max: 6000
  max_source_length: 12000
  max_target_length: 2048
  # Prompt construction for training samples (hotword biasing enabled,
  # no negative hotwords, no history).
  prompt_classes: MultiContextPrompt
  prompt_conf:
    max_neg_hotwords_num: 0
    min_neg_hotwords_num: 0
    use_hist: false
    use_one_pass_result: true
    use_hotwords: true
    use_asr_hotwords: true
    chinese_hotwords_list: null
    english_hotwords_list: null
  # Separate tokenizer for the CTC branch targets.
  ctc_tokenizer: SenseVoiceTokenizer
  ctc_target_normalize: true
  ctc_tokenizer_conf:
    vocab_path: null
    is_multilingual: true
    num_languages: 8749
  min_source_length: 10
  batch_size_scale_threshold: 3000
  use_dynamic_output_ratio: 0.0

# LLM tokenizer loads from the same path as the LLM weights (interpolated).
tokenizer: HuggingfaceTokenizer
tokenizer_conf:
  init_param_path: ${llm_conf.init_param_path}

# Runtime flags; the null paths below are expected to be overridden at launch.
enable_tf32: true
debug: false
train_data_set_list: null
valid_data_set_list: null
init_param: null
output_dir: null