xiaoyi1734 committed on
Commit
dcd0333
·
verified ·
1 Parent(s): c79ed2a

Upload Kimi-Audio-Reaction/audio_detokenizer/config.yaml with huggingface_hub

Browse files
Kimi-Audio-Reaction/audio_detokenizer/config.yaml ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accumulate_grad_batches: 1
2
+ base_config: config/config_base.yaml
3
+ batch_max_tokens: 12000
4
+ batch_size: 2
5
+ cfg_init: 1.0
6
+ cfg_scale: 4.0
7
+ cfg_schedule: linear
8
+ check_val_every_n_epoch: 10
9
+ clip_grad_norm: 0
10
+ data_dir: ''
11
+ debug: false
12
+ deep_speed_strategy_stage: 2
13
+ drop_last: true
14
+ dynamic_cfg: false
15
+ endless_ds: false
16
+ filter_args:
17
+ lang:
18
+ - zh
19
+ - en
20
+ max_spk_num: 6
21
+ speech_ratio: 0.6
22
+ gradient_clip_val: 1.0
23
+ indexed_ds: true
24
+ infer: false
25
+ infer_exp_name: ''
26
+ infer_json_path: ''
27
+ inference_ckpt: ''
28
+ inference_mode: nonstreaming
29
+ learning_rate: 1e-4
30
+ limit_val_batches: 100
31
+ load_opt: false
32
+ log_interval: 10
33
+ logger_type: tensorboard
34
+ loss:
35
+ lambda_fm: 1.0
36
+ lambda_phone: 0.0
37
+ mel_loss: l1
38
+ max_epochs: 1000
39
+ max_eval_sentences: -1
40
+ max_eval_tokens: -1
41
+ max_prompt_ratio: 0.5
42
+ max_segment_cnt: 20000
43
+ max_sentences: -1
44
+ max_speech_duration: 20
45
+ max_tokens: 31250
46
+ max_training_steps: 100000
47
+ max_updates: 160000
48
+ mel_mean: -4.479605
49
+ mel_std: 3.4584913
50
+ meta_dir: null
51
+ min_prompt_duration: 0.5
52
+ min_speech_duration: -1
53
+ model:
54
+ condition_prenet_depth: 6
55
+ dit:
56
+ chunk_params:
57
+ hz: 50
58
+ max_chunk: 3.0
59
+ max_chunk_history: 50000000
60
+ min_chunk: 0.5
61
+ need_block_shift: false
62
+ condition_input_dim: 1280
63
+ condition_type: discrete_codes
64
+ depth: 16
65
+ ffn_act_layer: gleu_tanh
66
+ ffn_conv_kernel_size: 5
67
+ ffn_gated_glu: false
68
+ ffn_type: vanilla_mlp
69
+ hidden_size: 2304
70
+ input_size: 80
71
+ max_seq_len: 4096
72
+ mlp_ratio: 4.0
73
+ num_heads: 18
74
+ position_embedding_type: skip
75
+ prompt_cfg_dropout: 0.2
76
+ rope_params:
77
+ max_position_embeddings: 4096
78
+ rope_base: 10000.0
79
+ rope_interpolation_factor: 1.0
80
+ semantic_cfg_dropout: 0.2
81
+ semantic_vocab_size: 16384
82
+ use_chunk_setting: true
83
+ use_rope: true
84
+ phone_predictor:
85
+ blank_id: 4
86
+ phone_vocab_size: 5000
87
+ position_id_start_from: 0
88
+ random_position_start: true
89
+ restart_position_ids: false
90
+ use_condition_prenet: false
91
+ need_merge_same_speaker: true
92
+ need_precise_phones: false
93
+ no_verlap: true
94
+ normalize_mel: true
95
+ num_nodes: 1
96
+ num_sanity_val_steps: 0
97
+ num_workers: 1
98
+ ode_steps: 150
99
+ optimizer_adam_beta1: 0.9
100
+ optimizer_adam_beta2: 0.98
101
+ optimizer_class: adamw
102
+ pin_memory: true
103
+ precision: bf16-mixed
104
+ save_interval: 2000
105
+ save_topk: 10
106
+ seed: 1234
107
+ shuffle: true
108
+ sort_by_len: true
109
+ src_sample_rate: 16000
110
+ strategy: ddp
111
+ tensorboard_dir: tb_logs
112
+ test_num: 100
113
+ tgt_sample_rate: 24000
114
+ timescale: 80000
115
+ use_cfg: false
116
+ use_cfg_rescale: false
117
+ use_distributed_sampler: false
118
+ use_uncondition: false
119
+ val_check_interval: 2000000
120
+ vocoder_ckpt: ''
121
+ wandb_name: glm4_semantic_cfm_v2_debug
122
+ warmup_updates: 100
123
+ weight_decay: 0.0001