Upload Kimi-Audio-Reaction/audio_detokenizer/config.yaml with huggingface_hub
Browse files
Kimi-Audio-Reaction/audio_detokenizer/config.yaml
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accumulate_grad_batches: 1
|
| 2 |
+
base_config: config/config_base.yaml
|
| 3 |
+
batch_max_tokens: 12000
|
| 4 |
+
batch_size: 2
|
| 5 |
+
cfg_init: 1.0
|
| 6 |
+
cfg_scale: 4.0
|
| 7 |
+
cfg_schedule: linear
|
| 8 |
+
check_val_every_n_epoch: 10
|
| 9 |
+
clip_grad_norm: 0
|
| 10 |
+
data_dir: ''
|
| 11 |
+
debug: false
|
| 12 |
+
deep_speed_strategy_stage: 2
|
| 13 |
+
drop_last: true
|
| 14 |
+
dynamic_cfg: false
|
| 15 |
+
endless_ds: false
|
| 16 |
+
filter_args:
|
| 17 |
+
lang:
|
| 18 |
+
- zh
|
| 19 |
+
- en
|
| 20 |
+
max_spk_num: 6
|
| 21 |
+
speech_ratio: 0.6
|
| 22 |
+
gradient_clip_val: 1.0
|
| 23 |
+
indexed_ds: true
|
| 24 |
+
infer: false
|
| 25 |
+
infer_exp_name: ''
|
| 26 |
+
infer_json_path: ''
|
| 27 |
+
inference_ckpt: ''
|
| 28 |
+
inference_mode: nonstreaming
|
| 29 |
+
learning_rate: 1e-4
|
| 30 |
+
limit_val_batches: 100
|
| 31 |
+
load_opt: false
|
| 32 |
+
log_interval: 10
|
| 33 |
+
logger_type: tensorboard
|
| 34 |
+
loss:
|
| 35 |
+
lambda_fm: 1.0
|
| 36 |
+
lambda_phone: 0.0
|
| 37 |
+
mel_loss: l1
|
| 38 |
+
max_epochs: 1000
|
| 39 |
+
max_eval_sentences: -1
|
| 40 |
+
max_eval_tokens: -1
|
| 41 |
+
max_prompt_ratio: 0.5
|
| 42 |
+
max_segment_cnt: 20000
|
| 43 |
+
max_sentences: -1
|
| 44 |
+
max_speech_duration: 20
|
| 45 |
+
max_tokens: 31250
|
| 46 |
+
max_training_steps: 100000
|
| 47 |
+
max_updates: 160000
|
| 48 |
+
mel_mean: -4.479605
|
| 49 |
+
mel_std: 3.4584913
|
| 50 |
+
meta_dir: null
|
| 51 |
+
min_prompt_duration: 0.5
|
| 52 |
+
min_speech_duration: -1
|
| 53 |
+
model:
|
| 54 |
+
condition_prenet_depth: 6
|
| 55 |
+
dit:
|
| 56 |
+
chunk_params:
|
| 57 |
+
hz: 50
|
| 58 |
+
max_chunk: 3.0
|
| 59 |
+
max_chunk_history: 50000000
|
| 60 |
+
min_chunk: 0.5
|
| 61 |
+
need_block_shift: false
|
| 62 |
+
condition_input_dim: 1280
|
| 63 |
+
condition_type: discrete_codes
|
| 64 |
+
depth: 16
|
| 65 |
+
ffn_act_layer: gleu_tanh
|
| 66 |
+
ffn_conv_kernel_size: 5
|
| 67 |
+
ffn_gated_glu: false
|
| 68 |
+
ffn_type: vanilla_mlp
|
| 69 |
+
hidden_size: 2304
|
| 70 |
+
input_size: 80
|
| 71 |
+
max_seq_len: 4096
|
| 72 |
+
mlp_ratio: 4.0
|
| 73 |
+
num_heads: 18
|
| 74 |
+
position_embedding_type: skip
|
| 75 |
+
prompt_cfg_dropout: 0.2
|
| 76 |
+
rope_params:
|
| 77 |
+
max_position_embeddings: 4096
|
| 78 |
+
rope_base: 10000.0
|
| 79 |
+
rope_interpolation_factor: 1.0
|
| 80 |
+
semantic_cfg_dropout: 0.2
|
| 81 |
+
semantic_vocab_size: 16384
|
| 82 |
+
use_chunk_setting: true
|
| 83 |
+
use_rope: true
|
| 84 |
+
phone_predictor:
|
| 85 |
+
blank_id: 4
|
| 86 |
+
phone_vocab_size: 5000
|
| 87 |
+
position_id_start_from: 0
|
| 88 |
+
random_position_start: true
|
| 89 |
+
restart_position_ids: false
|
| 90 |
+
use_condition_prenet: false
|
| 91 |
+
need_merge_same_speaker: true
|
| 92 |
+
need_precise_phones: false
|
| 93 |
+
no_verlap: true
|
| 94 |
+
normalize_mel: true
|
| 95 |
+
num_nodes: 1
|
| 96 |
+
num_sanity_val_steps: 0
|
| 97 |
+
num_workers: 1
|
| 98 |
+
ode_steps: 150
|
| 99 |
+
optimizer_adam_beta1: 0.9
|
| 100 |
+
optimizer_adam_beta2: 0.98
|
| 101 |
+
optimizer_class: adamw
|
| 102 |
+
pin_memory: true
|
| 103 |
+
precision: bf16-mixed
|
| 104 |
+
save_interval: 2000
|
| 105 |
+
save_topk: 10
|
| 106 |
+
seed: 1234
|
| 107 |
+
shuffle: true
|
| 108 |
+
sort_by_len: true
|
| 109 |
+
src_sample_rate: 16000
|
| 110 |
+
strategy: ddp
|
| 111 |
+
tensorboard_dir: tb_logs
|
| 112 |
+
test_num: 100
|
| 113 |
+
tgt_sample_rate: 24000
|
| 114 |
+
timescale: 80000
|
| 115 |
+
use_cfg: false
|
| 116 |
+
use_cfg_rescale: false
|
| 117 |
+
use_distributed_sampler: false
|
| 118 |
+
use_uncondition: false
|
| 119 |
+
val_check_interval: 2000000
|
| 120 |
+
vocoder_ckpt: ''
|
| 121 |
+
wandb_name: glm4_semantic_cfm_v2_debug
|
| 122 |
+
warmup_updates: 100
|
| 123 |
+
weight_decay: 0.0001
|