| { |
| "barbet_config": { |
| "attention_dropout": 0.0, |
| "attention_sink": false, |
| "bos_token_id": 114689, |
| "eos_token_id": 114690, |
| "global_attention_layers": [ |
| 0, |
| 4, |
| 8, |
| 12, |
| 16, |
| 20, |
| 24 |
| ], |
| "head_dim": 128, |
| "hidden_dropout": 0.0, |
| "hidden_size": 1536, |
| "initializer_range": 0.02, |
| "intermediate_size": 5120, |
| "mamba_d_conv": 4, |
| "mamba_d_state": 64, |
| "mamba_expand": 2, |
| "mamba_layers": [ |
| 3, |
| 7, |
| 11, |
| 15, |
| 19, |
| 23, |
| 27 |
| ], |
| "max_position_embeddings": 262144, |
| "mtp_enabled": false, |
| "mtp_loss_weights": { |
| "2": 0.2, |
| "3": 0.1 |
| }, |
| "mtp_offsets": [ |
| 2, |
| 3 |
| ], |
| "num_attention_heads": 16, |
| "num_hidden_layers": 28, |
| "num_key_value_heads": 2, |
| "pad_token_id": 114691, |
| "qk_clip_alpha": 0.5, |
| "qk_clip_threshold": 100.0, |
| "qk_logit_clip": false, |
| "qk_norm": true, |
| "rms_norm_eps": 1e-06, |
| "rope_theta": 10000000.0, |
| "sliding_window_size": 8192, |
| "tie_word_embeddings": true, |
| "unk_token_id": 114688, |
| "use_cache": true, |
| "vocab_size": 114944 |
| }, |
| "vox_lm_config": { |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "hidden_size": 2048, |
| "intermediate_size": 6144, |
| "max_position_embeddings": 32768, |
| "num_attention_heads": 16, |
| "num_hidden_layers": 28, |
| "num_key_value_heads": 2, |
| "rms_norm_eps": 1e-05, |
| "rope_scaling": { |
| "type": "longrope", |
| "long_factor": [ |
| 0.9977997200264581, |
| 1.014658295992452, |
| 1.0349680404997148, |
| 1.059429246056193, |
| 1.0888815016813513, |
| 1.1243301355211495, |
| 1.166977103606075, |
| 1.2182568066927284, |
| 1.2798772354275727, |
| 1.3538666751582975, |
| 1.4426259039919596, |
| 1.5489853358570191, |
| 1.6762658237220625, |
| 1.8283407612492941, |
| 2.0096956085876183, |
| 2.225478927469756, |
| 2.481536379650452, |
| 2.784415934557119, |
| 3.1413289096347365, |
| 3.560047844772632, |
| 4.048719380066383, |
| 4.615569542115128, |
| 5.2684819496549835, |
| 6.014438591970396, |
| 6.858830049237097, |
| 7.804668263503327, |
| 8.851768731513417, |
| 9.99600492938444, |
| 11.228766118181639, |
| 12.536757560834843, |
| 13.902257701387796, |
| 15.303885189125953, |
| 16.717837610115794, |
| 18.119465097853947, |
| 19.484965238406907, |
| 20.792956681060105, |
| 22.02571786985731, |
| 23.16995406772833, |
| 24.217054535738416, |
| 25.16289275000465, |
| 26.007284207271347, |
| 26.753240849586767, |
| 27.40615325712662, |
| 27.973003419175363, |
| 28.461674954469114, |
| 28.880393889607006, |
| 29.237306864684626, |
| 29.540186419591297, |
| 29.79624387177199, |
| 30.01202719065413, |
| 30.193382037992453, |
| 30.34545697551969, |
| 30.47273746338473, |
| 30.579096895249787, |
| 30.66785612408345, |
| 30.741845563814174, |
| 30.80346599254902, |
| 30.85474569563567, |
| 30.897392663720595, |
| 30.932841297560394, |
| 30.962293553185553, |
| 30.986754758742034, |
| 31.007064503249293, |
| 31.02392307921529 |
| ], |
| "short_factor": [ |
| 0.9977997200264581, |
| 1.014658295992452, |
| 1.0349680404997148, |
| 1.059429246056193, |
| 1.0888815016813513, |
| 1.1243301355211495, |
| 1.166977103606075, |
| 1.2182568066927284, |
| 1.2798772354275727, |
| 1.3538666751582975, |
| 1.4426259039919596, |
| 1.5489853358570191, |
| 1.6762658237220625, |
| 1.8283407612492941, |
| 2.0096956085876183, |
| 2.225478927469756, |
| 2.481536379650452, |
| 2.784415934557119, |
| 3.1413289096347365, |
| 3.560047844772632, |
| 4.048719380066383, |
| 4.615569542115128, |
| 5.2684819496549835, |
| 6.014438591970396, |
| 6.858830049237097, |
| 7.804668263503327, |
| 8.851768731513417, |
| 9.99600492938444, |
| 11.228766118181639, |
| 12.536757560834843, |
| 13.902257701387796, |
| 15.303885189125953, |
| 16.717837610115794, |
| 18.119465097853947, |
| 19.484965238406907, |
| 20.792956681060105, |
| 22.02571786985731, |
| 23.16995406772833, |
| 24.217054535738416, |
| 25.16289275000465, |
| 26.007284207271347, |
| 26.753240849586767, |
| 27.40615325712662, |
| 27.973003419175363, |
| 28.461674954469114, |
| 28.880393889607006, |
| 29.237306864684626, |
| 29.540186419591297, |
| 29.79624387177199, |
| 30.01202719065413, |
| 30.193382037992453, |
| 30.34545697551969, |
| 30.47273746338473, |
| 30.579096895249787, |
| 30.66785612408345, |
| 30.741845563814174, |
| 30.80346599254902, |
| 30.85474569563567, |
| 30.897392663720595, |
| 30.932841297560394, |
| 30.962293553185553, |
| 30.986754758742034, |
| 31.007064503249293, |
| 31.02392307921529 |
| ], |
| "original_max_position_embeddings": 32768 |
| }, |
| "vocab_size": 73448, |
| "use_mup": false, |
| "scale_emb": 12.0, |
| "dim_model_base": 256, |
| "scale_depth": 1.4, |
| "rope_theta": 10000.0, |
| "kv_channels": 128, |
| "no_rope": false |
| }, |
| "patch_size": 4, |
| "feat_dim": 64, |
| "residual_lm_num_layers": 8, |
| "residual_lm_no_rope": true, |
| "scalar_quantization_latent_dim": 512, |
| "scalar_quantization_scale": 9, |
| "encoder_config": { |
| "hidden_dim": 1024, |
| "ffn_dim": 4096, |
| "num_heads": 16, |
| "num_layers": 12, |
| "kv_channels": 128 |
| }, |
| "dit_config": { |
| "hidden_dim": 1024, |
| "ffn_dim": 4096, |
| "num_heads": 16, |
| "num_layers": 12, |
| "kv_channels": 128, |
| "dit_mean_mode": false, |
| "cfm_config": { |
| "sigma_min": 1e-06, |
| "solver": "euler", |
| "t_scheduler": "log-norm", |
| "training_cfg_rate": 0.1, |
| "inference_cfg_rate": 2.0, |
| "reg_loss_type": "l1", |
| "ratio_r_neq_t_range": [ |
| 0.25, |
| 0.75 |
| ], |
| "noise_cond_prob_range": [ |
| 0.0, |
| 0.0 |
| ], |
| "noise_cond_scale": 0.0 |
| } |
| }, |
| "audio_vae_config": { |
| "encoder_dim": 128, |
| "encoder_rates": [ |
| 2, |
| 5, |
| 8, |
| 8 |
| ], |
| "latent_dim": 64, |
| "decoder_dim": 2048, |
| "decoder_rates": [ |
| 8, |
| 6, |
| 5, |
| 2, |
| 2, |
| 2 |
| ], |
| "depthwise": true, |
| "sample_rate": 16000, |
| "out_sample_rate": 48000, |
| "use_noise_block": false, |
| "sr_bin_boundaries": [ |
| 20000, |
| 30000, |
| 40000 |
| ], |
| "cond_type": "scale_bias", |
| "cond_dim": 128, |
| "cond_out_layer": false |
| }, |
| "adapter_config": { |
| "num_residual_blocks": 1, |
| "ffn_mult": 2.0, |
| "rms_norm_eps": 1e-06 |
| }, |
| "speaker_embed_dim": 192, |
| "audio_start_token": -1, |
| "audio_end_token": -1, |
| "ref_audio_start_token": -1, |
| "ref_audio_end_token": -1, |
| "spk_token": -1, |
| "barbet_effective_vocab_size": null, |
| "max_length": 8192, |
| "device": "cuda", |
| "dtype": "bfloat16", |
| "generation_defaults": { |
| "cfg_value": 2.8, |
| "inference_timesteps": 9, |
| "max_len": 2000, |
| "retry_badcase": true, |
| "retry_badcase_max_times": 3, |
| "retry_badcase_ratio_threshold": 6.0, |
| "speaker_id": "hung_yi_lee", |
| "speaker_source_dataset": "voidful/hung-yi_lee", |
| "speaker_centroid_path": "checkpoints/hung_yi_lee_speaker_centroids.pt", |
| "speaker_centroid_sha256": "e1d4c95a4c33935ff1fee0ab47fa796dcc13908a183c60e9b02bc0a61c541c4c", |
| "speaker_centroid_dim": 192 |
| }, |
| "generation_defaults_source": { |
| "scope": "tts_hard_sentences_zh_500 + Breeze-ASR-25 normalized CER", |
| "sentences": "/home/voidful/tts_hard_sentences_zh_500.txt", |
| "asr_model": "MediaTek-Research/Breeze-ASR-25", |
| "conversion": "s2twp", |
| "normalized_cer": 0.09669792733863977, |
| "mixed_token_error_rate": 0.0911015155363644, |
| "char_errors": 1227, |
| "char_reference_length": 12689, |
| "evaluated_examples": 500, |
| "trial": "hy_cfg2p8_steps9", |
| "run": "hungyi_high_refine_hy_cfg2p8_steps9_20260620" |
| } |
| } |
|
|