{ "architectures": [ "VibeVoiceSemanticTokenizerModel" ], "causal": true, "channels": 1, "conv_bias": true, "conv_norm": "none", "corpus_normalize": 0.0, "disable_last_norm": true, "encoder_depths": "3-3-3-3-3-3-8", "encoder_n_filters": 32, "encoder_ratios": [ 8, 5, 5, 4, 2, 2 ], "fix_std": 0, "layer_scale_init_value": 1e-06, "layernorm": "RMSNorm", "layernorm_elementwise_affine": true, "layernorm_eps": 1e-05, "mixer_layer": "depthwise_conv", "model_type": "vibevoice_semantic_tokenizer", "pad_mode": "constant", "std_dist_type": "none", "torch_dtype": "bfloat16", "transformers_version": "4.51.3", "vae_dim": 128, "weight_init_value": 0.01 }