{
  "model_type": "vibevoice",
  "version": "1.5B",
  "components": {
    "acoustic_encoder": "vibevoice_acoustic_encoder.mlpackage",
    "acoustic_decoder": "vibevoice_acoustic_decoder.mlpackage",
    "semantic_encoder": "vibevoice_semantic_encoder.mlpackage",
    "acoustic_connector": "vibevoice_acoustic_connector.mlpackage",
    "semantic_connector": "vibevoice_semantic_connector.mlpackage",
    "llm": "vibevoice_llm.mlpackage",
    "diffusion_head": "vibevoice_diffusion_head.mlpackage"
  },
  "audio_config": {
    "sample_rate": 24000,
    "downsample_factor": 3200,
    "frame_rate": 7.5
  },
  "latent_config": {
    "acoustic_dim": 64,
    "semantic_dim": 128,
    "hidden_dim": 1536
  },
  "diffusion_config": {
    "num_inference_steps": 20,
    "beta_schedule": "cosine",
    "prediction_type": "v_prediction"
  },
  "llm_config": {
    "vocab_size": 151936,
    "max_position_embeddings": 65536,
    "num_hidden_layers": 28,
    "hidden_size": 1536,
    "num_attention_heads": 12,
    "num_key_value_heads": 2
  },
  "normalization": {
    "scaling_factor": 0.1962890625,
    "bias_factor": -0.04931640625
  }
}