{ "format": "vibevoice-asr-hf-audio-encoder-v1", "transformers_git_revision": "cbb65a4815d44f1d8b8ff7f51cca24ce491fc09e", "audio_encoder_weight_format": "hf-vibevoice-asr-audio-v1", "audio_encoder_file": "audio_encoder.safetensors", "includes_wte": false, "includes_processor_files": false, "wte_key": null, "text_hidden_size": 3584, "text_vocab_size": 152064, "sample_rate": 24000, "acoustic_vae_std": 0.625, "speech_token_compress_ratio": 3200, "key_prefixes": { "acoustic_encoder": "model.acoustic_tokenizer_encoder.", "semantic_encoder": "model.semantic_tokenizer_encoder.", "projector": "model.multi_modal_projector.", "wte": null }, "tensor_counts": { "acoustic_encoder": 276, "semantic_encoder": 276, "projector": 10, "wte": 0, "omitted_decoder_or_unknown": 277 } }