Duplicate from microsoft/VibeVoice-ASR

Browse files

Co-authored-by: FW <frontierai@users.noreply.huggingface.co>

Files changed (16) hide show

.gitattributes +2 -0
README.md +60 -0
config.json +148 -0
figures/DER.jpg +0 -0
figures/VibeVoice_ASR_archi.png +3 -0
figures/cpWER.jpg +0 -0
figures/tcpWER.jpg +0 -0
model-00001-of-00008.safetensors +3 -0
model-00002-of-00008.safetensors +3 -0
model-00003-of-00008.safetensors +3 -0
model-00004-of-00008.safetensors +3 -0
model-00005-of-00008.safetensors +3 -0
model-00006-of-00008.safetensors +3 -0
model-00007-of-00008.safetensors +3 -0
model-00008-of-00008.safetensors +3 -0
model.safetensors.index.json +0 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ *.safetensors filter=lfs diff=lfs merge=lfs -text
2	+ figures/VibeVoice_ASR_archi.png filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,60 @@

+---
+language:
+- en
+- zh
+license: mit
+pipeline_tag: automatic-speech-recognition
+tags:
+- ASR
+- Transcriptoin
+- Diarization
+- Speech-to-Text
+library_name: transformers
+---
+## VibeVoice-ASR
+[![GitHub](https://img.shields.io/badge/GitHub-Repo-black?logo=github)](https://github.com/microsoft/VibeVoice)
+[![Live Playground](https://img.shields.io/badge/Live-Playground-green?logo=gradio)](https://aka.ms/vibevoice-asr)
+**VibeVoice-ASR** is a unified speech-to-text model designed to handle **60-minute long-form audio** in a single pass, generating structured transcriptions containing **Who (Speaker), When (Timestamps), and What (Content)**, with support for **Customized Hotwords**.
+➡️ **Code:** [microsoft/VibeVoice](https://github.com/microsoft/VibeVoice)<br>
+➡️ **Demo:** [VibeVoice-ASR-Demo](https://aka.ms/vibevoice-asr)
+<p align="left">
+  <img src="figures/VibeVoice_ASR_archi.png" alt="VibeVoice-ASR Architecture" height="250px">
+</p>
+## 🔥 Key Features
+- **🕒 60-minute Single-Pass Processing**:
+  Unlike conventional ASR models that slice audio into short chunks (often losing global context), VibeVoice ASR accepts up to **60 minutes** of continuous audio input within 64K token length. This ensures consistent speaker tracking and semantic coherence across the entire hour.
+- **👤 Customized Hotwords**:
+  Users can provide customized hotwords (e.g., specific names, technical terms, or background info) to guide the recognition process, significantly improving accuracy on domain-specific content.
+- **📝 Rich Transcription (Who, When, What)**:
+  The model jointly performs ASR, diarization, and timestamping, producing a structured output that indicates *who* said *what* and *when*.
+## Evaluation
+<p align="center">
+  <img src="figures/DER.jpg" alt="DER" width="70%">
+  <img src="figures/cpWER.jpg" alt="cpWER" width="70%">
+  <img src="figures/tcpWER.jpg" alt="tcpWER" width="70%">
+</p>
+## Installation and Usage
+Please refer to [GitHub README](https://github.com/microsoft/VibeVoice/blob/main/docs/vibevoice-asr.md#installation).
+## License
+This project is licensed under the MIT License.
+## Contact
+This project was conducted by members of Microsoft Research. We welcome feedback and collaboration from our audience. If you have suggestions, questions, or observe unexpected/offensive behavior in our technology, please contact us at VibeVoice@microsoft.com.
+If the team receives reports of undesired behavior or identifies issues independently, we will update this repository with appropriate mitigations.

config.json ADDED Viewed

	@@ -0,0 +1,148 @@

+{
+  "_attn_implementation_autoset": false,
+  "acoustic_tokenizer_config": {
+    "causal": true,
+    "channels": 1,
+    "conv_bias": true,
+    "conv_norm": "none",
+    "corpus_normalize": 0.0,
+    "decoder_depths": null,
+    "decoder_n_filters": 32,
+    "decoder_ratios": [
+      8,
+      5,
+      5,
+      4,
+      2,
+      2
+    ],
+    "disable_last_norm": true,
+    "dtype": "bfloat16",
+    "encoder_depths": "3-3-3-3-3-3-8",
+    "encoder_n_filters": 32,
+    "encoder_ratios": [
+      8,
+      5,
+      5,
+      4,
+      2,
+      2
+    ],
+    "fix_std": 0.5,
+    "layer_scale_init_value": 1e-06,
+    "layernorm": "RMSNorm",
+    "layernorm_elementwise_affine": true,
+    "layernorm_eps": 1e-05,
+    "mixer_layer": "depthwise_conv",
+    "model_type": "vibevoice_acoustic_tokenizer",
+    "pad_mode": "constant",
+    "std_dist_type": "gaussian",
+    "vae_dim": 64,
+    "weight_init_value": 0.01
+  },
+  "acoustic_vae_dim": 64,
+  "architectures": [
+    "VibeVoiceForASRTraining"
+  ],
+  "decoder_config": {
+    "attention_dropout": 0.0,
+    "dtype": "bfloat16",
+    "hidden_act": "silu",
+    "hidden_size": 3584,
+    "initializer_range": 0.02,
+    "intermediate_size": 18944,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 131072,
+    "max_window_layers": 28,
+    "model_type": "qwen2",
+    "num_attention_heads": 28,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 4,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000.0,
+    "sliding_window": null,
+    "use_cache": true,
+    "use_mrope": false,
+    "use_sliding_window": false,
+    "vocab_size": 152064
+  },
+  "diffusion_head_config": {
+    "ddpm_batch_mul": 4,
+    "ddpm_beta_schedule": "cosine",
+    "ddpm_num_inference_steps": 20,
+    "ddpm_num_steps": 1000,
+    "diffusion_type": "ddpm",
+    "head_ffn_ratio": 3.0,
+    "head_layers": 4,
+    "hidden_size": 3584,
+    "latent_size": 64,
+    "model_type": "vibepod_diffusion_head",
+    "prediction_type": "v_prediction",
+    "rms_norm_eps": 1e-05,
+    "speech_vae_dim": 64
+  },
+  "dtype": "float32",
+  "model_type": "vibevoice",
+  "semantic_tokenizer_config": {
+    "causal": true,
+    "channels": 1,
+    "conv_bias": true,
+    "conv_norm": "none",
+    "corpus_normalize": 0.0,
+    "disable_last_norm": true,
+    "dtype": "bfloat16",
+    "encoder_depths": "3-3-3-3-3-3-8",
+    "encoder_n_filters": 32,
+    "encoder_ratios": [
+      8,
+      5,
+      5,
+      4,
+      2,
+      2
+    ],
+    "fix_std": 0,
+    "layer_scale_init_value": 1e-06,
+    "layernorm": "RMSNorm",
+    "layernorm_elementwise_affine": true,
+    "layernorm_eps": 1e-05,
+    "mixer_layer": "depthwise_conv",
+    "model_type": "vibevoice_semantic_tokenizer",
+    "pad_mode": "constant",
+    "std_dist_type": "none",
+    "vae_dim": 128,
+    "weight_init_value": 0.01
+  },
+  "semantic_vae_dim": 128,
+  "transformers_version": "4.57.6"
+}

figures/DER.jpg ADDED Viewed

figures/VibeVoice_ASR_archi.png ADDED Viewed

Git LFS Details

SHA256: ae2623a1eaa7ac18cdbba1a246a6ffe9ca78e976e9e7728750f140e6b2ebb90a
Pointer size: 131 Bytes
Size of remote file: 168 kB

figures/cpWER.jpg ADDED Viewed

figures/tcpWER.jpg ADDED Viewed

model-00001-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5548c67885d423ba184bc8c33f2e9f81b582a6d119cef79907e19a274b916637
+size 2488346272

model-00002-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:163023c61a3fb047745cbaf53ed41c1e27e515e9786a376e122bfac2ea6e687e
+size 2389315976

model-00003-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e021702dfac2c52e8fdd6688de82c118be7bb7ad9b5c7988725ec63c44a64fb
+size 2466376368

model-00004-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b17657bb151daa117a5a4671374ac1b248acb696691a2a67ac227a1115925e30
+size 2466376400

model-00005-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0ed4e457268f7b02dda5cffe16b3a32614ccc2ccfe5de2db39bdd79700836406
+size 2499431136

model-00006-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6de8246bb042fd853b57d40995efd289ea44e4d1b611cec2e122570b8d2122bd
+size 2483469928

model-00007-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a2ba6960d994dc7598efc6796f85ab097da7708f4dd56095f7fccf4df8dc00e5
+size 1464887482

model-00008-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b9d9b328f85a25b4efca712d31513c6eed9e178152cc8cf4a6f0c2cd2bb623f
+size 1089994848

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff