Update README.md content
Browse files
README.md
CHANGED
|
@@ -62,18 +62,15 @@ library_name: transformers
|
|
| 62 |
---
|
| 63 |
|
| 64 |
|
| 65 |
-
## VibeVoice-ASR
|
| 66 |
[](https://github.com/microsoft/VibeVoice)
|
| 67 |
[](https://aka.ms/vibevoice-asr)
|
| 68 |
[](https://arxiv.org/pdf/2601.18184)
|
| 69 |
|
| 70 |
**VibeVoice-ASR** is a unified speech-to-text model designed to handle **60-minute long-form audio** in a single pass, generating structured transcriptions containing **Who (Speaker), When (Timestamps), and What (Content)**, with support for **Customized Hotwords** and over **50 languages**.
|
| 71 |
|
| 72 |
-
➡️ **Code:** [microsoft/VibeVoice](https://github.com/microsoft/VibeVoice)<br>
|
| 73 |
➡️ **Demo:** [VibeVoice-ASR-Demo](https://aka.ms/vibevoice-asr)<br>
|
| 74 |
➡️ **Report:** [VibeVoice-ASR Technical Report](https://arxiv.org/pdf/2601.18184)<br>
|
| 75 |
-
➡️ **Finetuning:** [Finetuning](https://github.com/microsoft/VibeVoice/blob/main/finetuning-asr/README.md)<br>
|
| 76 |
-
➡️ **vLLM:** [vLLM-VibeVoice-ASR](https://github.com/microsoft/VibeVoice/blob/main/docs/vibevoice-vllm-asr.md)<br>
|
| 77 |
|
| 78 |
<p align="left">
|
| 79 |
<img src="figures/VibeVoice_ASR_archi.png" alt="VibeVoice-ASR Architecture" height="250px">
|
|
@@ -100,9 +97,13 @@ library_name: transformers
|
|
| 100 |
|
| 101 |
### Setup
|
| 102 |
|
| 103 |
-
VibeVoice ASR is not yet merged into Transformers but can be used by pulling the source code from the following fork:
|
| 104 |
```
|
| 105 |
-
pip install git+https://github.com/ebezzam/transformers.git@vibevoice_asr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
```
|
| 107 |
|
| 108 |
### Loading model
|
|
@@ -110,7 +111,7 @@ pip install git+https://github.com/ebezzam/transformers.git@vibevoice_asr
|
|
| 110 |
```python
|
| 111 |
from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration
|
| 112 |
|
| 113 |
-
model_id = "
|
| 114 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 115 |
model = VibeVoiceAsrForConditionalGeneration.from_pretrained(model_id)
|
| 116 |
```
|
|
@@ -128,7 +129,7 @@ The example below transcribes the following audio.
|
|
| 128 |
```python
|
| 129 |
from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration
|
| 130 |
|
| 131 |
-
model_id = "
|
| 132 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 133 |
model = VibeVoiceAsrForConditionalGeneration.from_pretrained(model_id, device_map="auto")
|
| 134 |
print(f"Model loaded on {model.device} with dtype {model.dtype}")
|
|
@@ -199,7 +200,7 @@ Below we transcribe an audio where the speaker (with a German accent) talks abou
|
|
| 199 |
```python
|
| 200 |
from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration
|
| 201 |
|
| 202 |
-
model_id = "
|
| 203 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 204 |
model = VibeVoiceAsrForConditionalGeneration.from_pretrained(model_id, device_map="auto")
|
| 205 |
print(f"Model loaded on {model.device} with dtype {model.dtype}")
|
|
@@ -237,7 +238,7 @@ Batch inference is possible by passing a list of audio and (if provided) a list
|
|
| 237 |
```python
|
| 238 |
from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration
|
| 239 |
|
| 240 |
-
model_id = "
|
| 241 |
audio = [
|
| 242 |
"https://huggingface.co/datasets/bezzam/vibevoice_samples/resolve/main/realtime_model/vibevoice_tts_german.wav",
|
| 243 |
"https://huggingface.co/datasets/bezzam/vibevoice_samples/resolve/main/example_output/VibeVoice-1.5B_output.wav"
|
|
@@ -266,7 +267,7 @@ However, if chunks of 60 seconds are too large for your device, the `tokenizer_c
|
|
| 266 |
from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration
|
| 267 |
|
| 268 |
tokenizer_chunk_size = 64000 # default is 1440000 (60s @ 24kHz)
|
| 269 |
-
model_id = "
|
| 270 |
audio = [
|
| 271 |
"https://huggingface.co/datasets/bezzam/vibevoice_samples/resolve/main/realtime_model/vibevoice_tts_german.wav",
|
| 272 |
"https://huggingface.co/datasets/bezzam/vibevoice_samples/resolve/main/example_output/VibeVoice-1.5B_output.wav"
|
|
@@ -290,7 +291,7 @@ VibeVoice ASR also accepts chat template inputs (`apply_transcription_request` i
|
|
| 290 |
```python
|
| 291 |
from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration
|
| 292 |
|
| 293 |
-
model_id = "
|
| 294 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 295 |
model = VibeVoiceAsrForConditionalGeneration.from_pretrained(model_id, device_map="auto")
|
| 296 |
|
|
@@ -339,7 +340,7 @@ VibeVoice ASR can be trained with the loss outputted by the model.
|
|
| 339 |
```python
|
| 340 |
from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration
|
| 341 |
|
| 342 |
-
model_id = "
|
| 343 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 344 |
model = VibeVoiceAsrForConditionalGeneration.from_pretrained(model_id, device_map="auto")
|
| 345 |
model.train()
|
|
@@ -392,7 +393,7 @@ import time
|
|
| 392 |
import torch
|
| 393 |
from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration
|
| 394 |
|
| 395 |
-
model_id = "
|
| 396 |
|
| 397 |
num_warmup = 5
|
| 398 |
num_runs = 20
|
|
@@ -475,7 +476,7 @@ The model can be used as a pipeline, but you will have to define your own method
|
|
| 475 |
```python
|
| 476 |
from transformers import pipeline
|
| 477 |
|
| 478 |
-
model_id = "
|
| 479 |
pipe = pipeline("any-to-any", model=model_id, device_map="auto")
|
| 480 |
chat_template = [
|
| 481 |
{
|
|
|
|
| 62 |
---
|
| 63 |
|
| 64 |
|
| 65 |
+
## VibeVoice-ASR (Transformers-compatible version)
|
| 66 |
[](https://github.com/microsoft/VibeVoice)
|
| 67 |
[](https://aka.ms/vibevoice-asr)
|
| 68 |
[](https://arxiv.org/pdf/2601.18184)
|
| 69 |
|
| 70 |
**VibeVoice-ASR** is a unified speech-to-text model designed to handle **60-minute long-form audio** in a single pass, generating structured transcriptions containing **Who (Speaker), When (Timestamps), and What (Content)**, with support for **Customized Hotwords** and over **50 languages**.
|
| 71 |
|
|
|
|
| 72 |
➡️ **Demo:** [VibeVoice-ASR-Demo](https://aka.ms/vibevoice-asr)<br>
|
| 73 |
➡️ **Report:** [VibeVoice-ASR Technical Report](https://arxiv.org/pdf/2601.18184)<br>
|
|
|
|
|
|
|
| 74 |
|
| 75 |
<p align="left">
|
| 76 |
<img src="figures/VibeVoice_ASR_archi.png" alt="VibeVoice-ASR Architecture" height="250px">
|
|
|
|
| 97 |
|
| 98 |
### Setup
|
| 99 |
|
|
|
|
| 100 |
```
|
| 101 |
+
pip install transformers
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
However, if you're here early and VibeVoice ASR is not yet part of an official Transformers release, it can be used by installing from the source code:
|
| 105 |
+
```
|
| 106 |
+
pip install git+https://github.com/huggingface/transformers.git
|
| 107 |
```
|
| 108 |
|
| 109 |
### Loading model
|
|
|
|
| 111 |
```python
|
| 112 |
from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration
|
| 113 |
|
| 114 |
+
model_id = "microsoft/VibeVoice-ASR-HF"
|
| 115 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 116 |
model = VibeVoiceAsrForConditionalGeneration.from_pretrained(model_id)
|
| 117 |
```
|
|
|
|
| 129 |
```python
|
| 130 |
from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration
|
| 131 |
|
| 132 |
+
model_id = "microsoft/VibeVoice-ASR-HF"
|
| 133 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 134 |
model = VibeVoiceAsrForConditionalGeneration.from_pretrained(model_id, device_map="auto")
|
| 135 |
print(f"Model loaded on {model.device} with dtype {model.dtype}")
|
|
|
|
| 200 |
```python
|
| 201 |
from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration
|
| 202 |
|
| 203 |
+
model_id = "microsoft/VibeVoice-ASR-HF"
|
| 204 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 205 |
model = VibeVoiceAsrForConditionalGeneration.from_pretrained(model_id, device_map="auto")
|
| 206 |
print(f"Model loaded on {model.device} with dtype {model.dtype}")
|
|
|
|
| 238 |
```python
|
| 239 |
from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration
|
| 240 |
|
| 241 |
+
model_id = "microsoft/VibeVoice-ASR-HF"
|
| 242 |
audio = [
|
| 243 |
"https://huggingface.co/datasets/bezzam/vibevoice_samples/resolve/main/realtime_model/vibevoice_tts_german.wav",
|
| 244 |
"https://huggingface.co/datasets/bezzam/vibevoice_samples/resolve/main/example_output/VibeVoice-1.5B_output.wav"
|
|
|
|
| 267 |
from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration
|
| 268 |
|
| 269 |
tokenizer_chunk_size = 64000 # default is 1440000 (60s @ 24kHz)
|
| 270 |
+
model_id = "microsoft/VibeVoice-ASR-HF"
|
| 271 |
audio = [
|
| 272 |
"https://huggingface.co/datasets/bezzam/vibevoice_samples/resolve/main/realtime_model/vibevoice_tts_german.wav",
|
| 273 |
"https://huggingface.co/datasets/bezzam/vibevoice_samples/resolve/main/example_output/VibeVoice-1.5B_output.wav"
|
|
|
|
| 291 |
```python
|
| 292 |
from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration
|
| 293 |
|
| 294 |
+
model_id = "microsoft/VibeVoice-ASR-HF"
|
| 295 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 296 |
model = VibeVoiceAsrForConditionalGeneration.from_pretrained(model_id, device_map="auto")
|
| 297 |
|
|
|
|
| 340 |
```python
|
| 341 |
from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration
|
| 342 |
|
| 343 |
+
model_id = "microsoft/VibeVoice-ASR-HF"
|
| 344 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 345 |
model = VibeVoiceAsrForConditionalGeneration.from_pretrained(model_id, device_map="auto")
|
| 346 |
model.train()
|
|
|
|
| 393 |
import torch
|
| 394 |
from transformers import AutoProcessor, VibeVoiceAsrForConditionalGeneration
|
| 395 |
|
| 396 |
+
model_id = "microsoft/VibeVoice-ASR-HF"
|
| 397 |
|
| 398 |
num_warmup = 5
|
| 399 |
num_runs = 20
|
|
|
|
| 476 |
```python
|
| 477 |
from transformers import pipeline
|
| 478 |
|
| 479 |
+
model_id = "microsoft/VibeVoice-ASR-HF"
|
| 480 |
pipe = pipeline("any-to-any", model=model_id, device_map="auto")
|
| 481 |
chat_template = [
|
| 482 |
{
|