litagin frontierai commited on
Commit
ed86d3c
·
verified ·
0 Parent(s):

Duplicate from microsoft/VibeVoice-ASR

Browse files

Co-authored-by: FW <frontierai@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ figures/VibeVoice_ASR_archi.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ - zh
5
+ license: mit
6
+ pipeline_tag: automatic-speech-recognition
7
+ tags:
8
+ - ASR
9
+ - Transcription
10
+ - Diarization
11
+ - Speech-to-Text
12
+ library_name: transformers
13
+ ---
14
+
15
+
16
+ ## VibeVoice-ASR
17
+ [![GitHub](https://img.shields.io/badge/GitHub-Repo-black?logo=github)](https://github.com/microsoft/VibeVoice)
18
+ [![Live Playground](https://img.shields.io/badge/Live-Playground-green?logo=gradio)](https://aka.ms/vibevoice-asr)
19
+
20
+ **VibeVoice-ASR** is a unified speech-to-text model designed to handle **60-minute long-form audio** in a single pass, generating structured transcriptions containing **Who (Speaker), When (Timestamps), and What (Content)**, with support for **Customized Hotwords**.
21
+
22
+ ➡️ **Code:** [microsoft/VibeVoice](https://github.com/microsoft/VibeVoice)<br>
23
+ ➡️ **Demo:** [VibeVoice-ASR-Demo](https://aka.ms/vibevoice-asr)
24
+
25
+ <p align="left">
26
+ <img src="figures/VibeVoice_ASR_archi.png" alt="VibeVoice-ASR Architecture" height="250px">
27
+ </p>
28
+
29
+
30
+ ## 🔥 Key Features
31
+
32
+ - **🕒 60-minute Single-Pass Processing**:
33
+ Unlike conventional ASR models that slice audio into short chunks (often losing global context), VibeVoice ASR accepts up to **60 minutes** of continuous audio input within a 64K-token context length. This ensures consistent speaker tracking and semantic coherence across the entire hour.
34
+
35
+ - **👤 Customized Hotwords**:
36
+ Users can provide customized hotwords (e.g., specific names, technical terms, or background info) to guide the recognition process, significantly improving accuracy on domain-specific content.
37
+
38
+ - **📝 Rich Transcription (Who, When, What)**:
39
+ The model jointly performs ASR, diarization, and timestamping, producing a structured output that indicates *who* said *what* and *when*.
40
+
41
+
42
+
43
+
44
+ ## Evaluation
45
+ <p align="center">
46
+ <img src="figures/DER.jpg" alt="DER" width="70%">
47
+ <img src="figures/cpWER.jpg" alt="cpWER" width="70%">
48
+ <img src="figures/tcpWER.jpg" alt="tcpWER" width="70%">
49
+ </p>
50
+
51
+ ## Installation and Usage
52
+
53
+ Please refer to [GitHub README](https://github.com/microsoft/VibeVoice/blob/main/docs/vibevoice-asr.md#installation).
54
+
55
+ ## License
56
+ This project is licensed under the MIT License.
57
+
58
+ ## Contact
59
+ This project was conducted by members of Microsoft Research. We welcome feedback and collaboration from our audience. If you have suggestions, questions, or observe unexpected/offensive behavior in our technology, please contact us at VibeVoice@microsoft.com.
60
+ If the team receives reports of undesired behavior or identifies issues independently, we will update this repository with appropriate mitigations.
config.json ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "acoustic_tokenizer_config": {
4
+ "causal": true,
5
+ "channels": 1,
6
+ "conv_bias": true,
7
+ "conv_norm": "none",
8
+ "corpus_normalize": 0.0,
9
+ "decoder_depths": null,
10
+ "decoder_n_filters": 32,
11
+ "decoder_ratios": [
12
+ 8,
13
+ 5,
14
+ 5,
15
+ 4,
16
+ 2,
17
+ 2
18
+ ],
19
+ "disable_last_norm": true,
20
+ "dtype": "bfloat16",
21
+ "encoder_depths": "3-3-3-3-3-3-8",
22
+ "encoder_n_filters": 32,
23
+ "encoder_ratios": [
24
+ 8,
25
+ 5,
26
+ 5,
27
+ 4,
28
+ 2,
29
+ 2
30
+ ],
31
+ "fix_std": 0.5,
32
+ "layer_scale_init_value": 1e-06,
33
+ "layernorm": "RMSNorm",
34
+ "layernorm_elementwise_affine": true,
35
+ "layernorm_eps": 1e-05,
36
+ "mixer_layer": "depthwise_conv",
37
+ "model_type": "vibevoice_acoustic_tokenizer",
38
+ "pad_mode": "constant",
39
+ "std_dist_type": "gaussian",
40
+ "vae_dim": 64,
41
+ "weight_init_value": 0.01
42
+ },
43
+ "acoustic_vae_dim": 64,
44
+ "architectures": [
45
+ "VibeVoiceForASRTraining"
46
+ ],
47
+ "decoder_config": {
48
+ "attention_dropout": 0.0,
49
+ "dtype": "bfloat16",
50
+ "hidden_act": "silu",
51
+ "hidden_size": 3584,
52
+ "initializer_range": 0.02,
53
+ "intermediate_size": 18944,
54
+ "layer_types": [
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention",
59
+ "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention",
64
+ "full_attention",
65
+ "full_attention",
66
+ "full_attention",
67
+ "full_attention",
68
+ "full_attention",
69
+ "full_attention",
70
+ "full_attention",
71
+ "full_attention",
72
+ "full_attention",
73
+ "full_attention",
74
+ "full_attention",
75
+ "full_attention",
76
+ "full_attention",
77
+ "full_attention",
78
+ "full_attention",
79
+ "full_attention",
80
+ "full_attention",
81
+ "full_attention",
82
+ "full_attention"
83
+ ],
84
+ "max_position_embeddings": 131072,
85
+ "max_window_layers": 28,
86
+ "model_type": "qwen2",
87
+ "num_attention_heads": 28,
88
+ "num_hidden_layers": 28,
89
+ "num_key_value_heads": 4,
90
+ "rms_norm_eps": 1e-06,
91
+ "rope_scaling": null,
92
+ "rope_theta": 1000000.0,
93
+ "sliding_window": null,
94
+ "use_cache": true,
95
+ "use_mrope": false,
96
+ "use_sliding_window": false,
97
+ "vocab_size": 152064
98
+ },
99
+ "diffusion_head_config": {
100
+ "ddpm_batch_mul": 4,
101
+ "ddpm_beta_schedule": "cosine",
102
+ "ddpm_num_inference_steps": 20,
103
+ "ddpm_num_steps": 1000,
104
+ "diffusion_type": "ddpm",
105
+ "head_ffn_ratio": 3.0,
106
+ "head_layers": 4,
107
+ "hidden_size": 3584,
108
+ "latent_size": 64,
109
+ "model_type": "vibepod_diffusion_head",
110
+ "prediction_type": "v_prediction",
111
+ "rms_norm_eps": 1e-05,
112
+ "speech_vae_dim": 64
113
+ },
114
+ "dtype": "float32",
115
+ "model_type": "vibevoice",
116
+ "semantic_tokenizer_config": {
117
+ "causal": true,
118
+ "channels": 1,
119
+ "conv_bias": true,
120
+ "conv_norm": "none",
121
+ "corpus_normalize": 0.0,
122
+ "disable_last_norm": true,
123
+ "dtype": "bfloat16",
124
+ "encoder_depths": "3-3-3-3-3-3-8",
125
+ "encoder_n_filters": 32,
126
+ "encoder_ratios": [
127
+ 8,
128
+ 5,
129
+ 5,
130
+ 4,
131
+ 2,
132
+ 2
133
+ ],
134
+ "fix_std": 0,
135
+ "layer_scale_init_value": 1e-06,
136
+ "layernorm": "RMSNorm",
137
+ "layernorm_elementwise_affine": true,
138
+ "layernorm_eps": 1e-05,
139
+ "mixer_layer": "depthwise_conv",
140
+ "model_type": "vibevoice_semantic_tokenizer",
141
+ "pad_mode": "constant",
142
+ "std_dist_type": "none",
143
+ "vae_dim": 128,
144
+ "weight_init_value": 0.01
145
+ },
146
+ "semantic_vae_dim": 128,
147
+ "transformers_version": "4.57.6"
148
+ }
figures/DER.jpg ADDED
figures/VibeVoice_ASR_archi.png ADDED

Git LFS Details

  • SHA256: ae2623a1eaa7ac18cdbba1a246a6ffe9ca78e976e9e7728750f140e6b2ebb90a
  • Pointer size: 131 Bytes
  • Size of remote file: 168 kB
figures/cpWER.jpg ADDED
figures/tcpWER.jpg ADDED
model-00001-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5548c67885d423ba184bc8c33f2e9f81b582a6d119cef79907e19a274b916637
3
+ size 2488346272
model-00002-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:163023c61a3fb047745cbaf53ed41c1e27e515e9786a376e122bfac2ea6e687e
3
+ size 2389315976
model-00003-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e021702dfac2c52e8fdd6688de82c118be7bb7ad9b5c7988725ec63c44a64fb
3
+ size 2466376368
model-00004-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b17657bb151daa117a5a4671374ac1b248acb696691a2a67ac227a1115925e30
3
+ size 2466376400
model-00005-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ed4e457268f7b02dda5cffe16b3a32614ccc2ccfe5de2db39bdd79700836406
3
+ size 2499431136
model-00006-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6de8246bb042fd853b57d40995efd289ea44e4d1b611cec2e122570b8d2122bd
3
+ size 2483469928
model-00007-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2ba6960d994dc7598efc6796f85ab097da7708f4dd56095f7fccf4df8dc00e5
3
+ size 1464887482
model-00008-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b9d9b328f85a25b4efca712d31513c6eed9e178152cc8cf4a6f0c2cd2bb623f
3
+ size 1089994848
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff