frontierai committed on
Commit
e7a4ffe
·
verified ·
1 Parent(s): d1e3119

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ figures/VibeVoice_ASR_archi.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,11 +1,47 @@
1
  ---
2
- license: mit
3
  language:
4
- - zh
5
  - en
6
- pipeline_tag: automatic-speech-recognition
 
 
7
  tags:
8
- - ASR
9
- - Diarization
10
- - Transcription
11
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
 
2
  language:
 
3
  - en
4
+ - zh
5
+ license: mit
6
+ pipeline_tag: speech-to-text
7
  tags:
8
+ - Podcast
9
+ library_name: transformers
10
+ ---
11
+
12
+
13
+ ## VibeVoice-ASR: Long-Form Rich Transcription with User Prompts
14
+
15
+ **VibeVoice-ASR** is the latest addition to the **VibeVoice** family. While the original VibeVoice / VibeVoice-Realtime focused on expressive TTS, **VibeVoice-ASR** focuses on understanding long-form speech with high precision and rich metadata.
16
+
17
+ It is a unified speech-to-text model designed to handle **1-hour long-form audio** in a single pass, generating structured transcriptions containing **Who (Speaker), When (Timestamps), and What (Content)**, with support for **User-Customized Context**.
18
+
19
+ ➡️ **Code:** [microsoft/VibeVoice](https://github.com/microsoft/VibeVoice)
20
+
21
+ <p align="left">
22
+ <img src="figures/VibeVoice_ASR_archi.png" alt="VibeVoice-ASR Architecture" height="250px">
23
+ </p>
24
+
25
+
26
+ ## 🔥 Key Features
27
+
28
+ - **🕒 60-min Single-Pass Processing**:
29
+ Unlike conventional ASR models that slice audio into short chunks (often losing global context), VibeVoice-ASR accepts up to **60 minutes** of continuous audio input within a 64K-token context length. This ensures consistent speaker tracking and semantic coherence across the entire hour.
30
+
31
+ - **👤 Optional Context Injection**:
32
+ Users can provide customized context (e.g., specific names, technical terms, or background info) to guide the recognition process, significantly improving accuracy on domain-specific content.
33
+
34
+ - **📝 Rich Transcription (Who, When, What)**:
35
+ The model performs ASR, Diarization, and Timestamping simultaneously. The output is a structured sequence indicating *who* said *what* at *which time*.
36
+
37
+
38
+ ## Installation and Usage
39
+
40
+ Please refer to the [GitHub README](https://github.com/microsoft/VibeVoice/blob/main/docs/vibevoice-asr.md#installation) for installation and usage instructions.
41
+
42
+ ## License
43
+ This project is licensed under the MIT License.
44
+
45
+ ## Contact
46
+ This project was conducted by members of Microsoft Research. We welcome feedback and collaboration from our audience. If you have suggestions, questions, or observe unexpected/offensive behavior in our technology, please contact us at VibeVoice@microsoft.com.
47
+ If the team receives reports of undesired behavior or identifies issues independently, we will update this repository with appropriate mitigations.
config.json ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "acoustic_tokenizer_config": {
4
+ "causal": true,
5
+ "channels": 1,
6
+ "conv_bias": true,
7
+ "conv_norm": "none",
8
+ "corpus_normalize": 0.0,
9
+ "decoder_depths": null,
10
+ "decoder_n_filters": 32,
11
+ "decoder_ratios": [
12
+ 8,
13
+ 5,
14
+ 5,
15
+ 4,
16
+ 2,
17
+ 2
18
+ ],
19
+ "disable_last_norm": true,
20
+ "dtype": "bfloat16",
21
+ "encoder_depths": "3-3-3-3-3-3-8",
22
+ "encoder_n_filters": 32,
23
+ "encoder_ratios": [
24
+ 8,
25
+ 5,
26
+ 5,
27
+ 4,
28
+ 2,
29
+ 2
30
+ ],
31
+ "fix_std": 0.5,
32
+ "layer_scale_init_value": 1e-06,
33
+ "layernorm": "RMSNorm",
34
+ "layernorm_elementwise_affine": true,
35
+ "layernorm_eps": 1e-05,
36
+ "mixer_layer": "depthwise_conv",
37
+ "model_type": "vibevoice_acoustic_tokenizer",
38
+ "pad_mode": "constant",
39
+ "std_dist_type": "gaussian",
40
+ "vae_dim": 64,
41
+ "weight_init_value": 0.01
42
+ },
43
+ "acoustic_vae_dim": 64,
44
+ "architectures": [
45
+ "VibeVoiceForASRTraining"
46
+ ],
47
+ "decoder_config": {
48
+ "attention_dropout": 0.0,
49
+ "dtype": "bfloat16",
50
+ "hidden_act": "silu",
51
+ "hidden_size": 3584,
52
+ "initializer_range": 0.02,
53
+ "intermediate_size": 18944,
54
+ "layer_types": [
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention",
59
+ "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention",
64
+ "full_attention",
65
+ "full_attention",
66
+ "full_attention",
67
+ "full_attention",
68
+ "full_attention",
69
+ "full_attention",
70
+ "full_attention",
71
+ "full_attention",
72
+ "full_attention",
73
+ "full_attention",
74
+ "full_attention",
75
+ "full_attention",
76
+ "full_attention",
77
+ "full_attention",
78
+ "full_attention",
79
+ "full_attention",
80
+ "full_attention",
81
+ "full_attention",
82
+ "full_attention"
83
+ ],
84
+ "max_position_embeddings": 131072,
85
+ "max_window_layers": 28,
86
+ "model_type": "qwen2",
87
+ "num_attention_heads": 28,
88
+ "num_hidden_layers": 28,
89
+ "num_key_value_heads": 4,
90
+ "rms_norm_eps": 1e-06,
91
+ "rope_scaling": null,
92
+ "rope_theta": 1000000.0,
93
+ "sliding_window": null,
94
+ "use_cache": true,
95
+ "use_mrope": false,
96
+ "use_sliding_window": false,
97
+ "vocab_size": 152064
98
+ },
99
+ "diffusion_head_config": {
100
+ "ddpm_batch_mul": 4,
101
+ "ddpm_beta_schedule": "cosine",
102
+ "ddpm_num_inference_steps": 20,
103
+ "ddpm_num_steps": 1000,
104
+ "diffusion_type": "ddpm",
105
+ "head_ffn_ratio": 3.0,
106
+ "head_layers": 4,
107
+ "hidden_size": 3584,
108
+ "latent_size": 64,
109
+ "model_type": "vibepod_diffusion_head",
110
+ "prediction_type": "v_prediction",
111
+ "rms_norm_eps": 1e-05,
112
+ "speech_vae_dim": 64
113
+ },
114
+ "dtype": "float32",
115
+ "model_type": "vibevoice",
116
+ "semantic_tokenizer_config": {
117
+ "causal": true,
118
+ "channels": 1,
119
+ "conv_bias": true,
120
+ "conv_norm": "none",
121
+ "corpus_normalize": 0.0,
122
+ "disable_last_norm": true,
123
+ "dtype": "bfloat16",
124
+ "encoder_depths": "3-3-3-3-3-3-8",
125
+ "encoder_n_filters": 32,
126
+ "encoder_ratios": [
127
+ 8,
128
+ 5,
129
+ 5,
130
+ 4,
131
+ 2,
132
+ 2
133
+ ],
134
+ "fix_std": 0,
135
+ "layer_scale_init_value": 1e-06,
136
+ "layernorm": "RMSNorm",
137
+ "layernorm_elementwise_affine": true,
138
+ "layernorm_eps": 1e-05,
139
+ "mixer_layer": "depthwise_conv",
140
+ "model_type": "vibevoice_semantic_tokenizer",
141
+ "pad_mode": "constant",
142
+ "std_dist_type": "none",
143
+ "vae_dim": 128,
144
+ "weight_init_value": 0.01
145
+ },
146
+ "semantic_vae_dim": 128,
147
+ "transformers_version": "4.57.6"
148
+ }
figures/VibeVoice_ASR_archi.png ADDED

Git LFS Details

  • SHA256: ae2623a1eaa7ac18cdbba1a246a6ffe9ca78e976e9e7728750f140e6b2ebb90a
  • Pointer size: 131 Bytes
  • Size of remote file: 168 kB
model-00001-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5548c67885d423ba184bc8c33f2e9f81b582a6d119cef79907e19a274b916637
3
+ size 2488346272
model-00002-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:163023c61a3fb047745cbaf53ed41c1e27e515e9786a376e122bfac2ea6e687e
3
+ size 2389315976
model-00003-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e021702dfac2c52e8fdd6688de82c118be7bb7ad9b5c7988725ec63c44a64fb
3
+ size 2466376368
model-00004-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b17657bb151daa117a5a4671374ac1b248acb696691a2a67ac227a1115925e30
3
+ size 2466376400
model-00005-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ed4e457268f7b02dda5cffe16b3a32614ccc2ccfe5de2db39bdd79700836406
3
+ size 2499431136
model-00006-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6de8246bb042fd853b57d40995efd289ea44e4d1b611cec2e122570b8d2122bd
3
+ size 2483469928
model-00007-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2ba6960d994dc7598efc6796f85ab097da7708f4dd56095f7fccf4df8dc00e5
3
+ size 1464887482
model-00008-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b9d9b328f85a25b4efca712d31513c6eed9e178152cc8cf4a6f0c2cd2bb623f
3
+ size 1089994848
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff