Upload VibeVoiceAsrForConditionalGeneration
Browse files- config.json +2 -0
- model-00007-of-00008.safetensors +2 -2
- model-00008-of-00008.safetensors +2 -2
- model.safetensors.index.json +8 -8
config.json
CHANGED
|
@@ -23,6 +23,7 @@
|
|
| 23 |
"ffn_expansion": 4,
|
| 24 |
"hidden_act": "gelu",
|
| 25 |
"hidden_size": 64,
|
|
|
|
| 26 |
"kernel_size": 7,
|
| 27 |
"layer_scale_init_value": 1e-06,
|
| 28 |
"model_type": "vibevoice_asr_encoder",
|
|
@@ -63,6 +64,7 @@
|
|
| 63 |
"ffn_expansion": 4,
|
| 64 |
"hidden_act": "gelu",
|
| 65 |
"hidden_size": 128,
|
|
|
|
| 66 |
"kernel_size": 7,
|
| 67 |
"layer_scale_init_value": 1e-06,
|
| 68 |
"model_type": "vibevoice_asr_encoder",
|
|
|
|
| 23 |
"ffn_expansion": 4,
|
| 24 |
"hidden_act": "gelu",
|
| 25 |
"hidden_size": 64,
|
| 26 |
+
"initializer_range": 0.01,
|
| 27 |
"kernel_size": 7,
|
| 28 |
"layer_scale_init_value": 1e-06,
|
| 29 |
"model_type": "vibevoice_asr_encoder",
|
|
|
|
| 64 |
"ffn_expansion": 4,
|
| 65 |
"hidden_act": "gelu",
|
| 66 |
"hidden_size": 128,
|
| 67 |
+
"initializer_range": 0.01,
|
| 68 |
"kernel_size": 7,
|
| 69 |
"layer_scale_init_value": 1e-06,
|
| 70 |
"model_type": "vibevoice_asr_encoder",
|
model-00007-of-00008.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e7b2afc2317cb1250bdad558ecb5aaccbf6a9072f5eab695ae60e827fe8bb66
|
| 3 |
+
size 2482226384
|
model-00008-of-00008.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6dfbf12dbdabde3e406b1c83826493225bb9a7e09e7e9a2466e1022e3cdcf81e
|
| 3 |
+
size 37262376
|
model.safetensors.index.json
CHANGED
|
@@ -626,8 +626,8 @@
|
|
| 626 |
"multi_modal_projector.acoustic_norm.weight": "model-00007-of-00008.safetensors",
|
| 627 |
"multi_modal_projector.semantic_linear_1.bias": "model-00007-of-00008.safetensors",
|
| 628 |
"multi_modal_projector.semantic_linear_1.weight": "model-00007-of-00008.safetensors",
|
| 629 |
-
"multi_modal_projector.semantic_linear_2.bias": "model-
|
| 630 |
-
"multi_modal_projector.semantic_linear_2.weight": "model-
|
| 631 |
"multi_modal_projector.semantic_norm.weight": "model-00007-of-00008.safetensors",
|
| 632 |
"semantic_tokenizer.encoder.conv_layers.0.conv.conv.bias": "model-00007-of-00008.safetensors",
|
| 633 |
"semantic_tokenizer.encoder.conv_layers.0.conv.conv.weight": "model-00007-of-00008.safetensors",
|
|
@@ -863,16 +863,16 @@
|
|
| 863 |
"semantic_tokenizer.encoder.conv_layers.5.stage.6.norm.weight": "model-00007-of-00008.safetensors",
|
| 864 |
"semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn.linear1.bias": "model-00007-of-00008.safetensors",
|
| 865 |
"semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn.linear1.weight": "model-00007-of-00008.safetensors",
|
| 866 |
-
"semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn.linear2.bias": "model-
|
| 867 |
-
"semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn.linear2.weight": "model-
|
| 868 |
"semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn_gamma": "model-00007-of-00008.safetensors",
|
| 869 |
"semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn_norm.weight": "model-00007-of-00008.safetensors",
|
| 870 |
"semantic_tokenizer.encoder.conv_layers.5.stage.7.gamma": "model-00007-of-00008.safetensors",
|
| 871 |
-
"semantic_tokenizer.encoder.conv_layers.5.stage.7.mixer.conv.bias": "model-
|
| 872 |
-
"semantic_tokenizer.encoder.conv_layers.5.stage.7.mixer.conv.weight": "model-
|
| 873 |
"semantic_tokenizer.encoder.conv_layers.5.stage.7.norm.weight": "model-00007-of-00008.safetensors",
|
| 874 |
-
"semantic_tokenizer.encoder.head.conv.bias": "model-
|
| 875 |
-
"semantic_tokenizer.encoder.head.conv.weight": "model-
|
| 876 |
"semantic_tokenizer.encoder.stem.conv.conv.bias": "model-00007-of-00008.safetensors",
|
| 877 |
"semantic_tokenizer.encoder.stem.conv.conv.weight": "model-00007-of-00008.safetensors",
|
| 878 |
"semantic_tokenizer.encoder.stem.stage.0.ffn.linear1.bias": "model-00007-of-00008.safetensors",
|
|
|
|
| 626 |
"multi_modal_projector.acoustic_norm.weight": "model-00007-of-00008.safetensors",
|
| 627 |
"multi_modal_projector.semantic_linear_1.bias": "model-00007-of-00008.safetensors",
|
| 628 |
"multi_modal_projector.semantic_linear_1.weight": "model-00007-of-00008.safetensors",
|
| 629 |
+
"multi_modal_projector.semantic_linear_2.bias": "model-00007-of-00008.safetensors",
|
| 630 |
+
"multi_modal_projector.semantic_linear_2.weight": "model-00007-of-00008.safetensors",
|
| 631 |
"multi_modal_projector.semantic_norm.weight": "model-00007-of-00008.safetensors",
|
| 632 |
"semantic_tokenizer.encoder.conv_layers.0.conv.conv.bias": "model-00007-of-00008.safetensors",
|
| 633 |
"semantic_tokenizer.encoder.conv_layers.0.conv.conv.weight": "model-00007-of-00008.safetensors",
|
|
|
|
| 863 |
"semantic_tokenizer.encoder.conv_layers.5.stage.6.norm.weight": "model-00007-of-00008.safetensors",
|
| 864 |
"semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn.linear1.bias": "model-00007-of-00008.safetensors",
|
| 865 |
"semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn.linear1.weight": "model-00007-of-00008.safetensors",
|
| 866 |
+
"semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn.linear2.bias": "model-00008-of-00008.safetensors",
|
| 867 |
+
"semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn.linear2.weight": "model-00008-of-00008.safetensors",
|
| 868 |
"semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn_gamma": "model-00007-of-00008.safetensors",
|
| 869 |
"semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn_norm.weight": "model-00007-of-00008.safetensors",
|
| 870 |
"semantic_tokenizer.encoder.conv_layers.5.stage.7.gamma": "model-00007-of-00008.safetensors",
|
| 871 |
+
"semantic_tokenizer.encoder.conv_layers.5.stage.7.mixer.conv.bias": "model-00008-of-00008.safetensors",
|
| 872 |
+
"semantic_tokenizer.encoder.conv_layers.5.stage.7.mixer.conv.weight": "model-00008-of-00008.safetensors",
|
| 873 |
"semantic_tokenizer.encoder.conv_layers.5.stage.7.norm.weight": "model-00007-of-00008.safetensors",
|
| 874 |
+
"semantic_tokenizer.encoder.head.conv.bias": "model-00008-of-00008.safetensors",
|
| 875 |
+
"semantic_tokenizer.encoder.head.conv.weight": "model-00008-of-00008.safetensors",
|
| 876 |
"semantic_tokenizer.encoder.stem.conv.conv.bias": "model-00007-of-00008.safetensors",
|
| 877 |
"semantic_tokenizer.encoder.stem.conv.conv.weight": "model-00007-of-00008.safetensors",
|
| 878 |
"semantic_tokenizer.encoder.stem.stage.0.ffn.linear1.bias": "model-00007-of-00008.safetensors",
|