bezzam HF Staff committed on
Commit
59580e4
·
verified ·
1 Parent(s): f6e5a42

Upload VibeVoiceAsrForConditionalGeneration

Browse files
config.json CHANGED
@@ -23,6 +23,7 @@
23
  "ffn_expansion": 4,
24
  "hidden_act": "gelu",
25
  "hidden_size": 64,
 
26
  "kernel_size": 7,
27
  "layer_scale_init_value": 1e-06,
28
  "model_type": "vibevoice_asr_encoder",
@@ -63,6 +64,7 @@
63
  "ffn_expansion": 4,
64
  "hidden_act": "gelu",
65
  "hidden_size": 128,
 
66
  "kernel_size": 7,
67
  "layer_scale_init_value": 1e-06,
68
  "model_type": "vibevoice_asr_encoder",
 
23
  "ffn_expansion": 4,
24
  "hidden_act": "gelu",
25
  "hidden_size": 64,
26
+ "initializer_range": 0.01,
27
  "kernel_size": 7,
28
  "layer_scale_init_value": 1e-06,
29
  "model_type": "vibevoice_asr_encoder",
 
64
  "ffn_expansion": 4,
65
  "hidden_act": "gelu",
66
  "hidden_size": 128,
67
+ "initializer_range": 0.01,
68
  "kernel_size": 7,
69
  "layer_scale_init_value": 1e-06,
70
  "model_type": "vibevoice_asr_encoder",
model-00007-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ae098071e2891cef7442f4f9869568fdf5f50e86bab55880650c45b9f659784
3
- size 2493791232
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e7b2afc2317cb1250bdad558ecb5aaccbf6a9072f5eab695ae60e827fe8bb66
3
+ size 2482226384
model-00008-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1fbe26228575ec5eeedcdaa3ef903671c008650dc5a34da76e726de40a1f68b1
3
- size 25697544
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dfbf12dbdabde3e406b1c83826493225bb9a7e09e7e9a2466e1022e3cdcf81e
3
+ size 37262376
model.safetensors.index.json CHANGED
@@ -626,8 +626,8 @@
626
  "multi_modal_projector.acoustic_norm.weight": "model-00007-of-00008.safetensors",
627
  "multi_modal_projector.semantic_linear_1.bias": "model-00007-of-00008.safetensors",
628
  "multi_modal_projector.semantic_linear_1.weight": "model-00007-of-00008.safetensors",
629
- "multi_modal_projector.semantic_linear_2.bias": "model-00008-of-00008.safetensors",
630
- "multi_modal_projector.semantic_linear_2.weight": "model-00008-of-00008.safetensors",
631
  "multi_modal_projector.semantic_norm.weight": "model-00007-of-00008.safetensors",
632
  "semantic_tokenizer.encoder.conv_layers.0.conv.conv.bias": "model-00007-of-00008.safetensors",
633
  "semantic_tokenizer.encoder.conv_layers.0.conv.conv.weight": "model-00007-of-00008.safetensors",
@@ -863,16 +863,16 @@
863
  "semantic_tokenizer.encoder.conv_layers.5.stage.6.norm.weight": "model-00007-of-00008.safetensors",
864
  "semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn.linear1.bias": "model-00007-of-00008.safetensors",
865
  "semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn.linear1.weight": "model-00007-of-00008.safetensors",
866
- "semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn.linear2.bias": "model-00007-of-00008.safetensors",
867
- "semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn.linear2.weight": "model-00007-of-00008.safetensors",
868
  "semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn_gamma": "model-00007-of-00008.safetensors",
869
  "semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn_norm.weight": "model-00007-of-00008.safetensors",
870
  "semantic_tokenizer.encoder.conv_layers.5.stage.7.gamma": "model-00007-of-00008.safetensors",
871
- "semantic_tokenizer.encoder.conv_layers.5.stage.7.mixer.conv.bias": "model-00007-of-00008.safetensors",
872
- "semantic_tokenizer.encoder.conv_layers.5.stage.7.mixer.conv.weight": "model-00007-of-00008.safetensors",
873
  "semantic_tokenizer.encoder.conv_layers.5.stage.7.norm.weight": "model-00007-of-00008.safetensors",
874
- "semantic_tokenizer.encoder.head.conv.bias": "model-00007-of-00008.safetensors",
875
- "semantic_tokenizer.encoder.head.conv.weight": "model-00007-of-00008.safetensors",
876
  "semantic_tokenizer.encoder.stem.conv.conv.bias": "model-00007-of-00008.safetensors",
877
  "semantic_tokenizer.encoder.stem.conv.conv.weight": "model-00007-of-00008.safetensors",
878
  "semantic_tokenizer.encoder.stem.stage.0.ffn.linear1.bias": "model-00007-of-00008.safetensors",
 
626
  "multi_modal_projector.acoustic_norm.weight": "model-00007-of-00008.safetensors",
627
  "multi_modal_projector.semantic_linear_1.bias": "model-00007-of-00008.safetensors",
628
  "multi_modal_projector.semantic_linear_1.weight": "model-00007-of-00008.safetensors",
629
+ "multi_modal_projector.semantic_linear_2.bias": "model-00007-of-00008.safetensors",
630
+ "multi_modal_projector.semantic_linear_2.weight": "model-00007-of-00008.safetensors",
631
  "multi_modal_projector.semantic_norm.weight": "model-00007-of-00008.safetensors",
632
  "semantic_tokenizer.encoder.conv_layers.0.conv.conv.bias": "model-00007-of-00008.safetensors",
633
  "semantic_tokenizer.encoder.conv_layers.0.conv.conv.weight": "model-00007-of-00008.safetensors",
 
863
  "semantic_tokenizer.encoder.conv_layers.5.stage.6.norm.weight": "model-00007-of-00008.safetensors",
864
  "semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn.linear1.bias": "model-00007-of-00008.safetensors",
865
  "semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn.linear1.weight": "model-00007-of-00008.safetensors",
866
+ "semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn.linear2.bias": "model-00008-of-00008.safetensors",
867
+ "semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn.linear2.weight": "model-00008-of-00008.safetensors",
868
  "semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn_gamma": "model-00007-of-00008.safetensors",
869
  "semantic_tokenizer.encoder.conv_layers.5.stage.7.ffn_norm.weight": "model-00007-of-00008.safetensors",
870
  "semantic_tokenizer.encoder.conv_layers.5.stage.7.gamma": "model-00007-of-00008.safetensors",
871
+ "semantic_tokenizer.encoder.conv_layers.5.stage.7.mixer.conv.bias": "model-00008-of-00008.safetensors",
872
+ "semantic_tokenizer.encoder.conv_layers.5.stage.7.mixer.conv.weight": "model-00008-of-00008.safetensors",
873
  "semantic_tokenizer.encoder.conv_layers.5.stage.7.norm.weight": "model-00007-of-00008.safetensors",
874
+ "semantic_tokenizer.encoder.head.conv.bias": "model-00008-of-00008.safetensors",
875
+ "semantic_tokenizer.encoder.head.conv.weight": "model-00008-of-00008.safetensors",
876
  "semantic_tokenizer.encoder.stem.conv.conv.bias": "model-00007-of-00008.safetensors",
877
  "semantic_tokenizer.encoder.stem.conv.conv.weight": "model-00007-of-00008.safetensors",
878
  "semantic_tokenizer.encoder.stem.stage.0.ffn.linear1.bias": "model-00007-of-00008.safetensors",