Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- README.md +10 -13
- config.json +320 -43
- fine_acoustics.safetensors +2 -2
- model.safetensors +2 -2
- v2/announcer.npz +3 -0
- v2/de_speaker_0.npz +3 -0
- v2/de_speaker_1.npz +3 -0
- v2/de_speaker_2.npz +3 -0
- v2/de_speaker_3.npz +3 -0
- v2/de_speaker_4.npz +3 -0
- v2/de_speaker_5.npz +3 -0
- v2/de_speaker_6.npz +3 -0
- v2/de_speaker_7.npz +3 -0
- v2/de_speaker_8.npz +3 -0
- v2/de_speaker_9.npz +3 -0
- v2/en_speaker_0.npz +3 -0
- v2/en_speaker_1.npz +3 -0
- v2/en_speaker_2.npz +3 -0
- v2/en_speaker_3.npz +3 -0
- v2/en_speaker_4.npz +3 -0
- v2/en_speaker_5.npz +3 -0
- v2/en_speaker_6.npz +3 -0
- v2/en_speaker_7.npz +3 -0
- v2/en_speaker_8.npz +3 -0
- v2/en_speaker_9.npz +3 -0
- v2/es_speaker_0.npz +3 -0
- v2/es_speaker_1.npz +3 -0
- v2/es_speaker_2.npz +3 -0
- v2/es_speaker_3.npz +3 -0
- v2/es_speaker_4.npz +3 -0
- v2/es_speaker_5.npz +3 -0
- v2/es_speaker_6.npz +3 -0
- v2/es_speaker_7.npz +3 -0
- v2/es_speaker_8.npz +3 -0
- v2/es_speaker_9.npz +3 -0
- v2/fr_speaker_0.npz +3 -0
- v2/fr_speaker_1.npz +3 -0
- v2/fr_speaker_2.npz +3 -0
- v2/fr_speaker_3.npz +3 -0
- v2/fr_speaker_4.npz +3 -0
- v2/fr_speaker_5.npz +3 -0
- v2/fr_speaker_6.npz +3 -0
- v2/fr_speaker_7.npz +3 -0
- v2/fr_speaker_8.npz +3 -0
- v2/fr_speaker_9.npz +3 -0
- v2/hi_speaker_0.npz +3 -0
- v2/hi_speaker_1.npz +3 -0
- v2/hi_speaker_2.npz +3 -0
- v2/hi_speaker_3.npz +3 -0
- v2/hi_speaker_4.npz +3 -0
README.md
CHANGED
|
@@ -1,20 +1,17 @@
|
|
| 1 |
# mlx-community/bark
|
| 2 |
|
| 3 |
-
|
| 4 |
|
| 5 |
-
|
| 6 |
-
It generates highly realistic, multilingual speech as well as music, background noise, and simple sound effects.
|
| 7 |
|
| 8 |
-
|
| 9 |
|
| 10 |
-
|
| 11 |
-
pip install -U mlx-audio
|
| 12 |
-
python -m mlx_audio.tts.generate --model mlx-community/bark --text "Hello, this is a test."
|
| 13 |
-
```
|
| 14 |
|
| 15 |
-
|
|
|
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
| 1 |
# mlx-community/bark
|
| 2 |
|
| 3 |
+
MLX format conversion of [suno/bark](https://huggingface.co/suno/bark).
|
| 4 |
|
| 5 |
+
## Original model
|
|
|
|
| 6 |
|
| 7 |
+
Bark is a transformer-based text-to-speech model by [Suno](https://suno.ai).
|
| 8 |
|
| 9 |
+
## Use with mlx-audio
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
+
```python
|
| 12 |
+
from mlx_audio.tts.utils import load_model
|
| 13 |
|
| 14 |
+
model = load_model("mlx-community/bark")
|
| 15 |
+
for result in model.generate("Hello, this is a test of the Bark TTS model."):
|
| 16 |
+
print(result.audio.shape, result.sample_rate)
|
| 17 |
+
```
|
config.json
CHANGED
|
@@ -1,66 +1,343 @@
|
|
| 1 |
{
|
|
|
|
| 2 |
"architectures": [
|
| 3 |
"BarkModel"
|
| 4 |
],
|
| 5 |
-
"model_type": "bark",
|
| 6 |
-
"model_size": "base",
|
| 7 |
-
"initializer_range": 0.02,
|
| 8 |
-
"torch_dtype": "float32",
|
| 9 |
-
"semantic_config": {
|
| 10 |
-
"model_type": "semantic",
|
| 11 |
-
"architectures": [
|
| 12 |
-
"BarkSemanticModel"
|
| 13 |
-
],
|
| 14 |
-
"n_layer": 24,
|
| 15 |
-
"n_head": 16,
|
| 16 |
-
"n_embd": 1024,
|
| 17 |
-
"input_vocab_size": 129600,
|
| 18 |
-
"output_vocab_size": 10048,
|
| 19 |
-
"block_size": 1024,
|
| 20 |
-
"bias": false,
|
| 21 |
-
"dropout": 0.0
|
| 22 |
-
},
|
| 23 |
"coarse_acoustics_config": {
|
| 24 |
-
"
|
|
|
|
| 25 |
"architectures": [
|
| 26 |
"BarkCoarseModel"
|
| 27 |
],
|
| 28 |
-
"
|
| 29 |
-
"
|
| 30 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
"input_vocab_size": 12096,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
"output_vocab_size": 12096,
|
| 33 |
-
"
|
| 34 |
-
"
|
| 35 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
},
|
| 37 |
"fine_acoustics_config": {
|
| 38 |
-
"
|
|
|
|
| 39 |
"architectures": [
|
| 40 |
"BarkFineModel"
|
| 41 |
],
|
| 42 |
-
"
|
| 43 |
-
"
|
| 44 |
-
"
|
| 45 |
-
"input_vocab_size": 1056,
|
| 46 |
-
"output_vocab_size": 1056,
|
| 47 |
"block_size": 1024,
|
| 48 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
"dropout": 0.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
"n_codes_total": 8,
|
| 51 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
},
|
| 53 |
-
"
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
| 55 |
"architectures": [
|
| 56 |
-
"
|
| 57 |
],
|
| 58 |
-
"
|
| 59 |
-
"
|
| 60 |
-
"
|
| 61 |
-
"
|
| 62 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
},
|
| 64 |
-
"
|
| 65 |
-
"
|
|
|
|
|
|
|
|
|
|
| 66 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"_commit_hash": null,
|
| 3 |
"architectures": [
|
| 4 |
"BarkModel"
|
| 5 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
"coarse_acoustics_config": {
|
| 7 |
+
"_name_or_path": "",
|
| 8 |
+
"add_cross_attention": false,
|
| 9 |
"architectures": [
|
| 10 |
"BarkCoarseModel"
|
| 11 |
],
|
| 12 |
+
"bad_words_ids": null,
|
| 13 |
+
"begin_suppress_tokens": null,
|
| 14 |
+
"bias": false,
|
| 15 |
+
"block_size": 1024,
|
| 16 |
+
"bos_token_id": null,
|
| 17 |
+
"chunk_size_feed_forward": 0,
|
| 18 |
+
"cross_attention_hidden_size": null,
|
| 19 |
+
"decoder_start_token_id": null,
|
| 20 |
+
"diversity_penalty": 0.0,
|
| 21 |
+
"do_sample": false,
|
| 22 |
+
"dropout": 0.0,
|
| 23 |
+
"early_stopping": false,
|
| 24 |
+
"encoder_no_repeat_ngram_size": 0,
|
| 25 |
+
"eos_token_id": null,
|
| 26 |
+
"exponential_decay_length_penalty": null,
|
| 27 |
+
"finetuning_task": null,
|
| 28 |
+
"forced_bos_token_id": null,
|
| 29 |
+
"forced_eos_token_id": null,
|
| 30 |
+
"hidden_size": 1024,
|
| 31 |
+
"id2label": {
|
| 32 |
+
"0": "LABEL_0",
|
| 33 |
+
"1": "LABEL_1"
|
| 34 |
+
},
|
| 35 |
+
"initializer_range": 0.02,
|
| 36 |
"input_vocab_size": 12096,
|
| 37 |
+
"is_decoder": false,
|
| 38 |
+
"is_encoder_decoder": false,
|
| 39 |
+
"label2id": {
|
| 40 |
+
"LABEL_0": 0,
|
| 41 |
+
"LABEL_1": 1
|
| 42 |
+
},
|
| 43 |
+
"length_penalty": 1.0,
|
| 44 |
+
"max_length": 20,
|
| 45 |
+
"min_length": 0,
|
| 46 |
+
"model_type": "coarse_acoustics",
|
| 47 |
+
"no_repeat_ngram_size": 0,
|
| 48 |
+
"num_beam_groups": 1,
|
| 49 |
+
"num_beams": 1,
|
| 50 |
+
"num_heads": 16,
|
| 51 |
+
"num_layers": 24,
|
| 52 |
+
"num_return_sequences": 1,
|
| 53 |
+
"output_attentions": false,
|
| 54 |
+
"output_hidden_states": false,
|
| 55 |
+
"output_scores": false,
|
| 56 |
"output_vocab_size": 12096,
|
| 57 |
+
"pad_token_id": null,
|
| 58 |
+
"prefix": null,
|
| 59 |
+
"problem_type": null,
|
| 60 |
+
"pruned_heads": {},
|
| 61 |
+
"remove_invalid_values": false,
|
| 62 |
+
"repetition_penalty": 1.0,
|
| 63 |
+
"return_dict": true,
|
| 64 |
+
"return_dict_in_generate": false,
|
| 65 |
+
"sep_token_id": null,
|
| 66 |
+
"suppress_tokens": null,
|
| 67 |
+
"task_specific_params": null,
|
| 68 |
+
"temperature": 1.0,
|
| 69 |
+
"tf_legacy_loss": false,
|
| 70 |
+
"tie_encoder_decoder": false,
|
| 71 |
+
"tie_word_embeddings": true,
|
| 72 |
+
"tokenizer_class": null,
|
| 73 |
+
"top_k": 50,
|
| 74 |
+
"top_p": 1.0,
|
| 75 |
+
"torch_dtype": "float32",
|
| 76 |
+
"torchscript": false,
|
| 77 |
+
"transformers_version": "4.31.0.dev0",
|
| 78 |
+
"typical_p": 1.0,
|
| 79 |
+
"use_bfloat16": false,
|
| 80 |
+
"use_cache": true
|
| 81 |
+
},
|
| 82 |
+
"codec_config": {
|
| 83 |
+
"_name_or_path": "facebook/encodec_24khz",
|
| 84 |
+
"add_cross_attention": false,
|
| 85 |
+
"architectures": [
|
| 86 |
+
"EncodecModel"
|
| 87 |
+
],
|
| 88 |
+
"audio_channels": 1,
|
| 89 |
+
"bad_words_ids": null,
|
| 90 |
+
"begin_suppress_tokens": null,
|
| 91 |
+
"bos_token_id": null,
|
| 92 |
+
"chunk_length_s": null,
|
| 93 |
+
"chunk_size_feed_forward": 0,
|
| 94 |
+
"codebook_dim": 128,
|
| 95 |
+
"codebook_size": 1024,
|
| 96 |
+
"compress": 2,
|
| 97 |
+
"cross_attention_hidden_size": null,
|
| 98 |
+
"decoder_start_token_id": null,
|
| 99 |
+
"dilation_growth_rate": 2,
|
| 100 |
+
"diversity_penalty": 0.0,
|
| 101 |
+
"do_sample": false,
|
| 102 |
+
"early_stopping": false,
|
| 103 |
+
"encoder_no_repeat_ngram_size": 0,
|
| 104 |
+
"eos_token_id": null,
|
| 105 |
+
"exponential_decay_length_penalty": null,
|
| 106 |
+
"finetuning_task": null,
|
| 107 |
+
"forced_bos_token_id": null,
|
| 108 |
+
"forced_eos_token_id": null,
|
| 109 |
+
"hidden_size": 128,
|
| 110 |
+
"id2label": {
|
| 111 |
+
"0": "LABEL_0",
|
| 112 |
+
"1": "LABEL_1"
|
| 113 |
+
},
|
| 114 |
+
"is_decoder": false,
|
| 115 |
+
"is_encoder_decoder": false,
|
| 116 |
+
"kernel_size": 7,
|
| 117 |
+
"label2id": {
|
| 118 |
+
"LABEL_0": 0,
|
| 119 |
+
"LABEL_1": 1
|
| 120 |
+
},
|
| 121 |
+
"last_kernel_size": 7,
|
| 122 |
+
"length_penalty": 1.0,
|
| 123 |
+
"max_length": 20,
|
| 124 |
+
"min_length": 0,
|
| 125 |
+
"model_type": "encodec",
|
| 126 |
+
"no_repeat_ngram_size": 0,
|
| 127 |
+
"norm_type": "weight_norm",
|
| 128 |
+
"normalize": false,
|
| 129 |
+
"num_beam_groups": 1,
|
| 130 |
+
"num_beams": 1,
|
| 131 |
+
"num_filters": 32,
|
| 132 |
+
"num_lstm_layers": 2,
|
| 133 |
+
"num_residual_layers": 1,
|
| 134 |
+
"num_return_sequences": 1,
|
| 135 |
+
"output_attentions": false,
|
| 136 |
+
"output_hidden_states": false,
|
| 137 |
+
"output_scores": false,
|
| 138 |
+
"overlap": null,
|
| 139 |
+
"pad_mode": "reflect",
|
| 140 |
+
"pad_token_id": null,
|
| 141 |
+
"prefix": null,
|
| 142 |
+
"problem_type": null,
|
| 143 |
+
"pruned_heads": {},
|
| 144 |
+
"remove_invalid_values": false,
|
| 145 |
+
"repetition_penalty": 1.0,
|
| 146 |
+
"residual_kernel_size": 3,
|
| 147 |
+
"return_dict": true,
|
| 148 |
+
"return_dict_in_generate": false,
|
| 149 |
+
"sampling_rate": 24000,
|
| 150 |
+
"sep_token_id": null,
|
| 151 |
+
"suppress_tokens": null,
|
| 152 |
+
"target_bandwidths": [
|
| 153 |
+
1.5,
|
| 154 |
+
3.0,
|
| 155 |
+
6.0,
|
| 156 |
+
12.0,
|
| 157 |
+
24.0
|
| 158 |
+
],
|
| 159 |
+
"task_specific_params": null,
|
| 160 |
+
"temperature": 1.0,
|
| 161 |
+
"tf_legacy_loss": false,
|
| 162 |
+
"tie_encoder_decoder": false,
|
| 163 |
+
"tie_word_embeddings": true,
|
| 164 |
+
"tokenizer_class": null,
|
| 165 |
+
"top_k": 50,
|
| 166 |
+
"top_p": 1.0,
|
| 167 |
+
"torch_dtype": "float32",
|
| 168 |
+
"torchscript": false,
|
| 169 |
+
"transformers_version": "4.31.0.dev0",
|
| 170 |
+
"trim_right_ratio": 1.0,
|
| 171 |
+
"typical_p": 1.0,
|
| 172 |
+
"upsampling_ratios": [
|
| 173 |
+
8,
|
| 174 |
+
5,
|
| 175 |
+
4,
|
| 176 |
+
2
|
| 177 |
+
],
|
| 178 |
+
"use_bfloat16": false,
|
| 179 |
+
"use_causal_conv": true,
|
| 180 |
+
"use_conv_shortcut": true
|
| 181 |
},
|
| 182 |
"fine_acoustics_config": {
|
| 183 |
+
"_name_or_path": "",
|
| 184 |
+
"add_cross_attention": false,
|
| 185 |
"architectures": [
|
| 186 |
"BarkFineModel"
|
| 187 |
],
|
| 188 |
+
"bad_words_ids": null,
|
| 189 |
+
"begin_suppress_tokens": null,
|
| 190 |
+
"bias": false,
|
|
|
|
|
|
|
| 191 |
"block_size": 1024,
|
| 192 |
+
"bos_token_id": null,
|
| 193 |
+
"chunk_size_feed_forward": 0,
|
| 194 |
+
"cross_attention_hidden_size": null,
|
| 195 |
+
"decoder_start_token_id": null,
|
| 196 |
+
"diversity_penalty": 0.0,
|
| 197 |
+
"do_sample": false,
|
| 198 |
"dropout": 0.0,
|
| 199 |
+
"early_stopping": false,
|
| 200 |
+
"encoder_no_repeat_ngram_size": 0,
|
| 201 |
+
"eos_token_id": null,
|
| 202 |
+
"exponential_decay_length_penalty": null,
|
| 203 |
+
"finetuning_task": null,
|
| 204 |
+
"forced_bos_token_id": null,
|
| 205 |
+
"forced_eos_token_id": null,
|
| 206 |
+
"hidden_size": 1024,
|
| 207 |
+
"id2label": {
|
| 208 |
+
"0": "LABEL_0",
|
| 209 |
+
"1": "LABEL_1"
|
| 210 |
+
},
|
| 211 |
+
"initializer_range": 0.02,
|
| 212 |
+
"input_vocab_size": 1056,
|
| 213 |
+
"is_decoder": false,
|
| 214 |
+
"is_encoder_decoder": false,
|
| 215 |
+
"label2id": {
|
| 216 |
+
"LABEL_0": 0,
|
| 217 |
+
"LABEL_1": 1
|
| 218 |
+
},
|
| 219 |
+
"length_penalty": 1.0,
|
| 220 |
+
"max_length": 20,
|
| 221 |
+
"min_length": 0,
|
| 222 |
+
"model_type": "fine_acoustics",
|
| 223 |
+
"n_codes_given": 1,
|
| 224 |
"n_codes_total": 8,
|
| 225 |
+
"no_repeat_ngram_size": 0,
|
| 226 |
+
"num_beam_groups": 1,
|
| 227 |
+
"num_beams": 1,
|
| 228 |
+
"num_heads": 16,
|
| 229 |
+
"num_layers": 24,
|
| 230 |
+
"num_return_sequences": 1,
|
| 231 |
+
"output_attentions": false,
|
| 232 |
+
"output_hidden_states": false,
|
| 233 |
+
"output_scores": false,
|
| 234 |
+
"output_vocab_size": 1056,
|
| 235 |
+
"pad_token_id": null,
|
| 236 |
+
"prefix": null,
|
| 237 |
+
"problem_type": null,
|
| 238 |
+
"pruned_heads": {},
|
| 239 |
+
"remove_invalid_values": false,
|
| 240 |
+
"repetition_penalty": 1.0,
|
| 241 |
+
"return_dict": true,
|
| 242 |
+
"return_dict_in_generate": false,
|
| 243 |
+
"sep_token_id": null,
|
| 244 |
+
"suppress_tokens": null,
|
| 245 |
+
"task_specific_params": null,
|
| 246 |
+
"temperature": 1.0,
|
| 247 |
+
"tf_legacy_loss": false,
|
| 248 |
+
"tie_encoder_decoder": false,
|
| 249 |
+
"tie_word_embeddings": true,
|
| 250 |
+
"tokenizer_class": null,
|
| 251 |
+
"top_k": 50,
|
| 252 |
+
"top_p": 1.0,
|
| 253 |
+
"torch_dtype": "float32",
|
| 254 |
+
"torchscript": false,
|
| 255 |
+
"transformers_version": "4.31.0.dev0",
|
| 256 |
+
"typical_p": 1.0,
|
| 257 |
+
"use_bfloat16": false,
|
| 258 |
+
"use_cache": true
|
| 259 |
},
|
| 260 |
+
"initializer_range": 0.02,
|
| 261 |
+
"model_type": "bark",
|
| 262 |
+
"semantic_config": {
|
| 263 |
+
"_name_or_path": "",
|
| 264 |
+
"add_cross_attention": false,
|
| 265 |
"architectures": [
|
| 266 |
+
"BarkSemanticModel"
|
| 267 |
],
|
| 268 |
+
"bad_words_ids": null,
|
| 269 |
+
"begin_suppress_tokens": null,
|
| 270 |
+
"bias": false,
|
| 271 |
+
"block_size": 1024,
|
| 272 |
+
"bos_token_id": null,
|
| 273 |
+
"chunk_size_feed_forward": 0,
|
| 274 |
+
"cross_attention_hidden_size": null,
|
| 275 |
+
"decoder_start_token_id": null,
|
| 276 |
+
"diversity_penalty": 0.0,
|
| 277 |
+
"do_sample": false,
|
| 278 |
+
"dropout": 0.0,
|
| 279 |
+
"early_stopping": false,
|
| 280 |
+
"encoder_no_repeat_ngram_size": 0,
|
| 281 |
+
"eos_token_id": null,
|
| 282 |
+
"exponential_decay_length_penalty": null,
|
| 283 |
+
"finetuning_task": null,
|
| 284 |
+
"forced_bos_token_id": null,
|
| 285 |
+
"forced_eos_token_id": null,
|
| 286 |
+
"hidden_size": 1024,
|
| 287 |
+
"id2label": {
|
| 288 |
+
"0": "LABEL_0",
|
| 289 |
+
"1": "LABEL_1"
|
| 290 |
+
},
|
| 291 |
+
"initializer_range": 0.02,
|
| 292 |
+
"input_vocab_size": 129600,
|
| 293 |
+
"is_decoder": false,
|
| 294 |
+
"is_encoder_decoder": false,
|
| 295 |
+
"label2id": {
|
| 296 |
+
"LABEL_0": 0,
|
| 297 |
+
"LABEL_1": 1
|
| 298 |
+
},
|
| 299 |
+
"length_penalty": 1.0,
|
| 300 |
+
"max_length": 20,
|
| 301 |
+
"min_length": 0,
|
| 302 |
+
"model_type": "semantic",
|
| 303 |
+
"no_repeat_ngram_size": 0,
|
| 304 |
+
"num_beam_groups": 1,
|
| 305 |
+
"num_beams": 1,
|
| 306 |
+
"num_heads": 16,
|
| 307 |
+
"num_layers": 24,
|
| 308 |
+
"num_return_sequences": 1,
|
| 309 |
+
"output_attentions": false,
|
| 310 |
+
"output_hidden_states": false,
|
| 311 |
+
"output_scores": false,
|
| 312 |
+
"output_vocab_size": 10048,
|
| 313 |
+
"pad_token_id": null,
|
| 314 |
+
"prefix": null,
|
| 315 |
+
"problem_type": null,
|
| 316 |
+
"pruned_heads": {},
|
| 317 |
+
"remove_invalid_values": false,
|
| 318 |
+
"repetition_penalty": 1.0,
|
| 319 |
+
"return_dict": true,
|
| 320 |
+
"return_dict_in_generate": false,
|
| 321 |
+
"sep_token_id": null,
|
| 322 |
+
"suppress_tokens": null,
|
| 323 |
+
"task_specific_params": null,
|
| 324 |
+
"temperature": 1.0,
|
| 325 |
+
"tf_legacy_loss": false,
|
| 326 |
+
"tie_encoder_decoder": false,
|
| 327 |
+
"tie_word_embeddings": true,
|
| 328 |
+
"tokenizer_class": null,
|
| 329 |
+
"top_k": 50,
|
| 330 |
+
"top_p": 1.0,
|
| 331 |
+
"torch_dtype": "float32",
|
| 332 |
+
"torchscript": false,
|
| 333 |
+
"transformers_version": "4.31.0.dev0",
|
| 334 |
+
"typical_p": 1.0,
|
| 335 |
+
"use_bfloat16": false,
|
| 336 |
+
"use_cache": true
|
| 337 |
},
|
| 338 |
+
"torch_dtype": "float32",
|
| 339 |
+
"transformers_version": null,
|
| 340 |
+
"model_size": "base",
|
| 341 |
+
"codec_path": "mlx-community/encodec-24khz-float32",
|
| 342 |
+
"sample_rate": 24000
|
| 343 |
}
|
fine_acoustics.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e26d52bb92af91510c42436476d999ef33bcc784efd300a04afb4e853fc7e244
|
| 3 |
+
size 1277465968
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4ce72023913da4cf38e6a04d8243cf1337e5bb7ac991894ca775aca62d7d1829
|
| 3 |
+
size 3095843536
|
v2/announcer.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc357306d4b95678cc763423965962087136fe16be42579800f4d76db8aa46b2
|
| 3 |
+
size 32792
|
v2/de_speaker_0.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c90d9d34ecba26a7d81ee8947153cc6e1728a25338d76d8a8fa4b2a5804d138
|
| 3 |
+
size 25712
|
v2/de_speaker_1.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64327abc4e61903e7b53ada730dd6bab09637c21f36c5c7439da754d7c2b122e
|
| 3 |
+
size 25712
|
v2/de_speaker_2.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a6bd0dc012675e78f5f31b76ab063cd730789c7a5b71e0ec32dec38aa8128238
|
| 3 |
+
size 18972
|
v2/de_speaker_3.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09c1e4702d722ca5d3fdeba5278285d114f6111b461079c8d6c51a4c76bfa169
|
| 3 |
+
size 23408
|
v2/de_speaker_4.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:277a35a46311d611a7157f2112e0137ea33f2fd3f4932ff02bfac28cb5fc20e6
|
| 3 |
+
size 16412
|
v2/de_speaker_5.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d6d2e5ffcb05b66e0d23cac272e4b6698b82bdbce5ac8cb1f7f33cd109c7808
|
| 3 |
+
size 28228
|
v2/de_speaker_6.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc4bae176d5e4c16cb510100b9158b78be51e772126aa3905131c4763375f322
|
| 3 |
+
size 25540
|
v2/de_speaker_7.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0071c30ec0a4fda6978692aec0ec764fe5cb0a7f023310e129cafe171039e948
|
| 3 |
+
size 47640
|
v2/de_speaker_8.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:166c836a6af8dab17471c2f4aaea1499bfb92201a62111222ae9f37a962d5135
|
| 3 |
+
size 20252
|
v2/de_speaker_9.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c8ab63c167822a07c85b7223ec376b728a6ddeee1453177765d49312f62cb0be
|
| 3 |
+
size 17904
|
v2/en_speaker_0.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ac0263de8abf1415245370b34369552a0f3c8c53b9b8a402c6436318ace3124
|
| 3 |
+
size 18076
|
v2/en_speaker_1.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7d527080cf5f2828f55b2152353cacac78c558b4a95253c7fb4f433e8405c909
|
| 3 |
+
size 14876
|
v2/en_speaker_2.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0eb6aa00e600e0dfec89118110fc1127486dd13c4a72e624da6b5dfcd9ffaaab
|
| 3 |
+
size 27248
|
v2/en_speaker_3.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dc6ac1f03aea18d617fb9d9c1790c2e82e7a3285615ccc483cee7ce8adf4a845
|
| 3 |
+
size 30660
|
v2/en_speaker_4.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5febe332dce69d7a13021e8524cc28b3d6919658cc0ca0a470b5c6d92c4e3c34
|
| 3 |
+
size 17136
|
v2/en_speaker_5.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b056796169ef551bf43375ff7b2035be0b29171e0abc2ef83feaf91d1d2c325
|
| 3 |
+
size 12572
|
v2/en_speaker_6.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:15776b96cc1d8d056c1a78578b5ef1e0819c71117ddda186bc0b6dfef0d578d9
|
| 3 |
+
size 10908
|
v2/en_speaker_7.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0c72927d888618e23cad87b396acf12943a6a75a58fb13b4163f8af88c565c52
|
| 3 |
+
size 28228
|
v2/en_speaker_8.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:50d41d255bbf5a1a7cdaa3b2e27cb17aa9bd70bb678854f3eddfb068228f5766
|
| 3 |
+
size 15344
|
v2/en_speaker_9.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ac920442a269a7b33e8c669fb1d0e77a5e1ac097b5d8170c07bb1b6f9cf3985b
|
| 3 |
+
size 28912
|
v2/es_speaker_0.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b8d5ed46018a021946dac4146c3bda8cd51e65dfcc37ec37b326c24fd87e307f
|
| 3 |
+
size 22256
|
v2/es_speaker_1.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2a155f791c1aa70612638aafb2a508cc4806944aa01ca60c6a1ec4b89cd1c059
|
| 3 |
+
size 20508
|
v2/es_speaker_2.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7d51d1b98c16907d800a3f4eb2358fafed2bdf01e3027754ef35477256496ca0
|
| 3 |
+
size 22256
|
v2/es_speaker_3.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d133f56c42c32c8244d7bea2f255094a33251cabaa34404b1f8a460c85e505ab
|
| 3 |
+
size 21360
|
v2/es_speaker_4.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9975b81c165cb98694923890b1896240af53221b0be62e84f494c8245b72e78c
|
| 3 |
+
size 19696
|
v2/es_speaker_5.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0780f9e464569118c4ef574c88fdfcdc915867f5609f4defd97cf63d174dbf0e
|
| 3 |
+
size 20080
|
v2/es_speaker_6.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5388ad90370a675fbd78cca1486a2e74484b35396d9afa9cbdc19e8e4ff8c4fe
|
| 3 |
+
size 28016
|
v2/es_speaker_7.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2e91453a6e49ab9b384d81cca5fb30aa9a90e3893208f3776fde37fc3fa8eac2
|
| 3 |
+
size 17436
|
v2/es_speaker_8.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:daf2c2cae1c6b69d9c199877024c81e37716fedc2ee67307471c57e8aace95da
|
| 3 |
+
size 15088
|
v2/es_speaker_9.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7cf738c2886d2bdd4fd8930fe638bfda3e42d2819a9c5860e5d7e97dd2cf4f6f
|
| 3 |
+
size 18288
|
v2/fr_speaker_0.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:957ac6f584f6c8930f8b4eba8194c8ba6b7c2d66cd5293ebe6139c7bbb6d5fbf
|
| 3 |
+
size 24644
|
v2/fr_speaker_1.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c1cee2e526fd44cc7716d66fb1fe0226b54ad49d40bb4b96e4085d7f73d1df93
|
| 3 |
+
size 23620
|
v2/fr_speaker_2.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bd9c91f709475ff4f0eb9af3eae55d784fc6c02a1469b9a748ebb562b493c9ad
|
| 3 |
+
size 41028
|
v2/fr_speaker_3.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8d30139a213b4fb8b5d106594361e55b62c3d1200b8c2bfe7bbbd5308986d338
|
| 3 |
+
size 25328
|
v2/fr_speaker_4.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fee501eb6022027c4e248775ee9a84a7fdeac293faa162b0fc11cf6df0a62ba8
|
| 3 |
+
size 29252
|
v2/fr_speaker_5.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6d25adf10ce6b6cca7e4e5f661f8b97cf68c4d7a12fac9419cf1f73a454cc628
|
| 3 |
+
size 35396
|
v2/fr_speaker_6.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64094f15751cbb4afa9539f74dec43be7d6a4759a5c42c66a090b497bde559bf
|
| 3 |
+
size 35012
|
v2/fr_speaker_7.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:573d9435aa480be7d845ebd986ce2c8e414282febfbd470af121dbe96d26507e
|
| 3 |
+
size 43288
|
v2/fr_speaker_8.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:67068a32fdf3f7bf2b86f101c93bce5c2cad384b31553e9cfe6dbafa6de276d4
|
| 3 |
+
size 26608
|
v2/fr_speaker_9.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4130e7e444d7e88eb7645848ef85a90c2b76c0b28f63228f297443530113a7d0
|
| 3 |
+
size 25156
|
v2/hi_speaker_0.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:26a0c34236f820205aca5ca27ef9c48ee2eb52f46c4c56a553521d7ac2cb8f81
|
| 3 |
+
size 26224
|
v2/hi_speaker_1.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e3d37a58a69289f8ebabafcd9785e5837397ad5745196e7180a09c633ee8ab0
|
| 3 |
+
size 18588
|
v2/hi_speaker_2.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:117a20e822551022fac86b411dfd659443ae881c46c26f204cc34a155523e472
|
| 3 |
+
size 21616
|
v2/hi_speaker_3.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:68b25988c9c9e8072781824219a13b4d218abdfb9c599e6fc65fa801cc2aadba
|
| 3 |
+
size 23108
|
v2/hi_speaker_4.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d00c089efcf63598d3dbc22f1ad36c5af38f78a365a94192151368278e7aeac9
|
| 3 |
+
size 19740
|