Upload folder using huggingface_hub

Browse files

Files changed (10) hide show

README.md +29 -0
added_tokens.json +21 -0
conds.safetensors +3 -0
config.json +91 -0
merges.txt +0 -0
model.safetensors +3 -0
model.safetensors.index.json +0 -0
special_tokens_map.json +24 -0
tokenizer_config.json +175 -0
vocab.json +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,29 @@

+---
+license: apache-2.0
+language:
+- en
+base_model:
+- mlx-community/chatterbox-turbo
+pipeline_tag: text-to-speech
+library_name: mlx-audio
+tags:
+- text-to-speech
+- speech
+- speech generation
+- voice cloning
+- mlx
+- mlx-audio
+---
+# mlx-community/chatterbox-turbo-8bit
+This model is an 8-bit quantized (affine, group size 64) MLX version of [`mlx-community/chatterbox-turbo`](https://huggingface.co/mlx-community/chatterbox-turbo), converted using mlx-audio version **0.2.7**.
+Refer to the [original model card](https://huggingface.co/mlx-community/chatterbox-turbo) for more details on the model.
+## Use with mlx-audio
+```bash
+pip install -U mlx-audio
+```
+```bash
+python -m mlx_audio.tts.generate --model mlx-community/chatterbox-turbo-8bit --text "Hello, this is a test of the text-to-speech model."
+```

added_tokens.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "[advertisement]": 50261,
+  "[angry]": 50257,
+  "[chuckle]": 50274,
+  "[clear throat]": 50267,
+  "[cough]": 50270,
+  "[crying]": 50264,
+  "[dramatic]": 50262,
+  "[fear]": 50258,
+  "[gasp]": 50273,
+  "[groan]": 50271,
+  "[happy]": 50265,
+  "[laugh]": 50275,
+  "[narration]": 50263,
+  "[sarcastic]": 50266,
+  "[shush]": 50269,
+  "[sigh]": 50268,
+  "[sniff]": 50272,
+  "[surprised]": 50259,
+  "[whispering]": 50260
+}

conds.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:df9ad2c54848027d94cf01f9fc0ed22bc5d3165df6e6a85c903c1124b8a78a4a
+size 164884

config.json ADDED Viewed

	@@ -0,0 +1,91 @@

+{
+    "architecture": "chatterbox_turbo",
+    "dec_cond_len_seconds": 10,
+    "enc_cond_len_seconds": 15,
+    "gpt2": {
+        "activation_function": "gelu_new",
+        "n_ctx": 8196,
+        "n_embd": 1024,
+        "hidden_size": 1024,
+        "n_head": 16,
+        "n_layer": 24,
+        "n_positions": 8196,
+        "vocab_size": 50276,
+        "layer_norm_epsilon": 1e-05,
+        "attn_pdrop": 0.1,
+        "embd_pdrop": 0.1,
+        "resid_pdrop": 0.1
+    },
+    "model_type": "chatterbox_turbo",
+    "quantization": {
+        "group_size": 64,
+        "bits": 8,
+        "mode": "affine"
+    },
+    "quantization_config": {
+        "group_size": 64,
+        "bits": 8,
+        "mode": "affine"
+    },
+    "s3gen": {
+        "output_sample_rate": 24000,
+        "input_sample_rate": 16000,
+        "silence_token": 4299,
+        "speech_vocab_size": 6561,
+        "meanflow": true,
+        "token_embedding_dim": 512,
+        "encoder_attention_heads": 8,
+        "encoder_linear_units": 2048,
+        "encoder_num_blocks": 6,
+        "encoder_dropout_rate": 0.1,
+        "decoder_in_channels": 320,
+        "decoder_out_channels": 80,
+        "decoder_channels": [
+            256
+        ],
+        "decoder_attention_head_dim": 64,
+        "decoder_n_blocks": 4,
+        "decoder_num_mid_blocks": 12,
+        "decoder_num_heads": 8,
+        "cfm_sigma_min": 1e-06,
+        "cfm_t_scheduler": "cosine",
+        "cfm_inference_cfg_rate": 0.7
+    },
+    "sample_rate": 24000,
+    "t3": {
+        "start_text_token": 255,
+        "stop_text_token": 0,
+        "text_tokens_dict_size": 50276,
+        "max_text_tokens": 2048,
+        "start_speech_token": 6561,
+        "stop_speech_token": 6562,
+        "speech_tokens_dict_size": 6563,
+        "max_speech_tokens": 4096,
+        "llama_config_name": "GPT2_medium",
+        "input_pos_emb": null,
+        "speech_cond_prompt_len": 375,
+        "encoder_type": "voice_encoder",
+        "speaker_embed_size": 256,
+        "use_perceiver_resampler": false,
+        "emotion_adv": false
+    },
+    "voice_encoder": {
+        "num_mels": 40,
+        "sample_rate": 16000,
+        "speaker_embed_size": 256,
+        "ve_hidden_size": 256,
+        "flatten_lstm_params": false,
+        "n_fft": 400,
+        "hop_size": 160,
+        "win_size": 400,
+        "fmax": 8000,
+        "fmin": 0,
+        "preemphasis": 0.0,
+        "mel_power": 2.0,
+        "mel_type": "amp",
+        "normalized_mels": false,
+        "ve_partial_frames": 160,
+        "ve_final_relu": true,
+        "stft_magnitude_min": 0.0001
+    }
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cbfe447b04d11bb2d877f1305c37945ca109ffcf74a982a533e4baa40f273ba2
+size 706233417

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|endoftext|>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,175 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50257": {
+      "content": "[angry]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": "[fear]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": "[surprised]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": "[whispering]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": "[advertisement]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": "[dramatic]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": "[narration]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": "[crying]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": "[happy]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": "[sarcastic]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": "[clear throat]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": "[sigh]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": "[shush]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": "[cough]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": "[groan]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": "[sniff]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": "[gasp]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": "[chuckle]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": "[laugh]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff