Upload folder using huggingface_hub

Browse files

Files changed (10) hide show

README.md +54 -0
added_tokens.json +21 -0
conds.safetensors +3 -0
config.json +84 -0
merges.txt +0 -0
model.safetensors +3 -0
special_tokens_map.json +24 -0
tokenizer.json +0 -0
tokenizer_config.json +175 -0
vocab.json +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,54 @@

+---
+license: apache-2.0
+language:
+- en
+base_model:
+- mlx-community/chatterbox-turbo-fp16
+pipeline_tag: text-to-speech
+library_name: mlx-audio
+tags:
+- text-to-speech
+- speech
+- speech generation
+- voice cloning
+- mlx
+- tts
+---
+# mlx-community/chatterbox-turbo
+This model was converted to MLX format from [`ResembleAI/chatterbox-turbo`](https://huggingface.co/ResembleAI/chatterbox-turbo) using mlx-audio version **0.2.8**.
+Refer to the [original model card](https://huggingface.co/ResembleAI/chatterbox-turbo) for more details on the model.
+## Use with mlx
+```bash
+pip install -U mlx-audio
+```
+## Voice Cloning
+Pass a reference recording with `--ref_audio` to generate speech in that speaker's voice:
+```bash
+mlx_audio.tts.generate --model mlx-community/chatterbox-turbo-fp16 --text "Oh, that's hilarious! [chuckle] Um anyway, we do have a new model in store. It's the SkyNet T-800 series and it's got basically everything. Including AI integration with ChatGPT and all that jazz. Would you like me to get some prices for you?" --ref_audio path_to_file.wav --play
+```
+## Emotion Control
+Chatterbox supports expressive event tags that can be inserted directly into your text to add natural vocal expressions:
+
+| Tag | Description |
+|-----|-------------|
+| `[clear throat]` | Throat clearing sound |
+| `[sigh]` | Sighing expression |
+| `[shush]` | Shushing sound |
+| `[cough]` | Coughing sound |
+| `[groan]` | Groaning expression |
+| `[sniff]` | Sniffing sound |
+| `[gasp]` | Gasping expression |
+| `[chuckle]` | Light chuckling |
+| `[laugh]` | Laughter |
+```bash
+mlx_audio.tts.generate --model mlx-community/chatterbox-turbo-fp16 --text "[sigh] I can't believe it's Monday again. [groan] But hey, [clear throat] let's make the best of it!" --play
+```
+## Default Voice
+Omit `--ref_audio` to generate speech with the default voice:
+```bash
+mlx_audio.tts.generate --model mlx-community/chatterbox-turbo-fp16 --text "Oh, that's hilarious! [chuckle] Um anyway, we do have a new model in store. It's the SkyNet T-800 series and it's got basically everything. Including AI integration with ChatGPT and all that jazz. Would you like me to get some prices for you?" --play
+```

added_tokens.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "[advertisement]": 50261,
+  "[angry]": 50257,
+  "[chuckle]": 50274,
+  "[clear throat]": 50267,
+  "[cough]": 50270,
+  "[crying]": 50264,
+  "[dramatic]": 50262,
+  "[fear]": 50258,
+  "[gasp]": 50273,
+  "[groan]": 50271,
+  "[happy]": 50265,
+  "[laugh]": 50275,
+  "[narration]": 50263,
+  "[sarcastic]": 50266,
+  "[shush]": 50269,
+  "[sigh]": 50268,
+  "[sniff]": 50272,
+  "[surprised]": 50259,
+  "[whispering]": 50260
+}

conds.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:df9ad2c54848027d94cf01f9fc0ed22bc5d3165df6e6a85c903c1124b8a78a4a
+size 164884

config.json ADDED Viewed

	@@ -0,0 +1,84 @@

+{
+  "model_type": "chatterbox_turbo",
+  "architecture": "chatterbox_turbo",
+  "t3": {
+    "start_text_token": 255,
+    "stop_text_token": 0,
+    "text_tokens_dict_size": 50276,
+    "max_text_tokens": 2048,
+    "start_speech_token": 6561,
+    "stop_speech_token": 6562,
+    "speech_tokens_dict_size": 6563,
+    "max_speech_tokens": 4096,
+    "llama_config_name": "GPT2_medium",
+    "input_pos_emb": null,
+    "speech_cond_prompt_len": 375,
+    "encoder_type": "voice_encoder",
+    "speaker_embed_size": 256,
+    "use_perceiver_resampler": false,
+    "emotion_adv": false
+  },
+  "gpt2": {
+    "activation_function": "gelu_new",
+    "n_ctx": 8196,
+    "n_embd": 1024,
+    "hidden_size": 1024,
+    "n_head": 16,
+    "n_layer": 24,
+    "n_positions": 8196,
+    "vocab_size": 50276,
+    "layer_norm_epsilon": 1e-05,
+    "attn_pdrop": 0.1,
+    "embd_pdrop": 0.1,
+    "resid_pdrop": 0.1
+  },
+  "voice_encoder": {
+    "num_mels": 40,
+    "sample_rate": 16000,
+    "speaker_embed_size": 256,
+    "ve_hidden_size": 256,
+    "flatten_lstm_params": false,
+    "n_fft": 400,
+    "hop_size": 160,
+    "win_size": 400,
+    "fmax": 8000,
+    "fmin": 0,
+    "preemphasis": 0.0,
+    "mel_power": 2.0,
+    "mel_type": "amp",
+    "normalized_mels": false,
+    "ve_partial_frames": 160,
+    "ve_final_relu": true,
+    "stft_magnitude_min": 1e-4
+  },
+  "s3gen": {
+    "output_sample_rate": 24000,
+    "input_sample_rate": 16000,
+    "silence_token": 4299,
+    "speech_vocab_size": 6561,
+    "meanflow": true,
+    "token_embedding_dim": 512,
+    "encoder_attention_heads": 8,
+    "encoder_linear_units": 2048,
+    "encoder_num_blocks": 6,
+    "encoder_dropout_rate": 0.1,
+    "decoder_in_channels": 320,
+    "decoder_out_channels": 80,
+    "decoder_channels": [256],
+    "decoder_attention_head_dim": 64,
+    "decoder_n_blocks": 4,
+    "decoder_num_mid_blocks": 12,
+    "decoder_num_heads": 8,
+    "cfm_sigma_min": 1e-6,
+    "cfm_t_scheduler": "cosine",
+    "cfm_inference_cfg_rate": 0.7
+  },
+  "sample_rate": 24000,
+  "enc_cond_len_seconds": 15,
+  "dec_cond_len_seconds": 10
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9f70328a3f6c5257257aea76e9b14c34d8225745d133e4c1e83aa31f3f72a80b
+size 2985990960

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|endoftext|>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,175 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50257": {
+      "content": "[angry]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": "[fear]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": "[surprised]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": "[whispering]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": "[advertisement]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": "[dramatic]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": "[narration]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": "[crying]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": "[happy]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": "[sarcastic]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": "[clear throat]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": "[sigh]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": "[shush]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": "[cough]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": "[groan]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": "[sniff]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": "[gasp]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": "[chuckle]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": "[laugh]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff