depasquale committed on
Commit
c08bace
·
verified ·
1 Parent(s): 034c265

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. README.md +63 -0
  2. config.json +15 -0
  3. merges.txt +0 -0
  4. model.safetensors +3 -0
  5. tokenizer_config.json +40 -0
  6. vocab.json +0 -0
README.md ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: mlx-audio-plus
3
+ base_model:
4
+ - FunAudioLLM/CosyVoice2-0.5B
5
+ tags:
6
+ - mlx
7
+ - tts
8
+ - cosyvoice2
9
+ pipeline_tag: text-to-speech
10
+ language:
11
+ - en
12
+ - zh
13
+ - ja
14
+ - ko
15
+ ---
16
+
17
+ # mlx-community/CosyVoice2-0.5B-8bit
18
+
19
+ This model was converted to MLX format from [FunAudioLLM/CosyVoice2-0.5B](https://huggingface.co/FunAudioLLM/CosyVoice2-0.5B) using [mlx-audio-plus](https://github.com/DePasqualeOrg/mlx-audio-plus) version **0.1.2**.
20
+
21
+ ## Usage
22
+
23
+ ```bash
24
+ pip install -U mlx-audio-plus
25
+ ```
26
+
27
+ ### Inference Modes
28
+
29
+ | Mode | Parameters | Description |
30
+ |------|------------|-------------|
31
+ | Cross-lingual | `ref_audio` | Zero-shot TTS from reference audio alone, no transcription required (default) |
32
+ | Zero-shot | `ref_audio` + `ref_text` | Better quality with transcription |
33
+ | Instruct | `ref_audio` + `instruct_text` | Style control (e.g., "speak slowly") |
34
+ | Voice Conversion | `source_audio` + `ref_audio` | Convert audio to target voice |
35
+
36
+ ### Command line
37
+
38
+ ```bash
39
+ # Cross-lingual (default)
40
+ mlx_audio.tts --model mlx-community/CosyVoice2-0.5B-8bit --text "Hello!" --ref_audio ref.wav
41
+
42
+ # Zero-shot (with transcription)
43
+ mlx_audio.tts --model mlx-community/CosyVoice2-0.5B-8bit --text "Hello!" --ref_audio ref.wav --ref_text "Transcription of ref audio."
44
+
45
+ # Instruct (style control)
46
+ mlx_audio.tts --model mlx-community/CosyVoice2-0.5B-8bit --text "Hello!" --ref_audio ref.wav --instruct_text "Speak slowly and calmly"
47
+
48
+ # Voice Conversion
49
+ mlx_audio.tts --model mlx-community/CosyVoice2-0.5B-8bit --source_audio source.wav --ref_audio ref.wav
50
+ ```
51
+
52
+ ### Python
53
+
54
+ ```python
55
+ from mlx_audio.tts.generate import generate_audio
56
+
57
+ generate_audio(
58
+ text="Hello, this is CosyVoice2 on MLX!",
59
+ model="mlx-community/CosyVoice2-0.5B-8bit",
60
+ ref_audio="reference.wav",
61
+ file_prefix="output",
62
+ )
63
+ ```
config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "cosyvoice2",
3
+ "version": "0.5B",
4
+ "sample_rate": 24000,
5
+ "mel_channels": 80,
6
+ "speech_token_size": 6561,
7
+ "dtype": "float16",
8
+ "quantization": {
9
+ "bits": 8,
10
+ "group_size": 64,
11
+ "quantized_components": [
12
+ "tokenizer/model.layers"
13
+ ]
14
+ }
15
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94cb536647f64d1145f3031093a8caf69ba8074b46919499e4f5ae357cb13a2a
3
+ size 957064122
tokenizer_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
30
+ "bos_token": null,
31
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "<|im_end|>",
34
+ "errors": "replace",
35
+ "model_max_length": 32768,
36
+ "pad_token": "<|endoftext|>",
37
+ "split_special_tokens": false,
38
+ "tokenizer_class": "Qwen2Tokenizer",
39
+ "unk_token": null
40
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff