Fish Speech, OpenAudio
Browse files- .gitattributes +11 -0
- fish-speech-gui/releases/v1.4.5/fish-amd64.bin +3 -0
- fish-speech-gui/releases/v1.4.5/fish-speech-gui-v1.4.5.zip +3 -0
- fish-speech-gui/releases/v1.4.5/fish.exe +3 -0
- fish-speech.rs/releases/fish-speech.rs-v0.2.0.zip +3 -0
- fish-speech.rs/releases/fish-speech.rs-v0.2.1.zip +3 -0
- fish-speech.rs/releases/fish-speech.rs-v0.2.2.1.zip +3 -0
- fish-speech.rs/releases/fish-speech.rs-v0.2.2.zip +3 -0
- fish-speech.rs/releases/fish-speech.rs-v0.2.3.zip +3 -0
- fish-speech.rs/releases/fish-speech.rs-v0.3.0.1.zip +3 -0
- fish-speech.rs/releases/fish-speech.rs-v0.3.0.zip +3 -0
- fish-speech/releases/fish-speech-v1.4.0.zip +3 -0
- fish-speech/releases/fish-speech-v1.4.1.zip +3 -0
- fish-speech/releases/fish-speech-v1.4.2.zip +3 -0
- fish-speech/releases/fish-speech-v1.4.3.zip +3 -0
- fish-speech/releases/fish-speech-v1.5.0.zip +3 -0
- fish-speech/releases/fish-speech-v1.5.1.zip +3 -0
- openaudio-gguf/.gitattributes +45 -0
- openaudio-gguf/README.md +46 -0
- openaudio-gguf/codec-bf16.gguf +3 -0
- openaudio-gguf/codec-f16.gguf +3 -0
- openaudio-gguf/codec-f32.gguf +3 -0
- openaudio-gguf/codec-q2_k.gguf +3 -0
- openaudio-gguf/codec-q3_k_m.gguf +3 -0
- openaudio-gguf/codec-q4_k_m.gguf +3 -0
- openaudio-gguf/codec-q5_k_m.gguf +3 -0
- openaudio-gguf/codec-q6_k.gguf +3 -0
- openaudio-gguf/samples/audio1.wav +3 -0
- openaudio-gguf/samples/audio2.wav +3 -0
- openaudio-s1-mini/.gitattributes +35 -0
- openaudio-s1-mini/README.md +92 -0
- openaudio-s1-mini/codec.pth +3 -0
- openaudio-s1-mini/config.json +32 -0
- openaudio-s1-mini/model.pth +3 -0
- openaudio-s1-mini/special_tokens.json +0 -0
- openaudio-s1-mini/tokenizer.tiktoken +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
fish-speech-gui/releases/v1.4.5/fish.exe filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
openaudio-gguf/codec-bf16.gguf filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
openaudio-gguf/codec-f16.gguf filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
openaudio-gguf/codec-f32.gguf filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
openaudio-gguf/codec-q2_k.gguf filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
openaudio-gguf/codec-q3_k_m.gguf filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
openaudio-gguf/codec-q4_k_m.gguf filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
openaudio-gguf/codec-q5_k_m.gguf filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
openaudio-gguf/codec-q6_k.gguf filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
openaudio-gguf/samples/audio1.wav filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
openaudio-gguf/samples/audio2.wav filter=lfs diff=lfs merge=lfs -text
|
fish-speech-gui/releases/v1.4.5/fish-amd64.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:70735219758ba10f206907d0d56cac07ebba8dfdb34ed36a19adbfe5611e93d2
|
| 3 |
+
size 73944982
|
fish-speech-gui/releases/v1.4.5/fish-speech-gui-v1.4.5.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ee695e5c1f8ac0d1cf90dcce24f8bdbf24ac4d2d767ef1783836cb62019cf44
|
| 3 |
+
size 5790712
|
fish-speech-gui/releases/v1.4.5/fish.exe
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3857a9c96bbf01909d18c466591ee4e897c6840cb8038f759eb8ac126656f9f0
|
| 3 |
+
size 46259712
|
fish-speech.rs/releases/fish-speech.rs-v0.2.0.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b2e42861abc754f5900bf72300926d2eceedbaf7a10e72664e38212b73ede2f9
|
| 3 |
+
size 2056531
|
fish-speech.rs/releases/fish-speech.rs-v0.2.1.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a108c7b6160291f61ec5fa15125e6cc74128bca25073ab64bb11fa5bea45e75b
|
| 3 |
+
size 2057262
|
fish-speech.rs/releases/fish-speech.rs-v0.2.2.1.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7daf3f8f62ade125de38edf24c980eee569ad9e6859d891403425a53098649b5
|
| 3 |
+
size 2057828
|
fish-speech.rs/releases/fish-speech.rs-v0.2.2.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:579e57584b2b208cba19baa37aa05714e1135cb07d541ff003202f8f80a33cf3
|
| 3 |
+
size 2057268
|
fish-speech.rs/releases/fish-speech.rs-v0.2.3.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a92987a8c4bf511be15344a4fe918c9fff48fc4f0915916ca49d02fa6e5414d3
|
| 3 |
+
size 2057976
|
fish-speech.rs/releases/fish-speech.rs-v0.3.0.1.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eaa3813bc3f53113d7ef2e648ed13c17cdaa53af1fdb42017912571c17e2424a
|
| 3 |
+
size 2058697
|
fish-speech.rs/releases/fish-speech.rs-v0.3.0.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1aa614a8186c40b226bf51460eaed45feaf17143050628329c8f078dd1074886
|
| 3 |
+
size 2057318
|
fish-speech/releases/fish-speech-v1.4.0.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be6c9466ad5ffc7e334fda81442c8818fd8090ffef47285b9dae15dcd4dcf480
|
| 3 |
+
size 606436
|
fish-speech/releases/fish-speech-v1.4.1.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:beeac30cdc5ceef826faa445c5a5fe3813ba48b6fcf5f35a632b6c4ed92ce6d1
|
| 3 |
+
size 608016
|
fish-speech/releases/fish-speech-v1.4.2.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:77c3e85969e21f1d1cf2789e6a16c6afd6fd0960724f1214ce7907f4c059e608
|
| 3 |
+
size 627081
|
fish-speech/releases/fish-speech-v1.4.3.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8bb033c62c63298d0d357f35bf34d184b8284ee35baf4ff2a6c7427440ef6a3f
|
| 3 |
+
size 926563
|
fish-speech/releases/fish-speech-v1.5.0.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3022b240d5be41943bb300ad5d7bd8e9fa32e81aa5e37ad94f5a51aa532276ec
|
| 3 |
+
size 935374
|
fish-speech/releases/fish-speech-v1.5.1.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0161ab223786f0ac31dbdddeb4b423d19fe027be6dcfe7033c16843df0a320b7
|
| 3 |
+
size 887692
|
openaudio-gguf/.gitattributes
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
codec-bf16.gguf filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
codec-f16.gguf filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
codec-f32.gguf filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
codec-q2_k.gguf filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
codec-q3_k_m.gguf filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
codec-q4_k_m.gguf filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
codec-q5_k_m.gguf filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
codec-q6_k.gguf filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
samples/audio1.wav filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
samples/audio2.wav filter=lfs diff=lfs merge=lfs -text
|
openaudio-gguf/README.md
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: cc-by-nc-sa-4.0
|
| 3 |
+
base_model:
|
| 4 |
+
- fishaudio/openaudio-s1-mini
|
| 5 |
+
pipeline_tag: text-to-speech
|
| 6 |
+
tags:
|
| 7 |
+
- gguf-connector
|
| 8 |
+
---
|
| 9 |
+
## gguf quantized version of openaudio
|
| 10 |
+
- base model from [fishaudio](https://huggingface.co/fishaudio)
|
| 11 |
+
- text-to-speech synthesis
|
| 12 |
+
|
| 13 |
+
### **run it with gguf-connector**
|
| 14 |
+
```
|
| 15 |
+
ggc o2
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+

|
| 19 |
+
|
| 20 |
+
| Prompt | Audio Sample |
|
| 21 |
+
|--------|---------------|
|
| 22 |
+
|`Hey Connector, why your appearance looks so stupid?`<br/>`Oh, really? maybe I ate too much smart beans.`<br/>`Wow. Amazing (laughing).`<br/>`Let's go to get some more smart beans and you will become stupid as well.`<br/> | 🎧 **audio-sample-1**<br><audio controls src="https://huggingface.co/calcuis/openaudio-gguf/resolve/main/samples/audio1.wav"></audio> |
|
| 23 |
+
|`Suddenly the plane's engines began failing, and the pilot says there isn't much time, and he'll keep the plane in the air as long as he can, and told his two passengers to take the only two parachutes on board and bail out. The world's smartest man immediately took a parachute and said "I'm the world's smartest man! The world needs me, so I can't die here!", and then jumped out of the plane. The pilot tells the hippie to hurry up and take the other parachute, because there aren't any more. And the hippie says "Relax man. We'll be fine. The world's smartest man took my backpack."`<br/> | 🎧 **audio-sample-2**<br><audio controls src="https://huggingface.co/calcuis/openaudio-gguf/resolve/main/samples/audio2.wav"></audio> |
|
| 24 |
+
|
| 25 |
+
### **review/reference**
|
| 26 |
+
- simply execute the command (`ggc o2`) above in console/terminal
|
| 27 |
+
- select a `codec` gguf file in the current directory to interact with (see example below)
|
| 28 |
+
|
| 29 |
+
>
|
| 30 |
+
>GGUF file(s) available. Select which one for codec:
|
| 31 |
+
>
|
| 32 |
+
>1. codec-bf16.gguf
|
| 33 |
+
>2. codec-f16.gguf
|
| 34 |
+
>3. codec-f32.gguf
|
| 35 |
+
>4. codec-q2_k.gguf
|
| 36 |
+
>5. codec-q3_k_m.gguf
|
| 37 |
+
>6. codec-q4_k_m.gguf
|
| 38 |
+
>7. codec-q5_k_m.gguf
|
| 39 |
+
>8. codec-q6_k.gguf
|
| 40 |
+
>
|
| 41 |
+
>Enter your choice (1 to 8): _
|
| 42 |
+
>
|
| 43 |
+
|
| 44 |
+
- note: tokenizer and model will be pulled to `models/fish` automatically during the first launch
|
| 45 |
+
- then run it entirely offline; i.e., from local URL: http://127.0.0.1:7860 with lazy webui
|
| 46 |
+
- gguf-connector ([pypi](https://pypi.org/project/gguf-connector))
|
openaudio-gguf/codec-bf16.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:261394a822e17e24dd24a222c4511fb560e72efa3125efc1760a7577d31a0bd2
|
| 3 |
+
size 1390445280
|
openaudio-gguf/codec-f16.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc386742876414b1810baa790fb2b545960b8932d9f0ba27396575fece7f4503
|
| 3 |
+
size 1390445280
|
openaudio-gguf/codec-f32.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6fae3c0fa7bd162dd45751502929da4fc7e6800aa38e43cd816f294ff9f15348
|
| 3 |
+
size 2780022496
|
openaudio-gguf/codec-q2_k.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:046b2dc7ddc9292929210e886a7009ca7007b924a99ed9efebc1499fe0a64a1e
|
| 3 |
+
size 878543584
|
openaudio-gguf/codec-q3_k_m.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:da0575a707e08b74d2c4a2c1dd1647cf86de4adf2e66e461d27ac5dc76aa35f5
|
| 3 |
+
size 909640416
|
openaudio-gguf/codec-q4_k_m.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:16109c1c80cfeda62e78ec55d1f69c947a99119b5dba95eba0186c06a61d8984
|
| 3 |
+
size 950305504
|
openaudio-gguf/codec-q5_k_m.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:86fcb3dd9b5193b6ba06843ed2beddcd3534a84d751bb464fbb2a1f87131fad4
|
| 3 |
+
size 988578528
|
openaudio-gguf/codec-q6_k.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:12c1dee9265853be3fceed63b33d25dd9eb4f2496d2f6e8fea5a0788e499e5cd
|
| 3 |
+
size 1029243616
|
openaudio-gguf/samples/audio1.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:958951b601f18b74a4a896cdfac27ba00359600cf33756792d1297811e89df7d
|
| 3 |
+
size 835628
|
openaudio-gguf/samples/audio2.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e6fa0bc4cf6a49d45b264532850239f41ef9bed407fd3a154b6d79c8c5f5a393
|
| 3 |
+
size 2396204
|
openaudio-s1-mini/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
openaudio-s1-mini/README.md
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- text-to-speech
|
| 4 |
+
license: cc-by-nc-sa-4.0
|
| 5 |
+
language:
|
| 6 |
+
- zh
|
| 7 |
+
- en
|
| 8 |
+
- de
|
| 9 |
+
- ja
|
| 10 |
+
- fr
|
| 11 |
+
- es
|
| 12 |
+
- ko
|
| 13 |
+
- ar
|
| 14 |
+
- nl
|
| 15 |
+
- ru
|
| 16 |
+
- it
|
| 17 |
+
- pl
|
| 18 |
+
- pt
|
| 19 |
+
pipeline_tag: text-to-speech
|
| 20 |
+
inference: false
|
| 21 |
+
extra_gated_prompt: >-
|
| 22 |
+
You agree to not use the model to generate contents that violate DMCA or local
|
| 23 |
+
laws.
|
| 24 |
+
extra_gated_fields:
|
| 25 |
+
Country: country
|
| 26 |
+
Specific date: date_picker
|
| 27 |
+
I agree to use this model for non-commercial use ONLY: checkbox
|
| 28 |
+
---
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# OpenAudio S1
|
| 32 |
+
|
| 33 |
+
**OpenAudio S1** is a leading text-to-speech (TTS) model trained on more than 2 million hours of audio data in multiple languages.
|
| 34 |
+
|
| 35 |
+
Supported languages:
|
| 36 |
+
- English (en)
|
| 37 |
+
- Chinese (zh)
|
| 38 |
+
- Japanese (ja)
|
| 39 |
+
- German (de)
|
| 40 |
+
- French (fr)
|
| 41 |
+
- Spanish (es)
|
| 42 |
+
- Korean (ko)
|
| 43 |
+
- Arabic (ar)
|
| 44 |
+
- Russian (ru)
|
| 45 |
+
- Dutch (nl)
|
| 46 |
+
- Italian (it)
|
| 47 |
+
- Polish (pl)
|
| 48 |
+
- Portuguese (pt)
|
| 49 |
+
|
| 50 |
+
Please refer to [Fish Speech Github](https://github.com/fishaudio/fish-speech) for more info.
|
| 51 |
+
Demo available at [Fish Audio Playground](https://fish.audio).
|
| 52 |
+
Visit the [OpenAudio website](https://openaudio.com) for blog & tech report.
|
| 53 |
+
|
| 54 |
+
## Emotion and Tone Support
|
| 55 |
+
|
| 56 |
+
OpenAudio S1 supports a variety of emotional, tone, and special markers to enhance speech synthesis:
|
| 57 |
+
|
| 58 |
+
**1. Emotional markers:**
|
| 59 |
+
(angry) (sad) (disdainful) (excited) (surprised) (satisfied) (unhappy) (anxious) (hysterical) (delighted) (scared) (worried) (indifferent) (upset) (impatient) (nervous) (guilty) (scornful) (frustrated) (depressed) (panicked) (furious) (empathetic) (embarrassed) (reluctant) (disgusted) (keen) (moved) (proud) (relaxed) (grateful) (confident) (interested) (curious) (confused) (joyful) (disapproving) (negative) (denying) (astonished) (serious) (sarcastic) (conciliative) (comforting) (sincere) (sneering) (hesitating) (yielding) (painful) (awkward) (amused)
|
| 60 |
+
|
| 61 |
+
**2. Tone markers:**
|
| 62 |
+
(in a hurry tone) (shouting) (screaming) (whispering) (soft tone)
|
| 63 |
+
|
| 64 |
+
**3. Special markers:**
|
| 65 |
+
(laughing) (chuckling) (sobbing) (crying loudly) (sighing) (panting) (groaning) (crowd laughing) (background laughter) (audience laughing)
|
| 66 |
+
|
| 67 |
+
**Special markers with corresponding onomatopoeia:**
|
| 68 |
+
- Laughing: Ha,ha,ha
|
| 69 |
+
- Chuckling: Hmm,hmm
|
| 70 |
+
|
| 71 |
+
## Model Variants and Performance
|
| 72 |
+
|
| 73 |
+
OpenAudio S1 includes the following models:
|
| 74 |
+
- **S1 (4B, proprietary):** The full-sized model.
|
| 75 |
+
- **S1-mini (0.5B):** A distilled version of S1.
|
| 76 |
+
|
| 77 |
+
Both S1 and S1-mini incorporate online Reinforcement Learning from Human Feedback (RLHF).
|
| 78 |
+
|
| 79 |
+
**Seed TTS Eval Metrics (English, auto eval, based on OpenAI gpt-4o-transcribe, speaker distance using Revai/pyannote-wespeaker-voxceleb-resnet34-LM):**
|
| 80 |
+
|
| 81 |
+
- **S1:**
|
| 82 |
+
- WER (Word Error Rate): **0.008**
|
| 83 |
+
- CER (Character Error Rate): **0.004**
|
| 84 |
+
- Distance: **0.332**
|
| 85 |
+
- **S1-mini:**
|
| 86 |
+
- WER (Word Error Rate): **0.011**
|
| 87 |
+
- CER (Character Error Rate): **0.005**
|
| 88 |
+
- Distance: **0.380**
|
| 89 |
+
|
| 90 |
+
## License
|
| 91 |
+
|
| 92 |
+
This model is licensed under the CC-BY-NC-SA-4.0 license, which permits non-commercial use only and requires derivatives to be shared under the same terms.
|
openaudio-s1-mini/codec.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74fc41c5a7151c6f350af8bd7e5d6e3accfcc7f3dfbfac23afd35af07052bb2f
|
| 3 |
+
size 1871099728
|
openaudio-s1-mini/config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_o_bias": false,
|
| 3 |
+
"attention_qk_norm": true,
|
| 4 |
+
"attention_qkv_bias": false,
|
| 5 |
+
"codebook_size": 4096,
|
| 6 |
+
"dim": 1024,
|
| 7 |
+
"dropout": 0.0,
|
| 8 |
+
"fast_attention_o_bias": false,
|
| 9 |
+
"fast_attention_qk_norm": false,
|
| 10 |
+
"fast_attention_qkv_bias": false,
|
| 11 |
+
"fast_dim": 1024,
|
| 12 |
+
"fast_head_dim": 64,
|
| 13 |
+
"fast_intermediate_size": 3072,
|
| 14 |
+
"fast_n_head": 16,
|
| 15 |
+
"fast_n_local_heads": 8,
|
| 16 |
+
"head_dim": 128,
|
| 17 |
+
"initializer_range": 0.03125,
|
| 18 |
+
"intermediate_size": 3072,
|
| 19 |
+
"max_seq_len": 8192,
|
| 20 |
+
"model_type": "dual_ar",
|
| 21 |
+
"n_fast_layer": 4,
|
| 22 |
+
"n_head": 16,
|
| 23 |
+
"n_layer": 28,
|
| 24 |
+
"n_local_heads": 8,
|
| 25 |
+
"norm_eps": 1e-06,
|
| 26 |
+
"num_codebooks": 10,
|
| 27 |
+
"rope_base": 1000000,
|
| 28 |
+
"scale_codebook_embeddings": true,
|
| 29 |
+
"tie_word_embeddings": false,
|
| 30 |
+
"use_gradient_checkpointing": true,
|
| 31 |
+
"vocab_size": 155776
|
| 32 |
+
}
|
openaudio-s1-mini/model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e59be7dc6714040dce3cde1f41e730c2f0daa5339785b1cd3b60041208c35e6
|
| 3 |
+
size 1735122974
|
openaudio-s1-mini/special_tokens.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
openaudio-s1-mini/tokenizer.tiktoken
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|