lihongjie
commited on
Commit
·
08a04fb
1
Parent(s):
867fae1
first commit
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +55 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm.speech_embedding.float16.bin +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm_decoder.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/model.embed_tokens.weight.bfloat16.bin +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l0_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l10_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l11_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l12_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l13_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l14_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l15_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l16_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l17_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l18_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l19_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l1_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l20_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l21_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l22_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l23_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l2_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l3_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l4_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l5_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l6_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l7_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l8_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l9_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_post.axmodel +3 -0
- README.md +227 -0
- asset/dingding.png +3 -0
- asset/output.wav +3 -0
- asset/zero_shot_prompt.wav +3 -0
- frontend-onnx/campplus.onnx +3 -0
- frontend-onnx/speech_tokenizer_v3.onnx +3 -0
- main_ax650 +3 -0
- run_ax650.sh +21 -0
- scripts/CosyVoice-BlankEN/merges.txt +0 -0
- scripts/CosyVoice-BlankEN/tokenizer_config.json +40 -0
- scripts/CosyVoice-BlankEN/vocab.json +0 -0
- scripts/audio.py +83 -0
- scripts/cosyvoice3_tokenizer.py +124 -0
- scripts/frontend.py +251 -0
- scripts/gradio_demo.py +161 -0
- scripts/meldataset.py +217 -0
- scripts/process_prompt.py +62 -0
- scripts/requirements.txt +8 -0
- scripts/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +3 -0
- scripts/tokenizer/tokenizer.py +327 -0
- token2wav-axmodels/flow.input_embedding.float16.bin +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,58 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l5_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l8_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
token2wav-axmodels/flow_estimator_200.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
token2wav-axmodels/speech_window_2x8x480.txt filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
token2wav-axmodels/flow.input_embedding.float16.bin filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
token2wav-axmodels/rand_noise_1_80_300.txt filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
main_ax650 filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l14_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l20_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l22_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l4_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l3_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_post.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
frontend-onnx/speech_tokenizer_v3.onnx filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
token2wav-axmodels/hift_p1_100.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/model.embed_tokens.weight.bfloat16.bin filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l0_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l12_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l23_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
token2wav-axmodels/hift_p1_50.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
token2wav-axmodels/hift_p2_100_final.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
token2wav-axmodels/flow_encoder_50_final.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
token2wav-axmodels/hift_p2_150.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
token2wav-axmodels/hift_p2_50.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l16_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l1_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l21_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 63 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l6_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 64 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l10_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 65 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l13_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 66 |
+
token2wav-axmodels/hift_p1_150.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 67 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm_decoder.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 68 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l17_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 69 |
+
scripts/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken filter=lfs diff=lfs merge=lfs -text
|
| 70 |
+
frontend-onnx/campplus.onnx filter=lfs diff=lfs merge=lfs -text
|
| 71 |
+
token2wav-axmodels/flow_encoder_78.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 72 |
+
token2wav-axmodels/flow_estimator_300.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 73 |
+
token2wav-axmodels/flow.input_embedding.npy filter=lfs diff=lfs merge=lfs -text
|
| 74 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l15_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 75 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l18_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 76 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l19_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 77 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l9_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 78 |
+
token2wav-axmodels/hift_p1_100_final.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 79 |
+
token2wav-axmodels/llm_decoder.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 80 |
+
token2wav-axmodels/flow_encoder_28.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 81 |
+
token2wav-axmodels/flow_encoder_53.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 82 |
+
token2wav-axmodels/flow_estimator_250.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 83 |
+
token2wav-axmodels/hift_p2_100.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 84 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm.speech_embedding.float16.bin filter=lfs diff=lfs merge=lfs -text
|
| 85 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l11_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 86 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l2_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 87 |
+
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l7_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 88 |
+
asset/zero_shot_prompt.wav filter=lfs diff=lfs merge=lfs -text
|
| 89 |
+
asset/dingding.png filter=lfs diff=lfs merge=lfs -text
|
| 90 |
+
asset/output.wav filter=lfs diff=lfs merge=lfs -text
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm.speech_embedding.float16.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:54b7a3a7095c822489d43ee4b3a490606cfe9ad347d2c9e13c4581d8de0cfab8
|
| 3 |
+
size 12115712
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm_decoder.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a04dd04e4b5a95dca860ec1026c736e2aca158f45488969724e8a3f4a5682e85
|
| 3 |
+
size 6506188
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/model.embed_tokens.weight.bfloat16.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6fa9b7fbc8ba5adbedbde9a6704ab2cbb73cdccc370a06b0be79086176179572
|
| 3 |
+
size 272269312
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l0_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:81970ce053f1dbb0fb6db9f61be43f79c92eef326c08ac2d445e5979d78ed7ad
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l10_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:00408e4cfaa0d7a0a7fdd6305ee7c66b967ac287eee14f3a6ba146d6426e9591
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l11_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4472b2f70c7960269663e981015d5f828c4d240b316cf00e1bf33ff678bbe92f
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l12_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3c26672ead01ee8164b7405eb2d6c7fd410763ddcabfdd87ebdee6179ebd1f4b
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l13_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:33916843b8c8a0a82a35b9f46ab893c3ab9d400660a7cac92958d305b6a1aea8
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l14_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e25ade425171c7139ce758936c9d95d13dbec129630a01c88171ee085dc71c75
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l15_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3dafa43dc2ec21423d4d9147d1bdd48572d09e1e355dc921f4be6569ce1b799c
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l16_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4ed263e08194acc4867fa7ffd13ad2044b18be9f8fba1250f276ac23b200e0dd
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l17_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:926bfceb3b6fae9afd79b7f5babf2931cb44675077866b192af367d2fc3ad5b4
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l18_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2a426acb4187d034fc9a30406386c0cd0f9271c0794fa66646349d1d19b4f53e
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l19_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:484147aaee42d3314c72b41790c96e633848f44175e37762e897515ba0c7e346
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l1_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d207d70025931a9da459d3fcf2469b2b744e6e7bc6173abf7c9f70a9df83d8d9
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l20_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:075b956543f4ee84bdc9e61cce69f7286c94d1ff03f25d89d997f1786616dee6
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l21_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:16ac8a2cba87bdbe3c9ed55cf60f8e5e683f69cc8334e7cf9a7594b7a20a6157
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l22_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dd7c709bb017122f4e052893cc0ec4d0f7f2bc6a15b0f87c488947d2fccb6565
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l23_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ae7339df1ee72ca28b557b6933997cec517b1a38766ffbe706814e12cda4137d
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l2_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa85ea3722be6839f5a6c01485a276908327f215523a3c09bb8028ce652d96d5
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l3_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:12c5faedfae53817cb67f868e00d98c80f688b89093b1c142732bc848811682e
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l4_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cb9f5106bf2280d592e02913fa1c43f7ca1bebb66969436ab79dfc5a4b174749
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l5_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:beac97add03d3edf0055cce152990138b9bbc9dac9835a384c35cadc1510f2b6
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l6_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60b01ff651579615d6db2ba63001fbcd8c0b740d765ad0eb42dab8b5549538a4
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l7_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:63f4e960aff4347ba663b1c17ff7cc2f399c663e06610dc35ac0e4d95909bd24
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l8_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:196b5a89d35e522141e2e8ca0eac44e5d0723b0e6c3e6dec39730a3c9367fabb
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l9_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c46c32e13e5048c236f83ed783fd79457f6cd2dd4b88984fa756be7c53c7b849
|
| 3 |
+
size 17235064
|
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_post.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:302e68e08caf347dd063947baff79b68ce5f60da0c0a493942aa51b914c0b7e0
|
| 3 |
+
size 147957518
|
README.md
CHANGED
|
@@ -1,3 +1,230 @@
|
|
| 1 |
---
|
| 2 |
license: mit
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
license: mit
|
| 3 |
+
language:
|
| 4 |
+
- en
|
| 5 |
+
- zh
|
| 6 |
+
base_model:
|
| 7 |
+
- CosyVoice3
|
| 8 |
+
pipeline_tag: text-to-speech
|
| 9 |
+
library_name: transformers
|
| 10 |
+
tags:
|
| 11 |
+
- CosyVoice3
|
| 12 |
+
- Speech
|
| 13 |
---
|
| 14 |
+
|
| 15 |
+
# CosyVoice3
|
| 16 |
+
This version of CosyVoice3 has been converted to run on the Axera NPU using **w8a16** quantization.
|
| 17 |
+
Compatible with Pulsar2 version: 4.2
|
| 18 |
+
|
| 19 |
+
## Convert tools links:
|
| 20 |
+
For those who are interested in model conversion, you can try to export axmodel through the original repo :
|
| 21 |
+
[Cosyvoice](https://github.com/FunAudioLLM/CosyVoice)
|
| 22 |
+
|
| 23 |
+
[Pulsar2 Link, How to Convert LLM from Huggingface to axmodel](https://pulsar2-docs.readthedocs.io/en/latest/appendix/build_llm.html)
|
| 24 |
+
|
| 25 |
+
[AXera NPU HOST LLM Runtime](https://github.com/AXERA-TECH/CosyVoice3.Axera)
|
| 26 |
+
|
| 27 |
+
## Support Platform
|
| 28 |
+
|
| 29 |
+
- AX650
|
| 30 |
+
- AX650N DEMO Board
|
| 31 |
+
- [M4N-Dock(爱芯派Pro)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)
|
| 32 |
+
- [M.2 Accelerator card](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html)
|
| 33 |
+
|
| 34 |
+
**Speech Generation**
|
| 35 |
+
| Stage | Time |
|
| 36 |
+
|------|------|
|
| 37 |
+
| llm prefill ( input_token_num + prompt_token_num 在 [0,128 ] ) | 104 ms |
|
| 38 |
+
| llm prefill ( input_token_num + prompt_token_num 在 [128,256 ] ) | 234 ms |
|
| 39 |
+
| Decode | 21.24 token/s |
|
| 40 |
+
|
| 41 |
+
## How to use
|
| 42 |
+
|
| 43 |
+
Download all files from this repository to the device
|
| 44 |
+
|
| 45 |
+
### 1. PrePare
|
| 46 |
+
|
| 47 |
+
#### 1.1 Copy this project to AX650 Board
|
| 48 |
+
|
| 49 |
+
#### 1.2 Prepare Dependencies
|
| 50 |
+
|
| 51 |
+
**Running HTTP Tokenizer Server** and **Processing Prompt Speech** require these Python packages. If you run these two step on a PC, install them on the PC.
|
| 52 |
+
```
|
| 53 |
+
pip3 install -r scripts/requirements.txt
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
### 2. Start HTTP Tokenizer Server
|
| 57 |
+
```
|
| 58 |
+
cd scripts
|
| 59 |
+
python CosyVoice3_tokenizer.py --host {your host} --port {your port}
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
### 3. Run on Axera Device
|
| 64 |
+
There are 2 kinds of device, AX650 Board , AXCL aarch64 Board and AXCL x86 Board.
|
| 65 |
+
|
| 66 |
+
#### 3.1 Run on AX650 Board
|
| 67 |
+
1) Moidfy the HTTP host in `run_ax650.sh`.
|
| 68 |
+
|
| 69 |
+
2) Run `run_ax650.sh`
|
| 70 |
+
```shell
|
| 71 |
+
root@ax650 ~/CosyVoice3 # bash run_ax650.sh
|
| 72 |
+
rm: cannot remove 'output*.wav': No such file or directory
|
| 73 |
+
[I][ Init][ 108]: LLM init start
|
| 74 |
+
[I][ Init][ 34]: connect http://10.122.86.184:12345 ok
|
| 75 |
+
bos_id: 0, eos_id: 1773
|
| 76 |
+
7% | ███ | 2 / 27 [3.11s<42.04s, 0.64 count/s] embed_selector init ok[I][ Init][ 138]: attr.axmodel_num:24
|
| 77 |
+
100% | ████████████████████████████████ | 27 / 27 [10.32s<10.32s, 2.62 count/s] init post axmodel ok,remain_cmm(7178 MB)
|
| 78 |
+
[I][ Init][ 216]: max_token_len : 1023
|
| 79 |
+
[I][ Init][ 221]: kv_cache_size : 128, kv_cache_num: 1023
|
| 80 |
+
[I][ Init][ 229]: prefill_token_num : 128
|
| 81 |
+
[I][ Init][ 233]: grp: 1, prefill_max_token_num : 1
|
| 82 |
+
[I][ Init][ 233]: grp: 2, prefill_max_token_num : 128
|
| 83 |
+
[I][ Init][ 233]: grp: 3, prefill_max_token_num : 256
|
| 84 |
+
[I][ Init][ 233]: grp: 4, prefill_max_token_num : 384
|
| 85 |
+
[I][ Init][ 233]: grp: 5, prefill_max_token_num : 512
|
| 86 |
+
[I][ Init][ 237]: prefill_max_token_num : 512
|
| 87 |
+
[I][ Init][ 249]: LLM init ok
|
| 88 |
+
[I][ Init][ 154]: Token2Wav init ok
|
| 89 |
+
[I][ main][ 273]:
|
| 90 |
+
[I][ Run][ 388]: input token num : 142, prefill_split_num : 2
|
| 91 |
+
[I][ Run][ 422]: input_num_token:128
|
| 92 |
+
[I][ Run][ 422]: input_num_token:14
|
| 93 |
+
[I][ Run][ 607]: ttft: 236.90 ms
|
| 94 |
+
[Main/Token2Wav Thread] Processing batch of 28 tokens...
|
| 95 |
+
Successfully saved audio to output_0.wav (32-bit Float PCM).
|
| 96 |
+
[Main/Token2Wav Thread] Processing batch of 53 tokens...
|
| 97 |
+
Successfully saved audio to output_1.wav (32-bit Float PCM).
|
| 98 |
+
[Main/Token2Wav Thread] Processing batch of 78 tokens...
|
| 99 |
+
Successfully saved audio to output_2.wav (32-bit Float PCM).
|
| 100 |
+
[Main/Token2Wav Thread] Processing batch of 78 tokens...
|
| 101 |
+
Successfully saved audio to output_3.wav (32-bit Float PCM).
|
| 102 |
+
[Main/Token2Wav Thread] Processing batch of 78 tokens...
|
| 103 |
+
Successfully saved audio to output_4.wav (32-bit Float PCM).
|
| 104 |
+
[Main/Token2Wav Thread] Processing batch of 78 tokens...
|
| 105 |
+
Successfully saved audio to output_5.wav (32-bit Float PCM).
|
| 106 |
+
[Main/Token2Wav Thread] Processing batch of 78 tokens...
|
| 107 |
+
Successfully saved audio to output_6.wav (32-bit Float PCM).
|
| 108 |
+
[Main/Token2Wav Thread] Processing batch of 78 tokens...
|
| 109 |
+
Successfully saved audio to output_7.wav (32-bit Float PCM).
|
| 110 |
+
[Main/Token2Wav Thread] Processing batch of 78 tokens...
|
| 111 |
+
Successfully saved audio to output_8.wav (32-bit Float PCM).
|
| 112 |
+
[Main/Token2Wav Thread] Processing batch of 78 tokens...
|
| 113 |
+
Successfully saved audio to output_9.wav (32-bit Float PCM).
|
| 114 |
+
[I][ Run][ 723]: hit eos, llm finished
|
| 115 |
+
[I][ Run][ 753]: llm finished
|
| 116 |
+
[Main/Token2Wav Thread] Buffer is empty and LLM finished. Exiting.
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
[I][ Run][ 758]: total decode tokens:271
|
| 120 |
+
[N][ Run][ 759]: hit eos,avg 21.47 token/s
|
| 121 |
+
|
| 122 |
+
Successfully saved audio to output_10.wav (32-bit Float PCM).
|
| 123 |
+
Successfully saved audio to output.wav (32-bit Float PCM).
|
| 124 |
+
|
| 125 |
+
Voice generation pipeline completed.
|
| 126 |
+
Type "q" to exit, Ctrl+c to stop current running
|
| 127 |
+
text >>
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
Output Speech:
|
| 131 |
+
[output.wav](asset/output.wav)
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
#### Or run on AX650 Board with Gradio GUI
|
| 135 |
+
1) Start server
|
| 136 |
+
```
|
| 137 |
+
bash run_api_ax650.sh
|
| 138 |
+
```
|
| 139 |
+
2) Start Gradio GUI
|
| 140 |
+
```
|
| 141 |
+
python scripts/gradio_demo.py
|
| 142 |
+
```
|
| 143 |
+
|
| 144 |
+
#### 3.2 Run on AXCL aarch64 Board
|
| 145 |
+
```
|
| 146 |
+
bash run_axcl_aarch64.sh
|
| 147 |
+
```
|
| 148 |
+
#### Or run on AXCL aarch64 Board with Gradio GUI
|
| 149 |
+
1) Start server
|
| 150 |
+
```
|
| 151 |
+
bash run_api_axcl_aarch64.sh
|
| 152 |
+
```
|
| 153 |
+
2) Start Gradio GUI
|
| 154 |
+
```
|
| 155 |
+
python scripts/gradio_demo.py
|
| 156 |
+
```
|
| 157 |
+
3) Open the page from a browser
|
| 158 |
+
The page url is : `https://{your device ip}:7860`
|
| 159 |
+
|
| 160 |
+
Note that you need to run these two commands in the project root directory.
|
| 161 |
+
|
| 162 |
+
#### 3.3 Run on AXCL x86 Board
|
| 163 |
+
```
|
| 164 |
+
bash run_axcl_x86.sh
|
| 165 |
+
```
|
| 166 |
+
#### Or run on AXCL aarch64 Board with Gradio GUI
|
| 167 |
+
1) Start server
|
| 168 |
+
```
|
| 169 |
+
bash run_api_axcl_x86.sh
|
| 170 |
+
```
|
| 171 |
+
2) Start Gradio GUI
|
| 172 |
+
```
|
| 173 |
+
python scripts/gradio_demo.py
|
| 174 |
+
```
|
| 175 |
+
3) Open the page from a browser
|
| 176 |
+
The page url is : `https://{your device ip}:7860`
|
| 177 |
+
|
| 178 |
+
Note that you need to run these two commands in the project root directory.
|
| 179 |
+
|
| 180 |
+

|
| 181 |
+
|
| 182 |
+
### Optional. Process Prompt Speech
|
| 183 |
+
If you want to replicate a specific sound, do this step.
|
| 184 |
+
You can use audio in asset/ .
|
| 185 |
+
|
| 186 |
+
#### (1). Downlaod wetext
|
| 187 |
+
```
|
| 188 |
+
pip3 install modelscope
|
| 189 |
+
modelscope download --model pengzhendong/wetext --local_dir pengzhendong/wetext
|
| 190 |
+
```
|
| 191 |
+
|
| 192 |
+
#### (2). Process Prompt Speech
|
| 193 |
+
Example:
|
| 194 |
+
```
|
| 195 |
+
python3 scripts/process_prompt.py --prompt_text asset/zh_man1.txt --prompt_speech asset/zh_man1.wav --output zh_man1
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
Pass parameters according to the actual situation.
|
| 199 |
+
```
|
| 200 |
+
python3 scripts/process_prompt.py -h
|
| 201 |
+
|
| 202 |
+
usage: process_prompt.py [-h] [--model_dir MODEL_DIR] [--wetext_dir WETEXT_DIR] [--sample_rate SAMPLE_RATE] [--prompt_text PROMPT_TEXT] [--prompt_speech PROMPT_SPEECH]
|
| 203 |
+
[--output OUTPUT]
|
| 204 |
+
|
| 205 |
+
options:
|
| 206 |
+
-h, --help show this help message and exit
|
| 207 |
+
--model_dir MODEL_DIR
|
| 208 |
+
tokenizer configuration directionary
|
| 209 |
+
--wetext_dir WETEXT_DIR
|
| 210 |
+
path to wetext
|
| 211 |
+
--sample_rate SAMPLE_RATE
|
| 212 |
+
Sampling rate for prompt audio
|
| 213 |
+
--prompt_text PROMPT_TEXT
|
| 214 |
+
The text content of the prompt(reference) audio. Text or file path.
|
| 215 |
+
--prompt_speech PROMPT_SPEECH
|
| 216 |
+
The path to prompt(reference) audio.
|
| 217 |
+
--output OUTPUT Output data storage directory
|
| 218 |
+
```
|
| 219 |
+
|
| 220 |
+
After executing the above command, files like the following will be generated:
|
| 221 |
+
```
|
| 222 |
+
flow_embedding.txt
|
| 223 |
+
flow_prompt_speech_token.txt
|
| 224 |
+
llm_embedding.txt
|
| 225 |
+
llm_prompt_speech_token.txt
|
| 226 |
+
prompt_speech_feat.txt
|
| 227 |
+
prompt_text.txt
|
| 228 |
+
```
|
| 229 |
+
|
| 230 |
+
When you run run_ax650.sh, pass the output path here to the prompt_files parameter of the run_ax650.sh script.
|
asset/dingding.png
ADDED
|
Git LFS Details
|
asset/output.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5c7a4c3837145df17e851c177f849446036e6f541d78eb6e107ea6b9e7b07672
|
| 3 |
+
size 1067564
|
asset/zero_shot_prompt.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bd199eb7109fd6ce9943cb297e3cf350c1073af014063dfadbdc100230526243
|
| 3 |
+
size 111496
|
frontend-onnx/campplus.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
|
| 3 |
+
size 28303423
|
frontend-onnx/speech_tokenizer_v3.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23236a74175dbdda47afc66dbadd5bcb41303c467a57c261cb8539ad9db9208d
|
| 3 |
+
size 969451503
|
main_ax650
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0e5192387e51f64ea8993eb9bc3e848092aa2f8ce7157891496b152149a42ed6
|
| 3 |
+
size 6647080
|
run_ax650.sh
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
LLM_DIR=CosyVoice-BlankEN-Ax650-C64-P256-CTX512/
|
| 2 |
+
TOKEN2WAV_DIR=token2wav-axmodels/
|
| 3 |
+
|
| 4 |
+
./main_ax650 \
|
| 5 |
+
--template_filename_axmodel "${LLM_DIR}/qwen2_p64_l%d_together.axmodel" \
|
| 6 |
+
--token2wav_axmodel_dir $TOKEN2WAV_DIR \
|
| 7 |
+
--n_timesteps 10 \
|
| 8 |
+
--axmodel_num 24 \
|
| 9 |
+
--bos 0 --eos 0 \
|
| 10 |
+
--filename_tokenizer_model "http://10.122.86.184:12345" \
|
| 11 |
+
--filename_post_axmodel "${LLM_DIR}/qwen2_post.axmodel" \
|
| 12 |
+
--filename_decoder_axmodel "${LLM_DIR}/llm_decoder.axmodel" \
|
| 13 |
+
--filename_tokens_embed "${LLM_DIR}/model.embed_tokens.weight.bfloat16.bin" \
|
| 14 |
+
--filename_llm_embed "${LLM_DIR}/llm.speech_embedding.float16.bin" \
|
| 15 |
+
--filename_speech_embed "${LLM_DIR}/llm.speech_embedding.float16.bin" \
|
| 16 |
+
--continue 0 \
|
| 17 |
+
--prompt_files prompt_files \
|
| 18 |
+
--text "高管也通过电话、短信、微信等方式对报道[j][ǐ]予好评。"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
chmod 777 output.wav
|
scripts/CosyVoice-BlankEN/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
scripts/CosyVoice-BlankEN/tokenizer_config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"151643": {
|
| 5 |
+
"content": "<|endoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": false,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
},
|
| 12 |
+
"151644": {
|
| 13 |
+
"content": "<|im_start|>",
|
| 14 |
+
"lstrip": false,
|
| 15 |
+
"normalized": false,
|
| 16 |
+
"rstrip": false,
|
| 17 |
+
"single_word": false,
|
| 18 |
+
"special": true
|
| 19 |
+
},
|
| 20 |
+
"151645": {
|
| 21 |
+
"content": "<|im_end|>",
|
| 22 |
+
"lstrip": false,
|
| 23 |
+
"normalized": false,
|
| 24 |
+
"rstrip": false,
|
| 25 |
+
"single_word": false,
|
| 26 |
+
"special": true
|
| 27 |
+
}
|
| 28 |
+
},
|
| 29 |
+
"additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
|
| 30 |
+
"bos_token": null,
|
| 31 |
+
"chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "<|im_end|>",
|
| 34 |
+
"errors": "replace",
|
| 35 |
+
"model_max_length": 32768,
|
| 36 |
+
"pad_token": "<|endoftext|>",
|
| 37 |
+
"split_special_tokens": false,
|
| 38 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 39 |
+
"unk_token": null
|
| 40 |
+
}
|
scripts/CosyVoice-BlankEN/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
scripts/audio.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
import torch.utils.data
|
| 4 |
+
from librosa.filters import mel as librosa_mel_fn
|
| 5 |
+
from scipy.io.wavfile import read
|
| 6 |
+
|
| 7 |
+
MAX_WAV_VALUE = 32768.0
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def load_wav(full_path):
    """Read a WAV file and return ``(samples, sampling_rate)``.

    Note the order: samples first, rate second (the reverse of what
    ``scipy.io.wavfile.read`` itself returns).
    """
    rate, samples = read(full_path)
    return samples, rate
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Log-compress *x* (NumPy): ``log(max(x, clip_val) * C)``.

    ``clip_val`` guards against ``log(0)``; ``C`` is an optional gain.
    """
    clipped = np.clip(x, a_min=clip_val, a_max=None)
    return np.log(C * clipped)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def dynamic_range_decompression(x, C=1):
    """Invert :func:`dynamic_range_compression`: ``exp(x) / C``."""
    expanded = np.exp(x)
    return expanded / C
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Log-compress *x* (torch): ``log(clamp(x, min=clip_val) * C)``."""
    clamped = torch.clamp(x, min=clip_val)
    return torch.log(C * clamped)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def dynamic_range_decompression_torch(x, C=1):
    """Invert :func:`dynamic_range_compression_torch`: ``exp(x) / C``."""
    expanded = torch.exp(x)
    return expanded / C
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def spectral_normalize_torch(magnitudes):
    """Apply log dynamic-range compression to a magnitude spectrogram."""
    return dynamic_range_compression_torch(magnitudes)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def spectral_de_normalize_torch(magnitudes):
    """Undo :func:`spectral_normalize_torch` (exp decompression)."""
    return dynamic_range_decompression_torch(magnitudes)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# Caches shared across calls: mel filterbanks keyed by (fmax, device) and
# Hann windows keyed by (win_size, device), so they are built only once.
mel_basis = {}
hann_window = {}


def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Compute a log-mel spectrogram of a waveform batch.

    Args:
        y: waveform tensor, expected in [-1, 1] — values outside that range
           are only warned about, not clipped. Assumed shape (batch, samples)
           given the unsqueeze/pad below — TODO confirm with callers.
        n_fft / hop_size / win_size: STFT parameters.
        num_mels / fmin / fmax: mel filterbank parameters.
        center: passed through to ``torch.stft``.

    Returns:
        Log-compressed mel spectrogram tensor (batch, num_mels, frames).
    """
    if torch.min(y) < -1.0:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.0:
        print("max value is ", torch.max(y))

    global mel_basis, hann_window  # pylint: disable=global-statement
    mel_key = f"{fmax}_{y.device}"
    if mel_key not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[mel_key] = torch.from_numpy(mel).float().to(y.device)
    # BUGFIX: the window used to be created only on a mel-basis cache miss and
    # was keyed by device alone, so a call with a different win_size silently
    # reused the wrong window. Cache it independently, keyed by win_size too.
    win_key = f"{win_size}_{y.device}"
    if win_key not in hann_window:
        hann_window[win_key] = torch.hann_window(win_size).to(y.device)

    # Manual reflect-padding so that frame centers line up when center=False.
    pad = (n_fft - hop_size) // 2
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode="reflect")
    y = y.squeeze(1)

    spec = torch.view_as_real(
        torch.stft(
            y,
            n_fft,
            hop_length=hop_size,
            win_length=win_size,
            window=hann_window[win_key],
            center=center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )
    )

    # Magnitude with a small epsilon for numerical stability at silence.
    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

    # Project linear-frequency bins onto the mel scale, then log-compress.
    spec = torch.matmul(mel_basis[mel_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec
|
scripts/cosyvoice3_tokenizer.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoTokenizer, PreTrainedTokenizerFast
|
| 2 |
+
from http.server import HTTPServer, BaseHTTPRequestHandler
|
| 3 |
+
import json
|
| 4 |
+
import argparse
|
| 5 |
+
from tokenizer.tokenizer import get_qwen_tokenizer
|
| 6 |
+
|
| 7 |
+
class Tokenizer_Http():
    """Thin wrapper around the CosyVoice3 Qwen tokenizer served over HTTP."""

    def __init__(self):
        # Project-local tokenizer factory; loads the CosyVoice-BlankEN vocab.
        self.tokenizer = get_qwen_tokenizer("CosyVoice-BlankEN/", True, "cosyvoice3")

    def encode(self, prompt):
        """Tokenize *prompt*, permitting every special token."""
        token_ids = self.tokenizer.encode(prompt, allowed_special="all")
        return token_ids

    def decode(self, token_ids):
        """Convert a list of token ids back into text."""
        return self.tokenizer.decode(token_ids)

    @property
    def bos_id(self):
        # BUGFIX: the HTTP handler's GET '/bos_id' reads this attribute, but
        # it was commented out, raising AttributeError at request time. This
        # tokenizer has no BOS token; the HTTP layer maps None to -1.
        return None

    @property
    def eos_id(self):
        # Hard-coded end-of-sequence id used by the CosyVoice3 LLM.
        return 1773

    @property
    def eos_token(self):
        return "<|eot_id|>"
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
tokenizer = Tokenizer_Http()
|
| 39 |
+
|
| 40 |
+
# print(tokenizer.bos_id, tokenizer.bos_token, tokenizer.eos_id, tokenizer.eos_token)
|
| 41 |
+
print(tokenizer.encode("hello world"))
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class Request(BaseHTTPRequestHandler):
    """HTTP handler exposing the module-level tokenizer.

    GET  /bos_id, /eos_id  -> JSON with the id (-1 when absent)
    POST /encode, /decode  -> JSON round-trip between text and token ids
    Any other path answers the literal string 'error'.
    """
    timeout = 5
    server_version = 'Apache'

    def do_GET(self):
        print(self.path)
        # Headers are sent unconditionally; the body carries the payload.
        self.send_response(200)
        self.send_header("type", "get")
        self.end_headers()

        if self.path == '/bos_id':
            # getattr: Tokenizer_Http may not define bos_id — treat a
            # missing or None id as -1 instead of crashing the handler.
            bos_id = getattr(tokenizer, 'bos_id', None)
            if bos_id is None:
                msg = json.dumps({'bos_id': -1})
            else:
                msg = json.dumps({'bos_id': bos_id})
        elif self.path == '/eos_id':
            eos_id = tokenizer.eos_id
            if eos_id is None:
                msg = json.dumps({'eos_id': -1})
            else:
                msg = json.dumps({'eos_id': eos_id})
        else:
            msg = 'error'

        print(msg)
        msg = str(msg).encode()  # serialize to bytes for the socket

        self.wfile.write(msg)  # send the body back to the client

    def do_POST(self):
        # Read the request body; a missing content-length is treated as 0
        # instead of raising TypeError on int(None).
        length = int(self.headers['content-length'] or 0)
        data = self.rfile.read(length)
        data = data.decode()  # bytes -> str (JSON text)

        self.send_response(200)
        self.send_header("type", "post")
        self.end_headers()

        if self.path == '/encode':
            req = json.loads(data)
            prompt = req['text']

            token_ids = tokenizer.encode(prompt)
            if token_ids is None:
                msg = json.dumps({'token_ids': -1})
            else:
                msg = json.dumps({'token_ids': token_ids})

        elif self.path == '/decode':
            req = json.loads(data)
            token_ids = req['token_ids']
            text = tokenizer.decode(token_ids)
            if text is None:
                msg = json.dumps({'text': ""})
            else:
                msg = json.dumps({'text': text})
        else:
            msg = 'error'
        print(msg)
        msg = str(msg).encode()  # serialize to bytes for the socket

        self.wfile.write(msg)  # send the body back to the client
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
if __name__ == "__main__":
    # Command-line entry point: serve the tokenizer over HTTP forever.
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', type=str, default='localhost')
    parser.add_argument('--port', type=int, default=12345)
    args = parser.parse_args()

    # 'localhost' is equivalent to '127.0.0.1'.
    host = (args.host, args.port)
    print('http://%s:%s' % host)
    server = HTTPServer(host, Request)
    server.serve_forever()
|
scripts/frontend.py
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
from functools import partial
|
| 15 |
+
from functools import lru_cache
|
| 16 |
+
from typing import Generator
|
| 17 |
+
import json
|
| 18 |
+
import onnxruntime
|
| 19 |
+
import torch
|
| 20 |
+
import numpy as np
|
| 21 |
+
import whisper
|
| 22 |
+
from typing import Callable
|
| 23 |
+
import torchaudio.compliance.kaldi as kaldi
|
| 24 |
+
import torchaudio
|
| 25 |
+
import os
|
| 26 |
+
import re
|
| 27 |
+
import inflect
|
| 28 |
+
from tokenizer.tokenizer import get_qwen_tokenizer
|
| 29 |
+
from audio import mel_spectrogram
|
| 30 |
+
|
| 31 |
+
try:
|
| 32 |
+
import ttsfrd
|
| 33 |
+
use_ttsfrd = True
|
| 34 |
+
except ImportError:
|
| 35 |
+
|
| 36 |
+
from wetext import Normalizer as ZhNormalizer
|
| 37 |
+
from wetext import Normalizer as EnNormalizer
|
| 38 |
+
use_ttsfrd = False
|
| 39 |
+
|
| 40 |
+
import logging
|
| 41 |
+
logging.getLogger('frontend').setLevel(logging.WARNING)
|
| 42 |
+
# logging.basicConfig(level=logging.DEBUG,
|
| 43 |
+
# format='%(asctime)s %(levelname)s %(message)s')
|
| 44 |
+
|
| 45 |
+
class CosyVoiceFrontEnd:
    """Text/audio preprocessing front end for CosyVoice inference.

    Bundles the Qwen text tokenizer, a mel-spectrogram extractor, an ONNX
    speaker-embedding model (campplus) and an ONNX speech tokenizer, plus
    text normalization (ttsfrd if available, otherwise wetext + inflect).
    """

    def __init__(self,
                 pretrained_path: str,
                 wetext_dir: str,
                 campplus_model: str,
                 speech_tokenizer_model: str,
                 spk2info: str = '',
                 allowed_special: str = 'all'):
        # NOTE(review): wetext_dir is accepted but never used below — confirm
        # whether the wetext Normalizer is expected to consume it.
        self.tokenizer = get_qwen_tokenizer(pretrained_path, True)
        # 24 kHz mel extractor with fixed HiFi-GAN-style parameters.
        self.feat_extractor = partial(
            mel_spectrogram,
            n_fft=1920,
            num_mels=80,
            sampling_rate=24000,
            hop_size=480,
            win_size=1920,
            fmin=0,
            fmax=8000,
            center=False)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        option = onnxruntime.SessionOptions()
        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        option.intra_op_num_threads = 1
        # Speaker-embedding model always runs on CPU; the speech tokenizer
        # uses CUDA when available.
        self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
        self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option,
                                                                     providers=["CUDAExecutionProvider" if torch.cuda.is_available() else
                                                                                "CPUExecutionProvider"])
        # Optional cache of precomputed per-speaker prompt inputs.
        if os.path.exists(spk2info):
            self.spk2info = torch.load(spk2info, map_location=self.device)
        else:
            self.spk2info = {}
        self.allowed_special = allowed_special
        self.use_ttsfrd = use_ttsfrd
        if self.use_ttsfrd:
            self.frd = ttsfrd.TtsFrontendEngine()
            ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
            assert self.frd.initialize('{}/../../pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR)) is True, \
                'failed to initialize ttsfrd resource'
            self.frd.set_lang_type('pinyinvg')
        else:
            # NOTE(review): both normalizers are wetext.Normalizer and the
            # "English" one is constructed with lang="zh" — confirm intended.
            self.zh_tn_model = ZhNormalizer(remove_erhua=False, lang="zh")
            self.en_tn_model = EnNormalizer(lang="zh")
            self.inflect_parser = inflect.engine()

    def _extract_text_token(self, text):
        """Tokenize *text*; returns (token tensor, length tensor).

        A generator input is passed through lazily with a dummy length.
        """
        if isinstance(text, Generator):
            logging.info('get tts_text generator, will return _extract_text_token_generator!')
            # NOTE add a dummy text_token_len for compatibility
            return self._extract_text_token_generator(text), torch.tensor([0], dtype=torch.int32).to(self.device)
        else:
            text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
            text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
            text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)
            return text_token, text_token_len

    def _extract_text_token_generator(self, text_generator):
        """Yield text tokens one at a time (shape (1, 1)) from a text stream."""
        for text in text_generator:
            text_token, _ = self._extract_text_token(text)
            for i in range(text_token.shape[1]):
                yield text_token[:, i: i + 1]

    def _extract_speech_token(self, speech):
        """Run the ONNX speech tokenizer on 16 kHz speech; returns (tokens, len)."""
        assert speech.shape[1] / 16000 <= 30, 'do not support extract speech token for audio longer than 30s'
        feat = whisper.log_mel_spectrogram(speech, n_mels=128)
        speech_token = self.speech_tokenizer_session.run(None,
                                                         {self.speech_tokenizer_session.get_inputs()[0].name:
                                                          feat.detach().cpu().numpy(),
                                                          self.speech_tokenizer_session.get_inputs()[1].name:
                                                          np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
        speech_token = torch.tensor([speech_token], dtype=torch.int32).to(self.device)
        speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device)
        return speech_token, speech_token_len

    def _extract_spk_embedding(self, speech):
        """Compute a speaker embedding from 16 kHz speech via campplus (ONNX)."""
        feat = kaldi.fbank(speech,
                           num_mel_bins=80,
                           dither=0,
                           sample_frequency=16000)
        # Cepstral mean normalization over time before the embedding model.
        feat = feat - feat.mean(dim=0, keepdim=True)
        embedding = self.campplus_session.run(None,
                                              {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
        embedding = torch.tensor([embedding]).to(self.device)
        return embedding

    def _extract_speech_feat(self, speech):
        """Extract a mel-spectrogram feature (batch, frames, mels) and its length."""
        speech_feat = self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device)
        speech_feat = speech_feat.unsqueeze(dim=0)
        speech_feat_len = torch.tensor([speech_feat.shape[1]], dtype=torch.int32).to(self.device)
        return speech_feat, speech_feat_len

    def text_normalize(self, text, split=True, text_frontend=True):
        """Normalize *text* and optionally split it into synthesis chunks.

        Helpers such as contains_chinese/replace_blank/split_paragraph are
        presumably defined elsewhere in this module/package — not visible here.
        """
        if isinstance(text, Generator):
            logging.info('get tts_text generator, will skip text_normalize!')
            return [text]
        if text_frontend is False or text == '':
            return [text] if split is True else text
        text = text.strip()
        if self.use_ttsfrd:
            texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
            text = ''.join(texts)
        else:
            if contains_chinese(text):
                text = self.zh_tn_model.normalize(text)
                text = text.replace("\n", "")
                text = replace_blank(text)
                text = replace_corner_mark(text)
                text = text.replace(".", "。")
                text = text.replace(" - ", ",")
                text = remove_bracket(text)
                text = re.sub(r'[,,、]+$', '。', text)
                texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80,
                                             token_min_n=60, merge_len=20, comma_split=False))
            else:
                text = self.en_tn_model.normalize(text)
                text = spell_out_number(text, self.inflect_parser)
                texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
                                             token_min_n=60, merge_len=20, comma_split=False))
        texts = [i for i in texts if not is_only_punctuation(i)]
        return texts if split is True else text

    def frontend_sft(self, tts_text, spk_id):
        """Build model inputs for SFT (known-speaker) synthesis."""
        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
        embedding = self.spk2info[spk_id]['embedding']
        model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding}
        return model_input

    def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
        """Build model inputs for zero-shot voice cloning from a prompt clip."""
        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
        if zero_shot_spk_id == '':
            prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
            prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
            speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
            speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
            if resample_rate == 24000:
                # cosyvoice2, force speech_feat % speech_token = 2
                token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
                speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
                speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
            embedding = self._extract_spk_embedding(prompt_speech_16k)
            model_input = {'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
                           'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
                           'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
                           'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
                           'llm_embedding': embedding, 'flow_embedding': embedding}
        else:
            # Reuse precomputed prompt inputs for a cached speaker id.
            model_input = self.spk2info[zero_shot_spk_id]
        model_input['text'] = tts_text_token
        model_input['text_len'] = tts_text_token_len
        return model_input

    def process_prompt(self, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
        """Compute only the prompt-side inputs (no tts text) for a prompt clip."""
        if zero_shot_spk_id == '':
            prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
            prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
            speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
            speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
            if resample_rate == 24000:
                # cosyvoice2, force speech_feat % speech_token = 2
                token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
                speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
                speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
            embedding = self._extract_spk_embedding(prompt_speech_16k)
            model_input = {'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
                           'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
                           'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
                           'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
                           'llm_embedding': embedding, 'flow_embedding': embedding}
        else:
            model_input = self.spk2info[zero_shot_spk_id]
        return model_input

    def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
        """Zero-shot inputs with the LLM prompt removed (cross-lingual mode)."""
        model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate, zero_shot_spk_id)
        # in cross lingual mode, we remove prompt in llm
        del model_input['prompt_text']
        del model_input['prompt_text_len']
        del model_input['llm_prompt_speech_token']
        del model_input['llm_prompt_speech_token_len']
        return model_input

    def frontend_instruct(self, tts_text, spk_id, instruct_text):
        """SFT inputs where the instruct text replaces the prompt text."""
        model_input = self.frontend_sft(tts_text, spk_id)
        # in instruct mode, we remove spk_embedding in llm due to information leakage
        del model_input['llm_embedding']
        instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
        model_input['prompt_text'] = instruct_text_token
        model_input['prompt_text_len'] = instruct_text_token_len
        return model_input

    def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
        """Instruct-v2 inputs: zero-shot with instruct text, no LLM speech prompt."""
        model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate, zero_shot_spk_id)
        del model_input['llm_prompt_speech_token']
        del model_input['llm_prompt_speech_token_len']
        return model_input

    def frontend_vc(self, source_speech_16k, prompt_speech_16k, resample_rate):
        """Build voice-conversion inputs from source and prompt speech."""
        prompt_speech_token, prompt_speech_token_len = self._extract_speech_token(prompt_speech_16k)
        prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
        prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
        embedding = self._extract_spk_embedding(prompt_speech_16k)
        source_speech_token, source_speech_token_len = self._extract_speech_token(source_speech_16k)
        model_input = {'source_speech_token': source_speech_token, 'source_speech_token_len': source_speech_token_len,
                       'flow_prompt_speech_token': prompt_speech_token, 'flow_prompt_speech_token_len': prompt_speech_token_len,
                       'prompt_speech_feat': prompt_speech_feat, 'prompt_speech_feat_len': prompt_speech_feat_len,
                       'flow_embedding': embedding}
        return model_input
|
scripts/gradio_demo.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import shutil
|
| 3 |
+
import gradio as gr
|
| 4 |
+
import numpy as np
|
| 5 |
+
import requests
|
| 6 |
+
import time
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
from frontend import CosyVoiceFrontEnd
|
| 11 |
+
import torchaudio
|
| 12 |
+
import logging
|
| 13 |
+
logging.basicConfig(level=logging.WARNING)
|
| 14 |
+
|
| 15 |
+
import subprocess
|
| 16 |
+
import re
|
| 17 |
+
|
| 18 |
+
def get_all_local_ips():
    """Return every non-loopback IPv4 address of this machine.

    Parses the output of the Linux `ip a` command; returns an empty list
    when the command is unavailable, so callers can still start the demo.
    """
    try:
        result = subprocess.run(['ip', 'a'], capture_output=True, text=True)
    except (FileNotFoundError, OSError):
        # `ip` is Linux-only (iproute2); fail soft on other platforms.
        return []
    output = result.stdout

    # Match every IPv4 address in the interface dump.
    ips = re.findall(r'inet (\d+\.\d+\.\d+\.\d+)', output)

    # Drop loopback addresses.
    real_ips = [ip for ip in ips if not ip.startswith('127.')]

    return real_ips
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
TTS_URL = "http://0.0.0.0:12346/tts"
|
| 32 |
+
GET_URL = "http://0.0.0.0:12346/get"
|
| 33 |
+
TIMESTEPS_URL = "http://0.0.0.0:12346/timesteps"
|
| 34 |
+
PROMPT_FILES_URL = "http://0.0.0.0:12346/prompt_files"
|
| 35 |
+
|
| 36 |
+
args = argparse.ArgumentParser()
|
| 37 |
+
args.add_argument('--model_dir', type=str, default="scripts/CosyVoice-BlankEN", help="tokenizer configuration directionary")
|
| 38 |
+
args.add_argument('--wetext_dir', type=str, default="pengzhendong/wetext", help="path to wetext")
|
| 39 |
+
args.add_argument('--sample_rate', type=int, default=24000, help="Sampling rate for prompt audio")
|
| 40 |
+
args = args.parse_args()
|
| 41 |
+
frontend = CosyVoiceFrontEnd(f"{args.model_dir}",
|
| 42 |
+
args.wetext_dir,
|
| 43 |
+
"frontend-onnx/campplus.onnx",
|
| 44 |
+
"frontend-onnx/speech_tokenizer_v2.onnx",
|
| 45 |
+
f"{args.model_dir}/spk2info.pt",
|
| 46 |
+
"all")
|
| 47 |
+
|
| 48 |
+
def update_audio(audio_input_path, audio_text):
    """Extract prompt features from a reference clip and push them to the backend.

    Loads the clip at 16 kHz mono, runs the CosyVoice front end, dumps every
    prompt tensor to ./output_temp/<name>.txt, then POSTs the directory path
    to the TTS server. Returns a (None, message) tuple only on failure.
    """
    def load_wav(wav, target_sr):
        # Mono-mix and (down)resample; upsampling is deliberately rejected.
        speech, sample_rate = torchaudio.load(wav, backend='soundfile')
        speech = speech.mean(dim=0, keepdim=True)
        if sample_rate != target_sr:
            assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
            speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
        return speech

    # Recreate a clean output directory for this prompt.
    output_dir = './output_temp'
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    zero_shot_spk_id = ""
    prompt_speech_16k = load_wav(audio_input_path, 16000)
    prompt_text = audio_text
    print("prompt_text", prompt_text)
    model_input = frontend.process_prompt(prompt_text, prompt_speech_16k, args.sample_rate, zero_shot_spk_id)
    print("prompt speech token size:", model_input["flow_prompt_speech_token"].shape)
    # The backend needs at least 75 speech tokens (~1.5 s of prompt audio).
    assert model_input["flow_prompt_speech_token"].shape[1] >= 75, \
        f"speech_token length should >= 75, but get {model_input['flow_prompt_speech_token'].shape[1]}"

    # Dump every prompt tensor (lengths are implied by the flattened files).
    for k, v in model_input.items():
        if "_len" in k:
            continue
        if v.dtype in (torch.int32, torch.int64):
            np.savetxt(f"{output_dir}/{k}.txt", v.detach().cpu().numpy().reshape(-1), fmt="%d", delimiter=",")
        else:
            np.savetxt(f"{output_dir}/{k}.txt", v.detach().cpu().numpy().reshape(-1), delimiter=",")

    try:
        r = requests.post(PROMPT_FILES_URL, json={"prompt_files": output_dir}, timeout=5)
        if r.status_code != 200:
            return None, "❌ TTS 请求失败"
    except Exception as e:
        return None, f"❌ TTS 请求异常: {e}"
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def update_timesteps(timesteps):
    """Push a new diffusion timestep count to the TTS backend."""
    try:
        resp = requests.post(TIMESTEPS_URL, json={"timesteps": timesteps}, timeout=5)
    except Exception as e:
        return None, f"❌ TTS 请求异常: {e}"
    if resp.status_code != 200:
        return None, "❌ TTS 请求失败"
|
| 93 |
+
|
| 94 |
+
def run_tts(text):
    """Submit *text* to the TTS backend and poll until the wav is ready.

    Returns (wav_path, status_message); wav_path is None on any failure.
    """
    # Step 1: submit the synthesis request.
    try:
        resp = requests.post(TTS_URL, json={"text": text}, timeout=5)
    except Exception as e:
        return None, f"❌ TTS 请求异常: {e}"
    if resp.status_code != 200:
        return None, "❌ TTS 请求失败"

    # Step 2: poll /get for progress; bounded to 100 tries (~50 s) to
    # avoid looping forever if the backend never finishes.
    progress = gr.Progress()
    wav_file = None
    for attempt in range(100):
        time.sleep(0.5)
        try:
            status = requests.post(GET_URL, data="", timeout=5).json()
        except Exception as e:
            return None, f"❌ GET 请求异常: {e}"

        if status.get("b_tts_runing", True):
            progress(attempt / 100, desc="正在生成语音...")
        else:
            wav_file = status.get("wav_file")
            break

    if not wav_file or not os.path.exists(wav_file):
        return None, "❌ 语音文件未生成"

    return wav_file, "✅ 生成完成"
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# Gradio UI wiring for the AXERA CosyVoice2 demo: prompt-audio upload,
# text input, timestep slider, and the generated-audio output.
with gr.Blocks() as demo:
    gr.Markdown("### 🎙️ AXERA CosyVoice2 Demo")

    # Row 1: reference (prompt) audio plus its transcript.
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="输入音频", type="filepath")
        with gr.Column():
            audio_text = gr.Textbox(label="音频文本(自己改一下或者照着念)", value="锄禾日当午,汗滴禾下土。")
            btn_update = gr.Button("更新音源")


    # Row 2: text to synthesize and the diffusion-timestep control.
    with gr.Row():
        text_input = gr.Textbox(value="琦琦,麻烦你适配一下这个新的模型吧。", label="输入文本")
        with gr.Column():
            timesteps = gr.Slider(minimum=4, maximum=30, value=7, step=1, label="Timesteps")
            run_btn = gr.Button("生成语音")

    status = gr.Label(label="状态")
    audio_out = gr.Audio(label="生成结果", type="filepath")

    # Event wiring: synthesis, timestep updates, and prompt refresh.
    run_btn.click(fn=run_tts, inputs=[text_input], outputs=[audio_out, status])
    timesteps.change(fn=update_timesteps, inputs=timesteps)

    btn_update.click(fn=update_audio, inputs=[audio_input, audio_text])

# Print each reachable LAN address so users can open the demo remotely.
ips = get_all_local_ips()
for ip in ips:
    print(f"* Running on local URL: https://{ip}:7860")


# Serve over HTTPS with a local self-signed certificate (verification off).
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    ssl_certfile="./server.crt",
    ssl_keyfile="./server.key",
    ssl_verify=False
)
|
scripts/meldataset.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
""" from https://github.com/jik876/hifi-gan """
|
| 2 |
+
|
| 3 |
+
import math
|
| 4 |
+
import os
|
| 5 |
+
import random
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch
|
| 9 |
+
import torch.utils.data
|
| 10 |
+
from librosa.filters import mel as librosa_mel_fn
|
| 11 |
+
from librosa.util import normalize
|
| 12 |
+
from scipy.io.wavfile import read
|
| 13 |
+
|
| 14 |
+
MAX_WAV_VALUE = 32768.0
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def load_wav(full_path):
    """Read a PCM wav file from *full_path*.

    Returns (samples, sampling_rate) — note the order is the reverse of
    scipy.io.wavfile.read, matching the original hifi-gan helper.
    """
    rate, samples = read(full_path)
    return samples, rate
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Log-compress magnitudes: log(max(x, clip_val) * C), NumPy version."""
    floored = np.clip(x, a_min=clip_val, a_max=None)
    return np.log(floored * C)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def dynamic_range_decompression(x, C=1):
    """Inverse of dynamic_range_compression: exp(x) / C, NumPy version."""
    return np.divide(np.exp(x), C)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Log-compress magnitudes: log(clamp(x, min=clip_val) * C), torch version."""
    scaled = torch.clamp(x, min=clip_val) * C
    return torch.log(scaled)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def dynamic_range_decompression_torch(x, C=1):
    """Inverse of dynamic_range_compression_torch: exp(x) / C."""
    return torch.exp(x).div(C)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def spectral_normalize_torch(magnitudes):
    """Map a linear magnitude spectrogram into log space.

    Thin alias over dynamic_range_compression_torch, kept as a separate
    name to mirror the original hifi-gan API.
    """
    return dynamic_range_compression_torch(magnitudes)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def spectral_de_normalize_torch(magnitudes):
    """Map a log-space spectrogram back to linear magnitudes.

    Thin alias over dynamic_range_decompression_torch, the inverse of
    spectral_normalize_torch.
    """
    return dynamic_range_decompression_torch(magnitudes)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# Module-level caches used by mel_spectrogram:
#   mel_basis  — mel filterbank tensors keyed by "<fmax>_<device>"
#   hann_window — Hann window tensors keyed by str(device)
mel_basis = {}
hann_window = {}
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Compute a log-compressed mel spectrogram of a batch of waveforms.

    Args:
        y: waveform tensor, expected in [-1, 1] (out-of-range values are
           only warned about, not clipped) — assumes shape (batch, samples);
           TODO confirm against callers.
        n_fft / hop_size / win_size: STFT parameters (samples).
        num_mels, sampling_rate, fmin, fmax: mel filterbank parameters.
        center: passed through to torch.stft.

    Returns:
        Tensor of shape (batch, num_mels, frames), log-compressed.
    """
    if torch.min(y) < -1.0:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.0:
        print("max value is ", torch.max(y))

    global mel_basis, hann_window  # pylint: disable=global-statement
    # Bug fix: the original tested `fmax not in mel_basis`, but entries are
    # keyed by "<fmax>_<device>", so the check never hit the cache and the
    # filterbank + window were rebuilt on every call. Look up the real keys.
    mel_key = str(fmax) + "_" + str(y.device)
    win_key = str(y.device)
    if mel_key not in mel_basis:
        mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
        mel_basis[mel_key] = torch.from_numpy(mel).float().to(y.device)
    if win_key not in hann_window:
        hann_window[win_key] = torch.hann_window(win_size).to(y.device)

    # Manual reflect-padding so that frame centers line up when center=False.
    y = torch.nn.functional.pad(
        y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
    )
    y = y.squeeze(1)

    spec = torch.view_as_real(
        torch.stft(
            y,
            n_fft,
            hop_length=hop_size,
            win_length=win_size,
            window=hann_window[win_key],
            center=center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )
    )

    # Magnitude with a small epsilon for numerical stability.
    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

    # Project onto the mel filterbank, then log-compress.
    spec = torch.matmul(mel_basis[mel_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def get_dataset_filelist(a):
    """Resolve the train/validation filelists into absolute .wav paths.

    *a* must provide input_training_file, input_validation_file and
    input_wavs_dir. Each non-empty filelist line is "<utt_id>|..." and only
    the utterance id before the first '|' is used.
    """
    def _read_filelist(list_path):
        with open(list_path, encoding="utf-8") as handle:
            entries = handle.read().split("\n")
        return [
            os.path.join(a.input_wavs_dir, entry.split("|")[0] + ".wav")
            for entry in entries
            if len(entry) > 0
        ]

    return _read_filelist(a.input_training_file), _read_filelist(a.input_validation_file)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
class MelDataset(torch.utils.data.Dataset):
    """hifi-gan training dataset: yields (mel, audio, filename, mel_for_loss).

    In normal training, mels are computed on the fly from random audio
    segments; in fine-tuning mode, precomputed mels are loaded from
    base_mels_path and the audio segment is cut to match.
    """

    def __init__(
        self,
        training_files,
        segment_size,
        n_fft,
        num_mels,
        hop_size,
        win_size,
        sampling_rate,
        fmin,
        fmax,
        split=True,
        shuffle=True,
        n_cache_reuse=1,
        device=None,
        fmax_loss=None,
        fine_tuning=False,
        base_mels_path=None,
    ):
        self.audio_files = training_files
        # Fixed seed so the shuffle (and segment sampling) is reproducible
        # across runs. NOTE: this seeds the global `random` module.
        random.seed(1234)
        if shuffle:
            random.shuffle(self.audio_files)
        self.segment_size = segment_size
        self.sampling_rate = sampling_rate
        self.split = split
        self.n_fft = n_fft
        self.num_mels = num_mels
        self.hop_size = hop_size
        self.win_size = win_size
        self.fmin = fmin
        # fmax is used for the model-input mel; fmax_loss for the loss mel.
        self.fmax = fmax
        self.fmax_loss = fmax_loss
        # Simple one-slot cache: the same decoded waveform can be reused for
        # n_cache_reuse consecutive __getitem__ calls.
        self.cached_wav = None
        self.n_cache_reuse = n_cache_reuse
        self._cache_ref_count = 0
        self.device = device
        self.fine_tuning = fine_tuning
        self.base_mels_path = base_mels_path

    def __getitem__(self, index):
        filename = self.audio_files[index]
        if self._cache_ref_count == 0:
            # Decode from disk, scale 16-bit PCM to [-1, 1].
            audio, sampling_rate = load_wav(filename)
            audio = audio / MAX_WAV_VALUE
            if not self.fine_tuning:
                # Peak-normalize with 5% headroom (librosa.util.normalize).
                audio = normalize(audio) * 0.95
            self.cached_wav = audio
            if sampling_rate != self.sampling_rate:
                raise ValueError(f"{sampling_rate} SR doesn't match target {self.sampling_rate} SR")
            self._cache_ref_count = self.n_cache_reuse
        else:
            # Reuse the previously decoded waveform regardless of `index`.
            audio = self.cached_wav
            self._cache_ref_count -= 1

        audio = torch.FloatTensor(audio)
        audio = audio.unsqueeze(0)  # (1, samples)

        if not self.fine_tuning:
            if self.split:
                # Crop (or zero-pad) a random segment of segment_size samples.
                if audio.size(1) >= self.segment_size:
                    max_audio_start = audio.size(1) - self.segment_size
                    audio_start = random.randint(0, max_audio_start)
                    audio = audio[:, audio_start : audio_start + self.segment_size]
                else:
                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant")

            mel = mel_spectrogram(
                audio,
                self.n_fft,
                self.num_mels,
                self.sampling_rate,
                self.hop_size,
                self.win_size,
                self.fmin,
                self.fmax,
                center=False,
            )
        else:
            # Fine-tuning: load the precomputed mel named after the wav file.
            mel = np.load(os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + ".npy"))
            mel = torch.from_numpy(mel)

            if len(mel.shape) < 3:
                mel = mel.unsqueeze(0)  # (1, num_mels, frames)

            if self.split:
                frames_per_seg = math.ceil(self.segment_size / self.hop_size)

                # Pick a random mel window and cut the matching audio span so
                # the two stay frame-aligned.
                if audio.size(1) >= self.segment_size:
                    mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
                    mel = mel[:, :, mel_start : mel_start + frames_per_seg]
                    audio = audio[:, mel_start * self.hop_size : (mel_start + frames_per_seg) * self.hop_size]
                else:
                    mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), "constant")
                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant")

        # Mel used for the loss term — same audio, but with fmax_loss.
        mel_loss = mel_spectrogram(
            audio,
            self.n_fft,
            self.num_mels,
            self.sampling_rate,
            self.hop_size,
            self.win_size,
            self.fmin,
            self.fmax_loss,
            center=False,
        )

        return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())

    def __len__(self):
        return len(self.audio_files)
|
scripts/process_prompt.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import os
|
| 3 |
+
import torch
|
| 4 |
+
import torchaudio
|
| 5 |
+
import numpy as np
|
| 6 |
+
from frontend import CosyVoiceFrontEnd
|
| 7 |
+
|
| 8 |
+
def load_wav(wav, target_sr, min_sr=16000):
    """Load *wav* as a mono waveform resampled to *target_sr*.

    Args:
        wav: path (or file-like object) accepted by torchaudio.load.
        target_sr: desired output sampling rate in Hz.
        min_sr: minimum acceptable source sampling rate when resampling.

    Returns:
        Tensor of shape (1, samples) at target_sr.

    Raises:
        AssertionError: if the source rate is below min_sr.
    """
    speech, sample_rate = torchaudio.load(wav, backend='soundfile')
    # Downmix multi-channel input to mono.
    speech = speech.mean(dim=0, keepdim=True)
    if sample_rate != target_sr:
        # Bug fix: the original message formatted target_sr although the
        # check compares against min_sr, producing a misleading error.
        assert sample_rate >= min_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, min_sr)
        speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
    return speech
|
| 15 |
+
|
| 16 |
+
# CLI: compute CosyVoice prompt features (text tokens, speech tokens,
# speaker embedding, speech feat) for a reference audio/text pair and dump
# each tensor as a flat .txt for the on-device (AX) pipeline.
if __name__ == "__main__":

    args = argparse.ArgumentParser()
    args.add_argument('--model_dir', type=str, default="scripts/CosyVoice-BlankEN/", help="tokenizer configuration directionary")
    args.add_argument('--wetext_dir', type=str, default="./pengzhendong/wetext", help="path to wetext")
    args.add_argument('--sample_rate', type=int, default=24000, help="Sampling rate for prompt audio")
    args.add_argument('--prompt_text', type=str, default="You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。", help="The text content of the prompt(reference) audio. Text or file path.")
    args.add_argument('--prompt_speech', type=str, default="asset/zero_shot_prompt.wav", help="The path to prompt(reference) audio.")
    args.add_argument('--output', type=str, default="prompt_files", help="Output data storage directory")
    args = args.parse_args()

    os.makedirs(args.output, exist_ok=True)

    # Frontend bundles the tokenizer, campplus speaker-embedding model and
    # the speech tokenizer (both ONNX). "all" presumably selects every
    # frontend stage — TODO confirm against CosyVoiceFrontEnd.
    frontend = CosyVoiceFrontEnd(f"{args.model_dir}",
                                 args.wetext_dir,
                                 "../frontend-onnx/campplus.onnx",
                                 "../frontend-onnx/speech_tokenizer_v3.onnx",
                                 f"{args.model_dir}/spk2info.pt",
                                 "all")

    # Speech tokenizer / speaker embedding operate on 16 kHz audio.
    prompt_speech_16k = load_wav(args.prompt_speech, 16000)
    zero_shot_spk_id = ""

    # --prompt_text may be literal text or a path to a text file.
    if os.path.isfile(args.prompt_text):
        with open(args.prompt_text, "r") as f:
            prompt_text = f.read()
    else:
        prompt_text = args.prompt_text
    print("prompt_text",prompt_text)
    model_input = frontend.process_prompt( prompt_text, prompt_speech_16k, args.sample_rate, zero_shot_spk_id)

    # model_input = {'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
    #                'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
    #                'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
    #                'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
    #                'llm_embedding': embedding, 'flow_embedding': embedding}
    print("prompt speech token size:", model_input["flow_prompt_speech_token"].shape)
    # The downstream flow model needs at least 75 speech tokens.
    # (NOTE(review): "bug get" in the message looks like a typo for "but get".)
    assert model_input["flow_prompt_speech_token"].shape[1] >=75, f"speech_token length should >= 75, bug get {model_input['flow_prompt_speech_token'].shape[1]}"
    for k, v in model_input.items():
        # *_len entries are scalar lengths; skip them.
        if "_len" in k:
            continue
        # NOTE(review): shape_str is computed but never used.
        shapes = [str(s) for s in v.shape]
        shape_str = "_".join(shapes)
        # Integer tensors are written with %d so they round-trip exactly.
        if v.dtype in (torch.int32, torch.int64):
            np.savetxt(f"{args.output}/{k}.txt", v.detach().cpu().numpy().reshape(-1), fmt="%d", delimiter=",")
        else:
            np.savetxt(f"{args.output}/{k}.txt", v.detach().cpu().numpy().reshape(-1), delimiter=",")
|
scripts/requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openai-whisper==20231117
|
| 2 |
+
transformers
|
| 3 |
+
gradio
|
| 4 |
+
onnxruntime
|
| 5 |
+
torch
|
| 6 |
+
torchaudio
|
| 7 |
+
inflect
|
| 8 |
+
wetext
|
scripts/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:747979631e813193436aabcff7c1c235d37de8097b71c563ec8b63b7a515c718
|
| 3 |
+
size 907395
|
scripts/tokenizer/tokenizer.py
ADDED
|
@@ -0,0 +1,327 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import os
|
| 3 |
+
from functools import lru_cache
|
| 4 |
+
from typing import Optional
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import AutoTokenizer
|
| 7 |
+
from whisper.tokenizer import Tokenizer
|
| 8 |
+
|
| 9 |
+
import tiktoken
|
| 10 |
+
|
| 11 |
+
# Whisper language codes plus CosyVoice-specific additions (yue/minnan/wuyu,
# "dialect", and the mixed zh/en markers). Keys are the codes registered as
# <|code|> special tokens in get_encoding.
LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
    "yue": "cantonese",
    "minnan": "minnan",
    "wuyu": "wuyu",
    "dialect": "dialect",
    "zh/en": "zh/en",
    "en/zh": "en/zh",
}

# language code lookup by name, with a few language aliases
TO_LANGUAGE_CODE = {
    **{language: code for code, language in LANGUAGES.items()},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
    "mandarin": "zh",
}

# Audio-event tags (registered as <|TAG|> special tokens); paired tags like
# Speech/"/Speech" mark event start/end.
AUDIO_EVENT = {
    "ASR": "ASR",
    "AED": "AED",
    "SER": "SER",
    "Speech": "Speech",
    "/Speech": "/Speech",
    "BGM": "BGM",
    "/BGM": "/BGM",
    "Laughter": "Laughter",
    "/Laughter": "/Laughter",
    "Applause": "Applause",
    "/Applause": "/Applause",
}

# Emotion tags registered as special tokens.
EMOTION = {
    "HAPPY": "HAPPY",
    "SAD": "SAD",
    "ANGRY": "ANGRY",
    "NEUTRAL": "NEUTRAL",
}

# TTS vocal-control tokens, including 13 speaker slots TTS/SP01..TTS/SP13.
TTS_Vocal_Token = {
    "TTS/B": "TTS/B",
    "TTS/O": "TTS/O",
    "TTS/Q": "TTS/Q",
    "TTS/A": "TTS/A",
    "TTS/CO": "TTS/CO",
    "TTS/CL": "TTS/CL",
    "TTS/H": "TTS/H",
    **{f"TTS/SP{i:02d}": f"TTS/SP{i:02d}" for i in range(1, 14)}
}
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
@lru_cache(maxsize=None)
def get_encoding(name: str = "gpt2", num_languages: int = 99):
    """Build the tiktoken Encoding for the whisper-style speech tokenizer.

    Loads the base64 BPE ranks from assets/<name>.tiktoken and registers the
    whisper special tokens (languages, audio events, emotions, timestamps
    <|0.00|>..<|30.00|>, ASR slots and TTS vocal tokens) after the base vocab.

    Args:
        name: basename of the .tiktoken vocab file under assets/.
        num_languages: how many entries of LANGUAGES get a <|lang|> token.

    Returns:
        A configured tiktoken.Encoding. Cached per (name, num_languages).
    """
    vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
    # Fix: the original `open(vocab_path)` was never closed; also `if line`
    # let blank "\n" lines through, which would crash the 2-tuple unpacking.
    with open(vocab_path, encoding="utf-8") as vocab_file:
        ranks = {
            base64.b64decode(token): int(rank)
            for token, rank in (line.split() for line in vocab_file if line.strip())
        }
    n_vocab = len(ranks)
    special_tokens = {}

    specials = [
        "<|endoftext|>",
        "<|startoftranscript|>",
        *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]],
        *[f"<|{audio_event}|>" for audio_event in list(AUDIO_EVENT.keys())],
        *[f"<|{emotion}|>" for emotion in list(EMOTION.keys())],
        "<|translate|>",
        "<|transcribe|>",
        "<|startoflm|>",
        "<|startofprev|>",
        "<|nospeech|>",
        "<|notimestamps|>",
        *[f"<|SPECIAL_TOKEN_{i}|>" for i in range(1, 31)],  # register special tokens for ASR
        *[f"<|{tts}|>" for tts in list(TTS_Vocal_Token.keys())],  # register special tokens for TTS
        *[f"<|{i * 0.02:.2f}|>" for i in range(1501)],  # timestamp tokens, 0.00..30.00 in 20 ms steps
    ]

    # Special tokens get ids directly after the base vocabulary.
    for token in specials:
        special_tokens[token] = n_vocab
        n_vocab += 1

    return tiktoken.Encoding(
        name=os.path.basename(vocab_path),
        explicit_n_vocab=n_vocab,
        pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        mergeable_ranks=ranks,
        special_tokens=special_tokens,
    )
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
@lru_cache(maxsize=None)
def get_tokenizer(
    multilingual: bool,
    *,
    num_languages: int = 99,
    language: Optional[str] = None,
    task: Optional[str] = None,  # Literal["transcribe", "translate", None]
) -> Tokenizer:
    """Return a whisper Tokenizer over the project's tiktoken encoding.

    Language names/aliases are normalized to codes via TO_LANGUAGE_CODE.
    Multilingual mode uses the custom zh/ja/yue vocab and defaults to
    english + transcribe; otherwise the plain gpt2 vocab with no
    language/task is used. Results are cached per argument combination.

    Raises:
        ValueError: if *language* is neither a known code nor alias.
    """
    if language is not None:
        language = language.lower()
        if language not in LANGUAGES:
            try:
                language = TO_LANGUAGE_CODE[language]
            except KeyError:
                raise ValueError(f"Unsupported language: {language}") from None

    if multilingual:
        encoding_name = "multilingual_zh_ja_yue_char_del"
        language = language or "en"
        task = task or "transcribe"
    else:
        encoding_name, language, task = "gpt2", None, None

    encoding = get_encoding(name=encoding_name, num_languages=num_languages)

    return Tokenizer(
        encoding=encoding, num_languages=num_languages, language=language, task=task
    )
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
class CosyVoice2Tokenizer():
    """Thin wrapper over a HuggingFace (Qwen) tokenizer that registers the
    CosyVoice2 paralinguistic special tokens ([breath], [laughter], ...)."""

    def __init__(self, token_path, skip_special_tokens=True):
        super().__init__()
        # NOTE: non-chat model, all these special tokens keep randomly initialized.
        special_tokens = {
            'eos_token': '<|endoftext|>',
            'pad_token': '<|endoftext|>',
            'additional_special_tokens': [
                '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
                '[breath]', '<strong>', '</strong>', '[noise]',
                '[laughter]', '[cough]', '[clucking]', '[accent]',
                '[quick_breath]',
                "<laughter>", "</laughter>",
                "[hissing]", "[sigh]", "[vocalized-noise]",
                "[lipsmack]", "[mn]"
            ]
        }
        self.special_tokens = special_tokens
        # token_path: HF model dir/name holding the base tokenizer files.
        self.tokenizer = AutoTokenizer.from_pretrained(token_path)
        self.tokenizer.add_special_tokens(special_tokens)
        # Whether decode() drops special tokens from the output text.
        self.skip_special_tokens = skip_special_tokens

    def encode(self, text, **kwargs):
        # Returns a plain list[int] of token ids for a single string.
        # NOTE(review): **kwargs is accepted but ignored.
        tokens = self.tokenizer([text], return_tensors="pt")
        tokens = tokens["input_ids"][0].cpu().tolist()
        return tokens

    def decode(self, tokens):
        # Accepts any int sequence; returns the decoded string.
        tokens = torch.tensor(tokens, dtype=torch.int64)
        text = self.tokenizer.batch_decode([tokens], skip_special_tokens=self.skip_special_tokens)[0]
        return text
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
class CosyVoice3Tokenizer(CosyVoice2Tokenizer):
    """CosyVoice3 variant: same wrapper as CosyVoice2Tokenizer but with a much
    larger special-token set covering ARPAbet phonemes ([AA], [CH], ...) and
    pinyin syllables with tone marks ([ià], [uǎng], ...)."""

    def __init__(self, token_path, skip_special_tokens=True):
        # NOTE: non-chat model, all these special tokens keep randomly initialized.
        special_tokens = {
            'eos_token': '<|endoftext|>',
            'pad_token': '<|endoftext|>',
            'additional_special_tokens': [
                '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
                '[breath]', '<strong>', '</strong>', '[noise]',
                '[laughter]', '[cough]', '[clucking]', '[accent]',
                '[quick_breath]',
                "<laughter>", "</laughter>",
                "[hissing]", "[sigh]", "[vocalized-noise]",
                "[lipsmack]", "[mn]", "<|endofsystem|>",
                "[AA]", "[AA0]", "[AA1]", "[AA2]", "[AE]", "[AE0]", "[AE1]", "[AE2]", "[AH]", "[AH0]", "[AH1]", "[AH2]",
                "[AO]", "[AO0]", "[AO1]", "[AO2]", "[AW]", "[AW0]", "[AW1]", "[AW2]", "[AY]", "[AY0]", "[AY1]", "[AY2]",
                "[B]", "[CH]", "[D]", "[DH]", "[EH]", "[EH0]", "[EH1]", "[EH2]", "[ER]", "[ER0]", "[ER1]", "[ER2]", "[EY]",
                "[EY0]", "[EY1]", "[EY2]", "[F]", "[G]", "[HH]", "[IH]", "[IH0]", "[IH1]", "[IH2]", "[IY]", "[IY0]", "[IY1]",
                "[IY2]", "[JH]", "[K]", "[L]", "[M]", "[N]", "[NG]", "[OW]", "[OW0]", "[OW1]", "[OW2]", "[OY]", "[OY0]",
                "[OY1]", "[OY2]", "[P]", "[R]", "[S]", "[SH]", "[T]", "[TH]", "[UH]", "[UH0]", "[UH1]", "[UH2]", "[UW]",
                "[UW0]", "[UW1]", "[UW2]", "[V]", "[W]", "[Y]", "[Z]", "[ZH]",
                "[a]", "[ai]", "[an]", "[ang]", "[ao]", "[b]", "[c]", "[ch]", "[d]", "[e]", "[ei]", "[en]", "[eng]", "[f]",
                "[g]", "[h]", "[i]", "[ian]", "[in]", "[ing]", "[iu]", "[ià]", "[iàn]", "[iàng]", "[iào]", "[iá]", "[ián]",
                "[iáng]", "[iáo]", "[iè]", "[ié]", "[iòng]", "[ióng]", "[iù]", "[iú]", "[iā]", "[iān]", "[iāng]", "[iāo]",
                "[iē]", "[iě]", "[iōng]", "[iū]", "[iǎ]", "[iǎn]", "[iǎng]", "[iǎo]", "[iǒng]", "[iǔ]", "[j]", "[k]", "[l]",
                "[m]", "[n]", "[o]", "[ong]", "[ou]", "[p]", "[q]", "[r]", "[s]", "[sh]", "[t]", "[u]", "[uang]", "[ue]",
                "[un]", "[uo]", "[uà]", "[uài]", "[uàn]", "[uàng]", "[uá]", "[uái]", "[uán]", "[uáng]", "[uè]", "[ué]", "[uì]",
                "[uí]", "[uò]", "[uó]", "[uā]", "[uāi]", "[uān]", "[uāng]", "[uē]", "[uě]", "[uī]", "[uō]", "[uǎ]", "[uǎi]",
                "[uǎn]", "[uǎng]", "[uǐ]", "[uǒ]", "[vè]", "[w]", "[x]", "[y]", "[z]", "[zh]", "[à]", "[ài]", "[àn]", "[àng]",
                # Bug fix: the "[áng]" entry below was mojibake ("[\ufffd\ufffdng]") in the
                # original; reconstructed from the tone sequence á/ái/án/áng/áo.
                "[ào]", "[á]", "[ái]", "[án]", "[áng]", "[áo]", "[è]", "[èi]", "[èn]", "[èng]", "[èr]", "[é]", "[éi]", "[én]",
                "[éng]", "[ér]", "[ì]", "[ìn]", "[ìng]", "[í]", "[ín]", "[íng]", "[ò]", "[òng]", "[òu]", "[ó]", "[óng]", "[óu]",
                "[ù]", "[ùn]", "[ú]", "[ún]", "[ā]", "[āi]", "[ān]", "[āng]", "[āo]", "[ē]", "[ēi]", "[ēn]", "[ēng]", "[ě]",
                "[ěi]", "[ěn]", "[ěng]", "[ěr]", "[ī]", "[īn]", "[īng]", "[ō]", "[ōng]", "[ōu]", "[ū]", "[ūn]", "[ǎ]", "[ǎi]",
                "[ǎn]", "[ǎng]", "[ǎo]", "[ǐ]", "[ǐn]", "[ǐng]", "[ǒ]", "[ǒng]", "[ǒu]", "[ǔ]", "[ǔn]", "[ǘ]", "[ǚ]", "[ǜ]"
            ]
        }
        self.special_tokens = special_tokens
        self.tokenizer = AutoTokenizer.from_pretrained(token_path)
        self.tokenizer.add_special_tokens(special_tokens)
        self.skip_special_tokens = skip_special_tokens
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
@lru_cache(maxsize=None)
def get_qwen_tokenizer(
    token_path: str,
    skip_special_tokens: bool,
    version: str = 'cosyvoice2'
):
    """Factory for the CosyVoice Qwen tokenizer wrapper, cached per arguments.

    Args:
        token_path: HF tokenizer directory or model name.
        skip_special_tokens: whether decode() drops special tokens.
        version: 'cosyvoice2' or 'cosyvoice3'.

    Raises:
        ValueError: if *version* is not one of the supported values.
    """
    if version == 'cosyvoice2':
        return CosyVoice2Tokenizer(token_path=token_path, skip_special_tokens=skip_special_tokens)
    elif version == 'cosyvoice3':
        return CosyVoice3Tokenizer(token_path=token_path, skip_special_tokens=skip_special_tokens)
    else:
        # Fix: the original raised a bare ValueError with no diagnostic.
        raise ValueError(f"unsupported tokenizer version: {version!r} (expected 'cosyvoice2' or 'cosyvoice3')")
|
token2wav-axmodels/flow.input_embedding.float16.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5a7ceb5ede1cac3bdcec37aa034a694821a735087890c2104da238bf1e921bc6
|
| 3 |
+
size 1049760
|