lihongjie commited on
Commit
08a04fb
·
1 Parent(s): 867fae1

first commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +55 -0
  2. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm.speech_embedding.float16.bin +3 -0
  3. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm_decoder.axmodel +3 -0
  4. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/model.embed_tokens.weight.bfloat16.bin +3 -0
  5. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l0_together.axmodel +3 -0
  6. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l10_together.axmodel +3 -0
  7. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l11_together.axmodel +3 -0
  8. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l12_together.axmodel +3 -0
  9. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l13_together.axmodel +3 -0
  10. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l14_together.axmodel +3 -0
  11. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l15_together.axmodel +3 -0
  12. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l16_together.axmodel +3 -0
  13. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l17_together.axmodel +3 -0
  14. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l18_together.axmodel +3 -0
  15. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l19_together.axmodel +3 -0
  16. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l1_together.axmodel +3 -0
  17. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l20_together.axmodel +3 -0
  18. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l21_together.axmodel +3 -0
  19. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l22_together.axmodel +3 -0
  20. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l23_together.axmodel +3 -0
  21. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l2_together.axmodel +3 -0
  22. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l3_together.axmodel +3 -0
  23. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l4_together.axmodel +3 -0
  24. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l5_together.axmodel +3 -0
  25. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l6_together.axmodel +3 -0
  26. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l7_together.axmodel +3 -0
  27. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l8_together.axmodel +3 -0
  28. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l9_together.axmodel +3 -0
  29. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_post.axmodel +3 -0
  30. README.md +227 -0
  31. asset/dingding.png +3 -0
  32. asset/output.wav +3 -0
  33. asset/zero_shot_prompt.wav +3 -0
  34. frontend-onnx/campplus.onnx +3 -0
  35. frontend-onnx/speech_tokenizer_v3.onnx +3 -0
  36. main_ax650 +3 -0
  37. run_ax650.sh +21 -0
  38. scripts/CosyVoice-BlankEN/merges.txt +0 -0
  39. scripts/CosyVoice-BlankEN/tokenizer_config.json +40 -0
  40. scripts/CosyVoice-BlankEN/vocab.json +0 -0
  41. scripts/audio.py +83 -0
  42. scripts/cosyvoice3_tokenizer.py +124 -0
  43. scripts/frontend.py +251 -0
  44. scripts/gradio_demo.py +161 -0
  45. scripts/meldataset.py +217 -0
  46. scripts/process_prompt.py +62 -0
  47. scripts/requirements.txt +8 -0
  48. scripts/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +3 -0
  49. scripts/tokenizer/tokenizer.py +327 -0
  50. token2wav-axmodels/flow.input_embedding.float16.bin +3 -0
.gitattributes CHANGED
@@ -33,3 +33,58 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l5_together.axmodel filter=lfs diff=lfs merge=lfs -text
37
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l8_together.axmodel filter=lfs diff=lfs merge=lfs -text
38
+ token2wav-axmodels/flow_estimator_200.axmodel filter=lfs diff=lfs merge=lfs -text
39
+ token2wav-axmodels/speech_window_2x8x480.txt filter=lfs diff=lfs merge=lfs -text
40
+ token2wav-axmodels/flow.input_embedding.float16.bin filter=lfs diff=lfs merge=lfs -text
41
+ token2wav-axmodels/rand_noise_1_80_300.txt filter=lfs diff=lfs merge=lfs -text
42
+ main_ax650 filter=lfs diff=lfs merge=lfs -text
43
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l14_together.axmodel filter=lfs diff=lfs merge=lfs -text
44
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l20_together.axmodel filter=lfs diff=lfs merge=lfs -text
45
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l22_together.axmodel filter=lfs diff=lfs merge=lfs -text
46
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l4_together.axmodel filter=lfs diff=lfs merge=lfs -text
47
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l3_together.axmodel filter=lfs diff=lfs merge=lfs -text
48
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_post.axmodel filter=lfs diff=lfs merge=lfs -text
49
+ frontend-onnx/speech_tokenizer_v3.onnx filter=lfs diff=lfs merge=lfs -text
50
+ token2wav-axmodels/hift_p1_100.axmodel filter=lfs diff=lfs merge=lfs -text
51
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/model.embed_tokens.weight.bfloat16.bin filter=lfs diff=lfs merge=lfs -text
52
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l0_together.axmodel filter=lfs diff=lfs merge=lfs -text
53
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l12_together.axmodel filter=lfs diff=lfs merge=lfs -text
54
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l23_together.axmodel filter=lfs diff=lfs merge=lfs -text
55
+ token2wav-axmodels/hift_p1_50.axmodel filter=lfs diff=lfs merge=lfs -text
56
+ token2wav-axmodels/hift_p2_100_final.axmodel filter=lfs diff=lfs merge=lfs -text
57
+ token2wav-axmodels/flow_encoder_50_final.axmodel filter=lfs diff=lfs merge=lfs -text
58
+ token2wav-axmodels/hift_p2_150.axmodel filter=lfs diff=lfs merge=lfs -text
59
+ token2wav-axmodels/hift_p2_50.axmodel filter=lfs diff=lfs merge=lfs -text
60
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l16_together.axmodel filter=lfs diff=lfs merge=lfs -text
61
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l1_together.axmodel filter=lfs diff=lfs merge=lfs -text
62
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l21_together.axmodel filter=lfs diff=lfs merge=lfs -text
63
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l6_together.axmodel filter=lfs diff=lfs merge=lfs -text
64
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l10_together.axmodel filter=lfs diff=lfs merge=lfs -text
65
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l13_together.axmodel filter=lfs diff=lfs merge=lfs -text
66
+ token2wav-axmodels/hift_p1_150.axmodel filter=lfs diff=lfs merge=lfs -text
67
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm_decoder.axmodel filter=lfs diff=lfs merge=lfs -text
68
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l17_together.axmodel filter=lfs diff=lfs merge=lfs -text
69
+ scripts/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken filter=lfs diff=lfs merge=lfs -text
70
+ frontend-onnx/campplus.onnx filter=lfs diff=lfs merge=lfs -text
71
+ token2wav-axmodels/flow_encoder_78.axmodel filter=lfs diff=lfs merge=lfs -text
72
+ token2wav-axmodels/flow_estimator_300.axmodel filter=lfs diff=lfs merge=lfs -text
73
+ token2wav-axmodels/flow.input_embedding.npy filter=lfs diff=lfs merge=lfs -text
74
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l15_together.axmodel filter=lfs diff=lfs merge=lfs -text
75
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l18_together.axmodel filter=lfs diff=lfs merge=lfs -text
76
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l19_together.axmodel filter=lfs diff=lfs merge=lfs -text
77
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l9_together.axmodel filter=lfs diff=lfs merge=lfs -text
78
+ token2wav-axmodels/hift_p1_100_final.axmodel filter=lfs diff=lfs merge=lfs -text
79
+ token2wav-axmodels/llm_decoder.axmodel filter=lfs diff=lfs merge=lfs -text
80
+ token2wav-axmodels/flow_encoder_28.axmodel filter=lfs diff=lfs merge=lfs -text
81
+ token2wav-axmodels/flow_encoder_53.axmodel filter=lfs diff=lfs merge=lfs -text
82
+ token2wav-axmodels/flow_estimator_250.axmodel filter=lfs diff=lfs merge=lfs -text
83
+ token2wav-axmodels/hift_p2_100.axmodel filter=lfs diff=lfs merge=lfs -text
84
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm.speech_embedding.float16.bin filter=lfs diff=lfs merge=lfs -text
85
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l11_together.axmodel filter=lfs diff=lfs merge=lfs -text
86
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l2_together.axmodel filter=lfs diff=lfs merge=lfs -text
87
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l7_together.axmodel filter=lfs diff=lfs merge=lfs -text
88
+ asset/zero_shot_prompt.wav filter=lfs diff=lfs merge=lfs -text
89
+ asset/dingding.png filter=lfs diff=lfs merge=lfs -text
90
+ asset/output.wav filter=lfs diff=lfs merge=lfs -text
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm.speech_embedding.float16.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54b7a3a7095c822489d43ee4b3a490606cfe9ad347d2c9e13c4581d8de0cfab8
3
+ size 12115712
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm_decoder.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a04dd04e4b5a95dca860ec1026c736e2aca158f45488969724e8a3f4a5682e85
3
+ size 6506188
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/model.embed_tokens.weight.bfloat16.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fa9b7fbc8ba5adbedbde9a6704ab2cbb73cdccc370a06b0be79086176179572
3
+ size 272269312
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l0_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81970ce053f1dbb0fb6db9f61be43f79c92eef326c08ac2d445e5979d78ed7ad
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l10_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00408e4cfaa0d7a0a7fdd6305ee7c66b967ac287eee14f3a6ba146d6426e9591
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l11_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4472b2f70c7960269663e981015d5f828c4d240b316cf00e1bf33ff678bbe92f
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l12_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c26672ead01ee8164b7405eb2d6c7fd410763ddcabfdd87ebdee6179ebd1f4b
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l13_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33916843b8c8a0a82a35b9f46ab893c3ab9d400660a7cac92958d305b6a1aea8
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l14_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e25ade425171c7139ce758936c9d95d13dbec129630a01c88171ee085dc71c75
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l15_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3dafa43dc2ec21423d4d9147d1bdd48572d09e1e355dc921f4be6569ce1b799c
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l16_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ed263e08194acc4867fa7ffd13ad2044b18be9f8fba1250f276ac23b200e0dd
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l17_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:926bfceb3b6fae9afd79b7f5babf2931cb44675077866b192af367d2fc3ad5b4
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l18_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a426acb4187d034fc9a30406386c0cd0f9271c0794fa66646349d1d19b4f53e
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l19_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:484147aaee42d3314c72b41790c96e633848f44175e37762e897515ba0c7e346
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l1_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d207d70025931a9da459d3fcf2469b2b744e6e7bc6173abf7c9f70a9df83d8d9
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l20_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:075b956543f4ee84bdc9e61cce69f7286c94d1ff03f25d89d997f1786616dee6
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l21_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16ac8a2cba87bdbe3c9ed55cf60f8e5e683f69cc8334e7cf9a7594b7a20a6157
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l22_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd7c709bb017122f4e052893cc0ec4d0f7f2bc6a15b0f87c488947d2fccb6565
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l23_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae7339df1ee72ca28b557b6933997cec517b1a38766ffbe706814e12cda4137d
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l2_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa85ea3722be6839f5a6c01485a276908327f215523a3c09bb8028ce652d96d5
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l3_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12c5faedfae53817cb67f868e00d98c80f688b89093b1c142732bc848811682e
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l4_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb9f5106bf2280d592e02913fa1c43f7ca1bebb66969436ab79dfc5a4b174749
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l5_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:beac97add03d3edf0055cce152990138b9bbc9dac9835a384c35cadc1510f2b6
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l6_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60b01ff651579615d6db2ba63001fbcd8c0b740d765ad0eb42dab8b5549538a4
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l7_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63f4e960aff4347ba663b1c17ff7cc2f399c663e06610dc35ac0e4d95909bd24
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l8_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:196b5a89d35e522141e2e8ca0eac44e5d0723b0e6c3e6dec39730a3c9367fabb
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l9_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c46c32e13e5048c236f83ed783fd79457f6cd2dd4b88984fa756be7c53c7b849
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_post.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:302e68e08caf347dd063947baff79b68ce5f60da0c0a493942aa51b914c0b7e0
3
+ size 147957518
README.md CHANGED
@@ -1,3 +1,230 @@
1
  ---
2
  license: mit
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: mit
3
+ language:
4
+ - en
5
+ - zh
6
+ base_model:
7
+ - CosyVoice3
8
+ pipeline_tag: text-to-speech
9
+ library_name: transformers
10
+ tags:
11
+ - CosyVoice3
12
+ - Speech
13
  ---
14
+
15
+ # CosyVoice3
16
+ This version of CosyVoice3 has been converted to run on the Axera NPU using **w8a16** quantization.
17
+ Compatible with Pulsar2 version: 4.2
18
+
19
+ ## Convert tools links:
20
+ For those who are interested in model conversion, you can try to export axmodel through the original repo :
21
+ [Cosyvoice](https://github.com/FunAudioLLM/CosyVoice)
22
+
23
+ [Pulsar2 Link, How to Convert LLM from Huggingface to axmodel](https://pulsar2-docs.readthedocs.io/en/latest/appendix/build_llm.html)
24
+
25
+ [AXera NPU HOST LLM Runtime](https://github.com/AXERA-TECH/CosyVoice3.Axera)
26
+
27
+ ## Support Platform
28
+
29
+ - AX650
30
+ - AX650N DEMO Board
31
+ - [M4N-Dock(爱芯派Pro)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)
32
+ - [M.2 Accelerator card](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html)
33
+
34
+ **Speech Generation**
35
+ | Stage | Time |
36
+ |------|------|
37
+ | llm prefill ( input_token_num + prompt_token_num 在 [0,128 ] ) | 104 ms |
38
+ | llm prefill ( input_token_num + prompt_token_num 在 [128,256 ] ) | 234 ms |
39
+ | Decode | 21.24 token/s |
40
+
41
+ ## How to use
42
+
43
+ Download all files from this repository to the device
44
+
45
+ ### 1. Prepare
46
+
47
+ #### 1.1 Copy this project to AX650 Board
48
+
49
+ #### 1.2 Prepare Dependencies
50
+
51
+ **Running HTTP Tokenizer Server** and **Processing Prompt Speech** require these Python packages. If you run these two steps on a PC, install them on the PC.
52
+ ```
53
+ pip3 install -r scripts/requirements.txt
54
+ ```
55
+
56
+ ### 2. Start HTTP Tokenizer Server
57
+ ```
58
+ cd scripts
59
+ python cosyvoice3_tokenizer.py --host {your host} --port {your port}
60
+ ```
61
+
62
+
63
+ ### 3. Run on Axera Device
64
+ There are 3 kinds of device: AX650 Board, AXCL aarch64 Board and AXCL x86 Board.
65
+
66
+ #### 3.1 Run on AX650 Board
67
+ 1) Modify the HTTP host in `run_ax650.sh`.
68
+
69
+ 2) Run `run_ax650.sh`
70
+ ```shell
71
+ root@ax650 ~/CosyVoice3 # bash run_ax650.sh
72
+ rm: cannot remove 'output*.wav': No such file or directory
73
+ [I][ Init][ 108]: LLM init start
74
+ [I][ Init][ 34]: connect http://10.122.86.184:12345 ok
75
+ bos_id: 0, eos_id: 1773
76
+ 7% | ███ | 2 / 27 [3.11s<42.04s, 0.64 count/s] embed_selector init ok[I][ Init][ 138]: attr.axmodel_num:24
77
+ 100% | ████████████████████████████████ | 27 / 27 [10.32s<10.32s, 2.62 count/s] init post axmodel ok,remain_cmm(7178 MB)
78
+ [I][ Init][ 216]: max_token_len : 1023
79
+ [I][ Init][ 221]: kv_cache_size : 128, kv_cache_num: 1023
80
+ [I][ Init][ 229]: prefill_token_num : 128
81
+ [I][ Init][ 233]: grp: 1, prefill_max_token_num : 1
82
+ [I][ Init][ 233]: grp: 2, prefill_max_token_num : 128
83
+ [I][ Init][ 233]: grp: 3, prefill_max_token_num : 256
84
+ [I][ Init][ 233]: grp: 4, prefill_max_token_num : 384
85
+ [I][ Init][ 233]: grp: 5, prefill_max_token_num : 512
86
+ [I][ Init][ 237]: prefill_max_token_num : 512
87
+ [I][ Init][ 249]: LLM init ok
88
+ [I][ Init][ 154]: Token2Wav init ok
89
+ [I][ main][ 273]:
90
+ [I][ Run][ 388]: input token num : 142, prefill_split_num : 2
91
+ [I][ Run][ 422]: input_num_token:128
92
+ [I][ Run][ 422]: input_num_token:14
93
+ [I][ Run][ 607]: ttft: 236.90 ms
94
+ [Main/Token2Wav Thread] Processing batch of 28 tokens...
95
+ Successfully saved audio to output_0.wav (32-bit Float PCM).
96
+ [Main/Token2Wav Thread] Processing batch of 53 tokens...
97
+ Successfully saved audio to output_1.wav (32-bit Float PCM).
98
+ [Main/Token2Wav Thread] Processing batch of 78 tokens...
99
+ Successfully saved audio to output_2.wav (32-bit Float PCM).
100
+ [Main/Token2Wav Thread] Processing batch of 78 tokens...
101
+ Successfully saved audio to output_3.wav (32-bit Float PCM).
102
+ [Main/Token2Wav Thread] Processing batch of 78 tokens...
103
+ Successfully saved audio to output_4.wav (32-bit Float PCM).
104
+ [Main/Token2Wav Thread] Processing batch of 78 tokens...
105
+ Successfully saved audio to output_5.wav (32-bit Float PCM).
106
+ [Main/Token2Wav Thread] Processing batch of 78 tokens...
107
+ Successfully saved audio to output_6.wav (32-bit Float PCM).
108
+ [Main/Token2Wav Thread] Processing batch of 78 tokens...
109
+ Successfully saved audio to output_7.wav (32-bit Float PCM).
110
+ [Main/Token2Wav Thread] Processing batch of 78 tokens...
111
+ Successfully saved audio to output_8.wav (32-bit Float PCM).
112
+ [Main/Token2Wav Thread] Processing batch of 78 tokens...
113
+ Successfully saved audio to output_9.wav (32-bit Float PCM).
114
+ [I][ Run][ 723]: hit eos, llm finished
115
+ [I][ Run][ 753]: llm finished
116
+ [Main/Token2Wav Thread] Buffer is empty and LLM finished. Exiting.
117
+
118
+
119
+ [I][ Run][ 758]: total decode tokens:271
120
+ [N][ Run][ 759]: hit eos,avg 21.47 token/s
121
+
122
+ Successfully saved audio to output_10.wav (32-bit Float PCM).
123
+ Successfully saved audio to output.wav (32-bit Float PCM).
124
+
125
+ Voice generation pipeline completed.
126
+ Type "q" to exit, Ctrl+c to stop current running
127
+ text >>
128
+ ```
129
+
130
+ Output Speech:
131
+ [output.wav](asset/output.wav)
132
+
133
+
134
+ #### Or run on AX650 Board with Gradio GUI
135
+ 1) Start server
136
+ ```
137
+ bash run_api_ax650.sh
138
+ ```
139
+ 2) Start Gradio GUI
140
+ ```
141
+ python scripts/gradio_demo.py
142
+ ```
143
+
144
+ #### 3.2 Run on AXCL aarch64 Board
145
+ ```
146
+ bash run_axcl_aarch64.sh
147
+ ```
148
+ #### Or run on AXCL aarch64 Board with Gradio GUI
149
+ 1) Start server
150
+ ```
151
+ bash run_api_axcl_aarch64.sh
152
+ ```
153
+ 2) Start Gradio GUI
154
+ ```
155
+ python scripts/gradio_demo.py
156
+ ```
157
+ 3) Open the page from a browser
158
+ The page URL is: `http://{your device ip}:7860`
159
+
160
+ Note that you need to run these two commands in the project root directory.
161
+
162
+ #### 3.3 Run on AXCL x86 Board
163
+ ```
164
+ bash run_axcl_x86.sh
165
+ ```
166
+ #### Or run on AXCL x86 Board with Gradio GUI
167
+ 1) Start server
168
+ ```
169
+ bash run_api_axcl_x86.sh
170
+ ```
171
+ 2) Start Gradio GUI
172
+ ```
173
+ python scripts/gradio_demo.py
174
+ ```
175
+ 3) Open the page from a browser
176
+ The page URL is: `http://{your device ip}:7860`
177
+
178
+ Note that you need to run these two commands in the project root directory.
179
+
180
+ ![](./gradio.png)
181
+
182
+ ### Optional. Process Prompt Speech
183
+ If you want to replicate a specific sound, do this step.
184
+ You can use audio in asset/ .
185
+
186
+ #### (1). Download wetext
187
+ ```
188
+ pip3 install modelscope
189
+ modelscope download --model pengzhendong/wetext --local_dir pengzhendong/wetext
190
+ ```
191
+
192
+ #### (2). Process Prompt Speech
193
+ Example:
194
+ ```
195
+ python3 scripts/process_prompt.py --prompt_text asset/zh_man1.txt --prompt_speech asset/zh_man1.wav --output zh_man1
196
+ ```
197
+
198
+ Pass parameters according to the actual situation.
199
+ ```
200
+ python3 scripts/process_prompt.py -h
201
+
202
+ usage: process_prompt.py [-h] [--model_dir MODEL_DIR] [--wetext_dir WETEXT_DIR] [--sample_rate SAMPLE_RATE] [--prompt_text PROMPT_TEXT] [--prompt_speech PROMPT_SPEECH]
203
+ [--output OUTPUT]
204
+
205
+ options:
206
+ -h, --help show this help message and exit
207
+ --model_dir MODEL_DIR
208
+ tokenizer configuration directory
209
+ --wetext_dir WETEXT_DIR
210
+ path to wetext
211
+ --sample_rate SAMPLE_RATE
212
+ Sampling rate for prompt audio
213
+ --prompt_text PROMPT_TEXT
214
+ The text content of the prompt(reference) audio. Text or file path.
215
+ --prompt_speech PROMPT_SPEECH
216
+ The path to prompt(reference) audio.
217
+ --output OUTPUT Output data storage directory
218
+ ```
219
+
220
+ After executing the above command, files like the following will be generated:
221
+ ```
222
+ flow_embedding.txt
223
+ flow_prompt_speech_token.txt
224
+ llm_embedding.txt
225
+ llm_prompt_speech_token.txt
226
+ prompt_speech_feat.txt
227
+ prompt_text.txt
228
+ ```
229
+
230
+ When you run run_ax650.sh, pass the output path here to the prompt_files parameter of the run_ax650.sh script.
asset/dingding.png ADDED

Git LFS Details

  • SHA256: 3870bb0a4e3df1f643e09c960b7e03d80da798509c86eaa326db205236b861d5
  • Pointer size: 130 Bytes
  • Size of remote file: 96.4 kB
asset/output.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c7a4c3837145df17e851c177f849446036e6f541d78eb6e107ea6b9e7b07672
3
+ size 1067564
asset/zero_shot_prompt.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd199eb7109fd6ce9943cb297e3cf350c1073af014063dfadbdc100230526243
3
+ size 111496
frontend-onnx/campplus.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
3
+ size 28303423
frontend-onnx/speech_tokenizer_v3.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23236a74175dbdda47afc66dbadd5bcb41303c467a57c261cb8539ad9db9208d
3
+ size 969451503
main_ax650 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e5192387e51f64ea8993eb9bc3e848092aa2f8ce7157891496b152149a42ed6
3
+ size 6647080
run_ax650.sh ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LLM_DIR=CosyVoice-BlankEN-Ax650-C64-P256-CTX512/
2
+ TOKEN2WAV_DIR=token2wav-axmodels/
3
+
4
+ ./main_ax650 \
5
+ --template_filename_axmodel "${LLM_DIR}/qwen2_p64_l%d_together.axmodel" \
6
+ --token2wav_axmodel_dir $TOKEN2WAV_DIR \
7
+ --n_timesteps 10 \
8
+ --axmodel_num 24 \
9
+ --bos 0 --eos 0 \
10
+ --filename_tokenizer_model "http://10.122.86.184:12345" \
11
+ --filename_post_axmodel "${LLM_DIR}/qwen2_post.axmodel" \
12
+ --filename_decoder_axmodel "${LLM_DIR}/llm_decoder.axmodel" \
13
+ --filename_tokens_embed "${LLM_DIR}/model.embed_tokens.weight.bfloat16.bin" \
14
+ --filename_llm_embed "${LLM_DIR}/llm.speech_embedding.float16.bin" \
15
+ --filename_speech_embed "${LLM_DIR}/llm.speech_embedding.float16.bin" \
16
+ --continue 0 \
17
+ --prompt_files prompt_files \
18
+ --text "高管也通过电话、短信、微信等方式对报道[j][ǐ]予好评。"
19
+
20
+
21
+ chmod 777 output.wav
scripts/CosyVoice-BlankEN/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
scripts/CosyVoice-BlankEN/tokenizer_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
30
+ "bos_token": null,
31
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "<|im_end|>",
34
+ "errors": "replace",
35
+ "model_max_length": 32768,
36
+ "pad_token": "<|endoftext|>",
37
+ "split_special_tokens": false,
38
+ "tokenizer_class": "Qwen2Tokenizer",
39
+ "unk_token": null
40
+ }
scripts/CosyVoice-BlankEN/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
scripts/audio.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.utils.data
4
+ from librosa.filters import mel as librosa_mel_fn
5
+ from scipy.io.wavfile import read
6
+
7
+ MAX_WAV_VALUE = 32768.0
8
+
9
+
10
+ def load_wav(full_path):
11
+ sampling_rate, data = read(full_path)
12
+ return data, sampling_rate
13
+
14
+
15
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
16
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
17
+
18
+
19
+ def dynamic_range_decompression(x, C=1):
20
+ return np.exp(x) / C
21
+
22
+
23
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
24
+ return torch.log(torch.clamp(x, min=clip_val) * C)
25
+
26
+
27
+ def dynamic_range_decompression_torch(x, C=1):
28
+ return torch.exp(x) / C
29
+
30
+
31
+ def spectral_normalize_torch(magnitudes):
32
+ output = dynamic_range_compression_torch(magnitudes)
33
+ return output
34
+
35
+
36
+ def spectral_de_normalize_torch(magnitudes):
37
+ output = dynamic_range_decompression_torch(magnitudes)
38
+ return output
39
+
40
+
41
+ mel_basis = {}
42
+ hann_window = {}
43
+
44
+
45
+ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
46
+ if torch.min(y) < -1.0:
47
+ print("min value is ", torch.min(y))
48
+ if torch.max(y) > 1.0:
49
+ print("max value is ", torch.max(y))
50
+
51
+ global mel_basis, hann_window # pylint: disable=global-statement
52
+ print("fmax",fmax)
53
+ if f"{str(fmax)}_{str(y.device)}" not in mel_basis:
54
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
55
+ mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device)
56
+ hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
57
+
58
+ y = torch.nn.functional.pad(
59
+ y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
60
+ )
61
+ y = y.squeeze(1)
62
+
63
+ spec = torch.view_as_real(
64
+ torch.stft(
65
+ y,
66
+ n_fft,
67
+ hop_length=hop_size,
68
+ win_length=win_size,
69
+ window=hann_window[str(y.device)],
70
+ center=center,
71
+ pad_mode="reflect",
72
+ normalized=False,
73
+ onesided=True,
74
+ return_complex=True,
75
+ )
76
+ )
77
+
78
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
79
+
80
+ spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec)
81
+ spec = spectral_normalize_torch(spec)
82
+
83
+ return spec
scripts/cosyvoice3_tokenizer.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, PreTrainedTokenizerFast
2
+ from http.server import HTTPServer, BaseHTTPRequestHandler
3
+ import json
4
+ import argparse
5
+ from tokenizer.tokenizer import get_qwen_tokenizer
6
+
7
+ class Tokenizer_Http():
8
+
9
+ def __init__(self):
10
+
11
+ self.tokenizer = get_qwen_tokenizer("CosyVoice-BlankEN/", True, "cosyvoice3")
12
+
13
+ def encode(self, prompt):
14
+
15
+ token_ids = self.tokenizer.encode(prompt, allowed_special="all")
16
+ return token_ids
17
+
18
+ def decode(self, token_ids):
19
+ return self.tokenizer.decode(token_ids)
20
+
21
+ # @property
22
+ # def bos_id(self):
23
+ # return self.tokenizer.bos_token_id
24
+
25
+ @property
26
+ def eos_id(self):
27
+ return 1773
28
+
29
+ # @property
30
+ # def bos_token(self):
31
+ # return self.tokenizer.bos_token
32
+
33
+ @property
34
+ def eos_token(self):
35
+ return "<|eot_id|>"
36
+
37
+
38
+ tokenizer = Tokenizer_Http()
39
+
40
+ # print(tokenizer.bos_id, tokenizer.bos_token, tokenizer.eos_id, tokenizer.eos_token)
41
+ print(tokenizer.encode("hello world"))
42
+
43
+
44
+ class Request(BaseHTTPRequestHandler):
45
+ #通过类继承,新定义类
46
+ timeout = 5
47
+ server_version = 'Apache'
48
+
49
+ def do_GET(self):
50
+ print(self.path)
51
+ #在新类中定义get的内容(当客户端向该服务端使用get请求时,本服务端将如下运行)
52
+ self.send_response(200)
53
+ self.send_header("type", "get") #设置响应头,可省略或设置多个
54
+ self.end_headers()
55
+
56
+ if self.path == '/bos_id':
57
+ bos_id = tokenizer.bos_id
58
+ # print(bos_id)
59
+ # to json
60
+ if bos_id is None:
61
+ msg = json.dumps({'bos_id': -1})
62
+ else:
63
+ msg = json.dumps({'bos_id': bos_id})
64
+ elif self.path == '/eos_id':
65
+ eos_id = tokenizer.eos_id
66
+ if eos_id is None:
67
+ msg = json.dumps({'eos_id': -1})
68
+ else:
69
+ msg = json.dumps({'eos_id': eos_id})
70
+ else:
71
+ msg = 'error'
72
+
73
+ print(msg)
74
+ msg = str(msg).encode() #转为str再转为byte格式
75
+
76
+ self.wfile.write(msg) #将byte格式的信息返回给客户端
77
+
78
+ def do_POST(self):
79
+ #在新类中定义post的内容(当客户端向该服务端使用post请求时,本服务端将如下运行)
80
+ data = self.rfile.read(int(
81
+ self.headers['content-length'])) #获取从客户端传入的参数(byte格式)
82
+ data = data.decode() #将byte格式转为str格式
83
+
84
+ self.send_response(200)
85
+ self.send_header("type", "post") #设置响应头,可省略或设置多个
86
+ self.end_headers()
87
+
88
+ if self.path == '/encode':
89
+ req = json.loads(data)
90
+ prompt = req['text']
91
+
92
+ token_ids = tokenizer.encode(prompt)
93
+ if token_ids is None:
94
+ msg = json.dumps({'token_ids': -1})
95
+ else:
96
+ msg = json.dumps({'token_ids': token_ids})
97
+
98
+ elif self.path == '/decode':
99
+ req = json.loads(data)
100
+ token_ids = req['token_ids']
101
+ text = tokenizer.decode(token_ids)
102
+ if text is None:
103
+ msg = json.dumps({'text': ""})
104
+ else:
105
+ msg = json.dumps({'text': text})
106
+ else:
107
+ msg = 'error'
108
+ print(msg)
109
+ msg = str(msg).encode() #转为str再转为byte格式
110
+
111
+ self.wfile.write(msg) #将byte格式的信息返回给客户端
112
+
113
+
114
+ if __name__ == "__main__":
115
+
116
+ args = argparse.ArgumentParser()
117
+ args.add_argument('--host', type=str, default='localhost')
118
+ args.add_argument('--port', type=int, default=12345)
119
+ args = args.parse_args()
120
+
121
+ host = (args.host, args.port) #设定地址与端口号,'localhost'等价于'127.0.0.1'
122
+ print('http://%s:%s' % host)
123
+ server = HTTPServer(host, Request) #根据地址端口号和新定义的类,创建服务器实例
124
+ server.serve_forever() #开启服务
scripts/frontend.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from functools import partial
15
+ from functools import lru_cache
16
+ from typing import Generator
17
+ import json
18
+ import onnxruntime
19
+ import torch
20
+ import numpy as np
21
+ import whisper
22
+ from typing import Callable
23
+ import torchaudio.compliance.kaldi as kaldi
24
+ import torchaudio
25
+ import os
26
+ import re
27
+ import inflect
28
+ from tokenizer.tokenizer import get_qwen_tokenizer
29
+ from audio import mel_spectrogram
30
+
31
+ try:
32
+ import ttsfrd
33
+ use_ttsfrd = True
34
+ except ImportError:
35
+
36
+ from wetext import Normalizer as ZhNormalizer
37
+ from wetext import Normalizer as EnNormalizer
38
+ use_ttsfrd = False
39
+
40
+ import logging
41
+ logging.getLogger('frontend').setLevel(logging.WARNING)
42
+ # logging.basicConfig(level=logging.DEBUG,
43
+ # format='%(asctime)s %(levelname)s %(message)s')
44
+
45
+ class CosyVoiceFrontEnd:
46
+
47
+ def __init__(self,
48
+ pretrained_path: str,
49
+ wetext_dir: str,
50
+ campplus_model: str,
51
+ speech_tokenizer_model: str,
52
+ spk2info: str = '',
53
+ allowed_special: str = 'all'):
54
+ self.tokenizer = get_qwen_tokenizer(pretrained_path, True)
55
+ self.feat_extractor = partial(
56
+ mel_spectrogram,
57
+ n_fft=1920,
58
+ num_mels=80,
59
+ sampling_rate=24000,
60
+ hop_size=480,
61
+ win_size=1920,
62
+ fmin=0,
63
+ fmax=8000,
64
+ center=False)
65
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
66
+ option = onnxruntime.SessionOptions()
67
+ option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
68
+ option.intra_op_num_threads = 1
69
+ self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
70
+ self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option,
71
+ providers=["CUDAExecutionProvider" if torch.cuda.is_available() else
72
+ "CPUExecutionProvider"])
73
+ if os.path.exists(spk2info):
74
+ self.spk2info = torch.load(spk2info, map_location=self.device)
75
+ else:
76
+ self.spk2info = {}
77
+ self.allowed_special = allowed_special
78
+ self.use_ttsfrd = use_ttsfrd
79
+ if self.use_ttsfrd:
80
+ self.frd = ttsfrd.TtsFrontendEngine()
81
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
82
+ assert self.frd.initialize('{}/../../pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR)) is True, \
83
+ 'failed to initialize ttsfrd resource'
84
+ self.frd.set_lang_type('pinyinvg')
85
+ else:
86
+ self.zh_tn_model = ZhNormalizer(remove_erhua=False, lang="zh")
87
+ self.en_tn_model = EnNormalizer(lang="zh")
88
+ self.inflect_parser = inflect.engine()
89
+
90
+ def _extract_text_token(self, text):
91
+ if isinstance(text, Generator):
92
+ logging.info('get tts_text generator, will return _extract_text_token_generator!')
93
+ # NOTE add a dummy text_token_len for compatibility
94
+ return self._extract_text_token_generator(text), torch.tensor([0], dtype=torch.int32).to(self.device)
95
+ else:
96
+ text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
97
+ text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
98
+ text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)
99
+ return text_token, text_token_len
100
+
101
+ def _extract_text_token_generator(self, text_generator):
102
+ for text in text_generator:
103
+ text_token, _ = self._extract_text_token(text)
104
+ for i in range(text_token.shape[1]):
105
+ yield text_token[:, i: i + 1]
106
+
107
+ def _extract_speech_token(self, speech):
108
+ assert speech.shape[1] / 16000 <= 30, 'do not support extract speech token for audio longer than 30s'
109
+ feat = whisper.log_mel_spectrogram(speech, n_mels=128)
110
+ speech_token = self.speech_tokenizer_session.run(None,
111
+ {self.speech_tokenizer_session.get_inputs()[0].name:
112
+ feat.detach().cpu().numpy(),
113
+ self.speech_tokenizer_session.get_inputs()[1].name:
114
+ np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
115
+ speech_token = torch.tensor([speech_token], dtype=torch.int32).to(self.device)
116
+ speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device)
117
+ return speech_token, speech_token_len
118
+
119
+ def _extract_spk_embedding(self, speech):
120
+ feat = kaldi.fbank(speech,
121
+ num_mel_bins=80,
122
+ dither=0,
123
+ sample_frequency=16000)
124
+ feat = feat - feat.mean(dim=0, keepdim=True)
125
+ embedding = self.campplus_session.run(None,
126
+ {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
127
+ embedding = torch.tensor([embedding]).to(self.device)
128
+ return embedding
129
+
130
+ def _extract_speech_feat(self, speech):
131
+ speech_feat = self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device)
132
+ speech_feat = speech_feat.unsqueeze(dim=0)
133
+ speech_feat_len = torch.tensor([speech_feat.shape[1]], dtype=torch.int32).to(self.device)
134
+ return speech_feat, speech_feat_len
135
+
136
+ def text_normalize(self, text, split=True, text_frontend=True):
137
+ if isinstance(text, Generator):
138
+ logging.info('get tts_text generator, will skip text_normalize!')
139
+ return [text]
140
+ if text_frontend is False or text == '':
141
+ return [text] if split is True else text
142
+ text = text.strip()
143
+ if self.use_ttsfrd:
144
+ texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
145
+ text = ''.join(texts)
146
+ else:
147
+ if contains_chinese(text):
148
+ text = self.zh_tn_model.normalize(text)
149
+ text = text.replace("\n", "")
150
+ text = replace_blank(text)
151
+ text = replace_corner_mark(text)
152
+ text = text.replace(".", "。")
153
+ text = text.replace(" - ", ",")
154
+ text = remove_bracket(text)
155
+ text = re.sub(r'[,,、]+$', '。', text)
156
+ texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80,
157
+ token_min_n=60, merge_len=20, comma_split=False))
158
+ else:
159
+ text = self.en_tn_model.normalize(text)
160
+ text = spell_out_number(text, self.inflect_parser)
161
+ texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
162
+ token_min_n=60, merge_len=20, comma_split=False))
163
+ texts = [i for i in texts if not is_only_punctuation(i)]
164
+ return texts if split is True else text
165
+
166
+ def frontend_sft(self, tts_text, spk_id):
167
+ tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
168
+ embedding = self.spk2info[spk_id]['embedding']
169
+ model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding}
170
+ return model_input
171
+
172
+ def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
173
+ tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
174
+ if zero_shot_spk_id == '':
175
+ prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
176
+ prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
177
+ speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
178
+ speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
179
+ if resample_rate == 24000:
180
+ # cosyvoice2, force speech_feat % speech_token = 2
181
+ token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
182
+ speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
183
+ speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
184
+ embedding = self._extract_spk_embedding(prompt_speech_16k)
185
+ model_input = {'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
186
+ 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
187
+ 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
188
+ 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
189
+ 'llm_embedding': embedding, 'flow_embedding': embedding}
190
+ else:
191
+ model_input = self.spk2info[zero_shot_spk_id]
192
+ model_input['text'] = tts_text_token
193
+ model_input['text_len'] = tts_text_token_len
194
+ return model_input
195
+
196
+ def process_prompt(self, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
197
+ if zero_shot_spk_id == '':
198
+ prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
199
+ prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
200
+ speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
201
+ speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
202
+ if resample_rate == 24000:
203
+ # cosyvoice2, force speech_feat % speech_token = 2
204
+ token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
205
+ speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
206
+ speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
207
+ embedding = self._extract_spk_embedding(prompt_speech_16k)
208
+ model_input = {'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
209
+ 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
210
+ 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
211
+ 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
212
+ 'llm_embedding': embedding, 'flow_embedding': embedding}
213
+ else:
214
+ model_input = self.spk2info[zero_shot_spk_id]
215
+ return model_input
216
+
217
+ def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
218
+ model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate, zero_shot_spk_id)
219
+ # in cross lingual mode, we remove prompt in llm
220
+ del model_input['prompt_text']
221
+ del model_input['prompt_text_len']
222
+ del model_input['llm_prompt_speech_token']
223
+ del model_input['llm_prompt_speech_token_len']
224
+ return model_input
225
+
226
+ def frontend_instruct(self, tts_text, spk_id, instruct_text):
227
+ model_input = self.frontend_sft(tts_text, spk_id)
228
+ # in instruct mode, we remove spk_embedding in llm due to information leakage
229
+ del model_input['llm_embedding']
230
+ instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
231
+ model_input['prompt_text'] = instruct_text_token
232
+ model_input['prompt_text_len'] = instruct_text_token_len
233
+ return model_input
234
+
235
+ def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
236
+ model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate, zero_shot_spk_id)
237
+ del model_input['llm_prompt_speech_token']
238
+ del model_input['llm_prompt_speech_token_len']
239
+ return model_input
240
+
241
+ def frontend_vc(self, source_speech_16k, prompt_speech_16k, resample_rate):
242
+ prompt_speech_token, prompt_speech_token_len = self._extract_speech_token(prompt_speech_16k)
243
+ prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
244
+ prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
245
+ embedding = self._extract_spk_embedding(prompt_speech_16k)
246
+ source_speech_token, source_speech_token_len = self._extract_speech_token(source_speech_16k)
247
+ model_input = {'source_speech_token': source_speech_token, 'source_speech_token_len': source_speech_token_len,
248
+ 'flow_prompt_speech_token': prompt_speech_token, 'flow_prompt_speech_token_len': prompt_speech_token_len,
249
+ 'prompt_speech_feat': prompt_speech_feat, 'prompt_speech_feat_len': prompt_speech_feat_len,
250
+ 'flow_embedding': embedding}
251
+ return model_input
scripts/gradio_demo.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import shutil
3
+ import gradio as gr
4
+ import numpy as np
5
+ import requests
6
+ import time
7
+ import os
8
+
9
+ import torch
10
+ from frontend import CosyVoiceFrontEnd
11
+ import torchaudio
12
+ import logging
13
+ logging.basicConfig(level=logging.WARNING)
14
+
15
+ import subprocess
16
+ import re
17
+
18
+ def get_all_local_ips():
19
+ result = subprocess.run(['ip', 'a'], capture_output=True, text=True)
20
+ output = result.stdout
21
+
22
+ # 匹配所有IPv4
23
+ ips = re.findall(r'inet (\d+\.\d+\.\d+\.\d+)', output)
24
+
25
+ # 过滤掉回环地址
26
+ real_ips = [ip for ip in ips if not ip.startswith('127.')]
27
+
28
+ return real_ips
29
+
30
+
31
+ TTS_URL = "http://0.0.0.0:12346/tts"
32
+ GET_URL = "http://0.0.0.0:12346/get"
33
+ TIMESTEPS_URL = "http://0.0.0.0:12346/timesteps"
34
+ PROMPT_FILES_URL = "http://0.0.0.0:12346/prompt_files"
35
+
36
+ args = argparse.ArgumentParser()
37
+ args.add_argument('--model_dir', type=str, default="scripts/CosyVoice-BlankEN", help="tokenizer configuration directionary")
38
+ args.add_argument('--wetext_dir', type=str, default="pengzhendong/wetext", help="path to wetext")
39
+ args.add_argument('--sample_rate', type=int, default=24000, help="Sampling rate for prompt audio")
40
+ args = args.parse_args()
41
+ frontend = CosyVoiceFrontEnd(f"{args.model_dir}",
42
+ args.wetext_dir,
43
+ "frontend-onnx/campplus.onnx",
44
+ "frontend-onnx/speech_tokenizer_v2.onnx",
45
+ f"{args.model_dir}/spk2info.pt",
46
+ "all")
47
+
48
+ def update_audio(audio_input_path, audio_text):
49
+ def load_wav(wav, target_sr):
50
+ speech, sample_rate = torchaudio.load(wav, backend='soundfile')
51
+ speech = speech.mean(dim=0, keepdim=True)
52
+ if sample_rate != target_sr:
53
+ assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
54
+ speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
55
+ return speech
56
+ output_dir = './output_temp'
57
+ # clear output_dir
58
+ if os.path.exists(output_dir):
59
+ shutil.rmtree(output_dir)
60
+ os.makedirs(output_dir, exist_ok=True)
61
+ zero_shot_spk_id = ""
62
+ prompt_speech_16k = load_wav(audio_input_path, 16000)
63
+ prompt_text = audio_text
64
+ print("prompt_text",prompt_text)
65
+ model_input = frontend.process_prompt( prompt_text, prompt_speech_16k, args.sample_rate, zero_shot_spk_id)
66
+ print("prompt speech token size:", model_input["flow_prompt_speech_token"].shape)
67
+ assert model_input["flow_prompt_speech_token"].shape[1] >=75, f"speech_token length should >= 75, bug get {model_input['flow_prompt_speech_token'].shape[1]}"
68
+ for k, v in model_input.items():
69
+ if "_len" in k:
70
+ continue
71
+ shapes = [str(s) for s in v.shape]
72
+ shape_str = "_".join(shapes)
73
+ if v.dtype in (torch.int32, torch.int64):
74
+ np.savetxt(f"{output_dir}/{k}.txt", v.detach().cpu().numpy().reshape(-1), fmt="%d", delimiter=",")
75
+ else:
76
+ np.savetxt(f"{output_dir}/{k}.txt", v.detach().cpu().numpy().reshape(-1), delimiter=",")
77
+
78
+ try:
79
+ r = requests.post(PROMPT_FILES_URL, json={"prompt_files": output_dir}, timeout=5)
80
+ if r.status_code != 200:
81
+ return None, "❌ TTS 请求失败"
82
+ except Exception as e:
83
+ return None, f"❌ TTS 请求异常: {e}"
84
+
85
+
86
+ def update_timesteps(timesteps):
87
+ try:
88
+ r = requests.post(TIMESTEPS_URL, json={"timesteps": timesteps}, timeout=5)
89
+ if r.status_code != 200:
90
+ return None, "❌ TTS 请求失败"
91
+ except Exception as e:
92
+ return None, f"❌ TTS 请求异常: {e}"
93
+
94
+ def run_tts(text):
95
+ # Step1: 提交 TTS 请求
96
+ try:
97
+ r = requests.post(TTS_URL, json={"text": text}, timeout=5)
98
+ if r.status_code != 200:
99
+ return None, "❌ TTS 请求失败"
100
+ except Exception as e:
101
+ return None, f"❌ TTS 请求异常: {e}"
102
+
103
+ # Step2: 循环调用 /get 获取进度
104
+ progress = gr.Progress()
105
+ wav_file = None
106
+ for i in range(100): # 最多尝试100次,避免死循环
107
+ time.sleep(0.5)
108
+ try:
109
+ resp = requests.post(GET_URL, data="", timeout=5).json()
110
+ except Exception as e:
111
+ return None, f"❌ GET 请求异常: {e}"
112
+
113
+ if resp.get("b_tts_runing", True):
114
+ progress(i / 100, desc="正在生成语音...")
115
+ else:
116
+ wav_file = resp.get("wav_file")
117
+ break
118
+
119
+ if not wav_file or not os.path.exists(wav_file):
120
+ return None, "❌ 语音文件未生成"
121
+
122
+ return wav_file, "✅ 生成完成"
123
+
124
+
125
+ with gr.Blocks() as demo:
126
+ gr.Markdown("### 🎙️ AXERA CosyVoice2 Demo")
127
+
128
+ with gr.Row():
129
+ with gr.Column():
130
+ audio_input = gr.Audio(label="输入音频", type="filepath")
131
+ with gr.Column():
132
+ audio_text = gr.Textbox(label="音频文本(自己改一下或者照着念)", value="锄禾日当午,汗滴禾下土。")
133
+ btn_update = gr.Button("更新音源")
134
+
135
+
136
+ with gr.Row():
137
+ text_input = gr.Textbox(value="琦琦,麻烦你适配一下这个新的模型吧。", label="输入文本")
138
+ with gr.Column():
139
+ timesteps = gr.Slider(minimum=4, maximum=30, value=7, step=1, label="Timesteps")
140
+ run_btn = gr.Button("生成语音")
141
+
142
+ status = gr.Label(label="状态")
143
+ audio_out = gr.Audio(label="生成结果", type="filepath")
144
+
145
+ run_btn.click(fn=run_tts, inputs=[text_input], outputs=[audio_out, status])
146
+ timesteps.change(fn=update_timesteps, inputs=timesteps)
147
+
148
+ btn_update.click(fn=update_audio, inputs=[audio_input, audio_text])
149
+
150
+ ips = get_all_local_ips()
151
+ for ip in ips:
152
+ print(f"* Running on local URL: https://{ip}:7860")
153
+
154
+
155
+ demo.launch(
156
+ server_name="0.0.0.0",
157
+ server_port=7860,
158
+ ssl_certfile="./server.crt",
159
+ ssl_keyfile="./server.key",
160
+ ssl_verify=False
161
+ )
scripts/meldataset.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/jik876/hifi-gan """
2
+
3
+ import math
4
+ import os
5
+ import random
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.utils.data
10
+ from librosa.filters import mel as librosa_mel_fn
11
+ from librosa.util import normalize
12
+ from scipy.io.wavfile import read
13
+
14
+ MAX_WAV_VALUE = 32768.0
15
+
16
+
17
+ def load_wav(full_path):
18
+ sampling_rate, data = read(full_path)
19
+ return data, sampling_rate
20
+
21
+
22
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
23
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
24
+
25
+
26
+ def dynamic_range_decompression(x, C=1):
27
+ return np.exp(x) / C
28
+
29
+
30
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
31
+ return torch.log(torch.clamp(x, min=clip_val) * C)
32
+
33
+
34
+ def dynamic_range_decompression_torch(x, C=1):
35
+ return torch.exp(x) / C
36
+
37
+
38
+ def spectral_normalize_torch(magnitudes):
39
+ output = dynamic_range_compression_torch(magnitudes)
40
+ return output
41
+
42
+
43
+ def spectral_de_normalize_torch(magnitudes):
44
+ output = dynamic_range_decompression_torch(magnitudes)
45
+ return output
46
+
47
+
48
+ mel_basis = {}
49
+ hann_window = {}
50
+
51
+
52
+ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
53
+ if torch.min(y) < -1.0:
54
+ print("min value is ", torch.min(y))
55
+ if torch.max(y) > 1.0:
56
+ print("max value is ", torch.max(y))
57
+
58
+ global mel_basis, hann_window # pylint: disable=global-statement
59
+ if fmax not in mel_basis:
60
+ mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
61
+ mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device)
62
+ hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
63
+
64
+ y = torch.nn.functional.pad(
65
+ y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
66
+ )
67
+ y = y.squeeze(1)
68
+
69
+ spec = torch.view_as_real(
70
+ torch.stft(
71
+ y,
72
+ n_fft,
73
+ hop_length=hop_size,
74
+ win_length=win_size,
75
+ window=hann_window[str(y.device)],
76
+ center=center,
77
+ pad_mode="reflect",
78
+ normalized=False,
79
+ onesided=True,
80
+ return_complex=True,
81
+ )
82
+ )
83
+
84
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
85
+
86
+ spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec)
87
+ spec = spectral_normalize_torch(spec)
88
+
89
+ return spec
90
+
91
+
92
+ def get_dataset_filelist(a):
93
+ with open(a.input_training_file, encoding="utf-8") as fi:
94
+ training_files = [
95
+ os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0
96
+ ]
97
+
98
+ with open(a.input_validation_file, encoding="utf-8") as fi:
99
+ validation_files = [
100
+ os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0
101
+ ]
102
+ return training_files, validation_files
103
+
104
+
105
+ class MelDataset(torch.utils.data.Dataset):
106
+ def __init__(
107
+ self,
108
+ training_files,
109
+ segment_size,
110
+ n_fft,
111
+ num_mels,
112
+ hop_size,
113
+ win_size,
114
+ sampling_rate,
115
+ fmin,
116
+ fmax,
117
+ split=True,
118
+ shuffle=True,
119
+ n_cache_reuse=1,
120
+ device=None,
121
+ fmax_loss=None,
122
+ fine_tuning=False,
123
+ base_mels_path=None,
124
+ ):
125
+ self.audio_files = training_files
126
+ random.seed(1234)
127
+ if shuffle:
128
+ random.shuffle(self.audio_files)
129
+ self.segment_size = segment_size
130
+ self.sampling_rate = sampling_rate
131
+ self.split = split
132
+ self.n_fft = n_fft
133
+ self.num_mels = num_mels
134
+ self.hop_size = hop_size
135
+ self.win_size = win_size
136
+ self.fmin = fmin
137
+ self.fmax = fmax
138
+ self.fmax_loss = fmax_loss
139
+ self.cached_wav = None
140
+ self.n_cache_reuse = n_cache_reuse
141
+ self._cache_ref_count = 0
142
+ self.device = device
143
+ self.fine_tuning = fine_tuning
144
+ self.base_mels_path = base_mels_path
145
+
146
+ def __getitem__(self, index):
147
+ filename = self.audio_files[index]
148
+ if self._cache_ref_count == 0:
149
+ audio, sampling_rate = load_wav(filename)
150
+ audio = audio / MAX_WAV_VALUE
151
+ if not self.fine_tuning:
152
+ audio = normalize(audio) * 0.95
153
+ self.cached_wav = audio
154
+ if sampling_rate != self.sampling_rate:
155
+ raise ValueError(f"{sampling_rate} SR doesn't match target {self.sampling_rate} SR")
156
+ self._cache_ref_count = self.n_cache_reuse
157
+ else:
158
+ audio = self.cached_wav
159
+ self._cache_ref_count -= 1
160
+
161
+ audio = torch.FloatTensor(audio)
162
+ audio = audio.unsqueeze(0)
163
+
164
+ if not self.fine_tuning:
165
+ if self.split:
166
+ if audio.size(1) >= self.segment_size:
167
+ max_audio_start = audio.size(1) - self.segment_size
168
+ audio_start = random.randint(0, max_audio_start)
169
+ audio = audio[:, audio_start : audio_start + self.segment_size]
170
+ else:
171
+ audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant")
172
+
173
+ mel = mel_spectrogram(
174
+ audio,
175
+ self.n_fft,
176
+ self.num_mels,
177
+ self.sampling_rate,
178
+ self.hop_size,
179
+ self.win_size,
180
+ self.fmin,
181
+ self.fmax,
182
+ center=False,
183
+ )
184
+ else:
185
+ mel = np.load(os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + ".npy"))
186
+ mel = torch.from_numpy(mel)
187
+
188
+ if len(mel.shape) < 3:
189
+ mel = mel.unsqueeze(0)
190
+
191
+ if self.split:
192
+ frames_per_seg = math.ceil(self.segment_size / self.hop_size)
193
+
194
+ if audio.size(1) >= self.segment_size:
195
+ mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
196
+ mel = mel[:, :, mel_start : mel_start + frames_per_seg]
197
+ audio = audio[:, mel_start * self.hop_size : (mel_start + frames_per_seg) * self.hop_size]
198
+ else:
199
+ mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), "constant")
200
+ audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant")
201
+
202
+ mel_loss = mel_spectrogram(
203
+ audio,
204
+ self.n_fft,
205
+ self.num_mels,
206
+ self.sampling_rate,
207
+ self.hop_size,
208
+ self.win_size,
209
+ self.fmin,
210
+ self.fmax_loss,
211
+ center=False,
212
+ )
213
+
214
+ return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
215
+
216
+ def __len__(self):
217
+ return len(self.audio_files)
scripts/process_prompt.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import torch
4
+ import torchaudio
5
+ import numpy as np
6
+ from frontend import CosyVoiceFrontEnd
7
+
8
+ def load_wav(wav, target_sr, min_sr=16000):
9
+ speech, sample_rate = torchaudio.load(wav, backend='soundfile')
10
+ speech = speech.mean(dim=0, keepdim=True)
11
+ if sample_rate != target_sr:
12
+ assert sample_rate >= min_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
13
+ speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
14
+ return speech
15
+
16
+ if __name__ == "__main__":
17
+
18
+ args = argparse.ArgumentParser()
19
+ args.add_argument('--model_dir', type=str, default="scripts/CosyVoice-BlankEN/", help="tokenizer configuration directionary")
20
+ args.add_argument('--wetext_dir', type=str, default="./pengzhendong/wetext", help="path to wetext")
21
+ args.add_argument('--sample_rate', type=int, default=24000, help="Sampling rate for prompt audio")
22
+ args.add_argument('--prompt_text', type=str, default="You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。", help="The text content of the prompt(reference) audio. Text or file path.")
23
+ args.add_argument('--prompt_speech', type=str, default="asset/zero_shot_prompt.wav", help="The path to prompt(reference) audio.")
24
+ args.add_argument('--output', type=str, default="prompt_files", help="Output data storage directory")
25
+ args = args.parse_args()
26
+
27
+ os.makedirs(args.output, exist_ok=True)
28
+
29
+ frontend = CosyVoiceFrontEnd(f"{args.model_dir}",
30
+ args.wetext_dir,
31
+ "../frontend-onnx/campplus.onnx",
32
+ "../frontend-onnx/speech_tokenizer_v3.onnx",
33
+ f"{args.model_dir}/spk2info.pt",
34
+ "all")
35
+
36
+ prompt_speech_16k = load_wav(args.prompt_speech, 16000)
37
+ zero_shot_spk_id = ""
38
+
39
+ if os.path.isfile(args.prompt_text):
40
+ with open(args.prompt_text, "r") as f:
41
+ prompt_text = f.read()
42
+ else:
43
+ prompt_text = args.prompt_text
44
+ print("prompt_text",prompt_text)
45
+ model_input = frontend.process_prompt( prompt_text, prompt_speech_16k, args.sample_rate, zero_shot_spk_id)
46
+
47
+ # model_input = {'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
48
+ # 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
49
+ # 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
50
+ # 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
51
+ # 'llm_embedding': embedding, 'flow_embedding': embedding}
52
+ print("prompt speech token size:", model_input["flow_prompt_speech_token"].shape)
53
+ assert model_input["flow_prompt_speech_token"].shape[1] >=75, f"speech_token length should >= 75, bug get {model_input['flow_prompt_speech_token'].shape[1]}"
54
+ for k, v in model_input.items():
55
+ if "_len" in k:
56
+ continue
57
+ shapes = [str(s) for s in v.shape]
58
+ shape_str = "_".join(shapes)
59
+ if v.dtype in (torch.int32, torch.int64):
60
+ np.savetxt(f"{args.output}/{k}.txt", v.detach().cpu().numpy().reshape(-1), fmt="%d", delimiter=",")
61
+ else:
62
+ np.savetxt(f"{args.output}/{k}.txt", v.detach().cpu().numpy().reshape(-1), delimiter=",")
scripts/requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ openai-whisper==20231117
2
+ transformers
3
+ gradio
4
+ onnxruntime
5
+ torch
6
+ torchaudio
7
+ inflect
8
+ wetext
scripts/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:747979631e813193436aabcff7c1c235d37de8097b71c563ec8b63b7a515c718
3
+ size 907395
scripts/tokenizer/tokenizer.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
+ from functools import lru_cache
4
+ from typing import Optional
5
+ import torch
6
+ from transformers import AutoTokenizer
7
+ from whisper.tokenizer import Tokenizer
8
+
9
+ import tiktoken
10
+
11
# Whisper-style language table: ISO-639-1(-ish) code -> lowercase English
# name.  Extends the upstream whisper list with project-specific entries
# ("minnan", "wuyu", "dialect") and mixed-code tags ("zh/en", "en/zh").
# The first `num_languages` keys, in insertion order, are registered as
# <|code|> special tokens by get_encoding() below.
LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
    "yue": "cantonese",
    "minnan": "minnan",
    "wuyu": "wuyu",
    "dialect": "dialect",
    "zh/en": "zh/en",
    "en/zh": "en/zh",
}

# language code lookup by name, with a few language aliases
# (inverse of LANGUAGES plus alternative names, e.g. "mandarin" -> "zh").
TO_LANGUAGE_CODE = {
    **{language: code for code, language in LANGUAGES.items()},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
    "mandarin": "zh",
}

# Audio-event tags registered as <|tag|> special tokens by get_encoding();
# several come in open/close pairs (e.g. "Speech" / "/Speech").
AUDIO_EVENT = {
    "ASR": "ASR",
    "AED": "AED",
    "SER": "SER",
    "Speech": "Speech",
    "/Speech": "/Speech",
    "BGM": "BGM",
    "/BGM": "/BGM",
    "Laughter": "Laughter",
    "/Laughter": "/Laughter",
    "Applause": "Applause",
    "/Applause": "/Applause",
}

# Emotion tags registered as <|TAG|> special tokens by get_encoding().
EMOTION = {
    "HAPPY": "HAPPY",
    "SAD": "SAD",
    "ANGRY": "ANGRY",
    "NEUTRAL": "NEUTRAL",
}

# TTS control tokens registered as <|TTS/x|> special tokens; the generated
# TTS/SP01 .. TTS/SP13 entries are numbered SP tags — presumably
# speaker/style selectors, confirm against the model card.
TTS_Vocal_Token = {
    "TTS/B": "TTS/B",
    "TTS/O": "TTS/O",
    "TTS/Q": "TTS/Q",
    "TTS/A": "TTS/A",
    "TTS/CO": "TTS/CO",
    "TTS/CL": "TTS/CL",
    "TTS/H": "TTS/H",
    **{f"TTS/SP{i:02d}": f"TTS/SP{i:02d}" for i in range(1, 14)}
}
167
+
168
+
169
@lru_cache(maxsize=None)
def get_encoding(name: str = "gpt2", num_languages: int = 99):
    """Build (and cache) the tiktoken Encoding for a bundled vocab file.

    Args:
        name: base name of the ``.tiktoken`` vocab file under ``assets/``.
        num_languages: number of entries from LANGUAGES (in insertion order)
            to register as ``<|code|>`` special tokens.

    Returns:
        A ``tiktoken.Encoding`` holding the base BPE ranks plus the project's
        special tokens (language/audio-event/emotion/TTS tags, timestamps).
    """
    vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
    # Each vocab line is "<base64 token> <rank>".  Use a context manager so
    # the file handle is closed deterministically, and skip blank lines
    # explicitly: the previous `if line` filter was ineffective because a
    # blank line still contains its newline (truthy) and would crash the
    # two-value unpack.
    ranks = {}
    with open(vocab_path) as vocab_file:
        for line in vocab_file:
            fields = line.split()
            if not fields:
                continue
            token, rank = fields
            ranks[base64.b64decode(token)] = int(rank)
    n_vocab = len(ranks)
    special_tokens = {}

    specials = [
        "<|endoftext|>",
        "<|startoftranscript|>",
        *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]],
        *[f"<|{audio_event}|>" for audio_event in list(AUDIO_EVENT.keys())],
        *[f"<|{emotion}|>" for emotion in list(EMOTION.keys())],
        "<|translate|>",
        "<|transcribe|>",
        "<|startoflm|>",
        "<|startofprev|>",
        "<|nospeech|>",
        "<|notimestamps|>",
        *[f"<|SPECIAL_TOKEN_{i}|>" for i in range(1, 31)],  # register special tokens for ASR
        *[f"<|{tts}|>" for tts in list(TTS_Vocal_Token.keys())],  # register special tokens for TTS
        *[f"<|{i * 0.02:.2f}|>" for i in range(1501)],  # timestamps <|0.00|> .. <|30.00|>
    ]

    # Special tokens are appended after the base vocab, each taking the next id.
    for token in specials:
        special_tokens[token] = n_vocab
        n_vocab += 1

    return tiktoken.Encoding(
        name=os.path.basename(vocab_path),
        explicit_n_vocab=n_vocab,
        pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        mergeable_ranks=ranks,
        special_tokens=special_tokens,
    )
207
+
208
+
209
@lru_cache(maxsize=None)
def get_tokenizer(
    multilingual: bool,
    *,
    num_languages: int = 99,
    language: Optional[str] = None,
    task: Optional[str] = None,  # Literal["transcribe", "translate", None]
) -> Tokenizer:
    """Return a (cached) whisper Tokenizer for the requested configuration.

    Args:
        multilingual: if True use the "multilingual_zh_ja_yue_char_del"
            vocab; otherwise the monolingual "gpt2" vocab.
        num_languages: how many LANGUAGES entries become special tokens.
        language: language code or full name/alias (normalized through
            TO_LANGUAGE_CODE); ignored for the monolingual vocab.
        task: task tag, defaulting to "transcribe" for the multilingual
            vocab; ignored for the monolingual one.

    Raises:
        ValueError: if `language` is neither a known code nor a known alias.
    """
    if language is not None:
        language = language.lower()
        if language not in LANGUAGES:
            # accept full names and aliases, e.g. "mandarin" -> "zh"
            if language not in TO_LANGUAGE_CODE:
                raise ValueError(f"Unsupported language: {language}")
            language = TO_LANGUAGE_CODE[language]

    if multilingual:
        encoding_name = "multilingual_zh_ja_yue_char_del"
        language = language or "en"
        task = task or "transcribe"
    else:
        # monolingual English vocab: language/task tags do not apply
        encoding_name, language, task = "gpt2", None, None

    encoding = get_encoding(name=encoding_name, num_languages=num_languages)

    return Tokenizer(
        encoding=encoding, num_languages=num_languages, language=language, task=task
    )
239
+
240
+
241
class CosyVoice2Tokenizer():
    """Thin wrapper around a Qwen AutoTokenizer for CosyVoice 2.

    Registers the CosyVoice paralinguistic control markers (e.g. [breath],
    [laughter]) as additional special tokens and exposes a plain
    list-of-ints encode / decode interface.
    """

    def __init__(self, token_path, skip_special_tokens=True):
        super().__init__()
        # NOTE: non-chat model, all these special tokens keep randomly initialized.
        special_tokens = {
            'eos_token': '<|endoftext|>',
            'pad_token': '<|endoftext|>',
            'additional_special_tokens': [
                '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
                '[breath]', '<strong>', '</strong>', '[noise]',
                '[laughter]', '[cough]', '[clucking]', '[accent]',
                '[quick_breath]',
                "<laughter>", "</laughter>",
                "[hissing]", "[sigh]", "[vocalized-noise]",
                "[lipsmack]", "[mn]"
            ]
        }
        self.special_tokens = special_tokens
        self.tokenizer = AutoTokenizer.from_pretrained(token_path)
        self.tokenizer.add_special_tokens(special_tokens)
        self.skip_special_tokens = skip_special_tokens

    def encode(self, text, **kwargs):
        """Tokenize `text`, returning its token ids as a plain Python list."""
        batch = self.tokenizer([text], return_tensors="pt")
        return batch["input_ids"][0].cpu().tolist()

    def decode(self, tokens):
        """Decode a list of token ids back into a string."""
        ids = torch.tensor(tokens, dtype=torch.int64)
        return self.tokenizer.batch_decode(
            [ids], skip_special_tokens=self.skip_special_tokens)[0]
272
+
273
+
274
class CosyVoice3Tokenizer(CosyVoice2Tokenizer):
    """Qwen tokenizer wrapper for CosyVoice 3.

    Same encode/decode contract as CosyVoice2Tokenizer, but registers a much
    larger special-token set: besides the CosyVoice2 control markers it adds
    "<|endofsystem|>", an ARPAbet phone set (e.g. "[AA1]", "[ZH]") and a
    pinyin initial/final set with tone-marked vowels (e.g. "[zh]", "[iǎng]").
    """

    def __init__(self, token_path, skip_special_tokens=True):
        # NOTE: non-chat model, all these special tokens keep randomly initialized.
        special_tokens = {
            'eos_token': '<|endoftext|>',
            'pad_token': '<|endoftext|>',
            'additional_special_tokens': [
                '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
                '[breath]', '<strong>', '</strong>', '[noise]',
                '[laughter]', '[cough]', '[clucking]', '[accent]',
                '[quick_breath]',
                "<laughter>", "</laughter>",
                "[hissing]", "[sigh]", "[vocalized-noise]",
                "[lipsmack]", "[mn]", "<|endofsystem|>",
                "[AA]", "[AA0]", "[AA1]", "[AA2]", "[AE]", "[AE0]", "[AE1]", "[AE2]", "[AH]", "[AH0]", "[AH1]", "[AH2]",
                "[AO]", "[AO0]", "[AO1]", "[AO2]", "[AW]", "[AW0]", "[AW1]", "[AW2]", "[AY]", "[AY0]", "[AY1]", "[AY2]",
                "[B]", "[CH]", "[D]", "[DH]", "[EH]", "[EH0]", "[EH1]", "[EH2]", "[ER]", "[ER0]", "[ER1]", "[ER2]", "[EY]",
                "[EY0]", "[EY1]", "[EY2]", "[F]", "[G]", "[HH]", "[IH]", "[IH0]", "[IH1]", "[IH2]", "[IY]", "[IY0]", "[IY1]",
                "[IY2]", "[JH]", "[K]", "[L]", "[M]", "[N]", "[NG]", "[OW]", "[OW0]", "[OW1]", "[OW2]", "[OY]", "[OY0]",
                "[OY1]", "[OY2]", "[P]", "[R]", "[S]", "[SH]", "[T]", "[TH]", "[UH]", "[UH0]", "[UH1]", "[UH2]", "[UW]",
                "[UW0]", "[UW1]", "[UW2]", "[V]", "[W]", "[Y]", "[Z]", "[ZH]",
                "[a]", "[ai]", "[an]", "[ang]", "[ao]", "[b]", "[c]", "[ch]", "[d]", "[e]", "[ei]", "[en]", "[eng]", "[f]",
                "[g]", "[h]", "[i]", "[ian]", "[in]", "[ing]", "[iu]", "[ià]", "[iàn]", "[iàng]", "[iào]", "[iá]", "[ián]",
                "[iáng]", "[iáo]", "[iè]", "[ié]", "[iòng]", "[ióng]", "[iù]", "[iú]", "[iā]", "[iān]", "[iāng]", "[iāo]",
                "[iē]", "[iě]", "[iōng]", "[iū]", "[iǎ]", "[iǎn]", "[iǎng]", "[iǎo]", "[iǒng]", "[iǔ]", "[j]", "[k]", "[l]",
                "[m]", "[n]", "[o]", "[ong]", "[ou]", "[p]", "[q]", "[r]", "[s]", "[sh]", "[t]", "[u]", "[uang]", "[ue]",
                "[un]", "[uo]", "[uà]", "[uài]", "[uàn]", "[uàng]", "[uá]", "[uái]", "[uán]", "[uáng]", "[uè]", "[ué]", "[uì]",
                "[uí]", "[uò]", "[uó]", "[uā]", "[uāi]", "[uān]", "[uāng]", "[uē]", "[uě]", "[uī]", "[uō]", "[uǎ]", "[uǎi]",
                "[uǎn]", "[uǎng]", "[uǐ]", "[uǒ]", "[vè]", "[w]", "[x]", "[y]", "[z]", "[zh]", "[à]", "[ài]", "[àn]", "[àng]",
                # BUGFIX: the checked-in source contained a mojibake token
                # "[\ufffd\ufffdng]" in the next line; restored to "[áng]",
                # which completes the á-tone series ([á], [ái], [án], [áng],
                # [áo]).  TODO(review): confirm against the upstream
                # CosyVoice3 vocabulary.
                "[ào]", "[á]", "[ái]", "[án]", "[áng]", "[áo]", "[è]", "[èi]", "[èn]", "[èng]", "[èr]", "[é]", "[éi]", "[én]",
                "[éng]", "[ér]", "[ì]", "[ìn]", "[ìng]", "[í]", "[ín]", "[íng]", "[ò]", "[òng]", "[òu]", "[ó]", "[óng]", "[óu]",
                "[ù]", "[ùn]", "[ú]", "[ún]", "[ā]", "[āi]", "[ān]", "[āng]", "[āo]", "[ē]", "[ēi]", "[ēn]", "[ēng]", "[ě]",
                "[ěi]", "[ěn]", "[ěng]", "[ěr]", "[ī]", "[īn]", "[īng]", "[ō]", "[ōng]", "[ōu]", "[ū]", "[ūn]", "[ǎ]", "[ǎi]",
                "[ǎn]", "[ǎng]", "[ǎo]", "[ǐ]", "[ǐn]", "[ǐng]", "[ǒ]", "[ǒng]", "[ǒu]", "[ǔ]", "[ǔn]", "[ǘ]", "[ǚ]", "[ǜ]"
            ]
        }
        self.special_tokens = special_tokens
        self.tokenizer = AutoTokenizer.from_pretrained(token_path)
        self.tokenizer.add_special_tokens(special_tokens)
        self.skip_special_tokens = skip_special_tokens
314
+
315
+
316
@lru_cache(maxsize=None)
def get_qwen_tokenizer(
    token_path: str,
    skip_special_tokens: bool,
    version: str = 'cosyvoice2'
):
    """Return a cached CosyVoice tokenizer wrapper for `version`.

    Args:
        token_path: path/identifier passed to AutoTokenizer.from_pretrained.
        skip_special_tokens: whether decode() drops special tokens.
        version: 'cosyvoice2' or 'cosyvoice3'.

    Raises:
        ValueError: if `version` is not one of the supported values.
    """
    if version == 'cosyvoice2':
        return CosyVoice2Tokenizer(token_path=token_path, skip_special_tokens=skip_special_tokens)
    elif version == 'cosyvoice3':
        return CosyVoice3Tokenizer(token_path=token_path, skip_special_tokens=skip_special_tokens)
    else:
        # Name the bad value instead of raising a bare ValueError.
        raise ValueError(f"Unsupported tokenizer version: {version!r} "
                         f"(expected 'cosyvoice2' or 'cosyvoice3')")
token2wav-axmodels/flow.input_embedding.float16.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a7ceb5ede1cac3bdcec37aa034a694821a735087890c2104da238bf1e921bc6
3
+ size 1049760