lihongjie commited on
Commit
08a04fb
·
1 Parent(s): 867fae1

first commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +55 -0
  2. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm.speech_embedding.float16.bin +3 -0
  3. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm_decoder.axmodel +3 -0
  4. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/model.embed_tokens.weight.bfloat16.bin +3 -0
  5. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l0_together.axmodel +3 -0
  6. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l10_together.axmodel +3 -0
  7. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l11_together.axmodel +3 -0
  8. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l12_together.axmodel +3 -0
  9. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l13_together.axmodel +3 -0
  10. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l14_together.axmodel +3 -0
  11. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l15_together.axmodel +3 -0
  12. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l16_together.axmodel +3 -0
  13. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l17_together.axmodel +3 -0
  14. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l18_together.axmodel +3 -0
  15. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l19_together.axmodel +3 -0
  16. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l1_together.axmodel +3 -0
  17. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l20_together.axmodel +3 -0
  18. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l21_together.axmodel +3 -0
  19. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l22_together.axmodel +3 -0
  20. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l23_together.axmodel +3 -0
  21. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l2_together.axmodel +3 -0
  22. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l3_together.axmodel +3 -0
  23. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l4_together.axmodel +3 -0
  24. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l5_together.axmodel +3 -0
  25. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l6_together.axmodel +3 -0
  26. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l7_together.axmodel +3 -0
  27. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l8_together.axmodel +3 -0
  28. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l9_together.axmodel +3 -0
  29. CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_post.axmodel +3 -0
  30. README.md +227 -0
  31. asset/dingding.png +3 -0
  32. asset/output.wav +3 -0
  33. asset/zero_shot_prompt.wav +3 -0
  34. frontend-onnx/campplus.onnx +3 -0
  35. frontend-onnx/speech_tokenizer_v3.onnx +3 -0
  36. main_ax650 +3 -0
  37. run_ax650.sh +21 -0
  38. scripts/CosyVoice-BlankEN/merges.txt +0 -0
  39. scripts/CosyVoice-BlankEN/tokenizer_config.json +40 -0
  40. scripts/CosyVoice-BlankEN/vocab.json +0 -0
  41. scripts/audio.py +83 -0
  42. scripts/cosyvoice3_tokenizer.py +124 -0
  43. scripts/frontend.py +251 -0
  44. scripts/gradio_demo.py +161 -0
  45. scripts/meldataset.py +217 -0
  46. scripts/process_prompt.py +62 -0
  47. scripts/requirements.txt +8 -0
  48. scripts/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +3 -0
  49. scripts/tokenizer/tokenizer.py +327 -0
  50. token2wav-axmodels/flow.input_embedding.float16.bin +3 -0
.gitattributes CHANGED
@@ -33,3 +33,58 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l5_together.axmodel filter=lfs diff=lfs merge=lfs -text
37
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l8_together.axmodel filter=lfs diff=lfs merge=lfs -text
38
+ token2wav-axmodels/flow_estimator_200.axmodel filter=lfs diff=lfs merge=lfs -text
39
+ token2wav-axmodels/speech_window_2x8x480.txt filter=lfs diff=lfs merge=lfs -text
40
+ token2wav-axmodels/flow.input_embedding.float16.bin filter=lfs diff=lfs merge=lfs -text
41
+ token2wav-axmodels/rand_noise_1_80_300.txt filter=lfs diff=lfs merge=lfs -text
42
+ main_ax650 filter=lfs diff=lfs merge=lfs -text
43
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l14_together.axmodel filter=lfs diff=lfs merge=lfs -text
44
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l20_together.axmodel filter=lfs diff=lfs merge=lfs -text
45
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l22_together.axmodel filter=lfs diff=lfs merge=lfs -text
46
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l4_together.axmodel filter=lfs diff=lfs merge=lfs -text
47
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l3_together.axmodel filter=lfs diff=lfs merge=lfs -text
48
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_post.axmodel filter=lfs diff=lfs merge=lfs -text
49
+ frontend-onnx/speech_tokenizer_v3.onnx filter=lfs diff=lfs merge=lfs -text
50
+ token2wav-axmodels/hift_p1_100.axmodel filter=lfs diff=lfs merge=lfs -text
51
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/model.embed_tokens.weight.bfloat16.bin filter=lfs diff=lfs merge=lfs -text
52
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l0_together.axmodel filter=lfs diff=lfs merge=lfs -text
53
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l12_together.axmodel filter=lfs diff=lfs merge=lfs -text
54
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l23_together.axmodel filter=lfs diff=lfs merge=lfs -text
55
+ token2wav-axmodels/hift_p1_50.axmodel filter=lfs diff=lfs merge=lfs -text
56
+ token2wav-axmodels/hift_p2_100_final.axmodel filter=lfs diff=lfs merge=lfs -text
57
+ token2wav-axmodels/flow_encoder_50_final.axmodel filter=lfs diff=lfs merge=lfs -text
58
+ token2wav-axmodels/hift_p2_150.axmodel filter=lfs diff=lfs merge=lfs -text
59
+ token2wav-axmodels/hift_p2_50.axmodel filter=lfs diff=lfs merge=lfs -text
60
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l16_together.axmodel filter=lfs diff=lfs merge=lfs -text
61
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l1_together.axmodel filter=lfs diff=lfs merge=lfs -text
62
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l21_together.axmodel filter=lfs diff=lfs merge=lfs -text
63
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l6_together.axmodel filter=lfs diff=lfs merge=lfs -text
64
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l10_together.axmodel filter=lfs diff=lfs merge=lfs -text
65
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l13_together.axmodel filter=lfs diff=lfs merge=lfs -text
66
+ token2wav-axmodels/hift_p1_150.axmodel filter=lfs diff=lfs merge=lfs -text
67
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm_decoder.axmodel filter=lfs diff=lfs merge=lfs -text
68
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l17_together.axmodel filter=lfs diff=lfs merge=lfs -text
69
+ scripts/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken filter=lfs diff=lfs merge=lfs -text
70
+ frontend-onnx/campplus.onnx filter=lfs diff=lfs merge=lfs -text
71
+ token2wav-axmodels/flow_encoder_78.axmodel filter=lfs diff=lfs merge=lfs -text
72
+ token2wav-axmodels/flow_estimator_300.axmodel filter=lfs diff=lfs merge=lfs -text
73
+ token2wav-axmodels/flow.input_embedding.npy filter=lfs diff=lfs merge=lfs -text
74
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l15_together.axmodel filter=lfs diff=lfs merge=lfs -text
75
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l18_together.axmodel filter=lfs diff=lfs merge=lfs -text
76
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l19_together.axmodel filter=lfs diff=lfs merge=lfs -text
77
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l9_together.axmodel filter=lfs diff=lfs merge=lfs -text
78
+ token2wav-axmodels/hift_p1_100_final.axmodel filter=lfs diff=lfs merge=lfs -text
79
+ token2wav-axmodels/llm_decoder.axmodel filter=lfs diff=lfs merge=lfs -text
80
+ token2wav-axmodels/flow_encoder_28.axmodel filter=lfs diff=lfs merge=lfs -text
81
+ token2wav-axmodels/flow_encoder_53.axmodel filter=lfs diff=lfs merge=lfs -text
82
+ token2wav-axmodels/flow_estimator_250.axmodel filter=lfs diff=lfs merge=lfs -text
83
+ token2wav-axmodels/hift_p2_100.axmodel filter=lfs diff=lfs merge=lfs -text
84
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm.speech_embedding.float16.bin filter=lfs diff=lfs merge=lfs -text
85
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l11_together.axmodel filter=lfs diff=lfs merge=lfs -text
86
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l2_together.axmodel filter=lfs diff=lfs merge=lfs -text
87
+ CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l7_together.axmodel filter=lfs diff=lfs merge=lfs -text
88
+ asset/zero_shot_prompt.wav filter=lfs diff=lfs merge=lfs -text
89
+ asset/dingding.png filter=lfs diff=lfs merge=lfs -text
90
+ asset/output.wav filter=lfs diff=lfs merge=lfs -text
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm.speech_embedding.float16.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54b7a3a7095c822489d43ee4b3a490606cfe9ad347d2c9e13c4581d8de0cfab8
3
+ size 12115712
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/llm_decoder.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a04dd04e4b5a95dca860ec1026c736e2aca158f45488969724e8a3f4a5682e85
3
+ size 6506188
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/model.embed_tokens.weight.bfloat16.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fa9b7fbc8ba5adbedbde9a6704ab2cbb73cdccc370a06b0be79086176179572
3
+ size 272269312
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l0_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81970ce053f1dbb0fb6db9f61be43f79c92eef326c08ac2d445e5979d78ed7ad
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l10_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00408e4cfaa0d7a0a7fdd6305ee7c66b967ac287eee14f3a6ba146d6426e9591
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l11_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4472b2f70c7960269663e981015d5f828c4d240b316cf00e1bf33ff678bbe92f
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l12_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c26672ead01ee8164b7405eb2d6c7fd410763ddcabfdd87ebdee6179ebd1f4b
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l13_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33916843b8c8a0a82a35b9f46ab893c3ab9d400660a7cac92958d305b6a1aea8
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l14_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e25ade425171c7139ce758936c9d95d13dbec129630a01c88171ee085dc71c75
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l15_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3dafa43dc2ec21423d4d9147d1bdd48572d09e1e355dc921f4be6569ce1b799c
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l16_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ed263e08194acc4867fa7ffd13ad2044b18be9f8fba1250f276ac23b200e0dd
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l17_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:926bfceb3b6fae9afd79b7f5babf2931cb44675077866b192af367d2fc3ad5b4
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l18_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a426acb4187d034fc9a30406386c0cd0f9271c0794fa66646349d1d19b4f53e
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l19_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:484147aaee42d3314c72b41790c96e633848f44175e37762e897515ba0c7e346
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l1_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d207d70025931a9da459d3fcf2469b2b744e6e7bc6173abf7c9f70a9df83d8d9
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l20_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:075b956543f4ee84bdc9e61cce69f7286c94d1ff03f25d89d997f1786616dee6
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l21_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16ac8a2cba87bdbe3c9ed55cf60f8e5e683f69cc8334e7cf9a7594b7a20a6157
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l22_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd7c709bb017122f4e052893cc0ec4d0f7f2bc6a15b0f87c488947d2fccb6565
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l23_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae7339df1ee72ca28b557b6933997cec517b1a38766ffbe706814e12cda4137d
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l2_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa85ea3722be6839f5a6c01485a276908327f215523a3c09bb8028ce652d96d5
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l3_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12c5faedfae53817cb67f868e00d98c80f688b89093b1c142732bc848811682e
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l4_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb9f5106bf2280d592e02913fa1c43f7ca1bebb66969436ab79dfc5a4b174749
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l5_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:beac97add03d3edf0055cce152990138b9bbc9dac9835a384c35cadc1510f2b6
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l6_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60b01ff651579615d6db2ba63001fbcd8c0b740d765ad0eb42dab8b5549538a4
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l7_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63f4e960aff4347ba663b1c17ff7cc2f399c663e06610dc35ac0e4d95909bd24
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l8_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:196b5a89d35e522141e2e8ca0eac44e5d0723b0e6c3e6dec39730a3c9367fabb
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_p64_l9_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c46c32e13e5048c236f83ed783fd79457f6cd2dd4b88984fa756be7c53c7b849
3
+ size 17235064
CosyVoice-BlankEN-Ax650-C64-P256-CTX512/qwen2_post.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:302e68e08caf347dd063947baff79b68ce5f60da0c0a493942aa51b914c0b7e0
3
+ size 147957518
README.md CHANGED
@@ -1,3 +1,230 @@
1
  ---
2
  license: mit
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: mit
3
+ language:
4
+ - en
5
+ - zh
6
+ base_model:
7
+ - CosyVoice3
8
+ pipeline_tag: text-to-speech
9
+ library_name: transformers
10
+ tags:
11
+ - CosyVoice3
12
+ - Speech
13
  ---
14
+
15
+ # CosyVoice3
16
+ This version of CosyVoice3 has been converted to run on the Axera NPU using **w8a16** quantization.
17
+ Compatible with Pulsar2 version: 4.2
18
+
19
+ ## Convert tools links:
20
+ For those who are interested in model conversion, you can try to export axmodel through the original repo :
21
+ [Cosyvoice](https://github.com/FunAudioLLM/CosyVoice)
22
+
23
+ [Pulsar2 Link, How to Convert LLM from Huggingface to axmodel](https://pulsar2-docs.readthedocs.io/en/latest/appendix/build_llm.html)
24
+
25
+ [AXera NPU HOST LLM Runtime](https://github.com/AXERA-TECH/CosyVoice3.Axera)
26
+
27
+ ## Support Platform
28
+
29
+ - AX650
30
+ - AX650N DEMO Board
31
+ - [M4N-Dock(爱芯派Pro)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)
32
+ - [M.2 Accelerator card](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html)
33
+
34
+ **Speech Generation**
35
+ | Stage | Time |
36
+ |------|------|
37
+ | llm prefill ( input_token_num + prompt_token_num 在 [0,128 ] ) | 104 ms |
38
+ | llm prefill ( input_token_num + prompt_token_num 在 [128,256 ] ) | 234 ms |
39
+ | Decode | 21.24 token/s |
40
+
41
+ ## How to use
42
+
43
+ Download all files from this repository to the device
44
+
45
+ ### 1. Prepare
46
+
47
+ #### 1.1 Copy this project to AX650 Board
48
+
49
+ #### 1.2 Prepare Dependencies
50
+
51
+ **Running HTTP Tokenizer Server** and **Processing Prompt Speech** require these Python packages. If you run these two steps on a PC, install them on the PC.
52
+ ```
53
+ pip3 install -r scripts/requirements.txt
54
+ ```
55
+
56
+ ### 2. Start HTTP Tokenizer Server
57
+ ```
58
+ cd scripts
59
+ python cosyvoice3_tokenizer.py --host {your host} --port {your port}
60
+ ```
61
+
62
+
63
+ ### 3. Run on Axera Device
64
+ There are 3 kinds of device: AX650 Board, AXCL aarch64 Board and AXCL x86 Board.
65
+
66
+ #### 3.1 Run on AX650 Board
67
+ 1) Modify the HTTP host in `run_ax650.sh`.
68
+
69
+ 2) Run `run_ax650.sh`
70
+ ```shell
71
+ root@ax650 ~/CosyVoice3 # bash run_ax650.sh
72
+ rm: cannot remove 'output*.wav': No such file or directory
73
+ [I][ Init][ 108]: LLM init start
74
+ [I][ Init][ 34]: connect http://10.122.86.184:12345 ok
75
+ bos_id: 0, eos_id: 1773
76
+ 7% | ███ | 2 / 27 [3.11s<42.04s, 0.64 count/s] embed_selector init ok[I][ Init][ 138]: attr.axmodel_num:24
77
+ 100% | ████████████████████████████████ | 27 / 27 [10.32s<10.32s, 2.62 count/s] init post axmodel ok,remain_cmm(7178 MB)
78
+ [I][ Init][ 216]: max_token_len : 1023
79
+ [I][ Init][ 221]: kv_cache_size : 128, kv_cache_num: 1023
80
+ [I][ Init][ 229]: prefill_token_num : 128
81
+ [I][ Init][ 233]: grp: 1, prefill_max_token_num : 1
82
+ [I][ Init][ 233]: grp: 2, prefill_max_token_num : 128
83
+ [I][ Init][ 233]: grp: 3, prefill_max_token_num : 256
84
+ [I][ Init][ 233]: grp: 4, prefill_max_token_num : 384
85
+ [I][ Init][ 233]: grp: 5, prefill_max_token_num : 512
86
+ [I][ Init][ 237]: prefill_max_token_num : 512
87
+ [I][ Init][ 249]: LLM init ok
88
+ [I][ Init][ 154]: Token2Wav init ok
89
+ [I][ main][ 273]:
90
+ [I][ Run][ 388]: input token num : 142, prefill_split_num : 2
91
+ [I][ Run][ 422]: input_num_token:128
92
+ [I][ Run][ 422]: input_num_token:14
93
+ [I][ Run][ 607]: ttft: 236.90 ms
94
+ [Main/Token2Wav Thread] Processing batch of 28 tokens...
95
+ Successfully saved audio to output_0.wav (32-bit Float PCM).
96
+ [Main/Token2Wav Thread] Processing batch of 53 tokens...
97
+ Successfully saved audio to output_1.wav (32-bit Float PCM).
98
+ [Main/Token2Wav Thread] Processing batch of 78 tokens...
99
+ Successfully saved audio to output_2.wav (32-bit Float PCM).
100
+ [Main/Token2Wav Thread] Processing batch of 78 tokens...
101
+ Successfully saved audio to output_3.wav (32-bit Float PCM).
102
+ [Main/Token2Wav Thread] Processing batch of 78 tokens...
103
+ Successfully saved audio to output_4.wav (32-bit Float PCM).
104
+ [Main/Token2Wav Thread] Processing batch of 78 tokens...
105
+ Successfully saved audio to output_5.wav (32-bit Float PCM).
106
+ [Main/Token2Wav Thread] Processing batch of 78 tokens...
107
+ Successfully saved audio to output_6.wav (32-bit Float PCM).
108
+ [Main/Token2Wav Thread] Processing batch of 78 tokens...
109
+ Successfully saved audio to output_7.wav (32-bit Float PCM).
110
+ [Main/Token2Wav Thread] Processing batch of 78 tokens...
111
+ Successfully saved audio to output_8.wav (32-bit Float PCM).
112
+ [Main/Token2Wav Thread] Processing batch of 78 tokens...
113
+ Successfully saved audio to output_9.wav (32-bit Float PCM).
114
+ [I][ Run][ 723]: hit eos, llm finished
115
+ [I][ Run][ 753]: llm finished
116
+ [Main/Token2Wav Thread] Buffer is empty and LLM finished. Exiting.
117
+
118
+
119
+ [I][ Run][ 758]: total decode tokens:271
120
+ [N][ Run][ 759]: hit eos,avg 21.47 token/s
121
+
122
+ Successfully saved audio to output_10.wav (32-bit Float PCM).
123
+ Successfully saved audio to output.wav (32-bit Float PCM).
124
+
125
+ Voice generation pipeline completed.
126
+ Type "q" to exit, Ctrl+c to stop current running
127
+ text >>
128
+ ```
129
+
130
+ Output Speech:
131
+ [output.wav](asset/output.wav)
132
+
133
+
134
+ #### Or run on AX650 Board with Gradio GUI
135
+ 1) Start server
136
+ ```
137
+ bash run_api_ax650.sh
138
+ ```
139
+ 2) Start Gradio GUI
140
+ ```
141
+ python scripts/gradio_demo.py
142
+ ```
143
+
144
+ #### 3.2 Run on AXCL aarch64 Board
145
+ ```
146
+ bash run_axcl_aarch64.sh
147
+ ```
148
+ #### Or run on AXCL aarch64 Board with Gradio GUI
149
+ 1) Start server
150
+ ```
151
+ bash run_api_axcl_aarch64.sh
152
+ ```
153
+ 2) Start Gradio GUI
154
+ ```
155
+ python scripts/gradio_demo.py
156
+ ```
157
+ 3) Open the page from a browser
158
+ The page URL is: `http://{your device ip}:7860`
159
+
160
+ Note that you need to run these two commands in the project root directory.
161
+
162
+ #### 3.3 Run on AXCL x86 Board
163
+ ```
164
+ bash run_axcl_x86.sh
165
+ ```
166
+ #### Or run on AXCL x86 Board with Gradio GUI
167
+ 1) Start server
168
+ ```
169
+ bash run_api_axcl_x86.sh
170
+ ```
171
+ 2) Start Gradio GUI
172
+ ```
173
+ python scripts/gradio_demo.py
174
+ ```
175
+ 3) Open the page from a browser
176
+ The page URL is: `http://{your device ip}:7860`
177
+
178
+ Note that you need to run these two commands in the project root directory.
179
+
180
+ ![](./gradio.png)
181
+
182
+ ### Optional. Process Prompt Speech
183
+ If you want to replicate a specific sound, do this step.
184
+ You can use audio in asset/ .
185
+
186
+ #### (1). Download wetext
187
+ ```
188
+ pip3 install modelscope
189
+ modelscope download --model pengzhendong/wetext --local_dir pengzhendong/wetext
190
+ ```
191
+
192
+ #### (2). Process Prompt Speech
193
+ Example:
194
+ ```
195
+ python3 scripts/process_prompt.py --prompt_text asset/zh_man1.txt --prompt_speech asset/zh_man1.wav --output zh_man1
196
+ ```
197
+
198
+ Pass parameters according to the actual situation.
199
+ ```
200
+ python3 scripts/process_prompt.py -h
201
+
202
+ usage: process_prompt.py [-h] [--model_dir MODEL_DIR] [--wetext_dir WETEXT_DIR] [--sample_rate SAMPLE_RATE] [--prompt_text PROMPT_TEXT] [--prompt_speech PROMPT_SPEECH]
203
+ [--output OUTPUT]
204
+
205
+ options:
206
+ -h, --help show this help message and exit
207
+ --model_dir MODEL_DIR
208
+ tokenizer configuration directory
209
+ --wetext_dir WETEXT_DIR
210
+ path to wetext
211
+ --sample_rate SAMPLE_RATE
212
+ Sampling rate for prompt audio
213
+ --prompt_text PROMPT_TEXT
214
+ The text content of the prompt(reference) audio. Text or file path.
215
+ --prompt_speech PROMPT_SPEECH
216
+ The path to prompt(reference) audio.
217
+ --output OUTPUT Output data storage directory
218
+ ```
219
+
220
+ After executing the above command, files like the following will be generated:
221
+ ```
222
+ flow_embedding.txt
223
+ flow_prompt_speech_token.txt
224
+ llm_embedding.txt
225
+ llm_prompt_speech_token.txt
226
+ prompt_speech_feat.txt
227
+ prompt_text.txt
228
+ ```
229
+
230
+ When you run run_ax650.sh, pass the output path here to the prompt_files parameter of the run_ax650.sh script.
asset/dingding.png ADDED

Git LFS Details

  • SHA256: 3870bb0a4e3df1f643e09c960b7e03d80da798509c86eaa326db205236b861d5
  • Pointer size: 130 Bytes
  • Size of remote file: 96.4 kB
asset/output.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c7a4c3837145df17e851c177f849446036e6f541d78eb6e107ea6b9e7b07672
3
+ size 1067564
asset/zero_shot_prompt.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd199eb7109fd6ce9943cb297e3cf350c1073af014063dfadbdc100230526243
3
+ size 111496
frontend-onnx/campplus.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
3
+ size 28303423
frontend-onnx/speech_tokenizer_v3.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23236a74175dbdda47afc66dbadd5bcb41303c467a57c261cb8539ad9db9208d
3
+ size 969451503
main_ax650 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e5192387e51f64ea8993eb9bc3e848092aa2f8ce7157891496b152149a42ed6
3
+ size 6647080
run_ax650.sh ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LLM_DIR=CosyVoice-BlankEN-Ax650-C64-P256-CTX512/
2
+ TOKEN2WAV_DIR=token2wav-axmodels/
3
+
4
+ ./main_ax650 \
5
+ --template_filename_axmodel "${LLM_DIR}/qwen2_p64_l%d_together.axmodel" \
6
+ --token2wav_axmodel_dir $TOKEN2WAV_DIR \
7
+ --n_timesteps 10 \
8
+ --axmodel_num 24 \
9
+ --bos 0 --eos 0 \
10
+ --filename_tokenizer_model "http://10.122.86.184:12345" \
11
+ --filename_post_axmodel "${LLM_DIR}/qwen2_post.axmodel" \
12
+ --filename_decoder_axmodel "${LLM_DIR}/llm_decoder.axmodel" \
13
+ --filename_tokens_embed "${LLM_DIR}/model.embed_tokens.weight.bfloat16.bin" \
14
+ --filename_llm_embed "${LLM_DIR}/llm.speech_embedding.float16.bin" \
15
+ --filename_speech_embed "${LLM_DIR}/llm.speech_embedding.float16.bin" \
16
+ --continue 0 \
17
+ --prompt_files prompt_files \
18
+ --text "高管也通过电话、短信、微信等方式对报道[j][ǐ]予好评。"
19
+
20
+
21
+ chmod 777 output.wav
scripts/CosyVoice-BlankEN/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
scripts/CosyVoice-BlankEN/tokenizer_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
30
+ "bos_token": null,
31
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "<|im_end|>",
34
+ "errors": "replace",
35
+ "model_max_length": 32768,
36
+ "pad_token": "<|endoftext|>",
37
+ "split_special_tokens": false,
38
+ "tokenizer_class": "Qwen2Tokenizer",
39
+ "unk_token": null
40
+ }
scripts/CosyVoice-BlankEN/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
scripts/audio.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.utils.data
4
+ from librosa.filters import mel as librosa_mel_fn
5
+ from scipy.io.wavfile import read
6
+
7
+ MAX_WAV_VALUE = 32768.0
8
+
9
+
10
+ def load_wav(full_path):
11
+ sampling_rate, data = read(full_path)
12
+ return data, sampling_rate
13
+
14
+
15
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
16
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
17
+
18
+
19
+ def dynamic_range_decompression(x, C=1):
20
+ return np.exp(x) / C
21
+
22
+
23
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
24
+ return torch.log(torch.clamp(x, min=clip_val) * C)
25
+
26
+
27
+ def dynamic_range_decompression_torch(x, C=1):
28
+ return torch.exp(x) / C
29
+
30
+
31
+ def spectral_normalize_torch(magnitudes):
32
+ output = dynamic_range_compression_torch(magnitudes)
33
+ return output
34
+
35
+
36
+ def spectral_de_normalize_torch(magnitudes):
37
+ output = dynamic_range_decompression_torch(magnitudes)
38
+ return output
39
+
40
+
41
+ mel_basis = {}
42
+ hann_window = {}
43
+
44
+
45
+ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
46
+ if torch.min(y) < -1.0:
47
+ print("min value is ", torch.min(y))
48
+ if torch.max(y) > 1.0:
49
+ print("max value is ", torch.max(y))
50
+
51
+ global mel_basis, hann_window # pylint: disable=global-statement
52
+ print("fmax",fmax)
53
+ if f"{str(fmax)}_{str(y.device)}" not in mel_basis:
54
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
55
+ mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device)
56
+ hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
57
+
58
+ y = torch.nn.functional.pad(
59
+ y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
60
+ )
61
+ y = y.squeeze(1)
62
+
63
+ spec = torch.view_as_real(
64
+ torch.stft(
65
+ y,
66
+ n_fft,
67
+ hop_length=hop_size,
68
+ win_length=win_size,
69
+ window=hann_window[str(y.device)],
70
+ center=center,
71
+ pad_mode="reflect",
72
+ normalized=False,
73
+ onesided=True,
74
+ return_complex=True,
75
+ )
76
+ )
77
+
78
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
79
+
80
+ spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec)
81
+ spec = spectral_normalize_torch(spec)
82
+
83
+ return spec
scripts/cosyvoice3_tokenizer.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, PreTrainedTokenizerFast
2
+ from http.server import HTTPServer, BaseHTTPRequestHandler
3
+ import json
4
+ import argparse
5
+ from tokenizer.tokenizer import get_qwen_tokenizer
6
+
7
+ class Tokenizer_Http():
8
+
9
+ def __init__(self):
10
+
11
+ self.tokenizer = get_qwen_tokenizer("CosyVoice-BlankEN/", True, "cosyvoice3")
12
+
13
+ def encode(self, prompt):
14
+
15
+ token_ids = self.tokenizer.encode(prompt, allowed_special="all")
16
+ return token_ids
17
+
18
+ def decode(self, token_ids):
19
+ return self.tokenizer.decode(token_ids)
20
+
21
+ # @property
22
+ # def bos_id(self):
23
+ # return self.tokenizer.bos_token_id
24
+
25
+ @property
26
+ def eos_id(self):
27
+ return 1773
28
+
29
+ # @property
30
+ # def bos_token(self):
31
+ # return self.tokenizer.bos_token
32
+
33
+ @property
34
+ def eos_token(self):
35
+ return "<|eot_id|>"
36
+
37
+
38
+ tokenizer = Tokenizer_Http()
39
+
40
+ # print(tokenizer.bos_id, tokenizer.bos_token, tokenizer.eos_id, tokenizer.eos_token)
41
+ print(tokenizer.encode("hello world"))
42
+
43
+
44
+ class Request(BaseHTTPRequestHandler):
45
+ #通过类继承,新定义类
46
+ timeout = 5
47
+ server_version = 'Apache'
48
+
49
+ def do_GET(self):
50
+ print(self.path)
51
+ #在新类中定义get的内容(当客户端向该服务端使用get请求时,本服务端将如下运行)
52
+ self.send_response(200)
53
+ self.send_header("type", "get") #设置响应头,可省略或设置多个
54
+ self.end_headers()
55
+
56
+ if self.path == '/bos_id':
57
+ bos_id = tokenizer.bos_id
58
+ # print(bos_id)
59
+ # to json
60
+ if bos_id is None:
61
+ msg = json.dumps({'bos_id': -1})
62
+ else:
63
+ msg = json.dumps({'bos_id': bos_id})
64
+ elif self.path == '/eos_id':
65
+ eos_id = tokenizer.eos_id
66
+ if eos_id is None:
67
+ msg = json.dumps({'eos_id': -1})
68
+ else:
69
+ msg = json.dumps({'eos_id': eos_id})
70
+ else:
71
+ msg = 'error'
72
+
73
+ print(msg)
74
+ msg = str(msg).encode() #转为str再转为byte格式
75
+
76
+ self.wfile.write(msg) #将byte格式的信息返回给客户端
77
+
78
+ def do_POST(self):
79
+ #在新类中定义post的内容(当客户端向该服务端使用post请求时,本服务端将如下运行)
80
+ data = self.rfile.read(int(
81
+ self.headers['content-length'])) #获取从客户端传入的参数(byte格式)
82
+ data = data.decode() #将byte格式转为str格式
83
+
84
+ self.send_response(200)
85
+ self.send_header("type", "post") #设置响应头,可省略或设置多个
86
+ self.end_headers()
87
+
88
+ if self.path == '/encode':
89
+ req = json.loads(data)
90
+ prompt = req['text']
91
+
92
+ token_ids = tokenizer.encode(prompt)
93
+ if token_ids is None:
94
+ msg = json.dumps({'token_ids': -1})
95
+ else:
96
+ msg = json.dumps({'token_ids': token_ids})
97
+
98
+ elif self.path == '/decode':
99
+ req = json.loads(data)
100
+ token_ids = req['token_ids']
101
+ text = tokenizer.decode(token_ids)
102
+ if text is None:
103
+ msg = json.dumps({'text': ""})
104
+ else:
105
+ msg = json.dumps({'text': text})
106
+ else:
107
+ msg = 'error'
108
+ print(msg)
109
+ msg = str(msg).encode() #转为str再转为byte格式
110
+
111
+ self.wfile.write(msg) #将byte格式的信息返回给客户端
112
+
113
+
114
+ if __name__ == "__main__":
115
+
116
+ args = argparse.ArgumentParser()
117
+ args.add_argument('--host', type=str, default='localhost')
118
+ args.add_argument('--port', type=int, default=12345)
119
+ args = args.parse_args()
120
+
121
+ host = (args.host, args.port) #设定地址与端口号,'localhost'等价于'127.0.0.1'
122
+ print('http://%s:%s' % host)
123
+ server = HTTPServer(host, Request) #根据地址端口号和新定义的类,创建服务器实例
124
+ server.serve_forever() #开启服务
scripts/frontend.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from functools import partial
15
+ from functools import lru_cache
16
+ from typing import Generator
17
+ import json
18
+ import onnxruntime
19
+ import torch
20
+ import numpy as np
21
+ import whisper
22
+ from typing import Callable
23
+ import torchaudio.compliance.kaldi as kaldi
24
+ import torchaudio
25
+ import os
26
+ import re
27
+ import inflect
28
+ from tokenizer.tokenizer import get_qwen_tokenizer
29
+ from audio import mel_spectrogram
30
+
31
+ try:
32
+ import ttsfrd
33
+ use_ttsfrd = True
34
+ except ImportError:
35
+
36
+ from wetext import Normalizer as ZhNormalizer
37
+ from wetext import Normalizer as EnNormalizer
38
+ use_ttsfrd = False
39
+
40
+ import logging
41
+ logging.getLogger('frontend').setLevel(logging.WARNING)
42
+ # logging.basicConfig(level=logging.DEBUG,
43
+ # format='%(asctime)s %(levelname)s %(message)s')
44
+
45
+ class CosyVoiceFrontEnd:
46
+
47
+ def __init__(self,
48
+ pretrained_path: str,
49
+ wetext_dir: str,
50
+ campplus_model: str,
51
+ speech_tokenizer_model: str,
52
+ spk2info: str = '',
53
+ allowed_special: str = 'all'):
54
+ self.tokenizer = get_qwen_tokenizer(pretrained_path, True)
55
+ self.feat_extractor = partial(
56
+ mel_spectrogram,
57
+ n_fft=1920,
58
+ num_mels=80,
59
+ sampling_rate=24000,
60
+ hop_size=480,
61
+ win_size=1920,
62
+ fmin=0,
63
+ fmax=8000,
64
+ center=False)
65
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
66
+ option = onnxruntime.SessionOptions()
67
+ option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
68
+ option.intra_op_num_threads = 1
69
+ self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
70
+ self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option,
71
+ providers=["CUDAExecutionProvider" if torch.cuda.is_available() else
72
+ "CPUExecutionProvider"])
73
+ if os.path.exists(spk2info):
74
+ self.spk2info = torch.load(spk2info, map_location=self.device)
75
+ else:
76
+ self.spk2info = {}
77
+ self.allowed_special = allowed_special
78
+ self.use_ttsfrd = use_ttsfrd
79
+ if self.use_ttsfrd:
80
+ self.frd = ttsfrd.TtsFrontendEngine()
81
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
82
+ assert self.frd.initialize('{}/../../pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR)) is True, \
83
+ 'failed to initialize ttsfrd resource'
84
+ self.frd.set_lang_type('pinyinvg')
85
+ else:
86
+ self.zh_tn_model = ZhNormalizer(remove_erhua=False, lang="zh")
87
+ self.en_tn_model = EnNormalizer(lang="zh")
88
+ self.inflect_parser = inflect.engine()
89
+
90
+ def _extract_text_token(self, text):
91
+ if isinstance(text, Generator):
92
+ logging.info('get tts_text generator, will return _extract_text_token_generator!')
93
+ # NOTE add a dummy text_token_len for compatibility
94
+ return self._extract_text_token_generator(text), torch.tensor([0], dtype=torch.int32).to(self.device)
95
+ else:
96
+ text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
97
+ text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
98
+ text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)
99
+ return text_token, text_token_len
100
+
101
+ def _extract_text_token_generator(self, text_generator):
102
+ for text in text_generator:
103
+ text_token, _ = self._extract_text_token(text)
104
+ for i in range(text_token.shape[1]):
105
+ yield text_token[:, i: i + 1]
106
+
107
+ def _extract_speech_token(self, speech):
108
+ assert speech.shape[1] / 16000 <= 30, 'do not support extract speech token for audio longer than 30s'
109
+ feat = whisper.log_mel_spectrogram(speech, n_mels=128)
110
+ speech_token = self.speech_tokenizer_session.run(None,
111
+ {self.speech_tokenizer_session.get_inputs()[0].name:
112
+ feat.detach().cpu().numpy(),
113
+ self.speech_tokenizer_session.get_inputs()[1].name:
114
+ np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
115
+ speech_token = torch.tensor([speech_token], dtype=torch.int32).to(self.device)
116
+ speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device)
117
+ return speech_token, speech_token_len
118
+
119
+ def _extract_spk_embedding(self, speech):
120
+ feat = kaldi.fbank(speech,
121
+ num_mel_bins=80,
122
+ dither=0,
123
+ sample_frequency=16000)
124
+ feat = feat - feat.mean(dim=0, keepdim=True)
125
+ embedding = self.campplus_session.run(None,
126
+ {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
127
+ embedding = torch.tensor([embedding]).to(self.device)
128
+ return embedding
129
+
130
+ def _extract_speech_feat(self, speech):
131
+ speech_feat = self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device)
132
+ speech_feat = speech_feat.unsqueeze(dim=0)
133
+ speech_feat_len = torch.tensor([speech_feat.shape[1]], dtype=torch.int32).to(self.device)
134
+ return speech_feat, speech_feat_len
135
+
136
+ def text_normalize(self, text, split=True, text_frontend=True):
137
+ if isinstance(text, Generator):
138
+ logging.info('get tts_text generator, will skip text_normalize!')
139
+ return [text]
140
+ if text_frontend is False or text == '':
141
+ return [text] if split is True else text
142
+ text = text.strip()
143
+ if self.use_ttsfrd:
144
+ texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
145
+ text = ''.join(texts)
146
+ else:
147
+ if contains_chinese(text):
148
+ text = self.zh_tn_model.normalize(text)
149
+ text = text.replace("\n", "")
150
+ text = replace_blank(text)
151
+ text = replace_corner_mark(text)
152
+ text = text.replace(".", "。")
153
+ text = text.replace(" - ", ",")
154
+ text = remove_bracket(text)
155
+ text = re.sub(r'[,,、]+$', '。', text)
156
+ texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80,
157
+ token_min_n=60, merge_len=20, comma_split=False))
158
+ else:
159
+ text = self.en_tn_model.normalize(text)
160
+ text = spell_out_number(text, self.inflect_parser)
161
+ texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
162
+ token_min_n=60, merge_len=20, comma_split=False))
163
+ texts = [i for i in texts if not is_only_punctuation(i)]
164
+ return texts if split is True else text
165
+
166
+ def frontend_sft(self, tts_text, spk_id):
167
+ tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
168
+ embedding = self.spk2info[spk_id]['embedding']
169
+ model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding}
170
+ return model_input
171
+
172
+ def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
173
+ tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
174
+ if zero_shot_spk_id == '':
175
+ prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
176
+ prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
177
+ speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
178
+ speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
179
+ if resample_rate == 24000:
180
+ # cosyvoice2, force speech_feat % speech_token = 2
181
+ token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
182
+ speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
183
+ speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
184
+ embedding = self._extract_spk_embedding(prompt_speech_16k)
185
+ model_input = {'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
186
+ 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
187
+ 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
188
+ 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
189
+ 'llm_embedding': embedding, 'flow_embedding': embedding}
190
+ else:
191
+ model_input = self.spk2info[zero_shot_spk_id]
192
+ model_input['text'] = tts_text_token
193
+ model_input['text_len'] = tts_text_token_len
194
+ return model_input
195
+
196
+ def process_prompt(self, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
197
+ if zero_shot_spk_id == '':
198
+ prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
199
+ prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
200
+ speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
201
+ speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
202
+ if resample_rate == 24000:
203
+ # cosyvoice2, force speech_feat % speech_token = 2
204
+ token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
205
+ speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
206
+ speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
207
+ embedding = self._extract_spk_embedding(prompt_speech_16k)
208
+ model_input = {'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
209
+ 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
210
+ 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
211
+ 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
212
+ 'llm_embedding': embedding, 'flow_embedding': embedding}
213
+ else:
214
+ model_input = self.spk2info[zero_shot_spk_id]
215
+ return model_input
216
+
217
+ def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
218
+ model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate, zero_shot_spk_id)
219
+ # in cross lingual mode, we remove prompt in llm
220
+ del model_input['prompt_text']
221
+ del model_input['prompt_text_len']
222
+ del model_input['llm_prompt_speech_token']
223
+ del model_input['llm_prompt_speech_token_len']
224
+ return model_input
225
+
226
+ def frontend_instruct(self, tts_text, spk_id, instruct_text):
227
+ model_input = self.frontend_sft(tts_text, spk_id)
228
+ # in instruct mode, we remove spk_embedding in llm due to information leakage
229
+ del model_input['llm_embedding']
230
+ instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
231
+ model_input['prompt_text'] = instruct_text_token
232
+ model_input['prompt_text_len'] = instruct_text_token_len
233
+ return model_input
234
+
235
+ def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
236
+ model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate, zero_shot_spk_id)
237
+ del model_input['llm_prompt_speech_token']
238
+ del model_input['llm_prompt_speech_token_len']
239
+ return model_input
240
+
241
+ def frontend_vc(self, source_speech_16k, prompt_speech_16k, resample_rate):
242
+ prompt_speech_token, prompt_speech_token_len = self._extract_speech_token(prompt_speech_16k)
243
+ prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
244
+ prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
245
+ embedding = self._extract_spk_embedding(prompt_speech_16k)
246
+ source_speech_token, source_speech_token_len = self._extract_speech_token(source_speech_16k)
247
+ model_input = {'source_speech_token': source_speech_token, 'source_speech_token_len': source_speech_token_len,
248
+ 'flow_prompt_speech_token': prompt_speech_token, 'flow_prompt_speech_token_len': prompt_speech_token_len,
249
+ 'prompt_speech_feat': prompt_speech_feat, 'prompt_speech_feat_len': prompt_speech_feat_len,
250
+ 'flow_embedding': embedding}
251
+ return model_input
scripts/gradio_demo.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import shutil
3
+ import gradio as gr
4
+ import numpy as np
5
+ import requests
6
+ import time
7
+ import os
8
+
9
+ import torch
10
+ from frontend import CosyVoiceFrontEnd
11
+ import torchaudio
12
+ import logging
13
+ logging.basicConfig(level=logging.WARNING)
14
+
15
+ import subprocess
16
+ import re
17
+
18
+ def get_all_local_ips():
19
+ result = subprocess.run(['ip', 'a'], capture_output=True, text=True)
20
+ output = result.stdout
21
+
22
+ # 匹配所有IPv4
23
+ ips = re.findall(r'inet (\d+\.\d+\.\d+\.\d+)', output)
24
+
25
+ # 过滤掉回环地址
26
+ real_ips = [ip for ip in ips if not ip.startswith('127.')]
27
+
28
+ return real_ips
29
+
30
+
31
+ TTS_URL = "http://0.0.0.0:12346/tts"
32
+ GET_URL = "http://0.0.0.0:12346/get"
33
+ TIMESTEPS_URL = "http://0.0.0.0:12346/timesteps"
34
+ PROMPT_FILES_URL = "http://0.0.0.0:12346/prompt_files"
35
+
36
+ args = argparse.ArgumentParser()
37
+ args.add_argument('--model_dir', type=str, default="scripts/CosyVoice-BlankEN", help="tokenizer configuration directionary")
38
+ args.add_argument('--wetext_dir', type=str, default="pengzhendong/wetext", help="path to wetext")
39
+ args.add_argument('--sample_rate', type=int, default=24000, help="Sampling rate for prompt audio")
40
+ args = args.parse_args()
41
+ frontend = CosyVoiceFrontEnd(f"{args.model_dir}",
42
+ args.wetext_dir,
43
+ "frontend-onnx/campplus.onnx",
44
+ "frontend-onnx/speech_tokenizer_v2.onnx",
45
+ f"{args.model_dir}/spk2info.pt",
46
+ "all")
47
+
48
+ def update_audio(audio_input_path, audio_text):
49
+ def load_wav(wav, target_sr):
50
+ speech, sample_rate = torchaudio.load(wav, backend='soundfile')
51
+ speech = speech.mean(dim=0, keepdim=True)
52
+ if sample_rate != target_sr:
53
+ assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
54
+ speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
55
+ return speech
56
+ output_dir = './output_temp'
57
+ # clear output_dir
58
+ if os.path.exists(output_dir):
59
+ shutil.rmtree(output_dir)
60
+ os.makedirs(output_dir, exist_ok=True)
61
+ zero_shot_spk_id = ""
62
+ prompt_speech_16k = load_wav(audio_input_path, 16000)
63
+ prompt_text = audio_text
64
+ print("prompt_text",prompt_text)
65
+ model_input = frontend.process_prompt( prompt_text, prompt_speech_16k, args.sample_rate, zero_shot_spk_id)
66
+ print("prompt speech token size:", model_input["flow_prompt_speech_token"].shape)
67
+ assert model_input["flow_prompt_speech_token"].shape[1] >=75, f"speech_token length should >= 75, bug get {model_input['flow_prompt_speech_token'].shape[1]}"
68
+ for k, v in model_input.items():
69
+ if "_len" in k:
70
+ continue
71
+ shapes = [str(s) for s in v.shape]
72
+ shape_str = "_".join(shapes)
73
+ if v.dtype in (torch.int32, torch.int64):
74
+ np.savetxt(f"{output_dir}/{k}.txt", v.detach().cpu().numpy().reshape(-1), fmt="%d", delimiter=",")
75
+ else:
76
+ np.savetxt(f"{output_dir}/{k}.txt", v.detach().cpu().numpy().reshape(-1), delimiter=",")
77
+
78
+ try:
79
+ r = requests.post(PROMPT_FILES_URL, json={"prompt_files": output_dir}, timeout=5)
80
+ if r.status_code != 200:
81
+ return None, "❌ TTS 请求失败"
82
+ except Exception as e:
83
+ return None, f"❌ TTS 请求异常: {e}"
84
+
85
+
86
+ def update_timesteps(timesteps):
87
+ try:
88
+ r = requests.post(TIMESTEPS_URL, json={"timesteps": timesteps}, timeout=5)
89
+ if r.status_code != 200:
90
+ return None, "❌ TTS 请求失败"
91
+ except Exception as e:
92
+ return None, f"❌ TTS 请求异常: {e}"
93
+
94
+ def run_tts(text):
95
+ # Step1: 提交 TTS 请求
96
+ try:
97
+ r = requests.post(TTS_URL, json={"text": text}, timeout=5)
98
+ if r.status_code != 200:
99
+ return None, "❌ TTS 请求失败"
100
+ except Exception as e:
101
+ return None, f"❌ TTS 请求异常: {e}"
102
+
103
+ # Step2: 循环调用 /get 获取进度
104
+ progress = gr.Progress()
105
+ wav_file = None
106
+ for i in range(100): # 最多尝试100次,避免死循环
107
+ time.sleep(0.5)
108
+ try:
109
+ resp = requests.post(GET_URL, data="", timeout=5).json()
110
+ except Exception as e:
111
+ return None, f"❌ GET 请求异常: {e}"
112
+
113
+ if resp.get("b_tts_runing", True):
114
+ progress(i / 100, desc="正在生成语音...")
115
+ else:
116
+ wav_file = resp.get("wav_file")
117
+ break
118
+
119
+ if not wav_file or not os.path.exists(wav_file):
120
+ return None, "❌ 语音文件未生成"
121
+
122
+ return wav_file, "✅ 生成完成"
123
+
124
+
125
+ with gr.Blocks() as demo:
126
+ gr.Markdown("### 🎙️ AXERA CosyVoice2 Demo")
127
+
128
+ with gr.Row():
129
+ with gr.Column():
130
+ audio_input = gr.Audio(label="输入音频", type="filepath")
131
+ with gr.Column():
132
+ audio_text = gr.Textbox(label="音频文本(自己改一下或者照着念)", value="锄禾日当午,汗滴禾下土。")
133
+ btn_update = gr.Button("更新音源")
134
+
135
+
136
+ with gr.Row():
137
+ text_input = gr.Textbox(value="琦琦,麻烦你适配一下这个新的模型吧。", label="输入文本")
138
+ with gr.Column():
139
+ timesteps = gr.Slider(minimum=4, maximum=30, value=7, step=1, label="Timesteps")
140
+ run_btn = gr.Button("生成语音")
141
+
142
+ status = gr.Label(label="状态")
143
+ audio_out = gr.Audio(label="生成结果", type="filepath")
144
+
145
+ run_btn.click(fn=run_tts, inputs=[text_input], outputs=[audio_out, status])
146
+ timesteps.change(fn=update_timesteps, inputs=timesteps)
147
+
148
+ btn_update.click(fn=update_audio, inputs=[audio_input, audio_text])
149
+
150
+ ips = get_all_local_ips()
151
+ for ip in ips:
152
+ print(f"* Running on local URL: https://{ip}:7860")
153
+
154
+
155
+ demo.launch(
156
+ server_name="0.0.0.0",
157
+ server_port=7860,
158
+ ssl_certfile="./server.crt",
159
+ ssl_keyfile="./server.key",
160
+ ssl_verify=False
161
+ )
scripts/meldataset.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/jik876/hifi-gan """
2
+
3
+ import math
4
+ import os
5
+ import random
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.utils.data
10
+ from librosa.filters import mel as librosa_mel_fn
11
+ from librosa.util import normalize
12
+ from scipy.io.wavfile import read
13
+
14
+ MAX_WAV_VALUE = 32768.0
15
+
16
+
17
+ def load_wav(full_path):
18
+ sampling_rate, data = read(full_path)
19
+ return data, sampling_rate
20
+
21
+
22
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
23
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
24
+
25
+
26
+ def dynamic_range_decompression(x, C=1):
27
+ return np.exp(x) / C
28
+
29
+
30
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
31
+ return torch.log(torch.clamp(x, min=clip_val) * C)
32
+
33
+
34
+ def dynamic_range_decompression_torch(x, C=1):
35
+ return torch.exp(x) / C
36
+
37
+
38
+ def spectral_normalize_torch(magnitudes):
39
+ output = dynamic_range_compression_torch(magnitudes)
40
+ return output
41
+
42
+
43
+ def spectral_de_normalize_torch(magnitudes):
44
+ output = dynamic_range_decompression_torch(magnitudes)
45
+ return output
46
+
47
+
48
+ mel_basis = {}
49
+ hann_window = {}
50
+
51
+
52
+ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
53
+ if torch.min(y) < -1.0:
54
+ print("min value is ", torch.min(y))
55
+ if torch.max(y) > 1.0:
56
+ print("max value is ", torch.max(y))
57
+
58
+ global mel_basis, hann_window # pylint: disable=global-statement
59
+ if fmax not in mel_basis:
60
+ mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
61
+ mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device)
62
+ hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
63
+
64
+ y = torch.nn.functional.pad(
65
+ y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
66
+ )
67
+ y = y.squeeze(1)
68
+
69
+ spec = torch.view_as_real(
70
+ torch.stft(
71
+ y,
72
+ n_fft,
73
+ hop_length=hop_size,
74
+ win_length=win_size,
75
+ window=hann_window[str(y.device)],
76
+ center=center,
77
+ pad_mode="reflect",
78
+ normalized=False,
79
+ onesided=True,
80
+ return_complex=True,
81
+ )
82
+ )
83
+
84
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
85
+
86
+ spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec)
87
+ spec = spectral_normalize_torch(spec)
88
+
89
+ return spec
90
+
91
+
92
+ def get_dataset_filelist(a):
93
+ with open(a.input_training_file, encoding="utf-8") as fi:
94
+ training_files = [
95
+ os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0
96
+ ]
97
+
98
+ with open(a.input_validation_file, encoding="utf-8") as fi:
99
+ validation_files = [
100
+ os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0
101
+ ]
102
+ return training_files, validation_files
103
+
104
+
105
+ class MelDataset(torch.utils.data.Dataset):
106
+ def __init__(
107
+ self,
108
+ training_files,
109
+ segment_size,
110
+ n_fft,
111
+ num_mels,
112
+ hop_size,
113
+ win_size,
114
+ sampling_rate,
115
+ fmin,
116
+ fmax,
117
+ split=True,
118
+ shuffle=True,
119
+ n_cache_reuse=1,
120
+ device=None,
121
+ fmax_loss=None,
122
+ fine_tuning=False,
123
+ base_mels_path=None,
124
+ ):
125
+ self.audio_files = training_files
126
+ random.seed(1234)
127
+ if shuffle:
128
+ random.shuffle(self.audio_files)
129
+ self.segment_size = segment_size
130
+ self.sampling_rate = sampling_rate
131
+ self.split = split
132
+ self.n_fft = n_fft
133
+ self.num_mels = num_mels
134
+ self.hop_size = hop_size
135
+ self.win_size = win_size
136
+ self.fmin = fmin
137
+ self.fmax = fmax
138
+ self.fmax_loss = fmax_loss
139
+ self.cached_wav = None
140
+ self.n_cache_reuse = n_cache_reuse
141
+ self._cache_ref_count = 0
142
+ self.device = device
143
+ self.fine_tuning = fine_tuning
144
+ self.base_mels_path = base_mels_path
145
+
146
+ def __getitem__(self, index):
147
+ filename = self.audio_files[index]
148
+ if self._cache_ref_count == 0:
149
+ audio, sampling_rate = load_wav(filename)
150
+ audio = audio / MAX_WAV_VALUE
151
+ if not self.fine_tuning:
152
+ audio = normalize(audio) * 0.95
153
+ self.cached_wav = audio
154
+ if sampling_rate != self.sampling_rate:
155
+ raise ValueError(f"{sampling_rate} SR doesn't match target {self.sampling_rate} SR")
156
+ self._cache_ref_count = self.n_cache_reuse
157
+ else:
158
+ audio = self.cached_wav
159
+ self._cache_ref_count -= 1
160
+
161
+ audio = torch.FloatTensor(audio)
162
+ audio = audio.unsqueeze(0)
163
+
164
+ if not self.fine_tuning:
165
+ if self.split:
166
+ if audio.size(1) >= self.segment_size:
167
+ max_audio_start = audio.size(1) - self.segment_size
168
+ audio_start = random.randint(0, max_audio_start)
169
+ audio = audio[:, audio_start : audio_start + self.segment_size]
170
+ else:
171
+ audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant")
172
+
173
+ mel = mel_spectrogram(
174
+ audio,
175
+ self.n_fft,
176
+ self.num_mels,
177
+ self.sampling_rate,
178
+ self.hop_size,
179
+ self.win_size,
180
+ self.fmin,
181
+ self.fmax,
182
+ center=False,
183
+ )
184
+ else:
185
+ mel = np.load(os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + ".npy"))
186
+ mel = torch.from_numpy(mel)
187
+
188
+ if len(mel.shape) < 3:
189
+ mel = mel.unsqueeze(0)
190
+
191
+ if self.split:
192
+ frames_per_seg = math.ceil(self.segment_size / self.hop_size)
193
+
194
+ if audio.size(1) >= self.segment_size:
195
+ mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
196
+ mel = mel[:, :, mel_start : mel_start + frames_per_seg]
197
+ audio = audio[:, mel_start * self.hop_size : (mel_start + frames_per_seg) * self.hop_size]
198
+ else:
199
+ mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), "constant")
200
+ audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant")
201
+
202
+ mel_loss = mel_spectrogram(
203
+ audio,
204
+ self.n_fft,
205
+ self.num_mels,
206
+ self.sampling_rate,
207
+ self.hop_size,
208
+ self.win_size,
209
+ self.fmin,
210
+ self.fmax_loss,
211
+ center=False,
212
+ )
213
+
214
+ return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
215
+
216
+ def __len__(self):
217
+ return len(self.audio_files)
scripts/process_prompt.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import torch
4
+ import torchaudio
5
+ import numpy as np
6
+ from frontend import CosyVoiceFrontEnd
7
+
8
+ def load_wav(wav, target_sr, min_sr=16000):
9
+ speech, sample_rate = torchaudio.load(wav, backend='soundfile')
10
+ speech = speech.mean(dim=0, keepdim=True)
11
+ if sample_rate != target_sr:
12
+ assert sample_rate >= min_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
13
+ speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
14
+ return speech
15
+
16
+ if __name__ == "__main__":
17
+
18
+ args = argparse.ArgumentParser()
19
+ args.add_argument('--model_dir', type=str, default="scripts/CosyVoice-BlankEN/", help="tokenizer configuration directionary")
20
+ args.add_argument('--wetext_dir', type=str, default="./pengzhendong/wetext", help="path to wetext")
21
+ args.add_argument('--sample_rate', type=int, default=24000, help="Sampling rate for prompt audio")
22
+ args.add_argument('--prompt_text', type=str, default="You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。", help="The text content of the prompt(reference) audio. Text or file path.")
23
+ args.add_argument('--prompt_speech', type=str, default="asset/zero_shot_prompt.wav", help="The path to prompt(reference) audio.")
24
+ args.add_argument('--output', type=str, default="prompt_files", help="Output data storage directory")
25
+ args = args.parse_args()
26
+
27
+ os.makedirs(args.output, exist_ok=True)
28
+
29
+ frontend = CosyVoiceFrontEnd(f"{args.model_dir}",
30
+ args.wetext_dir,
31
+ "../frontend-onnx/campplus.onnx",
32
+ "../frontend-onnx/speech_tokenizer_v3.onnx",
33
+ f"{args.model_dir}/spk2info.pt",
34
+ "all")
35
+
36
+ prompt_speech_16k = load_wav(args.prompt_speech, 16000)
37
+ zero_shot_spk_id = ""
38
+
39
+ if os.path.isfile(args.prompt_text):
40
+ with open(args.prompt_text, "r") as f:
41
+ prompt_text = f.read()
42
+ else:
43
+ prompt_text = args.prompt_text
44
+ print("prompt_text",prompt_text)
45
+ model_input = frontend.process_prompt( prompt_text, prompt_speech_16k, args.sample_rate, zero_shot_spk_id)
46
+
47
+ # model_input = {'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
48
+ # 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
49
+ # 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
50
+ # 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
51
+ # 'llm_embedding': embedding, 'flow_embedding': embedding}
52
+ print("prompt speech token size:", model_input["flow_prompt_speech_token"].shape)
53
+ assert model_input["flow_prompt_speech_token"].shape[1] >=75, f"speech_token length should >= 75, bug get {model_input['flow_prompt_speech_token'].shape[1]}"
54
+ for k, v in model_input.items():
55
+ if "_len" in k:
56
+ continue
57
+ shapes = [str(s) for s in v.shape]
58
+ shape_str = "_".join(shapes)
59
+ if v.dtype in (torch.int32, torch.int64):
60
+ np.savetxt(f"{args.output}/{k}.txt", v.detach().cpu().numpy().reshape(-1), fmt="%d", delimiter=",")
61
+ else:
62
+ np.savetxt(f"{args.output}/{k}.txt", v.detach().cpu().numpy().reshape(-1), delimiter=",")
scripts/requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ openai-whisper==20231117
2
+ transformers
3
+ gradio
4
+ onnxruntime
5
+ torch
6
+ torchaudio
7
+ inflect
8
+ wetext
scripts/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:747979631e813193436aabcff7c1c235d37de8097b71c563ec8b63b7a515c718
3
+ size 907395
scripts/tokenizer/tokenizer.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
+ from functools import lru_cache
4
+ from typing import Optional
5
+ import torch
6
+ from transformers import AutoTokenizer
7
+ from whisper.tokenizer import Tokenizer
8
+
9
+ import tiktoken
10
+
11
# Whisper-style language table: ISO-639-1(-ish) code -> lowercase English
# name.  Extends the upstream whisper list with project-specific entries
# ("minnan", "wuyu", "dialect") and mixed-code tags ("zh/en", "en/zh").
# The first `num_languages` keys, in insertion order, are registered as
# <|code|> special tokens by get_encoding() below.
LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
    "yue": "cantonese",
    "minnan": "minnan",
    "wuyu": "wuyu",
    "dialect": "dialect",
    "zh/en": "zh/en",
    "en/zh": "en/zh",
}

# language code lookup by name, with a few language aliases
# (inverse of LANGUAGES plus alternative names, e.g. "mandarin" -> "zh").
TO_LANGUAGE_CODE = {
    **{language: code for code, language in LANGUAGES.items()},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
    "mandarin": "zh",
}

# Audio-event tags registered as <|tag|> special tokens by get_encoding();
# several come in open/close pairs (e.g. "Speech" / "/Speech").
AUDIO_EVENT = {
    "ASR": "ASR",
    "AED": "AED",
    "SER": "SER",
    "Speech": "Speech",
    "/Speech": "/Speech",
    "BGM": "BGM",
    "/BGM": "/BGM",
    "Laughter": "Laughter",
    "/Laughter": "/Laughter",
    "Applause": "Applause",
    "/Applause": "/Applause",
}

# Emotion tags registered as <|TAG|> special tokens by get_encoding().
EMOTION = {
    "HAPPY": "HAPPY",
    "SAD": "SAD",
    "ANGRY": "ANGRY",
    "NEUTRAL": "NEUTRAL",
}

# TTS control tokens registered as <|TTS/x|> special tokens; the generated
# TTS/SP01 .. TTS/SP13 entries are numbered SP tags — presumably
# speaker/style selectors, confirm against the model card.
TTS_Vocal_Token = {
    "TTS/B": "TTS/B",
    "TTS/O": "TTS/O",
    "TTS/Q": "TTS/Q",
    "TTS/A": "TTS/A",
    "TTS/CO": "TTS/CO",
    "TTS/CL": "TTS/CL",
    "TTS/H": "TTS/H",
    **{f"TTS/SP{i:02d}": f"TTS/SP{i:02d}" for i in range(1, 14)}
}
167
+
168
+
169
@lru_cache(maxsize=None)
def get_encoding(name: str = "gpt2", num_languages: int = 99):
    """Build (and cache) the tiktoken Encoding for a bundled vocab file.

    Args:
        name: base name of the ``.tiktoken`` vocab file under ``assets/``.
        num_languages: number of entries from LANGUAGES (in insertion order)
            to register as ``<|code|>`` special tokens.

    Returns:
        A ``tiktoken.Encoding`` holding the base BPE ranks plus the project's
        special tokens (language/audio-event/emotion/TTS tags, timestamps).
    """
    vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
    # Each vocab line is "<base64 token> <rank>".  Use a context manager so
    # the file handle is closed deterministically, and skip blank lines
    # explicitly: the previous `if line` filter was ineffective because a
    # blank line still contains its newline (truthy) and would crash the
    # two-value unpack.
    ranks = {}
    with open(vocab_path) as vocab_file:
        for line in vocab_file:
            fields = line.split()
            if not fields:
                continue
            token, rank = fields
            ranks[base64.b64decode(token)] = int(rank)
    n_vocab = len(ranks)
    special_tokens = {}

    specials = [
        "<|endoftext|>",
        "<|startoftranscript|>",
        *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]],
        *[f"<|{audio_event}|>" for audio_event in list(AUDIO_EVENT.keys())],
        *[f"<|{emotion}|>" for emotion in list(EMOTION.keys())],
        "<|translate|>",
        "<|transcribe|>",
        "<|startoflm|>",
        "<|startofprev|>",
        "<|nospeech|>",
        "<|notimestamps|>",
        *[f"<|SPECIAL_TOKEN_{i}|>" for i in range(1, 31)],  # register special tokens for ASR
        *[f"<|{tts}|>" for tts in list(TTS_Vocal_Token.keys())],  # register special tokens for TTS
        *[f"<|{i * 0.02:.2f}|>" for i in range(1501)],  # timestamps <|0.00|> .. <|30.00|>
    ]

    # Special tokens are appended after the base vocab, each taking the next id.
    for token in specials:
        special_tokens[token] = n_vocab
        n_vocab += 1

    return tiktoken.Encoding(
        name=os.path.basename(vocab_path),
        explicit_n_vocab=n_vocab,
        pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        mergeable_ranks=ranks,
        special_tokens=special_tokens,
    )
207
+
208
+
209
@lru_cache(maxsize=None)
def get_tokenizer(
    multilingual: bool,
    *,
    num_languages: int = 99,
    language: Optional[str] = None,
    task: Optional[str] = None,  # Literal["transcribe", "translate", None]
) -> Tokenizer:
    """Return a (cached) whisper Tokenizer for the requested configuration.

    Args:
        multilingual: if True use the "multilingual_zh_ja_yue_char_del"
            vocab; otherwise the monolingual "gpt2" vocab.
        num_languages: how many LANGUAGES entries become special tokens.
        language: language code or full name/alias (normalized through
            TO_LANGUAGE_CODE); ignored for the monolingual vocab.
        task: task tag, defaulting to "transcribe" for the multilingual
            vocab; ignored for the monolingual one.

    Raises:
        ValueError: if `language` is neither a known code nor a known alias.
    """
    if language is not None:
        language = language.lower()
        if language not in LANGUAGES:
            # accept full names and aliases, e.g. "mandarin" -> "zh"
            if language not in TO_LANGUAGE_CODE:
                raise ValueError(f"Unsupported language: {language}")
            language = TO_LANGUAGE_CODE[language]

    if multilingual:
        encoding_name = "multilingual_zh_ja_yue_char_del"
        language = language or "en"
        task = task or "transcribe"
    else:
        # monolingual English vocab: language/task tags do not apply
        encoding_name, language, task = "gpt2", None, None

    encoding = get_encoding(name=encoding_name, num_languages=num_languages)

    return Tokenizer(
        encoding=encoding, num_languages=num_languages, language=language, task=task
    )
239
+
240
+
241
class CosyVoice2Tokenizer():
    """Thin wrapper around a Qwen AutoTokenizer for CosyVoice 2.

    Registers the CosyVoice paralinguistic control markers (e.g. [breath],
    [laughter]) as additional special tokens and exposes a plain
    list-of-ints encode / decode interface.
    """

    def __init__(self, token_path, skip_special_tokens=True):
        super().__init__()
        # NOTE: non-chat model, all these special tokens keep randomly initialized.
        special_tokens = {
            'eos_token': '<|endoftext|>',
            'pad_token': '<|endoftext|>',
            'additional_special_tokens': [
                '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
                '[breath]', '<strong>', '</strong>', '[noise]',
                '[laughter]', '[cough]', '[clucking]', '[accent]',
                '[quick_breath]',
                "<laughter>", "</laughter>",
                "[hissing]", "[sigh]", "[vocalized-noise]",
                "[lipsmack]", "[mn]"
            ]
        }
        self.special_tokens = special_tokens
        self.tokenizer = AutoTokenizer.from_pretrained(token_path)
        self.tokenizer.add_special_tokens(special_tokens)
        self.skip_special_tokens = skip_special_tokens

    def encode(self, text, **kwargs):
        """Tokenize `text`, returning its token ids as a plain Python list."""
        batch = self.tokenizer([text], return_tensors="pt")
        return batch["input_ids"][0].cpu().tolist()

    def decode(self, tokens):
        """Decode a list of token ids back into a string."""
        ids = torch.tensor(tokens, dtype=torch.int64)
        return self.tokenizer.batch_decode(
            [ids], skip_special_tokens=self.skip_special_tokens)[0]
272
+
273
+
274
class CosyVoice3Tokenizer(CosyVoice2Tokenizer):
    """Qwen tokenizer wrapper for CosyVoice 3.

    Same encode/decode contract as CosyVoice2Tokenizer, but registers a much
    larger special-token set: besides the CosyVoice2 control markers it adds
    "<|endofsystem|>", an ARPAbet phone set (e.g. "[AA1]", "[ZH]") and a
    pinyin initial/final set with tone-marked vowels (e.g. "[zh]", "[iǎng]").
    """

    def __init__(self, token_path, skip_special_tokens=True):
        # NOTE: non-chat model, all these special tokens keep randomly initialized.
        special_tokens = {
            'eos_token': '<|endoftext|>',
            'pad_token': '<|endoftext|>',
            'additional_special_tokens': [
                '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
                '[breath]', '<strong>', '</strong>', '[noise]',
                '[laughter]', '[cough]', '[clucking]', '[accent]',
                '[quick_breath]',
                "<laughter>", "</laughter>",
                "[hissing]", "[sigh]", "[vocalized-noise]",
                "[lipsmack]", "[mn]", "<|endofsystem|>",
                "[AA]", "[AA0]", "[AA1]", "[AA2]", "[AE]", "[AE0]", "[AE1]", "[AE2]", "[AH]", "[AH0]", "[AH1]", "[AH2]",
                "[AO]", "[AO0]", "[AO1]", "[AO2]", "[AW]", "[AW0]", "[AW1]", "[AW2]", "[AY]", "[AY0]", "[AY1]", "[AY2]",
                "[B]", "[CH]", "[D]", "[DH]", "[EH]", "[EH0]", "[EH1]", "[EH2]", "[ER]", "[ER0]", "[ER1]", "[ER2]", "[EY]",
                "[EY0]", "[EY1]", "[EY2]", "[F]", "[G]", "[HH]", "[IH]", "[IH0]", "[IH1]", "[IH2]", "[IY]", "[IY0]", "[IY1]",
                "[IY2]", "[JH]", "[K]", "[L]", "[M]", "[N]", "[NG]", "[OW]", "[OW0]", "[OW1]", "[OW2]", "[OY]", "[OY0]",
                "[OY1]", "[OY2]", "[P]", "[R]", "[S]", "[SH]", "[T]", "[TH]", "[UH]", "[UH0]", "[UH1]", "[UH2]", "[UW]",
                "[UW0]", "[UW1]", "[UW2]", "[V]", "[W]", "[Y]", "[Z]", "[ZH]",
                "[a]", "[ai]", "[an]", "[ang]", "[ao]", "[b]", "[c]", "[ch]", "[d]", "[e]", "[ei]", "[en]", "[eng]", "[f]",
                "[g]", "[h]", "[i]", "[ian]", "[in]", "[ing]", "[iu]", "[ià]", "[iàn]", "[iàng]", "[iào]", "[iá]", "[ián]",
                "[iáng]", "[iáo]", "[iè]", "[ié]", "[iòng]", "[ióng]", "[iù]", "[iú]", "[iā]", "[iān]", "[iāng]", "[iāo]",
                "[iē]", "[iě]", "[iōng]", "[iū]", "[iǎ]", "[iǎn]", "[iǎng]", "[iǎo]", "[iǒng]", "[iǔ]", "[j]", "[k]", "[l]",
                "[m]", "[n]", "[o]", "[ong]", "[ou]", "[p]", "[q]", "[r]", "[s]", "[sh]", "[t]", "[u]", "[uang]", "[ue]",
                "[un]", "[uo]", "[uà]", "[uài]", "[uàn]", "[uàng]", "[uá]", "[uái]", "[uán]", "[uáng]", "[uè]", "[ué]", "[uì]",
                "[uí]", "[uò]", "[uó]", "[uā]", "[uāi]", "[uān]", "[uāng]", "[uē]", "[uě]", "[uī]", "[uō]", "[uǎ]", "[uǎi]",
                "[uǎn]", "[uǎng]", "[uǐ]", "[uǒ]", "[vè]", "[w]", "[x]", "[y]", "[z]", "[zh]", "[à]", "[ài]", "[àn]", "[àng]",
                # BUGFIX: the checked-in source contained a mojibake token
                # "[\ufffd\ufffdng]" in the next line; restored to "[áng]",
                # which completes the á-tone series ([á], [ái], [án], [áng],
                # [áo]).  TODO(review): confirm against the upstream
                # CosyVoice3 vocabulary.
                "[ào]", "[á]", "[ái]", "[án]", "[áng]", "[áo]", "[è]", "[èi]", "[èn]", "[èng]", "[èr]", "[é]", "[éi]", "[én]",
                "[éng]", "[ér]", "[ì]", "[ìn]", "[ìng]", "[í]", "[ín]", "[íng]", "[ò]", "[òng]", "[òu]", "[ó]", "[óng]", "[óu]",
                "[ù]", "[ùn]", "[ú]", "[ún]", "[ā]", "[āi]", "[ān]", "[āng]", "[āo]", "[ē]", "[ēi]", "[ēn]", "[ēng]", "[ě]",
                "[ěi]", "[ěn]", "[ěng]", "[ěr]", "[ī]", "[īn]", "[īng]", "[ō]", "[ōng]", "[ōu]", "[ū]", "[ūn]", "[ǎ]", "[ǎi]",
                "[ǎn]", "[ǎng]", "[ǎo]", "[ǐ]", "[ǐn]", "[ǐng]", "[ǒ]", "[ǒng]", "[ǒu]", "[ǔ]", "[ǔn]", "[ǘ]", "[ǚ]", "[ǜ]"
            ]
        }
        self.special_tokens = special_tokens
        self.tokenizer = AutoTokenizer.from_pretrained(token_path)
        self.tokenizer.add_special_tokens(special_tokens)
        self.skip_special_tokens = skip_special_tokens
314
+
315
+
316
@lru_cache(maxsize=None)
def get_qwen_tokenizer(
    token_path: str,
    skip_special_tokens: bool,
    version: str = 'cosyvoice2'
):
    """Return a cached CosyVoice tokenizer wrapper for `version`.

    Args:
        token_path: path/identifier passed to AutoTokenizer.from_pretrained.
        skip_special_tokens: whether decode() drops special tokens.
        version: 'cosyvoice2' or 'cosyvoice3'.

    Raises:
        ValueError: if `version` is not one of the supported values.
    """
    if version == 'cosyvoice2':
        return CosyVoice2Tokenizer(token_path=token_path, skip_special_tokens=skip_special_tokens)
    elif version == 'cosyvoice3':
        return CosyVoice3Tokenizer(token_path=token_path, skip_special_tokens=skip_special_tokens)
    else:
        # Name the bad value instead of raising a bare ValueError.
        raise ValueError(f"Unsupported tokenizer version: {version!r} "
                         f"(expected 'cosyvoice2' or 'cosyvoice3')")
token2wav-axmodels/flow.input_embedding.float16.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a7ceb5ede1cac3bdcec37aa034a694821a735087890c2104da238bf1e921bc6
3
+ size 1049760