HY-2012 committed on
Commit 0c354cf · verified · 1 Parent(s): 8186f6f

First commit

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the rest.

Files changed (50)
  1. .gitattributes +68 -0
  2. README.md +218 -3
  3. ax_model/.gitattributes +2 -0
  4. ax_model/auto.npy +3 -0
  5. ax_model/chn_jpn_yue_eng_ko_spectok.bpe.model +3 -0
  6. ax_model/event_emo.npy +3 -0
  7. ax_model/sensevoice.axmodel +3 -0
  8. ax_model/sensevoice/am.mvn +8 -0
  9. ax_model/sensevoice/config.yaml +97 -0
  10. ax_model/vad/am.mvn +8 -0
  11. ax_model/vad/config.yaml +56 -0
  12. ax_model/withitn.npy +3 -0
  13. ax_spoken_communication_demo.py +719 -0
  14. config.json +0 -0
  15. input_question/Q1.wav +3 -0
  16. input_question/Q2.wav +3 -0
  17. input_question/Q3.wav +3 -0
  18. libaxllm/main_api_ax650 +3 -0
  19. libaxllm/main_api_axcl_aarch64 +3 -0
  20. libaxllm/main_api_axcl_x86 +3 -0
  21. libaxllm/post_config.json +14 -0
  22. libaxllm/qwen2.5_tokenizer/merges.txt +0 -0
  23. libaxllm/qwen2.5_tokenizer/tokenizer.json +0 -0
  24. libaxllm/qwen2.5_tokenizer/tokenizer_config.json +207 -0
  25. libaxllm/qwen2.5_tokenizer/vocab.json +0 -0
  26. libaxllm/qwen2.5_tokenizer_uid.py +189 -0
  27. libaxllm/run_qwen2.5_1.5b_ctx_ax650_api.sh +15 -0
  28. libaxllm/run_qwen2.5_1.5b_ctx_axcl_aarch64_api.sh +13 -0
  29. libaxllm/run_qwen2.5_1.5b_ctx_axcl_x86_api.sh +13 -0
  30. libmelotts/models/decoder-en.axmodel +3 -0
  31. libmelotts/models/decoder-zh.axmodel +3 -0
  32. libmelotts/models/encoder-en.onnx +3 -0
  33. libmelotts/models/encoder-zh.onnx +3 -0
  34. libmelotts/models/g-en.bin +3 -0
  35. libmelotts/models/g-jp.bin +3 -0
  36. libmelotts/models/g-zh_mix_en.bin +3 -0
  37. libmelotts/models/lexicon.txt +0 -0
  38. libmelotts/models/tokens.txt +112 -0
  39. libmelotts/python/split_utils.py +173 -0
  40. libmelotts/python/symbols.py +1237 -0
  41. libmelotts/python/text/__init__.py +35 -0
  42. libmelotts/python/text/bert-base-multilingual-uncased/special_tokens_map.json +7 -0
  43. libmelotts/python/text/bert-base-multilingual-uncased/tokenizer.json +0 -0
  44. libmelotts/python/text/bert-base-multilingual-uncased/tokenizer_config.json +13 -0
  45. libmelotts/python/text/bert-base-multilingual-uncased/vocab.txt +0 -0
  46. libmelotts/python/text/bert-base-uncased/special_tokens_map.json +7 -0
  47. libmelotts/python/text/bert-base-uncased/tokenizer.json +0 -0
  48. libmelotts/python/text/bert-base-uncased/tokenizer_config.json +13 -0
  49. libmelotts/python/text/bert-base-uncased/vocab.txt +0 -0
  50. libmelotts/python/text/chinese.py +198 -0
.gitattributes CHANGED
@@ -33,3 +33,71 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ ax_model/sensevoice.axmodel filter=lfs diff=lfs merge=lfs -text
+ libmelotts/install/libonnxruntime.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/install/libonnxruntime.so.1.14.0 filter=lfs diff=lfs merge=lfs -text
+ libmelotts/install/melotts filter=lfs diff=lfs merge=lfs -text
+ libmelotts/models/decoder-en.axmodel filter=lfs diff=lfs merge=lfs -text
+ libmelotts/models/decoder-zh.axmodel filter=lfs diff=lfs merge=lfs -text
+ libtranslate/libax_translate.so filter=lfs diff=lfs merge=lfs -text
+ libtranslate/libsentencepiece.so.0 filter=lfs diff=lfs merge=lfs -text
+ libtranslate/opus-mt-en-zh/source.spm filter=lfs diff=lfs merge=lfs -text
+ libtranslate/opus-mt-en-zh/target.spm filter=lfs diff=lfs merge=lfs -text
+ libtranslate/opus-mt-en-zh.axmodel filter=lfs diff=lfs merge=lfs -text
+ vad.axmodel filter=lfs diff=lfs merge=lfs -text
+ ax_model/vad.axmodel filter=lfs diff=lfs merge=lfs -text
+ main_api_ax650 filter=lfs diff=lfs merge=lfs -text
+ libaxllm/main_api_ax650 filter=lfs diff=lfs merge=lfs -text
+ wav/zh.wav filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/libaxcl/lib/libaxcl_comm.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/libaxcl/lib/libaxcl_host_proto.a filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/libaxcl/lib/libaxcl_ive.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/libaxcl/lib/libaxcl_ivps.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/libaxcl/lib/libaxcl_lite.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/libaxcl/lib/libaxcl_npu.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/libaxcl/lib/libaxcl_pkg.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/libaxcl/lib/libaxcl_rt.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/libaxcl/lib/libaxcl_skel.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/libaxcl/lib/libaxcl_sys.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/libaxcl/lib/libaxcl_vdec.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/libaxcl/lib/libaxcl_venc.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/libaxcl/lib/libspdlog.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/onnxruntime_aarch64/lib/libonnxruntime.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/onnxruntime_aarch64/lib/libonnxruntime.so.1.14.0 filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/onnxruntime_x86/lib/libonnxruntime.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/onnxruntime_x86/lib/libonnxruntime.so.1.14.1 filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/onnxruntime_x86_ori/lib/libonnxruntime.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/onnxruntime_x86_ori/lib/libonnxruntime.so-- filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/onnxruntime_x86_ori/lib/libonnxruntime.so.1 filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/onnxruntime_x86_ori/lib/libonnxruntime.so.1.14.1 filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/3rdparty/onnxruntime_x86_ori/lib/libonnxruntime.so.1.21.0-- filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/install/melotts filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_x86/models/decoder-zh.axmodel filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/libaxcl/lib/libaxcl_comm.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/libaxcl/lib/libaxcl_host_proto.a filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/libaxcl/lib/libaxcl_ive.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/libaxcl/lib/libaxcl_ivps.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/libaxcl/lib/libaxcl_lite.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/libaxcl/lib/libaxcl_npu.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/libaxcl/lib/libaxcl_pkg.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/libaxcl/lib/libaxcl_rt.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/libaxcl/lib/libaxcl_skel.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/libaxcl/lib/libaxcl_sys.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/libaxcl/lib/libaxcl_vdec.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/libaxcl/lib/libaxcl_venc.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/libaxcl/lib/libspdlog.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/onnxruntime_aarch64/lib/libonnxruntime.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/onnxruntime_aarch64/lib/libonnxruntime.so.1.14.0 filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/onnxruntime_x86/lib/libonnxruntime.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/onnxruntime_x86/lib/libonnxruntime.so.1 filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/3rdparty/onnxruntime_x86/lib/libonnxruntime.so.1.21.0 filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/install/melotts filter=lfs diff=lfs merge=lfs -text
+ libmelotts/tts_aarch64/models/decoder-zh.axmodel filter=lfs diff=lfs merge=lfs -text
+ libaxllm/main_api_axcl_x86 filter=lfs diff=lfs merge=lfs -text
+ libaxllm/main_api_axcl_aarch64 filter=lfs diff=lfs merge=lfs -text
+ libmelotts/text/fr_phonemizer/example_ipa.txt filter=lfs diff=lfs merge=lfs -text
+ libmelotts/python/text/fr_phonemizer/example_ipa.txt filter=lfs diff=lfs merge=lfs -text
+ wav/en_6mins.wav filter=lfs diff=lfs merge=lfs -text
+ input_question/Q1.wav filter=lfs diff=lfs merge=lfs -text
+ input_question/Q2.wav filter=lfs diff=lfs merge=lfs -text
+ input_question/Q3.wav filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,218 @@
- ---
- license: mit
- ---
+ ---
+ license: mit
+ language:
+ - en
+ - zh
+ pipeline_tag: audio-to-audio
+ base_model:
+ - FunAudioLLM/SenseVoiceSmall
+ - qwen2.5
+ - MeloTTS
+ tags:
+ - VAD
+ - ASR
+ - LLM
+ - TTS
+ ---
+
+
+ # Spoken-Communication.axera
+
+ A spoken-communication demo on Axera.
+
+ - [x] Python example
+ - [ ] C++ example
+
+ ## Conversion tool links
+
+ If you are interested in model conversion, you can export the axmodel yourself from the original repos. How to convert from ONNX to axmodel:
+ - [ASR](https://github.com/AXERA-TECH/3D-Speaker-MT.axera/tree/main/model_convert)
+ - [MeloTTS](https://github.com/ml-inory/melotts.axera/tree/main/model_convert)
+
+ ## Supported platforms
+
+ - AX650N
+
+ ## Features
+
+ Spoken (voice-to-voice) communication
+
+ ## Pipeline components
+
+ - [ASR](https://github.com/AXERA-TECH/3D-Speaker-MT.axera/tree/main)
+ - [LLM](https://github.com/AXERA-TECH/ax-llm/tree/ax-context): follow that repo to build the library files and save them into libaxllm
+ - [MeloTTS](https://github.com/ml-inory/melotts.axera/tree/main/python)
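+
+ These three stages are chained by the `SpeechTranslationPipeline` class in `ax_spoken_communication_demo.py`. A minimal usage sketch (not one of the shipped entry points; the paths and API URL are placeholders, and it assumes the script's `main()` only runs under `if __name__ == "__main__"`):
+
+ ```python
+ # Hypothetical driver for the pipeline class defined in ax_spoken_communication_demo.py.
+ import librosa
+ from ax_spoken_communication_demo import (
+     SpeechTranslationPipeline, TTS_MODEL_DIR, TTS_MODEL_FILES)
+
+ pipe = SpeechTranslationPipeline(TTS_MODEL_DIR, TTS_MODEL_FILES,
+                                  qwen_api_url="http://127.0.0.1:8000")  # placeholder URL
+ speech, fs = librosa.load("input_question/Q1.wav", sr=16000)  # the ASR/VAD configs expect 16 kHz
+ result = pipe.full_pipeline(speech, fs, output_dir="output_answer",
+                             output_tts="Q1_answer.wav")  # output_dir must already exist
+ print(result["original_text"])    # ASR transcript
+ print(result["translated_text"])  # Qwen answer
+ print(result["audio_path"])       # synthesized wav
+ ```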
+
+ ## On-board deployment
+
+ - AX650N devices come preinstalled with Ubuntu 22.04
+ - Log in to the AX650N board as root
+ - Connect the AX650N device to the internet so that it can run apt install, pip install, etc.
+ - Verified device: AX650N DEMO Board
+
+ ## Running the Python API
+
+ Verified on Python 3.10.
+
+ ### Pipeline: ASR + LLM (Qwen) + MeloTTS
+
+ Supports running both on the board and in compute-card (AXCL) mode.
+
+ ### Project download
+ ```
+ git clone https://huggingface.co/AXERA-TECH/Spoken-Communication.axera
+ # or
+ hf download AXERA-TECH/Spoken-Communication.axera --local-dir Spoken-Communication.axera
+
+ cd Spoken-Communication.axera
+ ```
+
+ The project directory is laid out as follows:
+ ```
+ .
+ |-- README.md
+ |-- ax_model
+ |-- ax_spoken_communication_demo.py
+ |-- config.json
+ |-- libaxllm
+ |-- libmelotts
+ |-- model.py
+ |-- requirements.txt
+ |-- utils
+ `-- input_question
+ ```
+
+ ### Step-by-step
+
+ **On-board demo**
+
+ 1. Install the dependencies
+
+ ```
+ # 1) If axengine is not in your environment, download and install it (any location):
+ hf download AXERA-TECH/PyAXEngine --local-dir PyAXEngine
+ cd PyAXEngine
+ pip3 install axengine-0.1.3-py3-none-any.whl
+
+ # 2) Install the project requirements:
+ cd Spoken-Communication.axera
+ pip3 install -r requirements.txt
+
+ # 3) Install espeak:
+ apt install espeak    # or: sudo apt install espeak
+ ```
+
+ 2. Download the model
+
+ Taking Qwen2.5-1.5B as an example ([download page](https://huggingface.co/AXERA-TECH/Qwen2.5-1.5B-Instruct/tree/main/qwen2.5-1.5b-ctx-ax650)):
+ ```
+ hf download AXERA-TECH/Qwen2.5-1.5B-Instruct --local-dir libaxllm --include qwen2.5-1.5b-ctx-ax650/*
+ ```
+ The model is downloaded into the libaxllm folder.
+
+ 3. Run the following commands on the board
+
+ ```
+ # 1) Run the Qwen API
+ cd libaxllm
+
+ # Start the context-aware tokenizer server
+ python3 qwen2.5_tokenizer_uid.py
+
+ # Run the API
+ sh run_qwen2.5_1.5b_ctx_ax650_api.sh
+
+ # 2) Run the on-board pipeline demo
+ cd ..
+ python3 ax_spoken_communication_demo.py --audio_dir input_question --output_dir output_answer --api_url http://10.126.29.158:8000
+ ```
+
+ Run-time parameters:
+
+ | Parameter | Description |
+ |-------|------|
+ | `--audio_dir` | Input audio directory |
+ | `--api_url` | Qwen API service address (the server where the API is running) |
+ | `--output_dir` | Directory where results are saved |
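+
+ Under the hood, the demo's `QwenTranslationAPI` class talks to this service over two HTTP endpoints: a POST to `/api/generate` submits the prompt, then repeated GETs to `/api/generate_provider` collect response chunks until the JSON field `done` becomes true. A stripped-down sketch of that exchange (the URL is a placeholder; the payload fields mirror the demo, which additionally adds retries and an overall timeout):
+
+ ```python
+ # Minimal sketch of the Qwen API exchange used by ax_spoken_communication_demo.py.
+ import time
+ import requests
+
+ api_url = "http://127.0.0.1:8000"  # placeholder; use the host running the API script
+ payload = {"prompt": "回答(限制在100个字以内):你好",
+            "temperature": 0.1, "repetition_penalty": 1.0,
+            "top-p": 0.9, "top-k": 40, "max_new_tokens": 512}
+ requests.post(f"{api_url}/api/generate", json=payload, timeout=30).raise_for_status()
+
+ answer = ""
+ while True:
+     data = requests.get(f"{api_url}/api/generate_provider", timeout=10).json()
+     answer += data.get("response", "")  # merge the streamed chunks
+     if data.get("done", False):
+         break
+     time.sleep(0.05)
+ print(answer)
+ ```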
+
+ Outputs:
+ 1. A wav answer file corresponding to each input audio file.
+ 2. The recognition results are saved to a txt file -> "output_answer/processing_summary.txt", for example:
+ ```
+ Batch processing summary
+ ==================================================
+
+ File 1: Q1.wav
+ Recognized text: 人工智能和人类智能最本质的区别是什么?。
+ Answer: 人工智能和人类智能最本质的区别在于,人工智能是基于算法和数据进行学习和决策的机器智能,而人类智能是基于经验和直觉进行思考和决策的生物智能。
+ Synthesized audio: Q1_answer.wav
+ Processing time: 8.22 s
+ Audio duration: 15.19 s
+ RTF: 0.54
+ --------------------------------------------------
+ File 2: Q2.wav
+ Recognized text: 人工智能没有思想,为什么他能创作出震撼人心的艺术?。
+ Answer: 人工智能创作艺术是因为它可以通过算法和数据进行学习和分析,理解艺术作品的风格、情感和意义,然后通过生成模型进行创作。这与人类艺术家创作艺术的灵感、经验和直觉不同,但人工智能在某些领域已经表现出超越人类的能力。
+ Synthesized audio: Q2_answer.wav
+ Processing time: 9.43 s
+ Audio duration: 23.68 s
+ RTF: 0.40
+ --------------------------------------------------
+ File 3: Q3.wav
+ Recognized text: 人工智能最终会统治人类吗?。
+ Answer: 人工智能的发展可能会对人类社会产生重大影响,但目前来看,人工智能尚未达到能够统治人类的程度。人工智能主要是在特定任务上表现出色,如数据分析、图像识别等,但在决策、伦理和情感理解等方面仍存在局限。
+ Synthesized audio: Q3_answer.wav
+ Processing time: 8.86 s
+ Audio duration: 22.62 s
+ RTF: 0.39
+ --------------------------------------------------
+
+ Total: 3 files
+ Total processing time: 26.53 s
+ ```
+
+ 4. Latency
+
+ AX650N
+
+ RTF (processing time divided by output audio duration) is roughly 0.4, as in the example above (e.g. Q1: 8.22 s / 15.19 s ≈ 0.54).
+
+
+ **Compute-card (AXCL) demo**
+
+ The steps are largely the same as for the on-board demo. Taking the aarch64 environment as an example:
+ ```
+ # 1) Run the Qwen API
+ cd libaxllm
+
+ # Start the context-aware tokenizer server
+ python3 qwen2.5_tokenizer_uid.py
+
+ # Run the API for the target environment
+ sh run_qwen2.5_1.5b_ctx_axcl_aarch64_api.sh
+
+ # 2) Run the compute-card pipeline demo
+ cd ..
+ python3 ax_spoken_communication_demo.py --audio_dir input_question --api_url http://10.126.33.13:8000 --output_dir output
+ ```
+ The x86 environment follows the same steps.
+
+
+
+ ## References
+ - [sensevoice.axera](https://github.com/ml-inory/sensevoice.axera/tree/main)
+ - [3D-Speaker.axera](https://github.com/AXERA-TECH/3D-Speaker.axera/tree/master)
+ - [melotts.axera](https://github.com/ml-inory/melotts.axera/tree/main)
+ - [ax-llm](https://github.com/AXERA-TECH/ax-llm/tree/ax-context)
+
+
+ ## Technical discussion
+
+ - GitHub issues
+ - QQ group: 139953715
ax_model/.gitattributes ADDED
@@ -0,0 +1,2 @@
+ *.axmodel filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
ax_model/auto.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8d0997706b30274f7ff3b157ca90df50b7ed8ced35091a0231700355d5ee1374
+ size 2368
ax_model/chn_jpn_yue_eng_ko_spectok.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa87f86064c3730d799ddf7af3c04659151102cba548bce325cf06ba4da4e6a8
+ size 377341
ax_model/event_emo.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1d22e3df5d192fdc3e73e368a2cb576975a5a43a114a8432a91c036adf8e2263
+ size 4608
ax_model/sensevoice.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b64a36fa15e75ab5e3b75f18ae87a058970cff76219407e503b54fb53dd8e38
+ size 262170623
ax_model/sensevoice/am.mvn ADDED
@@ -0,0 +1,8 @@
+ <Nnet>
+ <Splice> 560 560
+ [ 0 ]
+ <AddShift> 560 560
+ <LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 
-13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
+ <Rescale> 560 560
+ <LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 
0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
+ </Nnet>
ax_model/sensevoice/config.yaml ADDED
@@ -0,0 +1,97 @@
+ encoder: SenseVoiceEncoderSmall
+ encoder_conf:
+     output_size: 512
+     attention_heads: 4
+     linear_units: 2048
+     num_blocks: 50
+     tp_blocks: 20
+     dropout_rate: 0.1
+     positional_dropout_rate: 0.1
+     attention_dropout_rate: 0.1
+     input_layer: pe
+     pos_enc_class: SinusoidalPositionEncoder
+     normalize_before: true
+     kernel_size: 11
+     sanm_shfit: 0
+     selfattention_layer_type: sanm
+
+
+ model: SenseVoiceSmall
+ model_conf:
+     length_normalized_loss: true
+     sos: 1
+     eos: 2
+     ignore_id: -1
+
+ tokenizer: SentencepiecesTokenizer
+ tokenizer_conf:
+     bpemodel: null
+     unk_symbol: <unk>
+     split_with_space: true
+
+ frontend: WavFrontend
+ frontend_conf:
+     fs: 16000
+     window: hamming
+     n_mels: 80
+     frame_length: 25
+     frame_shift: 10
+     lfr_m: 7
+     lfr_n: 6
+     cmvn_file: null
+
+
+ dataset: SenseVoiceCTCDataset
+ dataset_conf:
+     index_ds: IndexDSJsonl
+     batch_sampler: EspnetStyleBatchSampler
+     data_split_num: 32
+     batch_type: token
+     batch_size: 14000
+     max_token_length: 2000
+     min_token_length: 60
+     max_source_length: 2000
+     min_source_length: 60
+     max_target_length: 200
+     min_target_length: 0
+     shuffle: true
+     num_workers: 4
+     sos: ${model_conf.sos}
+     eos: ${model_conf.eos}
+     IndexDSJsonl: IndexDSJsonl
+     retry: 20
+
+ train_conf:
+     accum_grad: 1
+     grad_clip: 5
+     max_epoch: 20
+     keep_nbest_models: 10
+     avg_nbest_model: 10
+     log_interval: 100
+     resume: true
+     validate_interval: 10000
+     save_checkpoint_interval: 10000
+
+ optim: adamw
+ optim_conf:
+     lr: 0.00002
+ scheduler: warmuplr
+ scheduler_conf:
+     warmup_steps: 25000
+
+ specaug: SpecAugLFR
+ specaug_conf:
+     apply_time_warp: false
+     time_warp_window: 5
+     time_warp_mode: bicubic
+     apply_freq_mask: true
+     freq_mask_width_range:
+     - 0
+     - 30
+     lfr_rate: 6
+     num_freq_mask: 1
+     apply_time_mask: true
+     time_mask_width_range:
+     - 0
+     - 12
+     num_time_mask: 1
ax_model/vad/am.mvn ADDED
@@ -0,0 +1,8 @@
+ <Nnet>
+ <Splice> 400 400
+ [ 0 ]
+ <AddShift> 400 400
+ <LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 
-13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
+ <Rescale> 400 400
+ <LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 
0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
+ </Nnet>
ax_model/vad/config.yaml ADDED
@@ -0,0 +1,56 @@
+ frontend: WavFrontendOnline
+ frontend_conf:
+     fs: 16000
+     window: hamming
+     n_mels: 80
+     frame_length: 25
+     frame_shift: 10
+     dither: 0.0
+     lfr_m: 5
+     lfr_n: 1
+
+ model: FsmnVADStreaming
+ model_conf:
+     sample_rate: 16000
+     detect_mode: 1
+     snr_mode: 0
+     max_end_silence_time: 800
+     max_start_silence_time: 3000
+     do_start_point_detection: True
+     do_end_point_detection: True
+     window_size_ms: 200
+     sil_to_speech_time_thres: 150
+     speech_to_sil_time_thres: 150
+     speech_2_noise_ratio: 1.0
+     do_extend: 1
+     lookback_time_start_point: 200
+     lookahead_time_end_point: 100
+     max_single_segment_time: 60000
+     snr_thres: -100.0
+     noise_frame_num_used_for_snr: 100
+     decibel_thres: -100.0
+     speech_noise_thres: 0.6
+     fe_prior_thres: 0.0001
+     silence_pdf_num: 1
+     sil_pdf_ids: [0]
+     speech_noise_thresh_low: -0.1
+     speech_noise_thresh_high: 0.3
+     output_frame_probs: False
+     frame_in_ms: 10
+     frame_length_ms: 25
+
+ encoder: FSMN
+ encoder_conf:
+     input_dim: 400
+     input_affine_dim: 140
+     fsmn_layers: 4
+     linear_dim: 250
+     proj_dim: 128
+     lorder: 20
+     rorder: 0
+     lstride: 1
+     rstride: 0
+     output_affine_dim: 140
+     output_dim: 248
+
+
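A quick consistency note on the two `am.mvn` files above: with FunASR's low-frame-rate (LFR) feature stacking, the CMVN feature dimension is n_mels × lfr_m, which is why the SenseVoice statistics are 560-dimensional and the VAD statistics are 400-dimensional. A sanity check, assuming those standard LFR semantics:

```python
# Feature dims implied by the configs vs. the <Splice>/<AddShift> sizes in am.mvn.
n_mels = 80
assert n_mels * 7 == 560  # sensevoice/config.yaml: lfr_m = 7 -> sensevoice/am.mvn
assert n_mels * 5 == 400  # vad/config.yaml:        lfr_m = 5 -> vad/am.mvn
```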
ax_model/withitn.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:39bf02586f59237894fc2918ab2db4f12ec3c084c41465718832fbd7646ea729
+ size 2368
ax_spoken_communication_demo.py ADDED
@@ -0,0 +1,719 @@
+ import os
+ import time
+ import librosa
+ import torch
+ import argparse
+ import soundfile as sf
+ import cn2an
+ import requests
+ import re
+ import numpy as np
+ import onnxruntime as ort
+ import axengine as axe
+
+ # SenseVoice-related imports
+ from model import SinusoidalPositionEncoder
+ from utils.ax_model_bin import AX_SenseVoiceSmall
+ from utils.ax_vad_bin import AX_Fsmn_vad
+ from utils.vad_utils import merge_vad
+ from funasr.tokenizer.sentencepiece_tokenizer import SentencepiecesTokenizer
+
+ # MeloTTS-related imports
+ from libmelotts.python.split_utils import split_sentence
+ from libmelotts.python.text import cleaned_text_to_sequence
+ from libmelotts.python.text.cleaner import clean_text
+ from libmelotts.python.symbols import LANG_TO_SYMBOL_MAP
+
+ # Configuration
+ # TTS parameters
+ TTS_MODEL_DIR = "libmelotts/models"
+ TTS_MODEL_FILES = {
+     "g": "g-zh_mix_en.bin",
+     "encoder": "encoder-zh.onnx",
+     "decoder": "decoder-zh.axmodel"
+ }
+
+ # Qwen LLM API parameters
+ QWEN_API_URL = ""  # API service address, e.g. http://10.126.29.158:8000
+
+
+ # TTS helper functions (ported from melotts.py)
+ def intersperse(lst, item):
+     # Insert `item` before, between and after the elements of `lst`.
+     result = [item] * (len(lst) * 2 + 1)
+     result[1::2] = lst
+     return result
+
+ # Handle characters that cannot be recognized
+ def get_text_for_tts_infer(text, language_str, symbol_to_id=None):
+     """Phoneme processing: make sure all arrays end up with consistent lengths."""
+     try:
+         norm_text, phone, tone, word2ph = clean_text(text, language_str)
+
+         # Special phonemes are mapped directly to the empty string
+         phone_mapping = {
+             'ɛ': '', 'æ': '', 'ʌ': '', 'ʊ': '', 'ɔ': '', 'ɪ': '', 'ɝ': '', 'ɚ': '', 'ɑ': '',
+             'ʒ': '', 'θ': '', 'ð': '', 'ŋ': '', 'ʃ': '', 'ʧ': '', 'ʤ': '', 'ː': '', 'ˈ': '',
+             'ˌ': '', 'ʰ': '', 'ʲ': '', 'ʷ': '', 'ʔ': '', 'ɾ': '', 'ɹ': '', 'ɫ': '', 'ɡ': '',
+         }
+
+         # Process phone and tone together so their lengths stay in sync
+         processed_phone = []
+         processed_tone = []
+         removed_symbols = set()
+
+         for p, t in zip(phone, tone):
+             if p in phone_mapping:
+                 # Special phoneme: drop it together with its tone
+                 removed_symbols.add(p)
+             elif p in symbol_to_id:
+                 # Known phoneme: keep it together with its tone
+                 processed_phone.append(p)
+                 processed_tone.append(t)
+             else:
+                 # Drop any other unknown phoneme as well
+                 removed_symbols.add(p)
+
+         # Report the dropped phonemes
+         if removed_symbols:
+             print(f"[Phoneme filter] Dropped {len(removed_symbols)} special phonemes: {sorted(removed_symbols)}")
+             print(f"[Phoneme filter] Phoneme sequence length after filtering: {len(processed_phone)}")
+             print(f"[Phoneme filter] Tone sequence length after filtering: {len(processed_tone)}")
+
+         # If no phoneme survived, fall back to default phonemes
+         if not processed_phone:
+             print("[Warning] No valid phonemes; falling back to default Chinese phonemes")
+             processed_phone = ['ni', 'hao']
+             processed_tone = ['1', '3']
+             word2ph = [1, 1]
+
+         # Make sure word2ph matches the filtered phoneme sequence
+         if len(processed_phone) != len(phone):
+             print(f"[Warning] Phoneme sequence length changed: {len(phone)} -> {len(processed_phone)}")
+             # Simple fix: recompute word2ph
+             word2ph = [1] * len(processed_phone)
+
+         phone, tone, language = cleaned_text_to_sequence(processed_phone, processed_tone, language_str, symbol_to_id)
+
+         phone = intersperse(phone, 0)
+         tone = intersperse(tone, 0)
+         language = intersperse(language, 0)
+
+         phone = np.array(phone, dtype=np.int32)
+         tone = np.array(tone, dtype=np.int32)
+         language = np.array(language, dtype=np.int32)
+         word2ph = np.array(word2ph, dtype=np.int32) * 2
+         word2ph[0] += 1
+         return phone, tone, language, norm_text, word2ph
+
+     except Exception as e:
+         print(f"[Error] Text processing failed: {e}")
+         import traceback
+         traceback.print_exc()
+         raise e
+
+
+ def audio_numpy_concat(segment_data_list, sr, speed=1.):
+     # Concatenate per-sentence audio, inserting a short silence between segments.
+     audio_segments = []
+     for segment_data in segment_data_list:
+         audio_segments += segment_data.reshape(-1).tolist()
+         audio_segments += [0] * int((sr * 0.05) / speed)
+     audio_segments = np.array(audio_segments).astype(np.float32)
+     return audio_segments
+
+
+ def merge_sub_audio(sub_audio_list, pad_size, audio_len):
+     # Average the padded (overlapping) part
+     if pad_size > 0:
+         for i in range(len(sub_audio_list) - 1):
+             sub_audio_list[i][-pad_size:] += sub_audio_list[i+1][:pad_size]
+             sub_audio_list[i][-pad_size:] /= 2
+             if i > 0:
+                 sub_audio_list[i] = sub_audio_list[i][pad_size:]
+
+     sub_audio = np.concatenate(sub_audio_list, axis=-1)
+     return sub_audio[:audio_len]
+
+
+ def calc_word2pronoun(word2ph, pronoun_lens):
+     indice = [0]
+     for ph in word2ph[:-1]:
+         indice.append(indice[-1] + ph)
+     word2pronoun = []
+     for i, ph in zip(indice, word2ph):
+         word2pronoun.append(np.sum(pronoun_lens[i : i + ph]))
+     return word2pronoun
+
+
+ def generate_slices(word2pronoun, dec_len):
+     pn_start, pn_end = 0, 0
+     zp_start, zp_end = 0, 0
+     zp_len = 0
+     pn_slices = []
+     zp_slices = []
+     while pn_end < len(word2pronoun):
+         # If the previous slice is longer than 2 words and adding the current word
+         # still fits in dec_len, overlap backwards by two words
+         if pn_end - pn_start > 2 and np.sum(word2pronoun[pn_end - 2 : pn_end + 1]) <= dec_len:
+             zp_len = np.sum(word2pronoun[pn_end - 2 : pn_end])
+             zp_start = zp_end - zp_len
+             pn_start = pn_end - 2
+         else:
+             zp_len = 0
+             zp_start = zp_end
+             pn_start = pn_end
+
+         while pn_end < len(word2pronoun) and zp_len + word2pronoun[pn_end] <= dec_len:
+             zp_len += word2pronoun[pn_end]
+             pn_end += 1
+         zp_end = zp_start + zp_len
+         pn_slices.append(slice(pn_start, pn_end))
+         zp_slices.append(slice(zp_start, zp_end))
+     return pn_slices, zp_slices
+
+
+ # Decide between Chinese and English
+ def lang_detect_with_regex(text):
+     """
+     Language detection
+     """
+     # Strip all digits
+     text_without_digits = re.sub(r'\d+', '', text)
+
+     if not text_without_digits:
+         return 'unknown'
+
+     # Check for Chinese characters (Chinese takes priority)
+     if re.search(r'[\u4e00-\u9fff]', text_without_digits):
+         return 'chinese'
+     else:
+         # Check for English letters
+         if re.search(r'[a-zA-Z]', text_without_digits):
+             return 'english'
+         else:
+             return 'unknown'
+
+ class QwenTranslationAPI:
+     def __init__(self, api_url=QWEN_API_URL):
+         self.api_url = api_url
+         self.session_id = f"speech_translate_{int(time.time())}"
+
+     def translate(self, text_content, max_retries=3, timeout=120):
+         """Call the Qwen API to process the text."""
+         if not text_content or text_content.strip() == "":
+             return "Input text is empty"
+
+         if lang_detect_with_regex(text_content) == 'chinese':
+             prompt_f = "回答(限制在100个字以内)"  # "Answer (within 100 characters)"
+         else:
+             prompt_f = "回答(限制在100个字以内)"  # both branches currently use the same Chinese prompt
+
+         prompt = f"{prompt_f}:{text_content}"
+         print(f"[API] Sending request: {prompt}")
+
+         for attempt in range(max_retries):
+             try:
+                 # Step 1: send the generation request
+                 generate_url = f"{self.api_url}/api/generate"
+                 payload = {
+                     "prompt": prompt,
+                     "temperature": 0.1,  # lower temperature for more deterministic results
+                     "repetition_penalty": 1.0,
+                     "top-p": 0.9,
+                     "top-k": 40,
+                     "max_new_tokens": 512
+                 }
+
+                 print(f"[API] Starting generation request (attempt {attempt + 1}/{max_retries})")
+                 response = requests.post(generate_url, json=payload, timeout=30)
+                 response.raise_for_status()
+                 print("[API] Generation request succeeded")
+
+                 # Step 2: poll for results and merge all chunks
+                 result_url = f"{self.api_url}/api/generate_provider"
+                 start_time = time.time()
+                 full_translation = ""
+                 last_chunk = ""
+
+                 while time.time() - start_time < timeout:
+                     try:
+                         result_response = requests.get(result_url, timeout=10)
+                         result_data = result_response.json()
+
+                         # Get the current chunk
+                         current_chunk = result_data.get("response", "")  # .strip()
+                         full_translation += current_chunk
+
+                         # Check whether generation is done
+                         if result_data.get("done", False):
+                             # Return the complete merged result
+                             print(f"[API] Done: {full_translation}")
+                             return full_translation
+
+                         time.sleep(0.05)
+
+                     except requests.exceptions.RequestException as e:
+                         print(f"[API] Polling request failed: {e}")
+                         if time.time() - start_time > timeout:
+                             break
+                         continue
+
+                 print(f"[API] Polling timed out, starting retry {attempt + 1}")
+
+             except requests.exceptions.RequestException as e:
+                 print(f"[API] Request failed (attempt {attempt + 1}/{max_retries}): {e}")
+                 if attempt < max_retries - 1:
+                     wait_time = 2 ** attempt  # exponential backoff
+                     print(f"[API] Waiting {wait_time} s before retrying...")
+                     time.sleep(wait_time)
+                 else:
+                     return f"Failed: {str(e)}"
+             except Exception as e:
+                 print(f"[API] Error during processing: {e}")
+                 return f"Failed: {str(e)}"
+
+         return "Timed out; please check the API service status"
+
275
+ class SpeechTranslationPipeline:
276
+ def __init__(self,
277
+ tts_model_dir, tts_model_files,
278
+ asr_model_dir="ax_model", seq_len=132,
279
+ tts_dec_len=128, sample_rate=44100, tts_speed=0.8,
280
+ qwen_api_url=QWEN_API_URL):
281
+ self.tts_model_dir = tts_model_dir
282
+ self.tts_model_files = tts_model_files
283
+ self.asr_model_dir = asr_model_dir
284
+ self.seq_len = seq_len
285
+ self.tts_dec_len = tts_dec_len
286
+ self.sample_rate = sample_rate
287
+ self.tts_speed = tts_speed
288
+ self.qwen_api_url = qwen_api_url
289
+
290
+ # 初始化ASR模型
291
+ self._init_asr_models()
292
+
293
+ # 初始化TTS模型
294
+ self._init_tts_models()
295
+
296
+ # 初始化API
297
+ self.translator = QwenTranslationAPI(api_url=qwen_api_url)
298
+
299
+ # 验证所有必需文件存在
300
+ self._validate_files()
301
+
302
+ def _init_asr_models(self):
303
+ """初始化语音识别相关模型"""
304
+ print("Initializing SenseVoice models...")
305
+
306
+ # VAD模型
307
+ self.model_vad = AX_Fsmn_vad(self.asr_model_dir)
308
+
309
+ # 位置编码
310
+ self.embed = SinusoidalPositionEncoder()
311
+ self.position_encoding = self.embed.get_position_encoding(
312
+ torch.randn(1, self.seq_len, 560)).numpy()
313
+
314
+ # ASR模型
315
+ self.model_bin = AX_SenseVoiceSmall(self.asr_model_dir, seq_len=self.seq_len)
316
+
317
+ # Tokenizer
318
+ tokenizer_path = os.path.join(self.asr_model_dir, "chn_jpn_yue_eng_ko_spectok.bpe.model")
319
+ self.tokenizer = SentencepiecesTokenizer(bpemodel=tokenizer_path)
320
+
321
+ print("SenseVoice models initialized successfully.")
322
+
323
+ def _init_tts_models(self):
324
+ """初始化TTS相关模型"""
325
+ print("Initializing MeloTTS models...")
326
+ init_start = time.time()
327
+
328
+ # 加载encoder和decoder模型
329
+ enc_model = os.path.join(self.tts_model_dir, self.tts_model_files["encoder"])
330
+ dec_model = os.path.join(self.tts_model_dir, self.tts_model_files["decoder"])
331
+
332
+ model_load_start = time.time()
333
+ self.sess_enc = ort.InferenceSession(enc_model, providers=["CPUExecutionProvider"], sess_options=ort.SessionOptions())
334
+ self.sess_dec = axe.InferenceSession(dec_model)
335
+ print(f" Load encoder/decoder models: {(time.time() - model_load_start)*1000:.2f}ms")
336
+
337
+ # 加载静态输入g
338
+ g_file = os.path.join(self.tts_model_dir, self.tts_model_files["g"])
339
+ self.tts_g = np.fromfile(g_file, dtype=np.float32).reshape(1, 256, 1)
340
+
341
+ # 设置语言和symbol映射(默认支持中英混合)
342
+ self.tts_language = "ZH_MIX_EN"
343
+ self.symbol_to_id = {s: i for i, s in enumerate(LANG_TO_SYMBOL_MAP[self.tts_language])}
344
+
345
+ # 预热:提前加载所有懒加载的模块(这是主要耗时部分)
346
+ print(" Warming up TTS modules (loading language models, tokenizers, etc.)...")
347
+ warmup_start = time.time()
348
+
349
+ # 中英混合预热
350
+ try:
351
+ warmup_start_mix = time.time()
352
+ warmup_text_mix = "这是一个test测试。"
353
+ _, _, _, _, _ = get_text_for_tts_infer(warmup_text_mix, self.tts_language, symbol_to_id=self.symbol_to_id)
354
+ print(f" Mixed ZH-EN warm-up: {(time.time() - warmup_start_mix)*1000:.2f}ms")
355
+ except Exception as e:
356
+ print(f" Warning: Mixed warm-up failed: {e}")
357
+
358
+ total_init_time = (time.time() - init_start) * 1000
359
+ print(f"MeloTTS models initialized successfully. Total init time: {total_init_time:.2f}ms ({total_init_time/1000:.2f}s)")
360
+
361
+ def _validate_files(self):
362
+ """验证所有必需的文件都存在"""
363
+ # 检查TTS相关文件
364
+ for key, filename in self.tts_model_files.items():
365
+ filepath = os.path.join(self.tts_model_dir, filename)
366
+ if not os.path.exists(filepath):
367
+ raise FileNotFoundError(f"TTS模型文件不存在: {filepath}")
368
+
369
+ # 检查API服务是否可用(可选)
370
+ try:
371
+ response = requests.get(f"{self.qwen_api_url}/api/generate_provider", timeout=5)
372
+ print("[API检查] 千问API服务���接正常")
373
+ except:
374
+ print("[API警告] 无法连接到千问API服务,请确保已启动API服务")
375
+
376
+ def speech_recognition(self, speech, fs):
377
+ """
378
+ 第一步:语音识别(ASR)
379
+ """
380
+ speech_lengths = len(speech)
381
+
382
+ # VAD处理
383
+ print("Running VAD...")
384
+ vad_start_time = time.time()
385
+ res_vad = self.model_vad(speech)[0]
386
+ vad_segments = merge_vad(res_vad, 15 * 1000)
387
+ vad_time_cost = time.time() - vad_start_time
388
+ print(f"VAD processing time: {vad_time_cost:.2f} seconds")
389
+ print(f"VAD segments detected: {len(vad_segments)}")
390
+
391
+ # ASR处理
392
+ print("Running ASR...")
393
+ asr_start_time = time.time()
394
+ all_results = ""
395
+
396
+ # 遍历每个VAD片段并处理
397
+ for i, segment in enumerate(vad_segments):
398
+ segment_start, segment_end = segment
399
+ start_sample = int(segment_start / 1000 * fs)
400
+ end_sample = min(int(segment_end / 1000 * fs), speech_lengths)
401
+ segment_speech = speech[start_sample:end_sample]
402
+
403
+ # 为当前片段创建临时文件
404
+ segment_filename = f"temp_segment_{i}.wav"
405
+ sf.write(segment_filename, segment_speech, fs)
406
+
407
+ # 对当前片段进行识别
408
+ try:
409
+ segment_res = self.model_bin(
410
+ segment_filename,
411
+ "auto", # 语言自动检测
412
+ True, # withitn
413
+ self.position_encoding,
414
+ tokenizer=self.tokenizer,
415
+ )
416
+
417
+ all_results += segment_res
418
+
419
+ # 清理临时文件
420
+ if os.path.exists(segment_filename):
421
+ os.remove(segment_filename)
422
+
423
+ except Exception as e:
424
+ if os.path.exists(segment_filename):
425
+ os.remove(segment_filename)
426
+ print(f"Error processing segment {i}: {e}")
427
+ continue
428
+
429
+ asr_time_cost = time.time() - asr_start_time
430
+ print(f"ASR processing time: {asr_time_cost:.2f} seconds")
431
+ print(f"ASR Result: {all_results}")
432
+
433
+ return all_results.strip()
434
+
435
+ def run_translation(self, text_content):
436
+ """
437
+ 第二步:调用Qwen大模型API处理
438
+ """
439
+ print("Starting translation via API...")
440
+ translation_start_time = time.time()
441
+
442
+ # 使用API进行处理
443
+ translate_content = self.translator.translate(text_content)
444
+
445
+ translation_time_cost = time.time() - translation_start_time
446
+ print(f"Translation processing time: {translation_time_cost:.2f} seconds")
447
+ print(f"Translation Result: {translate_content}")
448
+
449
+ return translate_content
450
+
451
+     def run_tts(self, translate_content, output_dir, output_wav=None):
+         """
+         Step 3: synthesize speech with the TTS model.
+         """
+         output_path = os.path.join(output_dir, output_wav)
+
+         try:
+             # Convert Arabic numerals to Chinese numerals in Chinese text
+             if lang_detect_with_regex(translate_content) == "chinese":
+                 translate_content = cn2an.transform(translate_content, "an2cn")
+
+             print(f"TTS synthesis for text: {translate_content}")
+
+             # Split the text into sentences
+             sens = split_sentence(translate_content, language_str=self.tts_language)
+             print(f"Text split into {len(sens)} sentences")
+
+             # Accumulated per-sentence audio
+             audio_list = []
+
+             # Process each sentence
+             for n, se in enumerate(sens):
+                 # Insert a space at lowercase-to-uppercase joins in English text
+                 if self.tts_language in ['EN', 'ZH_MIX_EN']:
+                     se = re.sub(r'([a-z])([A-Z])', r'\1 \2', se)
+
+                 print(f"Processing sentence[{n}]: {se}")
+
+                 # Convert the text to phonemes and tones
+                 phones, tones, lang_ids, norm_text, word2ph = get_text_for_tts_infer(
+                     se, self.tts_language, symbol_to_id=self.symbol_to_id)
+
+                 # Run the encoder
+                 encoder_start = time.time()
+                 z_p, pronoun_lens, audio_len = self.sess_enc.run(None, input_feed={
+                     'phone': phones, 'g': self.tts_g,
+                     'tone': tones, 'language': lang_ids,
+                     'noise_scale': np.array([0], dtype=np.float32),
+                     'length_scale': np.array([1.0 / self.tts_speed], dtype=np.float32),
+                     'noise_scale_w': np.array([0], dtype=np.float32),
+                     'sdp_ratio': np.array([0], dtype=np.float32)})
+                 print(f"Encoder run time: {1000 * (time.time() - encoder_start):.2f}ms")
+
+                 # Compute per-word pronunciation lengths (in decoder frames)
+                 word2pronoun = calc_word2pronoun(word2ph, pronoun_lens)
+                 # Generate decoder slices
+                 pn_slices, zp_slices = generate_slices(word2pronoun, self.tts_dec_len)
+
+                 audio_len = audio_len[0]
+                 sub_audio_list = []
+
+                 for i, (ps, zs) in enumerate(zip(pn_slices, zp_slices)):
+                     zp_slice = z_p[..., zs]
+
+                     # Length of z_p before padding
+                     sub_dec_len = zp_slice.shape[-1]
+                     # Length of the output audio before padding
+                     sub_audio_len = 512 * sub_dec_len
+
+                     # Pad to dec_len
+                     if zp_slice.shape[-1] < self.tts_dec_len:
+                         zp_slice = np.concatenate((zp_slice, np.zeros((*zp_slice.shape[:-1], self.tts_dec_len - zp_slice.shape[-1]), dtype=np.float32)), axis=-1)
+
+                     decoder_start = time.time()
+                     audio = self.sess_dec.run(None, input_feed={"z_p": zp_slice, "g": self.tts_g})[0].flatten()
+
+                     # Handle the overlap between adjacent slices
+                     audio_start = 0
+                     if len(sub_audio_list) > 0:
+                         if pn_slices[i - 1].stop > ps.start:
+                             # Drop the first (overlapping) word
+                             audio_start = 512 * word2pronoun[ps.start]
+
+                     audio_end = sub_audio_len
+                     if i < len(pn_slices) - 1:
+                         if ps.stop > pn_slices[i + 1].start:
+                             # Drop the last (overlapping) word
+                             audio_end = sub_audio_len - 512 * word2pronoun[ps.stop - 1]
+
+                     audio = audio[audio_start:audio_end]
+                     print(f"Decode slice[{i}]: decoder run time {1000 * (time.time() - decoder_start):.2f}ms")
+                     sub_audio_list.append(audio)
+
+                 # Merge the sub-audio chunks
+                 sub_audio = merge_sub_audio(sub_audio_list, 0, audio_len)
+                 audio_list.append(sub_audio)
+
+             # Concatenate the audio of all sentences
+             audio = audio_numpy_concat(audio_list, sr=self.sample_rate, speed=self.tts_speed)
+
+             # Save the audio file
+             sf.write(output_path, audio, self.sample_rate)
+             print(f"TTS audio saved to {output_path}")
+
+             return output_path
+
+         except Exception as e:
+             print(f"TTS synthesis failed: {e}")
+             import traceback
+             traceback.print_exc()
+             raise e
+
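The slice bookkeeping above relies on the decoder producing exactly 512 audio samples per z_p frame, so a word spanning word2pronoun[k] frames owns 512 * word2pronoun[k] samples. A worked example with illustrative values:

word2pronoun = [3, 2, 4]           # decoder frames per word in this slice
sub_dec_len = sum(word2pronoun)    # 9 frames
sub_audio_len = 512 * sub_dec_len  # 4608 samples before trimming

# First word overlaps the previous slice -> skip its audio
audio_start = 512 * word2pronoun[0]                 # 1536
# Last word overlaps the next slice -> cut its audio
audio_end = sub_audio_len - 512 * word2pronoun[-1]  # 4608 - 2048 = 2560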
+     def full_pipeline(self, speech, fs, output_dir=None, output_tts=None):
+         """
+         Full pipeline: speech recognition -> Qwen -> TTS synthesis.
+         """
+
+         # Step 1: speech recognition
+         print("\n----------------------VAD+ASR----------------------------\n")
+         start_time = time.time()  # record the start time
+         text_content = self.speech_recognition(speech, fs)
+         asr_time = time.time() - start_time  # elapsed time
+         print(f"Speech recognition took {asr_time:.2f} s")
+
+         if not text_content or text_content.strip() == "":
+             raise ValueError("ASR did not recognize any valid text")
+
+         # Step 2: Qwen
+         print("\n---------------------Qwen---------------------------\n")
+         start_time = time.time()
+         translate_content = self.run_translation(text_content)
+         translate_time = time.time() - start_time
+         print(f"Qwen took {translate_time:.2f} s")
+
+         # Step 3: TTS synthesis
+         print("-------------------------TTS-------------------------------\n")
+         start_time = time.time()
+         output_path = self.run_tts(translate_content, output_dir, output_tts)
+         tts_time = time.time() - start_time
+         print(f"TTS synthesis took {tts_time:.2f} s")
+
+         return {
+             "original_text": text_content,
+             "translated_text": translate_content,
+             "audio_path": output_path
+         }
+
+ def main():
+     parser = argparse.ArgumentParser(description="Speech Recognition, Translation and TTS Pipeline")
+     parser.add_argument("--audio_dir", type=str, default="./input_question", help="Input audio directory path")
+     parser.add_argument("--output_dir", type=str, default="./output_answer", help="Output directory")
+     parser.add_argument("--api_url", type=str, default="http://10.126.29.158:8000", help="Qwen API server URL")
+
+     args = parser.parse_args()
+     print("-------------------START------------------------\n")
+     os.makedirs(args.output_dir, exist_ok=True)
+
+     # Check that the audio directory exists
+     if not os.path.exists(args.audio_dir):
+         print(f"Error: audio directory does not exist: {args.audio_dir}")
+         return
+
+     # Collect all .wav/.mp3 files in the audio directory
+     audio_files = []
+     for file in os.listdir(args.audio_dir):
+         if file.lower().endswith(('.wav', '.mp3')):
+             audio_files.append(os.path.join(args.audio_dir, file))
+
+     # Bail out if no audio files were found
+     if not audio_files:
+         print(f"Error: no audio files found in directory {args.audio_dir}")
+         return
+
+     # Sort by file name for a deterministic processing order
+     audio_files.sort()
+     print(f"Found {len(audio_files)} audio files: {[os.path.basename(f) for f in audio_files]}")
+
+     # Initialize the pipeline (only once)
+     pipeline = SpeechTranslationPipeline(
+         tts_model_dir=TTS_MODEL_DIR,
+         tts_model_files=TTS_MODEL_FILES,
+         asr_model_dir="ax_model",
+         seq_len=132,
+         tts_dec_len=128,
+         sample_rate=44100,
+         tts_speed=0.8,
+         qwen_api_url=args.api_url
+     )
+
+     # Process each audio file
+     all_results = []
+     total_start_time = time.time()
+
+     for i, audio_file in enumerate(audio_files):
+         print(f"\n{'='*60}")
+         print(f"Processing audio file {i+1}/{len(audio_files)}: {os.path.basename(audio_file)}")
+         print(f"{'='*60}")
+
+         file_start_time = time.time()
+
+         try:
+             # Load the audio
+             speech, fs = librosa.load(audio_file, sr=None)
+             if fs != 16000:
+                 print(f"Resampling audio from {fs}Hz to 16000Hz")
+                 speech = librosa.resample(y=speech, orig_sr=fs, target_sr=16000)
+                 fs = 16000
+             audio_duration = librosa.get_duration(y=speech, sr=fs)
+
+             # Build the output file name
+             base_name = os.path.splitext(os.path.basename(audio_file))[0]
+             output_tts = f"{base_name}_answer.wav"
+
+             # Run the pipeline
+             result = pipeline.full_pipeline(speech, fs, args.output_dir, output_tts)
+
+             # Compute the processing time
+             file_time_cost = time.time() - file_start_time
+
+             out_wav = os.path.join(args.output_dir, output_tts)
+             speech, fs = librosa.load(out_wav, sr=None)
+             output_duration = librosa.get_duration(y=speech, sr=fs)
+             rtf = file_time_cost / output_duration
+
+             # Attach file information to the result
+             result.update({
+                 "audio_file": audio_file,
+                 "processing_time": file_time_cost,
+                 "output_duration": output_duration,
+                 "rtf": rtf
+             })
+
+             all_results.append(result)
+
+             print(f"\nFinished file: {os.path.basename(audio_file)}")
+             print(f"Original text: {result['original_text']}")
+             print(f"Answer text: {result['translated_text']}")
+             print(f"Generated audio: {result['audio_path']}")
+             print(f"Processing time: {file_time_cost:.2f} s")
+             print(f"Audio duration: {output_duration:.2f} s")
+             print(f"RTF: {rtf:.2f}")
+
+         except Exception as e:
+             print(f"Error while processing file {audio_file}: {e}")
+             import traceback
+             traceback.print_exc()
+             continue
+
+     # Print the overall results
+     total_time_cost = time.time() - total_start_time
+     print(f"\n{'='*80}")
+     print("All files processed!")
+     print(f"{'='*80}")
+     print(f"Processed {len(all_results)} files in total")
+     print(f"Total processing time: {total_time_cost:.2f} s")
+
+     # Save a summary of the results
+     summary_file = os.path.join(args.output_dir, "processing_summary.txt")
+     with open(summary_file, 'w', encoding='utf-8') as f:
+         f.write("Batch processing summary\n")
+         f.write("=" * 50 + "\n\n")
+
+         for i, result in enumerate(all_results):
+             f.write(f"File {i+1}: {os.path.basename(result['audio_file'])}\n")
+             f.write(f"  Original text: {result['original_text']}\n")
+             f.write(f"  Answer: {result['translated_text']}\n")
+             f.write(f"  Synthesized audio: {os.path.basename(result['audio_path'])}\n")
+             f.write(f"  Processing time: {result['processing_time']:.2f} s\n")
+             f.write(f"  Audio duration: {result['output_duration']:.2f} s\n")
+             f.write(f"  RTF: {result['rtf']:.2f}\n")
+             f.write("-" * 50 + "\n")
+
+         f.write(f"\nTotal: {len(all_results)} files\n")
+         f.write(f"Total processing time: {total_time_cost:.2f} s\n")
+
+     print(f"Detailed results saved to: {summary_file}")
+
+ if __name__ == "__main__":
+     main()
config.json ADDED
File without changes
input_question/Q1.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8991bc8a91bc377ad8ba3e9962edebdceadb7d1d468eb28881fef83738f4c4d1
+ size 177644
input_question/Q2.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d9d817c1110392680cf4873e97f373229d29449c62fd551dc7fde2a360960c61
+ size 235244
input_question/Q3.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:121fefb370b53aab86072cafc55ab54ed3ff487d3c9955063e0cda9fae7bf5b8
+ size 132044
libaxllm/main_api_ax650 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e800cd6e00dd2ad7303cb6fb6b867a33704665bded213fe4bd3be3df025c0821
+ size 1064760
libaxllm/main_api_axcl_aarch64 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3558444d93ce7459db247421128aca6ba3fdbde5932eff6aea66653fa7370cdf
+ size 1816560
libaxllm/main_api_axcl_x86 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f8cbbde225235ace328ae230320d7f9b2d6a9321a8dca3179f4d770edc65a2e0
+ size 8811440
libaxllm/post_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "enable_temperature" : true,
+   "temperature" : 0.9,
+
+   "enable_repetition_penalty" : false,
+   "repetition_penalty" : 1.2,
+   "penalty_window" : 20,
+
+   "enable_top_p_sampling" : false,
+   "top_p" : 0.8,
+
+   "enable_top_k_sampling" : true,
+   "top_k" : 10
+ }
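post_config.json enables temperature scaling (0.9) and top-k sampling (k=10) while leaving repetition penalty and top-p disabled. How the main_api_* binaries apply these internally is not shown here, but the standard semantics of the two enabled options look like this (a minimal NumPy sketch, not the binary's actual implementation):

import numpy as np

def sample_top_k(logits, temperature=0.9, top_k=10, rng=np.random.default_rng()):
    # Scale logits by temperature, keep only the top_k candidates,
    # then sample from the renormalized softmax over those candidates.
    logits = np.asarray(logits, dtype=np.float64) / temperature
    top = np.argsort(logits)[-top_k:]
    probs = np.exp(logits[top] - logits[top].max())
    probs /= probs.sum()
    return int(rng.choice(top, p=probs))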
libaxllm/qwen2.5_tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
libaxllm/qwen2.5_tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
libaxllm/qwen2.5_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|object_ref_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|object_ref_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|box_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|box_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|quad_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|quad_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "<|vision_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "<|vision_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|vision_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|image_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|video_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151658": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151659": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151660": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151661": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151662": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151663": {
+       "content": "<|repo_name|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151664": {
+       "content": "<|file_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "bos_token": null,
+   "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "model_max_length": 131072,
+   "pad_token": "<|endoftext|>",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null,
+   "add_bos_token": false
+ }
libaxllm/qwen2.5_tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
libaxllm/qwen2.5_tokenizer_uid.py ADDED
@@ -0,0 +1,189 @@
+ from transformers import AutoTokenizer
+ from http.server import HTTPServer, BaseHTTPRequestHandler
+ import json
+ import argparse
+ import uuid
+
+ # Global dict mapping each uid to its Tokenizer_Http instance
+ tokenizers = {}
+
+ class Tokenizer_Http():
+     def __init__(self):
+         model_id = "qwen2.5_tokenizer"
+         self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+         self.messages = [
+             {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
+         ]
+         self.token_ids = []
+
+     def encode(self, prompt, last_reply=None):
+         if last_reply is not None:
+             self.messages.append({"role": "assistant", "content": last_reply})
+         text = self.tokenizer.apply_chat_template(
+             self.messages,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+         # print("Generated text:\n============\n", text, "============\n")
+         self.token_ids = self.tokenizer.encode(text)[:-3]
+         self.messages.append({"role": "user", "content": prompt})
+
+         text = self.tokenizer.apply_chat_template(
+             self.messages,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+         print("Generated text:\n============\n", text, "============\n")
+         token_ids = self.tokenizer.encode(text)
+         # Extract only the newly added tokens
+         diff = token_ids[len(self.token_ids):]
+         self.token_ids = token_ids
+         print(self.decode(diff))
+         return token_ids, diff
+
+     def decode(self, token_ids):
+         return self.tokenizer.decode(token_ids)
+
+     @property
+     def bos_id(self):
+         return self.tokenizer.bos_token_id
+
+     @property
+     def eos_id(self):
+         return self.tokenizer.eos_token_id
+
+     @property
+     def bos_token(self):
+         return self.tokenizer.bos_token
+
+     @property
+     def eos_token(self):
+         return self.tokenizer.eos_token
+
+     def reset(self, system_prompt="You are Qwen, created by Alibaba Cloud. You are a helpful assistant."):
+         self.messages = [
+             {"role": "system", "content": system_prompt},
+         ]
+         text = self.tokenizer.apply_chat_template(
+             self.messages,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+         token_ids = self.tokenizer.encode(text)[:-3]
+         self.token_ids = token_ids
+         print(self.decode(token_ids))
+         return token_ids
+
+
+ class Request(BaseHTTPRequestHandler):
+     timeout = 5
+     server_version = 'Apache'
+
+     def do_GET(self):
+         print("GET request path:", self.path)
+         self.send_response(200)
+         self.send_header("Content-Type", "application/json")
+         self.end_headers()
+
+         # Endpoint: allocate a new uid
+         if '/get_uid' in self.path:
+             new_uid = str(uuid.uuid4())
+             print("New uid:", new_uid)
+             # Create a fresh Tokenizer_Http instance for this uid
+             tokenizers[new_uid] = Tokenizer_Http()
+             msg = json.dumps({'uid': new_uid})
+         elif '/bos_id' in self.path:
+             # Read the uid query parameter (e.g. ?uid=xxx)
+             uid = self.get_query_param("uid")
+             instance: Tokenizer_Http = tokenizers.get(uid)
+             if instance is None:
+                 msg = json.dumps({'error': 'Invalid uid'})
+             else:
+                 bos_id = instance.bos_id
+                 msg = json.dumps({'bos_id': bos_id if bos_id is not None else -1})
+         elif '/eos_id' in self.path:
+             uid = self.get_query_param("uid")
+             instance: Tokenizer_Http = tokenizers.get(uid)
+             if instance is None:
+                 msg = json.dumps({'error': 'Invalid uid'})
+             else:
+                 eos_id = instance.eos_id
+                 msg = json.dumps({'eos_id': eos_id if eos_id is not None else -1})
+         else:
+             msg = json.dumps({'error': 'Invalid GET endpoint'})
+
+         print("Response:", msg)
+         self.wfile.write(msg.encode())
+
+     def do_POST(self):
+         content_length = int(self.headers.get('content-length', 0))
+         data = self.rfile.read(content_length).decode()
+         print("POST request path:", self.path)
+         print("Received data:", data)
+         req = json.loads(data)
+
+         self.send_response(200)
+         self.send_header("Content-Type", "application/json")
+         self.end_headers()
+
+         if '/encode' in self.path:
+             # The request body must contain uid and text, plus an optional last_reply
+             uid = req.get('uid')
+             prompt = req.get('text')
+             last_reply = req.get('last_reply')
+             instance: Tokenizer_Http = tokenizers.get(uid)
+             if instance is None:
+                 msg = json.dumps({'error': 'Invalid uid'})
+             else:
+                 token_ids, diff = instance.encode(prompt, last_reply)
+                 msg = json.dumps({'token_ids': token_ids, 'diff': diff})
+         elif '/decode' in self.path:
+             uid = req.get('uid')
+             token_ids = req.get('token_ids')
+             instance: Tokenizer_Http = tokenizers.get(uid)
+             if instance is None:
+                 msg = json.dumps({'error': 'Invalid uid'})
+             else:
+                 text = instance.decode(token_ids)
+                 msg = json.dumps({'text': text})
+         elif '/reset' in self.path:
+             uid = req.get("uid")
+             system_prompt = req.get("system_prompt")
+             instance: Tokenizer_Http = tokenizers.get(uid)
+             if instance is None:
+                 msg = json.dumps({'error': 'Invalid uid'})
+             else:
+                 if system_prompt is not None:
+                     print("system_prompt:", system_prompt)
+                     token_ids = instance.reset(system_prompt)
+                     msg = json.dumps({'token_ids': token_ids})
+                 else:
+                     token_ids = instance.reset()
+                     msg = json.dumps({'token_ids': token_ids})
+         else:
+             msg = json.dumps({'error': 'Invalid POST endpoint'})
+
+         print("Response:", msg)
+         self.wfile.write(msg.encode())
+
+     def get_query_param(self, key):
+         """
+         Helper: read the value of a query parameter from a GET request URL,
+         e.g. /bos_id?uid=xxx
+         """
+         from urllib.parse import urlparse, parse_qs
+         query = urlparse(self.path).query
+         params = parse_qs(query)
+         values = params.get(key)
+         return values[0] if values else None
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--host', type=str, default='0.0.0.0')
+     parser.add_argument('--port', type=int, default=12345)
+     args = parser.parse_args()
+
+     host = (args.host, args.port)
+     print('Server running at http://%s:%s' % host)
+     server = HTTPServer(host, Request)
+     server.serve_forever()
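The service above keeps one chat history per uid, so several LLM frontends can share a single tokenizer process. A minimal client for the three main endpoints (GET /get_uid, POST /encode, POST /decode), assuming the default port 12345:

import requests

base = "http://127.0.0.1:12345"
uid = requests.get(f"{base}/get_uid").json()["uid"]

enc = requests.post(f"{base}/encode",
                    json={"uid": uid, "text": "Hello", "last_reply": None}).json()
print(enc["diff"])   # token ids contributed by this new user turn

dec = requests.post(f"{base}/decode",
                    json={"uid": uid, "token_ids": enc["token_ids"]}).json()
print(dec["text"])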
libaxllm/run_qwen2.5_1.5b_ctx_ax650_api.sh ADDED
@@ -0,0 +1,15 @@
+ ./main_api_ax650 \
+     --template_filename_axmodel "./qwen2.5-1.5b-ctx-ax650/qwen2_p128_l%d_together.axmodel" \
+     --axmodel_num 28 \
+     --url_tokenizer_model "http://0.0.0.0:12345" \
+     --filename_post_axmodel "./qwen2.5-1.5b-ctx-ax650/qwen2_post.axmodel" \
+     --filename_tokens_embed "./qwen2.5-1.5b-ctx-ax650/model.embed_tokens.weight.bfloat16.bin" \
+     --tokens_embed_num 151936 \
+     --tokens_embed_size 1536
+ #--use_mmap_load_embed 1
+ #--live_print 1
+
+
+ #--system_prompt "Your name is Xiaozhi (allen), and you are a harmless AI assistant. Shenzhen today (April 1, April Fools' Day) is overcast, 14°C to 19°C, with a light breeze." \
+ #--kvcache_path "./kvcache" \
+
libaxllm/run_qwen2.5_1.5b_ctx_axcl_aarch64_api.sh ADDED
@@ -0,0 +1,13 @@
+ ./main_api_axcl_aarch64 \
+     --system_prompt "Your name is Xiaozhi (allen), and you are a harmless AI assistant. Shenzhen today (April 1, April Fools' Day) is overcast, 14°C to 19°C, with a light breeze." \
+     --template_filename_axmodel "qwen2.5-1.5b-ctx-ax650/qwen2_p128_l%d_together.axmodel" \
+     --axmodel_num 28 \
+     --url_tokenizer_model "http://127.0.0.1:12345" \
+     --filename_post_axmodel "qwen2.5-1.5b-ctx-ax650/qwen2_post.axmodel" \
+     --filename_tokens_embed "qwen2.5-1.5b-ctx-ax650/model.embed_tokens.weight.bfloat16.bin" \
+     --tokens_embed_num 151936 \
+     --tokens_embed_size 1536 \
+     --use_mmap_load_embed 1 \
+     --devices 0
+
+ # --kvcache_path "./kvcache" \
libaxllm/run_qwen2.5_1.5b_ctx_axcl_x86_api.sh ADDED
@@ -0,0 +1,13 @@
+ ./main_api_axcl_x86 \
+     --system_prompt "Your name is Xiaozhi (allen), and you are a harmless AI assistant. Shenzhen today (April 1, April Fools' Day) is overcast, 14°C to 19°C, with a light breeze." \
+     --template_filename_axmodel "qwen2.5-1.5b-ctx-ax650/qwen2_p128_l%d_together.axmodel" \
+     --axmodel_num 28 \
+     --url_tokenizer_model "http://127.0.0.1:12345" \
+     --filename_post_axmodel "qwen2.5-1.5b-ctx-ax650/qwen2_post.axmodel" \
+     --filename_tokens_embed "qwen2.5-1.5b-ctx-ax650/model.embed_tokens.weight.bfloat16.bin" \
+     --tokens_embed_num 151936 \
+     --tokens_embed_size 1536 \
+     --use_mmap_load_embed 1 \
+     --devices 0
+
+ # --kvcache_path "./kvcache" \
libmelotts/models/decoder-en.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90c93c0fa978cc1c68fbac6a78707dd75b8b9069cb01a1ade6846e2435aa1eb1
+ size 44093802
libmelotts/models/decoder-zh.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:37ea2d8401f18dd371eec50b90bd39dcadf9684aaf3543dace8ce1a9499ef253
+ size 44092592
libmelotts/models/encoder-en.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6cc51185fb81934c7490c5f9ac993fff7efa98ab41c08cd3753c96abcb297582
+ size 31488385
libmelotts/models/encoder-zh.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a2b0a5bc2789faef16b4bfc56ab4905364f8163a59f2db3d071b4a14792bfee5
+ size 31397760
libmelotts/models/g-en.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:094bf0dbe1cd6c9408707209b2b7261b9df2cd5917d310bfac5945a15a31821a
+ size 1024
libmelotts/models/g-jp.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c01dd0961bbe1effca4ed378d2969d6fbd9b579133b722f6968db5cf4d22281e
+ size 1024
libmelotts/models/g-zh_mix_en.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c70d897674847882bd35e780aee696ddaff8d04d5c57e4f9cf37611b6821879f
+ size 1024
libmelotts/models/lexicon.txt ADDED
The diff for this file is too large to render. See raw diff
 
libmelotts/models/tokens.txt ADDED
@@ -0,0 +1,112 @@
+ _ 0
+ AA 1
+ E 2
+ EE 3
+ En 4
+ N 5
+ OO 6
+ V 7
+ a 8
+ a: 9
+ aa 10
+ ae 11
+ ah 12
+ ai 13
+ an 14
+ ang 15
+ ao 16
+ aw 17
+ ay 18
+ b 19
+ by 20
+ c 21
+ ch 22
+ d 23
+ dh 24
+ dy 25
+ e 26
+ e: 27
+ eh 28
+ ei 29
+ en 30
+ eng 31
+ er 32
+ ey 33
+ f 34
+ g 35
+ gy 36
+ h 37
+ hh 38
+ hy 39
+ i 40
+ i0 41
+ i: 42
+ ia 43
+ ian 44
+ iang 45
+ iao 46
+ ie 47
+ ih 48
+ in 49
+ ing 50
+ iong 51
+ ir 52
+ iu 53
+ iy 54
+ j 55
+ jh 56
+ k 57
+ ky 58
+ l 59
+ m 60
+ my 61
+ n 62
+ ng 63
+ ny 64
+ o 65
+ o: 66
+ ong 67
+ ou 68
+ ow 69
+ oy 70
+ p 71
+ py 72
+ q 73
+ r 74
+ ry 75
+ s 76
+ sh 77
+ t 78
+ th 79
+ ts 80
+ ty 81
+ u 82
+ u: 83
+ ua 84
+ uai 85
+ uan 86
+ uang 87
+ uh 88
+ ui 89
+ un 90
+ uo 91
+ uw 92
+ v 93
+ van 94
+ ve 95
+ vn 96
+ w 97
+ x 98
+ y 99
+ z 100
+ zh 101
+ zy 102
+ ! 103
+ ? 104
+ … 105
+ , 106
+ . 107
+ ' 108
+ - 109
+ SP 110
+ UNK 111
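tokens.txt maps each phoneme symbol to its integer id, one "symbol id" pair per line. Loading it into a lookup table is a one-liner (sketch):

with open("libmelotts/models/tokens.txt", encoding="utf-8") as f:
    token_to_id = {sym: int(idx) for sym, idx in (line.rsplit(" ", 1) for line in f)}

print(token_to_id["zh"])  # 101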
libmelotts/python/split_utils.py ADDED
@@ -0,0 +1,173 @@
+ import re
+
+
+ def split_sentence(text, min_len=10, language_str='EN'):
+     if language_str in ['EN', 'FR', 'ES', 'SP']:
+         sentences = split_sentences_latin(text, min_len=min_len)
+     else:
+         sentences = split_sentences_zh(text, min_len=min_len)
+     return sentences
+
+
+ def split_sentences_latin(text, min_len=10):
+     text = re.sub('[。!?;]', '.', text)
+     text = re.sub('[,]', ',', text)
+     text = re.sub('[“”]', '"', text)
+     text = re.sub('[‘’]', "'", text)
+     text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
+     return [item.strip() for item in txtsplit(text, 256, 512) if item.strip()]
+
+
+ def split_sentences_zh(text, min_len=10):
+     text = re.sub('[。!?;]', '.', text)
+     text = re.sub('[,]', ',', text)
+     # Replace newlines, spaces and tabs with a single space
+     text = re.sub('[\n\t ]+', ' ', text)
+     # Insert a split marker after each punctuation mark
+     text = re.sub('([,.!?;])', r'\1 $#!', text)
+     # Split into sentences and strip surrounding whitespace
+     # sentences = [s.strip() for s in re.split('(。|!|?|;)', text)]
+     sentences = [s.strip() for s in text.split('$#!')]
+     if len(sentences[-1]) == 0: del sentences[-1]
+
+     new_sentences = []
+     new_sent = []
+     count_len = 0
+     for ind, sent in enumerate(sentences):
+         new_sent.append(sent)
+         count_len += len(sent)
+         if count_len > min_len or ind == len(sentences) - 1:
+             count_len = 0
+             new_sentences.append(' '.join(new_sent))
+             new_sent = []
+     return merge_short_sentences_zh(new_sentences)
+
+
+ def merge_short_sentences_en(sens):
+     """Avoid short sentences by merging them with the following sentence.
+
+     Args:
+         sens (List[str]): list of input sentences.
+
+     Returns:
+         List[str]: list of output sentences.
+     """
+     sens_out = []
+     for s in sens:
+         # If the previous sentence is too short, merge it with
+         # the current sentence.
+         if len(sens_out) > 0 and len(sens_out[-1].split(" ")) <= 2:
+             sens_out[-1] = sens_out[-1] + " " + s
+         else:
+             sens_out.append(s)
+     try:
+         if len(sens_out[-1].split(" ")) <= 2:
+             sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
+             sens_out.pop(-1)
+     except IndexError:
+         pass
+     return sens_out
+
+
+ def merge_short_sentences_zh(sens):
+     """Avoid short sentences by merging them with the following sentence.
+
+     Args:
+         sens (List[str]): list of input sentences.
+
+     Returns:
+         List[str]: list of output sentences.
+     """
+     sens_out = []
+     for s in sens:
+         # If the previous sentence is too short, merge it with
+         # the current sentence.
+         if len(sens_out) > 0 and len(sens_out[-1]) <= 2:
+             sens_out[-1] = sens_out[-1] + " " + s
+         else:
+             sens_out.append(s)
+     try:
+         if len(sens_out[-1]) <= 2:
+             sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
+             sens_out.pop(-1)
+     except IndexError:
+         pass
+     return sens_out
+
+
+ def txtsplit(text, desired_length=100, max_length=200):
+     """Split text into chunks of a desired length, trying to keep sentences intact."""
+     text = re.sub(r'\n\n+', '\n', text)
+     text = re.sub(r'\s+', ' ', text)
+     text = re.sub(r'[""]', '"', text)
+     text = re.sub(r'([,.?!])', r'\1 ', text)
+     text = re.sub(r'\s+', ' ', text)
+
+     rv = []
+     in_quote = False
+     current = ""
+     split_pos = []
+     pos = -1
+     end_pos = len(text) - 1
+
+     def seek(delta):
+         nonlocal pos, in_quote, current
+         is_neg = delta < 0
+         for _ in range(abs(delta)):
+             if is_neg:
+                 pos -= 1
+                 current = current[:-1]
+             else:
+                 pos += 1
+                 current += text[pos]
+             if text[pos] == '"':
+                 in_quote = not in_quote
+         return text[pos]
+
+     def peek(delta):
+         p = pos + delta
+         return text[p] if p < end_pos and p >= 0 else ""
+
+     def commit():
+         nonlocal rv, current, split_pos
+         rv.append(current)
+         current = ""
+         split_pos = []
+
+     while pos < end_pos:
+         c = seek(1)
+         if len(current) >= max_length:
+             if len(split_pos) > 0 and len(current) > (desired_length / 2):
+                 d = pos - split_pos[-1]
+                 seek(-d)
+             else:
+                 while c not in '!?.\n ' and pos > 0 and len(current) > desired_length:
+                     c = seek(-1)
+             commit()
+         elif not in_quote and (c in '!?\n' or (c in '.,' and peek(1) in '\n ')):
+             while pos < len(text) - 1 and len(current) < max_length and peek(1) in '!?.':
+                 c = seek(1)
+             split_pos.append(pos)
+             if len(current) >= desired_length:
+                 commit()
+         elif in_quote and peek(1) == '"' and peek(2) in '\n ':
+             seek(2)
+             split_pos.append(pos)
+     rv.append(current)
+     rv = [s.strip() for s in rv]
+     rv = [s for s in rv if len(s) > 0 and not re.match(r'^[\s\.,;:!?]*$', s)]
+     return rv
+
+
+ if __name__ == '__main__':
+     zh_text = "好的,我来给你讲一个故事吧。从前有一个小姑娘,她叫做小红。小红非常喜欢在森林里玩耍,她经常会和她的小伙伴们一起去探险。有一天,小红和她的小伙伴们走到了森林深处,突然遇到了一只凶猛的野兽。小红的小伙伴们都吓得不敢动弹,但是小红并没有被吓倒,她勇敢地走向野兽,用她的智慧和勇气成功地制服了野兽,保护了她的小伙伴们。从那以后,小红变得更加勇敢和自信,成为了她小伙伴们心中的英雄。"
+     en_text = "I didn’t know what to do. I said please kill her because it would be better than being kidnapped,” Ben, whose surname CNN is not using for security concerns, said on Wednesday. “It’s a nightmare. I said ‘please kill her, don’t take her there.’"
+     sp_text = "¡Claro! ¿En qué tema te gustaría que te hable en español? Puedo proporcionarte información o conversar contigo sobre una amplia variedad de temas, desde cultura y comida hasta viajes y tecnología. ¿Tienes alguna preferencia en particular?"
+     fr_text = "Bien sûr ! En quelle matière voudriez-vous que je vous parle en français ? Je peux vous fournir des informations ou discuter avec vous sur une grande variété de sujets, que ce soit la culture, la nourriture, les voyages ou la technologie. Avez-vous une préférence particulière ?"
+
+     print(split_sentence(zh_text, language_str='ZH'))
+     print(split_sentence(en_text, language_str='EN'))
+     print(split_sentence(sp_text, language_str='SP'))
+     print(split_sentence(fr_text, language_str='FR'))
libmelotts/python/symbols.py ADDED
@@ -0,0 +1,1237 @@
1
+
2
+ zh_mix_en_symbols = [
3
+ "_",
4
+ "AA",
5
+ "E",
6
+ "EE",
7
+ "En",
8
+ "N",
9
+ "OO",
10
+ "V",
11
+ "a",
12
+ "a:",
13
+ "aa",
14
+ "ae",
15
+ "ah",
16
+ "ai",
17
+ "an",
18
+ "ang",
19
+ "ao",
20
+ "aw",
21
+ "ay",
22
+ "b",
23
+ "by",
24
+ "c",
25
+ "ch",
26
+ "d",
27
+ "dh",
28
+ "dy",
29
+ "e",
30
+ "e:",
31
+ "eh",
32
+ "ei",
33
+ "en",
34
+ "eng",
35
+ "er",
36
+ "ey",
37
+ "f",
38
+ "g",
39
+ "gy",
40
+ "h",
41
+ "hh",
42
+ "hy",
43
+ "i",
44
+ "i0",
45
+ "i:",
46
+ "ia",
47
+ "ian",
48
+ "iang",
49
+ "iao",
50
+ "ie",
51
+ "ih",
52
+ "in",
53
+ "ing",
54
+ "iong",
55
+ "ir",
56
+ "iu",
57
+ "iy",
58
+ "j",
59
+ "jh",
60
+ "k",
61
+ "ky",
62
+ "l",
63
+ "m",
64
+ "my",
65
+ "n",
66
+ "ng",
67
+ "ny",
68
+ "o",
69
+ "o:",
70
+ "ong",
71
+ "ou",
72
+ "ow",
73
+ "oy",
74
+ "p",
75
+ "py",
76
+ "q",
77
+ "r",
78
+ "ry",
79
+ "s",
80
+ "sh",
81
+ "t",
82
+ "th",
83
+ "ts",
84
+ "ty",
85
+ "u",
86
+ "u:",
87
+ "ua",
88
+ "uai",
89
+ "uan",
90
+ "uang",
91
+ "uh",
92
+ "ui",
93
+ "un",
94
+ "uo",
95
+ "uw",
96
+ "v",
97
+ "van",
98
+ "ve",
99
+ "vn",
100
+ "w",
101
+ "x",
102
+ "y",
103
+ "z",
104
+ "zh",
105
+ "zy",
106
+ "!",
107
+ "?",
108
+ "…",
109
+ ",",
110
+ ".",
111
+ "'",
112
+ "-",
113
+ "SP",
114
+ "UNK"
115
+ ]
116
+
117
+
118
+ jp_symbols = [
119
+ "_",
120
+ "\"",
121
+ "(",
122
+ ")",
123
+ "*",
124
+ "/",
125
+ ":",
126
+ "AA",
127
+ "E",
128
+ "EE",
129
+ "En",
130
+ "N",
131
+ "OO",
132
+ "Q",
133
+ "V",
134
+ "[",
135
+ "\\",
136
+ "]",
137
+ "^",
138
+ "a",
139
+ "a:",
140
+ "aa",
141
+ "ae",
142
+ "ah",
143
+ "ai",
144
+ "an",
145
+ "ang",
146
+ "ao",
147
+ "aw",
148
+ "ay",
149
+ "b",
150
+ "by",
151
+ "c",
152
+ "ch",
153
+ "d",
154
+ "dh",
155
+ "dy",
156
+ "e",
157
+ "e:",
158
+ "eh",
159
+ "ei",
160
+ "en",
161
+ "eng",
162
+ "er",
163
+ "ey",
164
+ "f",
165
+ "g",
166
+ "gy",
167
+ "h",
168
+ "hh",
169
+ "hy",
170
+ "i",
171
+ "i0",
172
+ "i:",
173
+ "ia",
174
+ "ian",
175
+ "iang",
176
+ "iao",
177
+ "ie",
178
+ "ih",
179
+ "in",
180
+ "ing",
181
+ "iong",
182
+ "ir",
183
+ "iu",
184
+ "iy",
185
+ "j",
186
+ "jh",
187
+ "k",
188
+ "ky",
189
+ "l",
190
+ "m",
191
+ "my",
192
+ "n",
193
+ "ng",
194
+ "ny",
195
+ "o",
196
+ "o:",
197
+ "ong",
198
+ "ou",
199
+ "ow",
200
+ "oy",
201
+ "p",
202
+ "py",
203
+ "q",
204
+ "r",
205
+ "ry",
206
+ "s",
207
+ "sh",
208
+ "t",
209
+ "th",
210
+ "ts",
211
+ "ty",
212
+ "u",
213
+ "u:",
214
+ "ua",
215
+ "uai",
216
+ "uan",
217
+ "uang",
218
+ "uh",
219
+ "ui",
220
+ "un",
221
+ "uo",
222
+ "uw",
223
+ "v",
224
+ "van",
225
+ "ve",
226
+ "vn",
227
+ "w",
228
+ "x",
229
+ "y",
230
+ "z",
231
+ "zh",
232
+ "zy",
233
+ "~",
234
+ "æ",
235
+ "ç",
236
+ "ð",
237
+ "ø",
238
+ "ŋ",
239
+ "œ",
240
+ "ɐ",
241
+ "ɑ",
242
+ "ɒ",
243
+ "ɔ",
244
+ "ɕ",
245
+ "ə",
246
+ "ɛ",
247
+ "ɜ",
248
+ "ɡ",
249
+ "ɣ",
250
+ "ɥ",
251
+ "ɦ",
252
+ "ɪ",
253
+ "ɫ",
254
+ "ɬ",
255
+ "ɭ",
256
+ "ɯ",
257
+ "ɲ",
258
+ "ɵ",
259
+ "ɸ",
260
+ "ɹ",
261
+ "ɾ",
262
+ "ʁ",
263
+ "ʃ",
264
+ "ʊ",
265
+ "ʌ",
266
+ "ʎ",
267
+ "ʏ",
268
+ "ʑ",
269
+ "ʒ",
270
+ "ʝ",
271
+ "ʲ",
272
+ "ˈ",
273
+ "ˌ",
274
+ "ː",
275
+ "̃",
276
+ "̩",
277
+ "β",
278
+ "θ",
279
+ "ᄀ",
280
+ "ᄁ",
281
+ "ᄂ",
282
+ "ᄃ",
283
+ "ᄄ",
284
+ "ᄅ",
285
+ "ᄆ",
286
+ "ᄇ",
287
+ "ᄈ",
288
+ "ᄉ",
289
+ "ᄊ",
290
+ "ᄋ",
291
+ "ᄌ",
292
+ "ᄍ",
293
+ "ᄎ",
294
+ "ᄏ",
295
+ "ᄐ",
296
+ "ᄑ",
297
+ "ᄒ",
298
+ "ᅡ",
299
+ "ᅢ",
300
+ "ᅣ",
301
+ "ᅤ",
302
+ "ᅥ",
303
+ "ᅦ",
304
+ "ᅧ",
305
+ "ᅨ",
306
+ "ᅩ",
307
+ "ᅪ",
308
+ "ᅫ",
309
+ "ᅬ",
310
+ "ᅭ",
311
+ "ᅮ",
312
+ "ᅯ",
313
+ "ᅰ",
314
+ "ᅱ",
315
+ "ᅲ",
316
+ "ᅳ",
317
+ "ᅴ",
318
+ "ᅵ",
319
+ "ᆨ",
320
+ "ᆫ",
321
+ "ᆮ",
322
+ "ᆯ",
323
+ "ᆷ",
324
+ "ᆸ",
325
+ "ᆼ",
326
+ "ㄸ",
327
+ "!",
328
+ "?",
329
+ "…",
330
+ ",",
331
+ ".",
332
+ "'",
333
+ "-",
334
+ "¿",
335
+ "¡",
336
+ "SP",
337
+ "UNK"
338
+ ]
339
+
340
+ en_symbols = [
341
+ "_",
342
+ "\"",
343
+ "(",
344
+ ")",
345
+ "*",
346
+ "/",
347
+ ":",
348
+ "AA",
349
+ "E",
350
+ "EE",
351
+ "En",
352
+ "N",
353
+ "OO",
354
+ "Q",
355
+ "V",
356
+ "[",
357
+ "\\",
358
+ "]",
359
+ "^",
360
+ "a",
361
+ "a:",
362
+ "aa",
363
+ "ae",
364
+ "ah",
365
+ "ai",
366
+ "an",
367
+ "ang",
368
+ "ao",
369
+ "aw",
370
+ "ay",
371
+ "b",
372
+ "by",
373
+ "c",
374
+ "ch",
375
+ "d",
376
+ "dh",
377
+ "dy",
378
+ "e",
379
+ "e:",
380
+ "eh",
381
+ "ei",
382
+ "en",
383
+ "eng",
384
+ "er",
385
+ "ey",
386
+ "f",
387
+ "g",
388
+ "gy",
389
+ "h",
390
+ "hh",
391
+ "hy",
392
+ "i",
393
+ "i0",
394
+ "i:",
395
+ "ia",
396
+ "ian",
397
+ "iang",
398
+ "iao",
399
+ "ie",
400
+ "ih",
401
+ "in",
402
+ "ing",
403
+ "iong",
404
+ "ir",
405
+ "iu",
406
+ "iy",
407
+ "j",
408
+ "jh",
409
+ "k",
410
+ "ky",
411
+ "l",
412
+ "m",
413
+ "my",
414
+ "n",
415
+ "ng",
416
+ "ny",
417
+ "o",
418
+ "o:",
419
+ "ong",
420
+ "ou",
421
+ "ow",
422
+ "oy",
423
+ "p",
424
+ "py",
425
+ "q",
426
+ "r",
427
+ "ry",
428
+ "s",
429
+ "sh",
430
+ "t",
431
+ "th",
432
+ "ts",
433
+ "ty",
434
+ "u",
435
+ "u:",
436
+ "ua",
437
+ "uai",
438
+ "uan",
439
+ "uang",
440
+ "uh",
441
+ "ui",
442
+ "un",
443
+ "uo",
444
+ "uw",
445
+ "v",
446
+ "van",
447
+ "ve",
448
+ "vn",
449
+ "w",
450
+ "x",
451
+ "y",
452
+ "z",
453
+ "zh",
454
+ "zy",
455
+ "~",
456
+ "¡",
457
+ "¿",
458
+ "æ",
459
+ "ç",
460
+ "ð",
461
+ "ø",
462
+ "ŋ",
463
+ "œ",
464
+ "ɐ",
465
+ "ɑ",
466
+ "ɒ",
467
+ "ɔ",
468
+ "ɕ",
469
+ "ə",
470
+ "ɛ",
471
+ "ɜ",
472
+ "ɡ",
473
+ "ɣ",
474
+ "ɥ",
475
+ "ɦ",
476
+ "ɪ",
477
+ "ɫ",
478
+ "ɬ",
479
+ "ɭ",
480
+ "ɯ",
481
+ "ɲ",
482
+ "ɵ",
483
+ "ɸ",
484
+ "ɹ",
485
+ "ɾ",
486
+ "ʁ",
487
+ "ʃ",
488
+ "ʊ",
489
+ "ʌ",
490
+ "ʎ",
491
+ "ʏ",
492
+ "ʑ",
493
+ "ʒ",
494
+ "ʝ",
495
+ "ʲ",
496
+ "ˈ",
497
+ "ˌ",
498
+ "ː",
499
+ "̃",
500
+ "̩",
501
+ "β",
502
+ "θ",
503
+ "ᄀ",
504
+ "ᄁ",
505
+ "ᄂ",
506
+ "ᄃ",
507
+ "ᄄ",
508
+ "ᄅ",
509
+ "ᄆ",
510
+ "ᄇ",
511
+ "ᄈ",
512
+ "ᄉ",
513
+ "ᄊ",
514
+ "ᄋ",
515
+ "ᄌ",
516
+ "ᄍ",
517
+ "ᄎ",
518
+ "ᄏ",
519
+ "ᄐ",
520
+ "ᄑ",
521
+ "ᄒ",
522
+ "ᅡ",
523
+ "ᅢ",
524
+ "ᅣ",
525
+ "ᅤ",
526
+ "ᅥ",
527
+ "ᅦ",
528
+ "ᅧ",
529
+ "ᅨ",
530
+ "ᅩ",
531
+ "ᅪ",
532
+ "ᅫ",
533
+ "ᅬ",
534
+ "ᅭ",
535
+ "ᅮ",
536
+ "ᅯ",
537
+ "ᅰ",
538
+ "ᅱ",
539
+ "ᅲ",
540
+ "ᅳ",
541
+ "ᅴ",
542
+ "ᅵ",
543
+ "ᆨ",
544
+ "ᆫ",
545
+ "ᆮ",
546
+ "ᆯ",
547
+ "ᆷ",
548
+ "ᆸ",
549
+ "ᆼ",
550
+ "ㄸ",
551
+ "!",
552
+ "?",
553
+ "…",
554
+ ",",
555
+ ".",
556
+ "'",
557
+ "-",
558
+ "SP",
559
+ "UNK"
560
+ ]
561
+
562
+ kr_symbols = [
563
+ "_",
564
+ "\"",
565
+ "(",
566
+ ")",
567
+ "*",
568
+ "/",
569
+ ":",
570
+ "AA",
571
+ "E",
572
+ "EE",
573
+ "En",
574
+ "N",
575
+ "OO",
576
+ "Q",
577
+ "V",
578
+ "[",
579
+ "\\",
580
+ "]",
581
+ "^",
582
+ "a",
583
+ "a:",
584
+ "aa",
585
+ "ae",
586
+ "ah",
587
+ "ai",
588
+ "an",
589
+ "ang",
590
+ "ao",
591
+ "aw",
592
+ "ay",
593
+ "b",
594
+ "by",
595
+ "c",
596
+ "ch",
597
+ "d",
598
+ "dh",
599
+ "dy",
600
+ "e",
601
+ "e:",
602
+ "eh",
603
+ "ei",
604
+ "en",
605
+ "eng",
606
+ "er",
607
+ "ey",
608
+ "f",
609
+ "g",
610
+ "gy",
611
+ "h",
612
+ "hh",
613
+ "hy",
614
+ "i",
615
+ "i0",
616
+ "i:",
617
+ "ia",
618
+ "ian",
619
+ "iang",
620
+ "iao",
621
+ "ie",
622
+ "ih",
623
+ "in",
624
+ "ing",
625
+ "iong",
626
+ "ir",
627
+ "iu",
628
+ "iy",
629
+ "j",
630
+ "jh",
631
+ "k",
632
+ "ky",
633
+ "l",
634
+ "m",
635
+ "my",
636
+ "n",
637
+ "ng",
638
+ "ny",
639
+ "o",
640
+ "o:",
641
+ "ong",
642
+ "ou",
643
+ "ow",
644
+ "oy",
645
+ "p",
646
+ "py",
647
+ "q",
648
+ "r",
649
+ "ry",
650
+ "s",
651
+ "sh",
652
+ "t",
653
+ "th",
654
+ "ts",
655
+ "ty",
656
+ "u",
657
+ "u:",
658
+ "ua",
659
+ "uai",
660
+ "uan",
661
+ "uang",
662
+ "uh",
663
+ "ui",
664
+ "un",
665
+ "uo",
666
+ "uw",
667
+ "v",
668
+ "van",
669
+ "ve",
670
+ "vn",
671
+ "w",
672
+ "x",
673
+ "y",
674
+ "z",
675
+ "zh",
676
+ "zy",
677
+ "~",
678
+ "¡",
679
+ "¿",
680
+ "æ",
681
+ "ç",
682
+ "ð",
683
+ "ø",
684
+ "ŋ",
685
+ "œ",
686
+ "ɐ",
687
+ "ɑ",
688
+ "ɒ",
689
+ "ɔ",
690
+ "ɕ",
691
+ "ə",
692
+ "ɛ",
693
+ "ɜ",
694
+ "ɡ",
695
+ "ɣ",
696
+ "ɥ",
697
+ "ɦ",
698
+ "ɪ",
699
+ "ɫ",
700
+ "ɬ",
701
+ "ɭ",
702
+ "ɯ",
703
+ "ɲ",
704
+ "ɵ",
705
+ "ɸ",
706
+ "ɹ",
707
+ "ɾ",
708
+ "ʁ",
709
+ "ʃ",
710
+ "ʊ",
711
+ "ʌ",
712
+ "ʎ",
713
+ "ʏ",
714
+ "ʑ",
715
+ "ʒ",
716
+ "ʝ",
717
+ "ʲ",
718
+ "ˈ",
719
+ "ˌ",
720
+ "ː",
721
+ "̃",
722
+ "̩",
723
+ "β",
724
+ "θ",
725
+ "ᄀ",
726
+ "ᄁ",
727
+ "ᄂ",
728
+ "ᄃ",
729
+ "ᄄ",
730
+ "ᄅ",
731
+ "ᄆ",
732
+ "ᄇ",
733
+ "ᄈ",
734
+ "ᄉ",
735
+ "ᄊ",
736
+ "ᄋ",
737
+ "ᄌ",
738
+ "ᄍ",
739
+ "ᄎ",
740
+ "ᄏ",
741
+ "ᄐ",
742
+ "ᄑ",
743
+ "ᄒ",
744
+ "ᅡ",
745
+ "ᅢ",
746
+ "ᅣ",
747
+ "ᅤ",
748
+ "ᅥ",
749
+ "ᅦ",
750
+ "ᅧ",
751
+ "ᅨ",
752
+ "ᅩ",
753
+ "ᅪ",
754
+ "ᅫ",
755
+ "ᅬ",
756
+ "ᅭ",
757
+ "ᅮ",
758
+ "ᅯ",
759
+ "ᅰ",
760
+ "ᅱ",
761
+ "ᅲ",
762
+ "ᅳ",
763
+ "ᅴ",
764
+ "ᅵ",
765
+ "ᆨ",
766
+ "ᆫ",
767
+ "ᆮ",
768
+ "ᆯ",
769
+ "ᆷ",
770
+ "ᆸ",
771
+ "ᆼ",
772
+ "ㄸ",
773
+ "!",
774
+ "?",
775
+ "…",
776
+ ",",
777
+ ".",
778
+ "'",
779
+ "-",
780
+ "SP",
781
+ "UNK"
782
+ ]
783
+
784
+ es_symbols = [
785
+ "_",
786
+ "\"",
787
+ "(",
788
+ ")",
789
+ "*",
790
+ "/",
791
+ ":",
792
+ "AA",
793
+ "E",
794
+ "EE",
795
+ "En",
796
+ "N",
797
+ "OO",
798
+ "Q",
799
+ "V",
800
+ "[",
801
+ "\\",
802
+ "]",
803
+ "^",
804
+ "a",
805
+ "a:",
806
+ "aa",
807
+ "ae",
808
+ "ah",
809
+ "ai",
810
+ "an",
811
+ "ang",
812
+ "ao",
813
+ "aw",
814
+ "ay",
815
+ "b",
816
+ "by",
817
+ "c",
818
+ "ch",
819
+ "d",
820
+ "dh",
821
+ "dy",
822
+ "e",
823
+ "e:",
824
+ "eh",
825
+ "ei",
826
+ "en",
827
+ "eng",
828
+ "er",
829
+ "ey",
830
+ "f",
831
+ "g",
832
+ "gy",
833
+ "h",
834
+ "hh",
835
+ "hy",
836
+ "i",
837
+ "i0",
838
+ "i:",
839
+ "ia",
840
+ "ian",
841
+ "iang",
842
+ "iao",
843
+ "ie",
844
+ "ih",
845
+ "in",
846
+ "ing",
847
+ "iong",
848
+ "ir",
849
+ "iu",
850
+ "iy",
851
+ "j",
852
+ "jh",
853
+ "k",
854
+ "ky",
855
+ "l",
856
+ "m",
857
+ "my",
858
+ "n",
859
+ "ng",
860
+ "ny",
861
+ "o",
862
+ "o:",
863
+ "ong",
864
+ "ou",
865
+ "ow",
866
+ "oy",
867
+ "p",
868
+ "py",
869
+ "q",
870
+ "r",
871
+ "ry",
872
+ "s",
873
+ "sh",
874
+ "t",
875
+ "th",
876
+ "ts",
877
+ "ty",
878
+ "u",
879
+ "u:",
880
+ "ua",
881
+ "uai",
882
+ "uan",
883
+ "uang",
884
+ "uh",
885
+ "ui",
886
+ "un",
887
+ "uo",
888
+ "uw",
889
+ "v",
890
+ "van",
891
+ "ve",
892
+ "vn",
893
+ "w",
894
+ "x",
895
+ "y",
896
+ "z",
897
+ "zh",
898
+ "zy",
899
+ "~",
900
+ "¡",
901
+ "¿",
902
+ "æ",
903
+ "ç",
904
+ "ð",
905
+ "ø",
906
+ "ŋ",
907
+ "œ",
908
+ "ɐ",
909
+ "ɑ",
910
+ "ɒ",
911
+ "ɔ",
912
+ "ɕ",
913
+ "ə",
914
+ "ɛ",
915
+ "ɜ",
916
+ "ɡ",
917
+ "ɣ",
918
+ "ɥ",
919
+ "ɦ",
920
+ "ɪ",
921
+ "ɫ",
922
+ "ɬ",
923
+ "ɭ",
924
+ "ɯ",
925
+ "ɲ",
926
+ "ɵ",
927
+ "ɸ",
928
+ "ɹ",
929
+ "ɾ",
930
+ "ʁ",
931
+ "ʃ",
932
+ "ʊ",
933
+ "ʌ",
934
+ "ʎ",
935
+ "ʏ",
936
+ "ʑ",
937
+ "ʒ",
938
+ "ʝ",
939
+ "ʲ",
940
+ "ˈ",
941
+ "ˌ",
942
+ "ː",
943
+ "̃",
944
+ "̩",
945
+ "β",
946
+ "θ",
947
+ "ᄀ",
948
+ "ᄁ",
949
+ "ᄂ",
950
+ "ᄃ",
951
+ "ᄄ",
952
+ "ᄅ",
953
+ "ᄆ",
954
+ "ᄇ",
955
+ "ᄈ",
956
+ "ᄉ",
957
+ "ᄊ",
958
+ "ᄋ",
959
+ "ᄌ",
960
+ "ᄍ",
961
+ "ᄎ",
962
+ "ᄏ",
963
+ "ᄐ",
964
+ "ᄑ",
965
+ "ᄒ",
966
+ "ᅡ",
967
+ "ᅢ",
968
+ "ᅣ",
969
+ "ᅤ",
970
+ "ᅥ",
971
+ "ᅦ",
972
+ "ᅧ",
973
+ "ᅨ",
974
+ "ᅩ",
975
+ "ᅪ",
976
+ "ᅫ",
977
+ "ᅬ",
978
+ "ᅭ",
979
+ "ᅮ",
980
+ "ᅯ",
981
+ "ᅰ",
982
+ "ᅱ",
983
+ "ᅲ",
984
+ "ᅳ",
985
+ "ᅴ",
986
+ "ᅵ",
987
+ "ᆨ",
988
+ "ᆫ",
989
+ "ᆮ",
990
+ "ᆯ",
991
+ "ᆷ",
992
+ "ᆸ",
993
+ "ᆼ",
994
+ "ㄸ",
995
+ "!",
996
+ "?",
997
+ "…",
998
+ ",",
999
+ ".",
1000
+ "'",
1001
+ "-",
1002
+ "SP",
1003
+ "UNK"
1004
+ ]
1005
+
1006
+ fr_symbols = [
1007
+ "_",
1008
+ "\"",
1009
+ "(",
1010
+ ")",
1011
+ "*",
1012
+ "/",
1013
+ ":",
1014
+ "AA",
1015
+ "E",
1016
+ "EE",
1017
+ "En",
1018
+ "N",
1019
+ "OO",
1020
+ "Q",
1021
+ "V",
1022
+ "[",
1023
+ "\\",
1024
+ "]",
1025
+ "^",
1026
+ "a",
1027
+ "a:",
1028
+ "aa",
1029
+ "ae",
1030
+ "ah",
1031
+ "ai",
1032
+ "an",
1033
+ "ang",
1034
+ "ao",
1035
+ "aw",
1036
+ "ay",
1037
+ "b",
1038
+ "by",
1039
+ "c",
1040
+ "ch",
1041
+ "d",
1042
+ "dh",
1043
+ "dy",
1044
+ "e",
1045
+ "e:",
1046
+ "eh",
1047
+ "ei",
1048
+ "en",
1049
+ "eng",
1050
+ "er",
1051
+ "ey",
1052
+ "f",
1053
+ "g",
1054
+ "gy",
1055
+ "h",
1056
+ "hh",
1057
+ "hy",
1058
+ "i",
1059
+ "i0",
1060
+ "i:",
1061
+ "ia",
1062
+ "ian",
1063
+ "iang",
1064
+ "iao",
1065
+ "ie",
1066
+ "ih",
1067
+ "in",
1068
+ "ing",
1069
+ "iong",
1070
+ "ir",
1071
+ "iu",
1072
+ "iy",
1073
+ "j",
1074
+ "jh",
1075
+ "k",
1076
+ "ky",
1077
+ "l",
1078
+ "m",
1079
+ "my",
1080
+ "n",
1081
+ "ng",
1082
+ "ny",
1083
+ "o",
1084
+ "o:",
1085
+ "ong",
1086
+ "ou",
1087
+ "ow",
1088
+ "oy",
1089
+ "p",
1090
+ "py",
1091
+ "q",
1092
+ "r",
1093
+ "ry",
1094
+ "s",
1095
+ "sh",
1096
+ "t",
1097
+ "th",
1098
+ "ts",
1099
+ "ty",
1100
+ "u",
1101
+ "u:",
1102
+ "ua",
1103
+ "uai",
1104
+ "uan",
1105
+ "uang",
1106
+ "uh",
1107
+ "ui",
1108
+ "un",
1109
+ "uo",
1110
+ "uw",
1111
+ "v",
1112
+ "van",
1113
+ "ve",
1114
+ "vn",
1115
+ "w",
1116
+ "x",
1117
+ "y",
1118
+ "z",
1119
+ "zh",
1120
+ "zy",
1121
+ "~",
1122
+ "¡",
1123
+ "¿",
1124
+ "æ",
1125
+ "ç",
1126
+ "ð",
1127
+ "ø",
1128
+ "ŋ",
1129
+ "œ",
1130
+ "ɐ",
1131
+ "ɑ",
1132
+ "ɒ",
1133
+ "ɔ",
1134
+ "ɕ",
1135
+ "ə",
1136
+ "ɛ",
1137
+ "ɜ",
1138
+ "ɡ",
1139
+ "ɣ",
1140
+ "ɥ",
1141
+ "ɦ",
1142
+ "ɪ",
1143
+ "ɫ",
1144
+ "ɬ",
1145
+ "ɭ",
1146
+ "ɯ",
1147
+ "ɲ",
1148
+ "ɵ",
1149
+ "ɸ",
1150
+ "ɹ",
1151
+ "ɾ",
1152
+ "ʁ",
1153
+ "ʃ",
1154
+ "ʊ",
1155
+ "ʌ",
1156
+ "ʎ",
1157
+ "ʏ",
1158
+ "ʑ",
1159
+ "ʒ",
1160
+ "ʝ",
1161
+ "ʲ",
1162
+ "ˈ",
1163
+ "ˌ",
1164
+ "ː",
1165
+ "̃",
1166
+ "̩",
1167
+ "β",
1168
+ "θ",
1169
+ "ᄀ",
1170
+ "ᄁ",
1171
+ "ᄂ",
1172
+ "ᄃ",
1173
+ "ᄄ",
1174
+ "ᄅ",
1175
+ "ᄆ",
1176
+ "ᄇ",
1177
+ "ᄈ",
1178
+ "ᄉ",
1179
+ "ᄊ",
1180
+ "ᄋ",
1181
+ "ᄌ",
1182
+ "ᄍ",
1183
+ "ᄎ",
1184
+ "ᄏ",
1185
+ "ᄐ",
1186
+ "ᄑ",
1187
+ "ᄒ",
1188
+ "ᅡ",
1189
+ "ᅢ",
1190
+ "ᅣ",
1191
+ "ᅤ",
1192
+ "ᅥ",
1193
+ "ᅦ",
1194
+ "ᅧ",
1195
+ "ᅨ",
1196
+ "ᅩ",
1197
+ "ᅪ",
1198
+ "ᅫ",
1199
+ "ᅬ",
1200
+ "ᅭ",
1201
+ "ᅮ",
1202
+ "ᅯ",
1203
+ "ᅰ",
1204
+ "ᅱ",
1205
+ "ᅲ",
1206
+ "ᅳ",
1207
+ "ᅴ",
1208
+ "ᅵ",
1209
+ "ᆨ",
1210
+ "ᆫ",
1211
+ "ᆮ",
1212
+ "ᆯ",
1213
+ "ᆷ",
1214
+ "ᆸ",
1215
+ "ᆼ",
1216
+ "ㄸ",
1217
+ "!",
1218
+ "?",
1219
+ "…",
1220
+ ",",
1221
+ ".",
1222
+ "'",
1223
+ "-",
1224
+ "SP",
1225
+ "UNK"
1226
+ ]
1227
+
1228
+ LANG_TO_SYMBOL_MAP = {
1229
+ "ZH": zh_mix_en_symbols,
1230
+ "ZH_MIX_EN": zh_mix_en_symbols,
1231
+ "JP": jp_symbols,
1232
+ "EN": en_symbols,
1233
+ "KR": kr_symbols,
1234
+ "ES": es_symbols,
1235
+ "SP": es_symbols,
1236
+ "FR": fr_symbols
1237
+ }
libmelotts/python/text/__init__.py ADDED
@@ -0,0 +1,35 @@
+ from .symbols import *
+
+
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
+
+
+ def cleaned_text_to_sequence(cleaned_text, tones, language, symbol_to_id=None):
+     """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+     Args:
+         text: string to convert to a sequence
+     Returns:
+         List of integers corresponding to the symbols in the text
+     """
+     symbol_to_id_map = symbol_to_id if symbol_to_id else _symbol_to_id
+     phones = [symbol_to_id_map[symbol] for symbol in cleaned_text]
+     tone_start = language_tone_start_map[language]
+     tones = [i + tone_start for i in tones]
+     lang_id = language_id_map[language]
+     lang_ids = [lang_id for i in phones]
+     return phones, tones, lang_ids
+
+
+ def get_bert(norm_text, word2ph, language, device):
+     from .chinese_bert import get_bert_feature as zh_bert
+     from .english_bert import get_bert_feature as en_bert
+     from .japanese_bert import get_bert_feature as jp_bert
+     from .chinese_mix import get_bert_feature as zh_mix_en_bert
+     from .spanish_bert import get_bert_feature as sp_bert
+     from .french_bert import get_bert_feature as fr_bert
+     from .korean import get_bert_feature as kr_bert
+
+     lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert, 'ZH_MIX_EN': zh_mix_en_bert,
+                           'FR': fr_bert, 'SP': sp_bert, 'ES': sp_bert, "KR": kr_bert}
+     bert = lang_bert_func_map[language](norm_text, word2ph, device)
+     return bert
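A sketch of calling cleaned_text_to_sequence, assuming the maps imported from .symbols (symbols, language_tone_start_map, language_id_map) contain a "ZH" entry; the phoneme and tone values below are illustrative only:

phones, tones, lang_ids = cleaned_text_to_sequence(
    ["_", "n", "i", "_"],  # cleaned phoneme symbols
    [0, 3, 3, 0],          # per-phoneme tones
    "ZH",
)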
libmelotts/python/text/bert-base-multilingual-uncased/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
libmelotts/python/text/bert-base-multilingual-uncased/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
libmelotts/python/text/bert-base-multilingual-uncased/tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "cls_token": "[CLS]",
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "special_tokens_map_file": null,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
libmelotts/python/text/bert-base-multilingual-uncased/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
libmelotts/python/text/bert-base-uncased/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
libmelotts/python/text/bert-base-uncased/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
libmelotts/python/text/bert-base-uncased/tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "cls_token": "[CLS]",
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "special_tokens_map_file": null,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
libmelotts/python/text/bert-base-uncased/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
libmelotts/python/text/chinese.py ADDED
@@ -0,0 +1,198 @@
+import os
+import re
+
+import cn2an
+from pypinyin import lazy_pinyin, Style
+
+from .symbols import punctuation
+from .tone_sandhi import ToneSandhi
+
+current_file_path = os.path.dirname(__file__)
+pinyin_to_symbol_map = {
+    line.split("\t")[0]: line.strip().split("\t")[1]
+    for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
+}
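+# Assumed layout of opencpop-strict.txt (inferred from the code, not documented
+# here): one tab-separated pair per line, pinyin syllable -> space-joined phone
+# symbols, which _g2p() below splits with .split(" ").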
+
+rep_map = {
+    ":": ",",
+    ";": ",",
+    ",": ",",
+    "。": ".",
+    "!": "!",
+    "?": "?",
+    "\n": ".",
+    "·": ",",
+    "、": ",",
+    "...": "…",
+    "$": ".",
+    "“": "'",
+    "”": "'",
+    "‘": "'",
+    "’": "'",
+    "(": "'",
+    ")": "'",
+    "(": "'",
+    ")": "'",
+    "《": "'",
+    "》": "'",
+    "【": "'",
+    "】": "'",
+    "[": "'",
+    "]": "'",
+    "—": "-",
+    "~": "-",
+    "~": "-",
+    "「": "'",
+    "」": "'",
+}
+
+tone_modifier = ToneSandhi()
+
+
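+# Illustrative example (added comment, not from the source):
+# replace_punctuation("你好!hello。") maps the full-width "!" and "。" via
+# rep_map and strips the Latin letters, leaving "你好!.".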
+def replace_punctuation(text):
+    text = text.replace("嗯", "恩").replace("呣", "母")
+    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
+
+    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
+
+    replaced_text = re.sub(
+        r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
+    )
+
+    return replaced_text
+
+
+def g2p(text):
+    pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
+    sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
+    phones, tones, word2ph = _g2p(sentences)
+    assert sum(word2ph) == len(phones)
+    # This assert can fail on some inputs; wrap the call in try/except if needed.
+    assert len(word2ph) == len(text)
+    phones = ["_"] + phones + ["_"]
+    tones = [0] + tones + [0]
+    word2ph = [1] + word2ph + [1]
+    return phones, tones, word2ph
+
+
+def _get_initials_finals(word):
+    initials = []
+    finals = []
+    orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
+    orig_finals = lazy_pinyin(
+        word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
+    )
+    for c, v in zip(orig_initials, orig_finals):
+        initials.append(c)
+        finals.append(v)
+    return initials, finals
+
+
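+# Note (pypinyin behavior, stated here as an assumption): with
+# neutral_tone_with_five=True, Style.FINALS_TONE3 finals end in a tone digit
+# 1-5 (5 = neutral tone); _g2p() below splits it off as v[-1].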
+def _g2p(segments):
+    import jieba.posseg as psg
+
+    phones_list = []
+    tones_list = []
+    word2ph = []
+    for seg in segments:
+        # Strip all English letters from the sentence
+        seg = re.sub("[a-zA-Z]+", "", seg)
+        seg_cut = psg.lcut(seg)
+        initials = []
+        finals = []
+        seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
+        for word, pos in seg_cut:
+            if pos == "eng":
+                continue
+            sub_initials, sub_finals = _get_initials_finals(word)
+            sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
+            initials.append(sub_initials)
+            finals.append(sub_finals)
+
+            # assert len(sub_initials) == len(sub_finals) == len(word)
+        initials = sum(initials, [])
+        finals = sum(finals, [])
+
+        for c, v in zip(initials, finals):
+            raw_pinyin = c + v
+            # NOTE: post-process pypinyin output;
+            # we distinguish i, ii and iii
+            if c == v:
+                assert c in punctuation
+                phone = [c]
+                tone = "0"
+                word2ph.append(1)
+            else:
+                v_without_tone = v[:-1]
+                tone = v[-1]
+
+                pinyin = c + v_without_tone
+                assert tone in "12345"
+
+                if c:
+                    # syllable with an initial consonant
+                    v_rep_map = {
+                        "uei": "ui",
+                        "iou": "iu",
+                        "uen": "un",
+                    }
+                    if v_without_tone in v_rep_map.keys():
+                        pinyin = c + v_rep_map[v_without_tone]
+                else:
+                    # zero-initial syllable
+                    pinyin_rep_map = {
+                        "ing": "ying",
+                        "i": "yi",
+                        "in": "yin",
+                        "u": "wu",
+                    }
+                    if pinyin in pinyin_rep_map.keys():
+                        pinyin = pinyin_rep_map[pinyin]
+                    else:
+                        single_rep_map = {
+                            "v": "yu",
+                            "e": "e",
+                            "i": "y",
+                            "u": "w",
+                        }
+                        if pinyin[0] in single_rep_map.keys():
+                            pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
+
+                assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
+                phone = pinyin_to_symbol_map[pinyin].split(" ")
+                word2ph.append(len(phone))
+
+            phones_list += phone
+            tones_list += [int(tone)] * len(phone)
+    return phones_list, tones_list, word2ph
+
+
+def text_normalize(text):
+    numbers = re.findall(r"\d+(?:\.?\d+)?", text)
+    for number in numbers:
+        text = text.replace(number, cn2an.an2cn(number), 1)
+    text = replace_punctuation(text)
+    return text
+
+
+def get_bert_feature(text, word2ph, device=None):
+    from text import chinese_bert
+
+    return chinese_bert.get_bert_feature(text, word2ph, device=device)
+
+
+if __name__ == "__main__":
+    from text.chinese_bert import get_bert_feature
+
+    text = "啊!chemistry 但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
+    text = text_normalize(text)
+    print(text)
+    phones, tones, word2ph = g2p(text)
+    bert = get_bert_feature(text, word2ph)
+
+    print(phones, tones, word2ph, bert.shape)
+
+
+    # # Example usage
+    # text = "这是一个示例文本:,你好!这是一个测试...."
+    # print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试
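
The frontend in `chinese.py` can be exercised end to end without the BERT step. A minimal sketch, assuming `libmelotts/python` is on `sys.path` (so the module imports as `text.chinese`) and `jieba`, `pypinyin`, and `cn2an` are installed:

```python
# Text -> normalized text -> phones/tones/word2ph, skipping BERT features.
from text.chinese import text_normalize, g2p

raw = "今天是2024年,天气不错!"
norm = text_normalize(raw)  # digits rewritten by cn2an, punctuation mapped
phones, tones, word2ph = g2p(norm)

# word2ph[i] is the number of phones contributed by the i-th character
# (plus the "_" padding at both ends), so sum(word2ph) == len(phones).
print(norm)
print(phones, tones, word2ph)
```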