GetcharZp committed on
Commit
3bd03cf
·
1 Parent(s): e7dc12c

:sparkles: support piper tts

Browse files
README.md CHANGED
@@ -19,12 +19,10 @@ tags:
19
  <a href="https://github.com/getcharzp/go-speech/pulls" target="_blank">
20
  <img src="https://img.shields.io/github/issues-pr/getcharzp/go-speech?style=for-the-badge" alt="go-speech pull-requests"/>
21
  </a>
22
- <a href='https://github.com/getcharzp/go-speech/releases'>
23
- <img src='https://img.shields.io/github/release/getcharzp/go-speech?&label=Latest&style=for-the-badge'>
24
- </a>
25
  </p>
26
 
27
- go-speech 基于 Golang + [ONNX](https://github.com/microsoft/onnxruntime/releases/tag/v1.23.2) 构建的轻量语音库,支持 TTS(文本转语音)与 ASR(语音转文字)。 集成 MeloTTS 、达摩院 Paraformer 架构模型、Whisper 模型。
 
28
 
29
  ## 安装
30
 
@@ -40,6 +38,8 @@ git clone https://huggingface.co/getcharzp/go-speech
40
 
41
  ### TTS
42
 
 
 
43
  ```go
44
  package main
45
 
@@ -74,6 +74,48 @@ func main() {
74
  <source src="https://media.githubusercontent.com/media/GetcharZp/go-speech/master/assets/output.wav" type="audio/wav">
75
  </audio>
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  ### ASR
78
 
79
  #### Paraformer
@@ -131,4 +173,4 @@ func main() {
131
  }
132
  fmt.Printf("识别结果: %s\n", text) // Yesterday was星期一Today is Tuesday明天是星期三
133
  }
134
- ```
 
19
  <a href="https://github.com/getcharzp/go-speech/pulls" target="_blank">
20
  <img src="https://img.shields.io/github/issues-pr/getcharzp/go-speech?style=for-the-badge" alt="go-speech pull-requests"/>
21
  </a>
 
 
 
22
  </p>
23
 
24
+ go-speech 基于 Golang + [ONNX](https://github.com/microsoft/onnxruntime/releases/tag/v1.23.2) 构建的轻量语音库,支持 TTS(文本转语音)与 ASR(语音转文字)。
25
+ 集成 MeloTTS、Piper、达摩院 Paraformer 架构模型、Whisper 模型。
26
 
27
  ## 安装
28
 
 
38
 
39
  ### TTS
40
 
41
+ #### [MeloTTS](https://github.com/myshell-ai/MeloTTS) 支持中英混合合成
42
+
43
  ```go
44
  package main
45
 
 
74
  <source src="https://media.githubusercontent.com/media/GetcharZp/go-speech/master/assets/output.wav" type="audio/wav">
75
  </audio>
76
 
77
+ #### [Piper](https://github.com/rhasspy/piper) 支持中文合成
78
+
79
+ ```go
80
+ package main
81
+
82
+ import (
83
+ "github.com/getcharzp/go-speech/tts/pipertts"
84
+ "github.com/up-zero/gotool/fileutil"
85
+ "log"
86
+ )
87
+
88
+ func main() {
89
+ cfg := pipertts.Config{
90
+ OnnxRuntimeLibPath: "../lib/onnxruntime.dll",
91
+ ModelPath: "../pipertts_weights/zh_CN-xiao_ya-medium.onnx",
92
+ ConfigPath: "../pipertts_weights/zh_CN-xiao_ya-medium.onnx.json",
93
+ }
94
+
95
+ ttsEngine, err := pipertts.NewEngine(cfg)
96
+ if err != nil {
97
+ log.Fatalf("创建引擎失败: %v", err)
98
+ }
99
+ defer ttsEngine.Destroy()
100
+
101
+ testText := "2019年12月30日,中国人口突破14亿人。联系电话: 13800138000。"
102
+ wavBytes, err := ttsEngine.SynthesizeToWav(testText)
103
+ if err != nil {
104
+ log.Fatalf("合成失败: %v", err)
105
+ }
106
+
107
+ outputPath := "pipertts_output.wav"
108
+ err = fileutil.FileSave(outputPath, wavBytes)
109
+ if err != nil {
110
+ log.Fatalf("保存失败: %v", err)
111
+ }
112
+ }
113
+ ```
114
+
115
+ <audio controls>
116
+ <source src="https://media.githubusercontent.com/media/GetcharZp/go-speech/master/examples/pipertts_output.wav" type="audio/wav">
117
+ </audio>
118
+
119
  ### ASR
120
 
121
  #### Paraformer
 
173
  }
174
  fmt.Printf("识别结果: %s\n", text) // Yesterday was星期一Today is Tuesday明天是星期三
175
  }
176
+ ```
examples/pipertts_output.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4624bda7cb745299f3994dcc2117f972bbf5ded60e6fdd4323a1f3f2249ef4ef
3
+ size 393772
examples/pipertts_test.go ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package examples
2
+
3
+ import (
4
+ "github.com/up-zero/gotool/fileutil"
5
+ "testing"
6
+
7
+ "github.com/getcharzp/go-speech/tts/pipertts"
8
+ )
9
+
10
+ func TestPiperTTS(t *testing.T) {
11
+ cfg := pipertts.Config{
12
+ OnnxRuntimeLibPath: "../lib/onnxruntime.dll",
13
+ ModelPath: "../pipertts_weights/zh_CN-xiao_ya-medium.onnx",
14
+ ConfigPath: "../pipertts_weights/zh_CN-xiao_ya-medium.onnx.json",
15
+ }
16
+
17
+ ttsEngine, err := pipertts.NewEngine(cfg)
18
+ if err != nil {
19
+ t.Fatalf("创建引擎失败: %v", err)
20
+ }
21
+ defer ttsEngine.Destroy()
22
+
23
+ testText := "2019年12月30日,中国人口突破14亿人。联系电话: 13800138000。"
24
+ wavBytes, err := ttsEngine.SynthesizeToWav(testText)
25
+ if err != nil {
26
+ t.Fatalf("合成失败: %v", err)
27
+ }
28
+
29
+ outputPath := "pipertts_output.wav"
30
+ err = fileutil.FileSave(outputPath, wavBytes)
31
+ if err != nil {
32
+ t.Fatalf("保存失败: %v", err)
33
+ }
34
+ }
pipertts_weights/zh_CN-xiao_ya-medium.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb2ce4ca1f5a36a7b23ae48f0651ce2c854c331ef7a804e5c5dc643d0f74f0e2
3
+ size 63221984
pipertts_weights/zh_CN-xiao_ya-medium.onnx.json ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio": {
3
+ "sample_rate": 22050,
4
+ "quality": "medium"
5
+ },
6
+ "espeak": {
7
+ "voice": "zh"
8
+ },
9
+ "phoneme_type": "pinyin",
10
+ "num_symbols": 256,
11
+ "num_speakers": 1,
12
+ "inference": {
13
+ "noise_scale": 0.667,
14
+ "length_scale": 1.0,
15
+ "noise_w": 0.8
16
+ },
17
+ "phoneme_id_map": {
18
+ "_": [
19
+ 0
20
+ ],
21
+ "^": [
22
+ 1
23
+ ],
24
+ "$": [
25
+ 2
26
+ ],
27
+ "Ø": [
28
+ 3
29
+ ],
30
+ "b": [
31
+ 4
32
+ ],
33
+ "p": [
34
+ 5
35
+ ],
36
+ "m": [
37
+ 6
38
+ ],
39
+ "f": [
40
+ 7
41
+ ],
42
+ "d": [
43
+ 8
44
+ ],
45
+ "t": [
46
+ 9
47
+ ],
48
+ "n": [
49
+ 10
50
+ ],
51
+ "l": [
52
+ 11
53
+ ],
54
+ "g": [
55
+ 12
56
+ ],
57
+ "k": [
58
+ 13
59
+ ],
60
+ "h": [
61
+ 14
62
+ ],
63
+ "j": [
64
+ 15
65
+ ],
66
+ "q": [
67
+ 16
68
+ ],
69
+ "x": [
70
+ 17
71
+ ],
72
+ "zh": [
73
+ 18
74
+ ],
75
+ "ch": [
76
+ 19
77
+ ],
78
+ "sh": [
79
+ 20
80
+ ],
81
+ "r": [
82
+ 21
83
+ ],
84
+ "z": [
85
+ 22
86
+ ],
87
+ "c": [
88
+ 23
89
+ ],
90
+ "s": [
91
+ 24
92
+ ],
93
+ "y": [
94
+ 25
95
+ ],
96
+ "w": [
97
+ 26
98
+ ],
99
+ "a": [
100
+ 27
101
+ ],
102
+ "o": [
103
+ 28
104
+ ],
105
+ "e": [
106
+ 29
107
+ ],
108
+ "ai": [
109
+ 30
110
+ ],
111
+ "ei": [
112
+ 31
113
+ ],
114
+ "ao": [
115
+ 32
116
+ ],
117
+ "ou": [
118
+ 33
119
+ ],
120
+ "an": [
121
+ 34
122
+ ],
123
+ "en": [
124
+ 35
125
+ ],
126
+ "ang": [
127
+ 36
128
+ ],
129
+ "eng": [
130
+ 37
131
+ ],
132
+ "ong": [
133
+ 38
134
+ ],
135
+ "i": [
136
+ 39
137
+ ],
138
+ "ia": [
139
+ 40
140
+ ],
141
+ "ie": [
142
+ 41
143
+ ],
144
+ "iao": [
145
+ 42
146
+ ],
147
+ "iu": [
148
+ 43
149
+ ],
150
+ "ian": [
151
+ 44
152
+ ],
153
+ "in": [
154
+ 45
155
+ ],
156
+ "iang": [
157
+ 46
158
+ ],
159
+ "ing": [
160
+ 47
161
+ ],
162
+ "iong": [
163
+ 48
164
+ ],
165
+ "u": [
166
+ 49
167
+ ],
168
+ "ua": [
169
+ 50
170
+ ],
171
+ "uo": [
172
+ 51
173
+ ],
174
+ "uai": [
175
+ 52
176
+ ],
177
+ "ui": [
178
+ 53
179
+ ],
180
+ "uan": [
181
+ 54
182
+ ],
183
+ "un": [
184
+ 55
185
+ ],
186
+ "uang": [
187
+ 56
188
+ ],
189
+ "ueng": [
190
+ 57
191
+ ],
192
+ "v": [
193
+ 58
194
+ ],
195
+ "ve": [
196
+ 59
197
+ ],
198
+ "van": [
199
+ 60
200
+ ],
201
+ "vn": [
202
+ 61
203
+ ],
204
+ "er": [
205
+ 62
206
+ ],
207
+ "ue": [
208
+ 63
209
+ ],
210
+ "1": [
211
+ 64
212
+ ],
213
+ "2": [
214
+ 65
215
+ ],
216
+ "3": [
217
+ 66
218
+ ],
219
+ "4": [
220
+ 67
221
+ ],
222
+ "5": [
223
+ 68
224
+ ],
225
+ "。": [
226
+ 69
227
+ ],
228
+ ".": [
229
+ 69
230
+ ],
231
+ "?": [
232
+ 70
233
+ ],
234
+ "?": [
235
+ 70
236
+ ],
237
+ "!": [
238
+ 71
239
+ ],
240
+ "!": [
241
+ 71
242
+ ],
243
+ "—": [
244
+ 72
245
+ ],
246
+ "…": [
247
+ 72
248
+ ],
249
+ "、": [
250
+ 72
251
+ ],
252
+ ",": [
253
+ 72
254
+ ],
255
+ ",": [
256
+ 72
257
+ ],
258
+ ":": [
259
+ 72
260
+ ],
261
+ ":": [
262
+ 72
263
+ ],
264
+ ";": [
265
+ 72
266
+ ],
267
+ ";": [
268
+ 72
269
+ ],
270
+ " ": [
271
+ 72
272
+ ]
273
+ },
274
+ "speaker_id_map": {},
275
+ "hop_length": 256,
276
+ "piper_version": "1.3.0",
277
+ "language": {
278
+ "code": "zh_CN",
279
+ "family": "zh",
280
+ "region": "CN",
281
+ "name_native": "简体中文",
282
+ "name_english": "Chinese",
283
+ "country_english": "China"
284
+ },
285
+ "dataset": "xiao_ya"
286
+ }