wolfofbackstreet committed
Commit 0cc6d2d · 1 Parent(s): f25a2ab
Files changed (4)
  1. .gitignore +3 -0
  2. Dockerfile +32 -0
  3. app.py +279 -0
  4. requirements.txt +67 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+ .idea/
+ .venv/
+ zh_en_melotts/
Dockerfile ADDED
@@ -0,0 +1,32 @@
+ FROM python:3.11
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ # Set home to the user's home directory and configure Gradio for Spaces
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH \
+     PYTHONPATH=$HOME/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app, owned by the user
+ COPY --chown=user . $HOME/app
+
+ EXPOSE 7860
+
+ CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,279 @@
+ import inspect
+ import os
+ from pathlib import Path
+ from typing import Any, Callable, Iterable, List, Tuple, get_type_hints
+
+ import gradio as gr
+ import jieba3
+ import numpy as np
+ import onnxruntime as ort
+ import soundfile as sf  # only needed for the optional sf.write() debug line below
+ import torch
+ from huggingface_hub import snapshot_download
+
+ model_path = "zh_en_melotts"
+ # Define the local directory where you want to save the files
+ local_folder_path = Path(model_path)
+
+ # Create the directory if it doesn't exist
+ os.makedirs(local_folder_path, exist_ok=True)
+
+ # Download the repository snapshot to the specified local folder
+ snapshot_download(
+     repo_id="wolfofbackstreet/melotts_chinese_mix_english_onnx",
+     local_dir=local_folder_path,
+     local_dir_use_symlinks=False,  # recommended to avoid symlinks for portable files
+ )
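+ # Note: local_dir_use_symlinks is deprecated in recent huggingface_hub
+ # releases (downloads into local_dir stopped using symlinks by default), so
+ # with the pinned huggingface-hub==0.30.2 the flag mostly documents intent.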
+
+
+ def parse_docstring(func):
+     doc = inspect.getdoc(func)
+     if not doc:
+         return {"title": "Untitled", "description": ""}
+
+     lines = doc.splitlines()
+     title = next((line.replace("Title:", "").strip() for line in lines if line.startswith("Title:")), "Untitled")
+     description = "\n".join(line.strip() for line in lines if line.startswith("Description:"))
+     description = description.replace("Description:", "").strip()
+
+     return {"title": title, "description": description}
+
+
+ def gradio_app_with_docs(func: Callable) -> Callable:
+     sig = inspect.signature(func)
+     type_hints = get_type_hints(func)
+     metadata = parse_docstring(func)  # title/description parsed from the docstring
+
+     def _map_type(t: type) -> "gr.Component":
+         if t == str:
+             return gr.Textbox(label="Input")
+         elif t == int:
+             return gr.Number(precision=0)
+         elif t == float:
+             return gr.Number()
+         elif t == bool:
+             return gr.Checkbox()
+         elif hasattr(t, "__origin__") and t.__origin__ == list:
+             elem_type = getattr(t, "__args__", (Any,))[0]
+             if elem_type == str:
+                 return gr.Dropdown(choices=["Option1", "Option2"])
+             else:
+                 raise ValueError(f"Unsupported list element type: {elem_type}")
+         elif getattr(t, "__origin__", None) == tuple:
+             args = getattr(t, "__args__", ())
+             if len(args) == 2:
+                 first_type, second_type = args
+
+                 # Handle (int, np.ndarray) -- common in TTS for (sample_rate, waveform)
+                 try:
+                     if issubclass(first_type, int) and (
+                         hasattr(second_type, "__module__") and second_type.__module__ == "numpy"
+                     ):
+                         return gr.Audio(label="Output", type="numpy")
+                 except TypeError:
+                     pass
+
+         raise ValueError(f"Unsupported type: {t}")
+
+     # Build inputs
+     inputs = []
+     for name, param in sig.parameters.items():
+         if name == "self":
+             continue
+         param_type = type_hints.get(name, Any)
+         component = _map_type(param_type)
+         component.label = name.replace("_", " ").title()
+         inputs.append(component)
+
+     # Build outputs
+     return_type = type_hints.get("return", Any)
+     outputs = _map_type(return_type)
+
+     # Wrap with a Gradio interface
+     with gr.Blocks() as demo:
+         gr.Markdown(f"## {metadata['title']}\n{metadata['description']}")
+         gr.Interface(fn=func, inputs=inputs, outputs=outputs)
+
+     def wrapper(*args, **kwargs):
+         return func(*args, **kwargs)
+
+     wrapper.launch = lambda: demo.launch()
+     return wrapper
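+ # With this mapping, a `str` parameter becomes a Textbox and a
+ # `tuple[int, np.ndarray]` return value becomes an Audio component, so the
+ # tts() function below needs no manual UI wiring. A hypothetical
+ # `def echo(text: str) -> str` would likewise get Textboxes on both sides.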
+
+
+ class Lexicon:
+     def __init__(self, lexicon_filename: str, tokens_filename: str):
+         tokens = dict()
+         with open(tokens_filename, encoding="utf-8") as f:
+             for line in f:
+                 s, i = line.split()
+                 tokens[s] = int(i)
+
+         lexicon = dict()
+         with open(lexicon_filename, encoding="utf-8") as f:
+             for line in f:
+                 splits = line.split()
+                 word_or_phrase = splits[0]
+                 phone_tone_list = splits[1:]
+                 # Each entry holds an equal number of phones and tones
+                 assert len(phone_tone_list) % 2 == 0, len(phone_tone_list)
+                 phones = phone_tone_list[: len(phone_tone_list) // 2]
+                 phones = [tokens[p] for p in phones]
+
+                 tones = phone_tone_list[len(phone_tone_list) // 2 :]
+                 tones = [int(t) for t in tones]
+
+                 lexicon[word_or_phrase] = (phones, tones)
+
+         lexicon["呣"] = lexicon["母"]
+         lexicon["嗯"] = lexicon["恩"]
+         self.lexicon = lexicon
+
+         punctuation = ["!", "?", "…", ",", ".", "'", "-"]
+         for p in punctuation:
+             i = tokens[p]
+             tone = 0
+             self.lexicon[p] = ([i], [tone])
+         self.lexicon[" "] = ([tokens["_"]], [0])
+
+     def _convert(self, text: str) -> Tuple[List[int], List[int]]:
+         phones = []
+         tones = []
+
+         # Map full-width Chinese punctuation to its ASCII lexicon entry
+         if text == ",":
+             text = ","
+         elif text == "。":
+             text = "."
+         elif text == "!":
+             text = "!"
+         elif text == "?":
+             text = "?"
+
+         if text not in self.lexicon:
+             print("t", text)
+             if len(text) > 1:
+                 # Unknown phrase: fall back to converting it character by character
+                 for w in text:
+                     print("w", w)
+                     p, t = self.convert(w)
+                     if p:
+                         phones += p
+                         tones += t
+                 return phones, tones
+             # Unknown single character: skip it instead of raising a KeyError
+             return phones, tones
+
+         phones, tones = self.lexicon[text]
+         return phones, tones
+
+     def convert(self, text_list: Iterable[str]) -> Tuple[List[int], List[int]]:
+         phones = []
+         tones = []
+         for text in text_list:
+             print(text)
+             p, t = self._convert(text)
+             phones += p
+             tones += t
+         return phones, tones
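+ # Expected file formats (as parsed above): tokens.txt holds one "symbol id"
+ # pair per line; each lexicon.txt line is "word p1 ... pN t1 ... tN", i.e.
+ # N phones followed by N tones. An illustrative (made-up) entry:
+ # "hello h e l ow 0 0 0 0".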
+
+
+ class OnnxModel:
+     def __init__(self, filename):
+         session_opts = ort.SessionOptions()
+         session_opts.inter_op_num_threads = 1
+         session_opts.intra_op_num_threads = 4
+
+         self.session_opts = session_opts
+         self.model = ort.InferenceSession(
+             filename,
+             sess_options=self.session_opts,
+             providers=["CPUExecutionProvider"],
+         )
+         # Model hyperparameters are stored in the ONNX custom metadata map
+         meta = self.model.get_modelmeta().custom_metadata_map
+         self.bert_dim = int(meta["bert_dim"])
+         self.ja_bert_dim = int(meta["ja_bert_dim"])
+         self.add_blank = int(meta["add_blank"])
+         self.sample_rate = int(meta["sample_rate"])
+         self.speaker_id = int(meta["speaker_id"])
+         self.lang_id = int(meta["lang_id"])
+
+     def __call__(self, x, tones):
+         """
+         Args:
+           x: 1-D int64 torch tensor
+           tones: 1-D int64 torch tensor
+         """
+         x = x.unsqueeze(0)
+         tones = tones.unsqueeze(0)
+
+         print(x.shape, tones.shape)
+         sid = torch.tensor([self.speaker_id], dtype=torch.int64)
+         noise_scale = torch.tensor([0.6], dtype=torch.float32)
+         length_scale = torch.tensor([1.0], dtype=torch.float32)
+         noise_scale_w = torch.tensor([0.8], dtype=torch.float32)
+
+         x_lengths = torch.tensor([x.shape[-1]], dtype=torch.int64)
+
+         y = self.model.run(
+             ["y"],
+             {
+                 "x": x.numpy(),
+                 "x_lengths": x_lengths.numpy(),
+                 "tones": tones.numpy(),
+                 "sid": sid.numpy(),
+                 "noise_scale": noise_scale.numpy(),
+                 "noise_scale_w": noise_scale_w.numpy(),
+                 "length_scale": length_scale.numpy(),
+             },
+         )[0][0][0]
+         return y
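+ # Note: torch is only used here to pack int64/float32 arrays before the
+ # session call; plain np.array(...) inputs would work just as well, since
+ # InferenceSession.run() takes and returns numpy. The [0][0][0] indexing
+ # squeezes the batch/channel dimensions off the output, leaving the 1-D
+ # waveform.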
+
+
+ model = OnnxModel(local_folder_path / "model.onnx")
+ lexicon = Lexicon(
+     lexicon_filename=local_folder_path / "lexicon.txt",
+     tokens_filename=local_folder_path / "tokens.txt",
+ )
+
+
+ @gradio_app_with_docs
+ def tts(text: str) -> tuple[int, np.ndarray]:
+     """
+     Title: MeloTTS Onnx on CPU
+     Description: A simple app to test MeloTTS Chinese Mix English on CPU.
+     Args:
+         text (str): The text to synthesize.
+     Returns:
+         tuple[int, np.ndarray]: The sample rate and the synthesized waveform.
+     """
+
+     text = text.lower()  # this step is crucial for splitting words correctly
+     words = jieba3.jieba3(use_hmm=True).cut_text(text)
+     phones, tones = lexicon.convert(words)
+     if model.add_blank:
+         # Interleave a blank token (id 0) between all phones/tones
+         new_phones = [0] * (2 * len(phones) + 1)
+         new_tones = [0] * (2 * len(tones) + 1)
+
+         new_phones[1::2] = phones
+         new_tones[1::2] = tones
+
+         phones = new_phones
+         tones = new_tones
+
+     phones = torch.tensor(phones, dtype=torch.int64)
+     tones = torch.tensor(tones, dtype=torch.int64)
+
+     print(phones.shape, tones.shape)
+
+     y = model(x=phones, tones=tones)
+     # sf.write(local_folder_path / "test.wav", y, model.sample_rate)
+
+     return (model.sample_rate, y)
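+ # Worked example of the add_blank interleaving above: phones [p1, p2] becomes
+ # [0, p1, 0, p2, 0], i.e. a blank token before, between, and after every
+ # symbol, which is presumably what the exported model expects when its
+ # add_blank metadata flag is set.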
+
+
+ if __name__ == "__main__":
+     # Launch the Gradio app
+     tts.launch()
requirements.txt ADDED
@@ -0,0 +1,67 @@
+ aiofiles==24.1.0
+ annotated-types==0.7.0
+ anyio==4.9.0
+ certifi==2025.4.26
+ cffi==1.17.1
+ charset-normalizer==3.4.1
+ click==8.1.8
+ coloredlogs==15.0.1
+ fastapi==0.115.12
+ ffmpy==0.5.0
+ filelock==3.18.0
+ flatbuffers==25.2.10
+ fsspec==2025.3.2
+ gradio==5.28.0
+ gradio_client==1.10.0
+ groovy==0.1.2
+ h11==0.16.0
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==0.30.2
+ humanfriendly==10.0
+ idna==3.10
+ jieba3==1.0.2
+ Jinja2==3.1.6
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ mdurl==0.1.2
+ mpmath==1.3.0
+ networkx==3.4.2
+ numpy==2.2.5
+ onnx==1.17.0
+ onnxruntime==1.21.1
+ orjson==3.10.18
+ packaging==25.0
+ pandas==2.2.3
+ pillow==11.2.1
+ protobuf==6.30.2
+ pycparser==2.22
+ pydantic==2.11.4
+ pydantic_core==2.33.2
+ pydub==0.25.1
+ Pygments==2.19.1
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.20
+ pytz==2025.2
+ PyYAML==6.0.2
+ requests==2.32.3
+ rich==14.0.0
+ ruff==0.11.7
+ safehttpx==0.1.6
+ semantic-version==2.10.0
+ shellingham==1.5.4
+ six==1.17.0
+ sniffio==1.3.1
+ soundfile==0.13.1
+ starlette==0.46.2
+ sympy==1.14.0
+ tomlkit==0.13.2
+ torch==2.7.0
+ tqdm==4.67.1
+ typer==0.15.3
+ typing-inspection==0.4.0
+ typing_extensions==4.13.2
+ tzdata==2025.2
+ urllib3==2.4.0
+ uvicorn==0.34.2
+ websockets==15.0.1