Spaces:
Running
Running
fixes
Browse files- app.py +85 -5
- pyproject.toml +1 -0
- requirements.txt +1 -0
- uv.lock +23 -0
app.py
CHANGED
|
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
import re
|
| 6 |
-
from typing import Dict, List, Optional,
|
| 7 |
|
| 8 |
import gradio as gr
|
| 9 |
import numpy as np
|
|
@@ -108,6 +108,11 @@ class KittenTTS_1_Onnx:
|
|
| 108 |
providers=chosen_providers,
|
| 109 |
)
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
def _prepare_inputs(
|
| 112 |
self, text: str, voice: str, speed: float
|
| 113 |
) -> Dict[str, np.ndarray]:
|
|
@@ -131,6 +136,61 @@ class KittenTTS_1_Onnx:
|
|
| 131 |
|
| 132 |
return {"input_ids": input_ids, "style": style_vec, "speed": speed_arr}
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
def generate(
|
| 135 |
self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0
|
| 136 |
) -> np.ndarray:
|
|
@@ -174,10 +234,30 @@ class KittenTTS:
|
|
| 174 |
repo_id=repo_id, cache_dir=cache_dir, providers=providers
|
| 175 |
)
|
| 176 |
|
| 177 |
-
def generate(
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
def generate_to_file(
|
| 183 |
self,
|
|
|
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
import re
|
| 6 |
+
from typing import Dict, List, Tuple, Optional, Iterator
|
| 7 |
|
| 8 |
import gradio as gr
|
| 9 |
import numpy as np
|
|
|
|
| 108 |
providers=chosen_providers,
|
| 109 |
)
|
| 110 |
|
| 111 |
+
self.max_seq_len = self._infer_max_seq_len() or int(os.getenv("KITTEN_MAX_SEQ_LEN", "512"))
|
| 112 |
+
# reserve 2 slots for BOS/EOS tokens inserted below
|
| 113 |
+
self._chunk_budget = max(1, self.max_seq_len - 2)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
def _prepare_inputs(
|
| 117 |
self, text: str, voice: str, speed: float
|
| 118 |
) -> Dict[str, np.ndarray]:
|
|
|
|
| 136 |
|
| 137 |
return {"input_ids": input_ids, "style": style_vec, "speed": speed_arr}
|
| 138 |
|
| 139 |
+
def _infer_max_seq_len(self) -> Optional[int]:
|
| 140 |
+
"""Try to read positional-embedding length from the ONNX initializers.
|
| 141 |
+
Falls back to env var or 512 if unavailable. Optional dependency on 'onnx'.
|
| 142 |
+
"""
|
| 143 |
+
try:
|
| 144 |
+
import onnx # optional
|
| 145 |
+
except Exception:
|
| 146 |
+
return None
|
| 147 |
+
try:
|
| 148 |
+
model = onnx.load(self.model_path)
|
| 149 |
+
except Exception:
|
| 150 |
+
return None
|
| 151 |
+
|
| 152 |
+
for tensor in model.graph.initializer:
|
| 153 |
+
name = tensor.name.lower()
|
| 154 |
+
if "position" in name and len(tensor.dims) == 2:
|
| 155 |
+
# dims[0] = max positions, dims[1] = hidden dim
|
| 156 |
+
return int(tensor.dims[0])
|
| 157 |
+
return None
|
| 158 |
+
|
| 159 |
+
def _phonemize_to_clean(self, text: str) -> str:
|
| 160 |
+
"""Phonemize once and keep only characters present in the symbol set."""
|
| 161 |
+
phonemes = self._phonemizer.phonemize([text])[0]
|
| 162 |
+
token_str = " ".join(basic_english_tokenize(phonemes))
|
| 163 |
+
# keep only symbols known to the TextCleaner
|
| 164 |
+
return "".join(c for c in token_str if c in self._cleaner._dict)
|
| 165 |
+
|
| 166 |
+
def _run_onnx(self, token_ids: List[int], voice: str, speed: float) -> np.ndarray:
|
| 167 |
+
"""One inference call with trimming identical to original behavior."""
|
| 168 |
+
input_ids = np.asarray([token_ids], dtype=np.int64)
|
| 169 |
+
style_vec = self.voices[voice]
|
| 170 |
+
speed_arr = np.asarray([speed], dtype=np.float32)
|
| 171 |
+
outputs = self.session.run(None, {"input_ids": input_ids, "style": style_vec, "speed": speed_arr})
|
| 172 |
+
audio = np.asarray(outputs[0], dtype=np.float32)
|
| 173 |
+
if audio.size > 15000:
|
| 174 |
+
audio = audio[5000:-10000]
|
| 175 |
+
return audio
|
| 176 |
+
|
| 177 |
+
def _chunk_token_ids(self, clean: str) -> Iterator[List[int]]:
|
| 178 |
+
"""Yield BOS/segment/EOS token-id sequences within model capacity."""
|
| 179 |
+
n = len(clean)
|
| 180 |
+
i = 0
|
| 181 |
+
while i < n:
|
| 182 |
+
j = min(i + self._chunk_budget, n)
|
| 183 |
+
# prefer to cut at a space when possible, to keep phrasing natural
|
| 184 |
+
cut = clean.rfind(" ", i, j)
|
| 185 |
+
if cut != -1 and cut > i + int(0.6 * self._chunk_budget):
|
| 186 |
+
j = cut + 1 # include the space
|
| 187 |
+
seg = clean[i:j]
|
| 188 |
+
ids = self._cleaner(seg) # segment ids
|
| 189 |
+
ids.insert(0, 0) # BOS
|
| 190 |
+
ids.append(0) # EOS
|
| 191 |
+
yield ids
|
| 192 |
+
i = j
|
| 193 |
+
|
| 194 |
def generate(
|
| 195 |
self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0
|
| 196 |
) -> np.ndarray:
|
|
|
|
| 234 |
repo_id=repo_id, cache_dir=cache_dir, providers=providers
|
| 235 |
)
|
| 236 |
|
| 237 |
+
def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) -> np.ndarray:
|
| 238 |
+
"""Synthesize speech with automatic chunking at the model's max length."""
|
| 239 |
+
if voice not in self.available_voices:
|
| 240 |
+
raise ValueError(f"Voice '{voice}' not available. Choose from: {self.available_voices}")
|
| 241 |
+
|
| 242 |
+
# Phonemize once, then either run single-shot or chunked
|
| 243 |
+
clean = self._phonemize_to_clean(text)
|
| 244 |
+
|
| 245 |
+
# Fast path: fits in one pass
|
| 246 |
+
if len(clean) + 2 <= self.max_seq_len:
|
| 247 |
+
ids = self._cleaner(clean)
|
| 248 |
+
ids.insert(0, 0) # BOS
|
| 249 |
+
ids.append(0) # EOS
|
| 250 |
+
return self._run_onnx(ids, voice, speed)
|
| 251 |
+
|
| 252 |
+
# Chunked path: concatenate per-chunk audio
|
| 253 |
+
pieces: List[np.ndarray] = []
|
| 254 |
+
for ids in self._chunk_token_ids(clean):
|
| 255 |
+
pieces.append(self._run_onnx(ids, voice, speed))
|
| 256 |
+
|
| 257 |
+
if not pieces:
|
| 258 |
+
return np.array([], dtype=np.float32)
|
| 259 |
+
return pieces[0] if len(pieces) == 1 else np.concatenate(pieces)
|
| 260 |
+
|
| 261 |
|
| 262 |
def generate_to_file(
|
| 263 |
self,
|
pyproject.toml
CHANGED
|
@@ -8,6 +8,7 @@ dependencies = [
|
|
| 8 |
"gradio>=5.43.1",
|
| 9 |
"huggingface-hub[hf-xet]>=0.34.4",
|
| 10 |
"numpy>=2.3.2",
|
|
|
|
| 11 |
"onnxruntime>=1.22.1",
|
| 12 |
"phonemizer>=3.3.0",
|
| 13 |
"soundfile>=0.13.1",
|
|
|
|
| 8 |
"gradio>=5.43.1",
|
| 9 |
"huggingface-hub[hf-xet]>=0.34.4",
|
| 10 |
"numpy>=2.3.2",
|
| 11 |
+
"onnx>=1.18.0",
|
| 12 |
"onnxruntime>=1.22.1",
|
| 13 |
"phonemizer>=3.3.0",
|
| 14 |
"soundfile>=0.13.1",
|
requirements.txt
CHANGED
|
@@ -39,6 +39,7 @@ markupsafe==3.0.2
|
|
| 39 |
mdurl==0.1.2
|
| 40 |
mpmath==1.3.0
|
| 41 |
numpy==2.3.2
|
|
|
|
| 42 |
onnxruntime==1.22.1
|
| 43 |
orjson==3.11.2
|
| 44 |
packaging==25.0
|
|
|
|
| 39 |
mdurl==0.1.2
|
| 40 |
mpmath==1.3.0
|
| 41 |
numpy==2.3.2
|
| 42 |
+
onnx==1.18.0
|
| 43 |
onnxruntime==1.22.1
|
| 44 |
orjson==3.11.2
|
| 45 |
packaging==25.0
|
uv.lock
CHANGED
|
@@ -532,6 +532,7 @@ dependencies = [
|
|
| 532 |
{ name = "gradio" },
|
| 533 |
{ name = "huggingface-hub", extra = ["hf-xet"] },
|
| 534 |
{ name = "numpy" },
|
|
|
|
| 535 |
{ name = "onnxruntime" },
|
| 536 |
{ name = "phonemizer" },
|
| 537 |
{ name = "soundfile" },
|
|
@@ -542,6 +543,7 @@ requires-dist = [
|
|
| 542 |
{ name = "gradio", specifier = ">=5.43.1" },
|
| 543 |
{ name = "huggingface-hub", extras = ["hf-xet"], specifier = ">=0.34.4" },
|
| 544 |
{ name = "numpy", specifier = ">=2.3.2" },
|
|
|
|
| 545 |
{ name = "onnxruntime", specifier = ">=1.22.1" },
|
| 546 |
{ name = "phonemizer", specifier = ">=3.3.0" },
|
| 547 |
{ name = "soundfile", specifier = ">=0.13.1" },
|
|
@@ -666,6 +668,27 @@ wheels = [
|
|
| 666 |
{ url = "https://files.pythonhosted.org/packages/c1/9e/1652778bce745a67b5fe05adde60ed362d38eb17d919a540e813d30f6874/numpy-2.3.2-cp314-cp314t-win_arm64.whl", hash = "sha256:092aeb3449833ea9c0bf0089d70c29ae480685dd2377ec9cdbbb620257f84631", size = 10544226, upload-time = "2025-07-24T20:56:34.509Z" },
|
| 667 |
]
|
| 668 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 669 |
[[package]]
|
| 670 |
name = "onnxruntime"
|
| 671 |
version = "1.22.1"
|
|
|
|
| 532 |
{ name = "gradio" },
|
| 533 |
{ name = "huggingface-hub", extra = ["hf-xet"] },
|
| 534 |
{ name = "numpy" },
|
| 535 |
+
{ name = "onnx" },
|
| 536 |
{ name = "onnxruntime" },
|
| 537 |
{ name = "phonemizer" },
|
| 538 |
{ name = "soundfile" },
|
|
|
|
| 543 |
{ name = "gradio", specifier = ">=5.43.1" },
|
| 544 |
{ name = "huggingface-hub", extras = ["hf-xet"], specifier = ">=0.34.4" },
|
| 545 |
{ name = "numpy", specifier = ">=2.3.2" },
|
| 546 |
+
{ name = "onnx", specifier = ">=1.18.0" },
|
| 547 |
{ name = "onnxruntime", specifier = ">=1.22.1" },
|
| 548 |
{ name = "phonemizer", specifier = ">=3.3.0" },
|
| 549 |
{ name = "soundfile", specifier = ">=0.13.1" },
|
|
|
|
| 668 |
{ url = "https://files.pythonhosted.org/packages/c1/9e/1652778bce745a67b5fe05adde60ed362d38eb17d919a540e813d30f6874/numpy-2.3.2-cp314-cp314t-win_arm64.whl", hash = "sha256:092aeb3449833ea9c0bf0089d70c29ae480685dd2377ec9cdbbb620257f84631", size = 10544226, upload-time = "2025-07-24T20:56:34.509Z" },
|
| 669 |
]
|
| 670 |
|
| 671 |
+
[[package]]
|
| 672 |
+
name = "onnx"
|
| 673 |
+
version = "1.18.0"
|
| 674 |
+
source = { registry = "https://pypi.org/simple" }
|
| 675 |
+
dependencies = [
|
| 676 |
+
{ name = "numpy" },
|
| 677 |
+
{ name = "protobuf" },
|
| 678 |
+
{ name = "typing-extensions" },
|
| 679 |
+
]
|
| 680 |
+
sdist = { url = "https://files.pythonhosted.org/packages/3d/60/e56e8ec44ed34006e6d4a73c92a04d9eea6163cc12440e35045aec069175/onnx-1.18.0.tar.gz", hash = "sha256:3d8dbf9e996629131ba3aa1afd1d8239b660d1f830c6688dd7e03157cccd6b9c", size = 12563009, upload-time = "2025-05-12T22:03:09.626Z" }
|
| 681 |
+
wheels = [
|
| 682 |
+
{ url = "https://files.pythonhosted.org/packages/45/da/9fb8824513fae836239276870bfcc433fa2298d34ed282c3a47d3962561b/onnx-1.18.0-cp313-cp313-macosx_12_0_universal2.whl", hash = "sha256:030d9f5f878c5f4c0ff70a4545b90d7812cd6bfe511de2f3e469d3669c8cff95", size = 18285906, upload-time = "2025-05-12T22:02:45.01Z" },
|
| 683 |
+
{ url = "https://files.pythonhosted.org/packages/05/e8/762b5fb5ed1a2b8e9a4bc5e668c82723b1b789c23b74e6b5a3356731ae4e/onnx-1.18.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8521544987d713941ee1e591520044d35e702f73dc87e91e6d4b15a064ae813d", size = 17421486, upload-time = "2025-05-12T22:02:48.467Z" },
|
| 684 |
+
{ url = "https://files.pythonhosted.org/packages/12/bb/471da68df0364f22296456c7f6becebe0a3da1ba435cdb371099f516da6e/onnx-1.18.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c137eecf6bc618c2f9398bcc381474b55c817237992b169dfe728e169549e8f", size = 17583581, upload-time = "2025-05-12T22:02:51.784Z" },
|
| 685 |
+
{ url = "https://files.pythonhosted.org/packages/76/0d/01a95edc2cef6ad916e04e8e1267a9286f15b55c90cce5d3cdeb359d75d6/onnx-1.18.0-cp313-cp313-win32.whl", hash = "sha256:6c093ffc593e07f7e33862824eab9225f86aa189c048dd43ffde207d7041a55f", size = 15734621, upload-time = "2025-05-12T22:02:54.62Z" },
|
| 686 |
+
{ url = "https://files.pythonhosted.org/packages/64/95/253451a751be32b6173a648b68f407188009afa45cd6388780c330ff5d5d/onnx-1.18.0-cp313-cp313-win_amd64.whl", hash = "sha256:230b0fb615e5b798dc4a3718999ec1828360bc71274abd14f915135eab0255f1", size = 15850472, upload-time = "2025-05-12T22:02:57.54Z" },
|
| 687 |
+
{ url = "https://files.pythonhosted.org/packages/0a/b1/6fd41b026836df480a21687076e0f559bc3ceeac90f2be8c64b4a7a1f332/onnx-1.18.0-cp313-cp313-win_arm64.whl", hash = "sha256:6f91930c1a284135db0f891695a263fc876466bf2afbd2215834ac08f600cfca", size = 15823808, upload-time = "2025-05-12T22:03:00.305Z" },
|
| 688 |
+
{ url = "https://files.pythonhosted.org/packages/70/f3/499e53dd41fa7302f914dd18543da01e0786a58b9a9d347497231192001f/onnx-1.18.0-cp313-cp313t-macosx_12_0_universal2.whl", hash = "sha256:2f4d37b0b5c96a873887652d1cbf3f3c70821b8c66302d84b0f0d89dd6e47653", size = 18316526, upload-time = "2025-05-12T22:03:03.691Z" },
|
| 689 |
+
{ url = "https://files.pythonhosted.org/packages/84/dd/6abe5d7bd23f5ed3ade8352abf30dff1c7a9e97fc1b0a17b5d7c726e98a9/onnx-1.18.0-cp313-cp313t-win_amd64.whl", hash = "sha256:a69afd0baa372162948b52c13f3aa2730123381edf926d7ef3f68ca7cec6d0d0", size = 15865055, upload-time = "2025-05-12T22:03:06.663Z" },
|
| 690 |
+
]
|
| 691 |
+
|
| 692 |
[[package]]
|
| 693 |
name = "onnxruntime"
|
| 694 |
version = "1.22.1"
|