Hard-split oversize chunks; lower max_len below vector_estimator's 1000-token cap
Browse files
app.py
CHANGED
|
@@ -348,6 +348,28 @@ def load_voice_style(paths: List[str]) -> Style:
|
|
| 348 |
# ============================================================
|
| 349 |
# TextToSpeech core (slim pipeline)
|
| 350 |
# ============================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
def chunk_text(text: str, max_len: int = 300) -> List[str]:
|
| 352 |
pattern = (
|
| 353 |
r"(?<!Mr\.)(?<!Mrs\.)(?<!Ms\.)(?<!Dr\.)(?<!Prof\.)(?<!Sr\.)(?<!Jr\.)"
|
|
@@ -364,13 +386,22 @@ def chunk_text(text: str, max_len: int = 300) -> List[str]:
|
|
| 364 |
for sentence in re.split(pattern, paragraph):
|
| 365 |
if len(current) + len(sentence) + 1 <= max_len:
|
| 366 |
current += (" " if current else "") + sentence
|
| 367 |
-
|
| 368 |
if current:
|
| 369 |
chunks.append(current.strip())
|
| 370 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
if current:
|
| 372 |
chunks.append(current.strip())
|
| 373 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
|
| 375 |
|
| 376 |
class BlueTTS:
|
|
@@ -522,7 +553,10 @@ class BlueTTS:
|
|
| 522 |
assert style.ttl.shape[0] == 1, "single-text mode needs a single style"
|
| 523 |
if phonemize:
|
| 524 |
text = self.g2p.phonemize(text, lang=lang)
|
| 525 |
-
|
|
|
|
|
|
|
|
|
|
| 526 |
chunks = chunk_text(text, max_len=max_len)
|
| 527 |
wav_cat: Optional[np.ndarray] = None
|
| 528 |
for chunk in chunks:
|
|
|
|
| 348 |
# ============================================================
|
| 349 |
# TextToSpeech core (slim pipeline)
|
| 350 |
# ============================================================
|
| 351 |
+
def _hard_split(s: str, max_len: int) -> List[str]:
|
| 352 |
+
"""Split ``s`` into pieces of at most ``max_len`` chars, preferring spaces."""
|
| 353 |
+
s = s.strip()
|
| 354 |
+
if len(s) <= max_len:
|
| 355 |
+
return [s] if s else []
|
| 356 |
+
out: List[str] = []
|
| 357 |
+
i, n = 0, len(s)
|
| 358 |
+
while i < n:
|
| 359 |
+
j = min(i + max_len, n)
|
| 360 |
+
if j < n:
|
| 361 |
+
cut = s.rfind(" ", i, j)
|
| 362 |
+
if cut > i + max_len // 4:
|
| 363 |
+
j = cut
|
| 364 |
+
piece = s[i:j].strip()
|
| 365 |
+
if piece:
|
| 366 |
+
out.append(piece)
|
| 367 |
+
i = j
|
| 368 |
+
while i < n and s[i] == " ":
|
| 369 |
+
i += 1
|
| 370 |
+
return out
|
| 371 |
+
|
| 372 |
+
|
| 373 |
def chunk_text(text: str, max_len: int = 300) -> List[str]:
|
| 374 |
pattern = (
|
| 375 |
r"(?<!Mr\.)(?<!Mrs\.)(?<!Ms\.)(?<!Dr\.)(?<!Prof\.)(?<!Sr\.)(?<!Jr\.)"
|
|
|
|
| 386 |
for sentence in re.split(pattern, paragraph):
|
| 387 |
if len(current) + len(sentence) + 1 <= max_len:
|
| 388 |
current += (" " if current else "") + sentence
|
| 389 |
+
else:
|
| 390 |
if current:
|
| 391 |
chunks.append(current.strip())
|
| 392 |
+
current = ""
|
| 393 |
+
if len(sentence) > max_len:
|
| 394 |
+
chunks.extend(_hard_split(sentence, max_len))
|
| 395 |
+
else:
|
| 396 |
+
current = sentence
|
| 397 |
if current:
|
| 398 |
chunks.append(current.strip())
|
| 399 |
+
base = chunks if chunks else [text.strip()]
|
| 400 |
+
# Defensive: guarantee nothing exceeds max_len (e.g. phonemization can blow up).
|
| 401 |
+
out: List[str] = []
|
| 402 |
+
for c in base:
|
| 403 |
+
out.extend(_hard_split(c, max_len))
|
| 404 |
+
return out
|
| 405 |
|
| 406 |
|
| 407 |
class BlueTTS:
|
|
|
|
| 553 |
assert style.ttl.shape[0] == 1, "single-text mode needs a single style"
|
| 554 |
if phonemize:
|
| 555 |
text = self.g2p.phonemize(text, lang=lang)
|
| 556 |
+
# vector_estimator.onnx was exported with a ~1000-token positional buffer;
|
| 557 |
+
# phonemization can ~3x char counts for some languages (Hebrew especially),
|
| 558 |
+
# so keep the synth chunk well below that.
|
| 559 |
+
max_len = 120 if lang == "ko" else 250
|
| 560 |
chunks = chunk_text(text, max_len=max_len)
|
| 561 |
wav_cat: Optional[np.ndarray] = None
|
| 562 |
for chunk in chunks:
|