notmax123 committed on
Commit
fafbe00
·
1 Parent(s): 11dd574

Hard-split oversize chunks; lower max_len below vector_estimator's 1000-token cap

Browse files
Files changed (1) hide show
  1. app.py +38 -4
app.py CHANGED
@@ -348,6 +348,28 @@ def load_voice_style(paths: List[str]) -> Style:
348
  # ============================================================
349
  # TextToSpeech core (slim pipeline)
350
  # ============================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  def chunk_text(text: str, max_len: int = 300) -> List[str]:
352
  pattern = (
353
  r"(?<!Mr\.)(?<!Mrs\.)(?<!Ms\.)(?<!Dr\.)(?<!Prof\.)(?<!Sr\.)(?<!Jr\.)"
@@ -364,13 +386,22 @@ def chunk_text(text: str, max_len: int = 300) -> List[str]:
364
  for sentence in re.split(pattern, paragraph):
365
  if len(current) + len(sentence) + 1 <= max_len:
366
  current += (" " if current else "") + sentence
367
- else:
368
  if current:
369
  chunks.append(current.strip())
370
- current = sentence
 
 
 
 
371
  if current:
372
  chunks.append(current.strip())
373
- return chunks if chunks else [text.strip()]
 
 
 
 
 
374
 
375
 
376
  class BlueTTS:
@@ -522,7 +553,10 @@ class BlueTTS:
522
  assert style.ttl.shape[0] == 1, "single-text mode needs a single style"
523
  if phonemize:
524
  text = self.g2p.phonemize(text, lang=lang)
525
- max_len = 120 if lang == "ko" else 300
 
 
 
526
  chunks = chunk_text(text, max_len=max_len)
527
  wav_cat: Optional[np.ndarray] = None
528
  for chunk in chunks:
 
348
  # ============================================================
349
  # TextToSpeech core (slim pipeline)
350
  # ============================================================
351
+ def _hard_split(s: str, max_len: int) -> List[str]:
352
+ """Split ``s`` into pieces of at most ``max_len`` chars, preferring spaces."""
353
+ s = s.strip()
354
+ if len(s) <= max_len:
355
+ return [s] if s else []
356
+ out: List[str] = []
357
+ i, n = 0, len(s)
358
+ while i < n:
359
+ j = min(i + max_len, n)
360
+ if j < n:
361
+ cut = s.rfind(" ", i, j)
362
+ if cut > i + max_len // 4:
363
+ j = cut
364
+ piece = s[i:j].strip()
365
+ if piece:
366
+ out.append(piece)
367
+ i = j
368
+ while i < n and s[i] == " ":
369
+ i += 1
370
+ return out
371
+
372
+
373
  def chunk_text(text: str, max_len: int = 300) -> List[str]:
374
  pattern = (
375
  r"(?<!Mr\.)(?<!Mrs\.)(?<!Ms\.)(?<!Dr\.)(?<!Prof\.)(?<!Sr\.)(?<!Jr\.)"
 
386
  for sentence in re.split(pattern, paragraph):
387
  if len(current) + len(sentence) + 1 <= max_len:
388
  current += (" " if current else "") + sentence
389
+ else:
390
  if current:
391
  chunks.append(current.strip())
392
+ current = ""
393
+ if len(sentence) > max_len:
394
+ chunks.extend(_hard_split(sentence, max_len))
395
+ else:
396
+ current = sentence
397
  if current:
398
  chunks.append(current.strip())
399
+ base = chunks if chunks else [text.strip()]
400
+ # Defensive: guarantee nothing exceeds max_len (e.g. phonemization can blow up).
401
+ out: List[str] = []
402
+ for c in base:
403
+ out.extend(_hard_split(c, max_len))
404
+ return out
405
 
406
 
407
  class BlueTTS:
 
553
  assert style.ttl.shape[0] == 1, "single-text mode needs a single style"
554
  if phonemize:
555
  text = self.g2p.phonemize(text, lang=lang)
556
+ # vector_estimator.onnx was exported with a ~1000-token positional buffer;
557
+ # phonemization can ~3x char counts for some languages (Hebrew especially),
558
+ # so keep the synth chunk well below that.
559
+ max_len = 120 if lang == "ko" else 250
560
  chunks = chunk_text(text, max_len=max_len)
561
  wav_cat: Optional[np.ndarray] = None
562
  for chunk in chunks: