BlueV2

Running

App Files Files Community

notmax123 committed on Apr 24

Commit

fafbe00

1 Parent(s): 11dd574

Hard-split oversize chunks; lower max_len below vector_estimator's 1000-token cap

Browse files

Files changed (1) hide show

app.py +38 -4

app.py CHANGED Viewed

@@ -348,6 +348,28 @@ def load_voice_style(paths: List[str]) -> Style:
 # ============================================================
 # TextToSpeech core (slim pipeline)
 # ============================================================
 def chunk_text(text: str, max_len: int = 300) -> List[str]:
     pattern = (
         r"(?<!Mr\.)(?<!Mrs\.)(?<!Ms\.)(?<!Dr\.)(?<!Prof\.)(?<!Sr\.)(?<!Jr\.)"
@@ -364,13 +386,22 @@ def chunk_text(text: str, max_len: int = 300) -> List[str]:
         for sentence in re.split(pattern, paragraph):
             if len(current) + len(sentence) + 1 <= max_len:
                 current += (" " if current else "") + sentence
-        else:
                 if current:
                     chunks.append(current.strip())
-                current = sentence
         if current:
             chunks.append(current.strip())
-    return chunks if chunks else [text.strip()]
 class BlueTTS:
@@ -522,7 +553,10 @@ class BlueTTS:
         assert style.ttl.shape[0] == 1, "single-text mode needs a single style"
         if phonemize:
             text = self.g2p.phonemize(text, lang=lang)
-        max_len = 120 if lang == "ko" else 300
         chunks = chunk_text(text, max_len=max_len)
         wav_cat: Optional[np.ndarray] = None
         for chunk in chunks:

 # ============================================================
 # TextToSpeech core (slim pipeline)
 # ============================================================
+def _hard_split(s: str, max_len: int) -> List[str]:
+    """Split ``s`` into pieces of at most ``max_len`` chars, preferring spaces."""
+    s = s.strip()
+    if len(s) <= max_len:
+        return [s] if s else []
+    out: List[str] = []
+    i, n = 0, len(s)
+    while i < n:
+        j = min(i + max_len, n)
+        if j < n:
+            cut = s.rfind(" ", i, j)
+            if cut > i + max_len // 4:
+                j = cut
+        piece = s[i:j].strip()
+        if piece:
+            out.append(piece)
+        i = j
+        while i < n and s[i] == " ":
+            i += 1
+    return out
 def chunk_text(text: str, max_len: int = 300) -> List[str]:
     pattern = (
         r"(?<!Mr\.)(?<!Mrs\.)(?<!Ms\.)(?<!Dr\.)(?<!Prof\.)(?<!Sr\.)(?<!Jr\.)"
         for sentence in re.split(pattern, paragraph):
             if len(current) + len(sentence) + 1 <= max_len:
                 current += (" " if current else "") + sentence
+            else:
                 if current:
                     chunks.append(current.strip())
+                    current = ""
+                if len(sentence) > max_len:
+                    chunks.extend(_hard_split(sentence, max_len))
+                else:
+                    current = sentence
         if current:
             chunks.append(current.strip())
+    base = chunks if chunks else [text.strip()]
+    # Defensive: guarantee nothing exceeds max_len (e.g. phonemization can blow up).
+    out: List[str] = []
+    for c in base:
+        out.extend(_hard_split(c, max_len))
+    return out
 class BlueTTS:
         assert style.ttl.shape[0] == 1, "single-text mode needs a single style"
         if phonemize:
             text = self.g2p.phonemize(text, lang=lang)
+        # vector_estimator.onnx was exported with a ~1000-token positional buffer;
+        # phonemization can ~3x char counts for some languages (Hebrew especially),
+        # so keep the synth chunk well below that.
+        max_len = 120 if lang == "ko" else 250
         chunks = chunk_text(text, max_len=max_len)
         wav_cat: Optional[np.ndarray] = None
         for chunk in chunks: