Blue

Runtime error

notmax123 committed on Apr 14

Commit

5d79055

1 Parent(s): 6f91e56

Align phoneme pipeline with modular reference; default UI lang en.

Add BLUE_SYNTH_MAX_CHUNK_LEN, _split_hebrew_prephoneme / _split_oversized_hebrew_clause, IPA chunk_text (.!? + tag boundary fix). Renikud uses renikud_max_clause_chars; espeak-ng subprocess fallback. Unknown segment lang falls back to en.

Made-with: Cursor

Files changed (1) hide show

app.py +199 -155

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ Upstream: https://github.com/maxmelichov/BlueTTS
 import os
 import re
 import sys
 import json
 import time
 import base64
@@ -128,6 +129,9 @@ def text_to_indices_multilang(text: str, base_lang: str = "en") -> list[int]:
         ids.extend(CHAR_TO_ID.get(ch, PAD_ID) for ch in seg)
     return ids
 # ============================================================
 # Text Processing & Chunking
 # ============================================================
@@ -137,6 +141,160 @@ class Style:
     ttl: Any
     dp:  Optional[Any] = None
 class TextProcessor:
     _ESPEAK_MAP = {
         "en": "en-us", "en-us": "en-us", "de": "de", "ge": "de", "it": "it",
@@ -144,8 +302,14 @@ class TextProcessor:
     }
     _INLINE_LANG_PAIR = re.compile(r"<(\w+)>(.*?)(?:</\1>|<\1>)", re.DOTALL)
-    def __init__(self, renikud_path: Optional[str] = None):
         self.renikud = None
         self._espeak_backends: Dict[str, Any] = {}
         self._espeak_separator: Any = None
         self._espeak_ready = False
@@ -208,26 +372,27 @@ class TextProcessor:
             return text
         if not self._espeak_ready:
             self._init_espeak()
-        if not self._espeak_ready:
-            print(f"[WARN] espeak-ng not available, returning raw text for lang={lang}")
-            return text
         try:
-            backend = self._get_espeak_backend(espeak_lang)
-            raw = backend.phonemize(
-                [text], separator=self._espeak_separator
-            )[0]
             return normalize_text(raw, lang=lang)
         except Exception as e:
-            print(f"[WARN] Phonemization failed for lang={lang}: {e}")
-            return text
-    def _renikud_phonemize_hebrew(self, text: str) -> str:
-        """Chunk long Hebrew only for Renikud; join IPA so BlueTTS still chunks at chunk_len."""
-        g2p_chunks = _renikud_chunk_hebrew(text)
-        if len(g2p_chunks) <= 1:
-            return self.renikud.phonemize(text)
-        parts = [self.renikud.phonemize(c) for c in g2p_chunks]
-        return _join_renikud_ipa_parts(parts)
     def _phonemize_segment(self, content: str, lang: str) -> str:
         content = content.strip()
@@ -240,7 +405,13 @@ class TextProcessor:
         if has_hebrew:
             if self.renikud is None:
                 raise self._hebrew_requires_renikud_error()
-            return normalize_text(self._renikud_phonemize_hebrew(content), lang="he")
         if lang == "he":
             return normalize_text(content, lang="he")
         return self._espeak_phonemize(content, lang)
@@ -271,150 +442,23 @@ class TextProcessor:
         return re.sub(r"\s+", " ", " ".join(pieces)).strip()
     def phonemize(self, text: str, lang: str = "en") -> str:
-        # Clean up repeated punctuation to prevent model hallucinations
-        text = re.sub(r"\.+", ".", text)
-        text = re.sub(r"\?+", "?", text)
-        text = re.sub(r"!+", "!", text)
-        text = text.replace("…", ".")
         if self._INLINE_LANG_PAIR.search(text):
             return self._phonemize_mixed(text, base_lang=lang)
-        is_hebrew = any('\u0590' <= c <= '\u05ff' for c in text)
         if lang == "he" or is_hebrew:
             if not is_hebrew:
                 return normalize_text(text, lang="he")
             if self.renikud is not None:
-                return normalize_text(self._renikud_phonemize_hebrew(text), lang="he")
             raise self._hebrew_requires_renikud_error()
         return self._espeak_phonemize(text, lang)
-def _hard_split_chunk(s: str, max_len: int) -> List[str]:
-    s = s.strip()
-    if not s or max_len <= 0:
-        return [s] if s else []
-    if len(s) <= max_len:
-        return [s]
-    out: List[str] = []
-    start = 0
-    n = len(s)
-    while start < n:
-        end = min(start + max_len, n)
-        if end < n:
-            window = s[start:end]
-            cut = window.rfind(" ")
-            if cut > max(max_len // 4, 8):
-                end = start + cut
-        piece = s[start:end].strip()
-        if piece:
-            out.append(piece)
-        start = end
-        while start < n and s[start] == " ":
-            start += 1
-    return out
-def chunk_text(text: str, max_len: int = 300) -> List[str]:
-    pattern = (
-        r"(?<!Mr\.)(?<!Mrs\.)(?<!Ms\.)(?<!Dr\.)(?<!Prof\.)(?<!Sr\.)(?<!Jr\.)"
-        r"(?<!Ph\.D\.)(?<!etc\.)(?<!e\.g\.)(?<!i\.e\.)(?<!vs\.)(?<!Inc\.)"
-        r"(?<!Ltd\.)(?<!Co\.)(?<!Corp\.)(?<!St\.)(?<!Ave\.)(?<!Blvd\.)"
-        r"(?<!\b[A-Z]\.)(?<=[.!?:,;])\s+"
-    )
-    chunks: List[str] = []
-    for paragraph in re.split(r"\n\s*\n+", text.strip()):
-        paragraph = paragraph.strip()
-        if not paragraph:
-            continue
-        current = ""
-        for sentence in re.split(pattern, paragraph):
-            if len(current) + len(sentence) + 1 <= max_len:
-                current += (" " if current else "") + sentence
-            else:
-                if current:
-                    chunks.append(current.strip())
-                if len(sentence) > max_len:
-                    chunks.extend(_hard_split_chunk(sentence, max_len))
-                    current = ""
-                else:
-                    current = sentence
-        if current:
-            chunks.append(current.strip())
-    base = chunks if chunks else ([text.strip()] if text.strip() else [])
-    # TensorRT engines cap T_text; long IPA without ".!?" must never stay in one oversized chunk.
-    out: List[str] = []
-    for c in base:
-        out.extend(_hard_split_chunk(c, max_len))
-    # Fix language tags that span across chunks
-    fixed_out = []
-    active_tag = None
-    for c in out:
-        c = c.strip()
-        if not c:
-            continue
-        if active_tag and not c.startswith(f"<{active_tag}>"):
-            c = f"<{active_tag}>" + c
-        for m in re.finditer(r"<(/)?([a-z]{2,8})>", c):
-            is_close = bool(m.group(1))
-            tag = m.group(2)
-            if is_close:
-                if active_tag == tag:
-                    active_tag = None
-            else:
-                active_tag = tag
-        if active_tag and not c.endswith(f"</{active_tag}>"):
-            c = c + f"</{active_tag}>"
-        fixed_out.append(c)
-    return fixed_out or ([text.strip()] if text.strip() else [])
-def _join_renikud_ipa_parts(parts: List[str]) -> str:
-    """Join IPA from multiple Renikud calls; normalize whitespace (no duplicate words from join gaps)."""
-    merged = " ".join(p.strip() for p in parts if p and p.strip())
-    return re.sub(r"\s+", " ", merged).strip()
-def _renikud_chunk_hebrew(text: str, max_len: int = 168) -> List[str]:
-    """Split raw Hebrew for Renikud only.
-    Uses sentence breaks (.!?) plus length cap — not the same rules as ``chunk_text`` for IPA
-    (which splits on , : ;). Fewer G2P segments avoids prosodic 'mini-sentence' artifacts that
-    can sound like repetition when stitched. BlueTTS still chunks phoneme strings at chunk_len.
-    """
-    text = text.strip()
-    if not text:
-        return []
-    if len(text) <= max_len:
-        return [text]
-    # Sentence boundaries only; keep commas/colons inside a segment when possible.
-    sent_pat = r"(?<=[.!?])\s+"
-    chunks: List[str] = []
-    for paragraph in re.split(r"\n\s*\n+", text):
-        paragraph = paragraph.strip()
-        if not paragraph:
-            continue
-        current = ""
-        for sentence in re.split(sent_pat, paragraph):
-            if len(current) + len(sentence) + 1 <= max_len:
-                current += (" " if current else "") + sentence
-            else:
-                if current:
-                    chunks.append(current.strip())
-                if len(sentence) > max_len:
-                    chunks.extend(_hard_split_chunk(sentence, max_len))
-                    current = ""
-                else:
-                    current = sentence
-        if current:
-            chunks.append(current.strip())
-    base = chunks if chunks else [text]
-    out: List[str] = []
-    for c in base:
-        out.extend(_hard_split_chunk(c, max_len))
-    return out or [text]
 # ============================================================
 # BlueTTS Core
 # ============================================================
@@ -430,7 +474,7 @@ class BlueTTS:
         speed: float = 1.0,
         seed: int = 42,
         use_gpu: bool = False,
-        chunk_len: int = 150,
         silence_sec: float = 0.15,
         fade_duration: float = 0.02,
         renikud_path: Optional[str] = None,

 import os
 import re
 import sys
+import subprocess
 import json
 import time
 import base64
         ids.extend(CHAR_TO_ID.get(ch, PAD_ID) for ch in seg)
     return ids
+# Max IPA characters per synthesis forward pass (ONNX). Independent of Renikud clause splitting.
+BLUE_SYNTH_MAX_CHUNK_LEN = 150
 # ============================================================
 # Text Processing & Chunking
 # ============================================================
     ttl: Any
     dp:  Optional[Any] = None
+def _hard_split_chunk(s: str, max_len: int) -> List[str]:
+    """Split ``s`` into segments of at most ``max_len`` chars (prefer last space)."""
+    s = s.strip()
+    if not s or max_len <= 0:
+        return [s] if s else []
+    if len(s) <= max_len:
+        return [s]
+    out: List[str] = []
+    start = 0
+    n = len(s)
+    while start < n:
+        end = min(start + max_len, n)
+        if end < n:
+            window = s[start:end]
+            cut = window.rfind(" ")
+            if cut > max(max_len // 4, 8):
+                end = start + cut
+        piece = s[start:end].strip()
+        if piece:
+            out.append(piece)
+        start = end
+        while start < n and s[start] == " ":
+            start += 1
+    return out
+def _split_oversized_hebrew_clause(part: str, max_clause_chars: int) -> List[str]:
+    """Only used when a single sentence is longer than ``max_clause_chars``."""
+    p = part.strip()
+    if not p:
+        return []
+    if len(p) <= max_clause_chars:
+        return [p]
+    if re.search(r":\s", p):
+        pieces = [x.strip() for x in re.split(r"(?<=:)\s+", p) if x.strip()]
+        if len(pieces) > 1:
+            out: List[str] = []
+            for x in pieces:
+                out.extend(_split_oversized_hebrew_clause(x, max_clause_chars))
+            return out
+    if re.search(r"[\u0590-\u05ff]-\s+[\u0590-\u05ff]", p):
+        pieces = [x.strip() for x in re.split(r"(?<=[\u0590-\u05ff])-\s+", p) if x.strip()]
+        if len(pieces) > 1:
+            out2: List[str] = []
+            for x in pieces:
+                out2.extend(_split_oversized_hebrew_clause(x, max_clause_chars))
+            return out2
+    if re.search(r",\s", p):
+        pieces = [x.strip() for x in re.split(r",\s+", p) if x.strip()]
+        if len(pieces) > 1:
+            out3: List[str] = []
+            for x in pieces:
+                out3.extend(_split_oversized_hebrew_clause(x, max_clause_chars))
+            return out3
+    return _hard_split_chunk(p, max_clause_chars)
+def _split_hebrew_prephoneme(text: str, max_clause_chars: int = 96) -> List[str]:
+    """Split raw Hebrew before Renikud G2P.
+    By default only sentence boundaries (``.?!``); colon / hyphen / comma splits run
+    only when one sentence is longer than ``max_clause_chars``.
+    """
+    t = text.strip()
+    if not t:
+        return []
+    t = re.sub(r"\.+", ".", t)
+    t = re.sub(r"\?+", "?", t)
+    t = re.sub(r"!+", "!", t)
+    t = t.replace("…", ".")
+    t = re.sub(r"\s+", " ", t)
+    def refine_one(s: str) -> List[str]:
+        s = s.strip()
+        if not s:
+            return []
+        out: List[str] = []
+        for sent in re.split(r"(?<=[.!?])\s+", s):
+            sent = sent.strip()
+            if not sent:
+                continue
+            out.extend(_split_oversized_hebrew_clause(sent, max_clause_chars))
+        return out
+    clauses: List[str] = []
+    for block in re.split(r"\n+", t):
+        block = block.strip()
+        if block:
+            clauses.extend(refine_one(block))
+    return clauses if clauses else [t]
def chunk_text(text: str, max_len: int = 300) -> List[str]:
    """Cut IPA/text into chunks along sentence boundaries.

    Steps, in order:

    1. Force a paragraph break wherever sentence punctuation is directly
       followed by a closing language tag and whitespace.
    2. Inside each paragraph (blank-line separated), greedily pack whole
       sentences into chunks of up to ``max_len`` characters. A sentence ends
       at whitespace after ``.``, ``!`` or ``?``, except after the listed
       abbreviations or a single capital-letter initial.
    3. Run every chunk through ``_hard_split_chunk`` so nothing longer than
       ``max_len`` remains.
    4. Repair language tags that span chunk boundaries: a chunk that starts
       inside an open ``<xx>`` tag gets the tag prepended, and a chunk that
       ends with a tag still open gets ``</xx>`` appended.

    Note that step 4 runs after the length limit is enforced, so a chunk
    that receives tag text can end up longer than ``max_len``.

    Returns:
        The non-empty chunks in order; ``[]`` for blank input.
    """
    text = re.sub(r"([.!?])(</[a-z]{2,8}>)\s+", r"\1\2\n\n", text)
    sentence_break = (
        r"(?<!Mr\.)(?<!Mrs\.)(?<!Ms\.)(?<!Dr\.)(?<!Prof\.)(?<!Sr\.)(?<!Jr\.)"
        r"(?<!Ph\.D\.)(?<!etc\.)(?<!e\.g\.)(?<!i\.e\.)(?<!vs\.)(?<!Inc\.)"
        r"(?<!Ltd\.)(?<!Co\.)(?<!Corp\.)(?<!St\.)(?<!Ave\.)(?<!Blvd\.)"
        r"(?<!\b[A-Z]\.)(?<=[.!?])\s+"
    )
    whole = text.strip()
    # Used whenever a stage produces nothing for non-blank input.
    fallback = [whole] if whole else []

    # --- Stage 1: greedy sentence packing per paragraph -------------------
    packed: List[str] = []
    for para in re.split(r"\n\s*\n+", whole):
        para = para.strip()
        if not para:
            continue
        buf = ""
        for sent in re.split(sentence_break, para):
            if len(buf) + len(sent) + 1 <= max_len:
                buf = f"{buf} {sent}" if buf else sent
                continue
            if buf:
                packed.append(buf.strip())
            if len(sent) > max_len:
                packed.extend(_hard_split_chunk(sent, max_len))
                buf = ""
            else:
                buf = sent
        if buf:
            packed.append(buf.strip())

    # --- Stage 2: enforce the length cap on every chunk -------------------
    sized: List[str] = [
        piece
        for chunk in (packed or fallback)
        for piece in _hard_split_chunk(chunk, max_len)
    ]

    # --- Stage 3: re-balance language tags across chunk boundaries --------
    repaired = []
    open_tag = None
    for chunk in sized:
        chunk = chunk.strip()
        if not chunk:
            continue
        if open_tag and not chunk.startswith(f"<{open_tag}>"):
            chunk = f"<{open_tag}>{chunk}"
        for match in re.finditer(r"<(/)?([a-z]{2,8})>", chunk):
            closing, name = match.groups()
            if not closing:
                open_tag = name
            elif open_tag == name:
                open_tag = None
        if open_tag and not chunk.endswith(f"</{open_tag}>"):
            chunk = f"{chunk}</{open_tag}>"
        repaired.append(chunk)
    return repaired or fallback
 class TextProcessor:
     _ESPEAK_MAP = {
         "en": "en-us", "en-us": "en-us", "de": "de", "ge": "de", "it": "it",
     }
     _INLINE_LANG_PAIR = re.compile(r"<(\w+)>(.*?)(?:</\1>|<\1>)", re.DOTALL)
+    def __init__(
+        self,
+        renikud_path: Optional[str] = None,
+        *,
+        renikud_max_clause_chars: int = 96,
+    ):
         self.renikud = None
+        self._renikud_max_clause_chars = renikud_max_clause_chars
         self._espeak_backends: Dict[str, Any] = {}
         self._espeak_separator: Any = None
         self._espeak_ready = False
             return text
         if not self._espeak_ready:
             self._init_espeak()
+        if self._espeak_ready:
+            try:
+                backend = self._get_espeak_backend(espeak_lang)
+                raw = backend.phonemize(
+                    [text], separator=self._espeak_separator
+                )[0]
+                return normalize_text(raw, lang=lang)
+            except Exception as e:
+                print(f"[WARN] Phonemizer backend failed for lang={lang}: {e}")
         try:
+            result = subprocess.run(
+                ["espeak-ng", "-q", "--ipa=1", "-v", espeak_lang, text],
+                check=True,
+                capture_output=True,
+                text=True,
+            )
+            raw = result.stdout.replace("\n", " ").strip()
             return normalize_text(raw, lang=lang)
         except Exception as e:
+            print(f"[WARN] espeak-ng fallback failed for lang={lang}: {e}")
+        return text
     def _phonemize_segment(self, content: str, lang: str) -> str:
         content = content.strip()
         if has_hebrew:
             if self.renikud is None:
                 raise self._hebrew_requires_renikud_error()
+            clauses = _split_hebrew_prephoneme(content, self._renikud_max_clause_chars)
+            ipa_parts = [
+                normalize_text(self.renikud.phonemize(c), lang="he")
+                for c in clauses
+                if c.strip()
+            ]
+            return re.sub(r"\s+", " ", " ".join(ipa_parts)).strip()
         if lang == "he":
             return normalize_text(content, lang="he")
         return self._espeak_phonemize(content, lang)
         return re.sub(r"\s+", " ", " ".join(pieces)).strip()
     def phonemize(self, text: str, lang: str = "en") -> str:
         # Convert ``text`` to a phoneme string, choosing the backend from the
         # content and from ``lang``:
         #   * inline language-tag pairs  -> self._phonemize_mixed
         #   * Hebrew characters present  -> Renikud, clause by clause
         #   * lang == "he", no Hebrew    -> normalize_text only
         #   * anything else              -> self._espeak_phonemize
         # Raises the error built by self._hebrew_requires_renikud_error() when
         # the text contains Hebrew characters but no Renikud model is loaded.
         # Tagged text is handled segment by segment, with ``lang`` as the base.
         if self._INLINE_LANG_PAIR.search(text):
             return self._phonemize_mixed(text, base_lang=lang)
         # True when any character lies in U+0590..U+05FF (Hebrew block).
+        is_hebrew = any("\u0590" <= c <= "\u05ff" for c in text)
         if lang == "he" or is_hebrew:
             # lang is "he" but there is nothing for Renikud to convert:
             # the text only goes through normalize_text.
             if not is_hebrew:
                 return normalize_text(text, lang="he")
             if self.renikud is not None:
                 # Split into clauses first, phonemize and normalize each
                 # clause separately, then join with single spaces.
+                clauses = _split_hebrew_prephoneme(text, self._renikud_max_clause_chars)
+                ipa_parts = [
+                    normalize_text(self.renikud.phonemize(c), lang="he")
+                    for c in clauses
+                    if c.strip()
+                ]
+                return re.sub(r"\s+", " ", " ".join(ipa_parts)).strip()
             raise self._hebrew_requires_renikud_error()
         return self._espeak_phonemize(text, lang)
 # ============================================================
 # BlueTTS Core
 # ============================================================
         speed: float = 1.0,
         seed: int = 42,
         use_gpu: bool = False,
+        chunk_len: int = BLUE_SYNTH_MAX_CHUNK_LEN,
         silence_sec: float = 0.15,
         fade_duration: float = 0.02,
         renikud_path: Optional[str] = None,