Spaces:

ShadowHunter222
/

chab2

Running

App Files Files Community

ShadowHunter222 committed on Apr 9

Commit

06dac54

verified ·

1 Parent(s): 5bd9d0c

Upload 2 files

Browse files

Files changed (2) hide show

config.py +15 -3
text_processor.py +41 -7

config.py CHANGED Viewed

@@ -17,6 +17,12 @@ def _get_bool(name: str, default: bool) -> bool:
     return raw.strip().lower() in {"1", "true", "yes", "on"}
 class Config:
     # ── Model ────────────────────────────────────────────────────
     MODEL_ID: str = os.getenv("CB_MODEL_ID", "ResembleAI/chatterbox-turbo-ONNX")
@@ -67,6 +73,12 @@ class Config:
     # (not a model — just a reference WAV, safe to use from any source).
     DEFAULT_VOICE_REPO: str = "onnx-community/chatterbox-ONNX"
     DEFAULT_VOICE_FILE: str = "default_voice.wav"
     MAX_VOICE_UPLOAD_BYTES: int = 10 * 1024 * 1024   # 10 MB
     MIN_REF_DURATION_SEC: float = 1.5
     MAX_REF_DURATION_SEC: float = 30.0
@@ -76,15 +88,15 @@ class Config:
     # ── Streaming ────────────────────────────────────────────────
     # Smaller chunks = faster TTFB (first audio arrives sooner)
     # ~200 chars ≈ 1–2 sentences ≈ fastest first-chunk on 2 vCPU
-    MAX_CHUNK_CHARS: int = int(os.getenv("CB_MAX_CHUNK_CHARS", "100"))
     # Additive parallel mode (3-way split: primary + helper1 + helper2).
     ENABLE_PARALLEL_MODE: bool = _get_bool("CB_ENABLE_PARALLEL_MODE", True)
-    HELPER_BASE_URL: str = os.getenv("CB_HELPER_BASE_URL", "").strip()
     HELPER1_BASE_URL: str = os.getenv(
         "CB_HELPER1_BASE_URL",
         HELPER_BASE_URL,
     ).strip()
-    HELPER2_BASE_URL: str = os.getenv("CB_HELPER2_BASE_URL", "").strip()
     HELPER_TIMEOUT_SEC: float = float(os.getenv("CB_HELPER_TIMEOUT_SEC", "45"))
     HELPER_RETRY_ONCE: bool = _get_bool("CB_HELPER_RETRY_ONCE", True)
     # Optional shared secret for internal chunk endpoints.

     return raw.strip().lower() in {"1", "true", "yes", "on"}
+def _get_csv(name: str, default: str) -> tuple[str, ...]:
+    raw = os.getenv(name, default)
+    items = [x.strip() for x in raw.split(",")]
+    return tuple(x for x in items if x)
 class Config:
     # ── Model ────────────────────────────────────────────────────
     MODEL_ID: str = os.getenv("CB_MODEL_ID", "ResembleAI/chatterbox-turbo-ONNX")
     # (not a model — just a reference WAV, safe to use from any source).
     DEFAULT_VOICE_REPO: str = "onnx-community/chatterbox-ONNX"
     DEFAULT_VOICE_FILE: str = "default_voice.wav"
+    DEFAULT_VOICE_REPOS: tuple[str, ...] = _get_csv(
+        "CB_DEFAULT_VOICE_REPOS",
+        DEFAULT_VOICE_REPO,
+    )
+    PRELOAD_BUILTIN_VOICES: bool = _get_bool("CB_PRELOAD_BUILTIN_VOICES", True)
+    MAX_PRELOAD_BUILTIN_VOICES: int = int(os.getenv("CB_MAX_PRELOAD_BUILTIN_VOICES", "64"))
     MAX_VOICE_UPLOAD_BYTES: int = 10 * 1024 * 1024   # 10 MB
     MIN_REF_DURATION_SEC: float = 1.5
     MAX_REF_DURATION_SEC: float = 30.0
     # ── Streaming ────────────────────────────────────────────────
     # Smaller chunks = faster TTFB (first audio arrives sooner)
     # ~200 chars ≈ 1–2 sentences ≈ fastest first-chunk on 2 vCPU
+    MAX_CHUNK_CHARS: int = int(os.getenv("CB_MAX_CHUNK_CHARS", "150"))
     # Additive parallel mode (3-way split: primary + helper1 + helper2).
     ENABLE_PARALLEL_MODE: bool = _get_bool("CB_ENABLE_PARALLEL_MODE", True)
+    HELPER_BASE_URL: str = os.getenv("CB_HELPER_BASE_URL", "https://shadowhunter222-chab2.hf.space").strip()
     HELPER1_BASE_URL: str = os.getenv(
         "CB_HELPER1_BASE_URL",
         HELPER_BASE_URL,
     ).strip()
+    HELPER2_BASE_URL: str = os.getenv("CB_HELPER2_BASE_URL", "https://shadowhunter222-chab3.hf.space").strip()
     HELPER_TIMEOUT_SEC: float = float(os.getenv("CB_HELPER_TIMEOUT_SEC", "45"))
     HELPER_RETRY_ONCE: bool = _get_bool("CB_HELPER_RETRY_ONCE", True)
     # Optional shared secret for internal chunk endpoints.

text_processor.py CHANGED Viewed

@@ -231,6 +231,23 @@ def sanitize(text: str) -> str:
     for idx, original in tags_found:
         text = text.replace(f"§TAG{idx}§", original)
     return text
@@ -286,25 +303,42 @@ def split_for_streaming(text: str, max_chars: int = Config.MAX_CHUNK_CHARS) -> L
 # ═══════════════════════════════════════════════════════════════════
 def _break_long_chunk(text: str, max_chars: int) -> List[str]:
-    """Break a chunk longer than max_chars on commas or word boundaries."""
     parts: List[str] = []
     remaining = text
     while len(remaining) > max_chars:
         break_pos = -1
         include_break_char = False
-        # Prefer punctuation/pauses first to keep prosody natural.
         for marker in (",", ";", ":", "—", "-", "!", "?"):
             pos = remaining.rfind(marker, 0, max_chars)
             if pos > break_pos:
                 break_pos = pos
                 include_break_char = True
-        # Then prefer nearest space before limit.
-        space_pos = remaining.rfind(" ", 0, max_chars)
-        if space_pos > break_pos:
-            break_pos = space_pos
-            include_break_char = False
         # If nothing before limit, look slightly ahead to avoid mid-word cuts.
         if break_pos == -1:

     for idx, original in tags_found:
         text = text.replace(f"§TAG{idx}§", original)
+    # 11. Ensure paralinguistic tags have spaces around them.
+    #     The model needs whitespace boundaries to properly render tags like
+    #     [clear throat]. Without spaces (e.g. "Jerry.[clear throat]I'm"),
+    #     the tag gets swallowed or produces silence instead of the sound.
+    text = re.sub(
+        r"(\w)(\[(?:" + _TAG_NAMES + r")\])",
+        r"\1 \2",
+        text,
+        flags=re.IGNORECASE,
+    )
+    text = re.sub(
+        r"(\[(?:" + _TAG_NAMES + r")\])(\w)",
+        r"\1 \2",
+        text,
+        flags=re.IGNORECASE,
+    )
     return text
 # ═══════════════════════════════════════════════════════════════════
 def _break_long_chunk(text: str, max_chars: int) -> List[str]:
+    """Break a chunk longer than max_chars on natural pause boundaries.
+    Priority order for break points:
+      1. Ellipsis '...' — strongest natural pause within a long sentence
+      2. Punctuation (comma, semicolon, colon, dash, !, ?)
+      3. Nearest space before limit
+      4. Look ahead slightly to avoid mid-word cuts
+    """
     parts: List[str] = []
     remaining = text
     while len(remaining) > max_chars:
         break_pos = -1
         include_break_char = False
+        # First try: break at ellipsis '...' — the strongest internal pause.
+        ellipsis_pos = remaining.rfind("...", 0, max_chars)
+        if ellipsis_pos > 0:
+            # Include all three dots in the current segment
+            break_pos = ellipsis_pos + 3
+            include_break_char = False  # already moved past the dots
+        # Then try punctuation markers (only upgrade if at a later position).
         for marker in (",", ";", ":", "—", "-", "!", "?"):
             pos = remaining.rfind(marker, 0, max_chars)
             if pos > break_pos:
                 break_pos = pos
                 include_break_char = True
+        # Space is a FALLBACK only — never override a punctuation/ellipsis break.
+        # Cutting at punctuation gives the model proper prosody cues;
+        # cutting at a random space creates mid-phrase fragments ("handle the").
+        if break_pos <= 0:
+            space_pos = remaining.rfind(" ", 0, max_chars)
+            if space_pos > 0:
+                break_pos = space_pos
+                include_break_char = False
         # If nothing before limit, look slightly ahead to avoid mid-word cuts.
         if break_pos == -1: