Spaces:

ShadowHunter222
/

chab2

Running

App Files Community

ShadowHunter222 committed on Apr 9

Commit

5f92103

verified ·

1 Parent(s): 06dac54

Upload 6 files

Browse files

Files changed (4) hide show

app.py +4 -42
chatterbox_wrapper.py +198 -6
config.py +1 -1
text_processor.py +7 -41

app.py CHANGED Viewed

@@ -1,20 +1,4 @@
-"""
-Chatterbox Turbo TTS -- FastAPI Server
-======================================
-Production-ready API with true real-time MP3 streaming,
-in-memory voice cloning, and fully non-blocking inference.
-Endpoints:
-  GET  /health              -> health check + optional warmup
-  GET  /info                -> model info, supported tags, parameters
-  POST /tts                 -> full audio response (WAV/MP3/FLAC)
-  POST /tts/stream          -> chunked MP3 streaming (MediaSource-ready)
-  POST /tts/true-stream     -> alias for /tts/stream (Kokoro compat)
-  POST /tts/stop/{stream_id}-> cancel a specific active stream
-  POST /tts/stop            -> cancel ALL active streams
-  POST /v1/audio/speech     -> OpenAI-compatible streaming
-"""
-import asyncio
 import io
 import json
 import logging
@@ -259,31 +243,6 @@ async def health(warm_up: bool = False):
     return status
-@app.get("/info")
-async def info():
-    return {
-        "model": Config.MODEL_ID,
-        "dtype": Config.MODEL_DTYPE,
-        "sample_rate": Config.SAMPLE_RATE,
-        "paralinguistic_tags": list(Config.PARALINGUISTIC_TAGS),
-        "tag_usage": "Insert tags directly in text, e.g. 'That is so funny! [laugh] Anyway…'",
-        "parameters": {
-            "max_new_tokens": {"default": Config.MAX_NEW_TOKENS, "range": "64–2048"},
-            "repetition_penalty": {"default": Config.REPETITION_PENALTY, "range": "1.0–2.0"},
-        },
-        "voice_cloning": {
-            "description": "Upload 3–30s reference WAV/MP3 as 'voice_ref' field",
-            "max_upload_mb": Config.MAX_VOICE_UPLOAD_BYTES // (1024 * 1024),
-        },
-        "parallel_mode": {
-            "enabled": Config.ENABLE_PARALLEL_MODE,
-            "helper_configured": bool(Config.HELPER_BASE_URL),
-            "helper_base_url": Config.HELPER_BASE_URL or None,
-            "supports_voice_ref": True,
-        },
-    }
 # ── POST /tts ─────────────────────────────────────────────────────
 @app.post("/tts", response_class=Response)
@@ -805,6 +764,9 @@ async def internal_chunk_synthesize(
     voice_profile = wrapper.default_voice
     if request.voice_key:
         cached_voice = wrapper._voice_cache.get(request.voice_key)
         if cached_voice is None:
             raise HTTPException(409, "Voice key expired or not found")
         voice_profile = cached_voice

 import io
 import json
 import logging
     return status
 # ── POST /tts ─────────────────────────────────────────────────────
 @app.post("/tts", response_class=Response)
     voice_profile = wrapper.default_voice
     if request.voice_key:
         cached_voice = wrapper._voice_cache.get(request.voice_key)
+        if cached_voice is None:
+            # Built-in voices are permanent in wrapper registry even if TTL cache entry expired.
+            cached_voice = wrapper.get_builtin_voice_by_hash(request.voice_key)
         if cached_voice is None:
             raise HTTPException(409, "Voice key expired or not found")
         voice_profile = cached_voice

chatterbox_wrapper.py CHANGED Viewed

@@ -27,6 +27,7 @@ import tempfile
 import time
 from collections import OrderedDict
 from dataclasses import dataclass
 from typing import Callable, Generator, Optional
 import librosa
@@ -48,6 +49,21 @@ _SUPPORTED_AUDIO_EXTENSIONS = {
 }
 # ═══════════════════════════════════════════════════════════════════
 # Data Structures
 # ═══════════════════════════════════════════════════════════════════
@@ -203,8 +219,15 @@ class ChatterboxWrapper:
             ttl_seconds=self.cfg.VOICE_CACHE_TTL_SEC,
         )
-        logger.info("Encoding default reference voice …")
-        self.default_voice = self._load_default_voice()
         logger.info("✅ ChatterboxWrapper ready")
@@ -260,16 +283,185 @@ class ChatterboxWrapper:
         opts.enable_mem_reuse = True
         return opts
-    # ─── Default voice ────────────────────────────────────────────
-    def _load_default_voice(self) -> VoiceProfile:
         path = hf_hub_download(
             self.cfg.DEFAULT_VOICE_REPO,
             filename=self.cfg.DEFAULT_VOICE_FILE,
             cache_dir=self.cfg.MODELS_DIR,
         )
-        audio, _ = librosa.load(path, sr=self.cfg.SAMPLE_RATE)
-        return self._encode_audio_array(audio, audio_hash="__default__")
     # ─── Voice encoding ──────────────────────────────────────────

 import time
 from collections import OrderedDict
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Callable, Generator, Optional
 import librosa
 }
+def _slugify(text: str) -> str:
+    buf = []
+    prev_underscore = False
+    for ch in text.strip().lower():
+        if ch.isalnum():
+            buf.append(ch)
+            prev_underscore = False
+        else:
+            if not prev_underscore:
+                buf.append("_")
+                prev_underscore = True
+    slug = "".join(buf).strip("_")
+    return slug or "voice"
 # ═══════════════════════════════════════════════════════════════════
 # Data Structures
 # ═══════════════════════════════════════════════════════════════════
             ttl_seconds=self.cfg.VOICE_CACHE_TTL_SEC,
         )
+        self._builtin_voice_profiles: dict[str, VoiceProfile] = {}
+        self._builtin_voice_bytes: dict[str, bytes] = {}
+        self._builtin_voice_by_hash: dict[str, VoiceProfile] = {}
+        self._voice_alias_to_id: dict[str, str] = {}
+        self._builtin_voice_catalog: list[dict] = []
+        self._default_voice_id: str = "default"
+        logger.info("Loading built-in voices (HF default + local samples) …")
+        self.default_voice = self._load_builtin_voices()
         logger.info("✅ ChatterboxWrapper ready")
         opts.enable_mem_reuse = True
         return opts
+    # ─── Built-in voices (HF default + local samples) ────────────
+    def _download_hf_default_voice_bytes(self) -> bytes:
         path = hf_hub_download(
             self.cfg.DEFAULT_VOICE_REPO,
             filename=self.cfg.DEFAULT_VOICE_FILE,
             cache_dir=self.cfg.MODELS_DIR,
         )
+        return Path(path).read_bytes()
+    def _list_local_voice_paths(self) -> list[Path]:
+        wrapper_dir = Path(__file__).resolve().parent
+        # Support both module-level and repo-root deployment layouts.
+        candidates = []
+        for d in (wrapper_dir, Path.cwd().resolve(), wrapper_dir.parent):
+            try:
+                resolved = d.resolve()
+            except Exception:
+                continue
+            if resolved.is_dir() and resolved not in candidates:
+                candidates.append(resolved)
+        voices: list[Path] = []
+        seen_real_paths: set[str] = set()
+        for root in candidates:
+            try:
+                entries = sorted(root.iterdir(), key=lambda x: x.name.lower())
+            except Exception:
+                continue
+            for p in entries:
+                if not p.is_file():
+                    continue
+                if p.suffix.lower() not in _SUPPORTED_AUDIO_EXTENSIONS:
+                    continue
+                real_path = str(p.resolve())
+                if real_path in seen_real_paths:
+                    continue
+                seen_real_paths.add(real_path)
+                voices.append(p)
+        return voices
+    def _make_unique_voice_id(self, preferred: str) -> str:
+        base = _slugify(preferred)
+        candidate = base
+        idx = 2
+        while candidate in self._builtin_voice_profiles:
+            candidate = f"{base}_{idx}"
+            idx += 1
+        return candidate
+    def _register_builtin_voice(
+        self,
+        *,
+        preferred_id: str,
+        display_name: str,
+        source: str,
+        source_ref: str,
+        audio_bytes: bytes,
+        is_default: bool = False,
+    ) -> str:
+        if not audio_bytes:
+            raise ValueError("Voice file is empty")
+        voice_id = self._make_unique_voice_id(preferred_id)
+        audio_hash = hashlib.md5(audio_bytes).hexdigest()
+        profile = self._voice_cache.get(audio_hash)
+        if profile is None:
+            audio = _load_audio_bytes(audio_bytes, sr=self.cfg.SAMPLE_RATE)
+            profile = self._encode_audio_array(audio, audio_hash=audio_hash)
+            self._voice_cache.put(audio_hash, profile)
+        else:
+            # Keep hash attached to cached profile for metadata/voice-key usage.
+            profile.audio_hash = audio_hash
+        self._builtin_voice_profiles[voice_id] = profile
+        self._builtin_voice_bytes[voice_id] = audio_bytes
+        if audio_hash:
+            self._builtin_voice_by_hash[audio_hash] = profile
+        aliases: list[str] = []
+        for alias in (voice_id, _slugify(Path(display_name).stem)):
+            if alias not in self._voice_alias_to_id:
+                self._voice_alias_to_id[alias] = voice_id
+                aliases.append(alias)
+        if is_default:
+            self._default_voice_id = voice_id
+            self._voice_alias_to_id["default"] = voice_id
+            if "default" not in aliases:
+                aliases.append("default")
+        self._builtin_voice_catalog.append(
+            {
+                "id": voice_id,
+                "display_name": display_name,
+                "source": source,
+                "source_ref": source_ref,
+                "aliases": aliases,
+                "voice_key": audio_hash,
+            }
+        )
+        return voice_id
+    def _load_builtin_voices(self) -> VoiceProfile:
+        # 1) HF default voice (kept as true default fallback)
+        hf_bytes = self._download_hf_default_voice_bytes()
+        self._register_builtin_voice(
+            preferred_id="default_hf_voice",
+            display_name=self.cfg.DEFAULT_VOICE_FILE,
+            source="huggingface",
+            source_ref=f"{self.cfg.DEFAULT_VOICE_REPO}:{self.cfg.DEFAULT_VOICE_FILE}",
+            audio_bytes=hf_bytes,
+            is_default=True,
+        )
+        # 2) Local voice samples placed next to app files
+        for path in self._list_local_voice_paths():
+            # Avoid duplicate entry if someone also copied default_voice.wav locally.
+            if path.name == self.cfg.DEFAULT_VOICE_FILE:
+                continue
+            try:
+                self._register_builtin_voice(
+                    preferred_id=path.stem,
+                    display_name=path.name,
+                    source="local",
+                    source_ref=str(path.name),
+                    audio_bytes=path.read_bytes(),
+                    is_default=False,
+                )
+            except Exception as e:
+                logger.warning(f"Skipping local voice {path.name}: {e}")
+        default_profile = self._builtin_voice_profiles.get(self._default_voice_id)
+        if default_profile is None:
+            raise RuntimeError("Default built-in voice could not be initialized")
+        logger.info(
+            f"Built-in voices loaded: {len(self._builtin_voice_catalog)} "
+            f"(default={self._default_voice_id})"
+        )
+        return default_profile
+    def list_builtin_voices(self) -> list[dict]:
+        """Return metadata for startup-preloaded voices."""
+        return [dict(v) for v in self._builtin_voice_catalog]
+    @property
+    def default_voice_name(self) -> str:
+        return self._default_voice_id
+    def resolve_voice_id(self, voice_name: Optional[str]) -> str:
+        if voice_name is None:
+            return self._default_voice_id
+        key = _slugify(str(voice_name))
+        if not key:
+            return self._default_voice_id
+        voice_id = self._voice_alias_to_id.get(key)
+        if voice_id is None:
+            available = ", ".join(sorted(self._voice_alias_to_id.keys()))
+            raise ValueError(f"Unknown voice '{voice_name}'. Available: {available}")
+        return voice_id
+    def get_builtin_voice(self, voice_name: Optional[str]) -> VoiceProfile:
+        voice_id = self.resolve_voice_id(voice_name)
+        profile = self._builtin_voice_profiles[voice_id]
+        if profile.audio_hash:
+            self._voice_cache.put(profile.audio_hash, profile)
+        return profile
+    def get_builtin_voice_bytes(self, voice_name: Optional[str]) -> Optional[bytes]:
+        voice_id = self.resolve_voice_id(voice_name)
+        return self._builtin_voice_bytes.get(voice_id)
+    def get_builtin_voice_by_hash(self, audio_hash: str) -> Optional[VoiceProfile]:
+        return self._builtin_voice_by_hash.get((audio_hash or "").strip())
     # ─── Voice encoding ──────────────────────────────────────────

config.py CHANGED Viewed

@@ -88,7 +88,7 @@ class Config:
     # ── Streaming ────────────────────────────────────────────────
     # Smaller chunks = faster TTFB (first audio arrives sooner)
     # ~200 chars ≈ 1–2 sentences ≈ fastest first-chunk on 2 vCPU
-    MAX_CHUNK_CHARS: int = int(os.getenv("CB_MAX_CHUNK_CHARS", "150"))
     # Additive parallel mode (3-way split: primary + helper1 + helper2).
     ENABLE_PARALLEL_MODE: bool = _get_bool("CB_ENABLE_PARALLEL_MODE", True)
     HELPER_BASE_URL: str = os.getenv("CB_HELPER_BASE_URL", "https://shadowhunter222-chab2.hf.space").strip()

     # ── Streaming ────────────────────────────────────────────────
     # Smaller chunks = faster TTFB (first audio arrives sooner)
     # ~200 chars ≈ 1–2 sentences ≈ fastest first-chunk on 2 vCPU
+    MAX_CHUNK_CHARS: int = int(os.getenv("CB_MAX_CHUNK_CHARS", "100"))
     # Additive parallel mode (3-way split: primary + helper1 + helper2).
     ENABLE_PARALLEL_MODE: bool = _get_bool("CB_ENABLE_PARALLEL_MODE", True)
     HELPER_BASE_URL: str = os.getenv("CB_HELPER_BASE_URL", "https://shadowhunter222-chab2.hf.space").strip()

text_processor.py CHANGED Viewed

@@ -231,23 +231,6 @@ def sanitize(text: str) -> str:
     for idx, original in tags_found:
         text = text.replace(f"§TAG{idx}§", original)
-    # 11. Ensure paralinguistic tags have spaces around them.
-    #     The model needs whitespace boundaries to properly render tags like
-    #     [clear throat]. Without spaces (e.g. "Jerry.[clear throat]I'm"),
-    #     the tag gets swallowed or produces silence instead of the sound.
-    text = re.sub(
-        r"(\w)(\[(?:" + _TAG_NAMES + r")\])",
-        r"\1 \2",
-        text,
-        flags=re.IGNORECASE,
-    )
-    text = re.sub(
-        r"(\[(?:" + _TAG_NAMES + r")\])(\w)",
-        r"\1 \2",
-        text,
-        flags=re.IGNORECASE,
-    )
     return text
@@ -303,42 +286,25 @@ def split_for_streaming(text: str, max_chars: int = Config.MAX_CHUNK_CHARS) -> L
 # ═══════════════════════════════════════════════════════════════════
 def _break_long_chunk(text: str, max_chars: int) -> List[str]:
-    """Break a chunk longer than max_chars on natural pause boundaries.
-    Priority order for break points:
-      1. Ellipsis '...' — strongest natural pause within a long sentence
-      2. Punctuation (comma, semicolon, colon, dash, !, ?)
-      3. Nearest space before limit
-      4. Look ahead slightly to avoid mid-word cuts
-    """
     parts: List[str] = []
     remaining = text
     while len(remaining) > max_chars:
         break_pos = -1
         include_break_char = False
-        # First try: break at ellipsis '...' — the strongest internal pause.
-        ellipsis_pos = remaining.rfind("...", 0, max_chars)
-        if ellipsis_pos > 0:
-            # Include all three dots in the current segment
-            break_pos = ellipsis_pos + 3
-            include_break_char = False  # already moved past the dots
-        # Then try punctuation markers (only upgrade if at a later position).
         for marker in (",", ";", ":", "—", "-", "!", "?"):
             pos = remaining.rfind(marker, 0, max_chars)
             if pos > break_pos:
                 break_pos = pos
                 include_break_char = True
-        # Space is a FALLBACK only — never override a punctuation/ellipsis break.
-        # Cutting at punctuation gives the model proper prosody cues;
-        # cutting at a random space creates mid-phrase fragments ("handle the").
-        if break_pos <= 0:
-            space_pos = remaining.rfind(" ", 0, max_chars)
-            if space_pos > 0:
-                break_pos = space_pos
-                include_break_char = False
         # If nothing before limit, look slightly ahead to avoid mid-word cuts.
         if break_pos == -1:

     for idx, original in tags_found:
         text = text.replace(f"§TAG{idx}§", original)
     return text
 # ═══════════════════════════════════════════════════════════════════
 def _break_long_chunk(text: str, max_chars: int) -> List[str]:
+    """Break a chunk longer than max_chars on commas or word boundaries."""
     parts: List[str] = []
     remaining = text
     while len(remaining) > max_chars:
         break_pos = -1
         include_break_char = False
+        # Prefer punctuation/pauses first to keep prosody natural.
         for marker in (",", ";", ":", "—", "-", "!", "?"):
             pos = remaining.rfind(marker, 0, max_chars)
             if pos > break_pos:
                 break_pos = pos
                 include_break_char = True
+        # Then prefer nearest space before limit.
+        space_pos = remaining.rfind(" ", 0, max_chars)
+        if space_pos > break_pos:
+            break_pos = space_pos
+            include_break_char = False
         # If nothing before limit, look slightly ahead to avoid mid-word cuts.
         if break_pos == -1: