Spaces:

ShadowHunter222
/

Chab

Running

App Files Files Community

ShadowHunter222 committed on Apr 13

Commit

9ea9ec8

verified ·

1 Parent(s): 1ff75e7

Upload 10 files

Browse files

Files changed (11) hide show

.gitattributes +4 -0
3cpo_prompt.wav +3 -0
Dockerfile +33 -0
aave_female_01_prompt.wav +3 -0
app.py +962 -0
chatterbox_wrapper.py +733 -0
config.py +102 -0
her_prompt.wav +3 -0
ivr_female_02_prompt.wav +3 -0
requirements.txt +24 -0
text_processor.py +331 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+3cpo_prompt.wav filter=lfs diff=lfs merge=lfs -text
+aave_female_01_prompt.wav filter=lfs diff=lfs merge=lfs -text
+her_prompt.wav filter=lfs diff=lfs merge=lfs -text
+ivr_female_02_prompt.wav filter=lfs diff=lfs merge=lfs -text

3cpo_prompt.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a830bbf5494096e593dcfb6e099cfa334cb8b0b34d1403c69d36c02649c5ab15
+size 513452

Dockerfile ADDED Viewed

	@@ -0,0 +1,33 @@

+# Slim Python 3.11 base image for the FastAPI TTS server.
+FROM python:3.11-slim
+# Audio codec libraries for soundfile/librosa
+# (apt package lists are removed in the same layer to keep the image small)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends libsndfile1 ffmpeg && \
+    rm -rf /var/lib/apt/lists/*
+# All following COPY/RUN/CMD instructions operate relative to /app.
+WORKDIR /app
+# Install PyTorch CPU first (from dedicated index for smaller size)
+RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu
+# Install remaining dependencies
+# (requirements.txt is copied before the application code so this layer is
+# only rebuilt when the dependency list changes)
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy application code
+COPY config.py text_processor.py chatterbox_wrapper.py app.py ./
+# Copy the bundled reference voice prompts next to the application code.
+COPY 3cpo_prompt.wav aave_female_01_prompt.wav her_prompt.wav ivr_female_02_prompt.wav ./
+# Pre-download ONNX models + tokenizer at build time
+# (constructs the wrapper with download_only=True, so the build fails early
+# if the download step raises)
+RUN python -c "\
+from chatterbox_wrapper import ChatterboxWrapper; \
+ChatterboxWrapper(download_only=True); \
+print('Models pre-downloaded successfully')"
+# Prevent thread oversubscription in production
+ENV OMP_NUM_THREADS=1
+ENV MKL_NUM_THREADS=1
+ENV OPENBLAS_NUM_THREADS=1
+# Port the uvicorn server below listens on.
+EXPOSE 7860
+# Single uvicorn worker process serving the `app` object from app.py.
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]

aave_female_01_prompt.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:971a3568a5a1521612bff565ed416aea62e30da3e00a53d771ff2c26da78276d
+size 1217636

app.py ADDED Viewed

	@@ -0,0 +1,962 @@

+"""
+Chatterbox Turbo TTS -- FastAPI Server
+======================================
+Production-ready API with true real-time MP3 streaming,
+in-memory voice cloning, and fully non-blocking inference.
+Endpoints:
+  GET  /health              -> health check + optional warmup
+  GET  /info                -> model info, supported tags, parameters
+  GET  /voices              -> list built-in voices and the default voice
+  POST /tts                 -> full audio response (WAV/MP3/FLAC)
+  POST /tts/stream          -> chunked MP3 streaming (MediaSource-ready)
+  POST /tts/true-stream     -> alias for /tts/stream (Kokoro compat)
+  POST /tts/parallel-stream -> odd/even chunk split across primary + helper
+  POST /tts/stop/{stream_id}-> cancel a specific active stream
+  POST /tts/stop            -> cancel ALL active streams
+  POST /v1/audio/speech     -> OpenAI-compatible streaming
+"""
+import asyncio
+import io
+import json
+import logging
+import queue as stdlib_queue
+import threading
+import time
+import urllib.error
+import urllib.parse
+import urllib.request
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+from typing import Generator, Optional
+import numpy as np
+import soundfile as sf
+from fastapi import FastAPI, File, Form, HTTPException, Request, UploadFile
+from fastapi.responses import Response, StreamingResponse
+from contextlib import asynccontextmanager
+from config import Config
+from chatterbox_wrapper import ChatterboxWrapper, GenerationCancelled, VoiceProfile
+import text_processor
+# ── Logging ───────────────────────────────────────────────────────
+# Root logging configuration: INFO level, one line per record with a
+# time-of-day stamp (no date), padded level name, logger name and message.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s │ %(levelname)-7s │ %(name)s │ %(message)s",
+    datefmt="%H:%M:%S",
+)
+# Module-level logger used by every endpoint and helper in this file.
+logger = logging.getLogger(__name__)
+# ── Thread pool for CPU-bound inference ───────────────────────────
+# Shared pool that endpoints hand blocking wrapper calls to via
+# loop.run_in_executor (voice encoding, warmup, full-utterance synthesis).
+# Its size comes from Config.MAX_WORKERS; it is shut down in `lifespan`.
+tts_executor = ThreadPoolExecutor(max_workers=Config.MAX_WORKERS)
+# ── Lifespan ──────────────────────────────────────────────────────
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    try:
+        wrapper = ChatterboxWrapper()
+        app.state.wrapper = wrapper
+        logger.info("✅ Model loaded, server ready")
+    except Exception as e:
+        logger.error(f"❌ Model loading failed: {e}")
+        raise
+    yield
+    tts_executor.shutdown(wait=False)
+# The ASGI application object served by uvicorn ("app:app").
+# Interactive API docs are exposed at /docs; model loading and executor
+# shutdown are handled by the `lifespan` context manager above.
+app = FastAPI(
+    title="Chatterbox Turbo TTS API",
+    version="1.0.0",
+    docs_url="/docs",
+    lifespan=lifespan,
+)
+# ── CORS Middleware ───────────────────────────────────────────────
+@app.middleware("http")
+async def cors_middleware(request: Request, call_next):
+    origin = request.headers.get("origin")
+    # Preflight
+    if request.method == "OPTIONS" and origin in Config.ALLOWED_ORIGINS:
+        return Response(
+            status_code=200,
+            headers={
+                "Access-Control-Allow-Origin": origin,
+                "Access-Control-Allow-Methods": "*",
+                "Access-Control-Allow-Headers": "*",
+                "Access-Control-Allow-Credentials": "true",
+            },
+        )
+    if not origin or origin in Config.ALLOWED_ORIGINS:
+        response = await call_next(request)
+        if origin:
+            response.headers["Access-Control-Allow-Origin"] = origin
+            response.headers["Access-Control-Allow-Credentials"] = "true"
+            response.headers["Access-Control-Allow-Methods"] = "*"
+            response.headers["Access-Control-Allow-Headers"] = "*"
+            response.headers["Access-Control-Expose-Headers"] = "X-Stream-Id"
+        return response
+    logger.warning(f"🚫 Blocked origin: {origin}")
+    return Response(status_code=403, content="Forbidden: Origin not allowed")
+# ═══════════════════════════════════════════════════════════════════
+# Helper: resolve voice from optional upload
+# ═══════════════════════════════════════════════════════════════════
+async def _resolve_voice(
+    voice_ref: Optional[UploadFile],
+    voice_name: str,
+    wrapper: ChatterboxWrapper,
+) -> VoiceProfile:
+    """Return a VoiceProfile from uploaded audio, built-in voice name, or default."""
+    # 1) If a file was uploaded, encode it (highest priority)
+    if voice_ref is not None and voice_ref.filename:
+        audio_bytes = await voice_ref.read()
+        if len(audio_bytes) > Config.MAX_VOICE_UPLOAD_BYTES:
+            raise HTTPException(status_code=413, detail="Voice file too large (max 10 MB)")
+        if len(audio_bytes) == 0:
+            raise HTTPException(status_code=400, detail="Empty voice file")
+        loop = asyncio.get_running_loop()
+        try:
+            return await loop.run_in_executor(
+                tts_executor, wrapper.encode_voice_from_bytes, audio_bytes
+            )
+        except ValueError as e:
+            raise HTTPException(status_code=400, detail=str(e))
+        except Exception as e:
+            logger.error(f"Voice encoding failed: {e}")
+            raise HTTPException(
+                status_code=400,
+                detail=f"Could not process voice file: {str(e)}. "
+                       f"Supported formats: WAV, MP3, MPEG, M4A, OGG, FLAC, WebM."
+            )
+    # 2) Resolve by built-in voice name (returns cached profile — no encoding)
+    try:
+        return wrapper.get_builtin_voice(voice_name)
+    except (ValueError, KeyError) as e:
+        raise HTTPException(status_code=400, detail=str(e))
+# ═══════════════════════════════════════════════════════════════════
+# Helper: encode numpy audio to bytes in given format
+# ═══════════════════════════════════════════════════════════════════
+def _encode_audio(audio: np.ndarray, fmt: str = "wav") -> tuple[bytes, str]:
+    buf = io.BytesIO()
+    fmt_lower = fmt.lower()
+    if fmt_lower == "mp3":
+        sf.write(buf, audio, Config.SAMPLE_RATE, format="mp3")
+        media = "audio/mpeg"
+    elif fmt_lower == "flac":
+        sf.write(buf, audio, Config.SAMPLE_RATE, format="flac")
+        media = "audio/flac"
+    else:
+        sf.write(buf, audio, Config.SAMPLE_RATE, format="wav")
+        media = "audio/wav"
+    return buf.getvalue(), media
+def _encode_mp3_chunk(audio: np.ndarray) -> bytes:
+    """Encode one numpy chunk to MP3 bytes (same encoder path as current server)."""
+    data, _ = _encode_audio(audio, fmt="mp3")
+    return data
+def _build_helper_endpoint(base_url: str, path: str) -> str:
+    return f"{base_url.rstrip('/')}{path}"
+def _internal_headers() -> dict[str, str]:
+    headers = {"Content-Type": "application/json", "Accept": "audio/mpeg"}
+    if Config.INTERNAL_SHARED_SECRET:
+        headers["X-Internal-Secret"] = Config.INTERNAL_SHARED_SECRET
+    return headers
+def _helper_request_chunk(
+    helper_base_url: str,
+    payload: dict,
+    timeout_sec: float,
+) -> bytes:
+    url = _build_helper_endpoint(helper_base_url, "/internal/chunk/synthesize")
+    body = json.dumps(payload).encode("utf-8")
+    req = urllib.request.Request(
+        url=url,
+        data=body,
+        headers=_internal_headers(),
+        method="POST",
+    )
+    with urllib.request.urlopen(req, timeout=timeout_sec) as resp:
+        return resp.read()
+def _helper_register_voice(
+    helper_base_url: str,
+    stream_id: str,
+    audio_bytes: bytes,
+    timeout_sec: float,
+) -> str:
+    """Register reference voice on helper once, return voice_key for chunk calls."""
+    query = urllib.parse.urlencode({"stream_id": stream_id})
+    url = _build_helper_endpoint(helper_base_url, f"/internal/voice/register?{query}")
+    headers = {"Content-Type": "application/octet-stream", "Accept": "application/json"}
+    if Config.INTERNAL_SHARED_SECRET:
+        headers["X-Internal-Secret"] = Config.INTERNAL_SHARED_SECRET
+    req = urllib.request.Request(
+        url=url,
+        data=audio_bytes,
+        headers=headers,
+        method="POST",
+    )
+    with urllib.request.urlopen(req, timeout=timeout_sec) as resp:
+        data = json.loads(resp.read().decode("utf-8"))
+    voice_key = (data.get("voice_key") or "").strip()
+    if not voice_key:
+        raise RuntimeError("helper voice registration returned no voice_key")
+    return voice_key
+def _helper_cancel_stream(helper_base_url: str, stream_id: str):
+    """Best-effort cancellation signal to helper."""
+    try:
+        url = _build_helper_endpoint(helper_base_url, f"/internal/chunk/cancel/{stream_id}")
+        req = urllib.request.Request(
+            url=url,
+            data=b"",
+            headers=_internal_headers(),
+            method="POST",
+        )
+        with urllib.request.urlopen(req, timeout=3.0):
+            pass
+    except Exception:
+        pass
+# ═══════════════════════════════════════════════════════════════════
+# Endpoints
+# ═══════════════════════════════════════════════════════════════════
+@app.get("/health")
+async def health(warm_up: bool = False):
+    wrapper: ChatterboxWrapper = getattr(app.state, "wrapper", None)
+    status = {
+        "status": "healthy" if wrapper else "loading",
+        "model_loaded": wrapper is not None,
+        "model_dtype": Config.MODEL_DTYPE,
+        "streaming_supported": True,
+        "voice_cache_entries": wrapper._voice_cache.size if wrapper else 0,
+    }
+    if warm_up and wrapper:
+        try:
+            loop = asyncio.get_running_loop()
+            await loop.run_in_executor(tts_executor, wrapper.warmup)
+            status["warm_up"] = "success"
+        except Exception as e:
+            status["warm_up"] = f"failed: {e}"
+    return status
+@app.get("/info")
+async def info():
+    return {
+        "model": Config.MODEL_ID,
+        "dtype": Config.MODEL_DTYPE,
+        "sample_rate": Config.SAMPLE_RATE,
+        "paralinguistic_tags": list(Config.PARALINGUISTIC_TAGS),
+        "tag_usage": "Insert tags directly in text, e.g. 'That is so funny! [laugh] Anyway…'",
+        "parameters": {
+            "max_new_tokens": {"default": Config.MAX_NEW_TOKENS, "range": "64–2048"},
+            "repetition_penalty": {"default": Config.REPETITION_PENALTY, "range": "1.0–2.0"},
+        },
+        "voice_cloning": {
+            "description": "Upload 3–30s reference WAV/MP3 as 'voice_ref' field",
+            "max_upload_mb": Config.MAX_VOICE_UPLOAD_BYTES // (1024 * 1024),
+        },
+        "parallel_mode": {
+            "enabled": Config.ENABLE_PARALLEL_MODE,
+            "helper_configured": bool(Config.HELPER_BASE_URL),
+            "helper_base_url": Config.HELPER_BASE_URL or None,
+            "supports_voice_ref": True,
+        },
+    }
+@app.get("/voices")
+async def list_voices():
+    """Return all built-in voices available for selection."""
+    wrapper: ChatterboxWrapper = getattr(app.state, "wrapper", None)
+    if not wrapper:
+        raise HTTPException(503, "Model not loaded")
+    return {
+        "default": wrapper.default_voice_name,
+        "voices": wrapper.list_builtin_voices(),
+    }
+# ── POST /tts ─────────────────────────────────────────────────────
+@app.post("/tts", response_class=Response)
+async def text_to_speech(
+    text: str = Form(...),
+    voice_ref: Optional[UploadFile] = File(None),
+    voice_name: str = Form("default"),
+    output_format: str = Form("wav"),
+    max_new_tokens: int = Form(Config.MAX_NEW_TOKENS),
+    repetition_penalty: float = Form(Config.REPETITION_PENALTY),
+):
+    """Generate complete audio for the given text."""
+    wrapper: ChatterboxWrapper = getattr(app.state, "wrapper", None)
+    if not wrapper:
+        raise HTTPException(503, "Model not loaded")
+    if not text or not text.strip():
+        raise HTTPException(400, "Text is required")
+    voice = await _resolve_voice(voice_ref, voice_name, wrapper)
+    loop = asyncio.get_running_loop()
+    try:
+        audio = await loop.run_in_executor(
+            tts_executor,
+            wrapper.generate_speech,
+            text, voice, max_new_tokens, repetition_penalty,
+        )
+    except ValueError as e:
+        raise HTTPException(400, str(e))
+    except Exception as e:
+        logger.error(f"TTS error: {e}")
+        raise HTTPException(500, "Internal server error")
+    data, media_type = _encode_audio(audio, output_format)
+    return Response(
+        content=data,
+        media_type=media_type,
+        headers={"Content-Disposition": f"attachment; filename=tts_output.{output_format}"},
+    )
+# ═══════════════════════════════════════════════════════════════════
+# Active Stream Registry (for cancellation)
+# ═══════════════════════════════════════════════════════════════════
+# stream_id -> cancellation Event of a stream currently being generated here.
+# Stream generators register on start and remove their entry when they finish.
+_active_streams: dict[str, threading.Event] = {}
+# stream_ids that the internal chunk endpoint refuses to synthesize for (409).
+_internal_cancelled_streams: set[str] = set()
+# Held around access to the two `_internal_*` collections in the internal endpoints.
+_internal_cancel_lock = threading.Lock()
+# stream_id -> voice keys registered for that stream via /internal/voice/register.
+_internal_stream_voice_keys: dict[str, set[str]] = {}
+# ═══════════════════════════════════════════════════════════════════
+# Pipeline Streaming Generator
+# ═══════════════════════════════════════════════════════════════════
+def _pipeline_stream_generator(
+    wrapper: ChatterboxWrapper,
+    text: str,
+    voice: VoiceProfile,
+    max_new_tokens: int,
+    repetition_penalty: float,
+    stream_id: str,
+) -> Generator[bytes, None, None]:
+    """Two-stage producer-consumer pipeline for minimal inter-chunk gaps.
+    Architecture:
+      Producer thread (heavyweight, ~80% CPU):
+        ONNX token generation → audio decoding → raw numpy arrays → queue
+      Consumer (this generator, lightweight, ~20% CPU):
+        queue → MP3 encode → yield to HTTP response
+    Why this helps:
+      - ONNX model runs CONTINUOUSLY without waiting for MP3 encode or HTTP
+      - MP3 encoding (libsndfile, C code) releases GIL → true parallelism
+      - ONNX inference (C++ code) also releases GIL → both run simultaneously
+      - Queue(maxsize=2) lets producer stay 1-2 chunks ahead
+    Cancellation:
+      - cancel_event checked between chunks + every 25 autoregressive steps
+      - Client disconnect triggers GeneratorExit → finally sets cancel
+      - /tts/stop endpoint sets cancel externally
+    """
+    cancel_event = threading.Event()
+    _active_streams[stream_id] = cancel_event
+    # Raw audio buffer: producer puts numpy arrays, consumer takes them
+    audio_buffer: stdlib_queue.Queue = stdlib_queue.Queue(maxsize=2)
+    def _producer():
+        """Heavyweight worker: runs ONNX model continuously."""
+        try:
+            for audio_chunk in wrapper.stream_speech(
+                text, voice,
+                max_new_tokens=max_new_tokens,
+                repetition_penalty=repetition_penalty,
+                is_cancelled=cancel_event.is_set,
+            ):
+                if cancel_event.is_set():
+                    break
+                while not cancel_event.is_set():
+                    try:
+                        audio_buffer.put(audio_chunk, timeout=0.1)
+                        break
+                    except stdlib_queue.Full:
+                        continue
+        except GenerationCancelled:
+            logger.info(f"[{stream_id}] Generation cancelled")
+        except Exception as e:
+            while not cancel_event.is_set():
+                try:
+                    audio_buffer.put(e, timeout=0.1)
+                    break
+                except stdlib_queue.Full:
+                    continue
+        finally:
+            while not cancel_event.is_set():
+                try:
+                    audio_buffer.put(None, timeout=0.1)
+                    break
+                except stdlib_queue.Full:
+                    continue
+    producer = threading.Thread(target=_producer, daemon=True)
+    producer.start()
+    try:
+        # Consumer: lightweight MP3 encoding + yield
+        while True:
+            item = audio_buffer.get()
+            if item is None:
+                break
+            if isinstance(item, Exception):
+                logger.error(f"[{stream_id}] Stream error: {item}")
+                break
+            if cancel_event.is_set():
+                break
+            # MP3 encode (C code, releases GIL, runs parallel with next ONNX step)
+            buf = io.BytesIO()
+            sf.write(buf, item, Config.SAMPLE_RATE, format="mp3")
+            yield buf.getvalue()
+    finally:
+        # Cleanup: signal producer to stop + deregister
+        cancel_event.set()
+        _active_streams.pop(stream_id, None)
+def _parallel_odd_even_stream_generator(
+    wrapper: ChatterboxWrapper,
+    text: str,
+    local_voice: VoiceProfile,
+    helper_voice_bytes: Optional[bytes],
+    max_new_tokens: int,
+    repetition_penalty: float,
+    stream_id: str,
+    helper_base_url: str,
+) -> Generator[bytes, None, None]:
+    """Additive odd/even split streamer (primary handles odd, helper handles even).
+    "Odd"/"even" count chunks from 1: the local worker synthesizes chunk
+    indices 0, 2, 4, ... and the helper worker handles indices 1, 3, 5, ...
+    Flow:
+      1. The text is stripped, truncated to ``Config.MAX_TEXT_LENGTH``,
+         sanitized and split into chunks by ``text_processor``.
+      2. Two daemon threads synthesize their share and publish MP3 bytes into
+         ``ready`` keyed by chunk index.
+      3. This generator yields chunks strictly in index order, waiting on a
+         condition variable until the next index is available.
+    Helper failures (voice registration or a chunk request, after the optional
+    single retry) switch the helper worker to local synthesis for the rest of
+    its chunks. The first exception raised by either worker ends the stream
+    once the consumer reaches a chunk that is not available.
+    Args:
+        helper_voice_bytes: Reference audio to register on the helper before
+            its first chunk, or ``None`` to let the helper use its own default.
+        stream_id: Key under which the cancel event is registered in
+            ``_active_streams``; also sent to the helper with every request.
+    Yields:
+        MP3 bytes per chunk, in chunk order.
+    """
+    cancel_event = threading.Event()
+    _active_streams[stream_id] = cancel_event
+    clean_text = text_processor.sanitize(text.strip()[: Config.MAX_TEXT_LENGTH])
+    chunks = text_processor.split_for_streaming(clean_text)
+    total_chunks = len(chunks)
+    if total_chunks == 0:
+        # Nothing to synthesize: deregister and end the stream immediately.
+        _active_streams.pop(stream_id, None)
+        return
+    # Shared state between the two workers and the consumer, guarded by `cond`.
+    lock = threading.Lock()
+    cond = threading.Condition(lock)
+    ready: dict[int, bytes] = {}
+    first_error: Optional[Exception] = None
+    workers_done = 0
+    def _publish(idx: int, data: bytes):
+        # Store a finished chunk and wake the consumer.
+        with cond:
+            ready[idx] = data
+            cond.notify_all()
+    def _set_error(err: Exception):
+        # Record only the first worker failure; later ones are dropped.
+        nonlocal first_error
+        with cond:
+            if first_error is None:
+                first_error = err
+            cond.notify_all()
+    def _worker_done():
+        # Count a finished worker so the consumer can detect a missing chunk.
+        nonlocal workers_done
+        with cond:
+            workers_done += 1
+            cond.notify_all()
+    def _synth_local(chunk_text: str) -> bytes:
+        # Full-chunk synthesis on this instance, encoded to MP3.
+        audio = wrapper.generate_speech(
+            chunk_text,
+            local_voice,
+            max_new_tokens=max_new_tokens,
+            repetition_penalty=repetition_penalty,
+        )
+        return _encode_mp3_chunk(audio)
+    def _odd_worker():
+        # Local worker: chunk indices 0, 2, 4, ...
+        try:
+            for idx in range(0, total_chunks, 2):
+                if cancel_event.is_set():
+                    break
+                data = _synth_local(chunks[idx])
+                _publish(idx, data)
+        except Exception as e:
+            _set_error(e)
+        finally:
+            _worker_done()
+    def _even_worker():
+        # Helper worker: chunk indices 1, 3, 5, ... via HTTP, with local fallback.
+        helper_available = True
+        helper_voice_key: Optional[str] = None
+        try:
+            if helper_voice_bytes:
+                # Register the reference voice on the helper once per stream.
+                attempts = 2 if Config.HELPER_RETRY_ONCE else 1
+                last_err: Optional[Exception] = None
+                for _ in range(attempts):
+                    try:
+                        helper_voice_key = _helper_register_voice(
+                            helper_base_url=helper_base_url,
+                            stream_id=stream_id,
+                            audio_bytes=helper_voice_bytes,
+                            timeout_sec=max(1.0, Config.HELPER_TIMEOUT_SEC),
+                        )
+                        last_err = None
+                        break
+                    except Exception as reg_err:
+                        last_err = reg_err
+                        continue
+                if last_err is not None:
+                    helper_available = False
+                    logger.warning(
+                        f"[{stream_id}] Helper voice registration failed; "
+                        "falling back to local synthesis for even chunks"
+                    )
+            for idx in range(1, total_chunks, 2):
+                if cancel_event.is_set():
+                    break
+                if helper_available:
+                    payload = {
+                        "stream_id": stream_id,
+                        "chunk_index": idx,
+                        "text": chunks[idx],
+                        "max_new_tokens": max_new_tokens,
+                        "repetition_penalty": repetition_penalty,
+                        "output_format": "mp3",
+                    }
+                    if helper_voice_key:
+                        payload["voice_key"] = helper_voice_key
+                    attempts = 2 if Config.HELPER_RETRY_ONCE else 1
+                    last_err: Optional[Exception] = None
+                    for _ in range(attempts):
+                        try:
+                            helper_data = _helper_request_chunk(
+                                helper_base_url=helper_base_url,
+                                payload=payload,
+                                timeout_sec=max(1.0, Config.HELPER_TIMEOUT_SEC),
+                            )
+                            _publish(idx, helper_data)
+                            last_err = None
+                            break
+                        except Exception as helper_err:
+                            last_err = helper_err
+                            continue
+                    if last_err is None:
+                        # Helper delivered this chunk; move on to the next index.
+                        continue
+                    # Helper is treated as down for the remainder of the stream.
+                    helper_available = False
+                    logger.warning(
+                        f"[{stream_id}] Helper failed at chunk {idx}; "
+                        "falling back to local synthesis for remaining even chunks"
+                    )
+                # Local fallback for even chunks
+                data = _synth_local(chunks[idx])
+                _publish(idx, data)
+        except Exception as e:
+            _set_error(e)
+        finally:
+            _worker_done()
+    odd_thread = threading.Thread(target=_odd_worker, daemon=True)
+    even_thread = threading.Thread(target=_even_worker, daemon=True)
+    odd_thread.start()
+    even_thread.start()
+    next_idx = 0
+    try:
+        while next_idx < total_chunks:
+            with cond:
+                # Wait until the next chunk is ready, or until an error,
+                # cancellation, or both workers finishing makes waiting pointless.
+                while (
+                    next_idx not in ready
+                    and first_error is None
+                    and not cancel_event.is_set()
+                    and workers_done < 2
+                ):
+                    cond.wait(timeout=0.1)
+                if cancel_event.is_set():
+                    break
+                if next_idx in ready:
+                    # A ready chunk is delivered even if an error was recorded
+                    # for a later index.
+                    data = ready.pop(next_idx)
+                elif first_error is not None:
+                    logger.error(f"[{stream_id}] Parallel stream error: {first_error}")
+                    break
+                elif workers_done >= 2:
+                    logger.error(
+                        f"[{stream_id}] Parallel stream ended with missing chunk index {next_idx}"
+                    )
+                    break
+                else:
+                    continue
+            # Yield outside the lock so workers are not blocked while the
+            # response is being written.
+            yield data
+            next_idx += 1
+    finally:
+        # Runs on normal completion, error, and generator close alike:
+        # stop both workers, notify the helper, then deregister the stream.
+        cancel_event.set()
+        _helper_cancel_stream(helper_base_url, stream_id)
+        # Bounded joins: a worker in the middle of synthesis may outlive them.
+        odd_thread.join(timeout=1.0)
+        even_thread.join(timeout=1.0)
+        _active_streams.pop(stream_id, None)
+# ── POST /tts/stream & /tts/true-stream ──────────────────────────
+@app.post("/tts/stream")
+@app.post("/tts/true-stream")
+async def stream_text_to_speech(
+    text: str = Form(...),
+    voice_ref: Optional[UploadFile] = File(None),
+    voice_name: str = Form("default"),
+    max_new_tokens: int = Form(Config.MAX_NEW_TOKENS),
+    repetition_penalty: float = Form(Config.REPETITION_PENALTY),
+):
+    """True real-time streaming: yields MP3 chunks as each sentence finishes.
+    Response includes X-Stream-Id header for cancellation via /tts/stop.
+    Compatible with frontend's MediaSource + ReadableStream pattern.
+    """
+    wrapper: ChatterboxWrapper = getattr(app.state, "wrapper", None)
+    if not wrapper:
+        raise HTTPException(503, "Model not loaded")
+    if not text or not text.strip():
+        raise HTTPException(400, "Text is required")
+    voice = await _resolve_voice(voice_ref, voice_name, wrapper)
+    stream_id = uuid.uuid4().hex[:12]
+    return StreamingResponse(
+        _pipeline_stream_generator(
+            wrapper, text, voice, max_new_tokens, repetition_penalty, stream_id,
+        ),
+        media_type="audio/mpeg",
+        headers={
+            "Content-Disposition": "attachment; filename=tts_stream.mp3",
+            "Transfer-Encoding": "chunked",
+            "X-Stream-Id": stream_id,
+            "X-Streaming-Type": "true-realtime",
+            "Cache-Control": "no-cache",
+        },
+    )
+@app.post("/tts/parallel-stream")
+async def parallel_stream_text_to_speech(
+    text: str = Form(...),
+    voice_ref: Optional[UploadFile] = File(None),
+    voice_name: str = Form("default"),
+    max_new_tokens: int = Form(Config.MAX_NEW_TOKENS),
+    repetition_penalty: float = Form(Config.REPETITION_PENALTY),
+    helper_url: Optional[str] = Form(None),
+):
+    """Additive odd/even split stream mode (primary + helper)."""
+    wrapper: ChatterboxWrapper = getattr(app.state, "wrapper", None)
+    if not wrapper:
+        raise HTTPException(503, "Model not loaded")
+    if not Config.ENABLE_PARALLEL_MODE:
+        raise HTTPException(503, "Parallel mode is disabled")
+    if not text or not text.strip():
+        raise HTTPException(400, "Text is required")
+    local_voice: VoiceProfile = wrapper.default_voice
+    helper_voice_bytes: Optional[bytes] = None
+    if voice_ref is not None and voice_ref.filename:
+        helper_voice_bytes = await voice_ref.read()
+        if len(helper_voice_bytes) > Config.MAX_VOICE_UPLOAD_BYTES:
+            raise HTTPException(status_code=413, detail="Voice file too large (max 10 MB)")
+        if len(helper_voice_bytes) == 0:
+            raise HTTPException(status_code=400, detail="Empty voice file")
+        loop = asyncio.get_running_loop()
+        try:
+            local_voice = await loop.run_in_executor(
+                tts_executor, wrapper.encode_voice_from_bytes, helper_voice_bytes
+            )
+        except Exception as e:
+            logger.error(f"Parallel voice encoding failed: {e}")
+            raise HTTPException(400, "Could not process voice file for parallel mode")
+    else:
+        # Built-in voice selected by name — resolve locally and prepare
+        # bytes for helper registration so helpers cache the same hash.
+        try:
+            selected_voice_id = wrapper.resolve_voice_id(voice_name)
+            local_voice = wrapper.get_builtin_voice(selected_voice_id)
+        except ValueError as e:
+            raise HTTPException(status_code=400, detail=str(e))
+        # Only send bytes to helper if a non-default voice was selected,
+        # because the helper's own default is already loaded.
+        if selected_voice_id != wrapper.default_voice_name:
+            helper_voice_bytes = wrapper.get_builtin_voice_bytes(selected_voice_id)
+            if not helper_voice_bytes:
+                raise HTTPException(
+                    status_code=400,
+                    detail=f"Selected voice '{voice_name}' is unavailable for helper registration",
+                )
+    resolved_helper = (helper_url or Config.HELPER_BASE_URL).strip()
+    if not resolved_helper:
+        raise HTTPException(
+            400,
+            "Helper URL not configured. Set CB_HELPER_BASE_URL or pass helper_url.",
+        )
+    stream_id = uuid.uuid4().hex[:12]
+    return StreamingResponse(
+        _parallel_odd_even_stream_generator(
+            wrapper=wrapper,
+            text=text,
+            local_voice=local_voice,
+            helper_voice_bytes=helper_voice_bytes,
+            max_new_tokens=max_new_tokens,
+            repetition_penalty=repetition_penalty,
+            stream_id=stream_id,
+            helper_base_url=resolved_helper,
+        ),
+        media_type="audio/mpeg",
+        headers={
+            "Content-Disposition": "attachment; filename=tts_parallel_stream.mp3",
+            "Transfer-Encoding": "chunked",
+            "X-Stream-Id": stream_id,
+            "X-Streaming-Type": "parallel-odd-even",
+            "Cache-Control": "no-cache",
+        },
+    )
+# ── JSON body variant (Kokoro/OpenAI compatibility) ───────────────
+from pydantic import BaseModel, Field
+# JSON body accepted by POST /internal/chunk/synthesize. The primary instance
+# builds the same fields when it asks a helper to synthesize one text chunk.
+class InternalChunkRequest(BaseModel):
+    # Stream this chunk belongs to; checked against the cancelled-stream set.
+    stream_id: str = Field(..., min_length=1, max_length=64)
+    # Zero-based position of the chunk within the stream.
+    chunk_index: int = Field(..., ge=0)
+    # Text of this single chunk.
+    text: str = Field(..., min_length=1, max_length=10000)
+    max_new_tokens: int = Field(default=Config.MAX_NEW_TOKENS, ge=64, le=2048)
+    repetition_penalty: float = Field(default=Config.REPETITION_PENALTY, ge=1.0, le=2.0)
+    output_format: str = Field(default="mp3")
+    # Key returned by /internal/voice/register; omitted to use the default voice.
+    voice_key: Optional[str] = Field(default=None, min_length=1, max_length=64)
+# JSON request body for the JSON-based TTS variant (Kokoro/OpenAI compatibility).
+class TTSJsonRequest(BaseModel):
+    # Text to synthesize (1 to 50000 characters).
+    text: str = Field(..., min_length=1, max_length=50000)
+    # Voice selector; "default" when omitted.
+    voice: str = Field(default="default")
+    speed: float = Field(default=1.0, ge=0.5, le=2.0)  # reserved for future use
+    max_new_tokens: int = Field(default=Config.MAX_NEW_TOKENS, ge=64, le=2048)
+    repetition_penalty: float = Field(default=Config.REPETITION_PENALTY, ge=1.0, le=2.0)
+@app.post("/internal/voice/register")
+async def internal_voice_register(http_request: Request):
+    """Register voice once for a stream; returns reusable voice_key."""
+    if Config.INTERNAL_SHARED_SECRET:
+        provided = http_request.headers.get("X-Internal-Secret", "")
+        if provided != Config.INTERNAL_SHARED_SECRET:
+            raise HTTPException(403, "Forbidden")
+    wrapper: ChatterboxWrapper = getattr(app.state, "wrapper", None)
+    if not wrapper:
+        raise HTTPException(503, "Model not loaded")
+    audio_bytes = await http_request.body()
+    if len(audio_bytes) > Config.MAX_VOICE_UPLOAD_BYTES:
+        raise HTTPException(status_code=413, detail="Voice file too large (max 10 MB)")
+    if len(audio_bytes) == 0:
+        raise HTTPException(status_code=400, detail="Empty voice file")
+    loop = asyncio.get_running_loop()
+    try:
+        voice = await loop.run_in_executor(
+            tts_executor, wrapper.encode_voice_from_bytes, audio_bytes
+        )
+    except Exception as e:
+        logger.error(f"[internal] voice register failed: {e}")
+        raise HTTPException(400, "Voice registration failed")
+    voice_key = (voice.audio_hash or "").strip()
+    if not voice_key:
+        raise HTTPException(500, "Voice key unavailable")
+    stream_id = (http_request.query_params.get("stream_id") or "").strip()
+    if stream_id:
+        with _internal_cancel_lock:
+            keys = _internal_stream_voice_keys.setdefault(stream_id, set())
+            keys.add(voice_key)
+    return {"status": "registered", "voice_key": voice_key}
+@app.post("/internal/chunk/synthesize")
+async def internal_chunk_synthesize(
+    request: InternalChunkRequest,
+    http_request: Request,
+):
+    """Internal endpoint used by primary/helper parallel routing."""
+    if Config.INTERNAL_SHARED_SECRET:
+        provided = http_request.headers.get("X-Internal-Secret", "")
+        if provided != Config.INTERNAL_SHARED_SECRET:
+            raise HTTPException(403, "Forbidden")
+    with _internal_cancel_lock:
+        if request.stream_id in _internal_cancelled_streams:
+            raise HTTPException(409, "Stream already cancelled")
+    wrapper: ChatterboxWrapper = getattr(app.state, "wrapper", None)
+    if not wrapper:
+        raise HTTPException(503, "Model not loaded")
+    voice_profile = wrapper.default_voice
+    if request.voice_key:
+        cached_voice = wrapper._voice_cache.get(request.voice_key)
+        if cached_voice is None:
+            # Built-in voices are permanent in wrapper registry even if TTL cache entry expired.
+            cached_voice = wrapper.get_builtin_voice_by_hash(request.voice_key)
+        if cached_voice is None:
+            raise HTTPException(409, "Voice key expired or not found")
+        voice_profile = cached_voice
+    loop = asyncio.get_running_loop()
+    try:
+        audio = await loop.run_in_executor(
+            tts_executor,
+            wrapper.generate_speech,
+            request.text,
+            voice_profile,
+            request.max_new_tokens,
+            request.repetition_penalty,
+        )
+    except Exception as e:
+        logger.error(f"[internal] chunk {request.chunk_index} failed: {e}")
+        raise HTTPException(500, "Chunk synthesis failed")
+    fmt = (request.output_format or "mp3").lower()
+    if fmt not in {"mp3", "wav", "flac"}:
+        fmt = "mp3"
+    data, media_type = _encode_audio(audio, fmt=fmt)
+    return Response(
+        content=data,
+        media_type=media_type,
+        headers={
+            "X-Stream-Id": request.stream_id,
+            "X-Chunk-Index": str(request.chunk_index),
+        },
+    )
+@app.post("/internal/chunk/cancel/{stream_id}")
+async def internal_chunk_cancel(stream_id: str, http_request: Request):
+    if Config.INTERNAL_SHARED_SECRET:
+        provided = http_request.headers.get("X-Internal-Secret", "")
+        if provided != Config.INTERNAL_SHARED_SECRET:
+            raise HTTPException(403, "Forbidden")
+    with _internal_cancel_lock:
+        _internal_cancelled_streams.add(stream_id)
+        _internal_stream_voice_keys.pop(stream_id, None)
+    return {"status": "cancelled", "stream_id": stream_id}
+@app.post("/v1/audio/speech")
+async def openai_compatible_tts(request: TTSJsonRequest):
+    """OpenAI-compatible streaming endpoint (JSON body, no file upload).
+    Uses built-in voice selection via `voice`. For voice cloning, use /tts/stream with FormData.
+    """
+    wrapper: ChatterboxWrapper = getattr(app.state, "wrapper", None)
+    if not wrapper:
+        raise HTTPException(503, "Model not loaded")
+    try:
+        selected_voice = wrapper.get_builtin_voice(request.voice)
+    except ValueError as e:
+        raise HTTPException(400, str(e))
+    stream_id = uuid.uuid4().hex[:12]
+    return StreamingResponse(
+        _pipeline_stream_generator(
+            wrapper, request.text, selected_voice,
+            request.max_new_tokens, request.repetition_penalty, stream_id,
+        ),
+        media_type="audio/mpeg",
+        headers={
+            "Transfer-Encoding": "chunked",
+            "X-Stream-Id": stream_id,
+            "Cache-Control": "no-cache",
+        },
+    )
+# ═══════════════════════════════════════════════════════════════════
+# Stop / Cancel Endpoint
+# ═══════════════════════════════════════════════════════════════════
+@app.post("/tts/stop/{stream_id}")
+async def stop_stream(stream_id: str):
+    """Stop an active TTS stream by its ID (from X-Stream-Id header).
+    Cancels the ONNX generation loop mid-token, freeing CPU immediately.
+    """
+    event = _active_streams.get(stream_id)
+    if event:
+        event.set()
+        logger.info(f"Stream {stream_id} cancelled by client")
+        return {"status": "stopped", "stream_id": stream_id}
+    return {"status": "not_found", "stream_id": stream_id}
+@app.post("/tts/stop")
+async def stop_all_streams():
+    """Emergency stop: cancel ALL active TTS streams."""
+    count = len(_active_streams)
+    for sid, event in list(_active_streams.items()):
+        event.set()
+    _active_streams.clear()
+    logger.info(f"Stopped all streams ({count} active)")
+    return {"status": "stopped_all", "count": count}
+# ═══════════════════════════════════════════════════════════════════
+# Entrypoint
+# ═══════════════════════════════════════════════════════════════════
+# Script entry point: serve the app with uvicorn on the configured host/port.
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host=Config.HOST, port=Config.PORT)

chatterbox_wrapper.py ADDED Viewed

	@@ -0,0 +1,733 @@

+"""
+Chatterbox Turbo TTS — ONNX Inference Wrapper
+═══════════════════════════════════════════════
+Orchestrates the 4-component ONNX pipeline:
+  embed_tokens → speech_encoder → language_model → conditional_decoder
+Optimised for lowest-latency CPU inference on 2 vCPU:
+  • Sequential execution, thread count = physical cores, no spinning
+  • Token list pre-allocation (avoids O(n²) np.concatenate in loop)
+  • In-memory voice caching (no disk writes for uploads)
+  • Robust audio loading: WAV, MP3, MPEG, M4A, OGG, FLAC, WebM
+  • Sentence-level streaming for real-time playback
+"""
+# ── Suppress harmless transformers warnings BEFORE import ─────────
+import os
+import warnings
+os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
+warnings.filterwarnings("ignore", message=".*model of type.*chatterbox.*")
+import hashlib
+import io
+import logging
+import subprocess
+import tempfile
+import time
+from collections import OrderedDict
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Callable, Generator, Optional
+import librosa
+import numpy as np
+import onnxruntime as ort
+import soundfile as soundfile_lib
+from huggingface_hub import hf_hub_download
+from transformers import AutoTokenizer
+from config import Config
+import text_processor
+logger = logging.getLogger(__name__)
+# ── Supported audio file extensions (used to pick up local voice samples) ──
+_SUPPORTED_AUDIO_EXTENSIONS = {
+    ".wav", ".mp3", ".mpeg", ".mpga", ".m4a", ".mp4",
+    ".ogg", ".oga", ".opus", ".flac", ".webm", ".aac", ".wma",
+}
+def _slugify(text: str) -> str:
+    """Convert a display name to a safe, lowercase identifier."""
+    buf = []
+    prev_underscore = False
+    for ch in text.strip().lower():
+        if ch.isalnum():
+            buf.append(ch)
+            prev_underscore = False
+        else:
+            if not prev_underscore:
+                buf.append("_")
+                prev_underscore = True
+    slug = "".join(buf).strip("_")
+    return slug or "voice"
+# ═══════════════════════════════════════════════════════════════════
+# Data Structures
+# ═══════════════════════════════════════════════════════════════════
+@dataclass
+class VoiceProfile:
+    """Cached speaker embedding extracted from reference audio.
+    The four arrays are the outputs of the speech-encoder session, stored in
+    the order that session returns them.
+    """
+    # Concatenated in front of the text embeddings (axis 1) on the first LM step.
+    cond_emb: np.ndarray
+    # Concatenated in front of the generated speech tokens before decoding.
+    prompt_token: np.ndarray
+    # Passed to the decoder session as "speaker_embeddings".
+    speaker_embeddings: np.ndarray
+    # Passed to the decoder session as "speaker_features".
+    speaker_features: np.ndarray
+    # MD5 hex digest of the source audio bytes; doubles as cache key / voice_key.
+    audio_hash: str = ""
+class GenerationCancelled(Exception):
+    """Raised when inference is cancelled by the client."""
+    pass
+# ═══════════════════════════════════════════════════════════════════
+# LRU Voice Cache
+# ═══════════════════════════════════════════════════════════════════
+class _VoiceCache:
+    """LRU cache for VoiceProfile objects with TTL-based expiration.
+    Entries auto-expire after `ttl_seconds` (default: 1 hour).
+    Re-uploading the same voice file within the TTL window returns
+    the cached profile instantly — no re-encoding needed.
+    """
+    def __init__(self, maxsize: int, ttl_seconds: int = 3600):
+        self._cache: OrderedDict[str, tuple[VoiceProfile, float]] = OrderedDict()
+        self._maxsize = maxsize
+        self._ttl = ttl_seconds
+    def _evict_expired(self):
+        """Remove all entries older than TTL."""
+        now = time.time()
+        expired = [k for k, (_, ts) in self._cache.items() if now - ts > self._ttl]
+        for k in expired:
+            del self._cache[k]
+            logger.debug(f"Voice cache expired: {k[:8]}…")
+    def get(self, key: str) -> Optional[VoiceProfile]:
+        self._evict_expired()
+        if key in self._cache:
+            profile, ts = self._cache[key]
+            remaining = self._ttl - (time.time() - ts)
+            self._cache.move_to_end(key)
+            logger.info(f"Voice cache HIT: {key[:8]}… (expires in {remaining:.0f}s)")
+            return profile
+        return None
+    def put(self, key: str, profile: VoiceProfile):
+        self._evict_expired()
+        if key in self._cache:
+            self._cache.move_to_end(key)
+        else:
+            if len(self._cache) >= self._maxsize:
+                evicted_key, _ = self._cache.popitem(last=False)
+                logger.debug(f"Voice cache evicted (LRU): {evicted_key[:8]}…")
+        self._cache[key] = (profile, time.time())
+        logger.info(f"Voice cache STORED: {key[:8]}… (TTL: {self._ttl}s, size: {len(self._cache)})")
+    @property
+    def size(self) -> int:
+        return len(self._cache)
+# ═══════════════════════════════════════════════════════════════════
+# Audio Loading (robust multi-format support)
+# ═══════════════════════════════════════════════════════════════════
+def _load_audio_bytes(audio_bytes: bytes, sr: int = 24000) -> np.ndarray:
+    """Load audio from raw bytes, supporting WAV/MP3/MPEG/M4A/OGG/FLAC/WebM.
+    Strategy: try soundfile (fast, native) → librosa (ffmpeg backend) → ffmpeg CLI.
+    """
+    buf = io.BytesIO(audio_bytes)
+    # 1) Try soundfile (handles WAV, FLAC, OGG natively — fastest)
+    try:
+        audio, file_sr = soundfile_lib.read(buf)
+        if audio.ndim > 1:
+            audio = audio.mean(axis=1)  # stereo → mono
+        if file_sr != sr:
+            audio = librosa.resample(audio.astype(np.float32), orig_sr=file_sr, target_sr=sr)
+        return audio.astype(np.float32)
+    except Exception:
+        buf.seek(0)
+    # 2) Try librosa (handles MP3 via audioread + ffmpeg backend)
+    try:
+        audio, _ = librosa.load(buf, sr=sr, mono=True)
+        return audio.astype(np.float32)
+    except Exception:
+        buf.seek(0)
+    # 3) Fallback: use ffmpeg CLI to convert to WAV in memory
+    try:
+        proc = subprocess.run(
+            ["ffmpeg", "-i", "pipe:0", "-f", "wav", "-ac", "1", "-ar", str(sr), "pipe:1"],
+            input=audio_bytes, capture_output=True, timeout=30,
+        )
+        if proc.returncode == 0 and len(proc.stdout) > 44:
+            wav_buf = io.BytesIO(proc.stdout)
+            audio, _ = soundfile_lib.read(wav_buf)
+            return audio.astype(np.float32)
+    except Exception:
+        pass
+    raise ValueError(
+        "Could not decode audio file. Supported formats: "
+        "WAV, MP3, MPEG, M4A, OGG, FLAC, WebM, AAC. "
+        "Please upload a valid audio file."
+    )
+# ═══════════════════════════════════════════════════════════════════
+# Main Wrapper
+# ═══════════════════════════════════════════════════════════════════
+class ChatterboxWrapper:
+    def __init__(self, download_only: bool = False):
+        """Download the model files and, unless ``download_only``, build the full pipeline.
+        Args:
+            download_only: When True, return right after the model files are
+                downloaded; no sessions, tokenizer, cache or voices are set up,
+                so the instance must not be used for inference.
+        """
+        self.cfg = Config
+        os.makedirs(self.cfg.MODELS_DIR, exist_ok=True)
+        logger.info(f"Downloading ONNX models (dtype={self.cfg.MODEL_DTYPE}) …")
+        self._model_paths = self._download_models()
+        if download_only:
+            return
+        logger.info(
+            f"Creating ONNX Runtime sessions "
+            f"(intra_op_threads={self.cfg.CPU_THREADS}, workers={self.cfg.MAX_WORKERS}) …"
+        )
+        # One SessionOptions object is shared by all four sessions.
+        opts = self._make_session_options()
+        providers = ["CPUExecutionProvider"]
+        self.embed_session   = ort.InferenceSession(self._model_paths["embed_tokens"],       sess_options=opts, providers=providers)
+        self.encoder_session = ort.InferenceSession(self._model_paths["speech_encoder"],      sess_options=opts, providers=providers)
+        self.lm_session      = ort.InferenceSession(self._model_paths["language_model"],      sess_options=opts, providers=providers)
+        self.decoder_session = ort.InferenceSession(self._model_paths["conditional_decoder"], sess_options=opts, providers=providers)
+        logger.info("Loading tokenizer …")
+        self.tokenizer = AutoTokenizer.from_pretrained(self.cfg.MODEL_ID)
+        self._voice_cache = _VoiceCache(
+            maxsize=self.cfg.VOICE_CACHE_SIZE,
+            ttl_seconds=self.cfg.VOICE_CACHE_TTL_SEC,
+        )
+        # Built-in voice registries; all are filled by _register_builtin_voice().
+        self._builtin_voice_profiles: dict[str, VoiceProfile] = {}   # voice id → profile
+        self._builtin_voice_bytes: dict[str, bytes] = {}             # voice id → raw audio
+        self._builtin_voice_by_hash: dict[str, VoiceProfile] = {}    # audio hash → profile
+        self._voice_alias_to_id: dict[str, str] = {}                 # alias → voice id
+        self._builtin_voice_catalog: list[dict] = []                 # metadata entries
+        self._default_voice_id: str = "default"
+        logger.info("Loading built-in voices (HF default + local samples) …")
+        # Must run after the encoder session and voice cache exist: it encodes voices.
+        self.default_voice = self._load_builtin_voices()
+        logger.info("✅ ChatterboxWrapper ready")
+    # ─── Model download ──────────────────────────────────────────
+    def _download_models(self) -> dict:
+        """Download all 4 ONNX components + weight files from HuggingFace."""
+        components = ("conditional_decoder", "speech_encoder", "embed_tokens", "language_model")
+        paths = {}
+        for name in components:
+            paths[name] = self._download_component(name, self.cfg.MODEL_DTYPE)
+        return paths
+    def _download_component(self, name: str, dtype: str) -> str:
+        if dtype == "fp32":
+            filename = f"{name}.onnx"
+        elif dtype == "q8":
+            filename = f"{name}_quantized.onnx"
+        else:
+            filename = f"{name}_{dtype}.onnx"
+        graph = hf_hub_download(
+            self.cfg.MODEL_ID, subfolder="onnx", filename=filename,
+            cache_dir=self.cfg.MODELS_DIR,
+        )
+        # Download companion weight file
+        try:
+            hf_hub_download(
+                self.cfg.MODEL_ID, subfolder="onnx", filename=f"{filename}_data",
+                cache_dir=self.cfg.MODELS_DIR,
+            )
+        except Exception:
+            pass  # Some quantized variants embed weights in-graph
+        return graph
+    # ─── Session configuration (optimised for 2 vCPU) ─────────────
+    def _make_session_options(self) -> ort.SessionOptions:
+        opts = ort.SessionOptions()
+        # Sequential execution: no parallel graph scheduling overhead
+        opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
+        # Match physical cores exactly (2 for HF Space free tier)
+        opts.intra_op_num_threads = self.cfg.CPU_THREADS
+        opts.inter_op_num_threads = 1
+        # Full graph optimisations (constant folding, fusion, etc.)
+        opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+        # Disable thread spinning — wastes CPU cycles on busy-wait
+        opts.add_session_config_entry("session.intra_op.allow_spinning", "0")
+        opts.add_session_config_entry("session.inter_op.allow_spinning", "0")
+        # Enable memory optimisations
+        opts.enable_cpu_mem_arena = True
+        opts.enable_mem_pattern = True
+        opts.enable_mem_reuse = True
+        return opts
+    # ─── Built-in voices (HF default + local samples) ────────────
+    def _download_hf_default_voice_bytes(self) -> bytes:
+        path = hf_hub_download(
+            self.cfg.DEFAULT_VOICE_REPO,
+            filename=self.cfg.DEFAULT_VOICE_FILE,
+            cache_dir=self.cfg.MODELS_DIR,
+        )
+        return Path(path).read_bytes()
+    def _list_local_voice_paths(self) -> list[Path]:
+        wrapper_dir = Path(__file__).resolve().parent
+        # Support both module-level and repo-root deployment layouts.
+        candidates = []
+        for d in (wrapper_dir, Path.cwd().resolve(), wrapper_dir.parent):
+            try:
+                resolved = d.resolve()
+            except Exception:
+                continue
+            if resolved.is_dir() and resolved not in candidates:
+                candidates.append(resolved)
+        voices: list[Path] = []
+        seen_real_paths: set[str] = set()
+        for root in candidates:
+            try:
+                entries = sorted(root.iterdir(), key=lambda x: x.name.lower())
+            except Exception:
+                continue
+            for p in entries:
+                if not p.is_file():
+                    continue
+                if p.suffix.lower() not in _SUPPORTED_AUDIO_EXTENSIONS:
+                    continue
+                real_path = str(p.resolve())
+                if real_path in seen_real_paths:
+                    continue
+                seen_real_paths.add(real_path)
+                voices.append(p)
+        logger.info(
+            "Local voice scan complete: %s files across %s",
+            len(voices),
+            [str(x) for x in candidates],
+        )
+        return voices
+    def _make_unique_voice_id(self, preferred: str) -> str:
+        base = _slugify(preferred)
+        candidate = base
+        idx = 2
+        while candidate in self._builtin_voice_profiles:
+            candidate = f"{base}_{idx}"
+            idx += 1
+        return candidate
+    def _register_builtin_voice(
+        self,
+        *,
+        preferred_id: str,
+        display_name: str,
+        source: str,
+        source_ref: str,
+        audio_bytes: bytes,
+        is_default: bool = False,
+    ) -> str:
+        """Encode one voice sample and record it in all built-in voice registries.
+        Args:
+            preferred_id: Desired id; slugified and made unique if already taken.
+            display_name: Name stored in the catalog; its slugified stem becomes an alias.
+            source: Free-form origin label stored in the catalog entry.
+            source_ref: Free-form origin reference stored in the catalog entry.
+            audio_bytes: Encoded audio file contents.
+            is_default: When True, this voice also becomes the "default" alias.
+        Returns:
+            The id the voice was registered under.
+        Raises:
+            ValueError: If ``audio_bytes`` is empty or cannot be decoded.
+        """
+        if not audio_bytes:
+            raise ValueError("Voice file is empty")
+        voice_id = self._make_unique_voice_id(preferred_id)
+        # The MD5 digest is only an identity/cache key for the audio content.
+        audio_hash = hashlib.md5(audio_bytes).hexdigest()
+        profile = self._voice_cache.get(audio_hash)
+        if profile is None:
+            # Unlike uploaded voices, no duration check or truncation happens here.
+            audio = _load_audio_bytes(audio_bytes, sr=self.cfg.SAMPLE_RATE)
+            profile = self._encode_audio_array(audio, audio_hash=audio_hash)
+            self._voice_cache.put(audio_hash, profile)
+        else:
+            # Keep hash attached to cached profile for metadata/voice-key usage.
+            profile.audio_hash = audio_hash
+        self._builtin_voice_profiles[voice_id] = profile
+        self._builtin_voice_bytes[voice_id] = audio_bytes
+        if audio_hash:
+            self._builtin_voice_by_hash[audio_hash] = profile
+        # Aliases are first-come-first-served: an alias already owned by
+        # another voice is not reassigned.
+        aliases: list[str] = []
+        for alias in (voice_id, _slugify(Path(display_name).stem)):
+            if alias not in self._voice_alias_to_id:
+                self._voice_alias_to_id[alias] = voice_id
+                aliases.append(alias)
+        if is_default:
+            # The "default" alias, by contrast, is always overwritten.
+            self._default_voice_id = voice_id
+            self._voice_alias_to_id["default"] = voice_id
+            if "default" not in aliases:
+                aliases.append("default")
+        self._builtin_voice_catalog.append(
+            {
+                "id": voice_id,
+                "display_name": display_name,
+                "source": source,
+                "source_ref": source_ref,
+                "aliases": aliases,
+                "voice_key": audio_hash,
+            }
+        )
+        return voice_id
+    def _load_builtin_voices(self) -> VoiceProfile:
+        # 1) HF default voice (kept as true default fallback)
+        hf_bytes = self._download_hf_default_voice_bytes()
+        self._register_builtin_voice(
+            preferred_id="default_hf_voice",
+            display_name=self.cfg.DEFAULT_VOICE_FILE,
+            source="huggingface",
+            source_ref=f"{self.cfg.DEFAULT_VOICE_REPO}:{self.cfg.DEFAULT_VOICE_FILE}",
+            audio_bytes=hf_bytes,
+            is_default=True,
+        )
+        # 2) Local voice samples placed next to app files
+        for path in self._list_local_voice_paths():
+            # Avoid duplicate entry if someone also copied default_voice.wav locally.
+            if path.name == self.cfg.DEFAULT_VOICE_FILE:
+                continue
+            try:
+                self._register_builtin_voice(
+                    preferred_id=path.stem,
+                    display_name=path.name,
+                    source="local",
+                    source_ref=str(path.name),
+                    audio_bytes=path.read_bytes(),
+                    is_default=False,
+                )
+            except Exception as e:
+                logger.warning(f"Skipping local voice {path.name}: {e}")
+        default_profile = self._builtin_voice_profiles.get(self._default_voice_id)
+        if default_profile is None:
+            raise RuntimeError("Default built-in voice could not be initialized")
+        logger.info(
+            f"Built-in voices loaded: {len(self._builtin_voice_catalog)} "
+            f"(default={self._default_voice_id})"
+        )
+        return default_profile
+    def list_builtin_voices(self) -> list[dict]:
+        """Return metadata for startup-preloaded voices."""
+        return [dict(v) for v in self._builtin_voice_catalog]
+    @property
+    def default_voice_name(self) -> str:
+        """Id of the voice currently registered as the default."""
+        return self._default_voice_id
+    def resolve_voice_id(self, voice_name: Optional[str]) -> str:
+        if voice_name is None:
+            return self._default_voice_id
+        key = _slugify(str(voice_name))
+        if not key:
+            return self._default_voice_id
+        voice_id = self._voice_alias_to_id.get(key)
+        if voice_id is None:
+            available = ", ".join(sorted(self._voice_alias_to_id.keys()))
+            raise ValueError(f"Unknown voice '{voice_name}'. Available: {available}")
+        return voice_id
+    def get_builtin_voice(self, voice_name: Optional[str]) -> VoiceProfile:
+        voice_id = self.resolve_voice_id(voice_name)
+        profile = self._builtin_voice_profiles[voice_id]
+        if profile.audio_hash:
+            self._voice_cache.put(profile.audio_hash, profile)
+        return profile
+    def get_builtin_voice_bytes(self, voice_name: Optional[str]) -> Optional[bytes]:
+        voice_id = self.resolve_voice_id(voice_name)
+        return self._builtin_voice_bytes.get(voice_id)
+    def get_builtin_voice_by_hash(self, audio_hash: str) -> Optional[VoiceProfile]:
+        return self._builtin_voice_by_hash.get((audio_hash or "").strip())
+    # ─── Voice encoding ──────────────────────────────────────────
+    def encode_voice_from_bytes(self, audio_bytes: bytes) -> VoiceProfile:
+        """Encode reference audio from raw bytes (in-memory, no disk write).
+        Accepts: WAV, MP3, MPEG, M4A, OGG, FLAC, WebM, AAC, WMA, Opus.
+        """
+        audio_hash = hashlib.md5(audio_bytes).hexdigest()
+        cached = self._voice_cache.get(audio_hash)
+        if cached is not None:
+            logger.info(f"Voice cache hit: {audio_hash[:8]}…")
+            return cached
+        # Robust multi-format audio loading
+        audio = _load_audio_bytes(audio_bytes, sr=self.cfg.SAMPLE_RATE)
+        # Validate duration
+        duration = len(audio) / self.cfg.SAMPLE_RATE
+        if duration < self.cfg.MIN_REF_DURATION_SEC:
+            raise ValueError(
+                f"Reference audio too short ({duration:.1f}s). "
+                f"Minimum: {self.cfg.MIN_REF_DURATION_SEC}s"
+            )
+        if duration > self.cfg.MAX_REF_DURATION_SEC:
+            audio = audio[: int(self.cfg.MAX_REF_DURATION_SEC * self.cfg.SAMPLE_RATE)]
+        profile = self._encode_audio_array(audio, audio_hash=audio_hash)
+        self._voice_cache.put(audio_hash, profile)
+        return profile
+    def _encode_audio_array(self, audio: np.ndarray, audio_hash: str = "") -> VoiceProfile:
+        """Run speech_encoder on a float32 mono audio array."""
+        audio_input = audio[np.newaxis, :].astype(np.float32)
+        cond_emb, prompt_token, speaker_emb, speaker_feat = self.encoder_session.run(
+            None, {"audio_values": audio_input}
+        )
+        return VoiceProfile(
+            cond_emb=cond_emb,
+            prompt_token=prompt_token,
+            speaker_embeddings=speaker_emb,
+            speaker_features=speaker_feat,
+            audio_hash=audio_hash,
+        )
+    # ─── Full generation (non-streaming) ──────────────────────────
+    def generate_speech(
+        self,
+        text: str,
+        voice: Optional[VoiceProfile] = None,
+        max_new_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+    ) -> np.ndarray:
+        """Generate complete audio for the given text."""
+        voice = voice or self.default_voice
+        text = text_processor.sanitize(text.strip()[: self.cfg.MAX_TEXT_LENGTH])
+        if not text:
+            raise ValueError("Text is empty after sanitization")
+        tokens = self._generate_tokens(
+            text, voice,
+            max_new_tokens or self.cfg.MAX_NEW_TOKENS,
+            repetition_penalty or self.cfg.REPETITION_PENALTY,
+        )
+        return self._decode_tokens(tokens, voice)
+    # ─── Streaming generation ─────────────────────────────────────
+    def stream_speech(
+        self,
+        text: str,
+        voice: Optional[VoiceProfile] = None,
+        max_new_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        is_cancelled: Optional[Callable[[], bool]] = None,
+    ) -> Generator[np.ndarray, None, None]:
+        """Yield audio chunks sentence-by-sentence for real-time streaming.
+        Each sentence is independently processed through the full pipeline
+        so the first chunk arrives as fast as possible (low TTFB).
+        Args:
+            text: Input text; stripped, capped to the configured maximum
+                length and sanitized before splitting.
+            voice: Voice to use; falls back to the default voice when falsy.
+            max_new_tokens: Per-sentence token budget; config default when falsy.
+            repetition_penalty: Repetition penalty; config default when falsy.
+            is_cancelled: Optional callable that returns True to abort generation.
+                          Checked between chunks and every 25 autoregressive steps.
+        Yields:
+            One decoded audio array per non-blank sentence.
+        Raises:
+            Exception: Any non-cancellation error from generation or decoding
+                is logged and re-raised. Cancellation ends the generator
+                silently instead.
+        """
+        voice = voice or self.default_voice
+        text = text_processor.sanitize(text.strip()[: self.cfg.MAX_TEXT_LENGTH])
+        if not text:
+            # Unlike generate_speech(), empty text just yields nothing.
+            return
+        sentences = text_processor.split_for_streaming(text)
+        _max = max_new_tokens or self.cfg.MAX_NEW_TOKENS
+        _rep = repetition_penalty or self.cfg.REPETITION_PENALTY
+        _check = is_cancelled or (lambda: False)
+        for i, sentence in enumerate(sentences):
+            # Check cancellation between chunks
+            if _check():
+                logger.info("Generation cancelled by client (between chunks)")
+                return
+            if not sentence.strip():
+                continue
+            t0 = time.perf_counter()
+            try:
+                tokens = self._generate_tokens(sentence, voice, _max, _rep, _check)
+                # Re-check before the decoder run so a cancelled stream skips it.
+                if _check():
+                    return
+                audio = self._decode_tokens(tokens, voice)
+                elapsed = time.perf_counter() - t0
+                audio_duration = len(audio) / self.cfg.SAMPLE_RATE
+                # Real-time factor: processing time divided by audio duration.
+                rtf = elapsed / audio_duration if audio_duration > 0 else 0
+                logger.info(
+                    f"Chunk {i + 1}/{len(sentences)}: "
+                    f"{len(sentence)} chars → {audio_duration:.1f}s audio "
+                    f"in {elapsed:.2f}s (RTF: {rtf:.2f}x)"
+                )
+                yield audio
+            except GenerationCancelled:
+                logger.info(f"Generation cancelled mid-token at chunk {i + 1}")
+                return
+            except Exception as e:
+                logger.error(f"Error on chunk {i + 1}: {e}")
+                raise
+    # ─── Autoregressive token generation (OPTIMISED) ──────────────
+    def _generate_tokens(
+        self,
+        text: str,
+        voice: VoiceProfile,
+        max_new_tokens: int,
+        repetition_penalty: float,
+        is_cancelled: Callable[[], bool] = lambda: False,
+    ) -> np.ndarray:
+        """Run embed → LM autoregressive loop. Returns raw token array.
+        Decoding is greedy (argmax) with a repetition penalty applied to every
+        token id produced so far.
+        Optimisations:
+          • Token list instead of repeated np.concatenate (O(n) → O(1) append)
+          • Unique tokens set for inline repetition penalty (avoids exponential penalty bug)
+          • Pre-allocated attention mask for zero-copy slicing
+          • Correct dimensional routing for step 0 prompt processing
+        Args:
+            text: Text to tokenize and synthesize.
+            voice: Profile whose ``cond_emb`` is prepended to the prompt embeddings.
+            max_new_tokens: Maximum number of LM steps.
+            repetition_penalty: Factor applied to logits of already-seen tokens.
+            is_cancelled: Polled every 25 steps (never at step 0).
+        Returns:
+            int64 array of shape (1, n) starting with the START token and
+            including the STOP token when one was produced.
+        Raises:
+            GenerationCancelled: If ``is_cancelled()`` returns True at a poll.
+        """
+        input_ids = self.tokenizer(text, return_tensors="np")["input_ids"].astype(np.int64)
+        # Pre-allocate collections
+        token_list: list[int] = [self.cfg.START_SPEECH_TOKEN]
+        unique_tokens: set[int] = {self.cfg.START_SPEECH_TOKEN}
+        penalty = repetition_penalty
+        past_key_values = None
+        attention_mask_full = None
+        seq_len = 0
+        for step in range(max_new_tokens):
+            if step > 0 and step % 25 == 0 and is_cancelled():
+                raise GenerationCancelled()
+            # Step 0 embeds the whole text prompt; later steps embed only the last token.
+            embeds = self.embed_session.run(None, {"input_ids": input_ids})[0]
+            if step == 0:
+                # Prepend speaker conditioning
+                embeds = np.concatenate((voice.cond_emb, embeds), axis=1)
+                batch, seq_len, _ = embeds.shape
+                # Empty KV cache (sequence length 0), dtype matched to each LM input.
+                past_key_values = {
+                    inp.name: np.zeros(
+                        [batch, self.cfg.NUM_KV_HEADS, 0, self.cfg.HEAD_DIM],
+                        dtype=np.float16 if inp.type == "tensor(float16)" else np.float32,
+                    )
+                    for inp in self.lm_session.get_inputs()
+                    if "past_key_values" in inp.name
+                }
+                # Pre-allocate full attention mask
+                attention_mask_full = np.ones((batch, seq_len + max_new_tokens), dtype=np.int64)
+                attention_mask = attention_mask_full[:, :seq_len]
+                # Step 0 requires position_ids matching prompt sequence length
+                position_ids = np.arange(seq_len, dtype=np.int64).reshape(batch, -1)
+            else:
+                # O(1) zero-copy slice for subsequent steps
+                attention_mask = attention_mask_full[:, : seq_len + step]
+                # Single position ID for the single new token
+                position_ids = np.array([[seq_len + step - 1]], dtype=np.int64)
+            # Language model forward pass
+            logits, *present_kv = self.lm_session.run(
+                None,
+                dict(
+                    inputs_embeds=embeds,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    **past_key_values,
+                ),
+            )
+            # ── Inline repetition penalty + token selection ───────
+            last_logits = logits[0, -1, :].copy()  # shape: (vocab_size,)
+            # Apply repetition penalty strictly to unique tokens to prevent over-penalization
+            for tok_id in unique_tokens:
+                # Negative logits are multiplied and positive ones divided by the penalty.
+                if last_logits[tok_id] < 0:
+                    last_logits[tok_id] *= penalty
+                else:
+                    last_logits[tok_id] /= penalty
+            next_token = int(np.argmax(last_logits))
+            token_list.append(next_token)
+            unique_tokens.add(next_token)
+            if next_token == self.cfg.STOP_SPEECH_TOKEN:
+                break
+            # Update state for next step
+            input_ids = np.array([[next_token]], dtype=np.int64)
+            # Relies on the LM returning present KV tensors in the same order
+            # as its past_key_values inputs — TODO confirm against the model.
+            for j, key in enumerate(past_key_values):
+                past_key_values[key] = present_kv[j]
+        return np.array([token_list], dtype=np.int64)
+    # ─── Token → audio decoding ───────────────────────────────────
+    def _decode_tokens(self, generated: np.ndarray, voice: VoiceProfile) -> np.ndarray:
+        """Turn generated speech tokens into a waveform via the decoder session.
+        Args:
+            generated: 2-D token array. Column 0 is dropped as the START token and
+                a trailing STOP_SPEECH_TOKEN in row 0 is dropped as well.
+            voice: Supplies ``prompt_token``, ``speaker_embeddings`` and
+                ``speaker_features``, which are passed to the decoder.
+        Returns:
+            The decoder's first output with its leading axis squeezed away
+            (described by the original docstring as a float32 waveform at
+            24 kHz), or an empty float32 array when no speech tokens remain.
+        """
+        speech = generated[:, 1:]
+        ends_with_stop = (
+            speech.shape[1] > 0 and speech[0, -1] == self.cfg.STOP_SPEECH_TOKEN
+        )
+        if ends_with_stop:
+            speech = speech[:, :-1]
+        # Nothing left to decode once START/STOP are removed.
+        if speech.shape[1] == 0:
+            return np.zeros(0, dtype=np.float32)
+        # Decoder input layout: voice prompt tokens, speech tokens, 3 silence tokens.
+        rows = speech.shape[0]
+        trailing_silence = np.full((rows, 3), self.cfg.SILENCE_TOKEN, dtype=np.int64)
+        decoder_feeds = {
+            "speech_tokens": np.concatenate(
+                [voice.prompt_token, speech, trailing_silence], axis=1
+            ),
+            "speaker_embeddings": voice.speaker_embeddings,
+            "speaker_features": voice.speaker_features,
+        }
+        outputs = self.decoder_session.run(None, decoder_feeds)
+        return outputs[0].squeeze(axis=0)
+    # ─── Warmup ───────────────────────────────────────────────────
+    def warmup(self):
+        """Synthesize "Hello." once with the default voice to prime the sessions.
+        The run is capped at 32 new tokens. The elapsed time is logged at INFO
+        level; any exception is logged as a warning and swallowed, so a failed
+        warmup never propagates to the caller.
+        """
+        try:
+            started = time.perf_counter()
+            self.generate_speech("Hello.", self.default_voice, max_new_tokens=32)
+            elapsed = time.perf_counter() - started
+            logger.info(f"Warmup done in {elapsed:.2f}s")
+        except Exception as exc:
+            # Best-effort only: report and carry on.
+            logger.warning(f"Warmup failed (non-critical): {exc}")

config.py ADDED Viewed

	@@ -0,0 +1,102 @@

+"""
+Chatterbox Turbo TTS — Centralized Configuration
+═══════════════════════════════════════════════════
+Optimised for HF Space free tier (2 vCPU).
+Adjust MODEL_DTYPE to switch quantization (q8/q4/fp16/fp32).
+All settings overridable via environment variables prefixed CB_.
+"""
+import os
+_HERE = os.path.dirname(os.path.abspath(__file__))
+def _get_bool(name: str, default: bool) -> bool:
+    """Read environment variable ``name`` as a boolean flag.
+    Returns ``default`` when the variable is unset. Otherwise the value is
+    stripped and lower-cased, and only "1", "true", "yes" and "on" count as
+    True; every other value (including the empty string) is False.
+    """
+    value = os.environ.get(name)
+    if value is not None:
+        return value.strip().lower() in ("1", "true", "yes", "on")
+    return default
+class Config:
+    # Static settings container. Every os.getenv()/int()/float() call below runs
+    # once, when this class body executes (i.e. when the module is imported), so
+    # later changes to the environment are not picked up, and a non-numeric
+    # value for an int()/float() setting raises ValueError at import time.
+    # ── Model ────────────────────────────────────────────────────
+    MODEL_ID: str = os.getenv("CB_MODEL_ID", "ResembleAI/chatterbox-turbo-ONNX")
+    #   fp32  → highest quality, ~1.4 GB, slowest
+    #   fp16  → good quality,    ~0.7 GB
+    #   q8    → ★ recommended,   ~0.35 GB, best balance
+    #   q4    → smallest,        ~0.17 GB, fastest, slight loss
+    #   q4f16 → q4 weights + fp16 activations
+    #   NOTE: the default below is "q4", not the "recommended" q8.
+    MODEL_DTYPE: str = os.getenv("CB_MODEL_DTYPE", "q4")
+    MODELS_DIR: str = os.getenv("CB_MODELS_DIR", os.path.join(_HERE, "models"))
+    # ── ONNX Runtime CPU tuning (optimised for 2 vCPU) ───────────
+    #
+    # KEY RULE: intra_op threads MUST match physical cores.
+    #   → 4 threads on 2 cores = oversubscription = SLOWER.
+    #   → 2 threads on 2 cores = each op uses both cores perfectly.
+    #
+    # MAX_WORKERS = 1 ensures ONE inference gets both cores.
+    #   → 2 workers would split 2 cores = both requests slow.
+    #
+    CPU_THREADS: int = int(os.getenv("CB_CPU_THREADS", "2"))
+    MAX_WORKERS: int = int(os.getenv("CB_MAX_WORKERS", "1"))
+    # ── Generation defaults ──────────────────────────────────────
+    SAMPLE_RATE: int = 24000
+    MAX_NEW_TOKENS: int = int(os.getenv("CB_MAX_NEW_TOKENS", "768"))
+    REPETITION_PENALTY: float = float(os.getenv("CB_REPETITION_PENALTY", "1.2"))
+    MAX_TEXT_LENGTH: int = int(os.getenv("CB_MAX_TEXT_LENGTH", "50000"))
+    # ── Model constants (official card — do not change) ──────────
+    START_SPEECH_TOKEN: int = 6561
+    STOP_SPEECH_TOKEN: int = 6562
+    SILENCE_TOKEN: int = 4299
+    NUM_KV_HEADS: int = 16
+    HEAD_DIM: int = 64
+    # ── Paralinguistic tags (Turbo native) ───────────────────────
+    PARALINGUISTIC_TAGS: tuple = (
+        "laugh", "chuckle", "cough", "sigh", "gasp",
+        "shush", "groan", "sniff", "clear throat",
+    )
+    # ── Voice / reference audio ──────────────────────────────────
+    # NOTE: Official ResembleAI/chatterbox-turbo-ONNX has no bundled voice.
+    # The default_voice.wav is a plain audio sample from a community repo
+    # (not a model — just a reference WAV, safe to use from any source).
+    DEFAULT_VOICE_REPO: str = "onnx-community/chatterbox-ONNX"
+    DEFAULT_VOICE_FILE: str = "default_voice.wav"
+    MAX_VOICE_UPLOAD_BYTES: int = 10 * 1024 * 1024   # 10 MB
+    MIN_REF_DURATION_SEC: float = 1.5
+    MAX_REF_DURATION_SEC: float = 30.0
+    VOICE_CACHE_SIZE: int = int(os.getenv("CB_VOICE_CACHE_SIZE", "20"))
+    VOICE_CACHE_TTL_SEC: int = int(os.getenv("CB_VOICE_CACHE_TTL", "3600"))  # 1 hour
+    # ── Streaming ────────────────────────────────────────────────
+    # Smaller chunks = faster TTFB (first audio arrives sooner)
+    # Default is 100 characters per chunk (override with CB_MAX_CHUNK_CHARS).
+    # NOTE(review): this comment previously cited ~200 chars (≈ 1–2 sentences);
+    # confirm which value is intended.
+    MAX_CHUNK_CHARS: int = int(os.getenv("CB_MAX_CHUNK_CHARS", "100"))
+    # Additive parallel mode (odd/even split across primary/helper).
+    ENABLE_PARALLEL_MODE: bool = _get_bool("CB_ENABLE_PARALLEL_MODE", True)
+    HELPER_BASE_URL: str = os.getenv("CB_HELPER_BASE_URL", "https://shadowhunter222-chab2.hf.space").strip()
+    HELPER_TIMEOUT_SEC: float = float(os.getenv("CB_HELPER_TIMEOUT_SEC", "45"))
+    HELPER_RETRY_ONCE: bool = _get_bool("CB_HELPER_RETRY_ONCE", True)
+    # Optional shared secret for internal chunk endpoints.
+    # Defaults to the empty string when CB_INTERNAL_SHARED_SECRET is unset.
+    INTERNAL_SHARED_SECRET: str = os.getenv("CB_INTERNAL_SHARED_SECRET", "").strip()
+    # ── Server ───────────────────────────────────────────────────
+    HOST: str = os.getenv("CB_HOST", "0.0.0.0")
+    PORT: int = int(os.getenv("CB_PORT", "7860"))
+    # NOTE(review): "www.toolboxesai.com" below has no scheme, unlike every
+    # other entry; if this list is compared against browser Origin headers
+    # (which include the scheme) that entry can never match — confirm and drop.
+    ALLOWED_ORIGINS: list = [
+        "https://toolboxesai.com",
+        "www.toolboxesai.com",
+        "https://www.toolboxesai.com",
+        "http://localhost:8788",  "http://127.0.0.1:8788",
+        "http://localhost:5502",  "http://127.0.0.1:5502",
+        "http://localhost:5501",  "http://127.0.0.1:5501",
+        "http://localhost:5500",  "http://127.0.0.1:5500",
+        "http://localhost:5173",  "http://127.0.0.1:5173",
+        "http://localhost:7860",  "http://127.0.0.1:7860",
+    ]

her_prompt.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8eaabbeafe26ad6f78b56dcc32608763eeb69485db074c7136c6818f04a93ced
+size 725328

ivr_female_02_prompt.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:64953bf94657c4334532319fd4f20e9859c31af4445940916b04f129ef1f89e6
+size 2779278

requirements.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+# =========================================================
+# Chatterbox Turbo TTS - Dependencies (CPU-only)
+# =========================================================
+# PyTorch CPU (required by transformers tokenizer internals)
+torch --index-url https://download.pytorch.org/whl/cpu
+# Core API
+fastapi>=0.104.1
+uvicorn[standard]>=0.24.0
+pydantic>=2.5.0
+python-multipart>=0.0.6
+# ONNX Runtime (CPU inference)
+onnxruntime>=1.17.0
+# Audio processing
+numpy>=1.24.0
+librosa>=0.10.0
+soundfile>=0.12.0
+# Tokenizer + model download
+transformers>=4.46.0
+huggingface-hub>=0.19.0

text_processor.py ADDED Viewed

	@@ -0,0 +1,331 @@

+"""
+Chatterbox Turbo TTS — Text Processor
+═══════════════════════════════════════
+Sanitizes raw input text and splits it into sentence-level chunks
+for streaming TTS.  Paralinguistic tags ([laugh], [cough], …) are
+explicitly preserved so the model can render them.
+Punctuation Philosophy (based on Resemble AI recommendations):
+  ✅ PRESERVE (benefits prosody):
+     • Ellipsis ...    → dramatic pause, trailing thought, hesitation
+     • Em dash —       → abrupt transition, dramatic break
+     • Comma ,         → short natural pause / breathing point
+     • Period .        → full stop, pitch drop, sentence boundary
+     • ! and ?         → exclamatory / interrogative inflection
+     • Semicolon ;     → medium pause, clause bridge (NOT a split point)
+     • Colon :         → medium pause, introduces explanation (NOT a split point)
+     • Parentheses ()  → quieter/explanatory tone shift
+     • Quotes ""       → dialogue cue
+     • Apostrophe '    → contractions (don't, it's)
+     • CAPS words      → emphasis / volume increase (NOTE: sanitize() converts ALL-CAPS words of 4+ letters to Title Case so they are not spelled out)
+  ❌ FILTER (harms output):
+     • Excessive repeated punctuation (!!!! → !, ???? → ?, ,,, → ,)
+     • 4+ dots (.... → ...)
+     • Emojis, URLs, markdown, HTML tags
+     • Non-standard Unicode punctuation (guillemets, etc.)
+"""
+import re
+from typing import List
+from config import Config
+# ═══════════════════════════════════════════════════════════════════
+# Pre-compiled regex patterns (compiled once at import → zero cost)
+# ═══════════════════════════════════════════════════════════════════
+# — Paralinguistic tag protector (matches [laugh], [clear throat], etc.)
+#   Built from Config.PARALINGUISTIC_TAGS; matching is case-insensitive.
+_TAG_NAMES = "|".join(re.escape(t) for t in Config.PARALINGUISTIC_TAGS)
+_RE_PARA_TAG = re.compile(rf"\[(?:{_TAG_NAMES})\]", re.IGNORECASE)
+# — Markdown / structural noise
+_RE_CODE_BLOCK   = re.compile(r"```[\s\S]*?```")
+_RE_INLINE_CODE  = re.compile(r"`([^`]+)`")
+_RE_IMAGE        = re.compile(r"!\[([^\]]*)\]\([^)]+\)")
+_RE_LINK         = re.compile(r"\[([^\]]+)\]\([^)]+\)")
+_RE_BOLD_AST     = re.compile(r"\*\*(.+?)\*\*")
+_RE_BOLD_UND     = re.compile(r"__(.+?)__")
+_RE_STRIKE       = re.compile(r"~~(.+?)~~")
+_RE_ITALIC_AST   = re.compile(r"\*(.+?)\*")
+_RE_ITALIC_UND   = re.compile(r"(?<!\w)_(.+?)_(?!\w)")
+_RE_HEADER       = re.compile(r"^#{1,6}\s+", re.MULTILINE)
+_RE_BLOCKQUOTE   = re.compile(r"^>+\s?", re.MULTILINE)
+_RE_HR           = re.compile(r"^[-*_]{3,}$", re.MULTILINE)
+_RE_BULLET       = re.compile(r"^\s*[-*+]\s+", re.MULTILINE)
+_RE_ORDERED      = re.compile(r"^\s*\d+\.\s+", re.MULTILINE)
+# — URLs, emojis, HTML entities
+# NOTE: \S+ is greedy, so a URL match also consumes any punctuation or closing
+# bracket directly attached to the URL (e.g. the ")" of a markdown link).
+_RE_URL          = re.compile(r"https?://\S+")
+_RE_EMOJI        = re.compile(
+    r"["
+    r"\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
+    r"\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
+    r"\U00002702-\U000027B0\U0001F900-\U0001F9FF"
+    r"\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF"
+    r"\U00002600-\U000026FF\U0000FE00-\U0000FE0F"
+    r"\U0000200D"
+    r"]+", re.UNICODE,
+)
+_RE_HTML_ENTITY  = re.compile(r"&(?:#x?[\da-fA-F]+|\w+);")
+# HTML entities → speakable replacements
+# NOTE: &hellip; → "..." (preserves dramatic pause), &mdash;/&ndash; → "—" (preserves dramatic break)
+# sanitize() replaces any matched entity that is NOT listed here (including
+# numeric ones such as &#39;) with an empty string.
+_HTML_ENTITIES   = {
+    "&amp;": " and ", "&lt;": " less than ", "&gt;": " greater than ",
+    "&nbsp;": " ", "&quot;": '"', "&apos;": "'",
+    "&mdash;": "—", "&ndash;": "—", "&hellip;": "...",
+}
+# — Smart/curly quote normalization → ASCII equivalents
+# These Unicode variants may confuse the tokenizer; normalizing ensures clean input.
+_SMART_QUOTE_MAP = str.maketrans({
+    "\u201c": '"',   # " left double quotation mark
+    "\u201d": '"',   # " right double quotation mark
+    "\u2018": "'",   # ' left single quotation mark
+    "\u2019": "'",   # ' right single quotation mark
+    "\u00ab": '"',   # « left guillemet
+    "\u00bb": '"',   # » right guillemet
+    "\u201e": '"',   # „ double low quotation mark
+    "\u201f": '"',   # ‟ double high reversed quotation mark
+    "\u2032": "'",   # ′ prime
+    "\u2033": '"',   # ″ double prime
+    "\u2013": "—",   # – en dash → em dash (dramatic pause)
+    "\u2014": "—",   # — em dash (keep as-is after mapping)
+    "\u2026": "...", # … horizontal ellipsis → three dots
+})
+# — ALL CAPS normalization
+# Words entirely in caps (length >= 4) often get spelled out by the TTS engine (e.g. NOTHING).
+# By converting them to Title Case, they'll be processed naturally as words.
+# Only ASCII A–Z runs are matched; 1–3 letter words (e.g. "USA") are left alone.
+_RE_ALL_CAPS = re.compile(r"\b[A-Z]{4,}\b")
+# — Punctuation normalization
+#   Ellipsis (... / ..) is PRESERVED — it creates dramatic pauses in Chatterbox.
+#   Only 4+ dots are excessive and get capped to standard ellipsis.
+_RE_EXCESSIVE_DOTS   = re.compile(r"\.{4,}")       # ....+ → ... (cap excessive)
+_RE_NORMALIZE_DOTS   = re.compile(r"\.{2,3}")       # .. or ... → ... (standardize)
+_RE_REPEATED_EXCLAM  = re.compile(r"!{2,}")          # !! → !
+_RE_REPEATED_QUEST   = re.compile(r"\?{2,}")         # ?? → ?
+_RE_REPEATED_SEMI    = re.compile(r";{2,}")           # ;; → ;
+_RE_REPEATED_COLON   = re.compile(r":{2,}")           # :: → :
+_RE_REPEATED_COMMA   = re.compile(r",{2,}")           # ,, → ,
+_RE_REPEATED_DASH    = re.compile(r"-{3,}")           # --- → — (em dash); "--" is left as-is
+# — Abbreviation protection
+# Common abbreviations ending in "." that should NOT trigger sentence splitting.
+# sanitize() swaps the dot for a placeholder and later deletes the placeholder,
+# so the abbreviation dot is dropped ("Mr." → "Mr"), not restored.
+# NOTE(review): _RE_ABBREV is case-insensitive, so ordinary words that spell an
+# entry ("no.", "est.", "inc.", "mar.") also match and lose their period, even
+# at a real sentence end — confirm this is acceptable.
+_ABBREVIATIONS = (
+    "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr", "St", "Ave", "Blvd",
+    "vs", "etc", "approx", "dept", "est", "govt", "inc", "corp", "ltd",
+    "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+    "Gen", "Col", "Sgt", "Capt", "Lt", "Cmdr", "Adm",
+    "Fig", "Vol", "No", "Ref", "Rev", "Ph",
+)
+_RE_ABBREV = re.compile(
+    r"\b(" + "|".join(re.escape(a) for a in _ABBREVIATIONS) + r")\.",
+    re.IGNORECASE,
+)
+# — Whitespace
+_RE_MULTI_SPACE      = re.compile(r"[ \t]+")
+_RE_MULTI_NEWLINE    = re.compile(r"\n{3,}")
+_RE_SPACE_BEFORE_PUN = re.compile(r"\s+([.!?,;:])")
+# — Sentence boundary (split point)
+# Split ONLY on true sentence-ending punctuation: . ! ?
+# Semicolons and colons are clause connectors — they bridge related thoughts
+# and should NOT be used as split points (creates choppy, unnatural fragments).
+# Ellipsis (...) is also intentionally excluded from splitting: letting it split the stream
+# creates a compound lag between chunks, making the pause artificially excessive.
+#
+# The pattern matches whitespace that follows either
+#   1. one of . ! ?  — unless the three preceding characters are "...", or
+#   2. one of . ! ? plus ONE closing ) ] " '  — unless that punctuation is an
+#      ellipsis (e.g. `"wait..." she said` must not split).
+# For case 2 the negative lookbehind has to include the closer: the four
+# characters before the split point are "..." + closer, never four dots, so
+# a plain `(?<!\.\.\.\.)` could not exclude anything.
+_RE_SENTENCE_SPLIT = re.compile(
+    r"""(?:(?<=[.!?])(?<!\.\.\.)|(?<=[.!?][)\]"'])(?<!\.\.\.[)\]"']))\s+"""
+)
+# Chunks with at most this many words are merged into the following chunk.
+_MIN_MERGE_WORDS = 5
+# ═══════════════════════════════════════════════════════════════════
+# Public API
+# ═══════════════════════════════════════════════════════════════════
+def sanitize(text: str) -> str:
+    """Clean raw input for TTS while preserving prosody-beneficial punctuation.
+    Preserves: ellipsis (...), em dashes (—), commas, periods, !, ?, ;, :, quotes.
+    Removes: emojis, URLs, markdown, HTML, excessive repeated punctuation.
+    Markdown images and links are unwrapped BEFORE bare URLs are stripped, so
+    "[docs](https://example.com)" becomes "docs" rather than a dangling "[docs](".
+    Args:
+        text: Raw input text (may contain markdown, HTML entities, emojis, URLs).
+    Returns:
+        The cleaned text. Falsy input (e.g. "") is returned unchanged.
+    """
+    if not text:
+        return text
+    # 0. Normalize smart/curly quotes and Unicode punctuation FIRST
+    #    This ensures downstream regex works on clean ASCII-like punctuation.
+    text = text.translate(_SMART_QUOTE_MAP)
+    # 1. Normalize ALL CAPS words to Title Case to prevent spelling out
+    text = _RE_ALL_CAPS.sub(lambda m: m.group(0).capitalize(), text)
+    # 2. Protect paralinguistic tags by replacing with placeholders
+    #    (the list index doubles as the placeholder number)
+    tags_found: list[str] = []
+    def _protect_tag(m):
+        tags_found.append(m.group(0))
+        return f"§TAG{len(tags_found) - 1}§"
+    text = _RE_PARA_TAG.sub(_protect_tag, text)
+    # 3. Swap each abbreviation dot for a placeholder so the punctuation
+    #    normalization below cannot touch it:
+    #    "Dr. Smith" → "Dr§ABR0§ Smith" (the placeholder is deleted in step 9)
+    abbrevs_found: list[str] = []
+    def _protect_abbrev(m):
+        abbrevs_found.append(m.group(0))
+        return f"{m.group(1)}§ABR{len(abbrevs_found) - 1}§"
+    text = _RE_ABBREV.sub(_protect_abbrev, text)
+    # 4. Strip non-speakable structures
+    #    Code blocks, images and links go first: _RE_URL is greedy (\S+) and
+    #    would otherwise swallow the closing ")" of "[text](https://…)",
+    #    leaving "[text](" behind for the link pattern to miss.
+    text = _RE_CODE_BLOCK.sub("", text)
+    text = _RE_IMAGE.sub(lambda m: m.group(1) if m.group(1) else "", text)
+    text = _RE_LINK.sub(r"\1", text)
+    text = _RE_URL.sub("", text)
+    text = _RE_BOLD_AST.sub(r"\1", text)
+    text = _RE_BOLD_UND.sub(r"\1", text)
+    text = _RE_STRIKE.sub(r"\1", text)
+    text = _RE_ITALIC_AST.sub(r"\1", text)
+    text = _RE_ITALIC_UND.sub(r"\1", text)
+    text = _RE_INLINE_CODE.sub(r"\1", text)
+    text = _RE_HEADER.sub("", text)
+    text = _RE_BLOCKQUOTE.sub("", text)
+    text = _RE_HR.sub("", text)
+    text = _RE_BULLET.sub("", text)
+    text = _RE_ORDERED.sub("", text)
+    # 5. Emojis, hashtags ("#topic" → "topic")
+    text = _RE_EMOJI.sub("", text)
+    text = re.sub(r"#(\w+)", r"\1", text)
+    # 6. HTML entities → speakable text (entities without a mapping are dropped)
+    text = _RE_HTML_ENTITY.sub(lambda m: _HTML_ENTITIES.get(m.group(0), ""), text)
+    # 7. Normalize punctuation (PRESERVE prosody-beneficial marks)
+    #    Order matters: handle excessive dots first, then standardize ellipsis.
+    text = _RE_EXCESSIVE_DOTS.sub("...", text)       # ....+ → ... (cap)
+    text = _RE_NORMALIZE_DOTS.sub("...", text)        # .. or ... → ... (standardize)
+    text = _RE_REPEATED_EXCLAM.sub("!", text)         # !! → !
+    text = _RE_REPEATED_QUEST.sub("?", text)          # ?? → ?
+    text = _RE_REPEATED_SEMI.sub(";", text)           # ;; → ;
+    text = _RE_REPEATED_COLON.sub(":", text)          # :: → :
+    text = _RE_REPEATED_COMMA.sub(",", text)          # ,, → ,
+    text = _RE_REPEATED_DASH.sub("—", text)           # --- → em dash
+    # 8. Whitespace cleanup
+    text = _RE_SPACE_BEFORE_PUN.sub(r"\1", text)
+    text = _RE_MULTI_SPACE.sub(" ", text)
+    text = _RE_MULTI_NEWLINE.sub("\n\n", text)
+    text = text.strip()
+    # 9. Strip abbreviation dots (Mr. → Mr, Dr. → Dr, etc.)
+    #    The dot is not needed for correct TTS pronunciation and removing it
+    #    prevents false sentence-boundary splits in split_for_streaming().
+    for idx in range(len(abbrevs_found)):
+        text = text.replace(f"§ABR{idx}§", "")
+    # 10. Restore paralinguistic tags
+    for idx, original in enumerate(tags_found):
+        text = text.replace(f"§TAG{idx}§", original)
+    return text
+def split_for_streaming(text: str, max_chars: int = Config.MAX_CHUNK_CHARS) -> List[str]:
+    """Cut sanitized text into chunks suitable for streaming synthesis.
+    Pipeline:
+      1. Split at the whitespace matched by ``_RE_SENTENCE_SPLIT`` (after
+         sentence-ending . ! ? — not after semicolons, colons or ellipsis).
+      2. Pass any sentence longer than ``max_chars`` to ``_break_long_chunk``.
+      3. Glue a chunk of at most ``_MIN_MERGE_WORDS`` words onto the chunk that
+         follows it. The final chunk is always emitted, and a merged chunk may
+         end up longer than ``max_chars``.
+    Returns an empty list for empty input.
+    """
+    if not text:
+        return []
+    # Step 1: sentence split, discarding whitespace-only pieces
+    sentences = [s for s in map(str.strip, _RE_SENTENCE_SPLIT.split(text)) if s]
+    # Step 2: enforce max length per sentence
+    sized: List[str] = []
+    for sentence in sentences:
+        if len(sentence) > max_chars:
+            sized += _break_long_chunk(sentence, max_chars)
+        else:
+            sized.append(sentence)
+    if len(sized) <= 1:
+        return sized
+    # Step 3: accumulate short pieces until the running text is long enough
+    #         (or the last piece is reached), then emit it as one chunk.
+    final_index = len(sized) - 1
+    merged: List[str] = []
+    pending: List[str] = []
+    for index, piece in enumerate(sized):
+        pending.append(piece)
+        candidate = " ".join(pending)
+        if index < final_index and len(candidate.split()) <= _MIN_MERGE_WORDS:
+            continue
+        merged.append(candidate)
+        pending = []
+    return merged
+# ═══════════════════════════════════════════════════════════════════
+# Internal helpers
+# ═══════════════════════════════════════════════════════════════════
+def _break_long_chunk(text: str, max_chars: int) -> List[str]:
+    """Break a chunk longer than max_chars on punctuation or word boundaries.
+    The cut point is the right-most candidate inside the first ``max_chars``
+    characters: one of , ; : — - ! ? (kept at the end of the segment) or a
+    space (dropped). If there is none, up to 24 further characters are
+    searched for whitespace or , ; : ! ? so that a word is not cut in half;
+    failing that, the text is hard-cut at ``max_chars``.
+    Args:
+        text: Text to split.
+        max_chars: Target maximum segment length; must be at least 1.
+    Returns:
+        Non-empty, stripped segments in their original order. A segment
+        produced by the look-ahead step may exceed ``max_chars`` by up to
+        24 characters.
+    Raises:
+        ValueError: If ``max_chars`` is smaller than 1. With such a limit the
+            loop could not make progress and previously never terminated.
+    """
+    if max_chars < 1:
+        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
+    parts: List[str] = []
+    remaining = text
+    while len(remaining) > max_chars:
+        # Right-most pause mark inside the limit (-1 when there is none).
+        punct_pos = max(
+            remaining.rfind(marker, 0, max_chars)
+            for marker in (",", ";", ":", "—", "-", "!", "?")
+        )
+        # A later space wins over an earlier pause mark.
+        space_pos = remaining.rfind(" ", 0, max_chars)
+        if space_pos > punct_pos:
+            break_pos = space_pos
+            include_break_char = False
+        else:
+            break_pos = punct_pos
+            include_break_char = punct_pos != -1
+        # If nothing before limit, look slightly ahead to avoid mid-word cuts.
+        if break_pos == -1:
+            forward_limit = min(len(remaining), max_chars + 24)
+            m = re.search(r"[\s,;:!?]", remaining[max_chars:forward_limit])
+            if m:
+                break_pos = max_chars + m.start()
+                include_break_char = remaining[break_pos] in ",;:!?"
+            else:
+                break_pos = max_chars
+                include_break_char = False
+        cut_at = break_pos + (1 if include_break_char else 0)
+        # A break at index 0 would make no progress: fall back to a hard cut.
+        if cut_at <= 0:
+            cut_at = min(max_chars, len(remaining))
+        segment = remaining[:cut_at].strip()
+        if segment:
+            parts.append(segment)
+        remaining = remaining[cut_at:].lstrip()
+    if remaining.strip():
+        parts.append(remaining.strip())
+    return parts