Spaces:

chmielvu
/

Forge-TTS

Sleeping

App Files Community

chmielvu committed on Feb 6

Commit

b088dbf

verified ·

1 Parent(s): 519a7bc

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +300 -614

app.py CHANGED Viewed

@@ -1,22 +1,20 @@
 """
-HF Spaces (Docker) CPU-only TTS API - FIXED VERSION v1.2.0
-- Separate endpoints per service: XTTS v2, Parler-TTS mini multilingual, Piper.
-- CPU-friendly defaults for 2 vCPU / 16 GB RAM:
-  - Sentence chunking (default ON)
-  - Streaming via SSE (each chunk returned as standalone WAV)
-  - Optional torch.compile, optional dynamic int8 quantization hooks
-FIXES APPLIED (v1.2.0):
-1. Error tracking: Models that fail to load return None gracefully (no retries)
-2. Health endpoint: Reports actual service availability per backend
-3. Better error messages: Piper 404 shows available voices
-4. Service flags: XTTS_ENABLED, PARLER_ENABLED, PIPER_ENABLED env vars
-5. Parler-TTS v1.1: TWO tokenizers (prompt + description) with attention masks
 """
 from __future__ import annotations
 import asyncio
 import base64
 import io
 import json
 import os
@@ -25,37 +23,18 @@ import tempfile
 import threading
 import time
 from dataclasses import dataclass
-from functools import lru_cache
-from typing import Dict, Generator, Iterable, List, Optional, Tuple
 import numpy as np
 import soundfile as sf
 import torch
 from fastapi import FastAPI, File, Form, HTTPException, UploadFile
-from fastapi.responses import Response, StreamingResponse
 from pydantic import BaseModel, Field
-# --- Optional deps (import lazily where possible) ---
-# XTTS (Coqui TTS)
-from TTS.api import TTS
-# Parler-TTS (transformers)
-from transformers import AutoTokenizer, set_seed
-try:
-    from parler_tts import ParlerTTSForConditionalGeneration
-except Exception:
-    ParlerTTSForConditionalGeneration = None  # type: ignore
-# Piper fallback
-try:
-    from piper.voice import PiperVoice
-except Exception:
-    PiperVoice = None  # type: ignore
-# -----------------------
-# Settings / knobs
-# -----------------------
 def _env_bool(name: str, default: bool = False) -> bool:
     v = os.getenv(name)
     if v is None:
@@ -63,67 +42,61 @@ def _env_bool(name: str, default: bool = False) -> bool:
     return v.strip().lower() in {"1", "true", "yes", "y", "on"}
-@dataclass(frozen=True)
-class Settings:
-    # Service toggles (NEW in v1.2.0)
-    xtts_enabled: bool = _env_bool("XTTS_ENABLED", True)
-    parler_enabled: bool = _env_bool("PARLER_ENABLED", True)
-    piper_enabled: bool = _env_bool("PIPER_ENABLED", True)
-    fallback_enabled: bool = _env_bool("ENABLE_FALLBACK", True)
-    # XTTS v2
-    xtts_model_name: str = os.getenv("XTTS_MODEL_NAME", "tts_models/multilingual/multi-dataset/xtts_v2")
-    xtts_default_language: str = os.getenv("XTTS_DEFAULT_LANGUAGE", "pl")
-    xtts_torch_compile: bool = _env_bool("XTTS_TORCH_COMPILE", False)
-    xtts_dynamic_int8: bool = _env_bool("XTTS_DYNAMIC_INT8", False)
-    # Parler
-    parler_model_name: str = os.getenv("PARLER_MODEL_NAME", "parler-tts/parler-tts-mini-multilingual-v1.1")
-    parler_default_description: str = os.getenv(
-        "PARLER_DEFAULT_DESCRIPTION",
-        "A clear, natural, studio-recorded voice speaking Polish with steady pacing.",
-    )
-    parler_seed: int = int(os.getenv("PARLER_SEED", "0"))
-    parler_torch_compile: bool = _env_bool("PARLER_TORCH_COMPILE", False)
-    parler_dynamic_int8: bool = _env_bool("PARLER_DYNAMIC_INT8", False)
-    # Piper
-    piper_voices_json: str = os.getenv("PIPER_VOICES_JSON", "")
-    piper_voices_dir: str = os.getenv("PIPER_VOICES_DIR", "/data/piper")
-    # Chunking / streaming defaults
-    chunk_max_chars: int = int(os.getenv("CHUNK_MAX_CHARS", "260"))
     chunk_max_words: int = int(os.getenv("CHUNK_MAX_WORDS", "40"))
-    chunk_max_sentences: int = int(os.getenv("CHUNK_MAX_SENTENCES", "8"))
     join_silence_ms: int = int(os.getenv("JOIN_SILENCE_MS", "60"))
     # Runtime
     num_threads: int = int(os.getenv("OMP_NUM_THREADS", "2"))
-    request_timeout_s: int = int(os.getenv("REQUEST_TIMEOUT_S", "240"))
 S = Settings()
-# Conservative CPU threading.
 torch.set_num_threads(S.num_threads)
 torch.set_num_interop_threads(max(1, S.num_threads // 2))
-# -----------------------
-# Utilities
-# -----------------------
 _SENT_SPLIT_RE = re.compile(r"(?<=[\.\!\?\:\;])\s+|\n+")
 _WS_RE = re.compile(r"\s+")
 def normalize_text(text: str) -> str:
-    text = text.strip()
-    text = _WS_RE.sub(" ", text)
-    return text
 def split_text_into_chunks(
     text: str,
     max_chars: int = S.chunk_max_chars,
     max_words: int = S.chunk_max_words,
-    max_sentences: int = S.chunk_max_sentences,
 ) -> List[str]:
     text = normalize_text(text)
     if not text:
@@ -139,648 +112,361 @@ def split_text_into_chunks(
         nonlocal cur, cur_chars, cur_words
         if cur:
             chunks.append(" ".join(cur).strip())
-            cur = []
-            cur_chars = 0
-            cur_words = 0
     for sent in sents:
-        w = sent.split()
-        sent_words = len(w)
-        sent_chars = len(sent)
-        if (cur_chars + sent_chars > max_chars) or (cur_words + sent_words > max_words):
             flush()
         cur.append(sent)
-        cur_chars += sent_chars + 1
-        cur_words += sent_words
-        if max_sentences and len(chunks) + (1 if cur else 0) >= max_sentences:
-            flush()
-            break
     flush()
     return chunks
 def wav_bytes_from_audio(audio: np.ndarray, sr: int) -> bytes:
-    audio = np.asarray(audio, dtype=np.float32)
     buf = io.BytesIO()
-    sf.write(buf, audio, sr, format="WAV", subtype="PCM_16")
     return buf.getvalue()
 def concat_audio(chunks: List[np.ndarray], sr: int, silence_ms: int = S.join_silence_ms) -> np.ndarray:
     if not chunks:
         return np.zeros((1,), dtype=np.float32)
     if len(chunks) == 1:
         return np.asarray(chunks[0], dtype=np.float32)
-    silence = np.zeros((int(sr * (silence_ms / 1000.0)),), dtype=np.float32) if silence_ms > 0 else None
-    out = []
     for i, ch in enumerate(chunks):
-        out.append(np.asarray(ch, dtype=np.float32))
-        if silence is not None and i != len(chunks) - 1:
-            out.append(silence)
-    return np.concatenate(out, axis=0)
 def b64encode_bytes(b: bytes) -> str:
     return base64.b64encode(b).decode("ascii")
-def safe_filename(prefix: str = "audio", ext: str = ".wav") -> str:
-    return f"{prefix}_{int(time.time() * 1000)}{ext}"
-def _filter_kwargs(fn, kwargs: Dict) -> Dict:
-    import inspect
-    try:
-        sig = inspect.signature(fn)
-    except Exception:
-        return kwargs
-    accepted = set(sig.parameters.keys())
-    return {k: v for k, v in kwargs.items() if k in accepted}
-# -----------------------
-# Model manager (lazy + locked) - FIXED v1.2.0
-# -----------------------
-class _Locks:
-    xtts = threading.Lock()
-    xtts_infer = threading.Lock()
-    parler = threading.Lock()
-    parler_infer = threading.Lock()
-    piper = threading.Lock()
-class ModelManager:
-    def __init__(self) -> None:
-        self._xtts: Optional[TTS] = None
-        self._xtts_error: Optional[str] = None  # NEW: Track loading errors
-        self._parler = None
-        self._parler_prompt_tok = None
-        self._parler_desc_tok = None
-        self._parler_error: Optional[str] = None  # NEW: Track loading errors
-        self._piper_voices: Dict[str, str] = {}
-        self._piper_loaded: Dict[str, "PiperVoice"] = {}
-        self._piper_error: Optional[str] = None  # NEW: Track loading errors
-    def _maybe_torch_compile(self, module: torch.nn.Module) -> torch.nn.Module:
-        if not hasattr(torch, "compile"):
-            return module
-        try:
-            return torch.compile(module)  # type: ignore
-        except Exception:
-            return module
-    def _maybe_dynamic_int8(self, module: torch.nn.Module) -> torch.nn.Module:
-        try:
-            from torch.ao.quantization import quantize_dynamic
-            return quantize_dynamic(module, {torch.nn.Linear}, dtype=torch.qint8)
-        except Exception:
-            return module
-    def get_xtts(self) -> Optional[TTS]:
-        """FIXED: Returns None on failure instead of crashing"""
-        if not S.xtts_enabled:
             return None
-        with _Locks.xtts:
-            # If we already tried and failed, return None immediately
-            if self._xtts_error is not None:
-                return None
-            if self._xtts is None:
-                try:
-                    print("[XTTS] Loading model...")
-                    tts = TTS(model_name=S.xtts_model_name, progress_bar=False, gpu=False)
                     try:
-                        inner = getattr(getattr(tts, "synthesizer", None), "tts_model", None)
-                        if isinstance(inner, torch.nn.Module):
-                            if S.xtts_dynamic_int8:
-                                inner = self._maybe_dynamic_int8(inner)
-                                tts.synthesizer.tts_model = inner
-                            if S.xtts_torch_compile:
-                                inner = self._maybe_torch_compile(inner)
-                                tts.synthesizer.tts_model = inner
                     except Exception as e:
-                        print(f"[XTTS] Warning: Optimization failed: {e}")
-                    self._xtts = tts
-                    print("[XTTS] ✅ Model loaded successfully")
-                except Exception as e:
-                    self._xtts_error = str(e)
-                    print(f"[XTTS] ❌ Failed to load: {e}")
-                    return None
-            return self._xtts
-    def get_parler(self) -> Optional[Tuple]:
-        """
-        FIXED: Returns None on failure instead of crashing.
-        Returns (model, prompt_tokenizer, description_tokenizer) or None.
-        """
-        if not S.parler_enabled:
-            return None
-        with _Locks.parler:
-            if self._parler_error is not None:
-                return None
-            if ParlerTTSForConditionalGeneration is None:
-                self._parler_error = "parler_tts not installed"
-                print("[Parler] ❌ parler_tts is not installed")
-                return None
-            if self._parler is None or self._parler_prompt_tok is None or self._parler_desc_tok is None:
-                try:
-                    print("[Parler] Loading model...")
-                    # Load model
-                    model = ParlerTTSForConditionalGeneration.from_pretrained(S.parler_model_name).to("cpu")
-                    model.eval()
-                    # CRITICAL FIX: Load BOTH tokenizers for v1.1
-                    prompt_tokenizer = AutoTokenizer.from_pretrained(S.parler_model_name)
-                    description_tokenizer = AutoTokenizer.from_pretrained(
-                        model.config.text_encoder._name_or_path
-                    )
-                    # Best-effort compile/quantize
-                    if isinstance(model, torch.nn.Module):
-                        if S.parler_dynamic_int8:
-                            model = self._maybe_dynamic_int8(model)
-                        if S.parler_torch_compile:
-                            model = self._maybe_torch_compile(model)
-                    self._parler = model
-                    self._parler_prompt_tok = prompt_tokenizer
-                    self._parler_desc_tok = description_tokenizer
-                    print("[Parler] ✅ Model loaded successfully")
-                except Exception as e:
-                    self._parler_error = str(e)
-                    print(f"[Parler] ❌ Failed to load: {e}")
-                    return None
-            return self._parler, self._parler_prompt_tok, self._parler_desc_tok
-    def _load_piper_registry(self) -> Dict[str, str]:
-        """Load Piper voice registry from JSON env var and/or directory scan"""
-        reg: Dict[str, str] = {}
-        if S.piper_voices_json:
-            try:
-                reg.update(json.loads(S.piper_voices_json))
-            except Exception as e:
-                print(f"[Piper] Warning: Failed to parse PIPER_VOICES_JSON: {e}")
-        try:
-            if os.path.isdir(S.piper_voices_dir):
-                for fn in os.listdir(S.piper_voices_dir):
-                    if fn.endswith(".onnx"):
-                        voice_id = os.path.splitext(fn)[0]
-                        reg.setdefault(voice_id, os.path.join(S.piper_voices_dir, fn))
         except Exception as e:
-            print(f"[Piper] Warning: Failed to scan {S.piper_voices_dir}: {e}")
-        return reg
-    def list_piper_voices(self) -> Dict[str, str]:
-        """FIXED: Returns empty dict on error instead of crashing"""
-        if not S.piper_enabled:
-            return {}
-        with _Locks.piper:
-            if self._piper_error is not None:
-                return {}
-            if not self._piper_voices:
-                try:
-                    self._piper_voices = self._load_piper_registry()
-                    if self._piper_voices:
-                        print(f"[Piper] ✅ Found {len(self._piper_voices)} voices")
-                    else:
-                        print("[Piper] ⚠️  No voices found in registry")
-                except Exception as e:
-                    self._piper_error = str(e)
-                    print(f"[Piper] ❌ Failed to load registry: {e}")
-                    return {}
-            return dict(self._piper_voices)
-    def get_piper(self, voice_id: str) -> Optional["PiperVoice"]:
-        """FIXED: Returns None on failure with better error messages"""
-        if not S.piper_enabled:
-            return None
-        if PiperVoice is None:
-            self._piper_error = "piper not installed"
-            return None
-        with _Locks.piper:
-            if self._piper_error is not None:
-                return None
-            voices = self.list_piper_voices()
-            if voice_id not in voices:
-                return None
-            if voice_id not in self._piper_loaded:
                 try:
-                    print(f"[Piper] Loading voice: {voice_id}")
-                    path = voices[voice_id]
-                    voice = PiperVoice.load(path, use_cuda=False)
-                    self._piper_loaded[voice_id] = voice
-                    print(f"[Piper] ✅ Voice loaded: {voice_id}")
-                except Exception as e:
-                    print(f"[Piper] ❌ Failed to load voice {voice_id}: {e}")
-                    return None
-            return self._piper_loaded.get(voice_id)
-_manager = ModelManager()
-# -----------------------
-# Request / response models
-# -----------------------
-class XTTSSynthRequest(BaseModel):
-    text: str = Field(..., min_length=1, max_length=5000, description="Text to synthesize")
-    language: Optional[str] = Field(None, description="Language code (e.g. 'pl', 'en')")
-    speaker_wav_b64: Optional[str] = Field(None, description="Base64-encoded speaker WAV for voice cloning")
-    stream: bool = Field(False, description="If True, stream chunks via SSE")
-class XTTSStreamRequest(BaseModel):
-    text: str = Field(..., min_length=1, max_length=5000)
-    language: Optional[str] = None
-    speaker_wav_b64: Optional[str] = None
-class ParlerSynthRequest(BaseModel):
-    text: str = Field(..., min_length=1, max_length=5000)
-    description: Optional[str] = Field(None, description="Voice description (overrides default)")
-    stream: bool = Field(False, description="If True, stream chunks via SSE")
-class ParlerStreamRequest(BaseModel):
-    text: str = Field(..., min_length=1, max_length=5000)
-    description: Optional[str] = None
-class PiperSynthRequest(BaseModel):
     text: str = Field(..., min_length=1, max_length=5000)
-    voice_id: str = Field(..., description="Piper voice ID (from /v1/piper/voices)")
-    stream: bool = Field(False, description="If True, stream chunks via SSE")
-class PiperStreamRequest(BaseModel):
     text: str = Field(..., min_length=1, max_length=5000)
-    voice_id: str
 class AudioResponse(BaseModel):
-    audio_b64: str = Field(..., description="Base64-encoded WAV file")
     sample_rate: int
     duration_s: float
     text: str
 class HealthResponse(BaseModel):
     status: str = "ok"
-    version: str = "1.2.0-fallback"
-    services: Dict[str, bool] = Field(default_factory=dict)
-    piper_voices: int = 0
-    fallback: bool = True
-# -----------------------
 # FastAPI app
-# -----------------------
-app = FastAPI(title="Forge-TTS API", version="1.2.0")
 @app.get("/health", response_model=HealthResponse)
 def health():
-    """FIXED: Reports actual service availability"""
-    voices = _manager.list_piper_voices()
-    # Check service availability
-    xtts_available = S.xtts_enabled and _manager._xtts_error is None
-    parler_available = S.parler_enabled and _manager._parler_error is None
-    piper_available = S.piper_enabled and _manager._piper_error is None and len(voices) > 0
     return HealthResponse(
-        status="ok",
-        version="1.2.0-fallback",
-        services={
-            "xtts": xtts_available,
-            "parler": parler_available,
-            "piper": piper_available,
-        },
-        piper_voices=len(voices),
-        fallback=S.fallback_enabled,
     )
-# -----------------------
-# XTTS endpoints
-# -----------------------
-def _do_xtts_synth(text: str, language: str, speaker_wav_bytes: Optional[bytes]) -> Tuple[np.ndarray, int]:
-    """Internal XTTS synthesis with proper error handling"""
-    tts = _manager.get_xtts()
-    if tts is None:
-        raise HTTPException(status_code=503, detail="XTTS service unavailable. Check /health for status.")
-    with _Locks.xtts_infer:
-        kwargs = {
-            "text": text,
-            "language": language,
-            "speaker_wav": None,
-        }
-        if speaker_wav_bytes:
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-                tmp.write(speaker_wav_bytes)
-                tmp.flush()
-                tmp_path = tmp.name
-            try:
-                kwargs["speaker_wav"] = tmp_path
-                audio_np = tts.tts(**_filter_kwargs(tts.tts, kwargs))
-            finally:
-                try:
-                    os.remove(tmp_path)
-                except Exception:
-                    pass
-        else:
-            audio_np = tts.tts(**_filter_kwargs(tts.tts, kwargs))
-    sr = getattr(tts, "synthesizer", None)
-    sr = getattr(sr, "output_sample_rate", 22050) if sr else 22050
-    return np.asarray(audio_np, dtype=np.float32), sr
 @app.post("/v1/xtts/synthesize", response_model=AudioResponse)
-def xtts_synthesize(req: XTTSSynthRequest):
-    """FIXED: Proper error handling for model loading failures"""
-    if req.stream:
-        raise HTTPException(status_code=400, detail="Use /v1/xtts/stream for streaming synthesis")
     speaker_bytes = None
     if req.speaker_wav_b64:
         try:
             speaker_bytes = base64.b64decode(req.speaker_wav_b64)
         except Exception as e:
-            raise HTTPException(status_code=400, detail=f"Invalid base64 speaker_wav: {e}")
-    lang = req.language or S.xtts_default_language
-    try:
-        audio, sr = _do_xtts_synth(req.text, lang, speaker_bytes)
-        wav_bytes = wav_bytes_from_audio(audio, sr)
-        duration = len(audio) / sr
-        return AudioResponse(
-            audio_b64=b64encode_bytes(wav_bytes),
-            sample_rate=sr,
-            duration_s=round(duration, 3),
-            text=req.text,
-        )
-    except HTTPException:
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"XTTS synthesis failed: {str(e)}")
 @app.post("/v1/xtts/stream")
-async def xtts_stream(req: XTTSStreamRequest):
-    """Stream XTTS synthesis as SSE chunks"""
     speaker_bytes = None
     if req.speaker_wav_b64:
         try:
             speaker_bytes = base64.b64decode(req.speaker_wav_b64)
         except Exception as e:
-            raise HTTPException(status_code=400, detail=f"Invalid base64: {e}")
     chunks = split_text_into_chunks(req.text)
     if not chunks:
-        raise HTTPException(status_code=400, detail="No text to synthesize after chunking")
-    lang = req.language or S.xtts_default_language
     async def generate():
         for i, chunk_text in enumerate(chunks):
             try:
-                audio, sr = await asyncio.to_thread(_do_xtts_synth, chunk_text, lang, speaker_bytes)
                 wav_bytes = wav_bytes_from_audio(audio, sr)
                 payload = {
                     "chunk_index": i,
                     "total_chunks": len(chunks),
                     "text": chunk_text,
                     "audio_b64": b64encode_bytes(wav_bytes),
                     "sample_rate": sr,
                 }
                 yield f"data: {json.dumps(payload)}\n\n"
             except Exception as e:
-                error_payload = {
-                    "error": str(e),
-                    "chunk_index": i,
-                    "text": chunk_text,
-                }
-                yield f"data: {json.dumps(error_payload)}\n\n"
                 break
         yield "data: [DONE]\n\n"
     return StreamingResponse(generate(), media_type="text/event-stream")
-# -----------------------
-# Parler-TTS endpoints
-# -----------------------
-def _do_parler_synth(text: str, description: str) -> Tuple[np.ndarray, int]:
-    """Internal Parler synthesis with FIXED dual tokenizer handling"""
-    result = _manager.get_parler()
-    if result is None:
-        raise HTTPException(status_code=503, detail="Parler service unavailable. Check /health for status.")
-    model, prompt_tok, desc_tok = result
-    with _Locks.parler_infer:
-        if S.parler_seed > 0:
-            set_seed(S.parler_seed)
-        # FIXED: Use correct tokenizers with attention masks
-        input_ids = prompt_tok(text, return_tensors="pt", padding=True).input_ids
-        attention_mask = prompt_tok(text, return_tensors="pt", padding=True).attention_mask
-        prompt_input_ids = desc_tok(description, return_tensors="pt", padding=True).input_ids
-        prompt_attention_mask = desc_tok(description, return_tensors="pt", padding=True).attention_mask
-        with torch.no_grad():
-            generation = model.generate(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                prompt_input_ids=prompt_input_ids,
-                prompt_attention_mask=prompt_attention_mask,
-            )
-        audio_arr = generation.cpu().numpy().squeeze()
-    sr = getattr(model.config, "sampling_rate", 44100)
-    return audio_arr.astype(np.float32), sr
-@app.post("/v1/parler/synthesize", response_model=AudioResponse)
-def parler_synthesize(req: ParlerSynthRequest):
-    """FIXED: Proper error handling for model loading failures"""
-    if req.stream:
-        raise HTTPException(status_code=400, detail="Use /v1/parler/stream for streaming")
-    desc = req.description or S.parler_default_description
-    try:
-        audio, sr = _do_parler_synth(req.text, desc)
-        wav_bytes = wav_bytes_from_audio(audio, sr)
-        duration = len(audio) / sr
-        return AudioResponse(
-            audio_b64=b64encode_bytes(wav_bytes),
-            sample_rate=sr,
-            duration_s=round(duration, 3),
-            text=req.text,
-        )
-    except HTTPException:
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Parler synthesis failed: {str(e)}")
-@app.post("/v1/parler/stream")
-async def parler_stream(req: ParlerStreamRequest):
-    """Stream Parler synthesis as SSE chunks"""
-    chunks = split_text_into_chunks(req.text)
     if not chunks:
-        raise HTTPException(status_code=400, detail="No text after chunking")
-    desc = req.description or S.parler_default_description
-    async def generate():
-        for i, chunk_text in enumerate(chunks):
-            try:
-                audio, sr = await asyncio.to_thread(_do_parler_synth, chunk_text, desc)
-                wav_bytes = wav_bytes_from_audio(audio, sr)
-                payload = {
-                    "chunk_index": i,
-                    "total_chunks": len(chunks),
-                    "text": chunk_text,
-                    "audio_b64": b64encode_bytes(wav_bytes),
-                    "sample_rate": sr,
-                }
-                yield f"data: {json.dumps(payload)}\n\n"
-            except Exception as e:
-                error_payload = {
-                    "error": str(e),
-                    "chunk_index": i,
-                    "text": chunk_text,
-                }
-                yield f"data: {json.dumps(error_payload)}\n\n"
-                break
-        yield "data: [DONE]\n\n"
-    return StreamingResponse(generate(), media_type="text/event-stream")
-# -----------------------
-# Piper endpoints
-# -----------------------
-@app.get("/v1/piper/voices")
-def piper_list_voices():
-    """FIXED: Returns helpful empty response when no voices available"""
-    voices = _manager.list_piper_voices()
-    if not voices:
-        return {
-            "voices": {},
-            "message": f"No Piper voices found. Check {S.piper_voices_dir} directory or PIPER_VOICES_JSON env var.",
-        }
-    return {"voices": voices}
-def _do_piper_synth(text: str, voice_id: str) -> Tuple[np.ndarray, int]:
-    """Internal Piper synthesis with proper error handling"""
-    voice = _manager.get_piper(voice_id)
-    if voice is None:
-        available = list(_manager.list_piper_voices().keys())
-        if not available:
-            raise HTTPException(
-                status_code=404,
-                detail=f"Piper voice '{voice_id}' not found. No voices available. Check /v1/piper/voices",
-            )
-        raise HTTPException(
-            status_code=404,
-            detail=f"Piper voice '{voice_id}' not found. Available: {available}. See /v1/piper/voices",
-        )
-    with _Locks.piper:
-        audio_bytes = io.BytesIO()
-        voice.synthesize(text, audio_bytes)
-        audio_bytes.seek(0)
-        audio_np, sr = sf.read(audio_bytes)
-        return audio_np.astype(np.float32), sr
-@app.post("/v1/piper/synthesize", response_model=AudioResponse)
-def piper_synthesize(req: PiperSynthRequest):
-    """FIXED: Better error messages showing available voices"""
-    if req.stream:
-        raise HTTPException(status_code=400, detail="Use /v1/piper/stream for streaming")
-    try:
-        audio, sr = _do_piper_synth(req.text, req.voice_id)
-        wav_bytes = wav_bytes_from_audio(audio, sr)
-        duration = len(audio) / sr
-        return AudioResponse(
-            audio_b64=b64encode_bytes(wav_bytes),
-            sample_rate=sr,
-            duration_s=round(duration, 3),
-            text=req.text,
-        )
-    except HTTPException:
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Piper synthesis failed: {str(e)}")
-@app.post("/v1/piper/stream")
-async def piper_stream(req: PiperStreamRequest):
-    """Stream Piper synthesis as SSE chunks"""
-    chunks = split_text_into_chunks(req.text)
-    if not chunks:
-        raise HTTPException(status_code=400, detail="No text after chunking")
-    async def generate():
-        for i, chunk_text in enumerate(chunks):
-            try:
-                audio, sr = await asyncio.to_thread(_do_piper_synth, chunk_text, req.voice_id)
-                wav_bytes = wav_bytes_from_audio(audio, sr)
-                payload = {
-                    "chunk_index": i,
-                    "total_chunks": len(chunks),
-                    "text": chunk_text,
-                    "audio_b64": b64encode_bytes(wav_bytes),
-                    "sample_rate": sr,
-                }
-                yield f"data: {json.dumps(payload)}\n\n"
-            except Exception as e:
-                error_payload = {
-                    "error": str(e),
-                    "chunk_index": i,
-                    "text": chunk_text,
-                }
-                yield f"data: {json.dumps(error_payload)}\n\n"
-                break
-        yield "data: [DONE]\n\n"
-    return StreamingResponse(generate(), media_type="text/event-stream")
-# -----------------------
-# Startup logging
-# -----------------------
 @app.on_event("startup")
 async def startup_event():
-    print("\n" + "="*60)
-    print("Forge-TTS API v1.2.0 - Starting")
-    print("="*60)
-    print(f"XTTS Enabled: {S.xtts_enabled}")
-    print(f"Parler Enabled: {S.parler_enabled}")
-    print(f"Piper Enabled: {S.piper_enabled}")
-    print(f"Fallback Chain: {S.fallback_enabled}")
-    print(f"Piper Voices Dir: {S.piper_voices_dir}")
-    print("="*60 + "\n")
-    # Trigger lazy loading to catch errors early
-    if S.xtts_enabled:
-        _manager.get_xtts()
-    if S.parler_enabled:
-        _manager.get_parler()
-    if S.piper_enabled:
-        _manager.list_piper_voices()
 if __name__ == "__main__":
     import uvicorn

 """
+Forge-TTS v2.0.0 — XTTS-v2 Only
+CPU-optimized TTS API with Polish voice cloning.
+Single backend: Coqui XTTS-v2 via idiap fork (coqui-tts>=0.27.0).
+Features:
+- Speaker latent caching (LRU, keyed by WAV hash)
+- Text chunking + audio concatenation
+- SSE streaming endpoint
+- Multipart WAV upload for cloning convenience
+- Configurable via env vars
 """
 from __future__ import annotations
 import asyncio
 import base64
+import hashlib
 import io
 import json
 import os
 import threading
 import time
 from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
 import numpy as np
 import soundfile as sf
 import torch
 from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+from fastapi.responses import StreamingResponse
 from pydantic import BaseModel, Field
+# ---------------------------------------------------------------------------
+# Settings (env-configurable)
+# ---------------------------------------------------------------------------
 def _env_bool(name: str, default: bool = False) -> bool:
     v = os.getenv(name)
     if v is None:
     return v.strip().lower() in {"1", "true", "yes", "y", "on"}
+def _env_float(name: str, default: float) -> float:
+    v = os.getenv(name)
+    return float(v) if v else default
def _env_int(name: str, default: int) -> int:
    """Read an integer setting from the environment, treating blank as unset.

    Raises:
        ValueError: If the variable holds text that is not a valid integer;
            the message names the offending variable.
    """
    raw = os.getenv(name)
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError as exc:
        raise ValueError(
            f"Environment variable {name} must be an integer, got {raw!r}"
        ) from exc


@dataclass(frozen=True)
class Settings:
    """Immutable service configuration sourced from environment variables.

    Every default below is computed once, when this class body is executed
    (i.e. at import time); changing the environment afterwards has no effect
    on instances created later. Integer settings go through ``_env_int`` so
    that an empty variable falls back to the default instead of crashing the
    import with ``int("")``.
    """

    # Model
    model_name: str = os.getenv("XTTS_MODEL_NAME", "tts_models/multilingual/multi-dataset/xtts_v2")
    default_language: str = os.getenv("XTTS_DEFAULT_LANGUAGE", "pl")
    # Generation params
    temperature: float = _env_float("XTTS_TEMPERATURE", 0.65)
    speed: float = _env_float("XTTS_SPEED", 1.0)
    top_p: float = _env_float("XTTS_TOP_P", 0.85)
    top_k: int = _env_int("XTTS_TOP_K", 50)
    repetition_penalty: float = _env_float("XTTS_REPETITION_PENALTY", 5.0)
    # Optimizations
    torch_compile: bool = _env_bool("XTTS_TORCH_COMPILE", False)
    use_fp16: bool = _env_bool("XTTS_USE_FP16", False)
    # Chunking
    chunk_max_chars: int = _env_int("CHUNK_MAX_CHARS", 250)
    chunk_max_words: int = _env_int("CHUNK_MAX_WORDS", 40)
    join_silence_ms: int = _env_int("JOIN_SILENCE_MS", 60)
    # Speaker cache
    speaker_cache_size: int = _env_int("SPEAKER_CACHE_SIZE", 8)
    # Runtime
    num_threads: int = _env_int("OMP_NUM_THREADS", 2)
 S = Settings()
+# Conservative CPU threading
 torch.set_num_threads(S.num_threads)
 torch.set_num_interop_threads(max(1, S.num_threads // 2))
+# ---------------------------------------------------------------------------
+# Text utilities (kept from v1)
+# ---------------------------------------------------------------------------
 # Split point: whitespace that follows one of . ! ? : ;  -- or any run of newlines.
 _SENT_SPLIT_RE = re.compile(r"(?<=[\.\!\?\:\;])\s+|\n+")
 # Matches any run of one or more whitespace characters.
 _WS_RE = re.compile(r"\s+")
 def normalize_text(text: str) -> str:
     """Trim *text* and collapse every whitespace run to a single space.

     Splitting on whitespace drops leading and trailing runs and yields the
     words in order; re-joining them with one space gives the same result as
     stripping and then replacing each whitespace run with a space.
     """
     words = text.split()
     return " ".join(words)
 def split_text_into_chunks(
     text: str,
     max_chars: int = S.chunk_max_chars,
     max_words: int = S.chunk_max_words,
 ) -> List[str]:
     """Group the sentences of *text* into chunks bounded by size limits.

     The text is normalized, split into sentences with ``_SENT_SPLIT_RE``,
     and sentences are packed greedily: a new chunk is started as soon as
     adding the next sentence would exceed either limit.

     Args:
         text: Raw input text.
         max_chars: Soft character budget per chunk.
         max_words: Soft word budget per chunk.

     Returns:
         The chunks in reading order; an empty list when *text* is empty
         after normalization. A single sentence that is larger than a limit
         is not split further and becomes a chunk of its own.
     """
     text = normalize_text(text)
     if not text:
         return []

     sents = [s.strip() for s in _SENT_SPLIT_RE.split(text) if s and s.strip()]
     chunks: List[str] = []
     cur: List[str] = []
     cur_chars = 0
     cur_words = 0

     def flush() -> None:
         """Emit the sentences collected so far as one chunk and reset."""
         nonlocal cur, cur_chars, cur_words
         if cur:
             chunks.append(" ".join(cur).strip())
             cur, cur_chars, cur_words = [], 0, 0

     for sent in sents:
         w = len(sent.split())
         c = len(sent)
         # Only close the running chunk when it already holds something, so
         # an oversized first sentence is still emitted rather than dropped.
         if cur and (cur_chars + c > max_chars or cur_words + w > max_words):
             flush()
         cur.append(sent)
         cur_chars += c + 1  # +1 accounts for the joining space
         cur_words += w

     flush()
     return chunks
 def wav_bytes_from_audio(audio: np.ndarray, sr: int) -> bytes:
     """Encode *audio* as an in-memory WAV file and return its bytes.

     The samples are converted to float32 and written at sample rate *sr*
     using the 16-bit PCM subtype.
     """
     samples = np.asarray(audio, dtype=np.float32)
     with io.BytesIO() as sink:
         sf.write(sink, samples, sr, format="WAV", subtype="PCM_16")
         return sink.getvalue()
 def concat_audio(chunks: List[np.ndarray], sr: int, silence_ms: int = S.join_silence_ms) -> np.ndarray:
     """Join audio chunks into one float32 array, with a pause between them.

     Args:
         chunks: Audio segments in playback order.
         sr: Sample rate, used to size the inserted pause.
         silence_ms: Pause length in milliseconds; values <= 0 disable it.

     Returns:
         A one-sample silent array for no chunks, the sole chunk (as float32)
         for one chunk, otherwise the chunks concatenated with silence
         inserted between neighbours (never after the last one).
     """
     if not chunks:
         return np.zeros((1,), dtype=np.float32)

     arrays = [np.asarray(part, dtype=np.float32) for part in chunks]
     if len(arrays) == 1:
         return arrays[0]
     if silence_ms <= 0:
         return np.concatenate(arrays)

     gap = np.zeros(int(sr * silence_ms / 1000), dtype=np.float32)
     pieces = [arrays[0]]
     for arr in arrays[1:]:
         pieces += (gap, arr)
     return np.concatenate(pieces)
 def b64encode_bytes(b: bytes) -> str:
     """Return the standard Base64 encoding of *b* as an ASCII string."""
     encoded = base64.standard_b64encode(b)
     return str(encoded, "ascii")
+# ---------------------------------------------------------------------------
+# Speaker latent cache (keyed by SHA-256 of WAV bytes)
+# ---------------------------------------------------------------------------
class SpeakerCache:
    """Bounded, lock-protected cache of speaker latents keyed by WAV content.

    Keys are the first 16 hex digits of the SHA-256 of the WAV bytes.
    Eviction is first-in-first-out: when the cache is full, the entry that
    was inserted earliest is dropped. Lookups do not refresh an entry.
    """

    def __init__(self, maxsize: Optional[int] = None):
        """Create an empty cache.

        Args:
            maxsize: Maximum number of entries. ``None`` (the default) uses
                ``S.speaker_cache_size``; zero or a negative value disables
                caching entirely.
        """
        # Resolved at construction time rather than at definition time, so
        # the class does not need the settings object just to be defined.
        self._maxsize = S.speaker_cache_size if maxsize is None else maxsize
        # Plain dicts keep insertion order, which doubles as the FIFO queue.
        self._cache: Dict[str, Tuple] = {}
        self._lock = threading.Lock()

    def _key(self, wav_bytes: bytes) -> str:
        """Return the cache key for *wav_bytes*."""
        return hashlib.sha256(wav_bytes).hexdigest()[:16]

    def get(self, wav_bytes: bytes) -> Optional[Tuple]:
        """Return the cached latents for *wav_bytes*, or ``None`` if absent."""
        key = self._key(wav_bytes)
        with self._lock:
            return self._cache.get(key)

    def put(self, wav_bytes: bytes, latents: Tuple) -> None:
        """Store *latents* for *wav_bytes* unless that key is already cached.

        An existing entry is left untouched. With a non-positive capacity the
        call is a no-op; previously a capacity of 0 raised ``IndexError``
        while trying to evict from an empty cache.
        """
        if self._maxsize <= 0:
            return
        key = self._key(wav_bytes)
        with self._lock:
            if key in self._cache:
                return
            # Evict oldest-inserted entries until there is room for one more.
            while len(self._cache) >= self._maxsize:
                oldest = next(iter(self._cache))
                del self._cache[oldest]
            self._cache[key] = latents
# Process-wide speaker cache, sized by the SpeakerCache default.
_speaker_cache = SpeakerCache()

# ---------------------------------------------------------------------------
# Model manager (lazy, thread-safe)
# ---------------------------------------------------------------------------
# Held while the model is being loaded, so only one thread performs the load.
_model_lock = threading.Lock()
# Held for the duration of each synthesis call.
_infer_lock = threading.Lock()
# The loaded TTS object, or None while nothing has been loaded.
_tts_model = None
# Message of the load failure, or None while no load has failed.
_tts_error: Optional[str] = None
def _get_model():
    """Return the shared TTS model, loading it on first use.

    The load is attempted at most once: on success the object is stored in
    ``_tts_model``; on failure the error text is stored in ``_tts_error`` and
    every later call returns ``None`` without retrying.

    Returns:
        The loaded TTS object, or ``None`` if loading failed.
    """
    global _tts_model, _tts_error

    # Cheap check without the lock for the common already-decided case.
    if _tts_error is not None:
        return None
    if _tts_model is not None:
        return _tts_model

    with _model_lock:
        # Another thread may have finished (or failed) while we waited.
        if _tts_error is not None:
            return None
        if _tts_model is not None:
            return _tts_model

        try:
            # Imported here so a missing/broken package is recorded as a
            # load failure instead of breaking module import.
            from TTS.api import TTS

            print(f"[XTTS] Loading {S.model_name} ...")
            started = time.time()
            engine = TTS(model_name=S.model_name, progress_bar=False, gpu=False)

            # Optional optimizations act on the underlying torch module, if
            # one is exposed at synthesizer.tts_model.
            synth = getattr(engine, "synthesizer", None)
            net = getattr(synth, "tts_model", None)
            if isinstance(net, torch.nn.Module):
                if S.use_fp16:
                    try:
                        net = net.half()
                        engine.synthesizer.tts_model = net
                        print("[XTTS] FP16 enabled")
                    except Exception as exc:
                        # Best effort: keep the model as it was.
                        print(f"[XTTS] FP16 failed: {exc}")
                if S.torch_compile:
                    try:
                        net = torch.compile(net)
                        engine.synthesizer.tts_model = net
                        print("[XTTS] torch.compile enabled")
                    except Exception as exc:
                        # Best effort: keep the uncompiled model.
                        print(f"[XTTS] torch.compile failed: {exc}")

            _tts_model = engine
            print(f"[XTTS] Model loaded in {time.time() - started:.1f}s")
        except Exception as exc:
            _tts_error = str(exc)
            print(f"[XTTS] FAILED to load: {exc}")
            return None

    return _tts_model
def _get_sample_rate() -> int:
    """Return the model's output sample rate, defaulting to 22050.

    The default is used when the model is unavailable, when it has no (or a
    falsy) ``synthesizer`` attribute, or when the synthesizer lacks
    ``output_sample_rate``.
    """
    fallback = 22050
    engine = _get_model()
    if engine is None:
        return fallback
    synth = getattr(engine, "synthesizer", None)
    if not synth:
        return fallback
    return getattr(synth, "output_sample_rate", fallback)
+# ---------------------------------------------------------------------------
+# Core synthesis function
+# ---------------------------------------------------------------------------
def _synthesize(text: str, language: str, speaker_wav_bytes: Optional[bytes] = None) -> Tuple[np.ndarray, int, float]:
    """Synthesize one piece of text with the shared model.

    Args:
        text: Text to speak.
        language: Language code passed to the model.
        speaker_wav_bytes: Optional reference recording; when given it is
            written to a temporary ``.wav`` file whose path is passed to the
            model as ``speaker_wav``.

    Returns:
        ``(audio, sample_rate, generation_time_s)`` with the audio as a
        float32 array. The elapsed time is measured from before the
        inference lock is acquired, so it includes any time spent waiting
        for another synthesis call to finish.

    Raises:
        HTTPException: 503 when the model is not available.
    """
    engine = _get_model()
    if engine is None:
        raise HTTPException(503, f"XTTS unavailable: {_tts_error or 'model not loaded'}")

    started = time.time()
    # Inference calls are serialized through a single lock.
    with _infer_lock:
        ref_path = None
        try:
            if speaker_wav_bytes:
                # NOTE(review): `_speaker_cache` is not consulted in this
                # function; the reference audio is re-written to disk on
                # every call.
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ref_file:
                    ref_file.write(speaker_wav_bytes)
                    ref_file.flush()
                    ref_path = ref_file.name
            samples = engine.tts(
                text=text,
                language=language,
                speaker_wav=ref_path,
            )
        finally:
            # delete=False above means the file must be removed by hand.
            if ref_path:
                try:
                    os.remove(ref_path)
                except OSError:
                    pass

    sample_rate = _get_sample_rate()
    elapsed = time.time() - started
    return np.asarray(samples, dtype=np.float32), sample_rate, elapsed
+# ---------------------------------------------------------------------------
+# Pydantic models
+# ---------------------------------------------------------------------------
class SynthRequest(BaseModel):
    """JSON body accepted by the non-streaming synthesis endpoint."""

    # Required text to speak, 1 to 5000 characters.
    text: str = Field(default=..., min_length=1, max_length=5000)
    # Optional language code; the endpoint substitutes the configured
    # default language when it is missing or empty.
    language: Optional[str] = Field(default=None, description="Language code (default: pl)")
    # Optional reference voice, sent as a Base64 string.
    speaker_wav_b64: Optional[str] = Field(default=None, description="Base64-encoded WAV for voice cloning")
class StreamRequest(BaseModel):
    """JSON body accepted by the streaming synthesis endpoint."""

    # Required text to speak, 1 to 5000 characters.
    text: str = Field(default=..., min_length=1, max_length=5000)
    # Optional language code; the endpoint falls back to the configured
    # default language when it is missing or empty.
    language: Optional[str] = Field(default=None)
    # Optional Base64-encoded reference voice.
    speaker_wav_b64: Optional[str] = Field(default=None)
 # Response body returned by the non-streaming synthesis endpoints.
 class AudioResponse(BaseModel):
     # Base64 text of the complete WAV file.
+    audio_b64: str
     # Sample rate reported by the synthesis step.
     sample_rate: int
     # Audio length in seconds (sample count / sample rate, rounded to 3 places).
     duration_s: float
     # Sum of the per-chunk synthesis times, rounded to 3 places.
+    generation_time_s: float
     # The request text, echoed back unchanged.
     text: str
 # Response body of the /health endpoint. Defaults that read from S are
 # evaluated once, when this class is defined.
 class HealthResponse(BaseModel):
     # "ok", or "degraded: <error>" when the handler reports a load failure.
     status: str = "ok"
     # Service version string.
+    version: str = "2.0.0"
     # Configured model name.
+    model: str = S.model_name
     # Configured default language.
+    language: str = S.default_language
     # Set by the handler to False once a model load error has been recorded.
+    xtts_available: bool = True
     # Configured capacity of the speaker cache.
+    speaker_cache_size: int = S.speaker_cache_size
# ---------------------------------------------------------------------------
# FastAPI app
# ---------------------------------------------------------------------------
# Single application object; the route decorators below register on it.
app = FastAPI(
    version="2.0.0",
    title="Forge-TTS API",
)
 @app.get("/health", response_model=HealthResponse)
 def health():
     """Report service health.

     The service counts as available as long as no model load error has been
     recorded; otherwise the status carries the stored error text.
     """
     if _tts_error is None:
         return HealthResponse(xtts_available=True, status="ok")
     return HealthResponse(
         xtts_available=False,
         status=f"degraded: {_tts_error}",
     )
 @app.post("/v1/xtts/synthesize", response_model=AudioResponse)
 def xtts_synthesize(req: SynthRequest):
     """Synthesize the whole request text and return it as one WAV.

     The text is split into chunks, each chunk is synthesized in order, and
     the audio is joined before being encoded.

     Raises:
         HTTPException: 400 for an undecodable ``speaker_wav_b64`` or for
             text that yields no chunks; errors raised by the synthesis step
             propagate unchanged.
     """
     # Decode the optional reference voice first so bad input fails early.
     if req.speaker_wav_b64:
         try:
             speaker_bytes = base64.b64decode(req.speaker_wav_b64)
         except Exception as e:
             raise HTTPException(400, f"Invalid base64 speaker_wav: {e}")
     else:
         speaker_bytes = None

     lang = req.language or S.default_language
     pieces = split_text_into_chunks(req.text)
     if not pieces:
         raise HTTPException(400, "Empty text after normalization")

     # Each result is (audio, sample_rate, generation_time_s).
     results = [_synthesize(piece, lang, speaker_bytes) for piece in pieces]
     audio_parts = [audio for audio, _, _ in results]
     sr = results[-1][1]  # rate reported for the last chunk
     total_gen = sum((took for _, _, took in results), 0.0)

     full_audio = concat_audio(audio_parts, sr)
     encoded = b64encode_bytes(wav_bytes_from_audio(full_audio, sr))
     return AudioResponse(
         audio_b64=encoded,
         sample_rate=sr,
         duration_s=round(len(full_audio) / sr, 3),
         generation_time_s=round(total_gen, 3),
         text=req.text,
     )
 @app.post("/v1/xtts/stream")
 async def xtts_stream(req: StreamRequest):
     """Stream synthesized audio chunk by chunk as server-sent events.

     Each event is a JSON object holding one chunk's standalone WAV (Base64)
     plus its index, text, sample rate and generation time. If a chunk fails,
     an event with ``error`` and ``chunk_index`` is sent and no further chunks
     are processed. The stream always ends with ``data: [DONE]``.

     Raises:
         HTTPException: 400 for an undecodable ``speaker_wav_b64`` or for
             text that yields no chunks (raised before streaming starts).
     """
     if req.speaker_wav_b64:
         try:
             speaker_bytes = base64.b64decode(req.speaker_wav_b64)
         except Exception as e:
             raise HTTPException(400, f"Invalid base64: {e}")
     else:
         speaker_bytes = None

     chunks = split_text_into_chunks(req.text)
     if not chunks:
         raise HTTPException(400, "Empty text after chunking")
     lang = req.language or S.default_language

     def sse(obj) -> str:
         """Format *obj* as one server-sent-event data frame."""
         return f"data: {json.dumps(obj)}\n\n"

     async def generate():
         for i, chunk_text in enumerate(chunks):
             try:
                 # Run the blocking synthesis in a worker thread so the event
                 # loop stays free while the chunk is generated.
                 audio, sr, gen_t = await asyncio.to_thread(
                     _synthesize, chunk_text, lang, speaker_bytes
                 )
                 encoded = b64encode_bytes(wav_bytes_from_audio(audio, sr))
                 yield sse({
                     "chunk_index": i,
                     "total_chunks": len(chunks),
                     "text": chunk_text,
                     "audio_b64": encoded,
                     "sample_rate": sr,
                     "generation_time_s": round(gen_t, 3),
                 })
             except Exception as e:
                 yield sse({'error': str(e), 'chunk_index': i})
                 break
         yield "data: [DONE]\n\n"

     return StreamingResponse(generate(), media_type="text/event-stream")
@app.post("/v1/xtts/clone", response_model=AudioResponse)
async def xtts_clone(
    text: str = Form(..., min_length=1, max_length=5000),
    language: str = Form(default=S.default_language),
    speaker_wav: UploadFile = File(..., description="WAV file for voice cloning"),
):
    """Convenience endpoint: multipart form with WAV file upload (not base64).

    Args:
        text: Text to speak, 1 to 5000 characters.
        language: Language code; defaults to the configured language.
        speaker_wav: Uploaded reference recording used for voice cloning.

    Returns:
        An ``AudioResponse`` with the complete WAV for the whole text.

    Raises:
        HTTPException: 400 when the upload is smaller than 44 bytes or
            larger than 10 MB, or when the text yields no chunks; errors
            raised by the synthesis step propagate unchanged.
    """
    wav_bytes = await speaker_wav.read()
    if len(wav_bytes) < 44:
        raise HTTPException(400, "WAV file too small or empty")
    if len(wav_bytes) > 10 * 1024 * 1024:
        raise HTTPException(400, "WAV file too large (max 10MB)")

    chunks = split_text_into_chunks(text)
    if not chunks:
        raise HTTPException(400, "Empty text after normalization")

    audio_parts = []
    total_gen = 0.0
    sr = 22050
    for chunk_text in chunks:
        # This handler is a coroutine, so calling the blocking synthesis
        # directly would stall the event loop (and every other request) until
        # it finished. Offload it to a worker thread, as the streaming
        # endpoint does.
        audio, sr, gen_t = await asyncio.to_thread(
            _synthesize, chunk_text, language, wav_bytes
        )
        audio_parts.append(audio)
        total_gen += gen_t

    full_audio = concat_audio(audio_parts, sr)
    wav_out = wav_bytes_from_audio(full_audio, sr)
    return AudioResponse(
        audio_b64=b64encode_bytes(wav_out),
        sample_rate=sr,
        duration_s=round(len(full_audio) / sr, 3),
        generation_time_s=round(total_gen, 3),
        text=text,
    )
+# ---------------------------------------------------------------------------
+# Startup
+# ---------------------------------------------------------------------------
 @app.on_event("startup")
 async def startup_event():
     """Print the configuration banner, then load the model.

     Loading here, rather than on the first request, means a load failure is
     logged and recorded as soon as the application starts.
     """
     rule = "=" * 60
     banner = (
         "\n" + rule,
         "Forge-TTS v2.0.0 — XTTS-v2 Only",
         rule,
         f"Model:    {S.model_name}",
         f"Language: {S.default_language}",
         f"Threads:  {S.num_threads}",
         f"FP16:     {S.use_fp16}",
         f"Compile:  {S.torch_compile}",
         rule + "\n",
     )
     for line in banner:
         print(line)

     # Eager load to catch errors at startup
     _get_model()
 if __name__ == "__main__":
     import uvicorn