Spaces:

Chronisin
/

chronisai

Configuration error

App Files Community

RiishabhSinghal committed 29 days ago

Commit

51e47e1

1 Parent(s): e082b9f

Resolve app.py merge conflicts

Browse files

Files changed (1) hide show

app.py +14 -688

app.py CHANGED Viewed

@@ -1,9 +1,4 @@
 import os
-<<<<<<< HEAD
-os.environ["PYTHONUTF8"] = "1"
-os.environ["PYTHONIOENCODING"] = "utf-8"
-=======
-<<<<<<< HEAD
 os.environ["PYTHONUTF8"] = "1"
 os.environ["PYTHONIOENCODING"] = "utf-8"
@@ -14,20 +9,10 @@ sys.stderr.reconfigure(encoding="utf-8")
 import re
 import gc
-=======
->>>>>>> main
-import sys
-sys.stdout.reconfigure(encoding='utf-8')
-sys.stderr.reconfigure(encoding='utf-8')
-import re
-import gc
-import base64
->>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
 import tempfile
 import subprocess
 import shutil
 import threading
-<<<<<<< HEAD
 from pathlib import Path
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -46,33 +31,22 @@ _tts_instance = None  # lazy-loaded TTS object
 print("=== Chronis XTTS-v2 Space Booting ===", flush=True)
-# ──────────────────────────────────────────────────────────────────────────────
-# Setup — install TTS library and download XTTS-v2 weights on first run
-# ──────────────────────────────────────────────────────────────────────────────
 def setup():
-    """
-    Installs the Coqui TTS library if absent, then downloads XTTS-v2 weights
-    to MODEL_DIR (skipped when weights are already present).
-    """
-    # 1. Make sure the TTS package is available
     try:
         import TTS  # noqa: F401
         print("[setup] TTS library already installed.", flush=True)
     except ImportError:
         print("[setup] Installing TTS library ...", flush=True)
-        subprocess.run(
-            [sys.executable, "-m", "pip", "install", "TTS", "-q"],
-            check=True,
-        )
         print("[setup] TTS library installed.", flush=True)
-    # 2. Pre-download XTTS-v2 weights so first inference isn't cold
     MODEL_DIR.mkdir(parents=True, exist_ok=True)
     config_path = MODEL_DIR / "config.json"
     if not config_path.exists():
         print("[setup] Downloading XTTS-v2 weights ...", flush=True)
         from huggingface_hub import snapshot_download
         snapshot_download(
             repo_id="coqui/XTTS-v2",
             local_dir=str(MODEL_DIR),
@@ -84,415 +58,21 @@ def setup():
 def get_tts():
-    """
-    Lazy-load the TTS model.  Reuses the same instance across calls so the
-    ~1.8 GB model is only loaded into memory once per process.
-    """
     global _tts_instance
     if _tts_instance is None:
         from TTS.api import TTS
         print("[tts] Loading XTTS-v2 model ...", flush=True)
         _tts_instance = TTS(
             model_path=str(MODEL_DIR),
             config_path=str(MODEL_DIR / "config.json"),
             progress_bar=False,
-            gpu=False,  # CPU-only; set True if CUDA available
         )
-        print("[tts] Model loaded ✓", flush=True)
     return _tts_instance
-=======
-try:
-    import tomllib
-except ModuleNotFoundError:
-    try:
-        import tomli as tomllib
-    except ModuleNotFoundError:
-        tomllib = None
-try:
-    import tomli_w
-except ModuleNotFoundError:
-    tomli_w = None
-from pathlib import Path
-os.environ["GRADIO_SSR_MODE"]        = "0"
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-os.environ["OMP_NUM_THREADS"]        = str(os.cpu_count() or 1)
-os.environ["CUDA_VISIBLE_DEVICES"]   = ""
-import gradio as gr
-from pydub import AudioSegment
-from huggingface_hub import snapshot_download
-SECRET    = os.environ.get("API_SECRET", "")
-REPO_DIR  = Path(os.environ.get("FISH_REPO_DIR",  r"C:\tmp\fish-speech"))
-MODEL_DIR = Path(os.environ.get("FISH_MODEL_DIR", r"C:\tmp\fish-speech-weights"))
-inference_lock = threading.Lock()
-initialized    = False
-print("=== Chronis Fish Speech Space Booting ===", flush=True)
-# ──────────────────────────────────────────────────────────────────────────────
-# Patch 1 — LogMelSpectrogram
-#
-# History of bugs fixed in this class:
-#
-# Round 1 — AttributeError: 'LogMelSpectrogram' has no attribute 'hop_length'
-#   firefly.py reads self.spec_transform.hop_length (and n_mels, n_fft, etc.)
-#   directly on the object. They were only stored inside self._transform.
-#   Fix: expose every __init__ param as a top-level self.* attribute.
-#
-# Round 2 (current) — RuntimeError: size of tensor a (1292) must match b (160)
-#                      at non-singleton dimension 3
-#
-#   Root cause A — wrong input shape -> 4-D output:
-#     vqgan/inference.py loads audio with torchaudio.load() -> (C, T),
-#     then passes it as (1, C, T) = (1, 1, T) to model.encode().
-#     firefly.encode() calls self.spec_transform(audios) with a 3-D tensor.
-#     T.MelSpectrogram treats every dim except the last as a batch dim,
-#     so (B=1, C=1, T) -> output (B=1, C=1, n_mels, T_frames) [4-D].
-#     Downstream masks are computed as 3-D (B, 1, T_vq).
-#     PyTorch broadcasting aligns from the right:
-#       mels:            (1, 1, 160, 1292)   dim-3 = 1292
-#       mel_masks_conv:  (1,  1,  1,  160)   dim-3 =  160
-#       -> "size of tensor a (1292) must match b (160) at non-singleton dim 3"
-#     Fix: squeeze the channel dim inside forward() so output is always 3-D.
-#
-#   Root cause B — wrong default hyperparameters:
-#     The "21hz" in firefly-gan-vq-fsq-8x1024-21hz encodes the token rate:
-#       44100 / (hop_length × 8_conv_strides) ≈ 21  ->  hop_length = 256
-#     n_mels is 160 for fish-speech, not 128.
-#     Hydra injects the correct values via __init__ kwargs, but using the
-#     right defaults prevents silent fallback failures.
-# ──────────────────────────────────────────────────────────────────────────────
-SPECTROGRAM_SRC = '''\
-"""
-fish_speech.utils.spectrogram  —  patched by Chronis setup.
-See app.py Patch 1 comment block for the full explanation of fixes.
-"""
-import torch
-import torch.nn as nn
-import torchaudio.transforms as T
-class LogMelSpectrogram(nn.Module):
-    def __init__(
-        self,
-        sample_rate: int   = 44100,
-        n_fft:       int   = 1024,
-        hop_length:  int   = 256,
-        win_length:  int   = 1024,
-        n_mels:      int   = 160,
-        f_min:       float = 0.0,
-        f_max:       float = None,
-        center:      bool  = True,
-        power:       float = 1.0,
-        norm:        str   = None,
-        mel_scale:   str   = "slaney",
-        clamp_min:   float = 1e-5,
-    ):
-        super().__init__()
-        # Every param must be a direct instance attribute.
-        # firefly.py reads them as self.spec_transform.<attr>.
-        self.sample_rate = sample_rate
-        self.n_fft       = n_fft
-        self.hop_length  = hop_length
-        self.win_length  = win_length
-        self.n_mels      = n_mels
-        self.f_min       = f_min
-        self.f_max       = f_max if f_max is not None else float(sample_rate) / 2.0
-        self.clamp_min   = clamp_min
-        self._transform = T.MelSpectrogram(
-            sample_rate = sample_rate,
-            n_fft       = n_fft,
-            hop_length  = hop_length,
-            win_length  = win_length,
-            n_mels      = n_mels,
-            f_min       = f_min,
-            f_max       = self.f_max,
-            center      = center,
-            power       = power,
-            norm        = norm,
-            mel_scale   = mel_scale,
-        )
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        x   : (B, T) | (T,) | (B, 1, T) | (B, C, T)
-        out : (B, n_mels, T_frames)  — always 3-D, never 4-D
-        The channel-squeeze is critical. vqgan/inference.py passes audio as
-        (B=1, C=1, T); without the squeeze T.MelSpectrogram returns a 4-D
-        tensor which mismatches the 3-D conv mask, crashing at dim 3.
-        """
-        if x.ndim == 3:
-            if x.shape[1] == 1:
-                x = x.squeeze(1)   # mono   (B, 1, T) -> (B, T)
-            else:
-                x = x.mean(dim=1)  # stereo (B, C, T) -> (B, T)
-        mel = self._transform(x)
-        return torch.log(torch.clamp(mel, min=self.clamp_min))
-'''
-def _patch_spectrogram_module():
-    utils_dir = REPO_DIR / "fish_speech" / "utils"
-    utils_dir.mkdir(parents=True, exist_ok=True)
-    init_file = utils_dir / "__init__.py"
-    if not init_file.exists():
-        init_file.write_text("# auto-generated by Chronis setup\n")
-    spec_file = utils_dir / "spectrogram.py"
-    spec_file.write_text(SPECTROGRAM_SRC)
-    # Delete any stale .pyc that could shadow the updated .py
-    pyc_dir = utils_dir / "__pycache__"
-    if pyc_dir.exists():
-        for pyc in pyc_dir.glob("spectrogram*.pyc"):
-            pyc.unlink()
-            print(f"[patch] deleted stale {pyc}", flush=True)
-    print(f"[patch] wrote {spec_file}", flush=True)
-# ──────────────────────────────────────────────────────────────────────────────
-# Patch 2 — strip pyaudio from all dependency manifests
-# ──────────────────────────────────────────────────────────────────────────────
-def _drop_dep(dep_list: list, pattern: str) -> list:
-    return [d for d in dep_list if not d.lower().startswith(pattern)]
-def _patch_pyproject_toml():
-    pyproject = REPO_DIR / "pyproject.toml"
-    if not pyproject.exists():
-        return
-    with open(pyproject, "rb") as f:
-        data = tomllib.load(f)
-    changed = False
-    deps = data.get("project", {}).get("dependencies", [])
-    if deps:
-        new_deps = _drop_dep(deps, "pyaudio")
-        if new_deps != deps:
-            data["project"]["dependencies"] = new_deps
-            changed = True
-    poetry_deps = data.get("tool", {}).get("poetry", {}).get("dependencies", {})
-    if "pyaudio" in poetry_deps or "PyAudio" in poetry_deps:
-        poetry_deps.pop("pyaudio", None)
-        poetry_deps.pop("PyAudio", None)
-        changed = True
-    if changed:
-        with open(pyproject, "wb") as f:
-            tomli_w.dump(data, f)
-        print("[patch] removed pyaudio from pyproject.toml", flush=True)
-def _patch_requirements_txt():
-    for fname in ("requirements.txt", "requirements-base.txt"):
-        req = REPO_DIR / fname
-        if not req.exists():
-            continue
-        lines     = req.read_text().splitlines()
-        new_lines = [l for l in lines if not l.lower().startswith("pyaudio")]
-        if new_lines != lines:
-            req.write_text("\n".join(new_lines) + "\n")
-            print(f"[patch] removed pyaudio from {fname}", flush=True)
-def _patch_setup_cfg():
-    setup_cfg = REPO_DIR / "setup.cfg"
-    if not setup_cfg.exists():
-        return
-    text     = setup_cfg.read_text()
-    new_text = "\n".join(
-        l for l in text.splitlines() if not l.strip().lower().startswith("pyaudio")
-    )
-    if new_text != text:
-        setup_cfg.write_text(new_text)
-        print("[patch] removed pyaudio from setup.cfg", flush=True)
-def _patch_dependencies():
-    global tomllib, tomli_w
-    if tomllib is None or tomli_w is None:
-        subprocess.run(
-            [sys.executable, "-m", "pip", "install", "tomli", "tomli_w", "-q"],
-            check=True,
-        )
-        import tomli   as tomllib
-        import tomli_w as tomli_w
-    _patch_pyproject_toml()
-    _patch_requirements_txt()
-    _patch_setup_cfg()
-# ──────────────────────────────────────────────────────────────────────────────
-# Patch 3 — CPU-safe subprocess wrapper
-# ──────────────────────────────────────────────────────────────────────────────
-WRAPPER_PATH = Path("/tmp/_chronis_torch_cpu.py")
-_WRAPPER_SRC = '''\
-"""
-Chronis CPU-safe subprocess wrapper.
-Forces torch.load -> CPU, disables weights_only, redirects .to(cuda) -> .to(cpu).
-Usage: python _chronis_torch_cpu.py <real_script.py> [args...]
-"""
-import sys
-import torch
-import runpy
-_original_load = torch.load
-def _cpu_safe_load(f, map_location=None, pickle_module=None, **kwargs):
-    kwargs["weights_only"] = False
-    kwargs["map_location"]  = "cpu"
-    if pickle_module is not None:
-        kwargs["pickle_module"] = pickle_module
-    return _original_load(f, **kwargs)
-torch.load = _cpu_safe_load
-_orig_module_to = torch.nn.Module.to
-def _cpu_module_to(self, *args, **kwargs):
-    new_args = []
-    for a in args:
-        if isinstance(a, (str, torch.device)) and "cuda" in str(a):
-            a = torch.device("cpu")
-        new_args.append(a)
-    if "device" in kwargs and "cuda" in str(kwargs["device"]):
-        kwargs["device"] = torch.device("cpu")
-    return _orig_module_to(self, *new_args, **kwargs)
-torch.nn.Module.to = _cpu_module_to
-_orig_tensor_to = torch.Tensor.to
-def _cpu_tensor_to(self, *args, **kwargs):
-    new_args = []
-    for a in args:
-        if isinstance(a, (str, torch.device)) and "cuda" in str(a):
-            a = torch.device("cpu")
-        new_args.append(a)
-    if "device" in kwargs and "cuda" in str(kwargs["device"]):
-        kwargs["device"] = torch.device("cpu")
-    return _orig_tensor_to(self, *new_args, **kwargs)
-torch.Tensor.to = _cpu_tensor_to
-sys.argv = sys.argv[1:]
-runpy.run_path(sys.argv[0], run_name="__main__")
-'''
-def _patch_torch_load():
-    WRAPPER_PATH.write_text(_WRAPPER_SRC)
-    print(f"[patch] wrote subprocess wrapper -> {WRAPPER_PATH}", flush=True)
-# ──────────────────────────────────────────────────────────────────────────────
-def _build_env():
-    existing = os.environ.get("PYTHONPATH", "")
-    parts = [str(REPO_DIR)]
-    if existing:
-        parts.append(existing)
-    new_pythonpath = os.pathsep.join(parts)
-    return {
-        **os.environ,
-        "PYTHONPATH":           new_pythonpath,
-        "HYDRA_FULL_ERROR":     "1",
-        "CUDA_VISIBLE_DEVICES": "",
-        "PYTHONUTF8":           "1",
-    }
-# Add this to _patch_spectrogram_module() in app.py, replacing the current version:
-def _patch_spectrogram_module():
-    # Ensure the full package chain exists
-    for pkg_dir in [
-        REPO_DIR / "fish_speech",
-        REPO_DIR / "fish_speech" / "utils",
-    ]:
-        pkg_dir.mkdir(parents=True, exist_ok=True)
-        init_file = pkg_dir / "__init__.py"
-        if not init_file.exists():
-            init_file.write_text("# auto-generated\n")
-            print(f"[patch] created {init_file}", flush=True)
-    spec_file = REPO_DIR / "fish_speech" / "utils" / "spectrogram.py"
-    spec_file.write_text(SPECTROGRAM_SRC)
-    # Nuke ALL pycache under fish_speech to prevent stale imports
-    for pyc_dir in (REPO_DIR / "fish_speech").rglob("__pycache__"):
-        for f in pyc_dir.iterdir():
-            f.unlink()
-        print(f"[patch] cleared {pyc_dir}", flush=True)
-    print(f"[patch] wrote {spec_file}", flush=True)
-# ──────────────────────────────────────────────────────────────────────────────
-# Setup
-# ──────────────────────────────────────────────────────────────────────────────
-def setup():
-    global initialized
-    if initialized:
-        return
-    if not REPO_DIR.exists():
-        print("Cloning Fish Speech v1.5.0 ...", flush=True)
-        subprocess.run(
-            [
-                "git", "clone",
-                "--depth", "1",
-                "--branch", "v1.5.0",
-                "https://github.com/fishaudio/fish-speech.git",
-                str(REPO_DIR),
-            ],
-            check=True,
-        )
-    _patch_spectrogram_module()
-    _patch_dependencies()
-    _patch_torch_load()
-    # print("Installing Fish Speech (editable) ...", flush=True)
-    # subprocess.run(
-    #     [sys.executable, "-m", "pip", "install", "-e", ".", "--quiet"],
-    #     cwd=str(REPO_DIR),
-    #     check=True,
-    # )
-    # Re-apply AFTER pip install — editable install can cache stale .pyc files
-    _patch_spectrogram_module()
-    if str(REPO_DIR) not in sys.path:
-        sys.path.insert(0, str(REPO_DIR))
-    if not MODEL_DIR.exists() or not any(MODEL_DIR.iterdir()):
-        print("Downloading Fish Speech 1.5 weights ...", flush=True)
-        snapshot_download(
-            repo_id               = "fishaudio/fish-speech-1.5",
-            local_dir             = str(MODEL_DIR),
-            local_dir_use_symlinks = False,
-        )
-    print("Setup complete.", flush=True)
-    initialized = True
->>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
-# ──────────────────────────────────────────────────────────────────────────────
-# Text helpers
-# ──────────────────────────────────────────────────────────────────────────────
 def clean_text(text: str) -> str:
     text = re.sub(r"[^\x00-\x7F]+", " ", text)
@@ -502,21 +82,10 @@ def clean_text(text: str) -> str:
     return text[:500]
-<<<<<<< HEAD
 def split_sentences(text: str, max_chars: int = 200) -> list[str]:
-    """
-    XTTS handles longer segments better than Fish Speech, so we use a
-    generous 200-char chunk limit instead of 120.
-    """
     parts = re.split(r"(?<=[.!?])\s+", text)
     chunks: list[str] = []
     buf = ""
-=======
-def split_sentences(text: str, max_chars: int = 120) -> list:
-    parts  = re.split(r"(?<=[.!?])\s+", text)
-    chunks = []
-    buf    = ""
->>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
     for p in parts:
         if len(buf) + len(p) < max_chars:
             buf = (buf + " " + p).strip()
@@ -529,27 +98,15 @@ def split_sentences(text: str, max_chars: int = 120) -> list:
     return chunks or [text]
-# ──────────────────────────────────────────────────────────────────────────────
-# Audio helpers
-# ──────────────────────────────────────────────────────────────────────────────
 def prepare_ref_audio(ref_path: str) -> str:
-    """
-<<<<<<< HEAD
-    Normalise reference audio to mono 24 000 Hz WAV, capped at 10 seconds.
-    XTTS-v2 expects 24 kHz input for its speaker encoder.
-    Recommended reference length: 6-12 s; we cap at 10 s for CPU speed.
-    """
     audio = AudioSegment.from_file(ref_path)
-    audio = audio.set_channels(1).set_frame_rate(24_000).normalize()
-    if len(audio) > 10_000:
-        audio = audio[:10_000]
-    elif len(audio) < 1_000:
-        raise ValueError(
-            f"Reference audio too short ({len(audio)} ms). Need at least 1 second."
-        )
     fd, tmp_path = tempfile.mkstemp(suffix=".wav")
     os.close(fd)
@@ -557,20 +114,7 @@ def prepare_ref_audio(ref_path: str) -> str:
     return tmp_path
-# ──────────────────────────────────────────────────────────────────────────────
-# Inference
-# ──────────────────────────────────────────────────────────────────────────────
 def run_chunk(tts, text: str, ref_audio: str, out_path: str):
-    """
-    Synthesise one text chunk and write the result to out_path (WAV).
-    XTTS-v2 tts_to_file() signature:
-        text           – the utterance
-        speaker_wav    – reference audio file(s) for voice cloning
-        language       – BCP-47 code; "en" covers most use-cases
-        file_path      – output WAV path
-    """
     tts.tts_to_file(
         text=text,
         speaker_wav=ref_audio,
@@ -581,20 +125,17 @@ def run_chunk(tts, text: str, ref_audio: str, out_path: str):
 def synthesize(text: str, ref_audio_path: str, secret: str):
     with inference_lock:
-        # ── Auth ──────────────────────────────────────────────────────────────
         if SECRET and secret != SECRET:
             return None, "Unauthorized"
         if not ref_audio_path or not Path(ref_audio_path).exists():
             return None, "Reference audio missing or not uploaded"
-        # ── First-run setup ───────────────────────────────────────────────────
         try:
             setup()
         except Exception as e:
             return None, f"Setup failed: {e}"
-        # ── Synthesis ─────────────────────────────────────────────────────────
         cleaned = clean_text(text)
         chunks = split_sentences(cleaned)
         workdir = Path(tempfile.mkdtemp(prefix="chronis_xtts_"))
@@ -616,15 +157,12 @@ def synthesize(text: str, ref_audio_path: str, secret: str):
             fd, tmp_out = tempfile.mkstemp(suffix=".wav")
             os.close(fd)
             combined.export(tmp_out, format="wav")
-            # Return file path directly so Gradio renders a playable audio output.
             final_audio_path = tmp_out
             tmp_out = None
             return final_audio_path, "ok"
         except Exception as e:
             print(f"[synth] ERROR: {e}", flush=True)
             return None, str(e)
         finally:
             if clean_ref and Path(clean_ref).exists():
                 try:
@@ -636,196 +174,9 @@ def synthesize(text: str, ref_audio_path: str, secret: str):
                     os.unlink(tmp_out)
                 except OSError:
                     pass
-=======
-    Normalise to mono 44100 Hz WAV, capped at 8 seconds.
-    Fish Speech docs recommend 3-10 s of reference. We cap at 8 s:
-      - Short enough to keep CPU encode time reasonable
-      - Long enough for good speaker characterisation
-      - Avoids edge-case rounding in the conv-mask stride at 15 s lengths
-    """
-    audio = AudioSegment.from_file(ref_path)
-    audio = audio.set_channels(1).set_frame_rate(44100).normalize()
-    if len(audio) > 8_000:
-        audio = audio[:8_000]
-    elif len(audio) < 1_000:
-        raise ValueError(
-            f"Reference audio too short ({len(audio)}ms). Need at least 1 second."
-        )
-    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-    audio.export(tmp.name, format="wav")
-    return tmp.name
-# ──────────────────────────────────────────────────────────────────────────────
-# Inference pipeline
-# ──────────────────────────────────────────────────────────────────────────────
-def run_step(cmd: list, name: str, cwd: Path, expect_output: Path = None):
-    """
-    Run a Fish Speech subprocess through the CPU wrapper.
-    Raises a detailed RuntimeError on non-zero exit or missing expected output.
-    """
-    print(f"[{name}] starting ...", flush=True)
-    wrapped_cmd = [cmd[0], str(WRAPPER_PATH)] + cmd[1:]
-    result = subprocess.run(
-        wrapped_cmd,
-        cwd            = str(cwd),
-        capture_output = True,
-        text           = True,
-        encoding       = "utf-8",
-        errors         = "replace",
-        env            = _build_env(),
-        timeout        = 600,
-    )
-    if result.stdout.strip():
-       print(f"[{name}] stdout:\n{result.stdout[-1200:]}".encode("utf-8", "replace").decode(), flush=True)
-    if result.returncode != 0:
-        diag = (
-            f"[{name}] FAILED (exit {result.returncode})\n"
-            f"--- stderr ---\n{result.stderr[-1500:]}\n"
-            f"--- stdout ---\n{result.stdout[-600:]}"
-        )
-        print(diag, flush=True)
-        raise RuntimeError(diag)
-    if expect_output is not None and not expect_output.exists():
-        raise RuntimeError(
-            f"[{name}] exited 0 but expected output missing: {expect_output}\n"
-            f"stdout: {result.stdout[-800:]}\nstderr: {result.stderr[-800:]}"
-        )
-    print(f"[{name}] done ✓", flush=True)
-def run_chunk(text: str, ref_audio: str, workdir: Path, idx: int) -> str:
-    chunk_dir = workdir / f"chunk_{idx}"
-    chunk_dir.mkdir(parents=True, exist_ok=True)
-    ref_copy   = chunk_dir / "ref.wav"
-    shutil.copy(ref_audio, ref_copy)
-    vq_tokens  = chunk_dir / "fake.npy"
-    sem_tokens = chunk_dir / "codes_0.npy"
-    out_wav    = chunk_dir / "fake.wav"
-    # In fish-speech v1.5, tools/vqgan/inference.py handles BOTH encode and
-    # decode. Mode is auto-detected from the input file extension:
-    #   .wav -> encode -> writes fake.npy
-    #   .npy -> decode -> writes fake.wav
-    vqgan_script = str(REPO_DIR / "tools" / "vqgan" / "inference.py")
-    t2s_script   = str(REPO_DIR / "fish_speech" / "models" / "text2semantic" / "inference.py")
-    firefly_ckpt = str(MODEL_DIR / "firefly-gan-vq-fsq-8x1024-21hz-generator.pth")
-    # Step 1: Reference audio -> VQ tokens
-    run_step(
-        [
-            sys.executable, vqgan_script,
-            "-i",                str(ref_copy),
-            "--checkpoint-path", firefly_ckpt,
-            "--device",          "cpu",
-        ],
-        name          = "Codec Encode",
-        cwd           = chunk_dir,
-        expect_output = vq_tokens,
-    )
-    # Step 2: Text + VQ tokens -> semantic codes
-    run_step(
-        [
-            sys.executable, t2s_script,
-            "--text",            text,
-            "--prompt-tokens",   str(vq_tokens),
-            "--checkpoint-path", str(MODEL_DIR),
-            "--num-samples",     "1",
-            "--device",          "cpu",
-        ],
-        name          = "Text2Semantic",
-        cwd           = chunk_dir,
-        expect_output = sem_tokens,
-    )
-    # Step 3: Semantic codes -> audio
-    run_step(
-        [
-            sys.executable, vqgan_script,
-            "-i",                str(sem_tokens),
-            "--checkpoint-path", firefly_ckpt,
-            "--device",          "cpu",
-        ],
-        name          = "Codec Decode",
-        cwd           = chunk_dir,
-        expect_output = out_wav,
-    )
-    return str(out_wav)
-# ──────────────────────────────────────────────────────────────────────────────
-# Main synthesis entry point
-# ──────────────────────────────────────────────────────────────────────────────
-def synthesize(text: str, ref_audio_path: str, secret: str):
-    with inference_lock:
-        if SECRET and secret != SECRET:
-            return "", "Unauthorized"
-        if not ref_audio_path or not Path(ref_audio_path).exists():
-            return "", "Reference audio missing or not uploaded"
-        try:
-            # Check if the model directory already has files in it
-            if not MODEL_DIR.exists() or not any(MODEL_DIR.iterdir()):
-                print("[synth] Running first-time setup...", flush=True)
-                setup()
-            else:
-                # This skips the 'pip install' that causes the Access Denied error
-                print("[synth] Skipping setup: Model weights already present.", flush=True)
-        except Exception as e:
-            return "", f"Setup failed: {e}"
-        cleaned = clean_text(text)
-        chunks  = split_sentences(cleaned)
-        workdir = Path(tempfile.mkdtemp(prefix="chronis_tts_"))
-        try:
-            clean_ref = prepare_ref_audio(ref_audio_path)
-            combined  = AudioSegment.empty()
-            for i, chunk in enumerate(chunks):
-                print(f"[synth] chunk {i+1}/{len(chunks)}: {chunk[:80]!r}", flush=True)
-                out       = run_chunk(chunk, clean_ref, workdir, i)
-                combined += AudioSegment.from_wav(out)
-                gc.collect()
-            tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-            combined.export(tmp.name, format="wav")
-            with open(tmp.name, "rb") as f:
-                audio_b64 = base64.b64encode(f.read()).decode()
-            os.unlink(tmp.name)
-            return audio_b64, "ok"
-        except Exception as e:
-            print(f"[synth] ERROR: {e}", flush=True)
-            return "", str(e)
-        finally:
->>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
             shutil.rmtree(workdir, ignore_errors=True)
-# ──────────────────────────────────────────────────────────────────────────────
-<<<<<<< HEAD
-# Gradio UI  (same contract as the Fish Speech version)
-# ──────────────────────────────────────────────────────────────────────────────
 demo = gr.Interface(
     fn=synthesize,
     inputs=[
@@ -839,34 +190,9 @@ demo = gr.Interface(
     ],
     api_name="predict",
     title="Chronis XTTS-v2",
-    description="Voice cloning TTS — send a voice note, get the cloned voice back.",
     flagging_mode="never",
 )
 demo.queue()
 demo.launch(server_name="0.0.0.0", server_port=7860)
-=======
-# Gradio UI
-# ──────────────────────────────────────────────────────────────────────────────
-demo = gr.Interface(
-    fn      = synthesize,
-    inputs  = [
-        gr.Textbox(label="Text to synthesise"),
-        gr.Audio(type="filepath", label="Reference Voice (3-8 second voice note)"),
-        gr.Textbox(label="Secret", type="password"),
-    ],
-    outputs = [
-        gr.Textbox(label="Audio Base64"),
-        gr.Textbox(label="Status"),
-    ],
-    api_name      = "predict",
-    title         = "Chronis Fish Speech",
-    description   = "Voice cloning TTS - send a voice note, get the cloned voice back.",
-    flagging_mode = "never",
-)
-demo.queue()
-demo.launch(server_name="0.0.0.0", server_port=7860)
->>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef

 import os
 os.environ["PYTHONUTF8"] = "1"
 os.environ["PYTHONIOENCODING"] = "utf-8"
 import re
 import gc
 import tempfile
 import subprocess
 import shutil
 import threading
 from pathlib import Path
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 print("=== Chronis XTTS-v2 Space Booting ===", flush=True)
 def setup():
+    """Install Coqui TTS if needed and download XTTS-v2 weights once."""
     try:
         import TTS  # noqa: F401
         print("[setup] TTS library already installed.", flush=True)
     except ImportError:
         print("[setup] Installing TTS library ...", flush=True)
+        subprocess.run([sys.executable, "-m", "pip", "install", "TTS", "-q"], check=True)
         print("[setup] TTS library installed.", flush=True)
     MODEL_DIR.mkdir(parents=True, exist_ok=True)
     config_path = MODEL_DIR / "config.json"
     if not config_path.exists():
         print("[setup] Downloading XTTS-v2 weights ...", flush=True)
         from huggingface_hub import snapshot_download
         snapshot_download(
             repo_id="coqui/XTTS-v2",
             local_dir=str(MODEL_DIR),
 def get_tts():
+    """Lazy-load model once per process."""
     global _tts_instance
     if _tts_instance is None:
         from TTS.api import TTS
         print("[tts] Loading XTTS-v2 model ...", flush=True)
         _tts_instance = TTS(
             model_path=str(MODEL_DIR),
             config_path=str(MODEL_DIR / "config.json"),
             progress_bar=False,
+            gpu=False,
         )
+        print("[tts] Model loaded", flush=True)
     return _tts_instance
 def clean_text(text: str) -> str:
     text = re.sub(r"[^\x00-\x7F]+", " ", text)
     return text[:500]
 def split_sentences(text: str, max_chars: int = 200) -> list[str]:
     parts = re.split(r"(?<=[.!?])\s+", text)
     chunks: list[str] = []
     buf = ""
     for p in parts:
         if len(buf) + len(p) < max_chars:
             buf = (buf + " " + p).strip()
     return chunks or [text]
 def prepare_ref_audio(ref_path: str) -> str:
+    """Normalize to mono 24k WAV and cap to 10 seconds."""
     audio = AudioSegment.from_file(ref_path)
+    audio = audio.set_channels(1).set_frame_rate(24000).normalize()
+    if len(audio) > 10000:
+        audio = audio[:10000]
+    elif len(audio) < 1000:
+        raise ValueError(f"Reference audio too short ({len(audio)} ms). Need at least 1 second.")
     fd, tmp_path = tempfile.mkstemp(suffix=".wav")
     os.close(fd)
     return tmp_path
 def run_chunk(tts, text: str, ref_audio: str, out_path: str):
     tts.tts_to_file(
         text=text,
         speaker_wav=ref_audio,
 def synthesize(text: str, ref_audio_path: str, secret: str):
     with inference_lock:
         if SECRET and secret != SECRET:
             return None, "Unauthorized"
         if not ref_audio_path or not Path(ref_audio_path).exists():
             return None, "Reference audio missing or not uploaded"
         try:
             setup()
         except Exception as e:
             return None, f"Setup failed: {e}"
         cleaned = clean_text(text)
         chunks = split_sentences(cleaned)
         workdir = Path(tempfile.mkdtemp(prefix="chronis_xtts_"))
             fd, tmp_out = tempfile.mkstemp(suffix=".wav")
             os.close(fd)
             combined.export(tmp_out, format="wav")
             final_audio_path = tmp_out
             tmp_out = None
             return final_audio_path, "ok"
         except Exception as e:
             print(f"[synth] ERROR: {e}", flush=True)
             return None, str(e)
         finally:
             if clean_ref and Path(clean_ref).exists():
                 try:
                     os.unlink(tmp_out)
                 except OSError:
                     pass
             shutil.rmtree(workdir, ignore_errors=True)
 demo = gr.Interface(
     fn=synthesize,
     inputs=[
     ],
     api_name="predict",
     title="Chronis XTTS-v2",
+    description="Voice cloning TTS - send a voice note, get the cloned voice back.",
     flagging_mode="never",
 )
 demo.queue()
 demo.launch(server_name="0.0.0.0", server_port=7860)