Spaces:

Chronisin
/

chronisai

Configuration error

App Files Files Community

RiishabhSinghal committed on 27 days ago

Commit

1fc8196

2 Parent(s): 21409cb 6eaf50d

Merge remote main with local XTTS app

Browse files

Files changed (4) hide show

.gitattributes +35 -0
README.md +14 -0
app.py +575 -0
requirements.txt +3 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,3 +1,4 @@
 # XTTS Voice Clone Starter (Windows)
 This project gives you a fast setup to clone a voice using **Coqui XTTS v2**.
@@ -60,3 +61,16 @@ Full fine-tuning exists but is heavier (GPU VRAM, dataset, longer runs). Start w
 - If model download is slow/fails, retry with stable internet.
 - If you hit out-of-memory errors, close GPU-heavy apps or run on CPU.
 - If output sounds noisy, improve reference quality first.

+<<<<<<< HEAD
 # XTTS Voice Clone Starter (Windows)
 This project gives you a fast setup to clone a voice using **Coqui XTTS v2**.
 - If model download is slow/fails, retry with stable internet.
 - If you hit out-of-memory errors, close GPU-heavy apps or run on CPU.
 - If output sounds noisy, improve reference quality first.
+=======
+---
+title: Chronis TTS
+emoji: 🎙
+colorFrom: gray
+colorTo: gray
+sdk: gradio
+sdk_version: 5.23.0
+app_file: app.py
+pinned: false
+python_version: "3.10"
+---
+>>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 os.environ["PYTHONUTF8"] = "1"
 os.environ["PYTHONIOENCODING"] = "utf-8"
@@ -9,10 +10,17 @@ sys.stderr.reconfigure(encoding="utf-8")
 import re
 import gc
 import tempfile
 import subprocess
 import shutil
 import threading
 from pathlib import Path
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -85,6 +93,366 @@ def get_tts():
         )
         print("[tts] Model loaded ✓", flush=True)
     return _tts_instance
 # ──────────────────────────────────────────────────────────────────────────────
@@ -99,6 +467,7 @@ def clean_text(text: str) -> str:
     return text[:500]
 def split_sentences(text: str, max_chars: int = 200) -> list[str]:
     """
     XTTS handles longer segments better than Fish Speech, so we use a
@@ -107,6 +476,12 @@ def split_sentences(text: str, max_chars: int = 200) -> list[str]:
     parts = re.split(r"(?<=[.!?])\s+", text)
     chunks: list[str] = []
     buf = ""
     for p in parts:
         if len(buf) + len(p) < max_chars:
             buf = (buf + " " + p).strip()
@@ -125,6 +500,7 @@ def split_sentences(text: str, max_chars: int = 200) -> list[str]:
 def prepare_ref_audio(ref_path: str) -> str:
     """
     Normalise reference audio to mono 24 000 Hz WAV, capped at 10 seconds.
     XTTS-v2 expects 24 kHz input for its speaker encoder.
@@ -225,10 +601,185 @@ def synthesize(text: str, ref_audio_path: str, secret: str):
                     os.unlink(tmp_out)
                 except OSError:
                     pass
             shutil.rmtree(workdir, ignore_errors=True)
 # ──────────────────────────────────────────────────────────────────────────────
 # Gradio UI  (same contract as the Fish Speech version)
 # ──────────────────────────────────────────────────────────────────────────────
@@ -252,3 +803,27 @@ demo = gr.Interface(
 demo.queue()
 demo.launch(server_name="0.0.0.0", server_port=7860)

 import os
+<<<<<<< HEAD
 os.environ["PYTHONUTF8"] = "1"
 os.environ["PYTHONIOENCODING"] = "utf-8"
 import re
 import gc
+=======
+import sys
+import re
+import gc
+import base64
+>>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
 import tempfile
 import subprocess
 import shutil
 import threading
+<<<<<<< HEAD
 from pathlib import Path
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
         )
         print("[tts] Model loaded ✓", flush=True)
     return _tts_instance
+=======
+try:
+    import tomllib
+except ModuleNotFoundError:
+    try:
+        import tomli as tomllib
+    except ModuleNotFoundError:
+        tomllib = None
+try:
+    import tomli_w
+except ModuleNotFoundError:
+    tomli_w = None
+from pathlib import Path
+# Process-wide environment settings, applied before gradio is imported below.
+# NOTE(review): GRADIO_SSR_MODE=0 presumably disables Gradio's server-side
+# rendering — confirm against the Gradio version pinned in README.md.
+os.environ["GRADIO_SSR_MODE"]        = "0"
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+# One OpenMP thread per reported CPU; falls back to 1 when cpu_count() is None.
+os.environ["OMP_NUM_THREADS"]        = str(os.cpu_count() or 1)
+# Empty value: no CUDA devices are exposed, matching the CPU-only pipeline below.
+os.environ["CUDA_VISIBLE_DEVICES"]   = ""
+import gradio as gr
+from pydub import AudioSegment
+from huggingface_hub import snapshot_download
+# Shared secret checked by synthesize(); an empty value disables the check.
+SECRET    = os.environ.get("API_SECRET", "")
+# Checkout location of the Fish Speech repository (cloned by setup()).
+REPO_DIR  = Path("/tmp/fish-speech")
+# Download location of the model weights (fetched by setup()).
+MODEL_DIR = Path("/tmp/fish-speech-weights")
+# Held for the whole of synthesize(), so requests are processed one at a time.
+inference_lock = threading.Lock()
+# Set to True by setup() once clone, patches, install and download all succeeded.
+initialized    = False
+print("=== Chronis Fish Speech Space Booting ===", flush=True)
+# ──────────────────────────────────────────────────────────────────────────────
+# Patch 1 — LogMelSpectrogram
+#
+# History of bugs fixed in this class:
+#
+# Round 1 — AttributeError: 'LogMelSpectrogram' has no attribute 'hop_length'
+#   firefly.py reads self.spec_transform.hop_length (and n_mels, n_fft, etc.)
+#   directly on the object. They were only stored inside self._transform.
+#   Fix: expose every __init__ param as a top-level self.* attribute.
+#
+# Round 2 (current) — RuntimeError: size of tensor a (1292) must match b (160)
+#                      at non-singleton dimension 3
+#
+#   Root cause A — wrong input shape → 4-D output:
+#     vqgan/inference.py loads audio with torchaudio.load() → (C, T),
+#     then passes it as (1, C, T) = (1, 1, T) to model.encode().
+#     firefly.encode() calls self.spec_transform(audios) with a 3-D tensor.
+#     T.MelSpectrogram treats every dim except the last as a batch dim,
+#     so (B=1, C=1, T) → output (B=1, C=1, n_mels, T_frames) [4-D].
+#     Downstream masks are computed as 3-D (B, 1, T_vq).
+#     PyTorch broadcasting aligns from the right:
+#       mels:            (1, 1, 160, 1292)   dim-3 = 1292
+#       mel_masks_conv:  (1,  1,  1,  160)   dim-3 =  160
+#       → "size of tensor a (1292) must match b (160) at non-singleton dim 3"
+#     Fix: squeeze the channel dim inside forward() so output is always 3-D.
+#
+#   Root cause B — wrong default hyperparameters:
+#     The "21hz" in firefly-gan-vq-fsq-8x1024-21hz encodes the token rate:
+#       44100 / (hop_length × 8_conv_strides) ≈ 21  →  hop_length = 256
+#     n_mels is 160 for fish-speech, not 128.
+#     Hydra injects the correct values via __init__ kwargs, but using the
+#     right defaults prevents silent fallback failures.
+# ──────────────────────────────────────────────────────────────────────────────
+SPECTROGRAM_SRC = '''\
+"""
+fish_speech.utils.spectrogram  —  patched by Chronis setup.
+See app.py Patch 1 comment block for the full explanation of fixes.
+"""
+import torch
+import torch.nn as nn
+import torchaudio.transforms as T
+class LogMelSpectrogram(nn.Module):
+    def __init__(
+        self,
+        sample_rate: int   = 44100,
+        n_fft:       int   = 1024,
+        hop_length:  int   = 256,
+        win_length:  int   = 1024,
+        n_mels:      int   = 160,
+        f_min:       float = 0.0,
+        f_max:       float = None,
+        center:      bool  = True,
+        power:       float = 1.0,
+        norm:        str   = None,
+        mel_scale:   str   = "slaney",
+        clamp_min:   float = 1e-5,
+    ):
+        super().__init__()
+        # Every param must be a direct instance attribute.
+        # firefly.py reads them as self.spec_transform.<attr>.
+        self.sample_rate = sample_rate
+        self.n_fft       = n_fft
+        self.hop_length  = hop_length
+        self.win_length  = win_length
+        self.n_mels      = n_mels
+        self.f_min       = f_min
+        self.f_max       = f_max if f_max is not None else float(sample_rate) / 2.0
+        self.clamp_min   = clamp_min
+        self._transform = T.MelSpectrogram(
+            sample_rate = sample_rate,
+            n_fft       = n_fft,
+            hop_length  = hop_length,
+            win_length  = win_length,
+            n_mels      = n_mels,
+            f_min       = f_min,
+            f_max       = self.f_max,
+            center      = center,
+            power       = power,
+            norm        = norm,
+            mel_scale   = mel_scale,
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        x   : (B, T) | (T,) | (B, 1, T) | (B, C, T)
+        out : (B, n_mels, T_frames)  — always 3-D, never 4-D
+        The channel-squeeze is critical. vqgan/inference.py passes audio as
+        (B=1, C=1, T); without the squeeze T.MelSpectrogram returns a 4-D
+        tensor which mismatches the 3-D conv mask, crashing at dim 3.
+        """
+        if x.ndim == 3:
+            if x.shape[1] == 1:
+                x = x.squeeze(1)   # mono   (B, 1, T) → (B, T)
+            else:
+                x = x.mean(dim=1)  # stereo (B, C, T) → (B, T)
+        mel = self._transform(x)
+        return torch.log(torch.clamp(mel, min=self.clamp_min))
+'''
+def _patch_spectrogram_module():
+    """
+    Write SPECTROGRAM_SRC to fish_speech/utils/spectrogram.py inside REPO_DIR.
+
+    Creates the utils directory if needed, adds an __init__.py when none
+    exists, overwrites spectrogram.py unconditionally, and deletes any
+    compiled spectrogram*.pyc files found in the sibling __pycache__.
+    """
+    utils_dir = REPO_DIR / "fish_speech" / "utils"
+    utils_dir.mkdir(parents=True, exist_ok=True)
+    init_file = utils_dir / "__init__.py"
+    # Only create __init__.py when missing; an existing one is left untouched.
+    if not init_file.exists():
+        init_file.write_text("# auto-generated by Chronis setup\n")
+    spec_file = utils_dir / "spectrogram.py"
+    spec_file.write_text(SPECTROGRAM_SRC)
+    # Delete any stale .pyc that could shadow the updated .py
+    pyc_dir = utils_dir / "__pycache__"
+    if pyc_dir.exists():
+        for pyc in pyc_dir.glob("spectrogram*.pyc"):
+            pyc.unlink()
+            print(f"[patch] deleted stale {pyc}", flush=True)
+    print(f"[patch] wrote {spec_file}", flush=True)
+# ──────────────────────────────────────────────────────────────────────────────
+# Patch 2 — strip pyaudio from all dependency manifests
+# ──────────────────────────────────────────────────────────────────────────────
+def _drop_dep(dep_list: list, pattern: str) -> list:
+    """
+    Return a new list without the entries whose lowercased text starts with pattern.
+
+    Only the entries are lowercased, not pattern, so pattern must be passed
+    in lowercase. The input list is not modified.
+    """
+    return [d for d in dep_list if not d.lower().startswith(pattern)]
+def _patch_pyproject_toml():
+    """
+    Remove pyaudio from REPO_DIR/pyproject.toml, if that file exists.
+
+    Two places are checked: the [project] dependencies list and the
+    [tool.poetry.dependencies] table. The file is rewritten with tomli_w only
+    when something was removed. Relies on the module-level tomllib and
+    tomli_w names being usable; _patch_dependencies() installs them first
+    when the imports at the top of the file failed.
+    """
+    pyproject = REPO_DIR / "pyproject.toml"
+    if not pyproject.exists():
+        return
+    with open(pyproject, "rb") as f:
+        data = tomllib.load(f)
+    changed = False
+    deps = data.get("project", {}).get("dependencies", [])
+    if deps:
+        new_deps = _drop_dep(deps, "pyaudio")
+        if new_deps != deps:
+            data["project"]["dependencies"] = new_deps
+            changed = True
+    # When the table exists this is the dict inside data, so pop() edits data.
+    poetry_deps = data.get("tool", {}).get("poetry", {}).get("dependencies", {})
+    # Only these two spellings of the key are matched.
+    if "pyaudio" in poetry_deps or "PyAudio" in poetry_deps:
+        poetry_deps.pop("pyaudio", None)
+        poetry_deps.pop("PyAudio", None)
+        changed = True
+    if changed:
+        with open(pyproject, "wb") as f:
+            tomli_w.dump(data, f)
+        print("[patch] removed pyaudio from pyproject.toml", flush=True)
+def _patch_requirements_txt():
+    """
+    Remove pyaudio lines from requirements.txt and requirements-base.txt in REPO_DIR.
+
+    Missing files are skipped. A line is dropped when its lowercased text
+    starts with "pyaudio"; leading whitespace is not stripped, so indented
+    entries are kept. A file is rewritten (with one trailing newline) only
+    when at least one line was removed.
+    """
+    for fname in ("requirements.txt", "requirements-base.txt"):
+        req = REPO_DIR / fname
+        if not req.exists():
+            continue
+        lines     = req.read_text().splitlines()
+        new_lines = [l for l in lines if not l.lower().startswith("pyaudio")]
+        # Lists of lines are compared, so trailing-newline differences do not count.
+        if new_lines != lines:
+            req.write_text("\n".join(new_lines) + "\n")
+            print(f"[patch] removed pyaudio from {fname}", flush=True)
+def _patch_setup_cfg():
+    """
+    Remove pyaudio entries from REPO_DIR/setup.cfg, if that file exists.
+
+    A line is dropped when, ignoring surrounding whitespace and case, it
+    starts with "pyaudio". The file is rewritten only when a line was
+    actually removed, and the remaining lines keep their line endings
+    (including the final newline).
+    """
+    setup_cfg = REPO_DIR / "setup.cfg"
+    if not setup_cfg.exists():
+        return
+    lines = setup_cfg.read_text().splitlines(keepends=True)
+    kept  = [ln for ln in lines if not ln.strip().lower().startswith("pyaudio")]
+    # Compare the line lists rather than re-joined text: joining the result of
+    # splitlines() drops the trailing newline, which made an untouched file
+    # look modified and logged a removal that never happened.
+    if kept != lines:
+        setup_cfg.write_text("".join(kept))
+        print("[patch] removed pyaudio from setup.cfg", flush=True)
+def _patch_dependencies():
+    """
+    Strip pyaudio from every dependency manifest in the cloned repository.
+
+    If either TOML helper failed to import at module load, tomli and tomli_w
+    are pip-installed into the running interpreter and bound to the
+    module-level names before the manifests are patched. A failed install
+    raises subprocess.CalledProcessError (check=True).
+    """
+    global tomllib, tomli_w
+    if tomllib is None or tomli_w is None:
+        subprocess.run(
+            [sys.executable, "-m", "pip", "install", "tomli", "tomli_w", "-q"],
+            check=True,
+        )
+        # The global statement above makes these imports rebind the module-level names.
+        import tomli   as tomllib
+        import tomli_w as tomli_w
+    _patch_pyproject_toml()
+    _patch_requirements_txt()
+    _patch_setup_cfg()
+# ──────────────────────────────────────────────────────────────────────────────
+# Patch 3 — CPU-safe subprocess wrapper
+# ──────────────────────────────────────────────────────────────────────────────
+WRAPPER_PATH = Path("/tmp/_chronis_torch_cpu.py")
+_WRAPPER_SRC = '''\
+"""
+Chronis CPU-safe subprocess wrapper.
+Forces torch.load → CPU, disables weights_only, redirects .to(cuda) → .to(cpu).
+Usage: python _chronis_torch_cpu.py <real_script.py> [args...]
+"""
+import sys
+import torch
+import runpy
+_original_load = torch.load
+def _cpu_safe_load(f, map_location=None, pickle_module=None, **kwargs):
+    kwargs["weights_only"] = False
+    kwargs["map_location"]  = "cpu"
+    if pickle_module is not None:
+        kwargs["pickle_module"] = pickle_module
+    return _original_load(f, **kwargs)
+torch.load = _cpu_safe_load
+_orig_module_to = torch.nn.Module.to
+def _cpu_module_to(self, *args, **kwargs):
+    new_args = []
+    for a in args:
+        if isinstance(a, (str, torch.device)) and "cuda" in str(a):
+            a = torch.device("cpu")
+        new_args.append(a)
+    if "device" in kwargs and "cuda" in str(kwargs["device"]):
+        kwargs["device"] = torch.device("cpu")
+    return _orig_module_to(self, *new_args, **kwargs)
+torch.nn.Module.to = _cpu_module_to
+_orig_tensor_to = torch.Tensor.to
+def _cpu_tensor_to(self, *args, **kwargs):
+    new_args = []
+    for a in args:
+        if isinstance(a, (str, torch.device)) and "cuda" in str(a):
+            a = torch.device("cpu")
+        new_args.append(a)
+    if "device" in kwargs and "cuda" in str(kwargs["device"]):
+        kwargs["device"] = torch.device("cpu")
+    return _orig_tensor_to(self, *new_args, **kwargs)
+torch.Tensor.to = _cpu_tensor_to
+sys.argv = sys.argv[1:]
+runpy.run_path(sys.argv[0], run_name="__main__")
+'''
+def _patch_torch_load():
+    """
+    Write the CPU wrapper script (_WRAPPER_SRC) to WRAPPER_PATH.
+
+    Nothing is patched in this process; run_step() launches each Fish Speech
+    script through the written wrapper file.
+    """
+    WRAPPER_PATH.write_text(_WRAPPER_SRC)
+    print(f"[patch] wrote subprocess wrapper → {WRAPPER_PATH}", flush=True)
+# ──────────────────────────────────────────────────────────────────────────────
+def _build_env():
+    """
+    Build the environment mapping used for Fish Speech subprocesses.
+
+    Returns a copy of os.environ with REPO_DIR placed first on PYTHONPATH
+    (any existing value is kept after it), HYDRA_FULL_ERROR set to "1" and
+    CUDA_VISIBLE_DEVICES set to the empty string.
+    """
+    existing       = os.environ.get("PYTHONPATH", "")
+    # Join with os.pathsep (":" on POSIX, ";" on Windows) rather than a
+    # hard-coded ":" so the search path is valid on every platform.
+    new_pythonpath = f"{REPO_DIR}{os.pathsep}{existing}" if existing else str(REPO_DIR)
+    return {
+        **os.environ,
+        "PYTHONPATH":           new_pythonpath,
+        "HYDRA_FULL_ERROR":     "1",
+        "CUDA_VISIBLE_DEVICES": "",
+    }
+# ──────────────────────────────────────────────────────────────────────────────
+# Setup
+# ──────────────────────────────────────────────────────────────────────────────
+def setup():
+    """
+    Prepare the Fish Speech checkout and weights; does nothing once it has succeeded.
+
+    Steps, in order: shallow-clone tag v1.5.0 into REPO_DIR when the directory
+    is absent, apply the three patches, pip-install the checkout in editable
+    mode, re-apply the spectrogram patch, put REPO_DIR on sys.path, and
+    download the weights into MODEL_DIR when it is missing or empty.
+
+    The initialized flag is set only at the very end, so a call that raised
+    part-way through is retried from the top on the next call. Failing
+    subprocesses raise subprocess.CalledProcessError (check=True).
+    """
+    global initialized
+    if initialized:
+        return
+    # The clone is skipped whenever REPO_DIR exists, whatever it contains.
+    if not REPO_DIR.exists():
+        print("Cloning Fish Speech v1.5.0 ...", flush=True)
+        subprocess.run(
+            [
+                "git", "clone",
+                "--depth", "1",
+                "--branch", "v1.5.0",
+                "https://github.com/fishaudio/fish-speech.git",
+                str(REPO_DIR),
+            ],
+            check=True,
+        )
+    _patch_spectrogram_module()
+    _patch_dependencies()
+    _patch_torch_load()
+    print("Installing Fish Speech (editable) ...", flush=True)
+    subprocess.run(
+        [sys.executable, "-m", "pip", "install", "-e", ".", "--quiet"],
+        cwd=str(REPO_DIR),
+        check=True,
+    )
+    # Re-apply AFTER pip install — editable install can cache stale .pyc files
+    _patch_spectrogram_module()
+    if str(REPO_DIR) not in sys.path:
+        sys.path.insert(0, str(REPO_DIR))
+    # Download when MODEL_DIR is absent or has no entries at all.
+    if not MODEL_DIR.exists() or not any(MODEL_DIR.iterdir()):
+        print("Downloading Fish Speech 1.5 weights ...", flush=True)
+        snapshot_download(
+            repo_id               = "fishaudio/fish-speech-1.5",
+            local_dir             = str(MODEL_DIR),
+            local_dir_use_symlinks = False,
+        )
+    print("Setup complete.", flush=True)
+    initialized = True
+>>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
 # ──────────────────────────────────────────────────────────────────────────────
     return text[:500]
+<<<<<<< HEAD
 def split_sentences(text: str, max_chars: int = 200) -> list[str]:
     """
     XTTS handles longer segments better than Fish Speech, so we use a
     parts = re.split(r"(?<=[.!?])\s+", text)
     chunks: list[str] = []
     buf = ""
+=======
+def split_sentences(text: str, max_chars: int = 120) -> list:
+    parts  = re.split(r"(?<=[.!?])\s+", text)
+    chunks = []
+    buf    = ""
+>>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
     for p in parts:
         if len(buf) + len(p) < max_chars:
             buf = (buf + " " + p).strip()
 def prepare_ref_audio(ref_path: str) -> str:
     """
+<<<<<<< HEAD
     Normalise reference audio to mono 24 000 Hz WAV, capped at 10 seconds.
     XTTS-v2 expects 24 kHz input for its speaker encoder.
                     os.unlink(tmp_out)
                 except OSError:
                     pass
+=======
+    Normalise to mono 44100 Hz WAV, capped at 8 seconds.
+    Fish Speech docs recommend 3-10 s of reference. We cap at 8 s:
+      - Short enough to keep CPU encode time reasonable
+      - Long enough for good speaker characterisation
+      - Avoids edge-case rounding in the conv-mask stride at 15 s lengths
+    """
+    audio = AudioSegment.from_file(ref_path)
+    audio = audio.set_channels(1).set_frame_rate(44100).normalize()
+    if len(audio) > 8_000:
+        audio = audio[:8_000]
+    elif len(audio) < 1_000:
+        raise ValueError(
+            f"Reference audio too short ({len(audio)}ms). Need at least 1 second."
+        )
+    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+    audio.export(tmp.name, format="wav")
+    return tmp.name
+# ──────────────────────────────────────────────────────────────────────────────
+# Inference pipeline
+# ──────────────────────────────────────────────────────────────────────────────
+def run_step(cmd: list, name: str, cwd: Path, expect_output: Path = None):
+    """
+    Run a Fish Speech subprocess through the CPU wrapper.
+    Raises a detailed RuntimeError on non-zero exit or missing expected output.
+
+    cmd           : argument list; cmd[0] is the interpreter, cmd[1] the script.
+    name          : label used as the prefix of every log line and error message.
+    cwd           : working directory for the subprocess.
+    expect_output : optional path that must exist after a zero exit.
+
+    The subprocess gets the environment from _build_env() and a 600 second
+    timeout; subprocess.run raises subprocess.TimeoutExpired when it is exceeded.
+    """
+    print(f"[{name}] starting ...", flush=True)
+    # Insert the wrapper between the interpreter and the real script, giving
+    # "<python> <wrapper> <script> [args...]".
+    wrapped_cmd = [cmd[0], str(WRAPPER_PATH)] + cmd[1:]
+    result = subprocess.run(
+        wrapped_cmd,
+        cwd            = str(cwd),
+        capture_output = True,
+        text           = True,
+        env            = _build_env(),
+        timeout        = 600,
+    )
+    # Only the tail of each captured stream is logged, to bound log size.
+    if result.stdout.strip():
+        print(f"[{name}] stdout:\n{result.stdout[-1200:]}", flush=True)
+    if result.returncode != 0:
+        diag = (
+            f"[{name}] FAILED (exit {result.returncode})\n"
+            f"--- stderr ---\n{result.stderr[-1500:]}\n"
+            f"--- stdout ---\n{result.stdout[-600:]}"
+        )
+        print(diag, flush=True)
+        raise RuntimeError(diag)
+    # A zero exit code is not enough: the step must also have produced its file.
+    if expect_output is not None and not expect_output.exists():
+        raise RuntimeError(
+            f"[{name}] exited 0 but expected output missing: {expect_output}\n"
+            f"stdout: {result.stdout[-800:]}\nstderr: {result.stderr[-800:]}"
+        )
+    print(f"[{name}] done ✓", flush=True)
+def run_chunk(text: str, ref_audio: str, workdir: Path, idx: int) -> str:
+    """
+    Synthesise one text chunk in the reference voice and return the WAV path.
+
+    Works inside workdir/chunk_<idx>: copies ref_audio there as ref.wav, then
+    runs three subprocess steps through run_step() — codec encode,
+    text-to-semantic, codec decode — all with "--device cpu". Each step runs
+    with the chunk directory as its working directory and is expected to
+    leave its output file there.
+    NOTE(review): the output names fake.npy, codes_0.npy and fake.wav are
+    presumably fixed by the Fish Speech scripts — confirm against v1.5.0.
+
+    Raises RuntimeError (from run_step) when a step fails or its output is missing.
+    """
+    chunk_dir = workdir / f"chunk_{idx}"
+    chunk_dir.mkdir(parents=True, exist_ok=True)
+    ref_copy   = chunk_dir / "ref.wav"
+    shutil.copy(ref_audio, ref_copy)
+    vq_tokens  = chunk_dir / "fake.npy"
+    sem_tokens = chunk_dir / "codes_0.npy"
+    out_wav    = chunk_dir / "fake.wav"
+    # In fish-speech v1.5, tools/vqgan/inference.py handles BOTH encode and
+    # decode. Mode is auto-detected from the input file extension:
+    #   .wav → encode → writes fake.npy
+    #   .npy → decode → writes fake.wav
+    vqgan_script = str(REPO_DIR / "tools" / "vqgan" / "inference.py")
+    t2s_script   = str(REPO_DIR / "fish_speech" / "models" / "text2semantic" / "inference.py")
+    firefly_ckpt = str(MODEL_DIR / "firefly-gan-vq-fsq-8x1024-21hz-generator.pth")
+    # Step 1: Reference audio → VQ tokens
+    run_step(
+        [
+            sys.executable, vqgan_script,
+            "-i",                str(ref_copy),
+            "--checkpoint-path", firefly_ckpt,
+            "--device",          "cpu",
+        ],
+        name          = "Codec Encode",
+        cwd           = chunk_dir,
+        expect_output = vq_tokens,
+    )
+    # Step 2: Text + VQ tokens → semantic codes
+    run_step(
+        [
+            sys.executable, t2s_script,
+            "--text",            text,
+            "--prompt-tokens",   str(vq_tokens),
+            "--checkpoint-path", str(MODEL_DIR),
+            "--num-samples",     "1",
+            "--device",          "cpu",
+        ],
+        name          = "Text2Semantic",
+        cwd           = chunk_dir,
+        expect_output = sem_tokens,
+    )
+    # Step 3: Semantic codes → audio
+    run_step(
+        [
+            sys.executable, vqgan_script,
+            "-i",                str(sem_tokens),
+            "--checkpoint-path", firefly_ckpt,
+            "--device",          "cpu",
+        ],
+        name          = "Codec Decode",
+        cwd           = chunk_dir,
+        expect_output = out_wav,
+    )
+    return str(out_wav)
+# ──────────────────────────────────────────────────────────────────────────────
+# Main synthesis entry point
+# ──────────────────────────────────────────────────────────────────────────────
+def synthesize(text: str, ref_audio_path: str, secret: str):
+    """
+    Gradio entry point: clone the reference voice and speak text with it.
+
+    Returns a (audio_base64, status) pair. On success status is "ok" and
+    audio_base64 is the base64-encoded WAV; on any failure audio_base64 is ""
+    and status carries the error message. Nothing is raised to the caller.
+
+    The whole call runs under inference_lock, so requests are handled one at
+    a time. All intermediate files, including the normalised reference audio
+    and the combined output, are removed before returning.
+    """
+    with inference_lock:
+        # An empty SECRET disables the check entirely.
+        if SECRET and secret != SECRET:
+            return "", "Unauthorized"
+        if not ref_audio_path or not Path(ref_audio_path).exists():
+            return "", "Reference audio missing or not uploaded"
+        try:
+            setup()
+        except Exception as e:
+            return "", f"Setup failed: {e}"
+        cleaned = clean_text(text)
+        chunks  = split_sentences(cleaned)
+        workdir = Path(tempfile.mkdtemp(prefix="chronis_tts_"))
+        # Bound before the try block so the finally clause can always test it.
+        clean_ref = None
+        try:
+            clean_ref = prepare_ref_audio(ref_audio_path)
+            combined  = AudioSegment.empty()
+            for i, chunk in enumerate(chunks):
+                print(f"[synth] chunk {i+1}/{len(chunks)}: {chunk[:80]!r}", flush=True)
+                out       = run_chunk(chunk, clean_ref, workdir, i)
+                combined += AudioSegment.from_wav(out)
+                gc.collect()
+            # Export inside workdir so the file is removed with it even when
+            # the export or the read-back fails; no open handle is left behind.
+            out_path = workdir / "combined.wav"
+            combined.export(str(out_path), format="wav")
+            audio_b64 = base64.b64encode(out_path.read_bytes()).decode()
+            return audio_b64, "ok"
+        except Exception as e:
+            print(f"[synth] ERROR: {e}", flush=True)
+            return "", str(e)
+        finally:
+            # prepare_ref_audio() writes its result with delete=False, so the
+            # normalised copy must be removed here or every request leaks a WAV.
+            if clean_ref is not None:
+                try:
+                    os.unlink(clean_ref)
+                except OSError:
+                    pass
+>>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
             shutil.rmtree(workdir, ignore_errors=True)
 # ──────────────────────────────────────────────────────────────────────────────
+<<<<<<< HEAD
 # Gradio UI  (same contract as the Fish Speech version)
 # ──────────────────────────────────────────────────────────────────────────────
 demo.queue()
 demo.launch(server_name="0.0.0.0", server_port=7860)
+=======
+# Gradio UI
+# ──────────────────────────────────────────────────────────────────────────────
+demo = gr.Interface(
+    fn      = synthesize,
+    inputs  = [
+        gr.Textbox(label="Text to synthesise"),
+        gr.Audio(type="filepath", label="Reference Voice (3–8 second voice note)"),
+        gr.Textbox(label="Secret", type="password"),
+    ],
+    outputs = [
+        gr.Textbox(label="Audio Base64"),
+        gr.Textbox(label="Status"),
+    ],
+    api_name      = "predict",
+    title         = "Chronis Fish Speech",
+    description   = "Voice cloning TTS — send a voice note, get the cloned voice back.",
+    flagging_mode = "never",
+)
+demo.queue()
+demo.launch(server_name="0.0.0.0", server_port=7860)
+>>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef

requirements.txt CHANGED Viewed

@@ -1,9 +1,12 @@
 TTS>=0.22.0
 torch
 torchaudio
 soundfile
 librosa
 tqdm
 --extra-index-url https://download.pytorch.org/whl/cpu
 gradio==5.23.0
 torch==2.1.0+cpu

+<<<<<<< HEAD
 TTS>=0.22.0
 torch
 torchaudio
 soundfile
 librosa
 tqdm
+=======
+>>>>>>> 6eaf50d4defa4f22a696dde692015ba3a7a450ef
 --extra-index-url https://download.pytorch.org/whl/cpu
 gradio==5.23.0
 torch==2.1.0+cpu