File size: 20,149 Bytes

from __future__ import annotations

"""Vocence TTS miner: HF snapshot (weights + ``fish-speech/``) then Fish inference.

After ``snapshot_download``, code lives at ``<snapshot>/fish-speech/``. Missing
PyPI packages are installed one at a time via ``pip install <module>`` from
``ModuleNotFoundError`` (avoids full ``pyproject.toml`` install pulling ``pyaudio``,
which needs system ``portaudio`` headers). Set ``VOCENCE_SKIP_FISH_SPEECH_PIP=1`` to
disable. Override path with ``fish_speech.repo_root`` or ``FISH_SPEECH_ROOT``.
"""

import io
import logging
import os
import sys
import wave
from pathlib import Path
from typing import Any, Mapping

import numpy as np

REPO = Path(__file__).resolve().parent
_VOCENCE_YAML = "vocence_config.yaml"
_MAX_AUDIO_SEC = 30
_OUT_SR = 24000

_OMEGA_RESOLVER_PATCHED: bool = False
_OrigOmegaRegister: Any = None


def _patch_omegaconf_register_new_resolver() -> None:
    """Retry ``register_new_resolver`` with ``replace=True`` if Hydra/lightning registered ``eval`` first."""
    global _OMEGA_RESOLVER_PATCHED, _OrigOmegaRegister
    if _OMEGA_RESOLVER_PATCHED:
        return
    try:
        from omegaconf import OmegaConf
    except ImportError:
        return
    if _OrigOmegaRegister is None:
        _OrigOmegaRegister = OmegaConf.register_new_resolver

    def _patched(name, resolver, *args, **kwargs):
        kw = dict(kwargs)
        try:
            return _OrigOmegaRegister(name, resolver, *args, **kw)
        except ValueError as exc:
            if "already registered" not in str(exc).lower():
                raise
            kw["replace"] = True
            return _OrigOmegaRegister(name, resolver, *args, **kw)

    OmegaConf.register_new_resolver = _patched  # type: ignore[method-assign]
    _OMEGA_RESOLVER_PATCHED = True


def _read_yaml(repo: Path) -> dict[str, Any]:
    from yaml import safe_load

    p = repo / _VOCENCE_YAML
    if not p.is_file():
        return {}
    with p.open(encoding="utf-8") as f:
        d = safe_load(f)
    return dict(d) if isinstance(d, Mapping) else {}


def _hf_token() -> str | None:
    t = (os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or "").strip()
    return t or None


def _weights_dir(repo: Path, repo_id: str) -> Path:
    safe = repo_id.replace("/", "__").replace(":", "_")
    return (repo / "_vocence_hf_weights" / safe).resolve()


def download_hub(repo: Path, repo_id: str, revision: str | None) -> Path:
    from huggingface_hub import snapshot_download

    dest = _weights_dir(repo, repo_id)
    dest.mkdir(parents=True, exist_ok=True)
    logging.getLogger(__name__).info("Downloading %s → %s", repo_id, dest)
    snapshot_download(repo_id=repo_id, revision=revision, local_dir=str(dest), token=_hf_token())
    if not (dest / "codec.pth").is_file():
        raise FileNotFoundError(f"missing codec.pth under {dest}")
    return dest


def _purge_tools_modules() -> None:
    for m in list(sys.modules):
        if m == "tools" or m.startswith("tools.") or m == "fish_speech" or m.startswith("fish_speech."):
            del sys.modules[m]


# Top-level import name -> PyPI distribution (wrong names break installs, e.g. ``hydra`` vs ``hydra-core``).
_PIP_ALIASES: dict[str, str] = {
    "PIL": "Pillow",
    "yaml": "PyYAML",
    "sklearn": "scikit-learn",
    "hydra": "hydra-core",
    "pytorch_lightning": "lightning",
}


def _pip_install_module(mod: str) -> None:
    """``pip install`` the PyPI package that provides import name ``mod`` (top-level segment)."""
    import subprocess

    head = (mod or "").strip().split(".")[0]
    if not head:
        raise ValueError("empty module name")
    if head in ("fish_speech", "tools"):
        raise RuntimeError(f"refusing to pip install std project name {head!r}")
    std = getattr(sys, "stdlib_module_names", None)
    if std is not None and head in std:
        raise RuntimeError(f"refusing to pip install stdlib name {head!r}")
    pkg = _PIP_ALIASES.get(head, head)
    log = logging.getLogger(__name__)
    cmd = [sys.executable, "-m", "pip", "install", pkg]
    log.info("Running: %s", " ".join(cmd))
    r = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
    if r.returncode != 0:
        msg = (r.stderr or r.stdout or "").strip() or f"exit {r.returncode}"
        raise RuntimeError(f"pip install {pkg!r} failed: {msg}")


def _tools_already_importable() -> bool:
    import importlib

    _patch_omegaconf_register_new_resolver()
    try:
        importlib.import_module("tools.server.model_manager")
        return True
    except Exception:
        _purge_tools_modules()
        return False


def _ensure_fish_speech(miner_repo: Path, model_root: Path, fs: Mapping[str, Any]) -> None:
    """Use ``<model_root>/fish-speech`` on ``sys.path`` (or ``repo_root`` / ``FISH_SPEECH_ROOT``)."""
    global _OMEGA_RESOLVER_PATCHED
    import importlib

    log = logging.getLogger(__name__)
    if _tools_already_importable():
        return

    roots: list[Path] = [(model_root / "fish-speech").resolve()]
    raw = (fs.get("repo_root") or os.environ.get("FISH_SPEECH_ROOT") or "").strip()
    if raw:
        p = Path(raw).expanduser()
        roots.append(p.resolve() if p.is_absolute() else (miner_repo / p).resolve())

    mm = Path("tools") / "server" / "model_manager.py"
    skip_pip = os.environ.get("VOCENCE_SKIP_FISH_SPEECH_PIP", "").strip().lower() in ("1", "true", "yes")
    max_rounds = int(os.environ.get("VOCENCE_FISH_SPEECH_PIP_MAX_ROUNDS", "60"))

    for code_root in roots:
        if not (code_root / mm).is_file():
            continue
        s = str(code_root.resolve())
        if s not in sys.path:
            sys.path.insert(0, s)
        last_err: BaseException | None = None
        for _ in range(max_rounds):
            _patch_omegaconf_register_new_resolver()
            try:
                importlib.import_module("tools.server.model_manager")
                return
            except ModuleNotFoundError as e:
                last_err = e
                _purge_tools_modules()
                mod = e.name
                if skip_pip or mod is None:
                    try:
                        sys.path.remove(s)
                    except ValueError:
                        pass
                    raise ImportError(
                        f"{code_root}: missing {mod!r}. Install deps or unset VOCENCE_SKIP_FISH_SPEECH_PIP."
                    ) from e
                head = mod.split(".")[0]
                pkg = _PIP_ALIASES.get(head, head)
                log.warning("Missing Python module %r — pip install %r …", mod, pkg)
                if head in ("fish_speech", "tools"):
                    try:
                        sys.path.remove(s)
                    except ValueError:
                        pass
                    raise ImportError(
                        f"{code_root}: project import {mod!r} failed (broken tree or path?)."
                    ) from e
                try:
                    _pip_install_module(mod)
                except Exception as pip_e:
                    try:
                        sys.path.remove(s)
                    except ValueError:
                        pass
                    raise ImportError(f"{code_root}: could not install missing {mod!r}: {pip_e}") from pip_e
                if s not in sys.path:
                    sys.path.insert(0, s)
                continue
            except Exception as e:
                msg_l = str(e).lower()
                if "already registered" in msg_l and "resolver" in msg_l:
                    log.warning("OmegaConf resolver clash (%s); clearing ``eval`` and retrying …", e)
                    last_err = e
                    _purge_tools_modules()
                    try:
                        from omegaconf import OmegaConf

                        cr = getattr(OmegaConf, "clear_resolver", None)
                        if callable(cr):
                            cr("eval")
                        if _OrigOmegaRegister is not None:
                            OmegaConf.register_new_resolver = _OrigOmegaRegister  # type: ignore[method-assign]
                    except Exception:
                        pass
                    _OMEGA_RESOLVER_PATCHED = False
                    _patch_omegaconf_register_new_resolver()
                    if s not in sys.path:
                        sys.path.insert(0, s)
                    continue
                last_err = e
                _purge_tools_modules()
                try:
                    sys.path.remove(s)
                except ValueError:
                    pass
                raise ImportError(
                    f"{code_root}: import failed after resolving modules (not a simple missing PyPI dep): {e}"
                ) from e

        try:
            sys.path.remove(s)
        except ValueError:
            pass
        raise ImportError(
            f"{code_root}: exceeded {max_rounds} pip rounds (last error: {last_err}). "
            "Install fish-speech deps manually or raise VOCENCE_FISH_SPEECH_PIP_MAX_ROUNDS."
        ) from last_err

    raise FileNotFoundError(
        f"Missing {roots[0] / mm}. HF repo should include a fish-speech/ tree next to codec.pth, "
        f"or set fish_speech.repo_root in {_VOCENCE_YAML} / FISH_SPEECH_ROOT."
    )


def load_tts_inference_engine(
    *,
    llama_checkpoint_path: str,
    decoder_checkpoint_path: str,
    decoder_config_name: str = "modded_dac_vq",
    device: str = "cuda",
    half: bool = False,
    compile_model: bool = False,
) -> Any:
    from tools.server.model_manager import ModelManager

    m = ModelManager(
        mode="tts",
        device=device,
        half=half,
        compile=compile_model,
        llama_checkpoint_path=llama_checkpoint_path,
        decoder_checkpoint_path=decoder_checkpoint_path,
        decoder_config_name=decoder_config_name,
    )
    return m.tts_inference_engine


def synthesize_wav(
    engine: Any,
    *,
    text: str,
    reference_audio_path: str | None = None,
    reference_text: str | None = None,
    max_new_tokens: int = 1024,
    chunk_length: int = 200,
    top_p: float = 0.8,
    repetition_penalty: float = 1.1,
    temperature: float = 0.8,
    seed: int | None = None,
) -> tuple[int, np.ndarray]:
    from fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest

    if bool(reference_audio_path) ^ bool(reference_text):
        raise ValueError("reference_audio_path and reference_text must be both set or both omitted")
    refs: list[ServeReferenceAudio] = []
    if reference_audio_path:
        rp = Path(reference_audio_path)
        if not rp.is_file():
            raise FileNotFoundError(rp)
        refs = [ServeReferenceAudio(audio=rp.read_bytes(), text=reference_text or "")]
    req = ServeTTSRequest(
        text=text,
        references=refs,
        reference_id=None,
        max_new_tokens=max_new_tokens,
        chunk_length=chunk_length,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        temperature=temperature,
        format="wav",
        streaming=False,
        seed=seed,
    )
    sr: int | None = None
    audio: np.ndarray | None = None
    for result in engine.inference(req):
        if result.code == "error":
            raise RuntimeError(str(result.error or "inference error"))
        if result.code == "final" and result.audio is not None:
            sr, audio = result.audio
            break
    if sr is None or audio is None:
        raise RuntimeError("no audio")
    arr = np.asarray(audio, dtype=np.float32)
    if arr.ndim > 1:
        arr = np.mean(arr, axis=-1).astype(np.float32)
    return int(sr), arr


def _resample(w: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    if orig_sr == target_sr:
        return np.asarray(w, dtype=np.float32)
    import librosa

    y = np.asarray(w, dtype=np.float32)
    if y.ndim > 1:
        y = np.mean(y, axis=-1).astype(np.float32)
    return librosa.resample(y, orig_sr=orig_sr, target_sr=target_sr).astype(np.float32)


def _wav_bytes(w: np.ndarray, sample_rate: int) -> bytes:
    w = np.clip(np.asarray(w, dtype=np.float32), -1.0, 1.0)
    s16 = (w * 32767.0).astype(np.int16)
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wv:
        wv.setnchannels(1)
        wv.setsampwidth(2)
        wv.setframerate(sample_rate)
        wv.writeframes(s16.tobytes())
    return buf.getvalue()


def _resolve_ckpt(raw: str | None, *, model_root: Path, miner_repo: Path) -> Path | None:
    if not raw or not str(raw).strip():
        return None
    s = str(raw).strip()
    p = Path(s).expanduser()
    if p.is_absolute():
        return p.resolve()
    for base in (model_root, miner_repo):
        c = (base / s).resolve()
        if c.exists():
            return c
    return (miner_repo / s).resolve()


def _llama_and_decoder(model_root: Path, miner_repo: Path, fs: Mapping[str, Any]) -> tuple[str, str]:
    lr = (fs.get("llama_checkpoint_path") or os.environ.get("FISH_SPEECH_LLAMA_PATH") or "").strip()
    dr = (fs.get("decoder_checkpoint_path") or os.environ.get("FISH_SPEECH_DECODER_PATH") or "").strip()
    lp, dp = _resolve_ckpt(lr or None, model_root=model_root, miner_repo=miner_repo), _resolve_ckpt(
        dr or None, model_root=model_root, miner_repo=miner_repo
    )
    if lp is not None and dp is not None:
        return str(lp), str(dp)
    if lp is not None and dp is None:
        cand = sorted(Path(lp).rglob("codec.pth"), key=lambda x: len(x.parts))
        if not cand:
            raise FileNotFoundError(f"no codec.pth under {lp}")
        return str(lp), str(cand[0])
    if dp is not None and lp is None:
        return str(Path(dp).parent), str(dp)
    c = model_root / "codec.pth"
    if c.is_file():
        p = c.parent
        return str(p), str(c)
    m = sorted(model_root.rglob("codec.pth"), key=lambda x: len(x.parts))
    if not m:
        raise FileNotFoundError(f"no codec.pth under {model_root}")
    x = m[0]
    return str(x.parent), str(x)


def _prompt(instruction: str, text: str) -> str:
    s = instruction.strip()
    tags = "".join(f"[{p.strip()}]" for p in s.split("|") if p.strip()) if s else ""
    body = text.strip()
    if not tags:
        return body
    return f"{tags} {body}" if body else tags


class Miner:
    def __init__(self, miner_repo: Path) -> None:
        if not logging.root.handlers:
            logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
        self._repo = Path(miner_repo).resolve()
        cfg = _read_yaml(self._repo)
        lim = cfg.get("limits") or {}
        self._cap_t = int(lim.get("max_text_chars", 2000))
        self._cap_i = int(lim.get("max_instruction_chars", 600))
        gen = cfg.get("generation") or {}
        self._out_sr = int(gen.get("sample_rate", _OUT_SR))
        if self._out_sr != _OUT_SR:
            raise ValueError(f"generation.sample_rate must be {_OUT_SR} in {_VOCENCE_YAML}")
        fs = cfg.get("fish_speech") or {}
        rt = cfg.get("runtime") or {}
        log = logging.getLogger(__name__)

        hub = (rt.get("hub_model_id") or rt.get("model_id") or "").strip()
        rev = str(rt.get("model_revision") or rt.get("hub_revision") or os.environ.get("VOCENCE_MODEL_REVISION") or "").strip() or None
        model_root = download_hub(self._repo, hub, rev) if hub else self._repo

        _ensure_fish_speech(self._repo, model_root, fs)
        llama_p, dec_p = _llama_and_decoder(model_root, self._repo, fs)
        if not Path(dec_p).is_file():
            raise FileNotFoundError(f"decoder not a file: {dec_p}")
        if not Path(llama_p).exists():
            raise FileNotFoundError(f"llama path missing: {llama_p}")

        dev = str(fs.get("device") or rt.get("device_preference") or os.environ.get("FISH_SPEECH_DEVICE") or "cuda")
        self._engine = load_tts_inference_engine(
            llama_checkpoint_path=llama_p,
            decoder_checkpoint_path=dec_p,
            decoder_config_name=str(fs.get("decoder_config_name", "modded_dac_vq")),
            device=dev,
            half=bool(fs.get("half", False)),
            compile_model=bool(fs.get("compile", False)),
        )
        self._tok = int(fs.get("max_new_tokens", 1024))
        self._chunk = int(fs.get("chunk_length", 200))
        self._top_p = float(fs.get("top_p", 0.8))
        self._rep = float(fs.get("repetition_penalty", 1.1))
        self._temp = float(fs.get("temperature", 0.8))
        se = fs.get("seed")
        self._seed = int(se) if se is not None else None
        self._adapter = str(rt.get("adapter", "finetuned-tts"))
        log.info("Miner ready (hub=%s, llama=%s)", hub or "local", llama_p)

    def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
        t = text[: self._cap_t] if self._cap_t else text
        ins = instruction[: self._cap_i] if self._cap_i else instruction
        sr, wav = synthesize_wav(
            self._engine,
            text=_prompt(ins, t),
            max_new_tokens=self._tok,
            chunk_length=self._chunk,
            top_p=self._top_p,
            repetition_penalty=self._rep,
            temperature=self._temp,
            seed=self._seed,
        )
        return _resample(wav, int(sr), self._out_sr), self._out_sr


_engine: Miner | None = None
_err: str | None = None
_sr: int = _OUT_SR
_adapter: str = "finetuned-tts"


def _run_dev_server() -> None:
    from contextlib import asynccontextmanager

    import uvicorn
    from fastapi import Body, FastAPI, HTTPException, status
    from fastapi.responses import Response
    from pydantic import BaseModel

    if not logging.root.handlers:
        logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")

    @asynccontextmanager
    async def lifespan(_: Any):
        global _engine, _err, _sr, _adapter
        cfg = _read_yaml(REPO)
        _sr = int((cfg.get("generation") or {}).get("sample_rate", _OUT_SR))
        _adapter = str((cfg.get("runtime") or {}).get("adapter", "finetuned-tts"))
        try:
            _engine = Miner(REPO)
            _err = None
        except Exception as e:
            _engine = None
            _err = f"{type(e).__name__}: {e}"
            logging.getLogger(__name__).exception("Miner startup failed")
        yield
        _engine = None

    class Health(BaseModel):
        status: str
        model_loaded: bool
        sample_rate: int
        adapter: str
        error: str | None = None

    app = FastAPI(title="Vocence TTS", lifespan=lifespan)

    @app.get("/health", response_model=Health)
    async def health() -> Health:
        ok = _engine is not None
        return Health(
            status="healthy" if ok else "unhealthy",
            model_loaded=ok,
            sample_rate=_sr,
            adapter=_adapter,
            error=None if ok else _err,
        )

    lim = _read_yaml(REPO).get("limits") or {}
    mx_t, mx_i = int(lim.get("max_text_chars", 2000)), int(lim.get("max_instruction_chars", 600))

    @app.post("/speak", response_class=Response, response_model=None)
    async def speak(
        text: str = Body(..., min_length=1, max_length=mx_t, embed=True),
        instruction: str = Body(..., min_length=1, max_length=mx_i, embed=True),
    ) -> Response:
        if _engine is None:
            raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=_err or "engine not loaded")
        w, sr = _engine.generate_wav(instruction, text)
        w = np.asarray(w)
        if w.ndim != 1 or w.size == 0:
            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="invalid waveform")
        d = float(w.shape[0]) / float(sr)
        if d <= 0 or d > _MAX_AUDIO_SEC:
            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="invalid duration")
        return Response(content=_wav_bytes(w, int(sr)), media_type="audio/wav")

    uvicorn.run(app, host=os.environ.get("HOST", "0.0.0.0"), port=int(os.environ.get("PORT", "8765")))


if __name__ == "__main__":
    _run_dev_server()