| from __future__ import annotations |
|
|
| """Vocence TTS miner: HF snapshot (weights + ``fish-speech/``) then Fish inference. |
| |
| After ``snapshot_download``, code lives at ``<snapshot>/fish-speech/``. Missing |
| PyPI packages are installed one at a time via ``pip install <module>`` from |
| ``ModuleNotFoundError`` (avoids full ``pyproject.toml`` install pulling ``pyaudio``, |
| which needs system ``portaudio`` headers). Set ``VOCENCE_SKIP_FISH_SPEECH_PIP=1`` to |
| disable. Override path with ``fish_speech.repo_root`` or ``FISH_SPEECH_ROOT``. |
| """ |
|
|
| import io |
| import logging |
| import os |
| import sys |
| import wave |
| from pathlib import Path |
| from typing import Any, Mapping |
|
|
| import numpy as np |
|
|
| REPO = Path(__file__).resolve().parent |
| _VOCENCE_YAML = "vocence_config.yaml" |
| _MAX_AUDIO_SEC = 30 |
| _OUT_SR = 24000 |
|
|
| _OMEGA_RESOLVER_PATCHED: bool = False |
| _OrigOmegaRegister: Any = None |
|
|
|
|
| def _patch_omegaconf_register_new_resolver() -> None: |
| """Retry ``register_new_resolver`` with ``replace=True`` if Hydra/lightning registered ``eval`` first.""" |
| global _OMEGA_RESOLVER_PATCHED, _OrigOmegaRegister |
| if _OMEGA_RESOLVER_PATCHED: |
| return |
| try: |
| from omegaconf import OmegaConf |
| except ImportError: |
| return |
| if _OrigOmegaRegister is None: |
| _OrigOmegaRegister = OmegaConf.register_new_resolver |
|
|
| def _patched(name, resolver, *args, **kwargs): |
| kw = dict(kwargs) |
| try: |
| return _OrigOmegaRegister(name, resolver, *args, **kw) |
| except ValueError as exc: |
| if "already registered" not in str(exc).lower(): |
| raise |
| kw["replace"] = True |
| return _OrigOmegaRegister(name, resolver, *args, **kw) |
|
|
| OmegaConf.register_new_resolver = _patched |
| _OMEGA_RESOLVER_PATCHED = True |
|
|
|
|
| def _read_yaml(repo: Path) -> dict[str, Any]: |
| from yaml import safe_load |
|
|
| p = repo / _VOCENCE_YAML |
| if not p.is_file(): |
| return {} |
| with p.open(encoding="utf-8") as f: |
| d = safe_load(f) |
| return dict(d) if isinstance(d, Mapping) else {} |
|
|
|
|
| def _hf_token() -> str | None: |
| t = (os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or "").strip() |
| return t or None |
|
|
|
|
| def _weights_dir(repo: Path, repo_id: str) -> Path: |
| safe = repo_id.replace("/", "__").replace(":", "_") |
| return (repo / "_vocence_hf_weights" / safe).resolve() |
|
|
|
|
| def download_hub(repo: Path, repo_id: str, revision: str | None) -> Path: |
| from huggingface_hub import snapshot_download |
|
|
| dest = _weights_dir(repo, repo_id) |
| dest.mkdir(parents=True, exist_ok=True) |
| logging.getLogger(__name__).info("Downloading %s → %s", repo_id, dest) |
| snapshot_download(repo_id=repo_id, revision=revision, local_dir=str(dest), token=_hf_token()) |
| if not (dest / "codec.pth").is_file(): |
| raise FileNotFoundError(f"missing codec.pth under {dest}") |
| return dest |
|
|
|
|
| def _purge_tools_modules() -> None: |
| for m in list(sys.modules): |
| if m == "tools" or m.startswith("tools.") or m == "fish_speech" or m.startswith("fish_speech."): |
| del sys.modules[m] |
|
|
|
|
| |
| _PIP_ALIASES: dict[str, str] = { |
| "PIL": "Pillow", |
| "yaml": "PyYAML", |
| "sklearn": "scikit-learn", |
| "hydra": "hydra-core", |
| "pytorch_lightning": "lightning", |
| } |
|
|
|
|
| def _pip_install_module(mod: str) -> None: |
| """``pip install`` the PyPI package that provides import name ``mod`` (top-level segment).""" |
| import subprocess |
|
|
| head = (mod or "").strip().split(".")[0] |
| if not head: |
| raise ValueError("empty module name") |
| if head in ("fish_speech", "tools"): |
| raise RuntimeError(f"refusing to pip install std project name {head!r}") |
| std = getattr(sys, "stdlib_module_names", None) |
| if std is not None and head in std: |
| raise RuntimeError(f"refusing to pip install stdlib name {head!r}") |
| pkg = _PIP_ALIASES.get(head, head) |
| log = logging.getLogger(__name__) |
| cmd = [sys.executable, "-m", "pip", "install", pkg] |
| log.info("Running: %s", " ".join(cmd)) |
| r = subprocess.run(cmd, capture_output=True, text=True, timeout=600) |
| if r.returncode != 0: |
| msg = (r.stderr or r.stdout or "").strip() or f"exit {r.returncode}" |
| raise RuntimeError(f"pip install {pkg!r} failed: {msg}") |
|
|
|
|
| def _tools_already_importable() -> bool: |
| import importlib |
|
|
| _patch_omegaconf_register_new_resolver() |
| try: |
| importlib.import_module("tools.server.model_manager") |
| return True |
| except Exception: |
| _purge_tools_modules() |
| return False |
|
|
|
|
| def _ensure_fish_speech(miner_repo: Path, model_root: Path, fs: Mapping[str, Any]) -> None: |
| """Use ``<model_root>/fish-speech`` on ``sys.path`` (or ``repo_root`` / ``FISH_SPEECH_ROOT``).""" |
| global _OMEGA_RESOLVER_PATCHED |
| import importlib |
|
|
| log = logging.getLogger(__name__) |
| if _tools_already_importable(): |
| return |
|
|
| roots: list[Path] = [(model_root / "fish-speech").resolve()] |
| raw = (fs.get("repo_root") or os.environ.get("FISH_SPEECH_ROOT") or "").strip() |
| if raw: |
| p = Path(raw).expanduser() |
| roots.append(p.resolve() if p.is_absolute() else (miner_repo / p).resolve()) |
|
|
| mm = Path("tools") / "server" / "model_manager.py" |
| skip_pip = os.environ.get("VOCENCE_SKIP_FISH_SPEECH_PIP", "").strip().lower() in ("1", "true", "yes") |
| max_rounds = int(os.environ.get("VOCENCE_FISH_SPEECH_PIP_MAX_ROUNDS", "60")) |
|
|
| for code_root in roots: |
| if not (code_root / mm).is_file(): |
| continue |
| s = str(code_root.resolve()) |
| if s not in sys.path: |
| sys.path.insert(0, s) |
| last_err: BaseException | None = None |
| for _ in range(max_rounds): |
| _patch_omegaconf_register_new_resolver() |
| try: |
| importlib.import_module("tools.server.model_manager") |
| return |
| except ModuleNotFoundError as e: |
| last_err = e |
| _purge_tools_modules() |
| mod = e.name |
| if skip_pip or mod is None: |
| try: |
| sys.path.remove(s) |
| except ValueError: |
| pass |
| raise ImportError( |
| f"{code_root}: missing {mod!r}. Install deps or unset VOCENCE_SKIP_FISH_SPEECH_PIP." |
| ) from e |
| head = mod.split(".")[0] |
| pkg = _PIP_ALIASES.get(head, head) |
| log.warning("Missing Python module %r — pip install %r …", mod, pkg) |
| if head in ("fish_speech", "tools"): |
| try: |
| sys.path.remove(s) |
| except ValueError: |
| pass |
| raise ImportError( |
| f"{code_root}: project import {mod!r} failed (broken tree or path?)." |
| ) from e |
| try: |
| _pip_install_module(mod) |
| except Exception as pip_e: |
| try: |
| sys.path.remove(s) |
| except ValueError: |
| pass |
| raise ImportError(f"{code_root}: could not install missing {mod!r}: {pip_e}") from pip_e |
| if s not in sys.path: |
| sys.path.insert(0, s) |
| continue |
| except Exception as e: |
| msg_l = str(e).lower() |
| if "already registered" in msg_l and "resolver" in msg_l: |
| log.warning("OmegaConf resolver clash (%s); clearing ``eval`` and retrying …", e) |
| last_err = e |
| _purge_tools_modules() |
| try: |
| from omegaconf import OmegaConf |
|
|
| cr = getattr(OmegaConf, "clear_resolver", None) |
| if callable(cr): |
| cr("eval") |
| if _OrigOmegaRegister is not None: |
| OmegaConf.register_new_resolver = _OrigOmegaRegister |
| except Exception: |
| pass |
| _OMEGA_RESOLVER_PATCHED = False |
| _patch_omegaconf_register_new_resolver() |
| if s not in sys.path: |
| sys.path.insert(0, s) |
| continue |
| last_err = e |
| _purge_tools_modules() |
| try: |
| sys.path.remove(s) |
| except ValueError: |
| pass |
| raise ImportError( |
| f"{code_root}: import failed after resolving modules (not a simple missing PyPI dep): {e}" |
| ) from e |
|
|
| try: |
| sys.path.remove(s) |
| except ValueError: |
| pass |
| raise ImportError( |
| f"{code_root}: exceeded {max_rounds} pip rounds (last error: {last_err}). " |
| "Install fish-speech deps manually or raise VOCENCE_FISH_SPEECH_PIP_MAX_ROUNDS." |
| ) from last_err |
|
|
| raise FileNotFoundError( |
| f"Missing {roots[0] / mm}. HF repo should include a fish-speech/ tree next to codec.pth, " |
| f"or set fish_speech.repo_root in {_VOCENCE_YAML} / FISH_SPEECH_ROOT." |
| ) |
|
|
|
|
| def load_tts_inference_engine( |
| *, |
| llama_checkpoint_path: str, |
| decoder_checkpoint_path: str, |
| decoder_config_name: str = "modded_dac_vq", |
| device: str = "cuda", |
| half: bool = False, |
| compile_model: bool = False, |
| ) -> Any: |
| from tools.server.model_manager import ModelManager |
|
|
| m = ModelManager( |
| mode="tts", |
| device=device, |
| half=half, |
| compile=compile_model, |
| llama_checkpoint_path=llama_checkpoint_path, |
| decoder_checkpoint_path=decoder_checkpoint_path, |
| decoder_config_name=decoder_config_name, |
| ) |
| return m.tts_inference_engine |
|
|
|
|
| def synthesize_wav( |
| engine: Any, |
| *, |
| text: str, |
| reference_audio_path: str | None = None, |
| reference_text: str | None = None, |
| max_new_tokens: int = 1024, |
| chunk_length: int = 200, |
| top_p: float = 0.8, |
| repetition_penalty: float = 1.1, |
| temperature: float = 0.8, |
| seed: int | None = None, |
| ) -> tuple[int, np.ndarray]: |
| from fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest |
|
|
| if bool(reference_audio_path) ^ bool(reference_text): |
| raise ValueError("reference_audio_path and reference_text must be both set or both omitted") |
| refs: list[ServeReferenceAudio] = [] |
| if reference_audio_path: |
| rp = Path(reference_audio_path) |
| if not rp.is_file(): |
| raise FileNotFoundError(rp) |
| refs = [ServeReferenceAudio(audio=rp.read_bytes(), text=reference_text or "")] |
| req = ServeTTSRequest( |
| text=text, |
| references=refs, |
| reference_id=None, |
| max_new_tokens=max_new_tokens, |
| chunk_length=chunk_length, |
| top_p=top_p, |
| repetition_penalty=repetition_penalty, |
| temperature=temperature, |
| format="wav", |
| streaming=False, |
| seed=seed, |
| ) |
| sr: int | None = None |
| audio: np.ndarray | None = None |
| for result in engine.inference(req): |
| if result.code == "error": |
| raise RuntimeError(str(result.error or "inference error")) |
| if result.code == "final" and result.audio is not None: |
| sr, audio = result.audio |
| break |
| if sr is None or audio is None: |
| raise RuntimeError("no audio") |
| arr = np.asarray(audio, dtype=np.float32) |
| if arr.ndim > 1: |
| arr = np.mean(arr, axis=-1).astype(np.float32) |
| return int(sr), arr |
|
|
|
|
| def _resample(w: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray: |
| if orig_sr == target_sr: |
| return np.asarray(w, dtype=np.float32) |
| import librosa |
|
|
| y = np.asarray(w, dtype=np.float32) |
| if y.ndim > 1: |
| y = np.mean(y, axis=-1).astype(np.float32) |
| return librosa.resample(y, orig_sr=orig_sr, target_sr=target_sr).astype(np.float32) |
|
|
|
|
| def _wav_bytes(w: np.ndarray, sample_rate: int) -> bytes: |
| w = np.clip(np.asarray(w, dtype=np.float32), -1.0, 1.0) |
| s16 = (w * 32767.0).astype(np.int16) |
| buf = io.BytesIO() |
| with wave.open(buf, "wb") as wv: |
| wv.setnchannels(1) |
| wv.setsampwidth(2) |
| wv.setframerate(sample_rate) |
| wv.writeframes(s16.tobytes()) |
| return buf.getvalue() |
|
|
|
|
| def _resolve_ckpt(raw: str | None, *, model_root: Path, miner_repo: Path) -> Path | None: |
| if not raw or not str(raw).strip(): |
| return None |
| s = str(raw).strip() |
| p = Path(s).expanduser() |
| if p.is_absolute(): |
| return p.resolve() |
| for base in (model_root, miner_repo): |
| c = (base / s).resolve() |
| if c.exists(): |
| return c |
| return (miner_repo / s).resolve() |
|
|
|
|
| def _llama_and_decoder(model_root: Path, miner_repo: Path, fs: Mapping[str, Any]) -> tuple[str, str]: |
| lr = (fs.get("llama_checkpoint_path") or os.environ.get("FISH_SPEECH_LLAMA_PATH") or "").strip() |
| dr = (fs.get("decoder_checkpoint_path") or os.environ.get("FISH_SPEECH_DECODER_PATH") or "").strip() |
| lp, dp = _resolve_ckpt(lr or None, model_root=model_root, miner_repo=miner_repo), _resolve_ckpt( |
| dr or None, model_root=model_root, miner_repo=miner_repo |
| ) |
| if lp is not None and dp is not None: |
| return str(lp), str(dp) |
| if lp is not None and dp is None: |
| cand = sorted(Path(lp).rglob("codec.pth"), key=lambda x: len(x.parts)) |
| if not cand: |
| raise FileNotFoundError(f"no codec.pth under {lp}") |
| return str(lp), str(cand[0]) |
| if dp is not None and lp is None: |
| return str(Path(dp).parent), str(dp) |
| c = model_root / "codec.pth" |
| if c.is_file(): |
| p = c.parent |
| return str(p), str(c) |
| m = sorted(model_root.rglob("codec.pth"), key=lambda x: len(x.parts)) |
| if not m: |
| raise FileNotFoundError(f"no codec.pth under {model_root}") |
| x = m[0] |
| return str(x.parent), str(x) |
|
|
|
|
| def _prompt(instruction: str, text: str) -> str: |
| s = instruction.strip() |
| tags = "".join(f"[{p.strip()}]" for p in s.split("|") if p.strip()) if s else "" |
| body = text.strip() |
| if not tags: |
| return body |
| return f"{tags} {body}" if body else tags |
|
|
|
|
| class Miner: |
| def __init__(self, miner_repo: Path) -> None: |
| if not logging.root.handlers: |
| logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s") |
| self._repo = Path(miner_repo).resolve() |
| cfg = _read_yaml(self._repo) |
| lim = cfg.get("limits") or {} |
| self._cap_t = int(lim.get("max_text_chars", 2000)) |
| self._cap_i = int(lim.get("max_instruction_chars", 600)) |
| gen = cfg.get("generation") or {} |
| self._out_sr = int(gen.get("sample_rate", _OUT_SR)) |
| if self._out_sr != _OUT_SR: |
| raise ValueError(f"generation.sample_rate must be {_OUT_SR} in {_VOCENCE_YAML}") |
| fs = cfg.get("fish_speech") or {} |
| rt = cfg.get("runtime") or {} |
| log = logging.getLogger(__name__) |
|
|
| hub = (rt.get("hub_model_id") or rt.get("model_id") or "").strip() |
| rev = str(rt.get("model_revision") or rt.get("hub_revision") or os.environ.get("VOCENCE_MODEL_REVISION") or "").strip() or None |
| model_root = download_hub(self._repo, hub, rev) if hub else self._repo |
|
|
| _ensure_fish_speech(self._repo, model_root, fs) |
| llama_p, dec_p = _llama_and_decoder(model_root, self._repo, fs) |
| if not Path(dec_p).is_file(): |
| raise FileNotFoundError(f"decoder not a file: {dec_p}") |
| if not Path(llama_p).exists(): |
| raise FileNotFoundError(f"llama path missing: {llama_p}") |
|
|
| dev = str(fs.get("device") or rt.get("device_preference") or os.environ.get("FISH_SPEECH_DEVICE") or "cuda") |
| self._engine = load_tts_inference_engine( |
| llama_checkpoint_path=llama_p, |
| decoder_checkpoint_path=dec_p, |
| decoder_config_name=str(fs.get("decoder_config_name", "modded_dac_vq")), |
| device=dev, |
| half=bool(fs.get("half", False)), |
| compile_model=bool(fs.get("compile", False)), |
| ) |
| self._tok = int(fs.get("max_new_tokens", 1024)) |
| self._chunk = int(fs.get("chunk_length", 200)) |
| self._top_p = float(fs.get("top_p", 0.8)) |
| self._rep = float(fs.get("repetition_penalty", 1.1)) |
| self._temp = float(fs.get("temperature", 0.8)) |
| se = fs.get("seed") |
| self._seed = int(se) if se is not None else None |
| self._adapter = str(rt.get("adapter", "finetuned-tts")) |
| log.info("Miner ready (hub=%s, llama=%s)", hub or "local", llama_p) |
|
|
| def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]: |
| t = text[: self._cap_t] if self._cap_t else text |
| ins = instruction[: self._cap_i] if self._cap_i else instruction |
| sr, wav = synthesize_wav( |
| self._engine, |
| text=_prompt(ins, t), |
| max_new_tokens=self._tok, |
| chunk_length=self._chunk, |
| top_p=self._top_p, |
| repetition_penalty=self._rep, |
| temperature=self._temp, |
| seed=self._seed, |
| ) |
| return _resample(wav, int(sr), self._out_sr), self._out_sr |
|
|
|
|
| _engine: Miner | None = None |
| _err: str | None = None |
| _sr: int = _OUT_SR |
| _adapter: str = "finetuned-tts" |
|
|
|
|
| def _run_dev_server() -> None: |
| from contextlib import asynccontextmanager |
|
|
| import uvicorn |
| from fastapi import Body, FastAPI, HTTPException, status |
| from fastapi.responses import Response |
| from pydantic import BaseModel |
|
|
| if not logging.root.handlers: |
| logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s") |
|
|
| @asynccontextmanager |
| async def lifespan(_: Any): |
| global _engine, _err, _sr, _adapter |
| cfg = _read_yaml(REPO) |
| _sr = int((cfg.get("generation") or {}).get("sample_rate", _OUT_SR)) |
| _adapter = str((cfg.get("runtime") or {}).get("adapter", "finetuned-tts")) |
| try: |
| _engine = Miner(REPO) |
| _err = None |
| except Exception as e: |
| _engine = None |
| _err = f"{type(e).__name__}: {e}" |
| logging.getLogger(__name__).exception("Miner startup failed") |
| yield |
| _engine = None |
|
|
| class Health(BaseModel): |
| status: str |
| model_loaded: bool |
| sample_rate: int |
| adapter: str |
| error: str | None = None |
|
|
| app = FastAPI(title="Vocence TTS", lifespan=lifespan) |
|
|
| @app.get("/health", response_model=Health) |
| async def health() -> Health: |
| ok = _engine is not None |
| return Health( |
| status="healthy" if ok else "unhealthy", |
| model_loaded=ok, |
| sample_rate=_sr, |
| adapter=_adapter, |
| error=None if ok else _err, |
| ) |
|
|
| lim = _read_yaml(REPO).get("limits") or {} |
| mx_t, mx_i = int(lim.get("max_text_chars", 2000)), int(lim.get("max_instruction_chars", 600)) |
|
|
| @app.post("/speak", response_class=Response, response_model=None) |
| async def speak( |
| text: str = Body(..., min_length=1, max_length=mx_t, embed=True), |
| instruction: str = Body(..., min_length=1, max_length=mx_i, embed=True), |
| ) -> Response: |
| if _engine is None: |
| raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=_err or "engine not loaded") |
| w, sr = _engine.generate_wav(instruction, text) |
| w = np.asarray(w) |
| if w.ndim != 1 or w.size == 0: |
| raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="invalid waveform") |
| d = float(w.shape[0]) / float(sr) |
| if d <= 0 or d > _MAX_AUDIO_SEC: |
| raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="invalid duration") |
| return Response(content=_wav_bytes(w, int(sr)), media_type="audio/wav") |
|
|
| uvicorn.run(app, host=os.environ.get("HOST", "0.0.0.0"), port=int(os.environ.get("PORT", "8765"))) |
|
|
|
|
| if __name__ == "__main__": |
| _run_dev_server() |
|
|