Upload miner.py with huggingface_hub

8960e0b verified 9 days ago

20.1 kB

	from __future__ import annotations

	"""Vocence TTS miner: HF snapshot (weights + ``fish-speech/``) then Fish inference.

	After ``snapshot_download``, code lives at ``<snapshot>/fish-speech/``. Missing
	PyPI packages are installed one at a time via ``pip install <module>`` from
	``ModuleNotFoundError`` (avoids full ``pyproject.toml`` install pulling ``pyaudio``,
	which needs system ``portaudio`` headers). Set ``VOCENCE_SKIP_FISH_SPEECH_PIP=1`` to
	disable. Override path with ``fish_speech.repo_root`` or ``FISH_SPEECH_ROOT``.
	"""

	import io
	import logging
	import os
	import sys
	import wave
	from pathlib import Path
	from typing import Any, Mapping

	import numpy as np

	REPO = Path(__file__).resolve().parent
	_VOCENCE_YAML = "vocence_config.yaml"
	_MAX_AUDIO_SEC = 30
	_OUT_SR = 24000

	_OMEGA_RESOLVER_PATCHED: bool = False
	_OrigOmegaRegister: Any = None


	def _patch_omegaconf_register_new_resolver() -> None:
	"""Retry ``register_new_resolver`` with ``replace=True`` if Hydra/lightning registered ``eval`` first."""
	global _OMEGA_RESOLVER_PATCHED, _OrigOmegaRegister
	if _OMEGA_RESOLVER_PATCHED:
	return
	try:
	from omegaconf import OmegaConf
	except ImportError:
	return
	if _OrigOmegaRegister is None:
	_OrigOmegaRegister = OmegaConf.register_new_resolver

	def _patched(name, resolver, args, *kwargs):
	kw = dict(kwargs)
	try:
	return _OrigOmegaRegister(name, resolver, args, *kw)
	except ValueError as exc:
	if "already registered" not in str(exc).lower():
	raise
	kw["replace"] = True
	return _OrigOmegaRegister(name, resolver, args, *kw)

	OmegaConf.register_new_resolver = _patched # type: ignore[method-assign]
	_OMEGA_RESOLVER_PATCHED = True


	def _read_yaml(repo: Path) -> dict[str, Any]:
	from yaml import safe_load

	p = repo / _VOCENCE_YAML
	if not p.is_file():
	return {}
	with p.open(encoding="utf-8") as f:
	d = safe_load(f)
	return dict(d) if isinstance(d, Mapping) else {}


	def _hf_token() -> str \| None:
	t = (os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or "").strip()
	return t or None


	def _weights_dir(repo: Path, repo_id: str) -> Path:
	safe = repo_id.replace("/", "__").replace(":", "_")
	return (repo / "_vocence_hf_weights" / safe).resolve()


	def download_hub(repo: Path, repo_id: str, revision: str \| None) -> Path:
	from huggingface_hub import snapshot_download

	dest = _weights_dir(repo, repo_id)
	dest.mkdir(parents=True, exist_ok=True)
	logging.getLogger(__name__).info("Downloading %s → %s", repo_id, dest)
	snapshot_download(repo_id=repo_id, revision=revision, local_dir=str(dest), token=_hf_token())
	if not (dest / "codec.pth").is_file():
	raise FileNotFoundError(f"missing codec.pth under {dest}")
	return dest


	def _purge_tools_modules() -> None:
	for m in list(sys.modules):
	if m == "tools" or m.startswith("tools.") or m == "fish_speech" or m.startswith("fish_speech."):
	del sys.modules[m]


	# Top-level import name -> PyPI distribution (wrong names break installs, e.g. ``hydra`` vs ``hydra-core``).
	_PIP_ALIASES: dict[str, str] = {
	"PIL": "Pillow",
	"yaml": "PyYAML",
	"sklearn": "scikit-learn",
	"hydra": "hydra-core",
	"pytorch_lightning": "lightning",
	}


	def _pip_install_module(mod: str) -> None:
	"""``pip install`` the PyPI package that provides import name ``mod`` (top-level segment)."""
	import subprocess

	head = (mod or "").strip().split(".")[0]
	if not head:
	raise ValueError("empty module name")
	if head in ("fish_speech", "tools"):
	raise RuntimeError(f"refusing to pip install std project name {head!r}")
	std = getattr(sys, "stdlib_module_names", None)
	if std is not None and head in std:
	raise RuntimeError(f"refusing to pip install stdlib name {head!r}")
	pkg = _PIP_ALIASES.get(head, head)
	log = logging.getLogger(__name__)
	cmd = [sys.executable, "-m", "pip", "install", pkg]
	log.info("Running: %s", " ".join(cmd))
	r = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
	if r.returncode != 0:
	msg = (r.stderr or r.stdout or "").strip() or f"exit {r.returncode}"
	raise RuntimeError(f"pip install {pkg!r} failed: {msg}")


	def _tools_already_importable() -> bool:
	import importlib

	_patch_omegaconf_register_new_resolver()
	try:
	importlib.import_module("tools.server.model_manager")
	return True
	except Exception:
	_purge_tools_modules()
	return False


	def _ensure_fish_speech(miner_repo: Path, model_root: Path, fs: Mapping[str, Any]) -> None:
	"""Use ``<model_root>/fish-speech`` on ``sys.path`` (or ``repo_root`` / ``FISH_SPEECH_ROOT``)."""
	global _OMEGA_RESOLVER_PATCHED
	import importlib

	log = logging.getLogger(__name__)
	if _tools_already_importable():
	return

	roots: list[Path] = [(model_root / "fish-speech").resolve()]
	raw = (fs.get("repo_root") or os.environ.get("FISH_SPEECH_ROOT") or "").strip()
	if raw:
	p = Path(raw).expanduser()
	roots.append(p.resolve() if p.is_absolute() else (miner_repo / p).resolve())

	mm = Path("tools") / "server" / "model_manager.py"
	skip_pip = os.environ.get("VOCENCE_SKIP_FISH_SPEECH_PIP", "").strip().lower() in ("1", "true", "yes")
	max_rounds = int(os.environ.get("VOCENCE_FISH_SPEECH_PIP_MAX_ROUNDS", "60"))

	for code_root in roots:
	if not (code_root / mm).is_file():
	continue
	s = str(code_root.resolve())
	if s not in sys.path:
	sys.path.insert(0, s)
	last_err: BaseException \| None = None
	for _ in range(max_rounds):
	_patch_omegaconf_register_new_resolver()
	try:
	importlib.import_module("tools.server.model_manager")
	return
	except ModuleNotFoundError as e:
	last_err = e
	_purge_tools_modules()
	mod = e.name
	if skip_pip or mod is None:
	try:
	sys.path.remove(s)
	except ValueError:
	pass
	raise ImportError(
	f"{code_root}: missing {mod!r}. Install deps or unset VOCENCE_SKIP_FISH_SPEECH_PIP."
	) from e
	head = mod.split(".")[0]
	pkg = _PIP_ALIASES.get(head, head)
	log.warning("Missing Python module %r — pip install %r …", mod, pkg)
	if head in ("fish_speech", "tools"):
	try:
	sys.path.remove(s)
	except ValueError:
	pass
	raise ImportError(
	f"{code_root}: project import {mod!r} failed (broken tree or path?)."
	) from e
	try:
	_pip_install_module(mod)
	except Exception as pip_e:
	try:
	sys.path.remove(s)
	except ValueError:
	pass
	raise ImportError(f"{code_root}: could not install missing {mod!r}: {pip_e}") from pip_e
	if s not in sys.path:
	sys.path.insert(0, s)
	continue
	except Exception as e:
	msg_l = str(e).lower()
	if "already registered" in msg_l and "resolver" in msg_l:
	log.warning("OmegaConf resolver clash (%s); clearing ``eval`` and retrying …", e)
	last_err = e
	_purge_tools_modules()
	try:
	from omegaconf import OmegaConf

	cr = getattr(OmegaConf, "clear_resolver", None)
	if callable(cr):
	cr("eval")
	if _OrigOmegaRegister is not None:
	OmegaConf.register_new_resolver = _OrigOmegaRegister # type: ignore[method-assign]
	except Exception:
	pass
	_OMEGA_RESOLVER_PATCHED = False
	_patch_omegaconf_register_new_resolver()
	if s not in sys.path:
	sys.path.insert(0, s)
	continue
	last_err = e
	_purge_tools_modules()
	try:
	sys.path.remove(s)
	except ValueError:
	pass
	raise ImportError(
	f"{code_root}: import failed after resolving modules (not a simple missing PyPI dep): {e}"
	) from e

	try:
	sys.path.remove(s)
	except ValueError:
	pass
	raise ImportError(
	f"{code_root}: exceeded {max_rounds} pip rounds (last error: {last_err}). "
	"Install fish-speech deps manually or raise VOCENCE_FISH_SPEECH_PIP_MAX_ROUNDS."
	) from last_err

	raise FileNotFoundError(
	f"Missing {roots[0] / mm}. HF repo should include a fish-speech/ tree next to codec.pth, "
	f"or set fish_speech.repo_root in {_VOCENCE_YAML} / FISH_SPEECH_ROOT."
	)


	def load_tts_inference_engine(
	*,
	llama_checkpoint_path: str,
	decoder_checkpoint_path: str,
	decoder_config_name: str = "modded_dac_vq",
	device: str = "cuda",
	half: bool = False,
	compile_model: bool = False,
	) -> Any:
	from tools.server.model_manager import ModelManager

	m = ModelManager(
	mode="tts",
	device=device,
	half=half,
	compile=compile_model,
	llama_checkpoint_path=llama_checkpoint_path,
	decoder_checkpoint_path=decoder_checkpoint_path,
	decoder_config_name=decoder_config_name,
	)
	return m.tts_inference_engine


	def synthesize_wav(
	engine: Any,
	*,
	text: str,
	reference_audio_path: str \| None = None,
	reference_text: str \| None = None,
	max_new_tokens: int = 1024,
	chunk_length: int = 200,
	top_p: float = 0.8,
	repetition_penalty: float = 1.1,
	temperature: float = 0.8,
	seed: int \| None = None,
	) -> tuple[int, np.ndarray]:
	from fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest

	if bool(reference_audio_path) ^ bool(reference_text):
	raise ValueError("reference_audio_path and reference_text must be both set or both omitted")
	refs: list[ServeReferenceAudio] = []
	if reference_audio_path:
	rp = Path(reference_audio_path)
	if not rp.is_file():
	raise FileNotFoundError(rp)
	refs = [ServeReferenceAudio(audio=rp.read_bytes(), text=reference_text or "")]
	req = ServeTTSRequest(
	text=text,
	references=refs,
	reference_id=None,
	max_new_tokens=max_new_tokens,
	chunk_length=chunk_length,
	top_p=top_p,
	repetition_penalty=repetition_penalty,
	temperature=temperature,
	format="wav",
	streaming=False,
	seed=seed,
	)
	sr: int \| None = None
	audio: np.ndarray \| None = None
	for result in engine.inference(req):
	if result.code == "error":
	raise RuntimeError(str(result.error or "inference error"))
	if result.code == "final" and result.audio is not None:
	sr, audio = result.audio
	break
	if sr is None or audio is None:
	raise RuntimeError("no audio")
	arr = np.asarray(audio, dtype=np.float32)
	if arr.ndim > 1:
	arr = np.mean(arr, axis=-1).astype(np.float32)
	return int(sr), arr


	def _resample(w: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
	if orig_sr == target_sr:
	return np.asarray(w, dtype=np.float32)
	import librosa

	y = np.asarray(w, dtype=np.float32)
	if y.ndim > 1:
	y = np.mean(y, axis=-1).astype(np.float32)
	return librosa.resample(y, orig_sr=orig_sr, target_sr=target_sr).astype(np.float32)


	def _wav_bytes(w: np.ndarray, sample_rate: int) -> bytes:
	w = np.clip(np.asarray(w, dtype=np.float32), -1.0, 1.0)
	s16 = (w * 32767.0).astype(np.int16)
	buf = io.BytesIO()
	with wave.open(buf, "wb") as wv:
	wv.setnchannels(1)
	wv.setsampwidth(2)
	wv.setframerate(sample_rate)
	wv.writeframes(s16.tobytes())
	return buf.getvalue()


	def _resolve_ckpt(raw: str \| None, *, model_root: Path, miner_repo: Path) -> Path \| None:
	if not raw or not str(raw).strip():
	return None
	s = str(raw).strip()
	p = Path(s).expanduser()
	if p.is_absolute():
	return p.resolve()
	for base in (model_root, miner_repo):
	c = (base / s).resolve()
	if c.exists():
	return c
	return (miner_repo / s).resolve()


	def _llama_and_decoder(model_root: Path, miner_repo: Path, fs: Mapping[str, Any]) -> tuple[str, str]:
	lr = (fs.get("llama_checkpoint_path") or os.environ.get("FISH_SPEECH_LLAMA_PATH") or "").strip()
	dr = (fs.get("decoder_checkpoint_path") or os.environ.get("FISH_SPEECH_DECODER_PATH") or "").strip()
	lp, dp = _resolve_ckpt(lr or None, model_root=model_root, miner_repo=miner_repo), _resolve_ckpt(
	dr or None, model_root=model_root, miner_repo=miner_repo
	)
	if lp is not None and dp is not None:
	return str(lp), str(dp)
	if lp is not None and dp is None:
	cand = sorted(Path(lp).rglob("codec.pth"), key=lambda x: len(x.parts))
	if not cand:
	raise FileNotFoundError(f"no codec.pth under {lp}")
	return str(lp), str(cand[0])
	if dp is not None and lp is None:
	return str(Path(dp).parent), str(dp)
	c = model_root / "codec.pth"
	if c.is_file():
	p = c.parent
	return str(p), str(c)
	m = sorted(model_root.rglob("codec.pth"), key=lambda x: len(x.parts))
	if not m:
	raise FileNotFoundError(f"no codec.pth under {model_root}")
	x = m[0]
	return str(x.parent), str(x)


	def _prompt(instruction: str, text: str) -> str:
	s = instruction.strip()
	tags = "".join(f"[{p.strip()}]" for p in s.split("\|") if p.strip()) if s else ""
	body = text.strip()
	if not tags:
	return body
	return f"{tags} {body}" if body else tags


	class Miner:
	def __init__(self, miner_repo: Path) -> None:
	if not logging.root.handlers:
	logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
	self._repo = Path(miner_repo).resolve()
	cfg = _read_yaml(self._repo)
	lim = cfg.get("limits") or {}
	self._cap_t = int(lim.get("max_text_chars", 2000))
	self._cap_i = int(lim.get("max_instruction_chars", 600))
	gen = cfg.get("generation") or {}
	self._out_sr = int(gen.get("sample_rate", _OUT_SR))
	if self._out_sr != _OUT_SR:
	raise ValueError(f"generation.sample_rate must be {_OUT_SR} in {_VOCENCE_YAML}")
	fs = cfg.get("fish_speech") or {}
	rt = cfg.get("runtime") or {}
	log = logging.getLogger(__name__)

	hub = (rt.get("hub_model_id") or rt.get("model_id") or "").strip()
	rev = str(rt.get("model_revision") or rt.get("hub_revision") or os.environ.get("VOCENCE_MODEL_REVISION") or "").strip() or None
	model_root = download_hub(self._repo, hub, rev) if hub else self._repo

	_ensure_fish_speech(self._repo, model_root, fs)
	llama_p, dec_p = _llama_and_decoder(model_root, self._repo, fs)
	if not Path(dec_p).is_file():
	raise FileNotFoundError(f"decoder not a file: {dec_p}")
	if not Path(llama_p).exists():
	raise FileNotFoundError(f"llama path missing: {llama_p}")

	dev = str(fs.get("device") or rt.get("device_preference") or os.environ.get("FISH_SPEECH_DEVICE") or "cuda")
	self._engine = load_tts_inference_engine(
	llama_checkpoint_path=llama_p,
	decoder_checkpoint_path=dec_p,
	decoder_config_name=str(fs.get("decoder_config_name", "modded_dac_vq")),
	device=dev,
	half=bool(fs.get("half", False)),
	compile_model=bool(fs.get("compile", False)),
	)
	self._tok = int(fs.get("max_new_tokens", 1024))
	self._chunk = int(fs.get("chunk_length", 200))
	self._top_p = float(fs.get("top_p", 0.8))
	self._rep = float(fs.get("repetition_penalty", 1.1))
	self._temp = float(fs.get("temperature", 0.8))
	se = fs.get("seed")
	self._seed = int(se) if se is not None else None
	self._adapter = str(rt.get("adapter", "finetuned-tts"))
	log.info("Miner ready (hub=%s, llama=%s)", hub or "local", llama_p)

	def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
	t = text[: self._cap_t] if self._cap_t else text
	ins = instruction[: self._cap_i] if self._cap_i else instruction
	sr, wav = synthesize_wav(
	self._engine,
	text=_prompt(ins, t),
	max_new_tokens=self._tok,
	chunk_length=self._chunk,
	top_p=self._top_p,
	repetition_penalty=self._rep,
	temperature=self._temp,
	seed=self._seed,
	)
	return _resample(wav, int(sr), self._out_sr), self._out_sr


	_engine: Miner \| None = None
	_err: str \| None = None
	_sr: int = _OUT_SR
	_adapter: str = "finetuned-tts"


	def _run_dev_server() -> None:
	from contextlib import asynccontextmanager

	import uvicorn
	from fastapi import Body, FastAPI, HTTPException, status
	from fastapi.responses import Response
	from pydantic import BaseModel

	if not logging.root.handlers:
	logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")

	@asynccontextmanager
	async def lifespan(_: Any):
	global _engine, _err, _sr, _adapter
	cfg = _read_yaml(REPO)
	_sr = int((cfg.get("generation") or {}).get("sample_rate", _OUT_SR))
	_adapter = str((cfg.get("runtime") or {}).get("adapter", "finetuned-tts"))
	try:
	_engine = Miner(REPO)
	_err = None
	except Exception as e:
	_engine = None
	_err = f"{type(e).__name__}: {e}"
	logging.getLogger(__name__).exception("Miner startup failed")
	yield
	_engine = None

	class Health(BaseModel):
	status: str
	model_loaded: bool
	sample_rate: int
	adapter: str
	error: str \| None = None

	app = FastAPI(title="Vocence TTS", lifespan=lifespan)

	@app.get("/health", response_model=Health)
	async def health() -> Health:
	ok = _engine is not None
	return Health(
	status="healthy" if ok else "unhealthy",
	model_loaded=ok,
	sample_rate=_sr,
	adapter=_adapter,
	error=None if ok else _err,
	)

	lim = _read_yaml(REPO).get("limits") or {}
	mx_t, mx_i = int(lim.get("max_text_chars", 2000)), int(lim.get("max_instruction_chars", 600))

	@app.post("/speak", response_class=Response, response_model=None)
	async def speak(
	text: str = Body(..., min_length=1, max_length=mx_t, embed=True),
	instruction: str = Body(..., min_length=1, max_length=mx_i, embed=True),
	) -> Response:
	if _engine is None:
	raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=_err or "engine not loaded")
	w, sr = _engine.generate_wav(instruction, text)
	w = np.asarray(w)
	if w.ndim != 1 or w.size == 0:
	raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="invalid waveform")
	d = float(w.shape[0]) / float(sr)
	if d <= 0 or d > _MAX_AUDIO_SEC:
	raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="invalid duration")
	return Response(content=_wav_bytes(w, int(sr)), media_type="audio/wav")

	uvicorn.run(app, host=os.environ.get("HOST", "0.0.0.0"), port=int(os.environ.get("PORT", "8765")))


	if __name__ == "__main__":
	_run_dev_server()