arwin0727
/

tts_engine_v2

Model card Files Files and versions

xet

Community

arwin0727 committed 20 days ago

Commit

9110de1

verified ·

1 Parent(s): e2d018c

Upload miner.py with huggingface_hub

Browse files

Files changed (1) hide show

miner.py +426 -0

miner.py ADDED Viewed

	@@ -0,0 +1,426 @@

+from __future__ import annotations
+import io
+import json
+import os
+import sys
+import wave
+from pathlib import Path
+from typing import Any, Mapping
+import numpy as np
+# Directory containing this file; the dev server reads its config and builds its Miner from here.
+REPO = Path(__file__).resolve().parent
+# Name of the config file, looked up inside a repo directory.
+_VOCENCE_YAML = "vocence_config.yaml"
+# Longest clip, in seconds, that the /speak handler will return; longer output is rejected.
+_MAX_AUDIO_SEC = 30
+# Required output sample rate in Hz; Miner raises if generation.sample_rate differs.
+_VOCENCE_OUTPUT_HZ = 24000
+def _resample_to_hz_mono_f32(waveform: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
+    """Return ``waveform`` as mono float32 resampled from ``orig_sr`` to ``target_sr`` Hz.
+
+    Multi-dimensional input is averaged over its last axis *before* the rate check, so
+    the result is 1-D even when no resampling is needed. Resampling itself is delegated
+    to ``librosa.resample``, imported only when the rates differ.
+    """
+    y = np.asarray(waveform, dtype=np.float32)
+    if y.ndim > 1:
+        y = np.mean(y, axis=-1).astype(np.float32)
+    # Nothing to do for matching rates or for an empty signal.
+    if int(orig_sr) == int(target_sr) or y.size == 0:
+        return y
+    import librosa
+    return librosa.resample(y, orig_sr=int(orig_sr), target_sr=int(target_sr)).astype(np.float32)
+def load_tts_inference_engine(
+    *,
+    llama_checkpoint_path: str,
+    decoder_checkpoint_path: str,
+    decoder_config_name: str = "modded_dac_vq",
+    device: str = "cuda",
+    half: bool = False,
+    compile_model: bool = False,
+) -> Any:
+    """Build a ``ModelManager`` in ``"tts"`` mode and return its ``tts_inference_engine``.
+
+    All arguments are keyword-only and are forwarded to ``ModelManager``;
+    ``compile_model`` is passed under the name ``compile``.
+    """
+    # Imported lazily so this module can be loaded before ``tools`` is importable.
+    from tools.server.model_manager import ModelManager
+    manager_kwargs = dict(
+        mode="tts",
+        device=device,
+        half=half,
+        compile=compile_model,
+        llama_checkpoint_path=llama_checkpoint_path,
+        decoder_checkpoint_path=decoder_checkpoint_path,
+        decoder_config_name=decoder_config_name,
+    )
+    return ModelManager(**manager_kwargs).tts_inference_engine
+def synthesize_wav(
+    engine: Any,
+    *,
+    text: str,
+    reference_audio_path: str | None = None,
+    reference_text: str | None = None,
+    max_new_tokens: int = 1024,
+    chunk_length: int = 200,
+    top_p: float = 0.8,
+    repetition_penalty: float = 1.1,
+    temperature: float = 0.8,
+    seed: int | None = None,
+) -> tuple[int, np.ndarray]:
+    """Run a single non-streaming TTS request and return ``(sample_rate_hz, waveform)``.
+
+    The waveform is float32; multi-dimensional audio is averaged over its last axis.
+    A reference clip and its transcript must be supplied together or not at all.
+
+    Raises:
+        ValueError: only one of ``reference_audio_path`` / ``reference_text`` is set.
+        FileNotFoundError: ``reference_audio_path`` is not an existing file.
+        RuntimeError: the engine reported an error, or no final audio was produced.
+    """
+    from fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest
+    has_clip = bool(reference_audio_path)
+    has_transcript = bool(reference_text)
+    if has_clip != has_transcript:
+        raise ValueError("provide both reference_audio_path and reference_text, or neither")
+    references: list[ServeReferenceAudio] = []
+    if has_clip:
+        clip_file = Path(reference_audio_path)
+        if not clip_file.is_file():
+            raise FileNotFoundError(f"reference audio not found: {clip_file}")
+        references = [
+            ServeReferenceAudio(audio=clip_file.read_bytes(), text=reference_text or "")
+        ]
+    request = ServeTTSRequest(
+        text=text,
+        references=references,
+        reference_id=None,
+        max_new_tokens=max_new_tokens,
+        chunk_length=chunk_length,
+        top_p=top_p,
+        repetition_penalty=repetition_penalty,
+        temperature=temperature,
+        format="wav",
+        streaming=False,
+        seed=seed,
+    )
+    # Stop at the first "final" result that carries audio; any "error" result aborts.
+    final_payload = None
+    for item in engine.inference(request):
+        code = item.code
+        if code == "error":
+            raise RuntimeError(str(item.error or "unknown inference error"))
+        if code == "final" and item.audio is not None:
+            final_payload = item.audio
+            break
+    if final_payload is None:
+        raise RuntimeError("no audio produced")
+    rate, samples = final_payload
+    if rate is None or samples is None:
+        raise RuntimeError("no audio produced")
+    mono = np.asarray(samples, dtype=np.float32)
+    if mono.ndim > 1:
+        mono = np.mean(mono, axis=-1).astype(np.float32)
+    return int(rate), mono
+def _read_vocence_yaml(repo: Path) -> dict[str, Any]:
+    """Load ``repo/vocence_config.yaml``; return ``{}`` if it is missing or not a mapping."""
+    config_file = repo / _VOCENCE_YAML
+    if not config_file.is_file():
+        return {}
+    # PyYAML is only needed once a config file actually exists.
+    import yaml
+    with config_file.open("r", encoding="utf-8") as stream:
+        parsed = yaml.safe_load(stream)
+    if isinstance(parsed, Mapping):
+        return parsed
+    return {}
+def _f32_to_wav_bytes(waveform: np.ndarray, sample_rate: int) -> bytes:
+    """Encode a float waveform as a mono 16-bit PCM WAV file and return its bytes.
+
+    Samples are clipped to ``[-1, 1]`` and scaled by 32767. NaN becomes silence (0) and
+    +/-inf becomes full scale, so the integer cast never sees a non-finite value.
+    """
+    samples = np.nan_to_num(
+        np.asarray(waveform, dtype=np.float32), nan=0.0, posinf=1.0, neginf=-1.0
+    )
+    samples = np.clip(samples, -1.0, 1.0)
+    # WAV PCM is little-endian; "<i2" keeps the byte order right on any host.
+    pcm = (samples * 32767.0).astype("<i2")
+    buf = io.BytesIO()
+    with wave.open(buf, "wb") as wv:
+        wv.setnchannels(1)
+        wv.setsampwidth(2)
+        wv.setframerate(sample_rate)
+        wv.writeframes(pcm.tobytes())
+    return buf.getvalue()
+def _resolve_path(repo: Path, raw: str) -> Path:
+    """Expand ``~`` in ``raw`` and resolve it, anchoring relative paths at ``repo``."""
+    candidate = Path(raw).expanduser()
+    if not candidate.is_absolute():
+        candidate = repo / candidate
+    return candidate.resolve()
+def _hf_token() -> str | None:
+    """Return the Hugging Face token from the environment, or ``None`` if unset.
+
+    ``HF_TOKEN`` takes precedence over ``HUGGING_FACE_HUB_TOKEN``. Each variable is
+    stripped before it is tested, so a blank or whitespace-only ``HF_TOKEN`` no longer
+    hides a valid ``HUGGING_FACE_HUB_TOKEN``.
+    """
+    for var in ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"):
+        value = os.environ.get(var, "").strip()
+        if value:
+            return value
+    return None
+def _weights_dir_for_repo_id(hf_repo: Path, repo_id: str) -> Path:
+    """Map a hub repo id to its download directory under ``hf_repo/_vocence_hf_weights``.
+
+    ``/`` becomes ``__`` and ``:`` becomes ``_`` so the id is a single path component.
+    """
+    folder_name = repo_id.translate(str.maketrans({"/": "__", ":": "_"}))
+    weights_root = hf_repo / "_vocence_hf_weights"
+    return (weights_root / folder_name).resolve()
+def download_runtime_hub_model(
+    hf_repo: Path,
+    repo_id: str,
+    *,
+    revision: str | None = None,
+) -> Path:
+    """Snapshot ``repo_id`` into ``hf_repo/_vocence_hf_weights/<sanitized>/`` and return that path.
+
+    The target directory is created if needed. The token, if any, comes from the environment.
+    """
+    from huggingface_hub import snapshot_download
+    target_dir = _weights_dir_for_repo_id(hf_repo, repo_id)
+    target_dir.mkdir(parents=True, exist_ok=True)
+    download_kwargs = dict(
+        repo_id=repo_id,
+        revision=revision,
+        local_dir=str(target_dir),
+        local_dir_use_symlinks=False,
+        token=_hf_token(),
+    )
+    snapshot_download(**download_kwargs)
+    return target_dir
+def _resolve_checkpoint_path(
+    raw: str | None,
+    *,
+    model_root: Path,
+    hf_repo: Path,
+) -> Path | None:
+    """Turn a configured checkpoint path into an absolute ``Path``.
+
+    Returns ``None`` for a missing or blank value. Absolute paths (after ``~`` expansion)
+    are resolved as given. A relative path is tried under ``model_root`` and then
+    ``hf_repo``; the first that exists wins, otherwise the ``hf_repo`` candidate is returned.
+    """
+    text = "" if raw is None else str(raw).strip()
+    if not text:
+        return None
+    expanded = Path(text).expanduser()
+    if expanded.is_absolute():
+        return expanded.resolve()
+    candidates = [(base / text).resolve() for base in (model_root, hf_repo)]
+    for candidate in candidates:
+        if candidate.exists():
+            return candidate
+    # Nothing exists yet: fall back to the location inside hf_repo.
+    return candidates[-1]
+def _infer_fish_codec_paths(model_root: Path) -> tuple[str, str]:
+    """Locate ``codec.pth`` under ``model_root`` and return ``(its directory, its path)``.
+
+    The shallowest match is chosen. Matches at the same depth are ordered by path string
+    so the choice does not depend on filesystem traversal order.
+
+    Raises:
+        FileNotFoundError: no ``codec.pth`` exists anywhere under ``model_root``.
+    """
+    matches = sorted(model_root.rglob("codec.pth"), key=lambda p: (len(p.parts), str(p)))
+    if not matches:
+        raise FileNotFoundError(
+            f"No codec.pth under {model_root}; set fish_speech.llama_checkpoint_path and "
+            f"fish_speech.decoder_checkpoint_path in {_VOCENCE_YAML}."
+        )
+    codec = matches[0]
+    return str(codec.parent), str(codec)
+def _instruction_pipes_to_brackets(instruction: str) -> str:
+    """Rewrite ``"a | b"`` as ``"[a][b]"``, dropping blank segments; blank input gives ``""``."""
+    tags = []
+    for piece in instruction.strip().split("|"):
+        label = piece.strip()
+        if label:
+            tags.append(f"[{label}]")
+    return "".join(tags)
+def _tts_prompt_from_instruction_and_text(instruction: str, text: str) -> str:
+    """Build the prompt: bracketed instruction tags, one space, then the stripped text.
+
+    Whichever part is empty is omitted together with the separating space.
+    """
+    tags = _instruction_pipes_to_brackets(instruction)
+    body = text.strip()
+    return " ".join(part for part in (tags, body) if part)
+class Miner:
+    """Config-driven wrapper around the TTS inference engine.
+
+    Construction reads ``vocence_config.yaml`` from the given repo directory, optionally
+    downloads weights from the hub, locates the two checkpoints and loads the engine.
+    ``generate_wav`` then turns an instruction/text pair into a waveform at the
+    configured output rate.
+    """
+    def __init__(self, path_hf_repo: Path) -> None:
+        """Load configuration and the inference engine from ``path_hf_repo``.
+
+        Raises:
+            ValueError: ``generation.sample_rate`` differs from ``_VOCENCE_OUTPUT_HZ``.
+            FileNotFoundError: a required ``codec.pth`` cannot be located.
+        """
+        self._repo = Path(path_hf_repo).resolve()
+        cfg = _read_vocence_yaml(self._repo)
+        # Input length caps; a value of 0 disables truncation in generate_wav.
+        limits = cfg.get("limits") or {}
+        self._cap_text = int(limits.get("max_text_chars", 2000))
+        self._cap_instruction = int(limits.get("max_instruction_chars", 600))
+        gen = cfg.get("generation") or {}
+        out_sr = int(gen.get("sample_rate", _VOCENCE_OUTPUT_HZ))
+        # Only the fixed output rate is accepted; anything else is a config error.
+        if out_sr != _VOCENCE_OUTPUT_HZ:
+            raise ValueError(
+                f"generation.sample_rate must be {_VOCENCE_OUTPUT_HZ} (got {out_sr}); "
+                f"edit {self._repo / _VOCENCE_YAML}."
+            )
+        self._output_sr = out_sr
+        fs = cfg.get("fish_speech") or {}
+        rt = cfg.get("runtime") or {}
+        hub_id = (rt.get("hub_model_id") or rt.get("model_id") or "").strip()
+        # Revision precedence: runtime.model_revision, runtime.hub_revision, then the env var.
+        rev_raw = (
+            rt.get("model_revision")
+            or rt.get("hub_revision")
+            or os.environ.get("VOCENCE_MODEL_REVISION")
+            or ""
+        )
+        revision = str(rev_raw).strip() or None
+        # Weights live in the repo itself unless a hub model id is configured.
+        model_root = self._repo
+        if hub_id:
+            model_root = download_runtime_hub_model(self._repo, hub_id, revision=revision)
+        repo_root = (fs.get("repo_root") or os.environ.get("FISH_SPEECH_ROOT") or "").strip()
+        if repo_root:
+            rr = _resolve_path(self._repo, repo_root)
+            # Put the configured source checkout first on sys.path so the lazy imports find it.
+            if rr.is_dir() and str(rr) not in sys.path:
+                sys.path.insert(0, str(rr))
+        llama_raw = (fs.get("llama_checkpoint_path") or os.environ.get("FISH_SPEECH_LLAMA_PATH") or "").strip()
+        dec_raw = (fs.get("decoder_checkpoint_path") or os.environ.get("FISH_SPEECH_DECODER_PATH") or "").strip()
+        llama_path = _resolve_checkpoint_path(llama_raw or None, model_root=model_root, hf_repo=self._repo)
+        dec_path = _resolve_checkpoint_path(dec_raw or None, model_root=model_root, hf_repo=self._repo)
+        # Four cases, depending on which of the two checkpoint paths were configured.
+        if llama_path is not None and dec_path is not None:
+            llama_p, decoder_p = str(llama_path), str(dec_path)
+        elif llama_path is not None and dec_path is None:
+            # Only the llama path is set: use the shallowest codec.pth found beneath it.
+            llama_p = str(llama_path)
+            cand = sorted(Path(llama_p).rglob("codec.pth"), key=lambda x: len(x.parts))
+            if not cand:
+                raise FileNotFoundError(f"No codec.pth under {llama_p}; set fish_speech.decoder_checkpoint_path.")
+            decoder_p = str(cand[0])
+        elif dec_path is not None and llama_path is None:
+            # Only the decoder is set: its parent directory is used as the llama checkpoint.
+            decoder_p = str(dec_path)
+            llama_p = str(dec_path.parent)
+        else:
+            # Neither is set: search model_root for codec.pth.
+            llama_p, decoder_p = _infer_fish_codec_paths(model_root)
+        device = str(fs.get("device") or rt.get("device_preference") or os.environ.get("FISH_SPEECH_DEVICE") or "cuda")
+        half = bool(fs.get("half", False))
+        compile_model = bool(fs.get("compile", False))
+        decoder_config = str(fs.get("decoder_config_name", "modded_dac_vq"))
+        self._engine = load_tts_inference_engine(
+            llama_checkpoint_path=llama_p,
+            decoder_checkpoint_path=decoder_p,
+            decoder_config_name=decoder_config,
+            device=device,
+            half=half,
+            compile_model=compile_model,
+        )
+        # Sampling parameters forwarded to synthesize_wav on every request.
+        self._max_new_tokens = int(fs.get("max_new_tokens", 1024))
+        self._chunk_length = int(fs.get("chunk_length", 200))
+        self._top_p = float(fs.get("top_p", 0.8))
+        self._repetition_penalty = float(fs.get("repetition_penalty", 1.1))
+        self._temperature = float(fs.get("temperature", 0.8))
+        # _seed keeps the raw config value; _seed_i is the int (or None) actually passed on.
+        self._seed = fs.get("seed")
+        self._seed_i: int | None = int(self._seed) if self._seed is not None else None
+        # Snapshot of the resolved setup, returned by get_status().
+        self._meta = {
+            "adapter": str(rt.get("adapter", "finetuned-tts")),
+            "hub_model_id": hub_id or None,
+            "model_revision": revision,
+            "weights_local_dir": str(model_root) if hub_id else None,
+            "llama_checkpoint_path": llama_p,
+            "decoder_checkpoint_path": decoder_p,
+            "device": device,
+            "output_sample_rate": self._output_sr,
+        }
+    def get_status(self) -> dict[str, Any]:
+        """Return the resolved setup metadata plus a fixed ``"tts_engine"`` label."""
+        return {"tts_engine": "finetuned-tts", **self._meta}
+    def warmup(self) -> None:
+        """Run one throwaway synthesis with a neutral instruction; the result is discarded."""
+        self.generate_wav(
+            "gender: neutral | pitch: mid | speed: normal | age_group: adult | "
+            "emotion: neutral | tone: neutral | accent: generic",
+            "Warmup complete.",
+        )
+    def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
+        """Synthesize ``text`` under ``instruction``; return ``(waveform, sample_rate)``.
+
+        Both inputs are truncated to their configured caps first. Note the return order
+        is waveform first, the reverse of ``synthesize_wav``.
+        """
+        t = text[: self._cap_text] if self._cap_text else text
+        ins = instruction[: self._cap_instruction] if self._cap_instruction else instruction
+        prompt = _tts_prompt_from_instruction_and_text(ins, t)
+        sr, wav = synthesize_wav(
+            self._engine,
+            text=prompt,
+            max_new_tokens=self._max_new_tokens,
+            chunk_length=self._chunk_length,
+            top_p=self._top_p,
+            repetition_penalty=self._repetition_penalty,
+            temperature=self._temperature,
+            seed=self._seed_i,
+        )
+        # Convert from the engine's native rate to the configured output rate.
+        wav_out = _resample_to_hz_mono_f32(wav, int(sr), self._output_sr)
+        return wav_out, self._output_sr
+# Miner used by the dev server; set during app startup, None when loading failed or after shutdown.
+_engine: Miner | None = None
+# Data reported by /health: "sample_rate", "adapter" (a JSON string) and, on load failure, "error".
+_health: dict[str, Any] = {}
+def _run_dev_server() -> None:
+    """Serve ``/health`` and ``/speak`` from a local FastAPI app under uvicorn (development use).
+
+    This function parses ``vocence_config.yaml`` in ``REPO`` once and reuses the result
+    for the request-size limits and the health payload. The ``Miner`` is built during app
+    startup; a load failure is recorded in ``_health`` and surfaced by ``/health`` (and
+    as HTTP 503 from ``/speak``) instead of aborting the server. The bind address comes
+    from the ``HOST`` and ``PORT`` environment variables.
+    """
+    import logging
+    from contextlib import asynccontextmanager
+    import uvicorn
+    from fastapi import Body, FastAPI, HTTPException, status
+    from fastapi.responses import Response
+    from pydantic import BaseModel
+    # Single config read shared by everything below (previously re-parsed three times).
+    cfg = _read_vocence_yaml(REPO)
+    limits = cfg.get("limits") or {}
+    max_text = int(limits.get("max_text_chars", 2000))
+    max_inst = int(limits.get("max_instruction_chars", 600))
+    @asynccontextmanager
+    async def _lifespan(_app: Any):
+        global _engine, _health
+        gen = cfg.get("generation") or {}
+        _health = {"sample_rate": int(gen.get("sample_rate", _VOCENCE_OUTPUT_HZ))}
+        try:
+            _engine = Miner(REPO)
+            _health["adapter"] = json.dumps(_engine.get_status())
+        except Exception as e:
+            # Deliberate best-effort: keep serving so /health can report why loading failed.
+            _engine = None
+            _health["adapter"] = json.dumps({"tts_engine": "not loaded"})
+            _health["error"] = f"{type(e).__name__}: {e}"
+        yield
+        # Drop the engine reference on shutdown.
+        _engine = None
+    class HealthResponse(BaseModel):
+        status: str
+        model_loaded: bool
+        sample_rate: int | None = None
+        adapter: str | None = None
+    app = FastAPI(title="Vocence finetuned-tts TTS (dev)", lifespan=_lifespan)
+    @app.get("/health", response_model=HealthResponse)
+    async def health() -> HealthResponse:
+        ok = _engine is not None
+        err = _health.get("error")
+        return HealthResponse(
+            status="healthy" if ok else (f"unhealthy: {err}" if err else "unhealthy"),
+            model_loaded=ok,
+            sample_rate=_health.get("sample_rate"),
+            adapter=_health.get("adapter", "finetuned-tts"),
+        )
+    @app.post("/speak", response_class=Response, response_model=None)
+    async def speak(
+        text: str = Body(..., min_length=1, max_length=max_text, embed=True),
+        instruction: str = Body(..., min_length=1, max_length=max_inst, embed=True),
+    ) -> Response:
+        if _engine is None:
+            raise HTTPException(
+                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+                detail=f"TTS engine not loaded: {_health.get('error', 'unknown')}",
+            )
+        # NOTE(review): generate_wav is a blocking call inside an async handler, so the
+        # event loop is stalled while synthesis runs.
+        waveform, sample_rate = _engine.generate_wav(instruction=instruction, text=text)
+        w = np.asarray(waveform)
+        if w.ndim != 1 or w.size == 0:
+            raise HTTPException(status_code=400, detail="invalid waveform")
+        duration = float(w.shape[0]) / float(sample_rate)
+        # Reject clips longer than _MAX_AUDIO_SEC instead of truncating them.
+        if duration <= 0 or duration > _MAX_AUDIO_SEC:
+            raise HTTPException(status_code=400, detail="invalid duration")
+        return Response(content=_f32_to_wav_bytes(w, int(sample_rate)), media_type="audio/wav")
+    logging.basicConfig(level=logging.INFO)
+    # NOTE(review): the default host 0.0.0.0 listens on all interfaces; set
+    # HOST=127.0.0.1 for local-only use.
+    host = os.environ.get("HOST", "0.0.0.0")
+    port = int(os.environ.get("PORT", "8765"))
+    uvicorn.run(app, host=host, port=port)
+# Running this file as a script starts the development HTTP server.
+if __name__ == "__main__":
+    _run_dev_server()