# tts_engine_model / miner.py
# Uploaded by arwin0727 via huggingface_hub (commit 79a876c, verified)
from __future__ import annotations
import threading
from functools import cached_property
from pathlib import Path
from types import SimpleNamespace
from typing import Any
import numpy as np
class Miner:
    """Wraps a locally-downloaded qwen3-tts checkpoint and exposes
    text-to-speech generation as float32 mono waveforms.
    """

    # File that must exist at the repo root for it to count as a checkpoint.
    REPO_SENTINEL = "config.json"
    # Optional runtime/generation/limits settings file inside the repo.
    SETTINGS_FILE = "vocence_config.yaml"
    # Hard ceiling (seconds) on the first trial generation in warmup().
    WARMUP_TIMEOUT = 180.0

    def __init__(self, path_hf_repo: Path) -> None:
        """Validate the repo folder and eagerly build settings and model.

        Raises:
            FileNotFoundError: if REPO_SENTINEL is missing from the repo.
        """
        self.root = Path(path_hf_repo).resolve()
        if not (self.root / self.REPO_SENTINEL).is_file():
            raise FileNotFoundError(f"{self.REPO_SENTINEL} not present in {self.root}")
        # Touch the cached properties now so config/load failures surface at
        # construction time instead of on the first generation request.
        _ = self.settings
        _ = self.model

    def __repr__(self) -> str:
        return f"<Miner root={self.root.name} language={self.settings.language!r}>"

    @cached_property
    def settings(self) -> SimpleNamespace:
        """Parse SETTINGS_FILE (if present) into a flat options namespace.

        A missing file or missing keys fall back to the defaults below.
        """
        raw = self._load_yaml(self.root / self.SETTINGS_FILE)
        rt = raw.get("runtime") or {}
        gen = raw.get("generation") or {}
        lim = raw.get("limits") or {}
        return SimpleNamespace(
            # NOTE(review): "limits" takes precedence over "runtime" for the
            # default language — confirm that ordering is intentional.
            language=str(lim.get("default_language") or rt.get("default_language") or "English"),
            sample_rate=int(gen.get("sample_rate", 24000)),
            max_instruction_chars=int(lim.get("max_instruction_chars", 600)),
            max_text_chars=int(lim.get("max_text_chars", 2000)),
            prefer_cuda=str(rt.get("device_preference", "cuda")).lower() == "cuda",
            prefer_bf16=str(rt.get("dtype", "bfloat16")).lower() == "bfloat16",
            prefer_flash=bool(rt.get("use_flash_attention_2", False)),
        )

    @cached_property
    def model(self) -> Any:
        """Lazily-constructed qwen3-tts engine (see _instantiate_engine)."""
        return self._instantiate_engine()

    def warmup(self) -> None:
        """Run one short generation to prime the model, with a hard timeout.

        The trial runs in a daemon thread so a hung backend cannot block the
        caller forever. If the timeout elapses the thread is abandoned (not
        killed) and may still be running in the background.

        Raises:
            RuntimeError: if the trial did not finish within WARMUP_TIMEOUT.
        """
        outcome: dict[str, Any] = {"done": False, "err": None}

        def _trial() -> None:
            try:
                self.generate_wav(instruction="Neutral voice.", text="Warming up.")
                outcome["done"] = True
            except Exception as exc:
                outcome["err"] = repr(exc)

        worker = threading.Thread(target=_trial, daemon=True)
        worker.start()
        worker.join(timeout=self.WARMUP_TIMEOUT)
        if not outcome["done"]:
            raise RuntimeError(
                f"warmup did not complete within {self.WARMUP_TIMEOUT}s: {outcome['err'] or 'no completion signal'}"
            )

    def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
        """Synthesize speech for *text* styled by *instruction*.

        Both inputs are truncated to the configured character limits (a
        limit <= 0 disables truncation). Returns a float32 mono waveform and
        its sample rate; multi-channel output is averaged down to mono.

        Raises:
            ValueError: if the engine returns no audio.
        """
        s = self.settings
        prompt = instruction[: s.max_instruction_chars] if s.max_instruction_chars > 0 else instruction
        body = text[: s.max_text_chars] if s.max_text_chars > 0 else text
        wavs, sample_rate = self.model.generate_voice_design(
            text=body,
            instruct=prompt,
            language=s.language,
        )
        # len()-based emptiness check also works when the backend hands back
        # a numpy array, where bare truthiness (`not wavs`) would raise.
        if wavs is None or len(wavs) == 0 or wavs[0] is None:
            raise ValueError("qwen3-tts produced no audio")
        wave = np.asarray(wavs[0], dtype=np.float32)
        if wave.ndim > 1:
            wave = wave.mean(axis=1)  # collapse channels -> mono
        return wave, int(sample_rate)

    def _instantiate_engine(self) -> Any:
        """Load the qwen3-tts model.

        Prefers CUDA and bf16 when configured AND a GPU is available; tries
        flash_attention_2 first when requested, falling back to sdpa.

        Raises:
            RuntimeError: if every attention-implementation attempt fails;
                the last underlying failure is chained as the cause.
        """
        import torch
        from qwen_tts import Qwen3TTSModel

        s = self.settings
        cuda_ready = bool(torch.cuda.is_available())
        device_map = "cuda:0" if (s.prefer_cuda and cuda_ready) else "cpu"
        # bf16 is only used on GPU here; CPU inference stays in fp32.
        torch_dtype = torch.bfloat16 if (s.prefer_bf16 and cuda_ready) else torch.float32
        attempts = ("flash_attention_2", "sdpa") if s.prefer_flash else ("sdpa",)
        last_failure: BaseException | None = None
        for attn in attempts:
            try:
                engine = Qwen3TTSModel.from_pretrained(
                    pretrained_model_name_or_path=str(self.root),
                    device_map=device_map,
                    dtype=torch_dtype,
                    attn_implementation=attn,
                )
            except Exception as exc:
                last_failure = exc
                continue
            dtype_tag = "bf16" if torch_dtype is torch.bfloat16 else "fp32"
            print(f"[Miner] qwen3-tts ready :: device={device_map} dtype={dtype_tag} attn={attn}")
            return engine
        # Chain the underlying exception so the real cause is not lost.
        raise RuntimeError(f"qwen3-tts failed to load :: {last_failure!r}") from last_failure

    @staticmethod
    def _load_yaml(path: Path) -> dict[str, Any]:
        """Read a YAML mapping; return {} when the file is absent or empty.

        Raises:
            TypeError: if the file parses to something other than a mapping
                (previously this surfaced later as an opaque AttributeError).
        """
        if not path.is_file():
            return {}
        from yaml import safe_load  # deferred: pyyaml only needed if a config exists
        with path.open("r", encoding="utf-8") as fh:
            data = safe_load(fh)
        if data is None:
            return {}
        if not isinstance(data, dict):
            raise TypeError(f"{path} must contain a YAML mapping, got {type(data).__name__}")
        return data