minerTTS / miner.py

Upload miner.py with huggingface_hub

485e837 verified 5 days ago

4.77 kB

	from __future__ import annotations

	from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeout
	from pathlib import Path
	from typing import Any

	import numpy as np


	# File name of the Vocence settings YAML, looked up inside the snapshot directory.
	VOCENCE_CONFIG = "vocence_config.yaml"
	# File that must exist in the snapshot directory before the miner will load.
	QWEN_ANCHOR = "config.json"
	# Default time limit, in seconds, for the warmup synthesis call.
	WARMUP_SECONDS = 180.0


	def _load_yaml(path: Path) -> dict[str, Any]:
	if not path.is_file():
	return {}
	from yaml import safe_load
	with path.open("r", encoding="utf-8") as fh:
	return safe_load(fh) or {}


	def _select_device(prefer_cuda: bool):
	import torch
	has_cuda = torch.cuda.is_available()
	device = "cuda:0" if (prefer_cuda and has_cuda) else "cpu"
	return device, torch, has_cuda


	def _select_dtype(torch_mod, want_bf16: bool, has_cuda: bool):
	return torch_mod.bfloat16 if (want_bf16 and has_cuda) else torch_mod.float32


	def _build_qwen(model_name: str, device: str, dtype: Any, attn: str):
	    """Load a ``Qwen3TTSModel`` through its ``from_pretrained`` constructor.

	    ``model_name`` is forwarded as ``pretrained_model_name_or_path``,
	    ``device`` as ``device_map``, ``dtype`` as ``dtype`` and ``attn`` as
	    ``attn_implementation``. Any exception from the loader propagates.
	    """
	    from qwen_tts import Qwen3TTSModel

	    load_kwargs = {
	        "pretrained_model_name_or_path": model_name,
	        "device_map": device,
	        "dtype": dtype,
	        "attn_implementation": attn,
	    }
	    return Qwen3TTSModel.from_pretrained(**load_kwargs)


	def _attn_order(prefer_flash: bool) -> tuple[str, ...]:
	return ("flash_attention_2", "sdpa") if prefer_flash else ("sdpa",)


	def _mono_pcm(arr: Any) -> np.ndarray:
	wave = np.asarray(arr, dtype=np.float32)
	return wave.mean(axis=1) if wave.ndim > 1 else wave


	def _settings(snapshot: Path) -> dict[str, Any]:
	    """Build the flat runtime settings dict from the snapshot's YAML config.

	    Reads ``snapshot / VOCENCE_CONFIG`` and pulls values from its
	    ``runtime``, ``generation`` and ``limits`` sections, falling back to the
	    defaults written below when a section or key is absent.

	    Returns a dict with the keys ``model_name``, ``language``,
	    ``sample_rate``, ``cap_instruct``, ``cap_text``, ``prefer_cuda``,
	    ``prefer_bf16`` and ``prefer_flash``.

	    Raises:
	        KeyError: if the config has no ``model_name`` (including the case
	            where the config file does not exist) or the value is null.
	    """
	    config_path = snapshot / VOCENCE_CONFIG
	    raw = _load_yaml(config_path)
	    rt = raw.get("runtime") or {}
	    gen = raw.get("generation") or {}
	    lim = raw.get("limits") or {}

	    # Fail with a message that names the file instead of a bare
	    # KeyError('model_name'); a null value would otherwise turn into the
	    # literal model name "None".
	    model_name = raw.get("model_name")
	    if model_name is None:
	        raise KeyError(f"model_name missing from {config_path}")

	    return {
	        "model_name": str(model_name),
	        # ``limits`` takes precedence over ``runtime`` for the language.
	        "language": str(lim.get("default_language") or rt.get("default_language") or "English"),
	        "sample_rate": int(gen.get("sample_rate", 24000)),
	        "cap_instruct": int(lim.get("max_instruction_chars", 600)),
	        "cap_text": int(lim.get("max_text_chars", 2000)),
	        "prefer_cuda": str(rt.get("device_preference", "cuda")).lower() == "cuda",
	        "prefer_bf16": str(rt.get("dtype", "bfloat16")).lower() == "bfloat16",
	        "prefer_flash": bool(rt.get("use_flash_attention_2", False)),
	    }


	class Miner:

	def __init__(self, path_hf_repo: Path) -> None:
	snapshot = Path(path_hf_repo).resolve()
	if not (snapshot / QWEN_ANCHOR).is_file():
	raise FileNotFoundError(f"snapshot missing {QWEN_ANCHOR}: {snapshot}")
	self.snapshot = snapshot
	self.cfg = _settings(snapshot)
	model_name = self.cfg["model_name"]

	device, torch_mod, has_cuda = _select_device(self.cfg["prefer_cuda"])
	dtype = _select_dtype(torch_mod, self.cfg["prefer_bf16"], has_cuda)

	last_err: BaseException \| None = None
	engine = None
	for attn in _attn_order(self.cfg["prefer_flash"]):
	try:
	engine = _build_qwen(model_name, device, dtype, attn)
	tag = "bf16" if self.cfg["prefer_bf16"] and has_cuda else "fp32"
	print(f"[Miner] qwen3-tts ready: model={model_name} device={device} dtype={tag} attn={attn}")
	break
	except Exception as exc:
	last_err = exc
	if engine is None:
	raise RuntimeError(f"qwen3-tts load failed: {last_err!r}")
	self.engine = engine

	def __repr__(self) -> str:
	return f"<Miner model={self.cfg['model_name']!r} lang={self.cfg['language']!r}>"

	def warmup(self) -> None:
	instruct = (
	"An adult female with an American accent, speaking at a normal pace "
	"in a mid-range pitch with a neutral emotional tone."
	)
	with ThreadPoolExecutor(max_workers=1) as pool:
	future = pool.submit(self.generate_wav, instruct, "Warmup phrase for inference.")
	try:
	future.result(timeout=WARMUP_SECONDS)
	except FutureTimeout:
	raise RuntimeError(f"Miner warmup exceeded {WARMUP_SECONDS}s")

	def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
	"""Synthesize mono float32 PCM.

	Vocence requires `instruction` and `text` to be passed verbatim to the model.
	Do not rewrite, enrich, or reformat either string.
	"""
	cap_i = self.cfg["cap_instruct"]
	cap_t = self.cfg["cap_text"]
	instruct = instruction[:cap_i] if cap_i > 0 else instruction
	body = text[:cap_t] if cap_t > 0 else text

	wavs, sr = self.engine.generate_voice_design(
	text=body,
	instruct=instruct,
	language=self.cfg["language"],
	)
	if not wavs or wavs[0] is None:
	raise ValueError("qwen3-tts returned no audio")
	return _mono_pcm(wavs[0]), int(sr)