Instructions to use Gem1832/monkey_06 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Gem1832/monkey_06 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("text-to-speech", model="Gem1832/monkey_06")

# Load model directly
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained("Gem1832/monkey_06", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
| """ | |
| Vocence TTS engine: Qwen3 12Hz checkpoint in the HF repo snapshot. | |
| The chute snapshot is the only weight source: nothing is pulled from an external | |
| model id at inference time. Optional vocence_config.yaml tweaks device, dtype, | |
| attention, and language defaults. | |
| Model load: Miner.__init__ -> _instantiate_qwen() -> Qwen3TTSModel.from_pretrained(repo_path). | |
| Contract (Vocence): | |
| Miner(path_hf_repo: Path) | |
| warmup() -> None | |
| generate_wav(instruction: str, text: str) -> tuple[np.ndarray, int] | |
| """ | |
| from __future__ import annotations | |
| import threading | |
| from pathlib import Path | |
| from typing import Any, Mapping | |
| import numpy as np | |
| _CONFIG_NAME = "config.json" | |
| _VOCENCE_YAML = "vocence_config.yaml" | |
def _merge_vocence_yaml(repo: Path) -> dict[str, Any]:
    """Read the optional Vocence YAML overrides located in the snapshot root.

    Args:
        repo: Directory expected to contain the optional override file.

    Returns:
        The parsed document when the file exists and its top level is a
        mapping; an empty dict when the file is absent or holds anything else.
    """
    config_file = repo / _VOCENCE_YAML
    if config_file.is_file():
        # PyYAML is only needed (and only imported) when the file is present.
        import yaml

        with config_file.open("r", encoding="utf-8") as stream:
            parsed = yaml.safe_load(stream)
        if isinstance(parsed, Mapping):
            return parsed
    return {}
def _ensure_repo_checkpoint(repo: Path) -> Path:
    """Validate that ``repo`` looks like a model snapshot and return it resolved.

    Args:
        repo: Candidate snapshot directory.

    Returns:
        The resolved form of ``repo``.

    Raises:
        FileNotFoundError: if the marker config file is not a regular file
            directly inside the resolved directory.
    """
    root = repo.resolve()
    marker = root / _CONFIG_NAME
    if marker.is_file():
        return root
    raise FileNotFoundError(
        f"Model snapshot incomplete: {marker} missing. "
        "Host the full Qwen3-TTS weights (checkpoint + tokenizers) in this repository."
    )
| def _resolve_compute_device(prefer_cuda: bool) -> str: | |
| import torch | |
| if prefer_cuda and torch.cuda.is_available(): | |
| return "cuda:0" | |
| return "cpu" | |
| def _resolve_torch_dtype(torch, prefer_bf16: bool): | |
| if prefer_bf16 and torch.cuda.is_available(): | |
| return torch.bfloat16 | |
| return torch.float32 | |
def _instantiate_qwen(checkpoint_dir: str, device_map: str, torch_dtype, use_flash2: bool):
    """Load Qwen3TTSModel weights from the local repo directory (HF snapshot path).

    Args:
        checkpoint_dir: Local directory holding the checkpoint.
        device_map: Device string forwarded to ``from_pretrained``.
        torch_dtype: Weight dtype forwarded to ``from_pretrained``.
        use_flash2: Try the ``flash_attention_2`` backend first; when that load
            fails, one retry with ``sdpa`` is made.

    Returns:
        The loaded ``Qwen3TTSModel``.

    Raises:
        Exception: whatever ``from_pretrained`` raises for the ``sdpa`` load.
    """
    from qwen_tts import Qwen3TTSModel

    load_kwargs = dict(
        pretrained_model_name_or_path=checkpoint_dir,
        device_map=device_map,
        dtype=torch_dtype,
        attn_implementation="flash_attention_2" if use_flash2 else "sdpa",
    )
    if not use_flash2:
        # sdpa is already the fallback backend: retrying the identical load
        # would only double the load time and hide the first error.
        return Qwen3TTSModel.from_pretrained(**load_kwargs)

    try:
        return Qwen3TTSModel.from_pretrained(**load_kwargs)
    except Exception as flash_err:  # noqa: BLE001 — best-effort backend fallback
        # Make the downgrade visible instead of silently changing backends.
        print(f"flash_attention_2 load failed ({flash_err!r}); retrying with sdpa.")
        load_kwargs["attn_implementation"] = "sdpa"
        return Qwen3TTSModel.from_pretrained(**load_kwargs)
| def _to_mono_f32(segment: np.ndarray) -> np.ndarray: | |
| x = np.asarray(segment, dtype=np.float32) | |
| if x.ndim > 1: | |
| x = x.mean(axis=1) | |
| return x | |
class Miner:
    """
    Loads the checkpoint from the Hugging Face repo directory Chutes downloaded.
    Synthesis uses natural-language instruction + text (qwen-tts API).

    Contract:
        Miner(path_hf_repo)                 -> validates the snapshot and loads the model
        warmup()                            -> one throwaway synthesis, bounded to 180 s
        generate_wav(instruction, text)     -> (float32 waveform, sample rate)
    """

    def __init__(self, path_hf_repo: Path) -> None:
        """
        Read the optional YAML overrides and load the model from ``path_hf_repo``.

        Config keys read (all optional, defaults in parentheses):
            runtime.device_preference ("cuda"), runtime.dtype ("bfloat16"),
            runtime.use_flash_attention_2 (False), runtime.default_language,
            generation.sample_rate (24000), limits.default_language,
            limits.max_instruction_chars (600), limits.max_text_chars (2000).

        Raises:
            FileNotFoundError: from the snapshot check when the marker config is missing.
        """
        self._root = _ensure_repo_checkpoint(Path(path_hf_repo))
        self._cfg = _merge_vocence_yaml(self._root)

        def _section(name: str) -> Mapping[str, Any]:
            # A section that is missing, null, or not a mapping means "use defaults".
            value = self._cfg.get(name)
            return value if isinstance(value, Mapping) else {}

        rt = _section("runtime")
        gen = _section("generation")
        lim = _section("limits")

        # limits.default_language wins over runtime.default_language; a null or
        # empty value in either place falls through to "English" (previously a
        # null runtime value became the literal string "None").
        language = lim.get("default_language") or rt.get("default_language") or "English"
        self._language = str(language)
        self._output_sr = int(gen.get("sample_rate", 24000))
        self._cap_instruction = int(lim.get("max_instruction_chars", 600))
        self._cap_text = int(lim.get("max_text_chars", 2000))

        prefer_cuda = str(rt.get("device_preference", "cuda")).lower() == "cuda"
        want_bf16 = str(rt.get("dtype", "bfloat16")).lower() == "bfloat16"
        flash = bool(rt.get("use_flash_attention_2", False))

        import torch

        device_map = _resolve_compute_device(prefer_cuda)
        # bf16 is only meant for the GPU path: when the resolved device is the
        # CPU (e.g. device_preference: cpu on a CUDA host) keep float32.
        torch_dtype = _resolve_torch_dtype(torch, want_bf16 and device_map != "cpu")
        ckpt = str(self._root)
        self._tts = _instantiate_qwen(ckpt, device_map, torch_dtype, flash)
        # Qwen3TTSModel is a thin wrapper, not nn.Module — no .eval()
        print("Qwen3-TTS checkpoint ready (loaded from repo snapshot).")

    def __repr__(self) -> str:
        """Return a fixed, human-readable tag for this miner."""
        return "Miner(qwen3-tts-local, local_snapshot=True)"

    def warmup(self) -> None:
        """Force one cheap synthesis on a background thread (startup SLAs).

        Waits at most 180 seconds for the synthesis to finish.

        Raises:
            RuntimeError: if the synthesis raised (chained to the original
                exception) or did not finish within the time limit.
        """
        completed = threading.Event()
        failures: list[BaseException] = []

        def _once() -> None:
            try:
                self.generate_wav(
                    instruction="Clear, neutral delivery.",
                    text="Warmup.",
                )
            except Exception as exc:  # noqa: BLE001 — surface to host
                failures.append(exc)
            else:
                completed.set()

        # Daemon thread: a synthesis that hangs must not block interpreter exit.
        worker = threading.Thread(target=_once, daemon=True)
        worker.start()
        worker.join(timeout=180.0)

        if completed.is_set():
            return
        if failures:
            cause = failures[0]
            # Keep the original exception attached; fall back to repr() so an
            # empty message is not mistaken for a timeout.
            raise RuntimeError(str(cause) or repr(cause)) from cause
        raise RuntimeError("warmup exceeded 180s")

    def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
        """
        Synthesize ``text`` in the voice described by ``instruction``.

        Both inputs are truncated to the configured character caps; a cap of
        zero or less disables truncation for that input.

        Returns:
            ``(waveform, sample_rate)`` — the first generated waveform passed
            through ``_to_mono_f32`` and the sample rate reported by the model.

        Raises:
            ValueError: if the model returns no waveforms or the first one is None.
        """
        if self._cap_instruction > 0:
            instruction = instruction[: self._cap_instruction]
        if self._cap_text > 0:
            text = text[: self._cap_text]

        # Upstream qwen-tts method name (instruct + text -> waveform).
        waves, sr = self._tts.generate_voice_design(
            text=text,
            language=self._language,
            instruct=instruction,
        )
        # Explicit emptiness test: plain truthiness is ambiguous for array-like
        # results and would raise an unrelated error.
        if waves is None or len(waves) == 0:
            raise ValueError("TTS generation returned no audio")
        first = waves[0]
        if first is None:
            raise ValueError("TTS generation returned empty channel")
        return _to_mono_f32(first), int(sr)