arwin0727
/

tts_engine_v1

Safetensors

Qwen3-TTS

Model card Files Files and versions

xet

Community

arwin0727 committed 21 days ago

Commit

019ac4f

verified ·

1 Parent(s): 4aa7268

Upload miner.py with huggingface_hub

Browse files

Files changed (1) hide show

miner.py +222 -430

miner.py CHANGED Viewed

@@ -123,18 +123,6 @@ def _vocence_install_sox_stub() -> None:
 _vocence_install_sox_stub()
-# ---------------------------------------------------------------------------
-# In-process `onnxruntime` stub.
-#
-# qwen_tts/core/tokenizer_25hz/vq/speech_vq.py also does a top-level
-# `import onnxruntime`. The XVectorExtractor it imports is instantiated
-# during Qwen3TTSModel.from_pretrained (it loads `campplus.onnx` for speaker
-# x-vector extraction), but the InferenceSession is only *run* when encoding
-# a reference voice clip for voice-cloning. Our /speak API is instruction+
-# text only, so the session is created but never run. We provide a stub that
-# accepts construction and exposes the minimal SessionOptions / InferenceSession
-# surface used in __init__, but raises if run() is ever called.
-# ---------------------------------------------------------------------------
 def _vocence_install_onnxruntime_stub() -> None:
     if "onnxruntime" in sys.modules:
         return
@@ -200,7 +188,6 @@ _VALIDATOR_WEIGHTS: dict[str, float] = {
 }
 DEFAULT_HUB_MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign"
-OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B"
 _MIN_DURATION_SEC = 0.15
 _MAX_DURATION_SEC = 29.5
@@ -268,17 +255,6 @@ def _read_vocence_yaml(repo: Path) -> dict[str, Any]:
     return data if isinstance(data, Mapping) else {}
-def _merge_default_instruction(default: str, user: str) -> str:
-    """Prepend default style; duplicate keys in ` | ` parse order: user segment wins (see _parse_instruction)."""
-    d = (default or "").strip()
-    u = (user or "").strip()
-    if not d:
-        return u
-    if not u:
-        return d
-    return f"{d} | {u}"
 def _is_hub_model_id(s: str) -> bool:
     t = (s or "").strip()
     if not t or t[0] in ("/", ".", "~") or "\\" in t or "\n" in t or ".." in t:
@@ -423,125 +399,6 @@ def _parse_instruction(instruction: str) -> dict[str, str]:
     return result
-_GENDER_PHRASE = {
-    "male": "male", "female": "female", "neutral": "gender-neutral",
-}
-_PITCH_PHRASE = {
-    "low": "deep low-pitched voice", "mid": "medium natural pitch", "high": "high-pitched voice",
-}
-_SPEED_PHRASE = {
-    "slow": "slow deliberate pace", "normal": "natural conversational pace", "fast": "brisk fast pace",
-}
-_AGE_PHRASE = {
-    "child": "child", "young_adult": "young adult", "adult": "adult", "senior": "elderly senior",
-}
-_EMOTION_PHRASE = {
-    "neutral": "neutral composed delivery",
-    "happy": "cheerful happy upbeat warm",
-    "sad": "sorrowful sad subdued downcast",
-    "angry": "firm angry forceful assertive tense",
-    "calm": "calm relaxed measured peaceful unhurried",
-    "excited": "excited enthusiastic energetic lively",
-    "serious": "serious grave deliberate weighty",
-    "fearful": "nervous fearful hesitant trembling",
-}
-_TONE_PHRASE = {
-    "warm": "warm", "cold": "cold detached", "friendly": "friendly",
-    "formal": "formal", "casual": "casual", "authoritative": "authoritative commanding",
-}
-_ACCENT_PHRASE = {
-    "us": "standard American English accent with rhotic r sounds",
-    "uk": "standard British English accent with non-rhotic received pronunciation",
-    "au": "Australian English accent",
-    "in": "Indian English accent",
-    "neutral": "neutral international English accent",
-    "other": "non-native English accent",
-}
-def _build_natural_preamble(parsed: dict[str, str]) -> str:
-    gender = _GENDER_PHRASE.get(parsed.get("gender", ""), "")
-    age = _AGE_PHRASE.get(parsed.get("age_group", ""), "")
-    pitch = _PITCH_PHRASE.get(parsed.get("pitch", ""), "")
-    speed = _SPEED_PHRASE.get(parsed.get("speed", ""), "")
-    emotion = _EMOTION_PHRASE.get(parsed.get("emotion", ""), "")
-    tone = _TONE_PHRASE.get(parsed.get("tone", ""), "")
-    accent = _ACCENT_PHRASE.get(parsed.get("accent", ""), "")
-    parts: list[str] = []
-    # Gender-first to avoid timbre drift on emotion-heavy prompts
-    identity = " ".join(p for p in [gender, age] if p)
-    if identity:
-        parts.append(f"a {identity} voice")
-    if emotion:
-        parts.append(emotion)
-    if accent:
-        parts.append(f"speaking with a {accent}")
-    if pitch:
-        parts.append(pitch)
-    if speed:
-        parts.append(speed)
-    if tone:
-        parts.append(f"{tone} tone")
-    if not parts:
-        return ""
-    preamble = "Speak as " + ", ".join(parts) + "."
-    return preamble + " Use natural human prosody with realistic breath placement and varied intonation."
-def _enhance_instruction(instruction: str) -> str:
-    parsed = _parse_instruction(instruction)
-    if not parsed:
-        return instruction
-    preamble = _build_natural_preamble(parsed)
-    if not preamble:
-        return instruction
-    return f"{preamble} {instruction}"
-_NUM_WORDS = {
-    "0": "zero", "1": "one", "2": "two", "3": "three", "4": "four",
-    "5": "five", "6": "six", "7": "seven", "8": "eight", "9": "nine",
-    "10": "ten", "11": "eleven", "12": "twelve", "13": "thirteen",
-    "14": "fourteen", "15": "fifteen", "16": "sixteen", "17": "seventeen",
-    "18": "eighteen", "19": "nineteen", "20": "twenty", "30": "thirty",
-    "40": "forty", "50": "fifty", "60": "sixty", "70": "seventy",
-    "80": "eighty", "90": "ninety", "100": "one hundred",
-}
-_ABBREV = {
-    "Mr.": "Mister", "Mrs.": "Missus", "Dr.": "Doctor", "St.": "Saint",
-    "etc.": "et cetera", "vs.": "versus", "approx.": "approximately",
-    "dept.": "department", "govt.": "government", "mgr.": "manager",
-}
-def _normalize_text_for_tts(text: str) -> str:
-    import re
-    # Expand known abbreviations
-    for abbr, expansion in _ABBREV.items():
-        text = text.replace(abbr, expansion)
-    # Expand $N / £N / €N → "N dollars/pounds/euros"
-    text = re.sub(r'\$(\d+)', lambda m: f"{_NUM_WORDS.get(m.group(1), m.group(1))} dollars", text)
-    text = re.sub(r'£(\d+)', lambda m: f"{_NUM_WORDS.get(m.group(1), m.group(1))} pounds", text)
-    text = re.sub(r'€(\d+)', lambda m: f"{_NUM_WORDS.get(m.group(1), m.group(1))} euros", text)
-    # Expand standalone small integers (not part of larger numbers)
-    text = re.sub(
-        r'\b(\d{1,2})\b',
-        lambda m: _NUM_WORDS.get(m.group(1), m.group(1)),
-        text,
-    )
-    # Add comma pause before coordinating conjunctions in long sentences
-    text = re.sub(r'(?<!\,)\s+(but|however|although|though|yet)\s+', r', \1 ', text, flags=re.IGNORECASE)
-    return text.strip()
 def _score_wer(wav: np.ndarray, sr: int, target_text: str, whisper_model: Any) -> float:
     if whisper_model is None:
         return 0.5
@@ -570,54 +427,6 @@ def _score_wer(wav: np.ndarray, sr: int, target_text: str, whisper_model: Any) -
         return 0.5
-def _try_load_punct_model() -> Any:
-    """
-    Load deepmultilingualpunctuation PunctuationModel for restoring commas/periods
-    to unpunctuated input text before TTS synthesis. Improves prosody and naturalness
-    dramatically for run-on text (e.g. raw literary/OCR input).
-    Install: pip install deepmultilingualpunctuation
-    Returns the model on success, None if unavailable.
-    """
-    try:
-        from deepmultilingualpunctuation import PunctuationModel  # type: ignore[import]
-        model = PunctuationModel()
-        print("[miner] PunctuationModel loaded for text pre-processing", flush=True)
-        return model
-    except Exception as e:
-        print(f"[miner] PunctuationModel unavailable ({e}); punctuation restoration skipped", flush=True)
-        return None
-def _restore_punctuation(text: str, punct_model: Any) -> str:
-    """
-    Restore punctuation to text that lacks commas/periods.
-    Only applies the model when the text appears to lack punctuation
-    (fewer than 1 punctuation mark per 80 characters), so already
-    well-punctuated inputs are passed through unchanged.
-    Falls back to original text on any error.
-    """
-    if punct_model is None:
-        return text
-    stripped = text.strip()
-    if not stripped:
-        return text
-    punct_chars = sum(1 for c in stripped if c in ".,:;!?")
-    density = punct_chars / max(len(stripped), 1)
-    if density >= 1 / 80:
-        return text
-    try:
-        result: str = punct_model.restore_punctuation(stripped)
-        print(
-            f"[miner] punctuation restored: {len(stripped)}→{len(result)} chars",
-            flush=True,
-        )
-        return result
-    except Exception as e:
-        print(f"[miner] punctuation restoration failed ({e}); using original", flush=True)
-        return text
 _VOICE_TRAIT_ENUMS: dict[str, list[str]] = {
     "gender":    ["male", "female", "neutral"],
     "pitch":     ["low", "mid", "high"],
@@ -629,181 +438,200 @@ _VOICE_TRAIT_ENUMS: dict[str, list[str]] = {
 }
 _ORDINAL_TRAITS = {"pitch", "speed", "age_group"}
-_AI_COMPARE_SYSTEM = """You are an expert TTS evaluator. Analyze each provided audio candidate and rate it against the target instruction and target text.
-For EACH candidate output these fields:
-- transcription: exact words spoken, lowercased (string)
-- gender: one of [male, female, neutral]
-- pitch: one of [low, mid, high]
-- speed: one of [slow, normal, fast]
-- age_group: one of [child, young_adult, adult, senior]
-- emotion: one of [neutral, happy, sad, angry, calm, excited, serious, fearful]
-- tone: one of [warm, cold, friendly, formal, casual, authoritative]
-- accent: one of [us, uk, au, in, neutral, other]
-- naturalness_score: integer 1-5 (1=robotic, 5=indistinguishable from human)
-Then set "best" to the 0-based index of the candidate that best matches the instruction and sounds most natural.
-Return ONLY valid JSON in this exact shape (no markdown, no commentary):
-{"candidates": [{"transcription":"...","gender":"...","pitch":"...","speed":"...","age_group":"...","emotion":"...","tone":"...","accent":"...","naturalness_score":4}], "best": 0}"""
-class OmniAudioJudge:
-    """
-    Local audio judge using Qwen2.5-Omni-7B. No external API.
-    Mimics GPT-4o-audio-preview trait extraction + comparative naturalness ranking.
-    """
-    def __init__(self, model_id: str = OMNI_MODEL_ID) -> None:
-        self._model_id = model_id
-        self._model = None
-        self._processor = None
-        self._device = "cpu"
-        self._dtype = None
-        self._api_ok = False
-        self._api_error: str = ""
-        self._load()
-    def _load(self) -> None:
         try:
             import torch
-            from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
-            self._device = "cuda:0" if torch.cuda.is_available() else "cpu"
-            self._dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-            print(f"[miner] scorer-1: loading {self._model_id} on {self._device} ({self._dtype})...", flush=True)
-            self._processor = Qwen2_5OmniProcessor.from_pretrained(self._model_id)
-            try:
-                self._model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
-                    self._model_id,
-                    dtype=self._dtype,
-                    device_map=self._device,
-                    attn_implementation="flash_attention_2",
-                )
-            except Exception:
-                self._model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
-                    self._model_id,
-                    dtype=self._dtype,
-                    device_map=self._device,
-                    attn_implementation="sdpa",
-                )
             self._model.eval()
-            self._api_ok = True
-            print(f"[miner] scorer-1: Omni judge ready ({self._model_id})", flush=True)
         except Exception as e:
-            self._api_error = f"load_failed: {e}"
-            print(f"[miner] scorer-1: Omni judge load FAILED ({e})", flush=True)
-    @staticmethod
-    def _to_16k_mono(wav: np.ndarray, sr: int) -> np.ndarray:
         import librosa
-        x = wav.astype(np.float32)
-        if x.ndim > 1:
-            x = x.mean(axis=1)
-        if sr != 16000:
-            x = librosa.resample(x, orig_sr=sr, target_sr=16000)
-        return x.astype(np.float32)
-    def judge_candidates(
-        self,
-        candidates: list[tuple[np.ndarray, int]],
-        target_text: str,
-        instruction: str,
-    ) -> tuple[int, list[dict[str, Any]]]:
-        """
-        Send all candidates in one Omni call. Returns (best_index, trait_list).
-        """
-        import json as _json
-        import torch
-        audios_16k = [self._to_16k_mono(w, sr) for w, sr in candidates]
-        content: list[dict] = []
-        for i in range(len(candidates)):
-            content.append({"type": "text", "text": f"Candidate {i}:"})
-            content.append({"type": "audio", "audio": audios_16k[i]})
-        content.append({"type": "text", "text": f"Target instruction: {instruction}"})
-        content.append({"type": "text", "text": f"Target text: {target_text}"})
-        conversation = [
-            {"role": "system", "content": [{"type": "text", "text": _AI_COMPARE_SYSTEM}]},
-            {"role": "user", "content": content},
-        ]
-        text = self._processor.apply_chat_template(
-            conversation, add_generation_prompt=True, tokenize=False
-        )
-        inputs = self._processor(
-            text=text,
-            audio=audios_16k,
-            sampling_rate=16000,
-            return_tensors="pt",
-            padding=True,
-        )
-        inputs = {k: (v.to(self._device) if hasattr(v, "to") else v) for k, v in inputs.items()}
-        with torch.inference_mode():
-            outputs = self._model.generate(
-                **inputs,
-                max_new_tokens=600,
-                do_sample=False,
-                return_audio=False,
-            )
-        in_len = inputs["input_ids"].shape[1] if "input_ids" in inputs else 0
-        gen = outputs[:, in_len:] if in_len else outputs
-        raw = self._processor.batch_decode(
-            gen, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )[0].strip()
-        if "```" in raw:
-            for part in raw.split("```"):
-                p = part.strip()
-                if p.startswith("json"):
-                    p = p[4:].strip()
-                if p.startswith("{"):
-                    raw = p
-                    break
-        start = raw.find("{")
-        end = raw.rfind("}")
-        if start != -1 and end != -1 and end > start:
-            raw = raw[start:end + 1]
-        try:
-            data = _json.loads(raw)
-        except _json.JSONDecodeError:
-            print(f"[miner] scorer-1: JSON parse failed; raw={raw[:300]}", flush=True)
-            data = {}
-        fallbacks = {
-            "gender": "neutral", "pitch": "mid", "speed": "normal", "age_group": "adult",
-            "emotion": "neutral", "tone": "casual", "accent": "neutral",
-        }
-        trait_list: list[dict[str, Any]] = []
-        for raw_c in (data.get("candidates") or []):
-            out: dict[str, Any] = {"transcription": str(raw_c.get("transcription") or "").strip()}
-            for k, enum in _VOICE_TRAIT_ENUMS.items():
-                v = str(raw_c.get(k) or "").strip().lower().replace(" ", "_").replace("-", "_")
-                out[k] = v if v in enum else fallbacks[k]
-            try:
-                out["naturalness_score"] = float(max(1, min(5, int(raw_c.get("naturalness_score", 3)))))
-            except (TypeError, ValueError):
-                out["naturalness_score"] = 3.0
-            trait_list.append(out)
-        while len(trait_list) < len(candidates):
-            trait_list.append({**fallbacks, "transcription": "", "naturalness_score": 3.0})
-        try:
-            ai_best = int(data.get("best", 0))
-            if not (0 <= ai_best < len(candidates)):
-                ai_best = 0
-        except (TypeError, ValueError):
-            ai_best = 0
-        return ai_best, trait_list
 # ---------------------------------------------------------------------------
-# Qwen2-Audio-7B-Instruct scorer (free local fallback)
 # ---------------------------------------------------------------------------
@@ -975,16 +803,11 @@ class Miner:
         self._root = Path(path_hf_repo).resolve()
         cfg = _read_vocence_yaml(self._root)
         runtime = cfg.get("runtime") or {}
-        generation = cfg.get("generation") or {}
         limits = cfg.get("limits") or {}
         self._language = str(runtime.get("default_language", "English"))
         self._cap_instruction = int(limits.get("max_instruction_chars", 600))
         self._cap_text = int(limits.get("max_text_chars", 2000))
-        _di = generation.get("default_instruction")
-        self._default_instruction = (
-            str(_di).strip() if _di is not None and str(_di).strip() else ""
-        )
         _local_root = _local_dir_for_downloads(self._root, runtime)
         _hub = str(runtime.get("hub_model_id", DEFAULT_HUB_MODEL_ID))
@@ -1046,18 +869,13 @@ class Miner:
             print(f"[miner] whisper unavailable ({e}); selection falls back", flush=True)
             self._whisper = None
-        self._punct_model = _try_load_punct_model()
-        # Scorer 1: local Omni judge (Qwen2.5-Omni-7B)
-        try:
-            self._ai: Any = OmniAudioJudge()
-        except Exception as e:
-            print(f"[miner] scorer-1: Omni judge init failed ({e})", flush=True)
-            self._ai = None
-        # Scorer 2: Whisper WER (fallback — already loaded above)
-        active = "Omni-Judge" if (self._ai and self._ai._api_ok) else "Whisper-WER"
-        print(f"[miner] ready: best-of-3 (qwen×3); active scorer: {active}", flush=True)
         wts = _VALIDATOR_WEIGHTS
         print(
             f"[miner] validator weights: script={wts['script']:.2f} nat={wts['naturalness']:.2f} "
@@ -1066,22 +884,15 @@ class Miner:
             f"tone={wts['tone']:.2f}",
             flush=True,
         )
-        if self._default_instruction:
-            print(
-                "[miner] default_instruction: prepended to each request (per-key override: user wins)",
-                flush=True,
-            )
     def __repr__(self) -> str:
-        return "Miner(best-of-3/qwen×3, in_process=True)"
     def get_status(self) -> dict:
-        ai_ok = self._ai is not None and self._ai._api_ok
         whisper_ok = self._whisper is not None
-        active = "omni-judge" if ai_ok else ("whisper-wer" if whisper_ok else "none")
-        ai_status = "ready" if ai_ok else (
-            f"api_error: {self._ai._api_error}" if self._ai is not None else "not configured"
-        )
         cuda_info = "unknown"
         try:
             import torch
@@ -1098,7 +909,7 @@ class Miner:
         tts_device = getattr(self, "_tts_device", "unknown")
         model_on_cuda = isinstance(tts_device, str) and tts_device.startswith("cuda")
         return {
-            "scorer_ai":      ai_status,
             "scorer_whisper": "ready" if whisper_ok else "not available",
             "active_scorer":  active,
             "cuda":           cuda_info,
@@ -1134,32 +945,26 @@ class Miner:
     def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
         text = text[: self._cap_text] if self._cap_text else text
-        inst = _merge_default_instruction(self._default_instruction, instruction)
-        if self._cap_instruction:
-            inst = inst[: self._cap_instruction]
-        # Enrich text: punctuation restoration + number/abbreviation expansion.
-        punct_text = _restore_punctuation(text, self._punct_model)
-        rich_text = _normalize_text_for_tts(punct_text)
-        parsed = _parse_instruction(inst)
-        t0 = time.time()
-        # All candidates use identical inputs and the model's own
-        # generation_config defaults — no temperature/top_p/top_k/rep_penalty/
-        # max_new_tokens overrides. Diversity comes purely from sampling RNG
-        # (Qwen3-TTS-VoiceDesign's default config has do_sample=True, so each
-        # call draws a fresh sample). Mirrors magma90909/vocence_miner_v8,
-        # which only passes text/instruct/language to generate_voice_design.
-        # Phase 1: generate 3 Qwen candidates
         raw_candidates: list[tuple[np.ndarray, int, str]] = []
         first_rejected: tuple[np.ndarray, int] | None = None
         def _qwen(tag: str) -> None:
             nonlocal first_rejected
             kwargs: dict[str, Any] = dict(
-                text=rich_text,
-                instruct=inst,
                 language=self._language,
             )
             gen_t0 = time.time()
@@ -1194,46 +999,33 @@ class Miner:
                 f"all synthesis attempts failed validity in {time.time()-t0:.1f}s{hint}"
             )
-        # Phase 2: AI judge → Whisper fallback
         scores: list[float] = []
-        ai_best = 0
-        best = 0
-        scorer_used = "none"
-        if self._ai is not None and self._ai._api_ok:
-            try:
-                print(f"[scorer-1/omni] judging {len(raw_candidates)} candidates...", flush=True)
-                audio_list = [(w, s) for w, s, _ in raw_candidates]
-                ai_best, trait_list = self._ai.judge_candidates(audio_list, text, inst)
-                for i, (wav, sr_i, tag) in enumerate(raw_candidates):
-                    total, detail = _score_from_traits(trait_list[i], text, parsed)
-                    scores.append(total)
-                    print(
-                        f"[scorer-1/omni][{tag}] score={total:.3f} wer={detail['wer']:.3f} "
-                        f"gp={detail['gp']:.3f} spd={detail['speed']:.3f} "
-                        f"nat={detail['nat']:.3f} age={detail['age']:.2f} "
-                        f"emo={detail['emo']:.2f} tone={detail['tone']:.2f} "
-                        f"accent={detail['accent']:.2f} elapsed={time.time()-t0:.1f}s",
-                        flush=True,
-                    )
-                best = ai_best
-                scorer_used = "omni-judge"
-            except Exception as e:
-                print(f"[scorer-1/omni] failed ({e}); falling back to Whisper...", flush=True)
-        if not scores:
-            print(f"[scorer-2/whisper] judging {len(raw_candidates)} candidates...", flush=True)
-            for wav, sr_i, tag in raw_candidates:
-                total, detail = _score_fallback(wav, sr_i, text, self._whisper)
-                scores.append(total)
-                print(f"[scorer-2/whisper][{tag}] score={total:.3f} wer={detail['wer']:.3f} elapsed={time.time()-t0:.1f}s", flush=True)
-            best = int(np.argmax(scores))
-            scorer_used = "whisper-wer"
-        active_scores = scores
         print(
-            f"[miner] best={raw_candidates[best][2]} score={active_scores[best]:.3f} "
-            f"scorer={scorer_used} total={len(raw_candidates)} elapsed={time.time()-t0:.1f}s",
             flush=True,
         )
         return raw_candidates[best][0], raw_candidates[best][1]

 _vocence_install_sox_stub()
 def _vocence_install_onnxruntime_stub() -> None:
     if "onnxruntime" in sys.modules:
         return
 }
 DEFAULT_HUB_MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign"
 _MIN_DURATION_SEC = 0.15
 _MAX_DURATION_SEC = 29.5
     return data if isinstance(data, Mapping) else {}
 def _is_hub_model_id(s: str) -> bool:
     t = (s or "").strip()
     if not t or t[0] in ("/", ".", "~") or "\\" in t or "\n" in t or ".." in t:
     return result
 def _score_wer(wav: np.ndarray, sr: int, target_text: str, whisper_model: Any) -> float:
     if whisper_model is None:
         return 0.5
         return 0.5
 _VOICE_TRAIT_ENUMS: dict[str, list[str]] = {
     "gender":    ["male", "female", "neutral"],
     "pitch":     ["low", "mid", "high"],
 }
 _ORDINAL_TRAITS = {"pitch", "speed", "age_group"}
+# JIT UTMOS (balacoon/utmos) — torch + hub only; no fairseq stack.
+_UTMOS_JIT_REPO = "balacoon/utmos"
+_UTMOS_JIT_FILENAME = "utmos.jit"
class UtmosJitPredictor:
    """Lazy wrapper around a traced (TorchScript) UTMOS naturalness model.

    The model file is downloaded from the Hugging Face Hub on the first
    successful :meth:`ensure` call. A failed load is remembered, so later
    calls return immediately instead of retrying the download.

    :meth:`predict_mos` returns a MOS-like score clamped to ``[1.0, 5.0]``
    and falls back to ``3.0`` whenever no usable score can be produced.
    """

    # Neutral score returned when the model is unavailable or misbehaves.
    _FALLBACK_MOS = 3.0

    def __init__(self) -> None:
        self._model: Any = None
        self._device: Any = None
        self._ok = False        # set once the model has loaded
        self._failed = False    # set once a load attempt has failed (no retry)
        self._error: str = ""   # repr() of the load exception, if any

    def is_ok(self) -> bool:
        """Return True once the model has been loaded successfully."""
        return self._ok

    def error(self) -> str:
        """Return ``repr`` of the recorded load exception, or ``""``."""
        return self._error

    def ensure(self) -> bool:
        """Load the model if needed and report whether it is usable.

        The Hub token is read from ``HF_TOKEN`` or, failing that,
        ``HUGGING_FACE_HUB_TOKEN``. The model is placed on CUDA when
        available, otherwise on CPU. Any exception during download or
        load is caught, recorded, and turns every later call into a
        cheap ``False``.
        """
        if self._ok:
            return True
        if self._failed:
            return False
        try:
            import torch
            from huggingface_hub import hf_hub_download

            token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
            path = hf_hub_download(
                repo_id=_UTMOS_JIT_REPO,
                filename=_UTMOS_JIT_FILENAME,
                repo_type="model",
                token=token,
            )
            self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            self._model = torch.jit.load(path, map_location=self._device)
            self._model.eval()
            self._ok = True
            print(
                f"[miner] UTMOS JIT loaded ({_UTMOS_JIT_REPO}) on {self._device}",
                flush=True,
            )
        except Exception as e:
            self._failed = True
            self._error = repr(e)
            self._model = None
            print(f"[miner] UTMOS JIT load FAILED: {e}", flush=True)
        return self._ok

    def predict_mos(self, wav: np.ndarray, sr: int) -> float:
        """Return a MOS-like score in ``[1.0, 5.0]`` for one clip.

        Args:
            wav: Audio samples; arrays with more than one dimension are
                averaged over axis 1 (assumes a ``(samples, channels)``
                layout - confirm against the caller).
            sr: Sample rate of ``wav`` in Hz; audio is resampled to 16 kHz.

        Returns:
            The clamped model output, or ``3.0`` if the model is not
            available, inference raises, or the output is not finite.
        """
        if not self.ensure() or self._model is None:
            return self._FALLBACK_MOS
        try:
            import torch

            x = wav.astype(np.float32)
            if x.ndim > 1:
                x = x.mean(axis=1)
            if sr != 16000:
                # librosa is only required when resampling is actually needed.
                import librosa

                x = librosa.resample(x, orig_sr=sr, target_sr=16000)
            # The traced model is fed 16-bit PCM, so clip before scaling.
            x = np.clip(x, -1.0, 1.0)
            xi = (x * 32767.0).astype(np.int16)
            t = torch.as_tensor(xi, device=self._device, dtype=torch.int16).unsqueeze(0)
            with torch.inference_mode():
                out = self._model(t)
            val = float(out.reshape(-1)[0].item())
        except Exception as e:
            print(f"[miner] UTMOS predict failed: {e}", flush=True)
            return self._FALLBACK_MOS
        # max(1.0, min(5.0, nan)) evaluates to 5.0, which would make a broken
        # prediction look like the best candidate; treat it as a failure.
        if not np.isfinite(val):
            print(f"[miner] UTMOS predict failed: non-finite output {val}", flush=True)
            return self._FALLBACK_MOS
        return max(1.0, min(5.0, val))
+def _transcribe_whisper(wav: np.ndarray, sr: int, whisper_model: Any) -> str:
+    if whisper_model is None:
+        return ""
+    try:
         import librosa
+        wav16 = librosa.resample(wav.astype(np.float32), orig_sr=sr, target_sr=16000)
+        result = whisper_model.transcribe(wav16, language="en", fp16=False)
+        return str(result.get("text") or "").strip().lower()
+    except Exception:
+        return ""
+def _mean_fundamental_hz(wav: np.ndarray, sr: int) -> float:
+    import librosa
+    y = wav.astype(np.float32)
+    if y.ndim > 1:
+        y = y.mean(axis=1)
+    if y.size < 256:
+        return 0.0
+    f0 = librosa.yin(y, fmin=50.0, fmax=500.0, sr=sr)
+    v = f0[np.isfinite(f0) & (f0 > 0)]
+    if v.size == 0:
+        return 0.0
+    return float(np.nanmean(v))
def _estimate_pitch_trait(wav: np.ndarray, sr: int) -> str:
    """Bucket the clip's mean F0 into "low", "mid" or "high".

    A mean of 0 Hz or less (no estimate) maps to "mid"; below 130 Hz is
    "low", below 210 Hz is "mid", anything else is "high".
    """
    mean_f0 = _mean_fundamental_hz(wav, sr)
    if mean_f0 <= 0.0 or 130.0 <= mean_f0 < 210.0:
        return "mid"
    return "low" if mean_f0 < 130.0 else "high"
+def _estimate_speed_trait(wav: np.ndarray, sr: int, reference_text: str) -> str:
+    """Speaking rate vs reference word count (coarse slow/normal/fast)."""
+    import re
+    dur = float(wav.shape[0]) / float(sr) if sr else 0.0
+    if dur < 0.05:
+        return "normal"
+    words = re.findall(r"\w+", (reference_text or "").lower())
+    nw = max(len(words), 1)
+    wps = nw / dur
+    if wps < 2.2:
+        return "slow"
+    if wps > 4.0:
+        return "fast"
+    return "normal"
+def _trait_score_without_audio_classifier(expected: str) -> float:
+    """No audio-side classifier for this trait; soft constant if instruction pins it."""
+    e = (expected or "").strip().lower()
+    if not e:
+        return 1.0
+    return 0.85
def _build_traits_non_llm(
    wav: np.ndarray,
    sr: int,
    *,
    validator_text: str,
    parsed: dict[str, str],
    whisper_model: Any,
    utmos: UtmosJitPredictor,
) -> dict[str, Any]:
    """Assemble the trait dict for one candidate without an LLM judge.

    Transcription comes from Whisper, naturalness from UTMOS, and pitch
    and speed from the audio heuristics. The remaining traits are fixed
    placeholder values. ``parsed`` is accepted but not read here.
    """
    traits: dict[str, Any] = {}
    # Measured from the audio.
    traits["transcription"] = _transcribe_whisper(wav, sr, whisper_model)
    traits["naturalness_score"] = float(utmos.predict_mos(wav, sr))
    traits["pitch"] = _estimate_pitch_trait(wav, sr)
    traits["speed"] = _estimate_speed_trait(wav, sr, validator_text)
    # No audio-side classifier exists for these; constant placeholders.
    traits["gender"] = "neutral"
    traits["age_group"] = "adult"
    traits["emotion"] = "neutral"
    traits["tone"] = "casual"
    traits["accent"] = "neutral"
    return traits
def _score_from_traits_non_llm(
    traits: dict[str, Any],
    target_text: str,
    parsed: dict[str, str],
) -> tuple[float, dict[str, float]]:
    """Combine per-element scores into one weighted total plus a breakdown.

    Script score is ``1 - _wer_simple(target, transcription)`` floored at 0.
    Naturalness maps the 1-5 MOS onto 0-1. Pitch and speed are scored with
    ``_ai_score_element`` against the parsed instruction; the other traits
    get the soft prior from ``_trait_score_without_audio_classifier``.
    The total is the ``_VALIDATOR_WEIGHTS``-weighted mean of those scores.

    Returns:
        ``(total, detail)`` where ``detail`` holds the values used for
        logging (``wer``, ``gp``, ``speed``, ``nat``, ``age``, ``emo``,
        ``tone``, ``accent``).
    """
    transcript = traits.get("transcription", "")
    script_score = max(0.0, 1.0 - _wer_simple(target_text, transcript))
    naturalness = (float(traits.get("naturalness_score", 3.0)) - 1.0) / 4.0

    per_element: dict[str, float] = {"script": script_score, "naturalness": naturalness}
    # Traits without an audio-side estimate: soft prior only.
    for name in ("gender", "age_group", "emotion", "tone", "accent"):
        requested = parsed.get(name, "")
        per_element[name] = _trait_score_without_audio_classifier(requested)
    # Traits estimated from the audio: compare against the instruction.
    for name in ("pitch", "speed"):
        requested = parsed.get(name, "")
        observed = traits.get(name, "")
        per_element[name] = _ai_score_element(name, requested, observed)

    weight_total = sum(_VALIDATOR_WEIGHTS.values())
    weighted = sum(
        weight * per_element[name] for name, weight in _VALIDATOR_WEIGHTS.items()
    )
    total = weighted / weight_total

    detail = {
        "wer": script_score,
        "gp": (per_element["gender"] + per_element["pitch"]) / 2.0,
        "speed": per_element["speed"],
        "nat": naturalness,
        "age": per_element["age_group"],
        "emo": per_element["emotion"],
        "tone": per_element["tone"],
        "accent": per_element["accent"],
    }
    return total, detail
 # ---------------------------------------------------------------------------
+# Trait scoring helpers (validator-aligned)
 # ---------------------------------------------------------------------------
         self._root = Path(path_hf_repo).resolve()
         cfg = _read_vocence_yaml(self._root)
         runtime = cfg.get("runtime") or {}
         limits = cfg.get("limits") or {}
         self._language = str(runtime.get("default_language", "English"))
         self._cap_instruction = int(limits.get("max_instruction_chars", 600))
         self._cap_text = int(limits.get("max_text_chars", 2000))
         _local_root = _local_dir_for_downloads(self._root, runtime)
         _hub = str(runtime.get("hub_model_id", DEFAULT_HUB_MODEL_ID))
             print(f"[miner] whisper unavailable ({e}); selection falls back", flush=True)
             self._whisper = None
+        self._utmos = UtmosJitPredictor()
+        self._utmos.ensure()
+        ut_ok = self._utmos.is_ok()
+        whisper_ok = self._whisper is not None
+        active = "utmos+whisper" if ut_ok else ("whisper-only" if whisper_ok else "degraded")
+        print(f"[miner] ready: best-of-N (qwen candidates); active scorer: {active}", flush=True)
         wts = _VALIDATOR_WEIGHTS
         print(
             f"[miner] validator weights: script={wts['script']:.2f} nat={wts['naturalness']:.2f} "
             f"tone={wts['tone']:.2f}",
             flush=True,
         )
     def __repr__(self) -> str:
+        return "Miner(best-of-N/qwen, utmos+whisper scorer, in_process=True)"
     def get_status(self) -> dict:
+        ut_ok = self._utmos.is_ok()
         whisper_ok = self._whisper is not None
+        active = "utmos+whisper" if ut_ok else ("whisper-only" if whisper_ok else "degraded")
+        ut_status = "ready" if ut_ok else f"not loaded: {self._utmos.error()}"
         cuda_info = "unknown"
         try:
             import torch
         tts_device = getattr(self, "_tts_device", "unknown")
         model_on_cuda = isinstance(tts_device, str) and tts_device.startswith("cuda")
         return {
+            "scorer_utmos":   ut_status,
             "scorer_whisper": "ready" if whisper_ok else "not available",
             "active_scorer":  active,
             "cuda":           cuda_info,
     def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
         text = text[: self._cap_text] if self._cap_text else text
+        validator_text = text
+        validator_inst = (
+            instruction[: self._cap_instruction]
+            if self._cap_instruction
+            else instruction
+        )
+        parsed_eval = _parse_instruction(validator_inst)
+        # TTS: same capped strings the validator sent (no default merge, preamble, or text enrichment).
+        t0 = time.time()
+        # Phase 1: generate Qwen candidates
         raw_candidates: list[tuple[np.ndarray, int, str]] = []
         first_rejected: tuple[np.ndarray, int] | None = None
         def _qwen(tag: str) -> None:
             nonlocal first_rejected
             kwargs: dict[str, Any] = dict(
+                text=validator_text,
+                instruct=validator_inst,
                 language=self._language,
             )
             gen_t0 = time.time()
                 f"all synthesis attempts failed validity in {time.time()-t0:.1f}s{hint}"
             )
+        # Phase 2: UTMOS + Whisper script + audio heuristics (validator instruction/text only).
         scores: list[float] = []
+        print(f"[scorer/utmos] judging {len(raw_candidates)} candidates...", flush=True)
+        for wav, sr_i, tag in raw_candidates:
+            traits = _build_traits_non_llm(
+                wav,
+                sr_i,
+                validator_text=validator_text,
+                parsed=parsed_eval,
+                whisper_model=self._whisper,
+                utmos=self._utmos,
+            )
+            total, detail = _score_from_traits_non_llm(traits, validator_text, parsed_eval)
+            scores.append(total)
+            print(
+                f"[scorer/utmos][{tag}] score={total:.3f} wer={detail['wer']:.3f} "
+                f"gp={detail['gp']:.3f} spd={detail['speed']:.3f} "
+                f"nat={detail['nat']:.3f} age={detail['age']:.2f} "
+                f"emo={detail['emo']:.2f} tone={detail['tone']:.2f} "
+                f"accent={detail['accent']:.2f} elapsed={time.time()-t0:.1f}s",
+                flush=True,
+            )
+        best = int(np.argmax(scores))
         print(
+            f"[miner] best={raw_candidates[best][2]} score={scores[best]:.3f} "
+            f"scorer=utmos+traits total={len(raw_candidates)} elapsed={time.time()-t0:.1f}s",
             flush=True,
         )
         return raw_candidates[best][0], raw_candidates[best][1]