arwin0727 committed on
Commit
019ac4f
·
verified ·
1 Parent(s): 4aa7268

Upload miner.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. miner.py +222 -430
miner.py CHANGED
@@ -123,18 +123,6 @@ def _vocence_install_sox_stub() -> None:
123
  _vocence_install_sox_stub()
124
 
125
 
126
- # ---------------------------------------------------------------------------
127
- # In-process `onnxruntime` stub.
128
- #
129
- # qwen_tts/core/tokenizer_25hz/vq/speech_vq.py also does a top-level
130
- # `import onnxruntime`. The XVectorExtractor it imports is instantiated
131
- # during Qwen3TTSModel.from_pretrained (it loads `campplus.onnx` for speaker
132
- # x-vector extraction), but the InferenceSession is only *run* when encoding
133
- # a reference voice clip for voice-cloning. Our /speak API is instruction+
134
- # text only, so the session is created but never run. We provide a stub that
135
- # accepts construction and exposes the minimal SessionOptions / InferenceSession
136
- # surface used in __init__, but raises if run() is ever called.
137
- # ---------------------------------------------------------------------------
138
  def _vocence_install_onnxruntime_stub() -> None:
139
  if "onnxruntime" in sys.modules:
140
  return
@@ -200,7 +188,6 @@ _VALIDATOR_WEIGHTS: dict[str, float] = {
200
  }
201
 
202
  DEFAULT_HUB_MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign"
203
- OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B"
204
 
205
  _MIN_DURATION_SEC = 0.15
206
  _MAX_DURATION_SEC = 29.5
@@ -268,17 +255,6 @@ def _read_vocence_yaml(repo: Path) -> dict[str, Any]:
268
  return data if isinstance(data, Mapping) else {}
269
 
270
 
271
- def _merge_default_instruction(default: str, user: str) -> str:
272
- """Prepend default style; duplicate keys in ` | ` parse order: user segment wins (see _parse_instruction)."""
273
- d = (default or "").strip()
274
- u = (user or "").strip()
275
- if not d:
276
- return u
277
- if not u:
278
- return d
279
- return f"{d} | {u}"
280
-
281
-
282
  def _is_hub_model_id(s: str) -> bool:
283
  t = (s or "").strip()
284
  if not t or t[0] in ("/", ".", "~") or "\\" in t or "\n" in t or ".." in t:
@@ -423,125 +399,6 @@ def _parse_instruction(instruction: str) -> dict[str, str]:
423
  return result
424
 
425
 
426
- _GENDER_PHRASE = {
427
- "male": "male", "female": "female", "neutral": "gender-neutral",
428
- }
429
- _PITCH_PHRASE = {
430
- "low": "deep low-pitched voice", "mid": "medium natural pitch", "high": "high-pitched voice",
431
- }
432
- _SPEED_PHRASE = {
433
- "slow": "slow deliberate pace", "normal": "natural conversational pace", "fast": "brisk fast pace",
434
- }
435
- _AGE_PHRASE = {
436
- "child": "child", "young_adult": "young adult", "adult": "adult", "senior": "elderly senior",
437
- }
438
- _EMOTION_PHRASE = {
439
- "neutral": "neutral composed delivery",
440
- "happy": "cheerful happy upbeat warm",
441
- "sad": "sorrowful sad subdued downcast",
442
- "angry": "firm angry forceful assertive tense",
443
- "calm": "calm relaxed measured peaceful unhurried",
444
- "excited": "excited enthusiastic energetic lively",
445
- "serious": "serious grave deliberate weighty",
446
- "fearful": "nervous fearful hesitant trembling",
447
- }
448
- _TONE_PHRASE = {
449
- "warm": "warm", "cold": "cold detached", "friendly": "friendly",
450
- "formal": "formal", "casual": "casual", "authoritative": "authoritative commanding",
451
- }
452
- _ACCENT_PHRASE = {
453
- "us": "standard American English accent with rhotic r sounds",
454
- "uk": "standard British English accent with non-rhotic received pronunciation",
455
- "au": "Australian English accent",
456
- "in": "Indian English accent",
457
- "neutral": "neutral international English accent",
458
- "other": "non-native English accent",
459
- }
460
-
461
-
462
- def _build_natural_preamble(parsed: dict[str, str]) -> str:
463
- gender = _GENDER_PHRASE.get(parsed.get("gender", ""), "")
464
- age = _AGE_PHRASE.get(parsed.get("age_group", ""), "")
465
- pitch = _PITCH_PHRASE.get(parsed.get("pitch", ""), "")
466
- speed = _SPEED_PHRASE.get(parsed.get("speed", ""), "")
467
- emotion = _EMOTION_PHRASE.get(parsed.get("emotion", ""), "")
468
- tone = _TONE_PHRASE.get(parsed.get("tone", ""), "")
469
- accent = _ACCENT_PHRASE.get(parsed.get("accent", ""), "")
470
-
471
- parts: list[str] = []
472
-
473
- # Gender-first to avoid timbre drift on emotion-heavy prompts
474
- identity = " ".join(p for p in [gender, age] if p)
475
- if identity:
476
- parts.append(f"a {identity} voice")
477
- if emotion:
478
- parts.append(emotion)
479
- if accent:
480
- parts.append(f"speaking with a {accent}")
481
- if pitch:
482
- parts.append(pitch)
483
- if speed:
484
- parts.append(speed)
485
- if tone:
486
- parts.append(f"{tone} tone")
487
-
488
- if not parts:
489
- return ""
490
- preamble = "Speak as " + ", ".join(parts) + "."
491
- return preamble + " Use natural human prosody with realistic breath placement and varied intonation."
492
-
493
-
494
def _enhance_instruction(instruction: str) -> str:
    """Prefix *instruction* with a natural-language preamble when one can be built.

    The instruction is parsed with ``_parse_instruction``; if that yields
    nothing, or ``_build_natural_preamble`` returns an empty string, the
    instruction is returned unchanged.  Otherwise the result is the preamble,
    a single space, and the original instruction.
    """
    traits = _parse_instruction(instruction)
    lead_in = _build_natural_preamble(traits) if traits else ""
    return f"{lead_in} {instruction}" if lead_in else instruction
502
-
503
-
504
- _NUM_WORDS = {
505
- "0": "zero", "1": "one", "2": "two", "3": "three", "4": "four",
506
- "5": "five", "6": "six", "7": "seven", "8": "eight", "9": "nine",
507
- "10": "ten", "11": "eleven", "12": "twelve", "13": "thirteen",
508
- "14": "fourteen", "15": "fifteen", "16": "sixteen", "17": "seventeen",
509
- "18": "eighteen", "19": "nineteen", "20": "twenty", "30": "thirty",
510
- "40": "forty", "50": "fifty", "60": "sixty", "70": "seventy",
511
- "80": "eighty", "90": "ninety", "100": "one hundred",
512
- }
513
- _ABBREV = {
514
- "Mr.": "Mister", "Mrs.": "Missus", "Dr.": "Doctor", "St.": "Saint",
515
- "etc.": "et cetera", "vs.": "versus", "approx.": "approximately",
516
- "dept.": "department", "govt.": "government", "mgr.": "manager",
517
- }
518
-
519
-
520
- def _normalize_text_for_tts(text: str) -> str:
521
- import re
522
-
523
- # Expand known abbreviations
524
- for abbr, expansion in _ABBREV.items():
525
- text = text.replace(abbr, expansion)
526
-
527
- # Expand $N / £N / €N → "N dollars/pounds/euros"
528
- text = re.sub(r'\$(\d+)', lambda m: f"{_NUM_WORDS.get(m.group(1), m.group(1))} dollars", text)
529
- text = re.sub(r'£(\d+)', lambda m: f"{_NUM_WORDS.get(m.group(1), m.group(1))} pounds", text)
530
- text = re.sub(r'€(\d+)', lambda m: f"{_NUM_WORDS.get(m.group(1), m.group(1))} euros", text)
531
-
532
- # Expand standalone small integers (not part of larger numbers)
533
- text = re.sub(
534
- r'\b(\d{1,2})\b',
535
- lambda m: _NUM_WORDS.get(m.group(1), m.group(1)),
536
- text,
537
- )
538
-
539
- # Add comma pause before coordinating conjunctions in long sentences
540
- text = re.sub(r'(?<!\,)\s+(but|however|although|though|yet)\s+', r', \1 ', text, flags=re.IGNORECASE)
541
-
542
- return text.strip()
543
-
544
-
545
  def _score_wer(wav: np.ndarray, sr: int, target_text: str, whisper_model: Any) -> float:
546
  if whisper_model is None:
547
  return 0.5
@@ -570,54 +427,6 @@ def _score_wer(wav: np.ndarray, sr: int, target_text: str, whisper_model: Any) -
570
  return 0.5
571
 
572
 
573
-
574
- def _try_load_punct_model() -> Any:
575
- """
576
- Load deepmultilingualpunctuation PunctuationModel for restoring commas/periods
577
- to unpunctuated input text before TTS synthesis. Improves prosody and naturalness
578
- dramatically for run-on text (e.g. raw literary/OCR input).
579
- Install: pip install deepmultilingualpunctuation
580
- Returns the model on success, None if unavailable.
581
- """
582
- try:
583
- from deepmultilingualpunctuation import PunctuationModel # type: ignore[import]
584
- model = PunctuationModel()
585
- print("[miner] PunctuationModel loaded for text pre-processing", flush=True)
586
- return model
587
- except Exception as e:
588
- print(f"[miner] PunctuationModel unavailable ({e}); punctuation restoration skipped", flush=True)
589
- return None
590
-
591
-
592
- def _restore_punctuation(text: str, punct_model: Any) -> str:
593
- """
594
- Restore punctuation to text that lacks commas/periods.
595
- Only applies the model when the text appears to lack punctuation
596
- (fewer than 1 punctuation mark per 80 characters), so already
597
- well-punctuated inputs are passed through unchanged.
598
- Falls back to original text on any error.
599
- """
600
- if punct_model is None:
601
- return text
602
- stripped = text.strip()
603
- if not stripped:
604
- return text
605
- punct_chars = sum(1 for c in stripped if c in ".,:;!?")
606
- density = punct_chars / max(len(stripped), 1)
607
- if density >= 1 / 80:
608
- return text
609
- try:
610
- result: str = punct_model.restore_punctuation(stripped)
611
- print(
612
- f"[miner] punctuation restored: {len(stripped)}→{len(result)} chars",
613
- flush=True,
614
- )
615
- return result
616
- except Exception as e:
617
- print(f"[miner] punctuation restoration failed ({e}); using original", flush=True)
618
- return text
619
-
620
-
621
  _VOICE_TRAIT_ENUMS: dict[str, list[str]] = {
622
  "gender": ["male", "female", "neutral"],
623
  "pitch": ["low", "mid", "high"],
@@ -629,181 +438,200 @@ _VOICE_TRAIT_ENUMS: dict[str, list[str]] = {
629
  }
630
  _ORDINAL_TRAITS = {"pitch", "speed", "age_group"}
631
 
632
# System prompt for the Omni judge: per-candidate trait extraction plus a
# comparative "best" index, returned as a single JSON object.
_AI_COMPARE_SYSTEM = """You are an expert TTS evaluator. Analyze each provided audio candidate and rate it against the target instruction and target text.

For EACH candidate output these fields:
- transcription: exact words spoken, lowercased (string)
- gender: one of [male, female, neutral]
- pitch: one of [low, mid, high]
- speed: one of [slow, normal, fast]
- age_group: one of [child, young_adult, adult, senior]
- emotion: one of [neutral, happy, sad, angry, calm, excited, serious, fearful]
- tone: one of [warm, cold, friendly, formal, casual, authoritative]
- accent: one of [us, uk, au, in, neutral, other]
- naturalness_score: integer 1-5 (1=robotic, 5=indistinguishable from human)

Then set "best" to the 0-based index of the candidate that best matches the instruction and sounds most natural.

Return ONLY valid JSON in this exact shape (no markdown, no commentary):
{"candidates": [{"transcription":"...","gender":"...","pitch":"...","speed":"...","age_group":"...","emotion":"...","tone":"...","accent":"...","naturalness_score":4}], "best": 0}"""


class OmniAudioJudge:
    """
    Local audio judge using Qwen2.5-Omni-7B. No external API.
    Mimics GPT-4o-audio-preview trait extraction + comparative naturalness ranking.

    Construction never raises on load failure: check ``_api_ok`` /
    ``_api_error`` afterwards to see whether the model is usable.
    """

    # Trait values substituted when the model's answer is missing or outside
    # the corresponding enum in _VOICE_TRAIT_ENUMS.
    _FALLBACK_TRAITS: dict[str, str] = {
        "gender": "neutral", "pitch": "mid", "speed": "normal", "age_group": "adult",
        "emotion": "neutral", "tone": "casual", "accent": "neutral",
    }

    def __init__(self, model_id: str = OMNI_MODEL_ID) -> None:
        self._model_id = model_id
        self._model = None
        self._processor = None
        self._device = "cpu"
        self._dtype = None
        self._api_ok = False
        self._api_error: str = ""
        self._load()

    def _load(self) -> None:
        """Load processor and model; record (not raise) any failure."""
        try:
            import torch
            from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor

            self._device = "cuda:0" if torch.cuda.is_available() else "cpu"
            self._dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
            print(f"[miner] scorer-1: loading {self._model_id} on {self._device} ({self._dtype})...", flush=True)
            self._processor = Qwen2_5OmniProcessor.from_pretrained(self._model_id)
            try:
                self._model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
                    self._model_id,
                    dtype=self._dtype,
                    device_map=self._device,
                    attn_implementation="flash_attention_2",
                )
            except Exception:
                # Retry with the SDPA attention implementation when the
                # flash_attention_2 load fails for any reason.
                self._model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
                    self._model_id,
                    dtype=self._dtype,
                    device_map=self._device,
                    attn_implementation="sdpa",
                )
            self._model.eval()
            self._api_ok = True
            print(f"[miner] scorer-1: Omni judge ready ({self._model_id})", flush=True)
        except Exception as e:
            self._api_error = f"load_failed: {e}"
            print(f"[miner] scorer-1: Omni judge load FAILED ({e})", flush=True)

    @staticmethod
    def _to_16k_mono(wav: np.ndarray, sr: int) -> np.ndarray:
        """Return *wav* as float32, averaged over axis 1 if 2-D, at 16 kHz."""
        import librosa

        x = wav.astype(np.float32)
        if x.ndim > 1:
            x = x.mean(axis=1)
        if sr != 16000:
            x = librosa.resample(x, orig_sr=sr, target_sr=16000)
        return x.astype(np.float32)

    @staticmethod
    def _parse_reply(raw: str) -> dict[str, Any]:
        """Extract the JSON object from the model reply; ``{}`` if unusable."""
        import json as _json

        # Unwrap a ```json fenced block if the model added one.
        if "```" in raw:
            for part in raw.split("```"):
                p = part.strip()
                if p.startswith("json"):
                    p = p[4:].strip()
                if p.startswith("{"):
                    raw = p
                    break
        # Trim any commentary around the outermost braces.
        start = raw.find("{")
        end = raw.rfind("}")
        if start != -1 and end != -1 and end > start:
            raw = raw[start:end + 1]
        try:
            data = _json.loads(raw)
        except _json.JSONDecodeError:
            print(f"[miner] scorer-1: JSON parse failed; raw={raw[:300]}", flush=True)
            return {}
        if not isinstance(data, dict):
            # Valid JSON but not an object (e.g. a bare list): treat as empty
            # instead of failing later on ``data.get``.
            print(f"[miner] scorer-1: JSON reply is not an object; raw={raw[:300]}", flush=True)
            return {}
        return data

    @classmethod
    def _default_traits(cls) -> dict[str, Any]:
        """Neutral placeholder traits for a candidate the model did not describe."""
        return {**cls._FALLBACK_TRAITS, "transcription": "", "naturalness_score": 3.0}

    def judge_candidates(
        self,
        candidates: list[tuple[np.ndarray, int]],
        target_text: str,
        instruction: str,
    ) -> tuple[int, list[dict[str, Any]]]:
        """
        Send all candidates in one Omni call. Returns (best_index, trait_list).

        ``trait_list`` always has exactly ``len(candidates)`` entries; missing
        or malformed entries are filled with neutral fallbacks.

        Raises:
            RuntimeError: if the model or processor was never loaded.
        """
        import torch

        if self._model is None or self._processor is None:
            raise RuntimeError(f"Omni judge not loaded: {self._api_error or 'unknown error'}")

        audios_16k = [self._to_16k_mono(w, sr) for w, sr in candidates]

        content: list[dict] = []
        for i, audio in enumerate(audios_16k):
            content.append({"type": "text", "text": f"Candidate {i}:"})
            content.append({"type": "audio", "audio": audio})
        content.append({"type": "text", "text": f"Target instruction: {instruction}"})
        content.append({"type": "text", "text": f"Target text: {target_text}"})

        conversation = [
            {"role": "system", "content": [{"type": "text", "text": _AI_COMPARE_SYSTEM}]},
            {"role": "user", "content": content},
        ]

        text = self._processor.apply_chat_template(
            conversation, add_generation_prompt=True, tokenize=False
        )
        inputs = self._processor(
            text=text,
            audio=audios_16k,
            sampling_rate=16000,
            return_tensors="pt",
            padding=True,
        )
        inputs = {k: (v.to(self._device) if hasattr(v, "to") else v) for k, v in inputs.items()}

        with torch.inference_mode():
            outputs = self._model.generate(
                **inputs,
                max_new_tokens=600,
                do_sample=False,
                return_audio=False,
            )

        # Drop the prompt tokens so only the generated reply is decoded.
        in_len = inputs["input_ids"].shape[1] if "input_ids" in inputs else 0
        gen = outputs[:, in_len:] if in_len else outputs
        raw = self._processor.batch_decode(
            gen, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0].strip()

        data = self._parse_reply(raw)

        raw_list = data.get("candidates")
        if not isinstance(raw_list, list):
            raw_list = []

        trait_list: list[dict[str, Any]] = []
        # Ignore any surplus entries beyond the candidates actually sent.
        for raw_c in raw_list[: len(candidates)]:
            if not isinstance(raw_c, dict):
                trait_list.append(self._default_traits())
                continue
            out: dict[str, Any] = {"transcription": str(raw_c.get("transcription") or "").strip()}
            for k, enum in _VOICE_TRAIT_ENUMS.items():
                v = str(raw_c.get(k) or "").strip().lower().replace(" ", "_").replace("-", "_")
                out[k] = v if v in enum else self._FALLBACK_TRAITS[k]
            try:
                out["naturalness_score"] = float(max(1, min(5, int(raw_c.get("naturalness_score", 3)))))
            except (TypeError, ValueError):
                out["naturalness_score"] = 3.0
            trait_list.append(out)

        while len(trait_list) < len(candidates):
            trait_list.append(self._default_traits())

        try:
            ai_best = int(data.get("best", 0))
            if not (0 <= ai_best < len(candidates)):
                ai_best = 0
        except (TypeError, ValueError):
            ai_best = 0

        return ai_best, trait_list
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
803
 
804
 
805
  # ---------------------------------------------------------------------------
806
- # Qwen2-Audio-7B-Instruct scorer (free local fallback)
807
  # ---------------------------------------------------------------------------
808
 
809
 
@@ -975,16 +803,11 @@ class Miner:
975
  self._root = Path(path_hf_repo).resolve()
976
  cfg = _read_vocence_yaml(self._root)
977
  runtime = cfg.get("runtime") or {}
978
- generation = cfg.get("generation") or {}
979
  limits = cfg.get("limits") or {}
980
 
981
  self._language = str(runtime.get("default_language", "English"))
982
  self._cap_instruction = int(limits.get("max_instruction_chars", 600))
983
  self._cap_text = int(limits.get("max_text_chars", 2000))
984
- _di = generation.get("default_instruction")
985
- self._default_instruction = (
986
- str(_di).strip() if _di is not None and str(_di).strip() else ""
987
- )
988
 
989
  _local_root = _local_dir_for_downloads(self._root, runtime)
990
  _hub = str(runtime.get("hub_model_id", DEFAULT_HUB_MODEL_ID))
@@ -1046,18 +869,13 @@ class Miner:
1046
  print(f"[miner] whisper unavailable ({e}); selection falls back", flush=True)
1047
  self._whisper = None
1048
 
1049
- self._punct_model = _try_load_punct_model()
1050
-
1051
- # Scorer 1: local Omni judge (Qwen2.5-Omni-7B)
1052
- try:
1053
- self._ai: Any = OmniAudioJudge()
1054
- except Exception as e:
1055
- print(f"[miner] scorer-1: Omni judge init failed ({e})", flush=True)
1056
- self._ai = None
1057
 
1058
- # Scorer 2: Whisper WER (fallback — already loaded above)
1059
- active = "Omni-Judge" if (self._ai and self._ai._api_ok) else "Whisper-WER"
1060
- print(f"[miner] ready: best-of-3 (qwen×3); active scorer: {active}", flush=True)
 
1061
  wts = _VALIDATOR_WEIGHTS
1062
  print(
1063
  f"[miner] validator weights: script={wts['script']:.2f} nat={wts['naturalness']:.2f} "
@@ -1066,22 +884,15 @@ class Miner:
1066
  f"tone={wts['tone']:.2f}",
1067
  flush=True,
1068
  )
1069
- if self._default_instruction:
1070
- print(
1071
- "[miner] default_instruction: prepended to each request (per-key override: user wins)",
1072
- flush=True,
1073
- )
1074
 
1075
  def __repr__(self) -> str:
1076
- return "Miner(best-of-3/qwen×3, in_process=True)"
1077
 
1078
  def get_status(self) -> dict:
1079
- ai_ok = self._ai is not None and self._ai._api_ok
1080
  whisper_ok = self._whisper is not None
1081
- active = "omni-judge" if ai_ok else ("whisper-wer" if whisper_ok else "none")
1082
- ai_status = "ready" if ai_ok else (
1083
- f"api_error: {self._ai._api_error}" if self._ai is not None else "not configured"
1084
- )
1085
  cuda_info = "unknown"
1086
  try:
1087
  import torch
@@ -1098,7 +909,7 @@ class Miner:
1098
  tts_device = getattr(self, "_tts_device", "unknown")
1099
  model_on_cuda = isinstance(tts_device, str) and tts_device.startswith("cuda")
1100
  return {
1101
- "scorer_ai": ai_status,
1102
  "scorer_whisper": "ready" if whisper_ok else "not available",
1103
  "active_scorer": active,
1104
  "cuda": cuda_info,
@@ -1134,32 +945,26 @@ class Miner:
1134
 
1135
  def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
1136
  text = text[: self._cap_text] if self._cap_text else text
1137
- inst = _merge_default_instruction(self._default_instruction, instruction)
1138
- if self._cap_instruction:
1139
- inst = inst[: self._cap_instruction]
1140
-
1141
- # Enrich text: punctuation restoration + number/abbreviation expansion.
1142
- punct_text = _restore_punctuation(text, self._punct_model)
1143
- rich_text = _normalize_text_for_tts(punct_text)
1144
- parsed = _parse_instruction(inst)
1145
- t0 = time.time()
1146
 
1147
- # All candidates use identical inputs and the model's own
1148
- # generation_config defaults — no temperature/top_p/top_k/rep_penalty/
1149
- # max_new_tokens overrides. Diversity comes purely from sampling RNG
1150
- # (Qwen3-TTS-VoiceDesign's default config has do_sample=True, so each
1151
- # call draws a fresh sample). Mirrors magma90909/vocence_miner_v8,
1152
- # which only passes text/instruct/language to generate_voice_design.
1153
 
1154
- # Phase 1: generate 3 Qwen candidates
1155
  raw_candidates: list[tuple[np.ndarray, int, str]] = []
1156
  first_rejected: tuple[np.ndarray, int] | None = None
1157
 
1158
  def _qwen(tag: str) -> None:
1159
  nonlocal first_rejected
1160
  kwargs: dict[str, Any] = dict(
1161
- text=rich_text,
1162
- instruct=inst,
1163
  language=self._language,
1164
  )
1165
  gen_t0 = time.time()
@@ -1194,46 +999,33 @@ class Miner:
1194
  f"all synthesis attempts failed validity in {time.time()-t0:.1f}s{hint}"
1195
  )
1196
 
1197
- # Phase 2: AI judge Whisper fallback
1198
  scores: list[float] = []
1199
- ai_best = 0
1200
- best = 0
1201
- scorer_used = "none"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1202
 
1203
- if self._ai is not None and self._ai._api_ok:
1204
- try:
1205
- print(f"[scorer-1/omni] judging {len(raw_candidates)} candidates...", flush=True)
1206
- audio_list = [(w, s) for w, s, _ in raw_candidates]
1207
- ai_best, trait_list = self._ai.judge_candidates(audio_list, text, inst)
1208
- for i, (wav, sr_i, tag) in enumerate(raw_candidates):
1209
- total, detail = _score_from_traits(trait_list[i], text, parsed)
1210
- scores.append(total)
1211
- print(
1212
- f"[scorer-1/omni][{tag}] score={total:.3f} wer={detail['wer']:.3f} "
1213
- f"gp={detail['gp']:.3f} spd={detail['speed']:.3f} "
1214
- f"nat={detail['nat']:.3f} age={detail['age']:.2f} "
1215
- f"emo={detail['emo']:.2f} tone={detail['tone']:.2f} "
1216
- f"accent={detail['accent']:.2f} elapsed={time.time()-t0:.1f}s",
1217
- flush=True,
1218
- )
1219
- best = ai_best
1220
- scorer_used = "omni-judge"
1221
- except Exception as e:
1222
- print(f"[scorer-1/omni] failed ({e}); falling back to Whisper...", flush=True)
1223
-
1224
- if not scores:
1225
- print(f"[scorer-2/whisper] judging {len(raw_candidates)} candidates...", flush=True)
1226
- for wav, sr_i, tag in raw_candidates:
1227
- total, detail = _score_fallback(wav, sr_i, text, self._whisper)
1228
- scores.append(total)
1229
- print(f"[scorer-2/whisper][{tag}] score={total:.3f} wer={detail['wer']:.3f} elapsed={time.time()-t0:.1f}s", flush=True)
1230
- best = int(np.argmax(scores))
1231
- scorer_used = "whisper-wer"
1232
-
1233
- active_scores = scores
1234
  print(
1235
- f"[miner] best={raw_candidates[best][2]} score={active_scores[best]:.3f} "
1236
- f"scorer={scorer_used} total={len(raw_candidates)} elapsed={time.time()-t0:.1f}s",
1237
  flush=True,
1238
  )
1239
  return raw_candidates[best][0], raw_candidates[best][1]
 
123
  _vocence_install_sox_stub()
124
 
125
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  def _vocence_install_onnxruntime_stub() -> None:
127
  if "onnxruntime" in sys.modules:
128
  return
 
188
  }
189
 
190
  DEFAULT_HUB_MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign"
 
191
 
192
  _MIN_DURATION_SEC = 0.15
193
  _MAX_DURATION_SEC = 29.5
 
255
  return data if isinstance(data, Mapping) else {}
256
 
257
 
 
 
 
 
 
 
 
 
 
 
 
258
  def _is_hub_model_id(s: str) -> bool:
259
  t = (s or "").strip()
260
  if not t or t[0] in ("/", ".", "~") or "\\" in t or "\n" in t or ".." in t:
 
399
  return result
400
 
401
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  def _score_wer(wav: np.ndarray, sr: int, target_text: str, whisper_model: Any) -> float:
403
  if whisper_model is None:
404
  return 0.5
 
427
  return 0.5
428
 
429
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
430
  _VOICE_TRAIT_ENUMS: dict[str, list[str]] = {
431
  "gender": ["male", "female", "neutral"],
432
  "pitch": ["low", "mid", "high"],
 
438
  }
439
  _ORDINAL_TRAITS = {"pitch", "speed", "age_group"}
440
 
441
# JIT UTMOS (balacoon/utmos): torch + hub only; no fairseq stack.
_UTMOS_JIT_REPO = "balacoon/utmos"
_UTMOS_JIT_FILENAME = "utmos.jit"


class UtmosJitPredictor:
    """Mean-opinion-score style naturalness (≈1–5) via traced UTMOS.

    The model is fetched and loaded on the first ``ensure()`` call.  A failed
    load is remembered (``error()`` holds its repr) and never retried;
    ``predict_mos`` then returns the neutral fallback 3.0.
    """

    def __init__(self) -> None:
        self._model: Any = None
        self._device: Any = None
        self._ok = False
        self._failed = False
        self._error: str = ""

    def is_ok(self) -> bool:
        """True once the model has been loaded successfully."""
        return self._ok

    def error(self) -> str:
        """``repr`` of the load failure, or "" if none was recorded."""
        return self._error

    def ensure(self) -> bool:
        """Load the model if needed; return whether it is usable."""
        if self._ok:
            return True
        if self._failed:
            return False
        try:
            import torch
            from huggingface_hub import hf_hub_download

            token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
            path = hf_hub_download(
                repo_id=_UTMOS_JIT_REPO,
                filename=_UTMOS_JIT_FILENAME,
                repo_type="model",
                token=token,
            )
            self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            self._model = torch.jit.load(path, map_location=self._device)
            self._model.eval()
            self._ok = True
            print(
                f"[miner] UTMOS JIT loaded ({_UTMOS_JIT_REPO}) on {self._device}",
                flush=True,
            )
        except Exception as e:
            self._failed = True
            self._error = repr(e)
            self._model = None
            print(f"[miner] UTMOS JIT load FAILED: {e}", flush=True)
        return self._ok

    def predict_mos(self, wav: np.ndarray, sr: int) -> float:
        """Return MOS-like score clamped to [1, 5]; 3.0 on any failure.

        The fallback 3.0 is returned when the model is unavailable, the audio
        is empty, inference raises, or the model output is not finite.
        """
        if not self.ensure() or self._model is None:
            return 3.0
        try:
            import torch

            x = np.asarray(wav, dtype=np.float32)
            if x.ndim > 1:
                x = x.mean(axis=1)
            if x.size == 0:
                return 3.0
            if sr != 16000:
                # Imported only when resampling is actually required.
                import librosa

                x = librosa.resample(x, orig_sr=sr, target_sr=16000)
            # Non-finite samples have no defined int16 value; neutralise them
            # before clipping and quantising.
            x = np.nan_to_num(x, nan=0.0, posinf=1.0, neginf=-1.0)
            x = np.clip(x, -1.0, 1.0)
            xi = (x * 32767.0).astype(np.int16)
            t = torch.as_tensor(xi, device=self._device, dtype=torch.int16).unsqueeze(0)
            with torch.inference_mode():
                out = self._model(t)
            val = float(out.reshape(-1)[0].item())
            if not np.isfinite(val):
                # min(5.0, nan) evaluates to 5.0, so an unguarded clamp would
                # turn a NaN prediction into the best possible score.
                print(f"[miner] UTMOS predict failed: non-finite output {val}", flush=True)
                return 3.0
            return max(1.0, min(5.0, val))
        except Exception as e:
            print(f"[miner] UTMOS predict failed: {e}", flush=True)
            return 3.0
516
+
517
+
518
+ def _transcribe_whisper(wav: np.ndarray, sr: int, whisper_model: Any) -> str:
519
+ if whisper_model is None:
520
+ return ""
521
+ try:
522
  import librosa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
 
524
+ wav16 = librosa.resample(wav.astype(np.float32), orig_sr=sr, target_sr=16000)
525
+ result = whisper_model.transcribe(wav16, language="en", fp16=False)
526
+ return str(result.get("text") or "").strip().lower()
527
+ except Exception:
528
+ return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
529
 
 
 
530
 
531
+ def _mean_fundamental_hz(wav: np.ndarray, sr: int) -> float:
532
+ import librosa
533
+
534
+ y = wav.astype(np.float32)
535
+ if y.ndim > 1:
536
+ y = y.mean(axis=1)
537
+ if y.size < 256:
538
+ return 0.0
539
+ f0 = librosa.yin(y, fmin=50.0, fmax=500.0, sr=sr)
540
+ v = f0[np.isfinite(f0) & (f0 > 0)]
541
+ if v.size == 0:
542
+ return 0.0
543
+ return float(np.nanmean(v))
544
 
545
+
546
def _estimate_pitch_trait(wav: np.ndarray, sr: int) -> str:
    """Bucket the mean F0 of *wav* into "low", "mid" or "high".

    A non-positive mean (no usable estimate) maps to "mid"; below 130 Hz is
    "low", below 210 Hz is "mid", anything else is "high".  Coarse heuristic
    for synthetic speech.
    """
    mean_f0 = _mean_fundamental_hz(wav, sr)
    if mean_f0 <= 0.0:
        return "mid"
    for upper_bound, label in ((130.0, "low"), (210.0, "mid")):
        if mean_f0 < upper_bound:
            return label
    return "high"
556
+
557
+
558
+ def _estimate_speed_trait(wav: np.ndarray, sr: int, reference_text: str) -> str:
559
+ """Speaking rate vs reference word count (coarse slow/normal/fast)."""
560
+ import re
561
+
562
+ dur = float(wav.shape[0]) / float(sr) if sr else 0.0
563
+ if dur < 0.05:
564
+ return "normal"
565
+ words = re.findall(r"\w+", (reference_text or "").lower())
566
+ nw = max(len(words), 1)
567
+ wps = nw / dur
568
+ if wps < 2.2:
569
+ return "slow"
570
+ if wps > 4.0:
571
+ return "fast"
572
+ return "normal"
573
+
574
+
575
+ def _trait_score_without_audio_classifier(expected: str) -> float:
576
+ """No audio-side classifier for this trait; soft constant if instruction pins it."""
577
+ e = (expected or "").strip().lower()
578
+ if not e:
579
+ return 1.0
580
+ return 0.85
581
+
582
+
583
def _build_traits_non_llm(
    wav: np.ndarray,
    sr: int,
    *,
    validator_text: str,
    parsed: dict[str, str],
    whisper_model: Any,
    utmos: UtmosJitPredictor,
) -> dict[str, Any]:
    """Assemble a trait dict for one candidate without an LLM judge.

    Measured entries: Whisper transcription, UTMOS naturalness score, the
    pitch bucket, and the speed bucket (relative to *validator_text*).  The
    remaining traits (gender, age_group, emotion, tone, accent) are fixed
    placeholder values.  *parsed* is accepted but not read here.
    """
    transcript = _transcribe_whisper(wav, sr, whisper_model)
    mos = float(utmos.predict_mos(wav, sr))
    pitch_label = _estimate_pitch_trait(wav, sr)
    speed_label = _estimate_speed_trait(wav, sr, validator_text)

    traits: dict[str, Any] = dict(
        transcription=transcript,
        naturalness_score=mos,
        pitch=pitch_label,
        speed=speed_label,
    )
    # Placeholders for traits that have no audio-side estimator.
    traits.update(
        gender="neutral",
        age_group="adult",
        emotion="neutral",
        tone="casual",
        accent="neutral",
    )
    return traits
603
+
604
+
605
def _score_from_traits_non_llm(
    traits: dict[str, Any],
    target_text: str,
    parsed: dict[str, str],
) -> tuple[float, dict[str, float]]:
    """Weighted candidate score from non-LLM traits.

    Components:
      * script: ``1 - _wer_simple(target, transcription)``, floored at 0.
      * naturalness: the 1-5 score rescaled with ``(score - 1) / 4``.
      * gender / age_group / emotion / tone / accent: soft constant from
        ``_trait_score_without_audio_classifier``.
      * pitch / speed: ``_ai_score_element`` on expected vs. estimated value.

    Returns:
        ``(total, detail)`` where ``total`` is the ``_VALIDATOR_WEIGHTS``
        weighted mean of the components and ``detail`` holds the per-component
        values used for logging (``gp`` is the mean of gender and pitch).
    """
    heard = traits.get("transcription", "")
    script_s = max(0.0, 1.0 - _wer_simple(target_text, heard))
    nat_s = (float(traits.get("naturalness_score", 3.0)) - 1.0) / 4.0

    elem_scores: dict[str, float] = {"script": script_s, "naturalness": nat_s}
    elem_scores.update(
        {
            name: _trait_score_without_audio_classifier(parsed.get(name, ""))
            for name in ("gender", "age_group", "emotion", "tone", "accent")
        }
    )
    elem_scores.update(
        {
            name: _ai_score_element(name, parsed.get(name, ""), traits.get(name, ""))
            for name in ("pitch", "speed")
        }
    )

    weight_total = sum(_VALIDATOR_WEIGHTS.values())
    weighted = sum(weight * elem_scores[name] for name, weight in _VALIDATOR_WEIGHTS.items())
    total = weighted / weight_total

    detail = dict(
        wer=script_s,
        gp=(elem_scores["gender"] + elem_scores["pitch"]) / 2.0,
        speed=elem_scores["speed"],
        nat=nat_s,
        age=elem_scores["age_group"],
        emo=elem_scores["emotion"],
        tone=elem_scores["tone"],
        accent=elem_scores["accent"],
    )
    return total, detail
631
 
632
 
633
  # ---------------------------------------------------------------------------
634
+ # Trait scoring helpers (validator-aligned)
635
  # ---------------------------------------------------------------------------
636
 
637
 
 
803
  self._root = Path(path_hf_repo).resolve()
804
  cfg = _read_vocence_yaml(self._root)
805
  runtime = cfg.get("runtime") or {}
 
806
  limits = cfg.get("limits") or {}
807
 
808
  self._language = str(runtime.get("default_language", "English"))
809
  self._cap_instruction = int(limits.get("max_instruction_chars", 600))
810
  self._cap_text = int(limits.get("max_text_chars", 2000))
 
 
 
 
811
 
812
  _local_root = _local_dir_for_downloads(self._root, runtime)
813
  _hub = str(runtime.get("hub_model_id", DEFAULT_HUB_MODEL_ID))
 
869
  print(f"[miner] whisper unavailable ({e}); selection falls back", flush=True)
870
  self._whisper = None
871
 
872
+ self._utmos = UtmosJitPredictor()
873
+ self._utmos.ensure()
 
 
 
 
 
 
874
 
875
+ ut_ok = self._utmos.is_ok()
876
+ whisper_ok = self._whisper is not None
877
+ active = "utmos+whisper" if ut_ok else ("whisper-only" if whisper_ok else "degraded")
878
+ print(f"[miner] ready: best-of-N (qwen candidates); active scorer: {active}", flush=True)
879
  wts = _VALIDATOR_WEIGHTS
880
  print(
881
  f"[miner] validator weights: script={wts['script']:.2f} nat={wts['naturalness']:.2f} "
 
884
  f"tone={wts['tone']:.2f}",
885
  flush=True,
886
  )
 
 
 
 
 
887
 
888
  def __repr__(self) -> str:
889
+ return "Miner(best-of-N/qwen, utmos+whisper scorer, in_process=True)"
890
 
891
  def get_status(self) -> dict:
892
+ ut_ok = self._utmos.is_ok()
893
  whisper_ok = self._whisper is not None
894
+ active = "utmos+whisper" if ut_ok else ("whisper-only" if whisper_ok else "degraded")
895
+ ut_status = "ready" if ut_ok else f"not loaded: {self._utmos.error()}"
 
 
896
  cuda_info = "unknown"
897
  try:
898
  import torch
 
909
  tts_device = getattr(self, "_tts_device", "unknown")
910
  model_on_cuda = isinstance(tts_device, str) and tts_device.startswith("cuda")
911
  return {
912
+ "scorer_utmos": ut_status,
913
  "scorer_whisper": "ready" if whisper_ok else "not available",
914
  "active_scorer": active,
915
  "cuda": cuda_info,
 
945
 
946
  def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
947
  text = text[: self._cap_text] if self._cap_text else text
948
+ validator_text = text
949
+ validator_inst = (
950
+ instruction[: self._cap_instruction]
951
+ if self._cap_instruction
952
+ else instruction
953
+ )
954
+ parsed_eval = _parse_instruction(validator_inst)
 
 
955
 
956
+ # TTS: same capped strings the validator sent (no default merge, preamble, or text enrichment).
957
+ t0 = time.time()
 
 
 
 
958
 
959
+ # Phase 1: generate Qwen candidates
960
  raw_candidates: list[tuple[np.ndarray, int, str]] = []
961
  first_rejected: tuple[np.ndarray, int] | None = None
962
 
963
  def _qwen(tag: str) -> None:
964
  nonlocal first_rejected
965
  kwargs: dict[str, Any] = dict(
966
+ text=validator_text,
967
+ instruct=validator_inst,
968
  language=self._language,
969
  )
970
  gen_t0 = time.time()
 
999
  f"all synthesis attempts failed validity in {time.time()-t0:.1f}s{hint}"
1000
  )
1001
 
1002
+ # Phase 2: UTMOS + Whisper script + audio heuristics (validator instruction/text only).
1003
  scores: list[float] = []
1004
+ print(f"[scorer/utmos] judging {len(raw_candidates)} candidates...", flush=True)
1005
+ for wav, sr_i, tag in raw_candidates:
1006
+ traits = _build_traits_non_llm(
1007
+ wav,
1008
+ sr_i,
1009
+ validator_text=validator_text,
1010
+ parsed=parsed_eval,
1011
+ whisper_model=self._whisper,
1012
+ utmos=self._utmos,
1013
+ )
1014
+ total, detail = _score_from_traits_non_llm(traits, validator_text, parsed_eval)
1015
+ scores.append(total)
1016
+ print(
1017
+ f"[scorer/utmos][{tag}] score={total:.3f} wer={detail['wer']:.3f} "
1018
+ f"gp={detail['gp']:.3f} spd={detail['speed']:.3f} "
1019
+ f"nat={detail['nat']:.3f} age={detail['age']:.2f} "
1020
+ f"emo={detail['emo']:.2f} tone={detail['tone']:.2f} "
1021
+ f"accent={detail['accent']:.2f} elapsed={time.time()-t0:.1f}s",
1022
+ flush=True,
1023
+ )
1024
+ best = int(np.argmax(scores))
1025
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1026
  print(
1027
+ f"[miner] best={raw_candidates[best][2]} score={scores[best]:.3f} "
1028
+ f"scorer=utmos+traits total={len(raw_candidates)} elapsed={time.time()-t0:.1f}s",
1029
  flush=True,
1030
  )
1031
  return raw_candidates[best][0], raw_candidates[best][1]