#!/usr/bin/env python3
"""Enriched eval gate — intelligibility + TW-accent + signal quality, in one run (moss-nano-venv).

Usage:
    python assess_quality.py --synth-dir DIR --tag NAME [--std-model medium]

Three axes:
  1. INTELLIGIBILITY  zh-CER via Breeze-ASR-25 (zh-TW), en-WER via a generic whisper.
  2. TW-ACCENT        dual-ASR gap = CER(generic whisper) - CER(Breeze zh-TW), per zh/mix clip.
                      >0 => Breeze understands it better than a generic recognizer => Taiwan
                      accent present (a *feature* for zh). Heuristic: a generic model is also
                      just weaker, so read trends, not absolutes — calibrate against a known-
                      mainland clip if you need a zero point.
  3. QUALITY          torchaudio SQUIM: PESQ + STOI (reference-free) + MOS (subjective, teacher NMR).

Reads ref text + lang (zh/mix/en) from ZT/eval_big.jsonl, wavs from DIR/synth.jsonl.
"""
import argparse
import functools
import glob
import json
import math
import re
import statistics

ZT = "/home/luigi/jetson-tts/mossnano/zhtw8k"

# Everything outside the 一-鿿 range (basic CJK ideographs) is dropped before scoring CER.
_NON_HAN = re.compile(r"[^一-鿿]")
# Everything that is not a lowercase letter, apostrophe or space becomes a word break for WER.
_NON_EN = re.compile(r"[^a-z' ]")


@functools.lru_cache(maxsize=1)
def _converter():
    """Return the shared Traditional->Simplified OpenCC converter, built on first use.

    opencc is imported here rather than at module level, like the other heavy
    dependencies imported inside main(), so the pure text metrics stay importable
    without it. main() calls this up front so a missing opencc still fails fast.
    """
    import opencc

    return opencc.OpenCC("t2s")


def _han(s):
    """Return *s* with every character outside the 一-鿿 range removed."""
    return _NON_HAN.sub("", s)


def _norm_zh(s):
    """Normalise Chinese text for CER: convert to Simplified, keep Han characters only."""
    return _han(_converter().convert(s or ""))


def _norm_en(s):
    """Normalise English text for WER: lowercase, strip punctuation, split into words."""
    return _NON_EN.sub(" ", (s or "").lower()).split()


def _lev(a, b):
    """Return the Levenshtein (edit) distance between sequences *a* and *b*.

    Uses the two-row dynamic programme, so memory is proportional to len(b).
    """
    m, n = len(a), len(b)
    if m == 0 or n == 0:
        return max(m, n)
    prev = list(range(n + 1))
    for i in range(1, m + 1):
        cur = [i] + [0] * n
        for j in range(1, n + 1):
            cur[j] = min(
                prev[j] + 1,  # deletion
                cur[j - 1] + 1,  # insertion
                prev[j - 1] + (a[i - 1] != b[j - 1]),  # substitution / match
            )
        prev = cur
    return prev[n]


def _cer(ref, hyp):
    """Character error rate of *hyp* against *ref* after zh normalisation.

    The denominator is clamped to 1 so an empty reference does not divide by zero.
    """
    r = _norm_zh(ref)
    return _lev(r, _norm_zh(hyp)) / max(1, len(r))


def _wer(ref, hyp):
    """Word error rate of *hyp* against *ref* after en normalisation.

    The denominator is clamped to 1 so an empty reference does not divide by zero.
    """
    r = _norm_en(ref)
    return _lev(r, _norm_en(hyp)) / max(1, len(r))


def _read_jsonl(path):
    """Return the JSON objects of a JSON-lines file, skipping blank lines."""
    with open(path, encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


def main():
    """Score every clip listed in DIR/synth.jsonl and print the three-axis report."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--synth-dir", required=True)
    ap.add_argument("--tag", default="")
    ap.add_argument("--std-model", default="medium", help="generic whisper for standard-zh + en pole")
    a = ap.parse_args()

    # Fail fast: without this, a missing opencc would only surface inside the
    # per-clip try/except below and silently skip every zh/mix clip.
    _converter()

    meta = {r["id"]: (r["lang"], r["text"]) for r in _read_jsonl(f"{ZT}/eval_big.jsonl")}
    rows = _read_jsonl(f"{a.synth_dir}/synth.jsonl")

    # Heavy dependencies are imported only once the inputs have been read.
    from faster_whisper import WhisperModel

    breeze = WhisperModel("SoybeanMilk/faster-whisper-Breeze-ASR-25", device="cpu", compute_type="int8")
    std = WhisperModel(a.std_model, device="cpu", compute_type="int8")

    def asr(model, wav, lang):
        """Transcribe *wav* with *model*, returning the concatenated segment text."""
        segs, _ = model.transcribe(wav, language=lang, beam_size=1)
        return "".join(s.text for s in segs)

    import torch
    import soundfile as sf
    import torchaudio.functional as AF
    from torchaudio.pipelines import SQUIM_OBJECTIVE, SQUIM_SUBJECTIVE

    Mo = SQUIM_OBJECTIVE.get_model().eval()
    Ms = SQUIM_SUBJECTIVE.get_model().eval()

    def load16k(p):
        """Load an audio file as a mono float32 tensor resampled to 16 kHz."""
        w, sr = sf.read(p)
        if getattr(w, "ndim", 1) > 1:
            w = w.mean(1)  # down-mix multi-channel audio by averaging channels
        t = torch.tensor(w, dtype=torch.float32)
        return AF.resample(t, sr, 16000) if sr != 16000 else t

    teacher_wavs = sorted(glob.glob(f"{ZT}/teacher_corpus_en_expand/*.wav"))
    if not teacher_wavs:
        raise FileNotFoundError(
            f"no teacher wav for the SQUIM MOS reference under {ZT}/teacher_corpus_en_expand"
        )
    # Non-matching reference for the subjective (MOS) model, with a batch dimension.
    nmr = load16k(teacher_wavs[0])[None]

    def quality(wav):
        """Return (PESQ, STOI, MOS) for *wav*; MOS is NaN when the subjective model fails."""
        t = load16k(wav)[None]
        with torch.no_grad():
            st, pe, _si_sdr = Mo(t)
            try:
                mos = float(Ms(t, nmr)[0])
            except Exception:  # deliberate best-effort: MOS is optional, PESQ/STOI still count
                mos = float("nan")
        return float(pe[0]), float(st[0]), mos

    cats = {"zh": [], "mix": [], "en": []}
    gap = []
    pq = []
    sq = []
    mq = []
    skipped = 0
    for r in rows:
        rid = r.get("id")
        if rid not in meta:
            # Same skip policy as any other per-clip failure, instead of aborting the run.
            print(" skip", rid, "id not in eval_big.jsonl")
            skipped += 1
            continue
        lang, ref = meta[rid]
        try:
            wav = r["wav"]
            pe, st_, mo = quality(wav)
            pq.append(pe)
            sq.append(st_)
            if not math.isnan(mo):
                mq.append(mo)
            if lang in ("zh", "mix"):
                cb = _cer(ref, asr(breeze, wav, "zh"))
                cs = _cer(ref, asr(std, wav, "zh"))
                cats[lang].append(cb)
                gap.append(cs - cb)
            else:
                cats["en"].append(_wer(ref, asr(std, wav, "en")))
        except Exception as e:
            print(" skip", rid, str(e)[:60])
            skipped += 1

    def avg(x):
        """Mean of *x*, or NaN when *x* is empty."""
        return statistics.mean(x) if x else float("nan")

    print(f"[{a.tag}] INTELLIGIBILITY zh-CER(Breeze)={avg(cats['zh']+cats['mix']):.3f} "
          f"(zh={avg(cats['zh']):.3f} mix={avg(cats['mix']):.3f}) en-WER={avg(cats['en']):.3f}")
    print(f"[{a.tag}] TW-ACCENT gap(generic-Breeze CER, >0=more TW)={avg(gap):+.3f}")
    print(f"[{a.tag}] QUALITY SQUIM-PESQ={avg(pq):.2f} STOI={avg(sq):.3f} MOS={avg(mq):.2f} (N={len(rows)})")
    if skipped:
        # N above counts listed clips; make it visible when some did not contribute fully.
        print(f"[{a.tag}] skipped {skipped} of {len(rows)} clips")


if __name__ == "__main__":
    main()