#!/usr/bin/env python3
"""Enriched eval gate — intelligibility + TW-accent + signal quality, in one run (moss-nano-venv).
Usage: python assess_quality.py --synth-dir SYNTH_DIR
       [--tag TAG] [--std-model medium]
Three axes:
1. INTELLIGIBILITY zh-CER via Breeze-ASR-25 (zh-TW), en-WER via a generic whisper.
2. TW-ACCENT dual-ASR gap = CER(generic whisper) - CER(Breeze zh-TW), per zh/mix clip.
>0 => Breeze understands it better than a generic recognizer => Taiwan accent present
(a *feature* for zh). Heuristic: a generic model is also just weaker, so read trends,
not absolutes — calibrate against a known-mainland clip if you need a zero point.
3. QUALITY torchaudio SQUIM: PESQ + STOI (reference-free) + MOS (subjective, teacher NMR).
Reads ref text + lang (zh/mix/en) from eval_big.jsonl, wav paths from SYNTH_DIR/synth.jsonl.
"""
import argparse, json, glob, re, statistics
# Root directory of the eval data: main() reads eval_big.jsonl and the
# teacher_corpus_en_expand/*.wav reference clips from here.
ZT = "/home/luigi/jetson-tts/mossnano/zhtw8k"
import opencc
# OpenCC converter built from the "t2s" (Traditional -> Simplified) config;
# _norm_zh runs both reference and hypothesis through it before comparing.
_t2s = opencc.OpenCC("t2s")
def _han(s): return re.sub(r"[^一-鿿]", "", s)
def _norm_zh(s):
    """Normalise Chinese text for CER scoring.

    A falsy *s* (None or "") is treated as the empty string. The text is
    converted with the module-level ``_t2s`` OpenCC converter and then reduced
    to Han characters by ``_han``.
    """
    text = s if s else ""
    simplified = _t2s.convert(text)
    return _han(simplified)
def _norm_en(s): return re.sub(r"[^a-z' ]", " ", (s or "").lower()).split()
def _lev(a, b):
m, n = len(a), len(b)
if m == 0 or n == 0: return max(m, n)
prev = list(range(n + 1))
for i in range(1, m + 1):
cur = [i] + [0] * n
for j in range(1, n + 1):
cur[j] = min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (a[i - 1] != b[j - 1]))
prev = cur
return prev[n]
def _cer(ref, hyp):
    """Character error rate of *hyp* against *ref* after ``_norm_zh`` normalisation.

    The edit distance between the normalised character sequences is divided by
    the normalised reference length, with the divisor floored at 1 so an empty
    reference does not divide by zero.
    """
    ref_chars = _norm_zh(ref)
    hyp_chars = _norm_zh(hyp)
    distance = _lev(list(ref_chars), list(hyp_chars))
    return distance / max(1, len(ref_chars))
def _wer(ref, hyp):
    """Word error rate of *hyp* against *ref* after ``_norm_en`` normalisation.

    The edit distance between the two word lists is divided by the number of
    reference words, with the divisor floored at 1 so an empty reference does
    not divide by zero.
    """
    ref_words = _norm_en(ref)
    hyp_words = _norm_en(hyp)
    distance = _lev(ref_words, hyp_words)
    return distance / max(1, len(ref_words))
def _read_jsonl(path):
    """Load a JSON-lines file as a list of parsed objects.

    Blank lines are ignored. The file is read as UTF-8 (the reference text is
    Chinese, so the locale default must not be relied on) and is closed before
    returning.
    """
    with open(path, encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


def main():
    """Score every clip listed in ``<synth-dir>/synth.jsonl`` and print a summary.

    Command-line arguments:
        --synth-dir  directory containing ``synth.jsonl`` (required).
        --tag        label printed in front of every summary line.
        --std-model  faster-whisper model name used as the generic recogniser.

    For each row the reference text and language are looked up by ``id`` in
    ``{ZT}/eval_big.jsonl``. Clips whose language is "zh" or "mix" are scored
    with CER by both the Breeze model and the generic model, and the
    difference (generic - Breeze) is recorded as the accent gap. All other
    clips are scored with WER by the generic model. Every clip also gets SQUIM
    PESQ/STOI/MOS estimates.

    A clip that raises during scoring, or whose id is not in the eval set, is
    reported on a " skip" line and does not abort the run.
    """
    import math

    ap = argparse.ArgumentParser()
    ap.add_argument("--synth-dir", required=True)
    ap.add_argument("--tag", default="")
    ap.add_argument("--std-model", default="medium", help="generic whisper for standard-zh + en pole")
    a = ap.parse_args()

    meta = {r["id"]: (r["lang"], r["text"]) for r in _read_jsonl(f"{ZT}/eval_big.jsonl")}
    rows = _read_jsonl(f"{a.synth_dir}/synth.jsonl")

    # Heavy dependencies are imported only after the arguments and both input
    # files have been read, so bad input fails before any model is loaded.
    from faster_whisper import WhisperModel
    breeze = WhisperModel("SoybeanMilk/faster-whisper-Breeze-ASR-25", device="cpu", compute_type="int8")
    std = WhisperModel(a.std_model, device="cpu", compute_type="int8")

    def asr(model, wav, lang):
        """Transcribe *wav* with *model* in language *lang*; return the joined segment text."""
        segs, _ = model.transcribe(wav, language=lang, beam_size=1)
        return "".join(s.text for s in segs)

    import torch, soundfile as sf, torchaudio.functional as AF
    from torchaudio.pipelines import SQUIM_OBJECTIVE, SQUIM_SUBJECTIVE
    Mo = SQUIM_OBJECTIVE.get_model().eval()
    Ms = SQUIM_SUBJECTIVE.get_model().eval()

    def load16k(p):
        """Read audio file *p* as a mono float32 tensor resampled to 16 kHz."""
        w, sr = sf.read(p)
        if getattr(w, "ndim", 1) > 1:
            w = w.mean(1)  # average across axis 1 to get a single channel
        t = torch.tensor(w, dtype=torch.float32)
        return AF.resample(t, sr, 16000) if sr != 16000 else t

    # Non-matching reference for the subjective (MOS) model: the first teacher
    # wav in sorted order. Without one, MOS is reported as nan instead of
    # crashing on an empty glob.
    ref_wavs = sorted(glob.glob(f"{ZT}/teacher_corpus_en_expand/*.wav"))
    if ref_wavs:
        nmr = load16k(ref_wavs[0])[None]
    else:
        nmr = None
        print(" warn no wav in teacher_corpus_en_expand; MOS will be nan")

    def quality(wav):
        """Return (PESQ, STOI, MOS) SQUIM estimates for *wav*; MOS is nan when unavailable."""
        t = load16k(wav)[None]
        with torch.no_grad():
            # The objective model's outputs are unpacked as STOI, PESQ, SI-SDR.
            stoi, pesq, _si_sdr = Mo(t)
            if nmr is None:
                mos = float("nan")
            else:
                # MOS is deliberately best-effort: a failure here must not
                # discard the PESQ/STOI values already computed.
                try:
                    mos = float(Ms(t, nmr)[0])
                except Exception:
                    mos = float("nan")
        return float(pesq[0]), float(stoi[0]), mos

    cats = {"zh": [], "mix": [], "en": []}
    gap = []
    pq = []
    sq = []
    mq = []
    skipped = 0
    for r in rows:
        cid = r.get("id")
        if cid not in meta:
            # An unknown id used to raise KeyError and abort the whole run.
            skipped += 1
            print(" skip", cid, "id not found in eval_big.jsonl")
            continue
        lang, ref = meta[cid]
        try:
            wav = r["wav"]
            pe, st_, mo = quality(wav)
            pq.append(pe)
            sq.append(st_)
            if not math.isnan(mo):
                mq.append(mo)
            if lang in ("zh", "mix"):
                cb = _cer(ref, asr(breeze, wav, "zh"))
                cs = _cer(ref, asr(std, wav, "zh"))
                cats[lang].append(cb)
                gap.append(cs - cb)
            else:
                cats["en"].append(_wer(ref, asr(std, wav, "en")))
        except Exception as e:
            # Per-clip best-effort: report and move on to the next clip.
            skipped += 1
            print(" skip", cid, str(e)[:60])

    def avg(x):
        """Mean of *x*, or nan when *x* is empty."""
        return statistics.mean(x) if x else float("nan")

    print(f"[{a.tag}] INTELLIGIBILITY zh-CER(Breeze)={avg(cats['zh']+cats['mix']):.3f} "
          f"(zh={avg(cats['zh']):.3f} mix={avg(cats['mix']):.3f}) en-WER={avg(cats['en']):.3f}")
    print(f"[{a.tag}] TW-ACCENT gap(generic-Breeze CER, >0=more TW)={avg(gap):+.3f}")
    print(f"[{a.tag}] QUALITY SQUIM-PESQ={avg(pq):.2f} STOI={avg(sq):.3f} MOS={avg(mq):.2f} (N={len(rows)})")
    if skipped:
        # N above counts every row; make it visible that some did not fully score.
        print(f"[{a.tag}] WARNING {skipped} of {len(rows)} clips hit an error and are missing from one or more averages")
# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()