| |
| """Enriched eval gate — intelligibility + TW-accent + signal quality, in one run (moss-nano-venv). |
| |
| Usage: python assess_quality.py --synth-dir <dir> --tag <name> [--std-model medium] |
| |
| Three axes: |
| 1. INTELLIGIBILITY zh-CER via Breeze-ASR-25 (zh-TW), en-WER via a generic whisper. |
| 2. TW-ACCENT dual-ASR gap = CER(generic whisper) - CER(Breeze zh-TW), per zh clip. |
| >0 => Breeze understands it better than a generic recognizer => Taiwan accent present |
| (a *feature* for zh). Heuristic: a generic model is also just weaker, so read trends, |
| not absolutes — calibrate against a known-mainland clip if you need a zero point. |
| 3. QUALITY torchaudio SQUIM: PESQ + STOI (reference-free) + MOS (subjective, teacher NMR). |
| Reads ref text + lang (zh/mix/en) from eval_big.jsonl, wavs from <synth-dir>/synth.jsonl. |
| """ |
| import argparse, json, glob, re, statistics |
|
|
# Root of the zhtw8k data directory: main() reads eval_big.jsonl and the
# teacher_corpus_en_expand/ wavs from here.
ZT = "/home/luigi/jetson-tts/mossnano/zhtw8k"
|
|
| import opencc |
# Single shared OpenCC converter (config "t2s"), built once at import time so
# every reference/hypothesis string goes through the same normalization.
_t2s = opencc.OpenCC("t2s")
| def _han(s): return re.sub(r"[^一-鿿]", "", s) |
def _norm_zh(s):
    """Normalize Chinese text for CER: run the shared "t2s" converter, keep Han only.

    ``None`` (or any falsy value) is treated as the empty string.
    """
    converted = _t2s.convert(s or "")
    return _han(converted)
| def _norm_en(s): return re.sub(r"[^a-z' ]", " ", (s or "").lower()).split() |
| def _lev(a, b): |
| m, n = len(a), len(b) |
| if m == 0 or n == 0: return max(m, n) |
| prev = list(range(n + 1)) |
| for i in range(1, m + 1): |
| cur = [i] + [0] * n |
| for j in range(1, n + 1): |
| cur[j] = min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (a[i - 1] != b[j - 1])) |
| prev = cur |
| return prev[n] |
def _cer(ref, hyp):
    """Character error rate of *hyp* against *ref* after Chinese normalization.

    Edit distance over normalized characters divided by the reference length;
    the denominator is clamped to 1 so an empty reference does not divide by zero.
    """
    ref_chars = list(_norm_zh(ref))
    hyp_chars = list(_norm_zh(hyp))
    return _lev(ref_chars, hyp_chars) / max(1, len(ref_chars))
def _wer(ref, hyp):
    """Word error rate of *hyp* against *ref* after English normalization.

    Edit distance over normalized word lists divided by the reference word
    count; the denominator is clamped to 1 so an empty reference is safe.
    """
    ref_words = _norm_en(ref)
    hyp_words = _norm_en(hyp)
    return _lev(ref_words, hyp_words) / max(1, len(ref_words))
|
|
|
|
def main():
    """Run the three-axis eval gate over one synth directory and print a summary.

    Inputs:
      * ``<data-dir>/eval_big.jsonl`` - one JSON object per line with ``id``,
        ``lang`` and ``text`` (the reference transcript).
      * ``<synth-dir>/synth.jsonl`` - one JSON object per line with ``id`` and
        ``wav`` (path of the synthesized clip).
      * ``<data-dir>/teacher_corpus_en_expand/*.wav`` - the first file (sorted)
        is used as the reference clip passed to the SQUIM subjective model.

    Prints three ``[tag]``-prefixed lines: intelligibility (CER/WER), the
    TW-accent gap, and the SQUIM quality averages. A clip that fails at any
    stage is reported with a `` skip`` line and left out of every average.

    Raises:
        FileNotFoundError: if no teacher wav exists to serve as MOS reference.
    """
    import math

    ap = argparse.ArgumentParser()
    ap.add_argument("--synth-dir", required=True)
    ap.add_argument("--tag", default="")
    ap.add_argument("--std-model", default="medium", help="generic whisper for standard-zh + en pole")
    # Defaults to the historical hard-coded location, so existing invocations are unchanged.
    ap.add_argument("--data-dir", default=ZT,
                    help="directory holding eval_big.jsonl and teacher_corpus_en_expand/")
    a = ap.parse_args()

    def read_jsonl(path):
        # Close the handle deterministically, decode as UTF-8 regardless of
        # locale (the transcripts contain Chinese), and tolerate blank lines.
        with open(path, encoding="utf-8") as fh:
            return [json.loads(line) for line in fh if line.strip()]

    meta = {r["id"]: (r["lang"], r["text"]) for r in read_jsonl(f"{a.data_dir}/eval_big.jsonl")}
    rows = read_jsonl(f"{a.synth_dir}/synth.jsonl")

    # Fail fast with a readable message before the (slow) model loads, instead
    # of an IndexError on an empty glob later.
    teacher_dir = f"{a.data_dir}/teacher_corpus_en_expand"
    teacher_wavs = sorted(glob.glob(f"{teacher_dir}/*.wav"))
    if not teacher_wavs:
        raise FileNotFoundError(f"no *.wav under {teacher_dir} (needed as the SQUIM MOS reference)")

    # Heavy imports stay local so argument errors and --help return quickly.
    from faster_whisper import WhisperModel
    breeze = WhisperModel("SoybeanMilk/faster-whisper-Breeze-ASR-25", device="cpu", compute_type="int8")
    std = WhisperModel(a.std_model, device="cpu", compute_type="int8")

    def asr(model, wav, lang):
        # Transcribe one clip and concatenate the segment texts.
        segs, _ = model.transcribe(wav, language=lang, beam_size=1)
        return "".join(s.text for s in segs)

    import torch
    import soundfile as sf
    import torchaudio.functional as AF
    from torchaudio.pipelines import SQUIM_OBJECTIVE, SQUIM_SUBJECTIVE
    Mo = SQUIM_OBJECTIVE.get_model().eval()
    Ms = SQUIM_SUBJECTIVE.get_model().eval()

    def load16k(p):
        # Read a wav, down-mix multi-channel audio to mono, resample to 16 kHz if needed.
        w, sr = sf.read(p)
        if getattr(w, "ndim", 1) > 1:
            w = w.mean(1)
        t = torch.tensor(w, dtype=torch.float32)
        return AF.resample(t, sr, 16000) if sr != 16000 else t

    nmr = load16k(teacher_wavs[0])[None]

    def quality(wav):
        # Returns (PESQ, STOI, MOS); MOS is NaN when the subjective model fails.
        t = load16k(wav)[None]
        with torch.no_grad():
            st, pe, _si_sdr = Mo(t)
            try:
                mos = float(Ms(t, nmr)[0])
            except Exception:
                # Deliberate best-effort: MOS is optional, PESQ/STOI still count.
                mos = float("nan")
        return float(pe[0]), float(st[0]), mos

    cats = {"zh": [], "mix": [], "en": []}
    gap, pq, sq, mq = [], [], [], []
    for r in rows:
        rid = r.get("id", "?") if isinstance(r, dict) else "?"
        try:
            # Lookups are inside the try so one unknown id / missing key skips
            # that clip instead of aborting the whole run.
            lang, ref = meta[r["id"]]
            wav = r["wav"]
            pe, st_, mo = quality(wav)
            is_zh = lang in ("zh", "mix")
            if is_zh:
                cb = _cer(ref, asr(breeze, wav, "zh"))
                cs = _cer(ref, asr(std, wav, "zh"))
            else:
                we = _wer(ref, asr(std, wav, "en"))
        except Exception as e:
            print(" skip", rid, str(e)[:60])
            continue
        # Record results only once every stage succeeded, so a clip is either
        # fully counted or fully skipped (no quality score without an ASR score).
        pq.append(pe)
        sq.append(st_)
        if not math.isnan(mo):
            mq.append(mo)
        if is_zh:
            cats[lang].append(cb)
            gap.append(cs - cb)
        else:
            cats["en"].append(we)

    def avg(x):
        return statistics.mean(x) if x else float("nan")

    print(f"[{a.tag}] INTELLIGIBILITY zh-CER(Breeze)={avg(cats['zh']+cats['mix']):.3f} "
          f"(zh={avg(cats['zh']):.3f} mix={avg(cats['mix']):.3f}) en-WER={avg(cats['en']):.3f}")
    print(f"[{a.tag}] TW-ACCENT gap(generic-Breeze CER, >0=more TW)={avg(gap):+.3f}")
    print(f"[{a.tag}] QUALITY SQUIM-PESQ={avg(pq):.2f} STOI={avg(sq):.3f} MOS={avg(mq):.2f} (N={len(rows)})")
|
|
|
|
# Run the eval gate only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|