#!/usr/bin/env python3
"""Assemble the v3 training manifest.

Combines the existing clean corpus, the new diverse mix, and the entity/name
clips, with EVERY text passed through text_norm.normalize so the manifest's
phonemization matches the audio (the teacher read the same spoken form).
Idempotent on already-normalized entity rows.

Usage:
    python build_corpus_v3.py --out corpus_v3.norm.jsonl MANIFEST [MANIFEST ...]
"""
import argparse
import json
import os

import text_norm as T


def main() -> None:
    """Merge the input JSONL manifests into one normalized, de-duplicated file.

    Command-line arguments:
        --out      Path of the JSONL manifest to write (required).
        manifests  One or more input JSONL manifest paths.

    For every non-blank line of every existing input manifest the record is
    parsed as JSON and counted as "in". A record is written (and counted as
    "out") only when all of the following hold:

    * its "target_audio" value is non-empty and names an existing path;
    * that path has not been seen earlier in this run;
    * its "text", after ``T.normalize``, is at least 2 characters long.

    Input manifests that do not exist are reported on stdout and skipped.
    A one-line summary with both counters is printed at the end.

    Raises:
        json.JSONDecodeError: if a non-blank input line is not valid JSON.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", required=True)
    parser.add_argument("manifests", nargs="+")
    args = parser.parse_args()

    seen = set()
    n_in = 0
    n_out = 0
    with open(args.out, "w", encoding="utf-8") as out_file:
        for manifest in args.manifests:
            if not os.path.exists(manifest):
                print("SKIP missing", manifest)
                continue
            # Read as UTF-8 to mirror how the output is written, and use a
            # context manager so each input handle is closed promptly.
            with open(manifest, encoding="utf-8") as in_file:
                for line in in_file:
                    if not line.strip():
                        continue
                    row = json.loads(line)
                    n_in += 1
                    wav = row.get("target_audio")
                    if not wav or not os.path.exists(wav):
                        continue
                    if wav in seen:
                        continue
                    # Registered before the text-length check on purpose: a
                    # clip whose first row is dropped for short text is still
                    # treated as seen for later rows.
                    seen.add(wav)
                    row["text"] = T.normalize(row.get("text", ""))
                    if len(row["text"]) < 2:
                        continue
                    out_file.write(json.dumps(row, ensure_ascii=False) + "\n")
                    n_out += 1
    print(f"BUILD_CORPUS_V3 in={n_in} out={n_out} -> {args.out}")


if __name__ == "__main__":
    main()