#!/usr/bin/env python3
"""Assemble the v3 training manifest: existing clean corpus + new diverse mix + entity/name clips,
with EVERY text passed through text_norm.normalize so the manifest's phonemization matches the audio
(the teacher read the same spoken form). Idempotent on already-normalized entity rows. Usage:
  python build_corpus_v3.py --out corpus_v3.norm.jsonl <manifest1.jsonl> <manifest2.jsonl> ...
"""
import argparse, json, os
import text_norm as T

def main():
    """Merge JSONL manifests into a single manifest with normalized text.

    Command line: ``--out PATH`` (required) followed by one or more manifest
    paths. Manifest paths that do not exist are reported and skipped.

    A row is written to the output only if:
      * its ``target_audio`` value is non-empty and that path exists on disk,
      * that ``target_audio`` value has not already been seen in an earlier row
        (first occurrence wins, across all manifests), and
      * its ``text``, after ``text_norm.normalize``, is at least 2 characters.

    Prints a one-line summary with the number of rows read and written.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", required=True)
    ap.add_argument("manifests", nargs="+")
    a = ap.parse_args()

    seen = set()  # target_audio paths already accepted or rejected after the existence check
    n_in = 0      # non-blank rows parsed from all manifests
    n_out = 0     # rows written to the output
    with open(a.out, "w", encoding="utf-8") as o:
        for mf in a.manifests:
            if not os.path.exists(mf):
                print("SKIP missing", mf)
                continue
            # Read explicitly as UTF-8 so decoding does not depend on the
            # locale and matches the UTF-8 / ensure_ascii=False output below;
            # the context manager closes the handle as soon as the manifest
            # has been consumed.
            with open(mf, encoding="utf-8") as src:
                for line in src:
                    if not line.strip():
                        continue
                    r = json.loads(line)
                    n_in += 1
                    wav = r.get("target_audio")
                    if not wav or not os.path.exists(wav):
                        continue
                    if wav in seen:
                        continue
                    # Marked as seen before the text-length check, so a later
                    # duplicate of a row dropped for short text is dropped too.
                    seen.add(wav)
                    r["text"] = T.normalize(r.get("text", ""))
                    if len(r["text"]) < 2:
                        continue
                    o.write(json.dumps(r, ensure_ascii=False) + "\n")
                    n_out += 1
    print(f"BUILD_CORPUS_V3 in={n_in} out={n_out} -> {a.out}")

# Run the manifest build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()