| |
| """Assemble the v3 training manifest: existing clean corpus + new diverse mix + entity/name clips, |
| with EVERY text passed through text_norm.normalize so the manifest's phonemization matches the audio |
| (the teacher read the same spoken form). Idempotent on already-normalized entity rows. Usage: |
| python build_corpus_v3.py --out corpus_v3.norm.jsonl <manifest1.jsonl> <manifest2.jsonl> ... |
| """ |
| import argparse, json, os |
| import text_norm as T |
|
|
def main():
    """Merge JSONL manifests into one normalized, de-duplicated manifest.

    Command line: ``--out PATH`` (required) followed by one or more manifest
    paths, processed in the order given. Manifests that do not exist are
    reported with ``SKIP missing`` and skipped.

    For every non-blank manifest line (one JSON object per line) the row is
    kept only if:

    * its ``target_audio`` value is non-empty and names an existing file,
    * that ``target_audio`` value has not been seen in an earlier row, and
    * its ``text``, after ``text_norm.normalize``, has at least 2 characters.

    Kept rows are written to ``--out`` as UTF-8 JSON lines with ``text``
    replaced by its normalized form. A final summary line reports the number
    of rows parsed (``in``) and written (``out``).

    Raises:
        SystemExit: via ``argparse`` on bad arguments, or when ``--out`` is
            also listed as an input manifest.
        json.JSONDecodeError: if a non-blank manifest line is not valid JSON.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", required=True)
    ap.add_argument("manifests", nargs="+")
    a = ap.parse_args()

    # --out is opened in "w" mode (truncated) before any input is read, so an
    # input that resolves to the same file would be wiped before being parsed.
    out_real = os.path.realpath(a.out)
    for mf in a.manifests:
        if os.path.realpath(mf) == out_real:
            ap.error(
                f"--out {a.out} is also an input manifest; "
                "it would be truncated before being read"
            )

    seen, n_in, n_out = set(), 0, 0
    with open(a.out, "w", encoding="utf-8") as o:
        for mf in a.manifests:
            if not os.path.exists(mf):
                print("SKIP missing", mf)
                continue
            # Read as UTF-8 to match the encoding the output is written in,
            # and close each manifest as soon as it has been consumed.
            with open(mf, encoding="utf-8") as f:
                for line in f:
                    if not line.strip():
                        continue
                    r = json.loads(line)
                    n_in += 1
                    wav = r.get("target_audio")
                    if not wav or not os.path.exists(wav):
                        continue
                    if wav in seen:
                        continue
                    # Marked as seen before the text check: a later duplicate
                    # of this clip is dropped even if this row is rejected.
                    seen.add(wav)
                    # ``or ""`` also covers an explicit JSON null for "text".
                    r["text"] = T.normalize(r.get("text") or "")
                    if len(r["text"]) < 2:
                        continue
                    o.write(json.dumps(r, ensure_ascii=False) + "\n")
                    n_out += 1
    print(f"BUILD_CORPUS_V3 in={n_in} out={n_out} -> {a.out}")
|
|
# Run the build only when executed as a script, so importing this module
# has no side effects.
if __name__ == "__main__":
    main()
|
|