PrimeTTS / scripts /build_corpus_v3.py
Luigi's picture
one-click rebuild_voice.sh + generators + text pools
c950da9 verified
Raw
History Blame Contribute Delete
1.42 kB
#!/usr/bin/env python3
"""Assemble the v3 training manifest: existing clean corpus + new diverse mix + entity/name clips,
with EVERY text passed through text_norm.normalize so the manifest's phonemization matches the audio
(the teacher read the same spoken form). Idempotent on already-normalized entity rows.

Usage:
    python build_corpus_v3.py --out corpus_v3.norm.jsonl <manifest1.jsonl> <manifest2.jsonl> ...
"""
import argparse, json, os
import text_norm as T
def main():
    """Merge the input manifests into one normalized, de-duplicated JSONL file.

    Command-line arguments:
        --out: path of the JSONL manifest to write (truncated if it exists).
        manifests: one or more input JSONL manifests, processed in order.

    Manifest paths that do not exist are reported and skipped. Within each
    manifest, blank lines are ignored and every other line is parsed as a JSON
    object. A row is written only if its "target_audio" path exists on disk,
    that path has not been seen earlier in the run, and its text is at least
    2 characters long after T.normalize. Written rows carry the normalized
    text in "text". A one-line summary of rows read and written is printed at
    the end.

    Raises:
        json.JSONDecodeError: if a non-blank manifest line is not valid JSON.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", required=True)
    ap.add_argument("manifests", nargs="+")
    a = ap.parse_args()

    seen, n_in, n_out = set(), 0, 0
    with open(a.out, "w", encoding="utf-8") as o:
        for mf in a.manifests:
            if not os.path.exists(mf):
                print("SKIP missing", mf)
                continue
            # Read explicitly as UTF-8 (the output is written as UTF-8 with
            # ensure_ascii=False, so inputs must not depend on the locale),
            # and close each manifest as soon as it has been consumed.
            with open(mf, encoding="utf-8") as f:
                for line in f:
                    if not line.strip():
                        continue
                    r = json.loads(line)
                    n_in += 1
                    wav = r.get("target_audio")
                    if not wav or not os.path.exists(wav):
                        continue
                    if wav in seen:
                        continue
                    # Marked as seen before the length check, so a clip whose
                    # text is too short still blocks later duplicates.
                    seen.add(wav)
                    r["text"] = T.normalize(r.get("text", ""))
                    if len(r["text"]) < 2:
                        continue
                    o.write(json.dumps(r, ensure_ascii=False) + "\n")
                    n_out += 1
    print(f"BUILD_CORPUS_V3 in={n_in} out={n_out} -> {a.out}")
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()