scripts/build_corpus_v3.py · Luigi/PrimeTTS at main

PrimeTTS / scripts /build_corpus_v3.py

one-click rebuild_voice.sh + generators + text pools

c950da9 verified 11 days ago

1.42 kB

	#!/usr/bin/env python3
	"""Assemble the v3 training manifest: existing clean corpus + new diverse mix + entity/name clips,
	with EVERY text passed through text_norm.normalize so the manifest's phonemization matches the audio
	(the teacher read the same spoken form). Idempotent on already-normalized entity rows. Usage:
	python build_corpus_v3.py --out corpus_v3.norm.jsonl <manifest1.jsonl> <manifest2.jsonl> ...
	"""
	import argparse, json, os
	import text_norm as T

	def main():
	    """Merge JSONL manifests into one normalized, de-duplicated manifest.

	    Command-line arguments:
	        --out       Path of the JSONL manifest to write (opened in "w" mode,
	                    so an existing file is overwritten).
	        manifests   One or more input JSONL manifests, processed in order.

	    For every non-blank input line the row is parsed as JSON and kept only
	    if its "target_audio" path exists on disk and has not been seen before
	    (the first occurrence wins across all manifests). The row's "text" is
	    replaced by ``T.normalize(text)``; rows whose normalized text is shorter
	    than two characters are dropped. Manifest paths that do not exist are
	    reported and skipped. A one-line summary with the number of parsed and
	    written rows is printed at the end.

	    Raises:
	        json.JSONDecodeError: if a non-blank manifest line is not valid JSON.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--out", required=True)
	    parser.add_argument("manifests", nargs="+")
	    args = parser.parse_args()

	    seen = set()
	    n_in = 0
	    n_out = 0
	    with open(args.out, "w", encoding="utf-8") as out_file:
	        for manifest_path in args.manifests:
	            if not os.path.exists(manifest_path):
	                print("SKIP missing", manifest_path)
	                continue
	            # Read with an explicit encoding (the output is written as UTF-8
	            # with ensure_ascii=False, so inputs must not depend on the
	            # platform default) and close each manifest deterministically.
	            with open(manifest_path, encoding="utf-8") as manifest:
	                for line in manifest:
	                    if not line.strip():
	                        continue
	                    row = json.loads(line)
	                    # n_in counts every parsed row, including ones dropped below.
	                    n_in += 1
	                    wav = row.get("target_audio")
	                    if not wav or not os.path.exists(wav):
	                        continue
	                    if wav in seen:
	                        continue
	                    # Recorded before the text-length check: a later duplicate
	                    # of this audio path is skipped even if this row is dropped.
	                    seen.add(wav)
	                    row["text"] = T.normalize(row.get("text", ""))
	                    if len(row["text"]) < 2:
	                        continue
	                    out_file.write(json.dumps(row, ensure_ascii=False) + "\n")
	                    n_out += 1
	    print(f"BUILD_CORPUS_V3 in={n_in} out={n_out} -> {args.out}")

	# Run the build only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()