PrimeTTS / scripts /gen_entity_texts.py

one-click rebuild_voice.sh + generators + text pools

7876105 verified 3 days ago

5.67 kB

	#!/usr/bin/env python3
	"""Entity- and name-rich training text for phone-attendant correctness: phone/ext, email, address,
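	price, serial, temperature, weather, person-count, date — in zh / en / mix — plus broad English
	first-name coverage (names are drawn from a shuffled cycle over the ~7.5k nltk names; a name only
	reaches the output when the sampled frame embeds it, and the default --n 2600 consumes at most two
	names per row, so it cannot cover the whole pool). Output is PRE-NORMALIZED via text_norm so the VoxCPM2 teacher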
	reads exactly what the frontend will phonemize (train/infer consistency). Usage:
	python gen_entity_texts.py --n 2600 --out entity_texts.jsonl
	"""
	import argparse, random, json, itertools
	import text_norm as T
	from nltk.corpus import names as NLTK_NAMES

	# Unique English first names from the nltk ``names`` corpus. Sorting pins the list order
	# (set iteration order is not stable), which the seeded shuffle in main() relies on.
	NAMES = sorted({*NLTK_NAMES.words()})
	# Chinese surnames paired with a job title to address a person in zh frames.
	SUR = [
	    "王", "陳", "林", "李", "張", "黃", "吳", "劉", "蔡",
	    "楊", "許", "鄭", "謝", "郭", "洪", "曾", "廖", "賴",
	]
	# Job titles appended after the surname.
	TITLE = [
	    "經理", "助理", "工程師", "專員", "主任",
	    "課長", "副理", "顧問", "店長", "總監",
	]
	# Address components: city, district and road are sampled independently.
	CITY = ["台北市", "新北市", "台中市", "高雄市", "台南市", "桃園市", "新竹市"]
	DIST = ["信義區", "大安區", "中山區", "板橋區", "三民區", "西屯區", "北區", "東區"]
	ROAD = ["松高路", "忠孝東路", "中山北路", "文化路二段", "民生東路", "建國南路", "公益路"]
	# Domains used for generated e-mail addresses.
	DOM = [
	    "gmail.com",
	    "company.com",
	    "example.com.tw",
	    "outlook.com",
	    "yahoo.com.tw",
	    "hotmail.com",
	]
	# Weather descriptions for zh frames and for en / mix frames respectively.
	WX_ZH = ["晴天", "多雲", "陰天", "短暫陣雨", "雷陣雨", "晴時多雲", "局部降雨"]
	WX_EN = ["sunny", "cloudy", "partly cloudy", "light rain", "thunderstorms", "overcast"]
	# English month names used in date frames.
	MON = [
	    "January", "February", "March", "April", "May", "June",
	    "July", "August", "September", "October", "November", "December",
	]
	def ext():
	    """Return a random phone extension between 1000 and 9999 as a string."""
	    number = random.randint(1000, 9999)
	    return str(number)
	def mobile():
	    """Return a random mobile number string shaped like ``09NN-NNN-NNN``.

	    No group is zero-padded: the first group is drawn from 10-99 and the
	    other two from 100-999.
	    """
	    head = random.randint(10, 99)
	    middle = random.randint(100, 999)
	    tail = random.randint(100, 999)
	    return "09" + str(head) + "-" + str(middle) + "-" + str(tail)
	def usphone():
	    """Return a random ``NNN-NNN-NNNN`` phone number string.

	    The groups are drawn from 200-999, 100-999 and 1000-9999 respectively.
	    """
	    area = random.randint(200, 999)
	    exchange = random.randint(100, 999)
	    line = random.randint(1000, 9999)
	    return "-".join(str(part) for part in (area, exchange, line))
	def order():
	    """Return a random six- or seven-digit order number as a string."""
	    number = random.randint(100000, 9999999)
	    return str(number)
	def serial():
	    """Return a random serial such as ``AB1234C``.

	    Layout: two letters from an alphabet that omits I, O and Q, a number
	    between 1000 and 9999, then one letter from A-H.
	    """
	    alphabet = "ABCDEFGHJKLMNPRSTUVWXYZ"
	    # Draw order (letter, letter, number, suffix) is kept so seeded runs match.
	    prefix = random.choice(alphabet) + random.choice(alphabet)
	    number = random.randint(1000, 9999)
	    suffix = random.choice("ABCDEFGH")
	    return prefix + str(number) + suffix
	def price():
	    """Return one of a fixed set of price amounts, rendered as a string."""
	    amounts = [99, 199, 299, 500, 1299, 2680, 3990, 12800]
	    picked = random.choice(amounts)
	    return str(picked)
	def email(n):
	    """Return a random e-mail address ``<name>.<surname>@<domain>`` for name *n*.

	    The name is lower-cased, the surname is one of four romanised options and
	    the domain is drawn from ``DOM``.
	    """
	    # NOTE(review): n is used verbatim apart from lower-casing; if the name pool
	    # contains spaces or punctuation the local part will carry them — confirm.
	    local_part = n.lower()
	    surname = random.choice(['lin','wang','chen','lee'])
	    domain = random.choice(DOM)
	    return local_part + "." + surname + "@" + domain

	def zh_frames(n, n2):
	    """Render the ten zh sentence frames with fresh random entities.

	    Args:
	        n: primary person name inserted into the name-bearing frames.
	        n2: secondary person name, used only by the last two frames.

	    Returns:
	        A list of ten strings, always in the same frame order. Every frame is
	        rendered on each call, so each one consumes random draws even when
	        the caller keeps a single sentence.
	    """
	    # Frames are built top to bottom so the random draw order stays fixed.
	    transfer_line = f"您好,幫您轉接給 {n} {random.choice(SUR)}{random.choice(TITLE)},他的分機是 {ext()}。"
	    mobile_line = f"{n} 的手機號碼是 {mobile()},麻煩您記一下。"
	    email_line = f"請把資料寄到 {email(n)},謝謝。"
	    address_line = f"地址是{random.choice(CITY)}{random.choice(DIST)}{random.choice(ROAD)}{random.randint(1,199)}號{random.randint(1,20)}樓。"
	    price_line = f"這台要 NT${price()},現在下訂再折 {random.choice([100,200,500])} 元。"
	    serial_line = f"您的序號是 {serial()},訂單編號是 {order()}。"
	    weather_line = f"今天氣溫 {random.randint(15,36)}°C,{random.choice(WX_ZH)},降雨機率 {random.choice([10,20,30,50,70,90])}%。"
	    meeting_line = f"會議改到 {random.randint(2024,2026)}年{random.randint(1,12)}月{random.randint(1,28)}日下午{random.choice(['兩','三','四'])}點。"
	    guests_line = f"今天總共有 {random.randint(2,12)} 位客人預約,{n} 跟 {n2} 負責接待。"
	    arrival_line = f"{n} {random.choice(SUR)}{random.choice(TITLE)}說 {n2} 會在明天上午到,分機 {ext()}。"
	    return [
	        transfer_line,
	        mobile_line,
	        email_line,
	        address_line,
	        price_line,
	        serial_line,
	        weather_line,
	        meeting_line,
	        guests_line,
	        arrival_line,
	    ]
	def en_frames(n, n2):
	    """Render the eight English sentence frames with fresh random entities.

	    Args:
	        n: primary person name inserted into the name-bearing frames.
	        n2: secondary person name, used only by the "will host" frame.

	    Returns:
	        A list of eight strings in a fixed frame order. All frames are rendered
	        on every call, so each consumes its random draws.
	    """
	    sentences = []
	    add = sentences.append
	    # Phone number plus extension.
	    add(f"Please call {n} at {usphone()} or extension {ext()}.")
	    # E-mail address.
	    add(f"You can email {n} at {email(n)} anytime.")
	    # Dollar amount with two-digit cents and a percentage.
	    add(f"The total is ${price()}.{random.randint(0,99):02d}, and we offer a {random.choice([10,15,20,30])}% discount.")
	    # Serial and order id.
	    add(f"Your serial number is {serial()} and the order id is {order()}.")
	    # Weather with temperature and rain probability.
	    add(f"Tomorrow will be {random.choice(WX_EN)}, around {random.randint(40,95)} degrees, with a {random.choice([10,30,60,80])}% chance of rain.")
	    # Month / day / year date.
	    add(f"The meeting with {n} is on {random.choice(MON)} {random.randint(1,28)}, {random.randint(2024,2026)}.")
	    # Person count with both names.
	    add(f"We have {random.randint(2,12)} people booked today; {n} and {n2} will host.")
	    # Negative temperature.
	    add(f"{n} said the temperature will drop to -{random.randint(1,9)} degrees Celsius tonight.")
	    return sentences
	def mix_frames(n, n2):
	    """Render the five code-switched (zh + en) sentence frames.

	    Args:
	        n: person name inserted into every frame.
	        n2: accepted so the signature matches the other frame builders;
	            no frame here uses it.

	    Returns:
	        A list of five strings in a fixed frame order. All frames are rendered
	        on every call, so each consumes its random draws.
	    """
	    # Frames are built top to bottom so the random draw order stays fixed.
	    contact_line = f"{n} 的 email 是 {email(n)},分機 {ext()}。"
	    booking_line = f"幫 {n} 預約 {random.choice(MON)} {random.randint(1,28)} 號的 meeting,地點在{random.choice(DIST)}。"
	    order_line = f"這個 order {order()} 總共 NT${price()},{n} 會 follow up。"
	    weather_line = f"{n} 說今天 {random.choice(WX_EN)},氣溫大概 {random.randint(18,33)}°C。"
	    callback_line = f"請 call {n} 的手機 {mobile()},或寄到 {email(n)}。"
	    return [contact_line, booking_line, order_line, weather_line, callback_line]

	def main():
	    """Generate the entity/name text pool and write it as JSONL.

	    Command-line options:
	        --n     number of unique rows to emit (default 2600)
	        --out   output JSONL path (default "entity_texts.jsonl")
	        --seed  seed for the ``random`` module (default 11)

	    Each output line is a JSON object
	    ``{"id": "etNNNNN", "text": <normalized text>, "lang": "zh" | "en" | "mix"}``.
	    A candidate is dropped when its normalized text was already emitted or is
	    shorter than six characters. After writing, a summary and up to six
	    sample rows are printed.
	    """
	    import collections

	    ap = argparse.ArgumentParser()
	    ap.add_argument("--n", type=int, default=2600)
	    ap.add_argument("--out", default="entity_texts.jsonl")
	    ap.add_argument("--seed", type=int, default=11)
	    a = ap.parse_args()
	    random.seed(a.seed)

	    # Names come two per attempt from a shuffled cycle, so the pool is walked
	    # evenly. A name only reaches the output when the sampled frame embeds it,
	    # so full pool coverage is NOT guaranteed for a given --n.
	    name_cycle = itertools.cycle(random.sample(NAMES, len(NAMES)))
	    builders = {"zh": zh_frames, "en": en_frames, "mix": mix_frames}
	    rows = []
	    seen = set()
	    while len(rows) < a.n:
	        n = next(name_cycle)
	        n2 = next(name_cycle)
	        bucket = random.choices(["zh", "en", "mix"], weights=[0.45, 0.30, 0.25])[0]
	        # The builder renders every frame; exactly one is kept.
	        raw = random.choice(builders[bucket](n, n2))
	        norm = T.normalize(raw)
	        if norm in seen or len(norm) < 6:
	            continue
	        seen.add(norm)
	        rows.append({"id": f"et{len(rows):05d}", "text": norm, "lang": bucket})

	    with open(a.out, "w", encoding="utf-8") as f:
	        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in rows)

	    c = collections.Counter(r["lang"] for r in rows)
	    # Plain "|" separator: the previous "\|" was an invalid escape sequence.
	    print(f"wrote {len(rows)} entity/name texts -> {a.out} {dict(c)} | names pool {len(NAMES)}")
	    # Cap the preview at the number of rows so a small --n does not raise ValueError.
	    for r in random.sample(rows, min(6, len(rows))):
	        print(" ", r["lang"], r["text"])


	if __name__ == "__main__":
	    main()