PrimeTTS / scripts /gen_entity_texts.py
Luigi's picture
one-click rebuild_voice.sh + generators + text pools
7876105 verified
Raw
History Blame Contribute Delete
5.67 kB
#!/usr/bin/env python3
"""Entity- and name-rich training text for phone-attendant correctness: phone/ext, email, address,
price, serial, temperature, weather, person-count, date — in zh / en / mix — plus EXHAUSTIVE English
first-name coverage (nltk names, 7.5k). Output is PRE-NORMALIZED via text_norm so the VoxCPM2 teacher
reads exactly what the frontend will phonemize (train/infer consistency). Usage:
python gen_entity_texts.py --n 2600 --out entity_texts.jsonl
"""
import argparse, random, json, itertools
import text_norm as T
from nltk.corpus import names as NLTK_NAMES
# Pool of first names: every entry of the NLTK `names` corpus, de-duplicated.
# Sorting gives the pool a fixed order, so a seeded shuffle of it is reproducible.
NAMES = sorted(set(NLTK_NAMES.words()))
# --- Static vocabulary pools sampled by the sentence builders below. ---

# Single-character Chinese surnames.
SUR = list("王陳林李張黃吳劉蔡楊許鄭謝郭洪曾廖賴")

# Job titles appended after a surname.
TITLE = [
    "經理", "助理", "工程師", "專員", "主任",
    "課長", "副理", "顧問", "店長", "總監",
]

# Address components: city, district, road.
CITY = [
    "台北市", "新北市", "台中市", "高雄市",
    "台南市", "桃園市", "新竹市",
]
DIST = [
    "信義區", "大安區", "中山區", "板橋區",
    "三民區", "西屯區", "北區", "東區",
]
ROAD = [
    "松高路", "忠孝東路", "中山北路", "文化路二段",
    "民生東路", "建國南路", "公益路",
]

# E-mail domains.
DOM = [
    "gmail.com",
    "company.com",
    "example.com.tw",
    "outlook.com",
    "yahoo.com.tw",
    "hotmail.com",
]

# Weather descriptions, Chinese and English.
WX_ZH = [
    "晴天", "多雲", "陰天", "短暫陣雨",
    "雷陣雨", "晴時多雲", "局部降雨",
]
WX_EN = [
    "sunny",
    "cloudy",
    "partly cloudy",
    "light rain",
    "thunderstorms",
    "overcast",
]

# English month names, January first.
MON = [
    "January", "February", "March", "April",
    "May", "June", "July", "August",
    "September", "October", "November", "December",
]
def ext():
    """Return a random four-digit phone extension (1000-9999) as a string."""
    number = random.randint(1000, 9999)
    return str(number)
def mobile():
    """Return a random mobile number string shaped like ``09NN-NNN-NNN``.

    Each digit group is drawn independently and never starts with zero.
    """
    head = random.randint(10, 99)
    middle = random.randint(100, 999)
    tail = random.randint(100, 999)
    return "09{}-{}-{}".format(head, middle, tail)
def usphone():
    """Return a random US-style phone number string ``NNN-NNN-NNNN``.

    The first group is drawn from 200-999, the second from 100-999 and the
    last from 1000-9999.
    """
    area = random.randint(200, 999)
    exchange = random.randint(100, 999)
    line = random.randint(1000, 9999)
    return "-".join(str(part) for part in (area, exchange, line))
def order():
    """Return a random order number of six or seven digits as a string."""
    order_number = random.randint(100000, 9999999)
    return str(order_number)
def serial():
    """Return a random serial string: two letters, four digits, one letter.

    The two leading letters come from an alphabet that omits I, O and Q;
    the trailing letter is one of A-H.
    """
    alphabet = "ABCDEFGHJKLMNPRSTUVWXYZ"
    first = random.choice(alphabet)
    second = random.choice(alphabet)
    digits = random.randint(1000, 9999)
    suffix = random.choice("ABCDEFGH")
    return first + second + str(digits) + suffix
def price():
    """Return one of a fixed set of price points, as a digit string."""
    price_points = [99, 199, 299, 500, 1299, 2680, 3990, 12800]
    chosen = random.choice(price_points)
    return str(chosen)
def email(n):
    """Build a random e-mail address whose local part starts with the name `n`.

    The name is lower-cased and reduced to letters, digits, "-" and "_", so a
    name that contains whitespace or punctuation (e.g. "Jo Ann", "D'Arcy")
    still yields one unbroken address token instead of a string with a space
    or quote in the middle. A random romanized surname and a random domain
    from DOM complete the address.

    Args:
        n: First name to embed in the local part.

    Returns:
        A string of the form ``<name>.<surname>@<domain>``.
    """
    # Drop anything that would split or break the address token.
    local = "".join(ch for ch in n.lower() if ch.isalnum() or ch in "-_")
    surname = random.choice(['lin', 'wang', 'chen', 'lee'])
    domain = random.choice(DOM)
    return f"{local}.{surname}@{domain}"
def zh_frames(n, n2):
    """Return ten candidate Chinese sentences, each filled with random entities.

    Every call re-draws all random values (extension, phone, address, price,
    serial, weather, date, head-count), so the caller normally picks a single
    element of the returned list.

    Adjacent numeric and name fields are separated by explicit markers so they
    do not fuse into one token: 號 between street number and floor, 年 / 月
    inside the date, and 和 between the two names.

    Args:
        n: Primary first name embedded in the sentences.
        n2: Second first name, used only by the last two frames.

    Returns:
        A list of ten un-normalized sentence strings.
    """
    return [
        f"您好,幫您轉接給 {n} {random.choice(SUR)}{random.choice(TITLE)},他的分機是 {ext()}。",
        f"{n} 的手機號碼是 {mobile()},麻煩您記一下。",
        f"請把資料寄到 {email(n)},謝謝。",
        # 號 keeps the street number and the floor number from running together.
        f"地址是{random.choice(CITY)}{random.choice(DIST)}{random.choice(ROAD)}{random.randint(1, 199)}號{random.randint(1, 20)}樓。",
        f"這台要 NT${price()},現在下訂再折 {random.choice([100, 200, 500])} 元。",
        f"您的序號是 {serial()},訂單編號是 {order()}。",
        f"今天氣溫 {random.randint(15, 36)}°C,{random.choice(WX_ZH)},降雨機率 {random.choice([10, 20, 30, 50, 70, 90])}%。",
        # 年 / 月 delimit the date parts; without them the digits form one number.
        f"會議改到 {random.randint(2024, 2026)}年{random.randint(1, 12)}月{random.randint(1, 28)}日下午{random.choice(['兩', '三', '四'])}點。",
        # 和 separates the two names, mirroring "and" in the English frames.
        f"今天總共有 {random.randint(2, 12)} 位客人預約,{n} 和 {n2} 負責接待。",
        f"{n} {random.choice(SUR)}{random.choice(TITLE)}和 {n2} 會在明天上午到,分機 {ext()}。",
    ]
def en_frames(n, n2):
    """Return eight candidate English sentences filled with random entities.

    Random values are drawn frame by frame, in the order the frames are
    listed, so a seeded run always produces the same sentences.

    Args:
        n: Primary first name embedded in the sentences.
        n2: Second first name, used only by the "will host" frame.

    Returns:
        A list of eight un-normalized sentence strings.
    """
    sentences = []

    # Phone number + extension.
    phone = usphone()
    extension = ext()
    sentences.append(f"Please call {n} at {phone} or extension {extension}.")

    # E-mail address.
    address = email(n)
    sentences.append(f"You can email {n} at {address} anytime.")

    # Price with cents and a discount percentage.
    dollars = price()
    cents = random.randint(0, 99)
    discount = random.choice([10, 15, 20, 30])
    sentences.append(f"The total is ${dollars}.{cents:02d}, and we offer a {discount}% discount.")

    # Serial number + order id.
    serial_no = serial()
    order_id = order()
    sentences.append(f"Your serial number is {serial_no} and the order id is {order_id}.")

    # Weather forecast.
    sky = random.choice(WX_EN)
    degrees = random.randint(40, 95)
    rain = random.choice([10, 30, 60, 80])
    sentences.append(f"Tomorrow will be {sky}, around {degrees} degrees, with a {rain}% chance of rain.")

    # Calendar date.
    month = random.choice(MON)
    day = random.randint(1, 28)
    year = random.randint(2024, 2026)
    sentences.append(f"The meeting with {n} is on {month} {day}, {year}.")

    # Head-count with two hosts.
    guests = random.randint(2, 12)
    sentences.append(f"We have {guests} people booked today; {n} and {n2} will host.")

    # Negative temperature.
    below_zero = random.randint(1, 9)
    sentences.append(f"{n} said the temperature will drop to -{below_zero} degrees Celsius tonight.")

    return sentences
def mix_frames(n, n2):
    """Return five candidate code-switched (Chinese + English) sentences.

    Random values are drawn frame by frame, in the order the frames are
    listed, so a seeded run always produces the same sentences.

    Args:
        n: First name embedded in every sentence.
        n2: Accepted for signature parity with the other builders; unused.

    Returns:
        A list of five un-normalized sentence strings.
    """
    sentences = []

    # E-mail + extension.
    address = email(n)
    extension = ext()
    sentences.append(f"{n} 的 email 是 {address},分機 {extension}。")

    # Meeting booking: month, day, district.
    month = random.choice(MON)
    day = random.randint(1, 28)
    district = random.choice(DIST)
    sentences.append(f"幫 {n} 預約 {month} {day} 號的 meeting,地點在{district}。")

    # Order number + price.
    order_id = order()
    amount = price()
    sentences.append(f"這個 order {order_id} 總共 NT${amount},{n} 會 follow up。")

    # Weather + temperature.
    sky = random.choice(WX_EN)
    celsius = random.randint(18, 33)
    sentences.append(f"{n} 說今天 {sky},氣溫大概 {celsius}°C。")

    # Mobile number + e-mail.
    phone = mobile()
    contact = email(n)
    sentences.append(f"請 call {n} 的手機 {phone},或寄到 {contact}。")

    return sentences
def main():
    """Generate `--n` unique normalized sentences and write them as JSONL.

    Command-line options:
        --n     Number of rows to generate (default 2600).
        --out   Output path (default "entity_texts.jsonl").
        --seed  Seed for the `random` module (default 11).

    Each output line is a JSON object with keys "id", "text" (the sentence
    after `text_norm.normalize`) and "lang" ("zh", "en" or "mix"). After
    writing, a per-language count and up to six sample rows are printed.
    """
    import collections

    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=2600)
    ap.add_argument("--out", default="entity_texts.jsonl")
    ap.add_argument("--seed", type=int, default=11)
    a = ap.parse_args()
    random.seed(a.seed)

    # Walk the name pool in shuffled order, wrapping around when exhausted.
    # NOTE(review): two names are consumed per attempt, yet several frames embed
    # only one name or none, so a drawn name is not guaranteed to reach the
    # output; full name coverage depends on --n and on which frames are picked.
    name_cycle = itertools.cycle(random.sample(NAMES, len(NAMES)))
    builders = {"zh": zh_frames, "en": en_frames, "mix": mix_frames}

    rows = []
    seen = set()
    while len(rows) < a.n:
        n = next(name_cycle)
        n2 = next(name_cycle)
        bucket = random.choices(["zh", "en", "mix"], weights=[0.45, 0.30, 0.25])[0]
        frames = builders[bucket](n, n2)
        raw = random.choice(frames)
        norm = T.normalize(raw)
        # Skip duplicates and texts shorter than six characters after normalization.
        if norm in seen or len(norm) < 6:
            continue
        seen.add(norm)
        rows.append({"id": f"et{len(rows):05d}", "text": norm, "lang": bucket})

    with open(a.out, "w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

    c = collections.Counter(r["lang"] for r in rows)
    print(f"wrote {len(rows)} entity/name texts -> {a.out} {dict(c)} | names pool {len(NAMES)}")
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the preview size for small --n.
    for r in random.sample(rows, min(6, len(rows))):
        print(" ", r["lang"], r["text"])


if __name__ == "__main__":
    main()