#!/usr/bin/env python3
"""Entity- and name-rich training text for phone-attendant correctness:
phone/ext, email, address, price, serial, temperature, weather, person-count,
date — in zh / en / mix — plus EXHAUSTIVE English first-name coverage
(nltk names, 7.5k).

Output is PRE-NORMALIZED via text_norm so the VoxCPM2 teacher reads exactly
what the frontend will phonemize (train/infer consistency).

Usage: python gen_entity_texts.py --n 2600 --out entity_texts.jsonl
"""
import argparse
import itertools
import json
import random
from collections import Counter

from nltk.corpus import names as NLTK_NAMES

import text_norm as T

# De-duplicated, sorted pool of English first names. Sorting makes the pool
# order independent of set iteration order, so a fixed --seed is reproducible.
NAMES = sorted(set(NLTK_NAMES.words()))

# Vocabulary used to fill the sentence templates below.
SUR = ["王", "陳", "林", "李", "張", "黃", "吳", "劉", "蔡", "楊", "許", "鄭", "謝", "郭", "洪", "曾", "廖", "賴"]
TITLE = ["經理", "助理", "工程師", "專員", "主任", "課長", "副理", "顧問", "店長", "總監"]
CITY = ["台北市", "新北市", "台中市", "高雄市", "台南市", "桃園市", "新竹市"]
DIST = ["信義區", "大安區", "中山區", "板橋區", "三民區", "西屯區", "北區", "東區"]
ROAD = ["松高路", "忠孝東路", "中山北路", "文化路二段", "民生東路", "建國南路", "公益路"]
DOM = ["gmail.com", "company.com", "example.com.tw", "outlook.com", "yahoo.com.tw", "hotmail.com"]
WX_ZH = ["晴天", "多雲", "陰天", "短暫陣雨", "雷陣雨", "晴時多雲", "局部降雨"]
WX_EN = ["sunny", "cloudy", "partly cloudy", "light rain", "thunderstorms", "overcast"]
MON = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]


def ext() -> str:
    """Return a random 4-digit extension (1000-9999) as a string."""
    return f"{random.randint(1000,9999)}"


def mobile() -> str:
    """Return a random number formatted ``09NN-NNN-NNN``."""
    return f"09{random.randint(10,99)}-{random.randint(100,999)}-{random.randint(100,999)}"


def usphone() -> str:
    """Return a random number formatted ``NNN-NNN-NNNN``."""
    return f"{random.randint(200,999)}-{random.randint(100,999)}-{random.randint(1000,9999)}"


def order() -> str:
    """Return a random 6- or 7-digit order number as a string."""
    return f"{random.randint(100000,9999999)}"


def serial() -> str:
    """Return a random serial: two letters, four digits, one letter A-H.

    The two-letter prefix alphabet omits I, O and Q.
    """
    letters = "".join(random.choice("ABCDEFGHJKLMNPRSTUVWXYZ") for _ in range(2))
    return f"{letters}{random.randint(1000,9999)}{random.choice('ABCDEFGH')}"


def price() -> str:
    """Return one price from a fixed list, as a string."""
    return f"{random.choice([99,199,299,500,1299,2680,3990,12800])}"


def email(n: str) -> str:
    """Return ``<n lower-cased>.<random surname>@<random domain>``."""
    return f"{n.lower()}.{random.choice(['lin','wang','chen','lee'])}@{random.choice(DOM)}"


def zh_frames(n: str, n2: str) -> list:
    """Return the Chinese sentence templates filled in for names ``n`` / ``n2``.

    Every template is rendered on each call, so one call consumes random
    numbers for all ten sentences even if the caller keeps only one.
    """
    return [
        f"您好,幫您轉接給 {n} {random.choice(SUR)}{random.choice(TITLE)},他的分機是 {ext()}。",
        f"{n} 的手機號碼是 {mobile()},麻煩您記一下。",
        f"請把資料寄到 {email(n)},謝謝。",
        f"地址是{random.choice(CITY)}{random.choice(DIST)}{random.choice(ROAD)}{random.randint(1,199)}號{random.randint(1,20)}樓。",
        f"這台要 NT${price()},現在下訂再折 {random.choice([100,200,500])} 元。",
        f"您的序號是 {serial()},訂單編號是 {order()}。",
        f"今天氣溫 {random.randint(15,36)}°C,{random.choice(WX_ZH)},降雨機率 {random.choice([10,20,30,50,70,90])}%。",
        f"會議改到 {random.randint(2024,2026)}年{random.randint(1,12)}月{random.randint(1,28)}日下午{random.choice(['兩','三','四'])}點。",
        f"今天總共有 {random.randint(2,12)} 位客人預約,{n} 跟 {n2} 負責接待。",
        f"{n} {random.choice(SUR)}{random.choice(TITLE)}說 {n2} 會在明天上午到,分機 {ext()}。",
    ]


def en_frames(n: str, n2: str) -> list:
    """Return the English sentence templates filled in for names ``n`` / ``n2``."""
    return [
        f"Please call {n} at {usphone()} or extension {ext()}.",
        f"You can email {n} at {email(n)} anytime.",
        f"The total is ${price()}.{random.randint(0,99):02d}, and we offer a {random.choice([10,15,20,30])}% discount.",
        f"Your serial number is {serial()} and the order id is {order()}.",
        f"Tomorrow will be {random.choice(WX_EN)}, around {random.randint(40,95)} degrees, with a {random.choice([10,30,60,80])}% chance of rain.",
        f"The meeting with {n} is on {random.choice(MON)} {random.randint(1,28)}, {random.randint(2024,2026)}.",
        f"We have {random.randint(2,12)} people booked today; {n} and {n2} will host.",
        f"{n} said the temperature will drop to -{random.randint(1,9)} degrees Celsius tonight.",
    ]


def mix_frames(n: str, n2: str) -> list:
    """Return the code-switched (zh + en) templates filled in for name ``n``.

    ``n2`` is accepted so the signature matches ``zh_frames`` / ``en_frames``;
    none of the mixed templates use it.
    """
    return [
        f"{n} 的 email 是 {email(n)},分機 {ext()}。",
        f"幫 {n} 預約 {random.choice(MON)} {random.randint(1,28)} 號的 meeting,地點在{random.choice(DIST)}。",
        f"這個 order {order()} 總共 NT${price()},{n} 會 follow up。",
        f"{n} 說今天 {random.choice(WX_EN)},氣溫大概 {random.randint(18,33)}°C。",
        f"請 call {n} 的手機 {mobile()},或寄到 {email(n)}。",
    ]


def main():
    """Generate ``--n`` unique sentences and write them as JSONL to ``--out``.

    Command-line options:
        --n     number of rows to generate (default 2600)
        --out   output path (default ``entity_texts.jsonl``)
        --seed  seed for ``random`` (default 11)

    NOTE(review): in the collapsed source, the text between ``while len(rows)``
    and ``{a.out}`` was lost. The loop body, the JSONL write, the tally ``c``
    and the start of the summary message are a reconstruction from what is
    still visible (``name_cycle``, ``rows``, ``seen``, the ``"lang"`` /
    ``"text"`` row keys, ``dict(c)``). Confirm them against the original
    script before relying on the output.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=2600)
    ap.add_argument("--out", default="entity_texts.jsonl")
    ap.add_argument("--seed", type=int, default=11)
    a = ap.parse_args()
    random.seed(a.seed)

    # Shuffled once, then cycled: every name appears -> coverage
    name_cycle = itertools.cycle(random.sample(NAMES, len(NAMES)))
    rows = []
    seen = set()

    # NOTE(review): uniform choice between the three languages is an
    # assumption; the original mix/weights are not visible.
    builders = (("zh", zh_frames), ("en", en_frames), ("mix", mix_frames))
    while len(rows) < a.n:
        n = next(name_cycle)
        n2 = random.choice(NAMES)
        lang, build = random.choice(builders)
        raw = random.choice(build(n, n2))
        # NOTE(review): the module docstring says output is pre-normalized via
        # text_norm, but the actual call was lost; `T.normalize` is presumed.
        # TODO confirm the entry-point name and its arguments.
        text = T.normalize(raw)
        if text in seen:
            continue
        seen.add(text)
        rows.append({"lang": lang, "text": text})

    # ensure_ascii=False keeps the Chinese text readable in the JSONL file.
    with open(a.out, "w", encoding="utf-8") as fh:
        fh.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in rows)

    c = Counter(r["lang"] for r in rows)
    print(f"wrote {len(rows)} -> {a.out} {dict(c)} | names pool {len(NAMES)}")

    # Show a few samples; cap at len(rows) so a small --n does not make
    # random.sample raise ValueError.
    for r in random.sample(rows, min(6, len(rows))):
        print(" ", r["lang"], r["text"])


if __name__ == "__main__":
    main()