| |
| """Entity- and name-rich training text for phone-attendant correctness: phone/ext, email, address, |
| price, serial, temperature, weather, person-count, date — in zh / en / mix — plus EXHAUSTIVE English |
| first-name coverage (nltk names, 7.5k). Output is PRE-NORMALIZED via text_norm so the VoxCPM2 teacher |
| reads exactly what the frontend will phonemize (train/infer consistency). Usage: |
| python gen_entity_texts.py --n 2600 --out entity_texts.jsonl |
| """ |
| import argparse, random, json, itertools |
| import text_norm as T |
| from nltk.corpus import names as NLTK_NAMES |
|
|
# Unique first names from the NLTK "names" corpus; sorted so the pool has a
# stable order before it is shuffled with the seeded RNG.
NAMES = sorted(set(NLTK_NAMES.words()))

# Chinese surnames and job titles, combined as "<surname><title>".
SUR = [
    "王", "陳", "林", "李", "張", "黃", "吳", "劉", "蔡",
    "楊", "許", "鄭", "謝", "郭", "洪", "曾", "廖", "賴",
]
TITLE = [
    "經理", "助理", "工程師", "專員", "主任",
    "課長", "副理", "顧問", "店長", "總監",
]

# Building blocks for street addresses: city, district, road.
CITY = ["台北市", "新北市", "台中市", "高雄市", "台南市", "桃園市", "新竹市"]
DIST = ["信義區", "大安區", "中山區", "板橋區", "三民區", "西屯區", "北區", "東區"]
ROAD = ["松高路", "忠孝東路", "中山北路", "文化路二段", "民生東路", "建國南路", "公益路"]

# Domains used for generated e-mail addresses.
DOM = [
    "gmail.com",
    "company.com",
    "example.com.tw",
    "outlook.com",
    "yahoo.com.tw",
    "hotmail.com",
]

# Weather descriptions in Chinese and in English.
WX_ZH = ["晴天", "多雲", "陰天", "短暫陣雨", "雷陣雨", "晴時多雲", "局部降雨"]
WX_EN = ["sunny", "cloudy", "partly cloudy", "light rain", "thunderstorms", "overcast"]

# English month names, January through December.
MON = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]
def ext():
    """Return a random four-digit extension (1000-9999) as a string."""
    return str(random.randint(1000, 9999))
def mobile():
    """Return a random mobile-style number formatted as ``09NN-NNN-NNN``.

    The three groups are drawn in left-to-right order; none of them starts
    with a zero because each is drawn from a range without leading zeros.
    """
    prefix = random.randint(10, 99)
    middle = random.randint(100, 999)
    tail = random.randint(100, 999)
    return "09%d-%d-%d" % (prefix, middle, tail)
def usphone():
    """Return a random US-style phone number formatted as ``NNN-NNN-NNNN``."""
    area = random.randint(200, 999)
    exchange = random.randint(100, 999)
    line = random.randint(1000, 9999)
    return "-".join(str(part) for part in (area, exchange, line))
def order():
    """Return a random order number of six or seven digits as a string."""
    return str(random.randint(100000, 9999999))
def serial():
    """Return a random serial such as ``AB1234C``.

    Layout: two prefix letters, four digits (1000-9999), one suffix letter
    from A-H. The prefix alphabet omits I, O and Q.
    """
    alphabet = "ABCDEFGHJKLMNPRSTUVWXYZ"
    # Draw order matters for seeded reproducibility: letter, letter, digits, suffix.
    prefix = random.choice(alphabet) + random.choice(alphabet)
    digits = random.randint(1000, 9999)
    suffix = random.choice("ABCDEFGH")
    return prefix + str(digits) + suffix
def price():
    """Return one price, as a string, picked at random from a fixed list."""
    amounts = [99, 199, 299, 500, 1299, 2680, 3990, 12800]
    return str(random.choice(amounts))
def email(n):
    """Build a random e-mail address of the form ``<name>.<surname>@<domain>``.

    Args:
        n: First name used for the local part. It is lower-cased and any
            whitespace inside it is removed, so a multi-word name such as
            "Jo Ann" yields "joann" instead of a local part with a space.

    Returns:
        The address as a string. The surname is one of a small fixed set and
        the domain is drawn from ``DOM``.
    """
    # Defensive: a space inside the local part does not form a valid address
    # and would be read out as two separate tokens. Single-token names pass
    # through unchanged.
    local = "".join(n.lower().split())
    # Surname is drawn before the domain, keeping the seeded draw order.
    surname = random.choice(["lin", "wang", "chen", "lee"])
    return f"{local}.{surname}@{random.choice(DOM)}"
|
|
def zh_frames(n, n2):
    """Render every Chinese-language sentence template for one pair of names.

    All ten templates are rendered on each call, so each call consumes random
    draws for all of them; the caller selects one of the returned strings.

    Args:
        n: Primary first name inserted into the sentences.
        n2: Second first name, used by the reception and arrival sentences.

    Returns:
        A list of ten strings: extension transfer, mobile number, e-mail,
        street address, price, serial/order number, weather, meeting date,
        guest count, and a name-plus-title arrival notice.
    """
    pick = random.choice
    roll = random.randint

    # Statements stay in template order so the sequence of random draws for a
    # given seed is unchanged.
    surname, title, extension = pick(SUR), pick(TITLE), ext()
    transfer = f"您好,幫您轉接給 {n} {surname}{title},他的分機是 {extension}。"

    cell = f"{n} 的手機號碼是 {mobile()},麻煩您記一下。"
    mail = f"請把資料寄到 {email(n)},謝謝。"

    city, district, road = pick(CITY), pick(DIST), pick(ROAD)
    number, floor = roll(1, 199), roll(1, 20)
    address = f"地址是{city}{district}{road}{number}號{floor}樓。"

    cost, rebate = price(), pick([100, 200, 500])
    offer = f"這台要 NT${cost},現在下訂再折 {rebate} 元。"

    serial_no, order_no = serial(), order()
    tracking = f"您的序號是 {serial_no},訂單編號是 {order_no}。"

    degrees, sky, rain = roll(15, 36), pick(WX_ZH), pick([10, 20, 30, 50, 70, 90])
    weather = f"今天氣溫 {degrees}°C,{sky},降雨機率 {rain}%。"

    year, month, day = roll(2024, 2026), roll(1, 12), roll(1, 28)
    hour = pick(["兩", "三", "四"])
    meeting = f"會議改到 {year}年{month}月{day}日下午{hour}點。"

    guests = f"今天總共有 {roll(2, 12)} 位客人預約,{n} 跟 {n2} 負責接待。"

    surname, title, extension = pick(SUR), pick(TITLE), ext()
    arrival = f"{n} {surname}{title}說 {n2} 會在明天上午到,分機 {extension}。"

    return [
        transfer,
        cell,
        mail,
        address,
        offer,
        tracking,
        weather,
        meeting,
        guests,
        arrival,
    ]
def en_frames(n, n2):
    """Render every English sentence template for one pair of names.

    All eight templates are rendered on each call; the caller selects one.

    Args:
        n: Primary first name inserted into the sentences.
        n2: Second first name, used only by the hosting sentence.

    Returns:
        A list of eight strings: phone/extension, e-mail, price with discount,
        serial/order id, weather forecast, meeting date, head count, and a
        negative-temperature sentence.
    """
    pick = random.choice
    roll = random.randint
    frames = []

    # format() arguments are evaluated left to right, which keeps the random
    # draws in the same order as the placeholders appear in each sentence.
    frames.append(
        "Please call {} at {} or extension {}.".format(n, usphone(), ext())
    )
    frames.append("You can email {} at {} anytime.".format(n, email(n)))
    frames.append(
        "The total is ${}.{:02d}, and we offer a {}% discount.".format(
            price(), roll(0, 99), pick([10, 15, 20, 30])
        )
    )
    frames.append(
        "Your serial number is {} and the order id is {}.".format(serial(), order())
    )
    frames.append(
        "Tomorrow will be {}, around {} degrees, with a {}% chance of rain.".format(
            pick(WX_EN), roll(40, 95), pick([10, 30, 60, 80])
        )
    )
    frames.append(
        "The meeting with {} is on {} {}, {}.".format(
            n, pick(MON), roll(1, 28), roll(2024, 2026)
        )
    )
    frames.append(
        "We have {} people booked today; {} and {} will host.".format(
            roll(2, 12), n, n2
        )
    )
    frames.append(
        "{} said the temperature will drop to -{} degrees Celsius tonight.".format(
            n, roll(1, 9)
        )
    )
    return frames
def mix_frames(n, n2):
    """Render every mixed Chinese/English sentence template for a name.

    All five templates are rendered on each call; the caller selects one.

    Args:
        n: First name inserted into every sentence.
        n2: Accepted but not used by any template in this function.

    Returns:
        A list of five strings: e-mail plus extension, meeting booking,
        order total, weather, and a call-or-mail sentence.
    """
    pick = random.choice
    roll = random.randint

    # Statements stay in template order so the sequence of random draws for a
    # given seed is unchanged.
    address, extension = email(n), ext()
    contact = f"{n} 的 email 是 {address},分機 {extension}。"

    month, day, district = pick(MON), roll(1, 28), pick(DIST)
    booking = f"幫 {n} 預約 {month} {day} 號的 meeting,地點在{district}。"

    order_no, cost = order(), price()
    billing = f"這個 order {order_no} 總共 NT${cost},{n} 會 follow up。"

    sky, degrees = pick(WX_EN), roll(18, 33)
    weather = f"{n} 說今天 {sky},氣溫大概 {degrees}°C。"

    cell, address = mobile(), email(n)
    callback = f"請 call {n} 的手機 {cell},或寄到 {address}。"

    return [contact, booking, billing, weather, callback]
|
|
def main():
    """Generate unique, pre-normalized entity sentences and write them as JSONL.

    Command-line options:
        --n     Number of rows to produce (default 2600).
        --out   Output path (default "entity_texts.jsonl").
        --seed  Seed for the ``random`` module (default 11).

    Each attempt takes the next two names from a shuffled, endlessly repeating
    pool, picks a language bucket (zh / en / mix, weighted 0.45 / 0.30 / 0.25),
    renders that bucket's templates, picks one and runs it through
    ``T.normalize``. Results that are duplicates or shorter than six
    characters are discarded. Every accepted row is written as
    ``{"id": "etNNNNN", "text": ..., "lang": ...}``.

    Note: two names are consumed per attempt whether or not the chosen
    template uses them, so how many distinct names reach the output depends
    on ``--n`` and on which templates are drawn.
    """
    import collections  # only needed for the per-language summary below

    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=2600)
    ap.add_argument("--out", default="entity_texts.jsonl")
    ap.add_argument("--seed", type=int, default=11)
    a = ap.parse_args()
    random.seed(a.seed)

    # Shuffle the whole pool once, then cycle through it indefinitely.
    name_cycle = itertools.cycle(random.sample(NAMES, len(NAMES)))
    builders = {"zh": zh_frames, "en": en_frames, "mix": mix_frames}

    rows = []
    seen = set()
    while len(rows) < a.n:
        n = next(name_cycle)
        n2 = next(name_cycle)
        bucket = random.choices(["zh", "en", "mix"], weights=[0.45, 0.30, 0.25])[0]
        raw = random.choice(builders[bucket](n, n2))
        norm = T.normalize(raw)
        # Skip duplicates and degenerate (very short) normalizer output.
        if norm in seen or len(norm) < 6:
            continue
        seen.add(norm)
        rows.append({"id": f"et{len(rows):05d}", "text": norm, "lang": bucket})

    with open(a.out, "w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

    c = collections.Counter(r["lang"] for r in rows)
    print(f"wrote {len(rows)} entity/name texts -> {a.out} {dict(c)} | names pool {len(NAMES)}")

    # Preview up to six rows. random.sample raises ValueError when asked for
    # more items than exist, which used to crash any run with --n below 6.
    for r in random.sample(rows, min(6, len(rows))):
        print(" ", r["lang"], r["text"])
|
|
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|