one-click rebuild_voice.sh + generators + text pools
Browse files- scripts/gen_entity_texts.py +86 -0
scripts/gen_entity_texts.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Entity- and name-rich training text for phone-attendant correctness: phone/ext, email, address,
|
| 3 |
+
price, serial, temperature, weather, person-count, date — in zh / en / mix — plus EXHAUSTIVE English
|
| 4 |
+
first-name coverage (nltk names, 7.5k). Output is PRE-NORMALIZED via text_norm so the VoxCPM2 teacher
|
| 5 |
+
reads exactly what the frontend will phonemize (train/infer consistency). Usage:
|
| 6 |
+
python gen_entity_texts.py --n 2600 --out entity_texts.jsonl
|
| 7 |
+
"""
|
| 8 |
+
import argparse, random, json, itertools
|
| 9 |
+
import text_norm as T
|
| 10 |
+
from nltk.corpus import names as NLTK_NAMES
|
| 11 |
+
|
| 12 |
+
# Pool of English first names: every entry of the nltk `names` corpus,
# de-duplicated and then sorted so the list order is stable from run to run
# (main() shuffles this list with a seeded RNG).
NAMES = list(set(NLTK_NAMES.words()))
NAMES.sort()
|
| 13 |
+
# --- Static vocab pools interpolated into the sentence templates below ---

# Chinese surnames, appended after the English first name in zh templates.
SUR = [
    "王", "陳", "林", "李", "張", "黃", "吳", "劉", "蔡",
    "楊", "許", "鄭", "謝", "郭", "洪", "曾", "廖", "賴",
]

# Job titles that follow the surname.
TITLE = [
    "經理", "助理", "工程師", "專員", "主任",
    "課長", "副理", "顧問", "店長", "總監",
]

# Address parts: city, district, road (concatenated in that order).
CITY = ["台北市", "新北市", "台中市", "高雄市", "台南市", "桃園市", "新竹市"]
DIST = ["信義區", "大安區", "中山區", "板橋區", "三民區", "西屯區", "北區", "東區"]
ROAD = ["松高路", "忠孝東路", "中山北路", "文化路二段", "民生東路", "建國南路", "公益路"]

# E-mail domains used by email().
DOM = [
    "gmail.com",
    "company.com",
    "example.com.tw",
    "outlook.com",
    "yahoo.com.tw",
    "hotmail.com",
]

# Weather descriptions, Chinese and English.
WX_ZH = ["晴天", "多雲", "陰天", "短暫陣雨", "雷陣雨", "晴時多雲", "局部降雨"]
WX_EN = ["sunny", "cloudy", "partly cloudy", "light rain", "thunderstorms", "overcast"]

# English month names, January first.
MON = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]
|
| 22 |
+
def ext():
    """Return a random phone extension as a four-digit string (1000-9999)."""
    number = random.randint(1000, 9999)
    return str(number)
|
| 23 |
+
def mobile():
    """Return a random mobile number string shaped ``09NN-NNN-NNN``.

    The three groups are drawn from 10-99, 100-999 and 100-999, so no group
    ever starts with a zero.
    """
    head = random.randint(10, 99)
    middle = random.randint(100, 999)
    tail = random.randint(100, 999)
    return "09" + str(head) + "-" + str(middle) + "-" + str(tail)
|
| 24 |
+
def usphone():
    """Return a random US-style number string shaped ``NNN-NNN-NNNN``.

    Groups are drawn from 200-999, 100-999 and 1000-9999 respectively.
    """
    area = random.randint(200, 999)
    exchange = random.randint(100, 999)
    line = random.randint(1000, 9999)
    return "-".join([str(area), str(exchange), str(line)])
|
| 25 |
+
def order():
    """Return a random order id as a 6- or 7-digit string (100000-9999999)."""
    order_id = random.randint(100000, 9999999)
    return str(order_id)
|
| 26 |
+
def serial():
    """Return a random serial such as ``KT4821C``.

    Layout: two prefix letters, four digits (1000-9999), one suffix letter.
    The prefix alphabet leaves out I, O and Q; the suffix is one of A-H.
    """
    prefix_alphabet = "ABCDEFGHJKLMNPRSTUVWXYZ"
    # Draw order matters for seeded reproducibility: prefix, prefix, digits, suffix.
    first = random.choice(prefix_alphabet)
    second = random.choice(prefix_alphabet)
    digits = random.randint(1000, 9999)
    suffix = random.choice("ABCDEFGH")
    return first + second + str(digits) + suffix
|
| 29 |
+
def price():
    """Return one of a fixed set of price points, as a digit string."""
    price_points = [99, 199, 299, 500, 1299, 2680, 3990, 12800]
    picked = random.choice(price_points)
    return str(picked)
|
| 30 |
+
def email(n):
    """Return a random e-mail address whose local part starts with name ``n``.

    Shape: ``<name>.<surname>@<domain>`` where the name is lower-cased, the
    surname is one of lin/wang/chen/lee and the domain comes from ``DOM``.

    Whitespace inside ``n`` is removed so that a multi-word name (for example
    "Jo Ann" -> "joann") cannot put a space into the address. Names without
    whitespace produce exactly the same output as before.
    NOTE(review): presumably the nltk name pool contains such entries - confirm.
    """
    # str.split() with no argument drops every run of whitespace.
    local = "".join(n.lower().split())
    # Draw order (surname, then domain) is kept so seeded runs stay stable.
    surname = random.choice(['lin', 'wang', 'chen', 'lee'])
    domain = random.choice(DOM)
    return f"{local}.{surname}@{domain}"
|
| 31 |
+
|
| 32 |
+
def zh_frames(n, n2):
    """Return ten Mandarin attendant sentences, one per template.

    ``n`` and ``n2`` are interpolated verbatim as person names. Several
    templates (address, price, serial, weather, meeting date) mention neither
    name, and only the last two mention ``n2``. Every template is rendered on
    each call, so all of their random draws are consumed in a fixed order.
    """
    sentences = []

    # 1. Call transfer: name + surname + title + extension.
    surname = random.choice(SUR)
    title = random.choice(TITLE)
    sentences.append(f"您好,幫您轉接給 {n} {surname}{title},他的分機是 {ext()}。")

    # 2. Mobile number.
    sentences.append(f"{n} 的手機號碼是 {mobile()},麻煩您記一下。")

    # 3. E-mail address.
    sentences.append(f"請把資料寄到 {email(n)},謝謝。")

    # 4. Street address: city, district, road, house number, floor.
    city = random.choice(CITY)
    district = random.choice(DIST)
    road = random.choice(ROAD)
    house_no = random.randint(1, 199)
    floor = random.randint(1, 20)
    sentences.append(f"地址是{city}{district}{road}{house_no}號{floor}樓。")

    # 5. Price with a discount.
    amount = price()
    discount = random.choice([100, 200, 500])
    sentences.append(f"這台要 NT${amount},現在下訂再折 {discount} 元。")

    # 6. Serial number and order id.
    sentences.append(f"您的序號是 {serial()},訂單編號是 {order()}。")

    # 7. Weather: temperature, condition, chance of rain.
    celsius = random.randint(15, 36)
    condition = random.choice(WX_ZH)
    rain_pct = random.choice([10, 20, 30, 50, 70, 90])
    sentences.append(f"今天氣溫 {celsius}°C,{condition},降雨機率 {rain_pct}%。")

    # 8. Meeting date; day is capped at 28 so it is valid in every month.
    year = random.randint(2024, 2026)
    month = random.randint(1, 12)
    day = random.randint(1, 28)
    hour = random.choice(['兩', '三', '四'])
    sentences.append(f"會議改到 {year}年{month}月{day}日下午{hour}點。")

    # 9. Head count with two hosts.
    guests = random.randint(2, 12)
    sentences.append(f"今天總共有 {guests} 位客人預約,{n} 跟 {n2} 負責接待。")

    # 10. Arrival notice with extension.
    surname = random.choice(SUR)
    title = random.choice(TITLE)
    sentences.append(f"{n} {surname}{title}說 {n2} 會在明天上午到,分機 {ext()}。")

    return sentences
|
| 45 |
+
def en_frames(n, n2):
    """Return eight English attendant sentences, one per template.

    ``n`` is interpolated verbatim into most templates; ``n2`` appears only in
    the head-count sentence. Every template is rendered on each call, so all
    of their random draws are consumed in a fixed order.
    """
    sentences = []

    # 1. Phone number plus extension.
    sentences.append(f"Please call {n} at {usphone()} or extension {ext()}.")

    # 2. E-mail address.
    sentences.append(f"You can email {n} at {email(n)} anytime.")

    # 3. Dollar amount with zero-padded cents and a percent discount.
    dollars = price()
    cents = random.randint(0, 99)
    discount = random.choice([10, 15, 20, 30])
    sentences.append(f"The total is ${dollars}.{cents:02d}, and we offer a {discount}% discount.")

    # 4. Serial number and order id.
    sentences.append(f"Your serial number is {serial()} and the order id is {order()}.")

    # 5. Weather forecast: condition, temperature, chance of rain.
    condition = random.choice(WX_EN)
    degrees = random.randint(40, 95)
    rain_pct = random.choice([10, 30, 60, 80])
    sentences.append(f"Tomorrow will be {condition}, around {degrees} degrees, with a {rain_pct}% chance of rain.")

    # 6. Meeting date; day is capped at 28 so it is valid in every month.
    month = random.choice(MON)
    day = random.randint(1, 28)
    year = random.randint(2024, 2026)
    sentences.append(f"The meeting with {n} is on {month} {day}, {year}.")

    # 7. Head count with two hosts.
    booked = random.randint(2, 12)
    sentences.append(f"We have {booked} people booked today; {n} and {n2} will host.")

    # 8. Negative temperature.
    below_zero = random.randint(1, 9)
    sentences.append(f"{n} said the temperature will drop to -{below_zero} degrees Celsius tonight.")

    return sentences
|
| 56 |
+
def mix_frames(n, n2):
    """Return five code-switched (Mandarin + English) sentences, one per template.

    ``n`` is interpolated verbatim into every template. ``n2`` is accepted so
    the signature matches zh_frames/en_frames, but no template uses it. Every
    template is rendered on each call, in a fixed order.
    """
    sentences = []

    # 1. E-mail plus extension.
    address = email(n)
    extension = ext()
    sentences.append(f"{n} 的 email 是 {address},分機 {extension}。")

    # 2. Meeting booking: English month, day of month, district.
    month = random.choice(MON)
    day = random.randint(1, 28)
    district = random.choice(DIST)
    sentences.append(f"幫 {n} 預約 {month} {day} 號的 meeting,地點在{district}。")

    # 3. Order id and price.
    order_id = order()
    amount = price()
    sentences.append(f"這個 order {order_id} 總共 NT${amount},{n} 會 follow up。")

    # 4. English weather word with a Celsius temperature.
    condition = random.choice(WX_EN)
    celsius = random.randint(18, 33)
    sentences.append(f"{n} 說今天 {condition},氣溫大概 {celsius}°C。")

    # 5. Mobile number or e-mail.
    phone = mobile()
    address = email(n)
    sentences.append(f"請 call {n} 的手機 {phone},或寄到 {address}。")

    return sentences
|
| 64 |
+
|
| 65 |
+
def main():
    """Generate normalized entity/name sentences and write them as JSONL.

    Command-line options:
        --n     number of unique rows to produce (default 2600)
        --out   output path (default "entity_texts.jsonl")
        --seed  seed for the ``random`` module (default 11)

    Each row is ``{"id": "etNNNNN", "text": <normalized sentence>, "lang":
    "zh" | "en" | "mix"}``. A summary line and up to six sample rows are
    printed after the file is written.
    """
    import collections

    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=2600)
    ap.add_argument("--out", default="entity_texts.jsonl")
    ap.add_argument("--seed", type=int, default=11)
    a = ap.parse_args()
    random.seed(a.seed)

    # Names are handed out from a seeded shuffle, two per attempt, wrapping
    # around once the pool is used up.
    # NOTE(review): a name only reaches the output when the chosen template
    # interpolates it, and several templates use neither name, so drawing a
    # name here does not by itself guarantee that it appears in the file.
    name_cycle = itertools.cycle(random.sample(NAMES, len(NAMES)))
    builders = {"zh": zh_frames, "en": en_frames, "mix": mix_frames}

    rows = []
    seen = set()
    while len(rows) < a.n:
        n = next(name_cycle)
        n2 = next(name_cycle)
        bucket = random.choices(["zh", "en", "mix"], weights=[0.45, 0.30, 0.25])[0]
        frames = builders[bucket](n, n2)
        raw = random.choice(frames)
        norm = T.normalize(raw)
        # Skip duplicates and normalized strings shorter than six characters.
        if norm in seen or len(norm) < 6:
            continue
        seen.add(norm)
        rows.append({"id": f"et{len(rows):05d}", "text": norm, "lang": bucket})

    with open(a.out, "w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

    c = collections.Counter(r["lang"] for r in rows)
    print(f"wrote {len(rows)} entity/name texts -> {a.out} {dict(c)} | names pool {len(NAMES)}")
    # random.sample raises ValueError when asked for more items than exist,
    # so clamp the preview size for small (or zero) --n.
    preview = min(6, len(rows))
    for r in random.sample(rows, preview):
        print(" ", r["lang"], r["text"])
|
| 85 |
+
|
| 86 |
+
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()
|