PrimeTTS / scripts /gen_codemix_v2.py
Luigi's picture
one-click rebuild_voice.sh + generators + text pools
dfa700d verified
Raw
History Blame Contribute Delete
6.84 kB
#!/usr/bin/env python3
"""v2 code-mix generator — EXPANDED frame bank (more syntactic variety + English in more positions)
to push mix-CER below the 0.178 the v1 (40-frame) generator reached. Adds questions, longer multi-clause
frames, leading-English frames, and number/date-heavy frames. Dedups against an existing corpus.
Usage: python gen_codemix_v2.py --n 3000 --exclude codemix_corpus.txt --out codemix_v2.txt
"""
import argparse, random, re
# Slot vocabularies consumed by fill(): each list supplies the value for one {placeholder}
# in FRAMES. Values are picked with random.choice under the seed set in main(), so adding,
# removing or reordering entries changes the corpus generated for a given --seed.

# {name}: English given names.
NAMES = ["Jason","Kelly","Daniel","Rita","Amy","Kevin","Linda","Peter","Vivian","Frank","Tom","Cindy",
"Eric","Grace","Sam","Joyce","Leo","Nina","Oscar","Sandy","Ryan","Emma","Jack","Mia","Henry",
"Chloe","Ivan","Wendy","Alan","Tina","Bella","George","Karen","Lucas","Sophie","Victor"]
# {sur}: Chinese surnames; frames place it directly before {title}.
SUR=["王","陳","林","李","張","黃","吳","劉","蔡","楊","許","鄭","謝","郭","洪","曾","廖","賴","周","葉"]
# {title}: job titles.
TITLE=["經理","助理","工程師","專員","主任","課長","副理","顧問","店長","總監","組長"]
# {dept}: department names.
DEPT=["技術部","客服部","業務部","品保部","財務部","人資部","採購部","研發部","行銷部","資訊部","法務部","物流部"]
# {app}: application / channel names (mostly English, some mixed).
APP=["App","Line","Email","Portal","Outlook","Teams","Slack","ERP 系統","CRM 系統","官網","Dropbox","Notion"]
# {item}: the object being handled (documents, accounts, codes, ...).
ITEM=["Excel 報表","PDF 檔","QR Code","VIP 等級","Zoom 連結","email","發票","合約","報告書","專案","預算表",
"行事曆","購物車","訂單","帳號","密碼","會員卡","序號","授權碼","點數","報價單","出貨單","履歷","簡報"]
# {act}: English action verbs embedded in the Chinese sentence.
ACT=["login","logout","update","reset","upload","download","check","confirm","submit","cancel","review",
"approve","sync","backup","scan","forward","schedule","reschedule","verify","activate"]
# {status}: English state words.
STATUS=["ready","done","updated","confirmed","cancelled","pending","online","offline","expired","approved","rejected"]
# {event}: meeting-like events.
EVENT=["meeting","Zoom 會議","conference call","interview","presentation","demo","workshop","training","kick-off"]
# {adj}: English adjectives / short phrases describing a person's availability or a task's priority.
ADJ=["busy","urgent","important","ready","OK","fine","available","tied up"]
def ext():
    """Return a random phone extension in the range 1000-9999 as a string."""
    extension = random.randint(1000, 9999)
    return str(extension)
def num():
    """Return a random six-digit number (100000-999999) as a string."""
    value = random.randint(100000, 999999)
    return str(value)
def time_():
    """Return one randomly chosen Chinese time or deadline expression."""
    choices = [
        "上午九點",
        "上午十點半",
        "中午十二點",
        "下午兩點",
        "下午三點半",
        "下午四點",
        "明天上午",
        "後天下午",
        "這個禮拜五",
        "下週一早上",
        "月底前",
        "這個月十五號",
        "禮拜三中午",
    ]
    return random.choice(choices)
def disc():
    """Return one randomly chosen Chinese discount / promotion phrase."""
    offers = ["九折", "八五折", "七九折", "買一送一", "免運", "現折五百"]
    return random.choice(offers)
# Sentence templates ("frames") for the v2 generator. Each entry is a str.format template;
# fill() substitutes the {name}/{sur}/{title}/{dept}/{app}/{item}/{act}/{status}/{event}/
# {adj}/{ext}/{num}/{time}/{disc} placeholders with random slot values.
# Per the original author's note, v1's 40 frames are not repeated here (they are described as
# kept "implicitly by overlap") and these are NEW, more varied ones: leading-English, questions,
# multi-clause, numeric. NOTE(review): the v1 frame bank is not visible in this file - confirm.
# The author estimates that randomizing over the slot lists yields tens of thousands of
# distinct sentences.
# Order matters: main() picks a frame with random.choice under a fixed seed, so inserting or
# reordering entries changes the corpus generated for a given --seed.
FRAMES = [
# leading / English-first: the sentence opens with an English word or phrase
"Hi {name},您的{item}我已經 {act} 好了,要不要 double check 一下?",
"OK,那我先把這個 {event} {act} 到{time},confirm 後再通知您。",
"Sorry,{app} 剛剛 {status},您的{item}可能要重新 {act} 一次。",
"No problem,我請 {name} {sur}{title}在{time}前 {act} 完這份{item}。",
"Actually 這個 {item} 的 deadline 是{time},麻煩您 prioritize 一下。",
# questions
"請問您的 {app} 帳號是用 email 還是 phone number 註冊的?",
"您要的{item}是 standard 還是 premium 版,方便 confirm 一下嗎?",
"請問這個 {event} 的 link 您有收到嗎?還是要我 re-send?",
"{name} 的分機是 {ext} 對嗎?我幫您 transfer 過去好嗎?",
"您的{item}編號是不是 {num}?我這邊 check 一下 status。",
"請問{time}的 {event} 改到三樓還是線上 Zoom?",
# multi-clause: longer sentences chaining several clauses
"您的訂單 {num} 已經 {status},預計{time}送達,tracking number 我用 Line 傳給您。",
"我先幫您把{item} {act},等 {dept} approve 之後,系統會自動 send 一封 email。",
"{name} {sur}{title}今天比較 {adj},不過他說{time}可以跟您 quick call 一下。",
"這個 case 我已經 {act} 到 CRM 系統,reference number 是 {num},有問題再撥分機 {ext}。",
"麻煩您先 {act} 一下{item},然後把 screenshot upload 到 {app},我這邊同步處理。",
"您的會員 {status},如果現在 upgrade 到 VIP,這個月可以多享{disc}的優惠。",
# numeric / scheduling heavy: frames built around numbers, times and deadlines
"您預約的 {event} 是{time},地點在三樓,記得帶 ID card 上來 check in。",
"這批貨的 PO number 是 {num},預計{time}到,到了我會 update 給{dept}。",
"您的 password 已經 expired,請在{time}前用 {app} reset,不然帳號會被 lock。",
"退款 {num} 元已經 {status},三到五個 working day 會退回原本的 credit card。",
"{name} 的 {event} 我 reschedule 到{time}了,calendar invite 已經 send 給所有人。",
# service register: customer-service style phrasing
"您好,這裡是 {dept} customer service,請問需要什麼 help?",
"不好意思讓您 hold 這麼久,您的{item}我現在馬上幫您 {act}。",
"我幫您 note 在系統了,{name} 一上線就會 call back,大概{time}。",
"您的 {item} 目前 {status},如需 support 請撥分機 {ext} 找 {name}。",
"這份{item}我 convert 成 PDF 用 email 寄給您,subject 會註明{time}。",
"麻煩您 confirm 一下{time}方不方便,我再幫您把 {event} lock 起來。",
"您的 {app} 點數還有 {num} 點,結帳時可以 redeem 折抵{disc}。",
"好的 {name},我這邊 follow up 您的{item},有 update 第一時間 ping 您。",
]
def fill(frame):
    """Instantiate one frame template with freshly drawn random slot values.

    Every slot is drawn on each call, whether or not the frame uses it, and in a
    fixed order, so the random stream consumed per call does not depend on the frame.

    Args:
        frame: a str.format template using any subset of the known placeholders.

    Returns:
        The formatted sentence.
    """
    # Dict literals evaluate their values top to bottom; keep this order to keep
    # seeded output stable.
    slots = {
        "name": random.choice(NAMES),
        "sur": random.choice(SUR),
        "title": random.choice(TITLE),
        "dept": random.choice(DEPT),
        "app": random.choice(APP),
        "item": random.choice(ITEM),
        "act": random.choice(ACT),
        "status": random.choice(STATUS),
        "event": random.choice(EVENT),
        "adj": random.choice(ADJ),
        "ext": ext(),
        "num": num(),
        "time": time_(),
        "disc": disc(),
    }
    return frame.format(**slots)
def main(argv=None):
    """Generate a deduplicated code-mix corpus and write it to a text file.

    Sentences are produced by filling random FRAMES until ``--n`` unique ones are
    collected or the try budget (``n * 60`` attempts) is exhausted. A sentence is kept
    only if it contains at least one CJK character and one ASCII letter, is 10-64
    characters long, and is neither in the exclude corpus nor already generated.
    The result is sorted, written one sentence per line as UTF-8, and a short
    preview is printed.

    Args:
        argv: optional list of command-line arguments; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, exactly as before.
    """
    import sys  # local import: only needed for the shortfall warning below

    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=3000)
    ap.add_argument("--exclude", default="codemix_corpus.txt")
    ap.add_argument("--out", default="codemix_v2.txt")
    ap.add_argument("--seed", type=int, default=7)
    a = ap.parse_args(argv)
    random.seed(a.seed)

    # The exclude corpus is optional; a missing file just means nothing to dedup against.
    # Read it as UTF-8 (matching how the output is written) instead of the locale default,
    # and close the handle deterministically.
    excl = set()
    try:
        with open(a.exclude, encoding="utf-8") as fh:
            excl = {line.strip() for line in fh if line.strip()}
    except FileNotFoundError:
        pass

    has_cjk = re.compile(r"[一-鿿]")
    has_latin = re.compile(r"[A-Za-z]")
    max_tries = a.n * 60  # bounded so an unreachable --n cannot loop forever
    out, tries = set(), 0
    while len(out) < a.n and tries < max_tries:
        tries += 1
        s = fill(random.choice(FRAMES))
        if (has_cjk.search(s) and has_latin.search(s) and 10 <= len(s) <= 64
                and s not in excl and s not in out):
            out.add(s)
    if len(out) < a.n:
        print(f"warning: only {len(out)} of {a.n} requested sentences after {tries} tries",
              file=sys.stderr)

    out = sorted(out)
    # One sentence per line; an empty result yields an empty file, not a lone blank line.
    with open(a.out, "w", encoding="utf-8") as fh:
        fh.write("".join(f"{s}\n" for s in out))
    print(f"wrote {len(out)} NEW diverse code-mix sentences -> {a.out} ({len(FRAMES)} new frames, excl {len(excl)})")

    # Preview with a private RNG seeded with 1: same picks as seeding the module RNG,
    # without clobbering global random state. Clamp the sample size so small runs
    # (fewer than 8 sentences) do not raise ValueError.
    preview_rng = random.Random(1)
    for s in preview_rng.sample(out, min(8, len(out))):
        print(" ", s)


if __name__ == "__main__":
    main()