"""v2 code-mix generator with an EXPANDED frame bank (more syntactic variety, English in more positions),
intended to push mix-CER below the 0.178 that the v1 (40-frame) generator reached. Adds questions, longer
multi-clause frames, leading-English frames, and number/date-heavy frames. Deduplicates against an existing corpus.

Usage: python gen_codemix_v2.py --n 3000 --exclude codemix_corpus.txt --out codemix_v2.txt
"""
| import argparse, random, re |
|
|
# Slot vocabularies: fill() draws one random entry from each list to
# substitute for the same-named placeholder in a sentence frame.

# English given names -> {name}
NAMES = [
    "Jason", "Kelly", "Daniel", "Rita", "Amy", "Kevin",
    "Linda", "Peter", "Vivian", "Frank", "Tom", "Cindy",
    "Eric", "Grace", "Sam", "Joyce", "Leo", "Nina",
    "Oscar", "Sandy", "Ryan", "Emma", "Jack", "Mia", "Henry",
    "Chloe", "Ivan", "Wendy", "Alan", "Tina", "Bella",
    "George", "Karen", "Lucas", "Sophie", "Victor",
]

# Chinese surnames -> {sur}
SUR = [
    "王", "陳", "林", "李", "張", "黃", "吳", "劉", "蔡", "楊",
    "許", "鄭", "謝", "郭", "洪", "曾", "廖", "賴", "周", "葉",
]

# Job titles -> {title}
TITLE = [
    "經理", "助理", "工程師", "專員", "主任", "課長",
    "副理", "顧問", "店長", "總監", "組長",
]

# Departments -> {dept}
DEPT = [
    "技術部", "客服部", "業務部", "品保部", "財務部", "人資部",
    "採購部", "研發部", "行銷部", "資訊部", "法務部", "物流部",
]

# Applications / channels -> {app}
APP = [
    "App", "Line", "Email", "Portal", "Outlook", "Teams",
    "Slack", "ERP 系統", "CRM 系統", "官網", "Dropbox", "Notion",
]

# Documents and account-related objects -> {item}
ITEM = [
    "Excel 報表", "PDF 檔", "QR Code", "VIP 等級", "Zoom 連結", "email",
    "發票", "合約", "報告書", "專案", "預算表",
    "行事曆", "購物車", "訂單", "帳號", "密碼", "會員卡", "序號",
    "授權碼", "點數", "報價單", "出貨單", "履歷", "簡報",
]

# English action verbs -> {act}
ACT = [
    "login", "logout", "update", "reset", "upload", "download",
    "check", "confirm", "submit", "cancel", "review",
    "approve", "sync", "backup", "scan", "forward",
    "schedule", "reschedule", "verify", "activate",
]

# English status words -> {status}
STATUS = [
    "ready", "done", "updated", "confirmed", "cancelled", "pending",
    "online", "offline", "expired", "approved", "rejected",
]

# Meeting-type events -> {event}
EVENT = [
    "meeting", "Zoom 會議", "conference call", "interview", "presentation",
    "demo", "workshop", "training", "kick-off",
]

# English adjectives -> {adj}
ADJ = ["busy", "urgent", "important", "ready", "OK", "fine", "available", "tied up"]
def ext():
    """Return a random four-digit number (1000-9999) as a string, for the {ext} slot."""
    extension = random.randint(1000, 9999)
    return str(extension)
def num():
    """Return a random six-digit number (100000-999999) as a string, for the {num} slot."""
    return f"{random.randint(100000, 999999)}"
def time_():
    """Return a randomly chosen Chinese time expression, for the {time} slot."""
    expressions = (
        "上午九點", "上午十點半", "中午十二點", "下午兩點", "下午三點半", "下午四點",
        "明天上午", "後天下午", "這個禮拜五", "下週一早上", "月底前", "這個月十五號", "禮拜三中午",
    )
    return random.choice(expressions)
def disc():
    """Return a randomly chosen Chinese discount/promotion phrase, for the {disc} slot."""
    offers = ("九折", "八五折", "七九折", "買一送一", "免運", "現折五百")
    return random.choice(offers)
|
|
| |
| |
# Sentence templates. Each {placeholder} is a str.format field that fill()
# replaces with a randomly drawn slot value.
FRAMES = [
    # --- frames that open with an English word or phrase ---
    "Hi {name},您的{item}我已經 {act} 好了,要不要 double check 一下?",
    "OK,那我先把這個 {event} {act} 到{time},confirm 後再通知您。",
    "Sorry,{app} 剛剛 {status},您的{item}可能要重新 {act} 一次。",
    "No problem,我請 {name} {sur}{title}在{time}前 {act} 完這份{item}。",
    "Actually 這個 {item} 的 deadline 是{time},麻煩您 prioritize 一下。",

    # --- questions ---
    "請問您的 {app} 帳號是用 email 還是 phone number 註冊的?",
    "您要的{item}是 standard 還是 premium 版,方便 confirm 一下嗎?",
    "請問這個 {event} 的 link 您有收到嗎?還是要我 re-send?",
    "{name} 的分機是 {ext} 對嗎?我幫您 transfer 過去好嗎?",
    "您的{item}編號是不是 {num}?我這邊 check 一下 status。",
    "請問{time}的 {event} 改到三樓還是線上 Zoom?",

    # --- longer, multi-clause frames ---
    "您的訂單 {num} 已經 {status},預計{time}送達,tracking number 我用 Line 傳給您。",
    "我先幫您把{item} {act},等 {dept} approve 之後,系統會自動 send 一封 email。",
    "{name} {sur}{title}今天比較 {adj},不過他說{time}可以跟您 quick call 一下。",
    "這個 case 我已經 {act} 到 CRM 系統,reference number 是 {num},有問題再撥分機 {ext}。",
    "麻煩您先 {act} 一下{item},然後把 screenshot upload 到 {app},我這邊同步處理。",
    "您的會員 {status},如果現在 upgrade 到 VIP,這個月可以多享{disc}的優惠。",

    # --- number / date-heavy frames ---
    "您預約的 {event} 是{time},地點在三樓,記得帶 ID card 上來 check in。",
    "這批貨的 PO number 是 {num},預計{time}到,到了我會 update 給{dept}。",
    "您的 password 已經 expired,請在{time}前用 {app} reset,不然帳號會被 lock。",
    "退款 {num} 元已經 {status},三到五個 working day 會退回原本的 credit card。",
    "{name} 的 {event} 我 reschedule 到{time}了,calendar invite 已經 send 給所有人。",

    # --- remaining assorted service-desk style frames ---
    "您好,這裡是 {dept} customer service,請問需要什麼 help?",
    "不好意思讓您 hold 這麼久,您的{item}我現在馬上幫您 {act}。",
    "我幫您 note 在系統了,{name} 一上線就會 call back,大概{time}。",
    "您的 {item} 目前 {status},如需 support 請撥分機 {ext} 找 {name}。",
    "這份{item}我 convert 成 PDF 用 email 寄給您,subject 會註明{time}。",
    "麻煩您 confirm 一下{time}方不方便,我再幫您把 {event} lock 起來。",
    "您的 {app} 點數還有 {num} 點,結帳時可以 redeem 折抵{disc}。",
    "好的 {name},我這邊 follow up 您的{item},有 update 第一時間 ping 您。",
]
|
|
def fill(frame):
    """Return *frame* with its placeholders replaced by random slot values.

    Every slot value is drawn on each call, in a fixed order, whether or not
    the frame references it; unused values are simply ignored by str.format.
    """
    # Dict displays evaluate entries top to bottom, so the order of the
    # random draws below is the order they are consumed from the generator.
    slots = {
        "name": random.choice(NAMES),
        "sur": random.choice(SUR),
        "title": random.choice(TITLE),
        "dept": random.choice(DEPT),
        "app": random.choice(APP),
        "item": random.choice(ITEM),
        "act": random.choice(ACT),
        "status": random.choice(STATUS),
        "event": random.choice(EVENT),
        "adj": random.choice(ADJ),
        "ext": ext(),
        "num": num(),
        "time": time_(),
        "disc": disc(),
    }
    return frame.format(**slots)
|
|
def main():
    """Generate code-mixed sentences and write the new, unique ones to a file.

    Command-line options:
        --n        target number of sentences (default 3000)
        --exclude  existing corpus, one sentence per line, whose lines must
                   not be repeated (default codemix_corpus.txt; a missing
                   file is treated as an empty corpus)
        --out      output path, one sentence per line (default codemix_v2.txt)
        --seed     seed for the global ``random`` generator (default 7)

    A candidate is kept only if it contains at least one character in the
    CJK range and at least one ASCII letter, is 10-64 characters long, and
    is neither in the exclusion corpus nor already generated. Sampling stops
    after ``n * 60`` attempts, so fewer than ``n`` sentences may be written.
    The kept sentences are written sorted, followed by a summary line and a
    short preview on stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=3000)
    ap.add_argument("--exclude", default="codemix_corpus.txt")
    ap.add_argument("--out", default="codemix_v2.txt")
    ap.add_argument("--seed", type=int, default=7)
    a = ap.parse_args()
    random.seed(a.seed)

    excl = set()
    try:
        # Read as UTF-8, the same encoding the output is written in, rather
        # than the locale default, so dedup works regardless of platform.
        with open(a.exclude, encoding="utf-8") as fh:
            excl = {line.strip() for line in fh if line.strip()}
    except FileNotFoundError:
        pass  # no earlier corpus available: nothing to deduplicate against

    # Compiled once; both must match for a sentence to count as code-mixed.
    has_cjk = re.compile(r"[一-鿿]")
    has_latin = re.compile(r"[A-Za-z]")

    out, tries = set(), 0
    max_tries = a.n * 60  # give up eventually if the frame bank is exhausted
    while len(out) < a.n and tries < max_tries:
        tries += 1
        s = fill(random.choice(FRAMES))
        if (has_cjk.search(s) and has_latin.search(s) and 10 <= len(s) <= 64
                and s not in excl and s not in out):
            out.add(s)
    out = sorted(out)

    # One sentence per line; an empty result produces an empty file.
    with open(a.out, "w", encoding="utf-8") as fh:
        fh.writelines(s + "\n" for s in out)
    print(f"wrote {len(out)} NEW diverse code-mix sentences -> {a.out} ({len(FRAMES)} new frames, excl {len(excl)})")

    # Preview up to 8 sentences. A private generator seeded with 1 gives the
    # same picks as reseeding the global one, without touching global state;
    # capping the sample size avoids ValueError when fewer than 8 were made.
    preview = random.Random(1).sample(out, min(8, len(out)))
    for s in preview:
        print(" ", s)
|
|
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|