#!/usr/bin/env python3
"""v2 code-mix generator — EXPANDED frame bank.

More syntactic variety + English in more positions, to push mix-CER below the
0.178 the v1 (40-frame) generator reached. Adds questions, longer multi-clause
frames, leading-English frames, and number/date-heavy frames. Dedups against
an existing corpus.

Usage:
  python gen_codemix_v2.py --n 3000 --exclude codemix_corpus.txt --out codemix_v2.txt
"""
import argparse
import random
import re

# ---------------------------------------------------------------------------
# Slot vocabularies. fill() draws one entry from each list per sentence.
# ---------------------------------------------------------------------------
NAMES = [
    "Jason", "Kelly", "Daniel", "Rita", "Amy", "Kevin", "Linda", "Peter", "Vivian", "Frank",
    "Tom", "Cindy", "Eric", "Grace", "Sam", "Joyce", "Leo", "Nina", "Oscar", "Sandy", "Ryan",
    "Emma", "Jack", "Mia", "Henry", "Chloe", "Ivan", "Wendy", "Alan", "Tina", "Bella",
    "George", "Karen", "Lucas", "Sophie", "Victor",
]
SUR = ["王", "陳", "林", "李", "張", "黃", "吳", "劉", "蔡", "楊",
       "許", "鄭", "謝", "郭", "洪", "曾", "廖", "賴", "周", "葉"]
TITLE = ["經理", "助理", "工程師", "專員", "主任", "課長", "副理", "顧問", "店長", "總監", "組長"]
DEPT = ["技術部", "客服部", "業務部", "品保部", "財務部", "人資部",
        "採購部", "研發部", "行銷部", "資訊部", "法務部", "物流部"]
APP = ["App", "Line", "Email", "Portal", "Outlook", "Teams", "Slack",
       "ERP 系統", "CRM 系統", "官網", "Dropbox", "Notion"]
ITEM = [
    "Excel 報表", "PDF 檔", "QR Code", "VIP 等級", "Zoom 連結", "email", "發票", "合約",
    "報告書", "專案", "預算表", "行事曆", "購物車", "訂單", "帳號", "密碼", "會員卡", "序號",
    "授權碼", "點數", "報價單", "出貨單", "履歷", "簡報",
]
ACT = [
    "login", "logout", "update", "reset", "upload", "download", "check", "confirm", "submit",
    "cancel", "review", "approve", "sync", "backup", "scan", "forward", "schedule",
    "reschedule", "verify", "activate",
]
STATUS = ["ready", "done", "updated", "confirmed", "cancelled", "pending",
          "online", "offline", "expired", "approved", "rejected"]
EVENT = ["meeting", "Zoom 會議", "conference call", "interview", "presentation",
         "demo", "workshop", "training", "kick-off"]
ADJ = ["busy", "urgent", "important", "ready", "OK", "fine", "available", "tied up"]


def ext() -> str:
    """Return a random 4-digit extension number as a string."""
    return str(random.randint(1000, 9999))


def num() -> str:
    """Return a random 6-digit number as a string."""
    return str(random.randint(100000, 999999))


def time_() -> str:
    """Return a random Chinese time/date expression."""
    return random.choice([
        "上午九點", "上午十點半", "中午十二點", "下午兩點", "下午三點半", "下午四點",
        "明天上午", "後天下午", "這個禮拜五", "下週一早上", "月底前", "這個月十五號", "禮拜三中午",
    ])


def disc() -> str:
    """Return a random Chinese discount expression."""
    return random.choice(["九折", "八五折", "七九折", "買一送一", "免運", "現折五百"])


# v1's 40 frames are kept implicitly by overlap; here are NEW + varied ones (leading-English,
# questions, multi-clause, numeric). The fill() randomization over big slot lists yields tens
# of thousands distinct.
FRAMES = [
    # leading / English-first
    "Hi {name},您的{item}我已經 {act} 好了,要不要 double check 一下?",
    "OK,那我先把這個 {event} {act} 到{time},confirm 後再通知您。",
    "Sorry,{app} 剛剛 {status},您的{item}可能要重新 {act} 一次。",
    "No problem,我請 {name} {sur}{title}在{time}前 {act} 完這份{item}。",
    "Actually 這個 {item} 的 deadline 是{time},麻煩您 prioritize 一下。",
    # questions
    "請問您的 {app} 帳號是用 email 還是 phone number 註冊的?",
    "您要的{item}是 standard 還是 premium 版,方便 confirm 一下嗎?",
    "請問這個 {event} 的 link 您有收到嗎?還是要我 re-send?",
    "{name} 的分機是 {ext} 對嗎?我幫您 transfer 過去好嗎?",
    "您的{item}編號是不是 {num}?我這邊 check 一下 status。",
    "請問{time}的 {event} 改到三樓還是線上 Zoom?",
    # multi-clause
    "您的訂單 {num} 已經 {status},預計{time}送達,tracking number 我用 Line 傳給您。",
    "我先幫您把{item} {act},等 {dept} approve 之後,系統會自動 send 一封 email。",
    "{name} {sur}{title}今天比較 {adj},不過他說{time}可以跟您 quick call 一下。",
    "這個 case 我已經 {act} 到 CRM 系統,reference number 是 {num},有問題再撥分機 {ext}。",
    "麻煩您先 {act} 一下{item},然後把 screenshot upload 到 {app},我這邊同步處理。",
    "您的會員 {status},如果現在 upgrade 到 VIP,這個月可以多享{disc}的優惠。",
    # numeric / scheduling heavy
    "您預約的 {event} 是{time},地點在三樓,記得帶 ID card 上來 check in。",
    "這批貨的 PO number 是 {num},預計{time}到,到了我會 update 給{dept}。",
    "您的 password 已經 expired,請在{time}前用 {app} reset,不然帳號會被 lock。",
    "退款 {num} 元已經 {status},三到五個 working day 會退回原本的 credit card。",
    "{name} 的 {event} 我 reschedule 到{time}了,calendar invite 已經 send 給所有人。",
    # service register
    "您好,這裡是 {dept} customer service,請問需要什麼 help?",
    "不好意思讓您 hold 這麼久,您的{item}我現在馬上幫您 {act}。",
    "我幫您 note 在系統了,{name} 一上線就會 call back,大概{time}。",
    "您的 {item} 目前 {status},如需 support 請撥分機 {ext} 找 {name}。",
    "這份{item}我 convert 成 PDF 用 email 寄給您,subject 會註明{time}。",
    "麻煩您 confirm 一下{time}方不方便,我再幫您把 {event} lock 起來。",
    "您的 {app} 點數還有 {num} 點,結帳時可以 redeem 折抵{disc}。",
    "好的 {name},我這邊 follow up 您的{item},有 update 第一時間 ping 您。",
]

# NOTE(review): the retry budget in the original loop was lost when the file was
# mangled; 50 attempts per requested sentence is a reconstruction — confirm.
_MAX_TRIES_PER_SENTENCE = 50


def fill(frame: str) -> str:
    """Return *frame* with every ``{slot}`` replaced by a random slot value.

    All slots are drawn on every call (even ones the frame does not use), in
    a fixed order, so the random stream consumed per sentence is constant.
    """
    return frame.format(
        name=random.choice(NAMES),
        sur=random.choice(SUR),
        title=random.choice(TITLE),
        dept=random.choice(DEPT),
        app=random.choice(APP),
        item=random.choice(ITEM),
        act=random.choice(ACT),
        status=random.choice(STATUS),
        event=random.choice(EVENT),
        adj=random.choice(ADJ),
        ext=ext(),
        num=num(),
        time=time_(),
        disc=disc(),
    )


def _generate(n, excl, max_tries):
    """Return up to *n* distinct filled sentences that are not in *excl*.

    Gives up after *max_tries* draws so an exhausted frame space cannot loop
    forever. Sentences are returned in generation order, which keeps the
    result reproducible for a given seed (set iteration order is not).
    """
    seen = set()
    sentences = []
    tries = 0
    while len(sentences) < n and tries < max_tries:
        tries += 1
        candidate = fill(random.choice(FRAMES))
        if candidate in excl or candidate in seen:
            continue
        seen.add(candidate)
        sentences.append(candidate)
    return sentences


def main(argv=None):
    """Generate a deduplicated code-mix corpus and write it one sentence per line.

    Args:
        argv: Optional argument list; ``None`` means use ``sys.argv[1:]``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=3000)
    ap.add_argument("--exclude", default="codemix_corpus.txt")
    ap.add_argument("--out", default="codemix_v2.txt")
    ap.add_argument("--seed", type=int, default=7)
    a = ap.parse_args(argv)
    random.seed(a.seed)

    # A missing exclude corpus is deliberately tolerated: nothing to dedup against.
    excl = set()
    try:
        with open(a.exclude, encoding="utf-8") as f:
            excl = {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        pass

    # NOTE(review): the loop body, the output-writing step and the start of the
    # summary message were missing from the mangled source and are reconstructed
    # here; confirm against the original script.
    out = _generate(a.n, excl, max(a.n, 0) * _MAX_TRIES_PER_SENTENCE)
    with open(a.out, "w", encoding="utf-8") as f:
        f.writelines(s + "\n" for s in out)
    print(f"wrote {len(out)} -> {a.out} ({len(FRAMES)} new frames, excl {len(excl)})")

    # Fixed seed so the preview is the same on every run; `out` is a list because
    # random.sample() rejects sets on Python >= 3.11, and the sample size is
    # clamped so a short corpus does not raise ValueError.
    random.seed(1)
    for s in random.sample(out, min(8, len(out))):
        print(" ", s)


if __name__ == "__main__":
    main()