#!/usr/bin/env python3
"""Generate DIVERSE zh-TW + English code-mix sentences to fix the mix-CER weak spot.

The existing corpus code-mix (~4.4k rows) is templated — a few frames repeated
with swapped English nouns — so the model overfits frames and fails on diverse
eval code-mix (mix CER 0.318). This builds a broad frame bank (varied syntax +
English in varied positions) x varied insertions -> many distinct sentences,
matching the Taiwan office / phone-attendant register the eval set uses.

Usage:
    python gen_codemix.py --n 2800 --out codemix_corpus.txt
"""
import argparse
import random
import re
from typing import List, Optional, Sequence

# --- slot vocabularies -------------------------------------------------------
NAMES = [
    "Jason", "Kelly", "Daniel", "Rita", "Amy", "Kevin", "Linda", "Peter",
    "Vivian", "Frank", "Tom", "Cindy", "Eric", "Grace", "Sam", "Joyce",
    "Leo", "Nina", "Oscar", "Sandy", "Ryan", "Emma", "Jack", "Mia",
    "Henry", "Chloe", "Ivan", "Wendy", "Alan", "Tina",
]
SUR = ["王", "陳", "林", "李", "張", "黃", "吳", "劉", "蔡", "楊", "許", "鄭", "謝", "郭", "洪"]
TITLE = ["經理", "助理", "工程師", "專員", "主任", "課長", "副理", "顧問", "店長"]
DEPT = [
    "技術部", "客服部", "業務部", "品保部", "財務部", "人資部",
    "採購部", "研發部", "行銷部", "資訊部", "法務部",
]
APP = [
    "App", "Line", "Email", "Portal", "Outlook", "Teams", "Slack",
    "ERP 系統", "CRM 系統", "官網",
]
ITEM = [
    "Excel 報表", "PDF 檔", "QR Code", "VIP 等級", "Zoom 連結", "email",
    "發票", "合約", "報告書", "專案", "預算表", "行事曆", "購物車", "訂單",
    "帳號", "密碼", "會員卡", "序號", "授權碼", "點數",
]
ACT_EN = [
    "login", "logout", "update", "reset", "upload", "download", "check",
    "confirm", "submit", "cancel", "review", "approve", "sync", "backup",
    "scan",
]
STATUS = [
    "ready", "done", "updated", "confirmed", "cancelled", "pending",
    "online", "offline", "expired",
]
EVENT = [
    "meeting", "Zoom 會議", "conference call", "interview", "presentation",
    "demo", "workshop", "training",
]
ADJ = ["busy", "urgent", "important", "ready", "OK", "fine"]
TIMES = [
    "上午九點", "上午十點半", "中午十二點", "下午兩點", "下午三點半",
    "下午四點", "明天上午", "後天下午", "這個禮拜五", "下週一早上",
]
DISCOUNTS = ["九折", "八五折", "七九折", "買一送一", "免運"]

# --- acceptance filter -------------------------------------------------------
# Compiled once instead of on every loop iteration.
_HAN_RE = re.compile(r"[一-鿿]")  # Han ideograph range used to detect Chinese text
_LATIN_RE = re.compile(r"[A-Za-z]")
_MIN_LEN = 8
_MAX_LEN = 60
# The generation loop gives up after n * this many attempts.
_TRIES_PER_SENTENCE = 40
# Number of sample sentences echoed to stdout after writing the corpus.
_PREVIEW_SIZE = 8


def ext() -> str:
    """Return a random 4-digit phone extension as a string."""
    return str(random.randint(1000, 9999))


def num() -> str:
    """Return a random 6-digit number as a string."""
    return str(random.randint(100000, 999999))


def time_() -> str:
    """Return a random Chinese time expression."""
    return random.choice(TIMES)


def disc() -> str:
    """Return a random Chinese discount / promotion phrase."""
    return random.choice(DISCOUNTS)


FRAMES = [
    "您好,{dept}的 {name} {sur}{title}為您服務。",
    "幫您轉接給 {name} {sur}{title},他的分機是 {ext}。",
    "請問您要查詢的{item}編號是多少?",
    "您的{item}已經 {status} 了,請至 {app} 查看。",
    "{name} 的 {event} 改到{time}。",
    "請至 {app} 點選 {act} 重新登入。",
    "我這邊先幫您 {act} 這筆{item}。",
    "這個{item}需要重新 {act},麻煩您稍等。",
    "您的會員 {status},現在升級 VIP 可享{disc}優惠。",
    "麻煩您把 {item} email 到我的信箱,謝謝。",
    "系統顯示您的{item}需要 {act},請聯絡{dept}。",
    "請問 {name} 在嗎?我這邊有一份 {item} 要給他。",
    # NOTE(review): this frame only fills {num} (digits) and {time} (Chinese),
    # so it never contains an ASCII letter and is always rejected by the
    # code-mix filter. Left in place so seeded output stays unchanged.
    "您的訂單編號是 {num},預計{time}送達。",
    "我幫您預約{time}的 {event},地點在三樓會議室。",
    "不好意思,{app} 現在 {status},請您稍後再試。",
    "請先 {act} 一下您的{item},我這邊同步處理。",
    "{name} 說他今天比較 {adj},{event}可能要延到{time}。",
    "您的{item}我已經 {status},等一下會 send 給您。",
    "麻煩您提供一下 {item} 的序號,我幫您 {act}。",
    "這個 case 我先 update 到系統,{dept}會再回覆您。",
    "我的 {item} 今天有點問題,可以幫我 {act} 嗎?",
    "請問這個 {event} 的 link 是哪一個?",
    "您好,這裡是 {dept},請問需要什麼 service?",
    "{name} 的分機 {ext} 現在忙線中,要幫您留言嗎?",
    "您的 password 已經過期,請用 {app} 重設一個新的。",
    "我這邊收到您的 {item} 了,正在 {act} 中,請稍候。",
    "下午的 {event} 我會把 agenda 先寄給大家。",
    "麻煩 {name} 在{time}前把{item} {act} 完成。",
    "這份{item}的 deadline 是{time},請務必準時。",
    "您的 VIP 點數還有 {num} 點,可以折抵{disc}。",
    "請問您的 {app} 帳號是用 email 還是手機註冊的?",
    "我幫您把{item} upload 到雲端了,連結在 Line 裡面。",
    "{name} {sur}{title}稍後會 call 您,大概{time}。",
    "您的退款已經 {status},三到五個 working day 會入帳。",
    "這台機器的 firmware 要 {act},我請 {name} 過去處理。",
    "麻煩您先 confirm 一下{time}的 {event} 方不方便。",
    "您的 {item} 目前 {status},如需協助請撥分機 {ext}。",
    "我把今天的 meeting note 整理成 PDF 寄給您。",
    "請問您要的是 standard 版還是 premium 版的{item}?",
    "您好,{name} 的 schedule 我看一下,他{time}有空。",
]


def fill(frame: str) -> str:
    """Fill every ``{slot}`` in *frame* with a randomly drawn value.

    All slot values are drawn on every call, in a fixed order, even when the
    frame does not use them; this keeps the random stream (and therefore the
    output for a given seed) independent of which slots a frame contains.

    Args:
        frame: A ``str.format`` template using the slot names of ``FRAMES``.

    Returns:
        The frame with its slots substituted.
    """
    return frame.format(
        name=random.choice(NAMES),
        sur=random.choice(SUR),
        title=random.choice(TITLE),
        dept=random.choice(DEPT),
        app=random.choice(APP),
        item=random.choice(ITEM),
        act=random.choice(ACT_EN),
        status=random.choice(STATUS),
        event=random.choice(EVENT),
        adj=random.choice(ADJ),
        ext=ext(),
        num=num(),
        time=time_(),
        disc=disc(),
    )


def _is_code_mix(sentence: str) -> bool:
    """Return True if *sentence* has Han and ASCII letters and a sane length."""
    return (
        _HAN_RE.search(sentence) is not None
        and _LATIN_RE.search(sentence) is not None
        and _MIN_LEN <= len(sentence) <= _MAX_LEN
    )


def _generate(n: int) -> List[str]:
    """Return up to *n* distinct code-mix sentences, sorted.

    Fewer than *n* sentences are returned if the attempt budget
    (``n * _TRIES_PER_SENTENCE``) is exhausted first.
    """
    sentences = set()
    tries = 0
    max_tries = n * _TRIES_PER_SENTENCE
    while len(sentences) < n and tries < max_tries:
        tries += 1
        candidate = fill(random.choice(FRAMES))
        # Keep only genuine code-mix (both Han + ASCII letters) of sane length.
        if _is_code_mix(candidate):
            sentences.add(candidate)
    return sorted(sentences)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Generate the corpus, write it to ``--out`` and print a short preview.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=2800)
    ap.add_argument("--out", default="codemix_corpus.txt")
    ap.add_argument("--seed", type=int, default=42)
    a = ap.parse_args(argv)

    random.seed(a.seed)
    out = _generate(a.n)

    with open(a.out, "w", encoding="utf-8") as f:
        # One sentence per line; an empty corpus yields an empty file rather
        # than a single blank line.
        f.writelines(s + "\n" for s in out)
    print(f"wrote {len(out)} diverse code-mix sentences -> {a.out} (from {len(FRAMES)} frames)")

    # Fixed-seed preview from a private generator so the global RNG state is
    # left alone. The size is capped because sample() raises ValueError when
    # asked for more items than the population holds.
    preview_rng = random.Random(1)
    for s in preview_rng.sample(out, min(_PREVIEW_SIZE, len(out))):
        print("  ", s)


if __name__ == "__main__":
    main()