| |
| """Generate DIVERSE zh-TW + English code-mix sentences to fix the mix-CER weak spot. |
| |
| The existing corpus code-mix (~4.4k rows) is templated — a few frames repeated with swapped English |
| nouns — so the model overfits frames and fails on diverse eval code-mix (mix CER 0.318). This builds a |
| broad frame bank (varied syntax + English in varied positions) x varied insertions -> many distinct |
| sentences, matching the Taiwan office / phone-attendant register the eval set uses. |
| |
| Usage: python gen_codemix.py --n 2800 --out codemix_corpus.txt |
| """ |
| import argparse, random, re |
|
|
# Slot vocabularies substituted into the sentence frames.

# English given names (the Latin-script part of a person reference).
NAMES = [
    "Jason", "Kelly", "Daniel", "Rita", "Amy", "Kevin",
    "Linda", "Peter", "Vivian", "Frank", "Tom", "Cindy",
    "Eric", "Grace", "Sam", "Joyce", "Leo", "Nina",
    "Oscar", "Sandy", "Ryan", "Emma", "Jack", "Mia",
    "Henry", "Chloe", "Ivan", "Wendy", "Alan", "Tina",
]

# Chinese surnames.
SUR = [
    "王", "陳", "林", "李", "張",
    "黃", "吳", "劉", "蔡", "楊",
    "許", "鄭", "謝", "郭", "洪",
]

# Job titles.
TITLE = [
    "經理", "助理", "工程師", "專員", "主任",
    "課長", "副理", "顧問", "店長",
]

# Department names.
DEPT = [
    "技術部", "客服部", "業務部", "品保部", "財務部", "人資部",
    "採購部", "研發部", "行銷部", "資訊部", "法務部",
]

# Applications / channels; a few entries are mixed-script or Chinese-only.
APP = [
    "App", "Line", "Email", "Portal", "Outlook",
    "Teams", "Slack", "ERP 系統", "CRM 系統", "官網",
]

# Things being handled: documents, accounts, codes, ...
ITEM = [
    "Excel 報表", "PDF 檔", "QR Code", "VIP 等級", "Zoom 連結",
    "email", "發票", "合約", "報告書", "專案",
    "預算表", "行事曆", "購物車", "訂單", "帳號",
    "密碼", "會員卡", "序號", "授權碼", "點數",
]

# English action verbs.
ACT_EN = [
    "login", "logout", "update", "reset", "upload",
    "download", "check", "confirm", "submit", "cancel",
    "review", "approve", "sync", "backup", "scan",
]

# English status words.
STATUS = [
    "ready", "done", "updated", "confirmed", "cancelled",
    "pending", "online", "offline", "expired",
]

# Meeting-like events.
EVENT = [
    "meeting", "Zoom 會議", "conference call", "interview",
    "presentation", "demo", "workshop", "training",
]

# English adjectives.
ADJ = ["busy", "urgent", "important", "ready", "OK", "fine"]
def ext():
    """Return a random four-digit number (1000-9999) as a string."""
    extension = random.randint(1000, 9999)
    return str(extension)
def num():
    """Return a random six-digit number (100000-999999) as a string."""
    number = random.randint(100000, 999999)
    return str(number)
def time_():
    """Return one randomly chosen zh-TW time expression."""
    expressions = [
        "上午九點",
        "上午十點半",
        "中午十二點",
        "下午兩點",
        "下午三點半",
        "下午四點",
        "明天上午",
        "後天下午",
        "這個禮拜五",
        "下週一早上",
    ]
    return random.choice(expressions)
def disc():
    """Return one randomly chosen zh-TW discount / promotion phrase."""
    offers = ["九折", "八五折", "七九折", "買一送一", "免運"]
    return random.choice(offers)
|
|
# Sentence templates in str.format syntax.
# Placeholders used: {name} {sur} {title} {dept} {app} {item} {act} {status}
# {event} {adj} {ext} {num} {time} {disc}.
# Order is significant: frames are picked by index through random.choice, so
# reordering changes which corpus a given seed produces.
FRAMES = [
    "您好,{dept}的 {name} {sur}{title}為您服務。",
    "幫您轉接給 {name} {sur}{title},他的分機是 {ext}。",
    "請問您要查詢的{item}編號是多少?",
    "您的{item}已經 {status} 了,請至 {app} 查看。",
    "{name} 的 {event} 改到{time}。",
    "請至 {app} 點選 {act} 重新登入。",
    "我這邊先幫您 {act} 這筆{item}。",
    "這個{item}需要重新 {act},麻煩您稍等。",
    "您的會員 {status},現在升級 VIP 可享{disc}優惠。",
    "麻煩您把 {item} email 到我的信箱,謝謝。",
    "系統顯示您的{item}需要 {act},請聯絡{dept}。",
    "請問 {name} 在嗎?我這邊有一份 {item} 要給他。",
    "您的訂單編號是 {num},預計{time}送達。",
    "我幫您預約{time}的 {event},地點在三樓會議室。",
    "不好意思,{app} 現在 {status},請您稍後再試。",
    "請先 {act} 一下您的{item},我這邊同步處理。",
    "{name} 說他今天比較 {adj},{event}可能要延到{time}。",
    "您的{item}我已經 {status},等一下會 send 給您。",
    "麻煩您提供一下 {item} 的序號,我幫您 {act}。",
    "這個 case 我先 update 到系統,{dept}會再回覆您。",
    "我的 {item} 今天有點問題,可以幫我 {act} 嗎?",
    "請問這個 {event} 的 link 是哪一個?",
    "您好,這裡是 {dept},請問需要什麼 service?",
    "{name} 的分機 {ext} 現在忙線中,要幫您留言嗎?",
    "您的 password 已經過期,請用 {app} 重設一個新的。",
    "我這邊收到您的 {item} 了,正在 {act} 中,請稍候。",
    "下午的 {event} 我會把 agenda 先寄給大家。",
    "麻煩 {name} 在{time}前把{item} {act} 完成。",
    "這份{item}的 deadline 是{time},請務必準時。",
    "您的 VIP 點數還有 {num} 點,可以折抵{disc}。",
    "請問您的 {app} 帳號是用 email 還是手機註冊的?",
    "我幫您把{item} upload 到雲端了,連結在 Line 裡面。",
    "{name} {sur}{title}稍後會 call 您,大概{time}。",
    "您的退款已經 {status},三到五個 working day 會入帳。",
    "這台機器的 firmware 要 {act},我請 {name} 過去處理。",
    "麻煩您先 confirm 一下{time}的 {event} 方不方便。",
    "您的 {item} 目前 {status},如需協助請撥分機 {ext}。",
    "我把今天的 meeting note 整理成 PDF 寄給您。",
    "請問您要的是 standard 版還是 premium 版的{item}?",
    "您好,{name} 的 schedule 我看一下,他{time}有空。",
]
|
|
|
|
def fill(frame):
    """Return `frame` with every placeholder replaced by a random slot value.

    `frame` is a str.format template whose placeholders are among the keys
    built below.
    """
    # All fourteen slots are drawn on every call, in this fixed order, even
    # when the frame does not use them; str.format ignores the unused keyword
    # arguments. The order fixes how the RNG stream is consumed.
    slots = {
        "name": random.choice(NAMES),
        "sur": random.choice(SUR),
        "title": random.choice(TITLE),
        "dept": random.choice(DEPT),
        "app": random.choice(APP),
        "item": random.choice(ITEM),
        "act": random.choice(ACT_EN),
        "status": random.choice(STATUS),
        "event": random.choice(EVENT),
        "adj": random.choice(ADJ),
        "ext": ext(),
        "num": num(),
        "time": time_(),
        "disc": disc(),
    }
    return frame.format(**slots)
|
|
|
|
def main(argv=None):
    """Generate a deduplicated code-mix corpus and write it to a text file.

    Command-line options:
        --n     target number of distinct sentences (default 2800)
        --out   output path; UTF-8, one sentence per line
                (default codemix_corpus.txt)
        --seed  seed for the module-level RNG (default 42)

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) keeps the normal
            command-line behaviour.

    Sampling stops once ``--n`` distinct sentences are collected or after
    ``40 * n`` attempts, whichever comes first, so the file may hold fewer
    than ``--n`` sentences; the summary line reports the actual count.
    After writing, up to 8 sentences are printed as a preview.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=2800)
    ap.add_argument("--out", default="codemix_corpus.txt")
    ap.add_argument("--seed", type=int, default=42)
    a = ap.parse_args(argv)
    random.seed(a.seed)

    # Compiled once; the loop below may run up to 40 * n times.
    has_cjk = re.compile(r"[一-鿿]")
    has_latin = re.compile(r"[A-Za-z]")

    max_tries = a.n * 40  # attempt cap so the loop ends even if n is unreachable
    out, tries = set(), 0
    while len(out) < a.n and tries < max_tries:
        tries += 1
        s = fill(random.choice(FRAMES))
        # Keep only genuine code-mix (at least one CJK character and one Latin
        # letter) whose length is within 8..60 characters.
        if has_cjk.search(s) and has_latin.search(s) and 8 <= len(s) <= 60:
            out.add(s)

    # Sorting makes the file content independent of set iteration order.
    out = sorted(out)
    with open(a.out, "w", encoding="utf-8") as f:
        # One sentence per line. An empty result yields an empty file rather
        # than a lone newline, which would read back as one blank sentence.
        f.writelines(s + "\n" for s in out)
    print(f"wrote {len(out)} diverse code-mix sentences -> {a.out} (from {len(FRAMES)} frames)")

    # A private generator seeded with 1 produces the same preview as seeding
    # the module-level RNG with 1, without overwriting the --seed state.
    # min() guards small runs: sample() raises ValueError when asked for more
    # items than the population holds.
    preview_rng = random.Random(1)
    for s in preview_rng.sample(out, min(8, len(out))):
        print(" ", s)
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|