File size: 6,354 Bytes
383648f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python3
"""Generate DIVERSE zh-TW + English code-mix sentences to fix the mix-CER weak spot.

The existing corpus code-mix (~4.4k rows) is templated — a few frames repeated with swapped English
nouns — so the model overfits frames and fails on diverse eval code-mix (mix CER 0.318). This builds a
broad frame bank (varied syntax + English in varied positions) x varied insertions -> many distinct
sentences, matching the Taiwan office / phone-attendant register the eval set uses.

Usage: python gen_codemix.py --n 2800 --out codemix_corpus.txt
"""
import argparse, random, re

# Slot-filler pools. fill() draws one entry from each pool with random.choice for every
# sentence, so the ORDER of the entries matters for reproducing a corpus from a given seed.

# English given names substituted for the {name} placeholder.
NAMES = ["Jason", "Kelly", "Daniel", "Rita", "Amy", "Kevin", "Linda", "Peter", "Vivian", "Frank",
         "Tom", "Cindy", "Eric", "Grace", "Sam", "Joyce", "Leo", "Nina", "Oscar", "Sandy", "Ryan",
         "Emma", "Jack", "Mia", "Henry", "Chloe", "Ivan", "Wendy", "Alan", "Tina"]
# Chinese surnames for {sur}; frames place them directly before a {title}.
SUR = ["王", "陳", "林", "李", "張", "黃", "吳", "劉", "蔡", "楊", "許", "鄭", "謝", "郭", "洪"]
# Job titles for {title}.
TITLE = ["經理", "助理", "工程師", "專員", "主任", "課長", "副理", "顧問", "店長"]
# Department names for {dept}.
DEPT = ["技術部", "客服部", "業務部", "品保部", "財務部", "人資部", "採購部", "研發部", "行銷部", "資訊部", "法務部"]
# Application / channel names for {app}; a mix of English and Chinese entries.
APP = ["App", "Line", "Email", "Portal", "Outlook", "Teams", "Slack", "ERP 系統", "CRM 系統", "官網"]
# Objects being handled, for {item}. Some entries are purely Chinese, so a filled frame is only
# code-mixed if the frame or another slot contributes ASCII letters (main() filters on that).
ITEM = ["Excel 報表", "PDF 檔", "QR Code", "VIP 等級", "Zoom 連結", "email", "發票", "合約", "報告書",
        "專案", "預算表", "行事曆", "購物車", "訂單", "帳號", "密碼", "會員卡", "序號", "授權碼", "點數"]
# English action verbs for {act}.
ACT_EN = ["login", "logout", "update", "reset", "upload", "download", "check", "confirm", "submit",
          "cancel", "review", "approve", "sync", "backup", "scan"]
# English status words for {status}.
STATUS = ["ready", "done", "updated", "confirmed", "cancelled", "pending", "online", "offline", "expired"]
# Event names for {event}.
EVENT = ["meeting", "Zoom 會議", "conference call", "interview", "presentation", "demo", "workshop", "training"]
# English adjectives for {adj}.
ADJ = ["busy", "urgent", "important", "ready", "OK", "fine"]
def ext():
    """Return a random four-digit phone extension (1000-9999) as a string."""
    extension = random.randint(1000, 9999)
    return str(extension)
def num():
    """Return a random six-digit number (100000-999999) as a string."""
    value = random.randint(100000, 999999)
    return str(value)
def time_():
    """Return one randomly chosen zh-TW time expression for the {time} slot."""
    # Order is significant: random.choice indexes into this list, so reordering would
    # change which phrase a given seed produces.
    phrases = [
        "上午九點", "上午十點半", "中午十二點", "下午兩點", "下午三點半",
        "下午四點", "明天上午", "後天下午", "這個禮拜五", "下週一早上",
    ]
    return random.choice(phrases)
def disc():
    """Return one randomly chosen zh-TW discount/promotion phrase for the {disc} slot."""
    # Order is significant for seeded reproducibility (random.choice picks by index).
    offers = ["九折", "八五折", "七九折", "買一送一", "免運"]
    return random.choice(offers)

# Sentence templates in str.format syntax. Each {placeholder} is replaced by fill(), which
# supplies every slot (name, sur, title, dept, app, item, act, status, event, adj, ext, num,
# time, disc); a frame may use any subset of them. Several frames also hard-code English words
# (e.g. "case", "deadline", "password") so the English appears in varied positions.
# main() picks frames with random.choice, so the list order affects seeded output.
FRAMES = [
    "您好,{dept}的 {name} {sur}{title}為您服務。",
    "幫您轉接給 {name} {sur}{title},他的分機是 {ext}。",
    "請問您要查詢的{item}編號是多少?",
    "您的{item}已經 {status} 了,請至 {app} 查看。",
    "{name} 的 {event} 改到{time}。",
    "請至 {app} 點選 {act} 重新登入。",
    "我這邊先幫您 {act} 這筆{item}。",
    "這個{item}需要重新 {act},麻煩您稍等。",
    "您的會員 {status},現在升級 VIP 可享{disc}優惠。",
    "麻煩您把 {item} email 到我的信箱,謝謝。",
    "系統顯示您的{item}需要 {act},請聯絡{dept}。",
    "請問 {name} 在嗎?我這邊有一份 {item} 要給他。",
    "您的訂單編號是 {num},預計{time}送達。",
    "我幫您預約{time}的 {event},地點在三樓會議室。",
    "不好意思,{app} 現在 {status},請您稍後再試。",
    "請先 {act} 一下您的{item},我這邊同步處理。",
    "{name} 說他今天比較 {adj},{event}可能要延到{time}。",
    "您的{item}我已經 {status},等一下會 send 給您。",
    "麻煩您提供一下 {item} 的序號,我幫您 {act}。",
    "這個 case 我先 update 到系統,{dept}會再回覆您。",
    "我的 {item} 今天有點問題,可以幫我 {act} 嗎?",
    "請問這個 {event} 的 link 是哪一個?",
    "您好,這裡是 {dept},請問需要什麼 service?",
    "{name} 的分機 {ext} 現在忙線中,要幫您留言嗎?",
    "您的 password 已經過期,請用 {app} 重設一個新的。",
    "我這邊收到您的 {item} 了,正在 {act} 中,請稍候。",
    "下午的 {event} 我會把 agenda 先寄給大家。",
    "麻煩 {name} 在{time}前把{item} {act} 完成。",
    "這份{item}的 deadline 是{time},請務必準時。",
    "您的 VIP 點數還有 {num} 點,可以折抵{disc}。",
    "請問您的 {app} 帳號是用 email 還是手機註冊的?",
    "我幫您把{item} upload 到雲端了,連結在 Line 裡面。",
    "{name} {sur}{title}稍後會 call 您,大概{time}。",
    "您的退款已經 {status},三到五個 working day 會入帳。",
    "這台機器的 firmware 要 {act},我請 {name} 過去處理。",
    "麻煩您先 confirm 一下{time}的 {event} 方不方便。",
    "您的 {item} 目前 {status},如需協助請撥分機 {ext}。",
    "我把今天的 meeting note 整理成 PDF 寄給您。",
    "請問您要的是 standard 版還是 premium 版的{item}?",
    "您好,{name} 的 schedule 我看一下,他{time}有空。",
]


def fill(frame):
    """Return ``frame`` with every placeholder replaced by a freshly drawn random value.

    A value is drawn for all fourteen slots on every call, whether or not ``frame`` uses
    them; unused values are simply ignored by ``str.format``.
    """
    # Dict entries are evaluated top to bottom, which keeps the sequence of RNG draws
    # fixed and therefore keeps seeded runs reproducible.
    slots = {
        "name": random.choice(NAMES),
        "sur": random.choice(SUR),
        "title": random.choice(TITLE),
        "dept": random.choice(DEPT),
        "app": random.choice(APP),
        "item": random.choice(ITEM),
        "act": random.choice(ACT_EN),
        "status": random.choice(STATUS),
        "event": random.choice(EVENT),
        "adj": random.choice(ADJ),
        "ext": ext(),
        "num": num(),
        "time": time_(),
        "disc": disc(),
    }
    return frame.format(**slots)


def main(argv=None):
    """Generate a de-duplicated code-mix corpus, write it to a file and print a preview.

    Command-line options:
      --n     target number of distinct sentences (default 2800)
      --out   output path; UTF-8, one sentence per line (default codemix_corpus.txt)
      --seed  seed for the global ``random`` generator (default 42)

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, exactly as before.

    Fewer than ``--n`` sentences may be written if the attempt budget (``n * 40`` draws)
    runs out before enough distinct sentences are found; the printed count is the real one.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=2800)
    ap.add_argument("--out", default="codemix_corpus.txt")
    ap.add_argument("--seed", type=int, default=42)
    a = ap.parse_args(argv)
    random.seed(a.seed)

    # Compile once instead of re-resolving the patterns on every candidate sentence.
    han_re = re.compile(r"[一-鿿]")
    latin_re = re.compile(r"[A-Za-z]")

    out, tries = set(), 0
    max_tries = a.n * 40  # hard cap so the loop ends even if distinct sentences run out
    while len(out) < a.n and tries < max_tries:
        tries += 1
        s = fill(random.choice(FRAMES))
        # keep only genuine code-mix (has both Han + ASCII letters) and a sane length
        if han_re.search(s) and latin_re.search(s) and 8 <= len(s) <= 60:
            out.add(s)

    # Sorting makes the file content independent of set iteration order.
    out = sorted(out)
    with open(a.out, "w", encoding="utf-8") as f:
        # One sentence per line; an empty result yields an empty file rather than a
        # single blank line.
        f.writelines(s + "\n" for s in out)
    print(f"wrote {len(out)} diverse code-mix sentences -> {a.out} (from {len(FRAMES)} frames)")

    # Preview up to 8 sentences. A private generator seeded with 1 picks the same sample
    # as reseeding the module-level RNG would, without disturbing global random state.
    # Clamping the size avoids ValueError when fewer than 8 sentences were produced.
    preview_rng = random.Random(1)
    for s in preview_rng.sample(out, min(8, len(out))):
        print("  ", s)


if __name__ == "__main__":
    main()