File size: 6,837 Bytes
dfa700d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python3
"""v2 code-mix generator — EXPANDED frame bank (more syntactic variety + English in more positions)
to push mix-CER below the 0.178 the v1 (40-frame) generator reached. Adds questions, longer multi-clause
frames, leading-English frames, and number/date-heavy frames. Dedups against an existing corpus.

Usage: python gen_codemix_v2.py --n 3000 --exclude codemix_corpus.txt --out codemix_v2.txt
"""
import argparse, random, re

# Slot vocabularies. fill() draws one random entry from each list per sentence
# and substitutes it into the matching {placeholder} of a frame.

# English given names -> {name}
NAMES = ["Jason","Kelly","Daniel","Rita","Amy","Kevin","Linda","Peter","Vivian","Frank","Tom","Cindy",
         "Eric","Grace","Sam","Joyce","Leo","Nina","Oscar","Sandy","Ryan","Emma","Jack","Mia","Henry",
         "Chloe","Ivan","Wendy","Alan","Tina","Bella","George","Karen","Lucas","Sophie","Victor"]
# Single-character Chinese surnames -> {sur}
SUR=["王","陳","林","李","張","黃","吳","劉","蔡","楊","許","鄭","謝","郭","洪","曾","廖","賴","周","葉"]
# Chinese job titles -> {title}
TITLE=["經理","助理","工程師","專員","主任","課長","副理","顧問","店長","總監","組長"]
# Chinese department names -> {dept}
DEPT=["技術部","客服部","業務部","品保部","財務部","人資部","採購部","研發部","行銷部","資訊部","法務部","物流部"]
# Application / channel names (mostly English, some mixed) -> {app}
APP=["App","Line","Email","Portal","Outlook","Teams","Slack","ERP 系統","CRM 系統","官網","Dropbox","Notion"]
# Things being handled (mixed English/Chinese nouns) -> {item}
ITEM=["Excel 報表","PDF 檔","QR Code","VIP 等級","Zoom 連結","email","發票","合約","報告書","專案","預算表",
      "行事曆","購物車","訂單","帳號","密碼","會員卡","序號","授權碼","點數","報價單","出貨單","履歷","簡報"]
# English action verbs -> {act}
ACT=["login","logout","update","reset","upload","download","check","confirm","submit","cancel","review",
     "approve","sync","backup","scan","forward","schedule","reschedule","verify","activate"]
# English status words -> {status}
STATUS=["ready","done","updated","confirmed","cancelled","pending","online","offline","expired","approved","rejected"]
# Event types (English or mixed) -> {event}
EVENT=["meeting","Zoom 會議","conference call","interview","presentation","demo","workshop","training","kick-off"]
# English adjectives describing availability / urgency -> {adj}
ADJ=["busy","urgent","important","ready","OK","fine","available","tied up"]
def ext():
    """Return a random four-digit phone extension (1000-9999) as a string."""
    # randrange excludes its upper bound, so 10000 keeps 9999 reachable.
    extension = random.randrange(1000, 10000)
    return f"{extension}"
def num():
    """Return a random six-digit number (100000-999999) as a string."""
    # randrange excludes its upper bound, so 1000000 keeps 999999 reachable.
    value = random.randrange(100000, 1000000)
    return f"{value}"
def time_():
    """Return one randomly chosen Chinese time/date expression."""
    expressions = (
        "上午九點",
        "上午十點半",
        "中午十二點",
        "下午兩點",
        "下午三點半",
        "下午四點",
        "明天上午",
        "後天下午",
        "這個禮拜五",
        "下週一早上",
        "月底前",
        "這個月十五號",
        "禮拜三中午",
    )
    return random.choice(expressions)
def disc():
    """Return one randomly chosen Chinese discount/promotion phrase."""
    offers = (
        "九折",
        "八五折",
        "七九折",
        "買一送一",
        "免運",
        "現折五百",
    )
    return random.choice(offers)

# v1's 40 frames are kept implicitly by overlap; here are NEW + varied ones (leading-English, questions,
# multi-clause, numeric). The fill() randomization over big slot lists yields tens of thousands distinct.
# Sentence templates consumed by fill() through str.format. Available slots:
#   {name} {sur} {title} {dept} {app} {item} {act} {status} {event} {adj}
#   {ext} {num} {time} {disc}
# A frame may use any subset of the slots; fill() always supplies all of them.
# main() later discards any filled sentence that is not 10-64 characters long
# or that lacks either a CJK or a Latin character.
FRAMES = [
    # leading / English-first: the sentence opens with an English word
    "Hi {name},您的{item}我已經 {act} 好了,要不要 double check 一下?",
    "OK,那我先把這個 {event} {act} 到{time},confirm 後再通知您。",
    "Sorry,{app} 剛剛 {status},您的{item}可能要重新 {act} 一次。",
    "No problem,我請 {name} {sur}{title}在{time}前 {act} 完這份{item}。",
    "Actually 這個 {item} 的 deadline 是{time},麻煩您 prioritize 一下。",
    # questions
    "請問您的 {app} 帳號是用 email 還是 phone number 註冊的?",
    "您要的{item}是 standard 還是 premium 版,方便 confirm 一下嗎?",
    "請問這個 {event} 的 link 您有收到嗎?還是要我 re-send?",
    "{name} 的分機是 {ext} 對嗎?我幫您 transfer 過去好嗎?",
    "您的{item}編號是不是 {num}?我這邊 check 一下 status。",
    "請問{time}的 {event} 改到三樓還是線上 Zoom?",
    # multi-clause: several comma-separated clauses per sentence
    "您的訂單 {num} 已經 {status},預計{time}送達,tracking number 我用 Line 傳給您。",
    "我先幫您把{item} {act},等 {dept} approve 之後,系統會自動 send 一封 email。",
    "{name} {sur}{title}今天比較 {adj},不過他說{time}可以跟您 quick call 一下。",
    "這個 case 我已經 {act} 到 CRM 系統,reference number 是 {num},有問題再撥分機 {ext}。",
    "麻煩您先 {act} 一下{item},然後把 screenshot upload 到 {app},我這邊同步處理。",
    "您的會員 {status},如果現在 upgrade 到 VIP,這個月可以多享{disc}的優惠。",
    # numeric / scheduling heavy: built around {num}, {time} and similar slots
    "您預約的 {event} 是{time},地點在三樓,記得帶 ID card 上來 check in。",
    "這批貨的 PO number 是 {num},預計{time}到,到了我會 update 給{dept}。",
    "您的 password 已經 expired,請在{time}前用 {app} reset,不然帳號會被 lock。",
    "退款 {num} 元已經 {status},三到五個 working day 會退回原本的 credit card。",
    "{name} 的 {event} 我 reschedule 到{time}了,calendar invite 已經 send 給所有人。",
    # service register: polite customer-service phrasing
    "您好,這裡是 {dept} customer service,請問需要什麼 help?",
    "不好意思讓您 hold 這麼久,您的{item}我現在馬上幫您 {act}。",
    "我幫您 note 在系統了,{name} 一上線就會 call back,大概{time}。",
    "您的 {item} 目前 {status},如需 support 請撥分機 {ext} 找 {name}。",
    "這份{item}我 convert 成 PDF 用 email 寄給您,subject 會註明{time}。",
    "麻煩您 confirm 一下{time}方不方便,我再幫您把 {event} lock 起來。",
    "您的 {app} 點數還有 {num} 點,結帳時可以 redeem 折抵{disc}。",
    "好的 {name},我這邊 follow up 您的{item},有 update 第一時間 ping 您。",
]

def fill(frame):
    """Return ``frame`` with every slot replaced by a freshly drawn random value.

    All fourteen slot values are drawn on every call, whether or not the
    frame uses them; unused ones are simply ignored by ``str.format``.
    """
    pick = random.choice
    # The draw order below is fixed so that a given seed reproduces the same
    # sentences; do not reorder the entries.
    slots = {
        "name": pick(NAMES),
        "sur": pick(SUR),
        "title": pick(TITLE),
        "dept": pick(DEPT),
        "app": pick(APP),
        "item": pick(ITEM),
        "act": pick(ACT),
        "status": pick(STATUS),
        "event": pick(EVENT),
        "adj": pick(ADJ),
        "ext": ext(),
        "num": num(),
        "time": time_(),
        "disc": disc(),
    }
    return frame.format(**slots)

# Compiled once instead of on every loop iteration. A sentence only counts as
# code-mixed when it contains at least one CJK and one Latin character.
_HAN_RE = re.compile(r"[一-鿿]")
_LATIN_RE = re.compile(r"[A-Za-z]")


def _load_excluded(path):
    """Return the stripped, non-empty lines of ``path`` as a set.

    The file is read as UTF-8 to match the encoding used for the output, so
    deduplication does not depend on the platform's default locale. A missing
    file is treated as "nothing to exclude" and yields an empty set.
    """
    try:
        with open(path, encoding="utf-8") as fh:
            return {line.strip() for line in fh if line.strip()}
    except FileNotFoundError:
        return set()


def main():
    """Generate unique code-mixed sentences and write them to a file.

    Command-line options:
        --n        number of sentences to aim for (default 3000)
        --exclude  existing corpus whose lines must not be repeated
        --out      output path; sentences are written sorted, one per line
        --seed     seed for the global random generator (default 7)

    Generation stops once ``n`` sentences are collected or after ``n * 60``
    attempts, whichever comes first, so fewer than ``n`` may be written.
    A summary line and up to eight sample sentences are printed afterwards.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=3000)
    ap.add_argument("--exclude", default="codemix_corpus.txt")
    ap.add_argument("--out", default="codemix_v2.txt")
    ap.add_argument("--seed", type=int, default=7)
    a = ap.parse_args()

    random.seed(a.seed)
    excl = _load_excluded(a.exclude)

    out, tries = set(), 0
    max_tries = a.n * 60  # hard cap so a saturated frame bank cannot loop forever
    while len(out) < a.n and tries < max_tries:
        tries += 1
        s = fill(random.choice(FRAMES))
        if not 10 <= len(s) <= 64:
            continue
        if s in excl or s in out:
            continue
        if _HAN_RE.search(s) and _LATIN_RE.search(s):
            out.add(s)

    out = sorted(out)
    # writelines() emits nothing for an empty result, avoiding a stray blank line.
    with open(a.out, "w", encoding="utf-8") as fh:
        fh.writelines(f"{s}\n" for s in out)
    print(f"wrote {len(out)} NEW diverse code-mix sentences -> {a.out} ({len(FRAMES)} new frames, excl {len(excl)})")

    # Preview with a private, fixed-seed generator: same samples as seeding the
    # global one with 1, without disturbing global random state. The sample
    # size is clamped because random.sample raises ValueError when asked for
    # more items than exist.
    preview = random.Random(1).sample(out, min(8, len(out)))
    for s in preview:
        print("  ", s)

if __name__=="__main__":
    main()