PrimeTTS / scripts /gen_codemix.py
Luigi's picture
reproduction: actual VoxCPM2-TW pipeline scripts + master run + eval set
383648f verified
Raw
History Blame Contribute Delete
6.35 kB
#!/usr/bin/env python3
"""Generate DIVERSE zh-TW + English code-mix sentences to fix the mix-CER weak spot.
The existing corpus code-mix (~4.4k rows) is templated — a few frames repeated with swapped English
nouns — so the model overfits frames and fails on diverse eval code-mix (mix CER 0.318). This builds a
broad frame bank (varied syntax + English in varied positions) x varied insertions -> many distinct
sentences, matching the Taiwan office / phone-attendant register the eval set uses.
Usage: python gen_codemix.py --n 2800 --out codemix_corpus.txt
"""
import argparse, random, re
# Slot vocabularies consumed by fill(). Values are picked with random.choice, so
# changing the contents or the order of any list changes the corpus produced for
# a given --seed.

# English given names for the {name} slot.
NAMES = ["Jason", "Kelly", "Daniel", "Rita", "Amy", "Kevin", "Linda", "Peter", "Vivian", "Frank",
"Tom", "Cindy", "Eric", "Grace", "Sam", "Joyce", "Leo", "Nina", "Oscar", "Sandy", "Ryan",
"Emma", "Jack", "Mia", "Henry", "Chloe", "Ivan", "Wendy", "Alan", "Tina"]
# Chinese surnames for the {sur} slot.
SUR = ["王", "陳", "林", "李", "張", "黃", "吳", "劉", "蔡", "楊", "許", "鄭", "謝", "郭", "洪"]
# Chinese job titles for the {title} slot.
TITLE = ["經理", "助理", "工程師", "專員", "主任", "課長", "副理", "顧問", "店長"]
# Chinese department names for the {dept} slot.
DEPT = ["技術部", "客服部", "業務部", "品保部", "財務部", "人資部", "採購部", "研發部", "行銷部", "資訊部", "法務部"]
# Application / channel names for the {app} slot (English, mixed, and one pure-Chinese entry).
APP = ["App", "Line", "Email", "Portal", "Outlook", "Teams", "Slack", "ERP 系統", "CRM 系統", "官網"]
# Things being handled, for the {item} slot. Many entries are pure Chinese, so a frame
# whose only English comes from {item} can yield a sentence that main() filters out.
ITEM = ["Excel 報表", "PDF 檔", "QR Code", "VIP 等級", "Zoom 連結", "email", "發票", "合約", "報告書",
"專案", "預算表", "行事曆", "購物車", "訂單", "帳號", "密碼", "會員卡", "序號", "授權碼", "點數"]
# English action verbs for the {act} slot.
ACT_EN = ["login", "logout", "update", "reset", "upload", "download", "check", "confirm", "submit",
"cancel", "review", "approve", "sync", "backup", "scan"]
# English status words for the {status} slot.
STATUS = ["ready", "done", "updated", "confirmed", "cancelled", "pending", "online", "offline", "expired"]
# Event names for the {event} slot (English, plus one mixed entry).
EVENT = ["meeting", "Zoom 會議", "conference call", "interview", "presentation", "demo", "workshop", "training"]
# Short English descriptors for the {adj} slot.
ADJ = ["busy", "urgent", "important", "ready", "OK", "fine"]
def ext():
    """Return a random four-digit extension number (1000-9999) as a string."""
    extension = random.randint(1000, 9999)
    return f"{extension}"
def num():
    """Return a random six-digit number (100000-999999) as a string."""
    number = random.randint(100000, 999999)
    return f"{number}"
def time_():
    """Return one randomly chosen Chinese time expression for the {time} slot."""
    options = (
        "上午九點",
        "上午十點半",
        "中午十二點",
        "下午兩點",
        "下午三點半",
        "下午四點",
        "明天上午",
        "後天下午",
        "這個禮拜五",
        "下週一早上",
    )
    return random.choice(options)
def disc():
    """Return one randomly chosen Chinese discount/promotion phrase for the {disc} slot."""
    offers = ("九折", "八五折", "七九折", "買一送一", "免運")
    return random.choice(offers)
# Sentence templates in str.format syntax. Placeholders ({name}, {sur}, {title}, {dept},
# {app}, {item}, {act}, {status}, {event}, {adj}, {ext}, {num}, {time}, {disc}) are filled
# by fill(). Some frames carry fixed English words; others get English only from their
# slots, so a fill with all-Chinese slot values contains no ASCII letters and is rejected
# by the code-mix filter in main().
FRAMES = [
"您好,{dept}的 {name} {sur}{title}為您服務。",
"幫您轉接給 {name} {sur}{title},他的分機是 {ext}。",
"請問您要查詢的{item}編號是多少?",
"您的{item}已經 {status} 了,請至 {app} 查看。",
"{name} 的 {event} 改到{time}。",
"請至 {app} 點選 {act} 重新登入。",
"我這邊先幫您 {act} 這筆{item}。",
"這個{item}需要重新 {act},麻煩您稍等。",
"您的會員 {status},現在升級 VIP 可享{disc}優惠。",
"麻煩您把 {item} email 到我的信箱,謝謝。",
"系統顯示您的{item}需要 {act},請聯絡{dept}。",
"請問 {name} 在嗎?我這邊有一份 {item} 要給他。",
"您的訂單編號是 {num},預計{time}送達。",
"我幫您預約{time}的 {event},地點在三樓會議室。",
"不好意思,{app} 現在 {status},請您稍後再試。",
"請先 {act} 一下您的{item},我這邊同步處理。",
"{name} 說他今天比較 {adj},{event}可能要延到{time}。",
"您的{item}我已經 {status},等一下會 send 給您。",
"麻煩您提供一下 {item} 的序號,我幫您 {act}。",
"這個 case 我先 update 到系統,{dept}會再回覆您。",
"我的 {item} 今天有點問題,可以幫我 {act} 嗎?",
"請問這個 {event} 的 link 是哪一個?",
"您好,這裡是 {dept},請問需要什麼 service?",
"{name} 的分機 {ext} 現在忙線中,要幫您留言嗎?",
"您的 password 已經過期,請用 {app} 重設一個新的。",
"我這邊收到您的 {item} 了,正在 {act} 中,請稍候。",
"下午的 {event} 我會把 agenda 先寄給大家。",
"麻煩 {name} 在{time}前把{item} {act} 完成。",
"這份{item}的 deadline 是{time},請務必準時。",
"您的 VIP 點數還有 {num} 點,可以折抵{disc}。",
"請問您的 {app} 帳號是用 email 還是手機註冊的?",
"我幫您把{item} upload 到雲端了,連結在 Line 裡面。",
"{name} {sur}{title}稍後會 call 您,大概{time}。",
"您的退款已經 {status},三到五個 working day 會入帳。",
"這台機器的 firmware 要 {act},我請 {name} 過去處理。",
"麻煩您先 confirm 一下{time}的 {event} 方不方便。",
"您的 {item} 目前 {status},如需協助請撥分機 {ext}。",
"我把今天的 meeting note 整理成 PDF 寄給您。",
"請問您要的是 standard 版還是 premium 版的{item}?",
"您好,{name} 的 schedule 我看一下,他{time}有空。",
]
def fill(frame):
    """Render one sentence template with freshly drawn random slot values.

    Every slot value is drawn on every call, whether or not ``frame`` uses it,
    so each call consumes the same number of draws from the global ``random``
    state.

    Args:
        frame: A ``str.format`` template using the slot names listed below.

    Returns:
        The formatted sentence.
    """
    # Dict entries are evaluated top to bottom; keep this order so the random
    # draw sequence (and therefore seeded output) stays unchanged.
    slots = {
        "name": random.choice(NAMES),
        "sur": random.choice(SUR),
        "title": random.choice(TITLE),
        "dept": random.choice(DEPT),
        "app": random.choice(APP),
        "item": random.choice(ITEM),
        "act": random.choice(ACT_EN),
        "status": random.choice(STATUS),
        "event": random.choice(EVENT),
        "adj": random.choice(ADJ),
        "ext": ext(),
        "num": num(),
        "time": time_(),
        "disc": disc(),
    }
    return frame.format(**slots)
def main(argv=None):
    """Generate a de-duplicated code-mix corpus, write it to disk, and print a preview.

    Command-line options:
        --n     target number of distinct sentences (default 2800)
        --out   output text file, one sentence per line (default codemix_corpus.txt)
        --seed  seed for the global ``random`` generator (default 42)

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the original behaviour.

    Fewer than ``--n`` sentences are written if the attempt budget
    (``n * 40`` fills) is exhausted first; the final message reports the
    actual count.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=2800)
    ap.add_argument("--out", default="codemix_corpus.txt")
    ap.add_argument("--seed", type=int, default=42)
    a = ap.parse_args(argv)
    random.seed(a.seed)

    # Compiled once instead of being looked up on every loop iteration.
    han = re.compile(r"[一-鿿]")
    latin = re.compile(r"[A-Za-z]")

    out, tries = set(), 0
    max_tries = a.n * 40  # bounds the loop when distinct valid sentences run out
    while len(out) < a.n and tries < max_tries:
        tries += 1
        s = fill(random.choice(FRAMES))
        # keep only genuine code-mix (has both Han + ASCII letters) and a sane length
        if han.search(s) and latin.search(s) and 8 <= len(s) <= 60:
            out.add(s)

    # Sorting makes the file content independent of set iteration order.
    out = sorted(out)
    with open(a.out, "w", encoding="utf-8") as f:
        # One sentence per line; an empty result yields an empty file rather
        # than a file holding a single blank line.
        f.writelines(f"{s}\n" for s in out)
    print(f"wrote {len(out)} diverse code-mix sentences -> {a.out} (from {len(FRAMES)} frames)")

    # Preview with a private generator seeded with 1: same picks as reseeding
    # the module-level generator, but without clobbering its state. The sample
    # size is capped because random.sample raises ValueError when asked for
    # more items than the population holds (e.g. --n 5).
    preview_rng = random.Random(1)
    for s in preview_rng.sample(out, min(8, len(out))):
        print(" ", s)


if __name__ == "__main__":
    main()