txya900619's picture
UI調整 (#1)
c708048 verified
import gradio as gr
import spaces
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Display label -> language code for every supported Formosan variety.
# Labels are "<ethnicity>_<dialect>", or just "<ethnicity>" when a single
# variety is listed. The codes are what the UI hands to translate() as
# src_lang / tgt_lang.
FORMOSAN_LANGUAGES_MAP = {
    "阿美_海岸": "ami_Coas",
    "阿美_恆春": "ami_Heng",
    "阿美_馬蘭": "ami_Mala",
    "阿美_南勢": "ami_Sout",
    "阿美_秀姑巒": "ami_Xiug",
    "泰雅_四季": "tay_Four",
    "泰雅_賽考利克": "tay_Seko",
    "泰雅_萬大": "tay_Wand",
    "泰雅_汶水": "tay_Wens",
    "泰雅_宜蘭澤敖利": "tay_Yzea",
    "泰雅_澤敖利": "tay_Zeao",
    "布農_郡群": "bnn_Junq",
    "布農_卡群": "bnn_Kaqu",
    "布農_巒群": "bnn_Luan",
    "布農_丹群": "bnn_Tanq",
    "布農_卓群": "bnn_Zhuo",
    "卡那卡那富": "xnb_Kana",
    "噶瑪蘭": "ckv_Kava",
    "排灣_中": "pwn_Cent",
    "排灣_東": "pwn_East",
    "排灣_北": "pwn_Nrth",
    "排灣_南": "pwn_Sout",
    "卑南_建和": "pyu_Jian",
    "卑南_南王": "pyu_Nanw",
    "卑南_西群": "pyu_Xiqu",
    "卑南_知本": "pyu_Zhib",
    "魯凱_大武": "dru_Dawu",
    "魯凱_多納": "dru_Dona",
    "魯凱_東": "dru_East",
    "魯凱_茂林": "dru_Maol",
    "魯凱_萬山": "dru_Wans",
    "魯凱_霧台": "dru_Wuta",
    "拉阿魯哇": "sxr_Saar",
    "賽夏": "xsy_Sais",
    "撒奇萊雅": "szy_Saki",
    "賽德克_德鹿谷": "trv_Delu",
    "賽德克_都達": "trv_Duda",
    "賽德克_德固達雅": "trv_Tegu",
    "邵": "ssf_Thao",
    "太魯閣": "trv_Truk",
    "鄒": "tsu_Tsou",
    "雅美": "tao_Yami",
}

# Distinct ethnicity names (the part of each label before the first "_"),
# sorted so the ethnicity dropdown has a stable order.
ETHNICITIES = sorted({label.split("_")[0] for label in FORMOSAN_LANGUAGES_MAP})
# Checkpoint identifier passed to `from_pretrained` for both the model and
# the tokenizer.
MODEL_NAME = "ithuan/nllb-600m-formosan-all-finetune-v2"

# Run on CUDA when torch reports it as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Loaded once at import time; translate() reuses these module-level objects.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def get_languages_by_ethnicity(ethnicity: str):
    """Return the ``(label, code)`` pairs that belong to *ethnicity*.

    A label belongs to an ethnicity when the text before its first ``"_"``
    (or the whole label, if it has no ``"_"``) equals *ethnicity*. Pairs are
    returned in ``FORMOSAN_LANGUAGES_MAP`` order; an ethnicity with no
    matching label yields an empty list.
    """
    return [
        (label, code)
        for label, code in FORMOSAN_LANGUAGES_MAP.items()
        if label.partition("_")[0] == ethnicity
    ]
@spaces.GPU
def translate(text: str, src_lang: str, tgt_lang: str):
    """Translate *text* from *src_lang* into *tgt_lang* with the shared model.

    Args:
        text: Source text to translate.
        src_lang: Language code of the input; set on the tokenizer before
            encoding.
        tgt_lang: Language code of the output; its token id is forced as the
            first generated token.

    Returns:
        The decoded translation with special tokens stripped, or ``""`` when
        *text* is empty or whitespace-only.
    """
    # Nothing to translate: return early rather than running beam search on
    # empty input.
    if not text or not text.strip():
        return ""

    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    # Move the encoded batch straight to the model's device; there is no need
    # to round-trip the ids through numpy/list and rebuild a tensor.
    encoded = tokenizer(text, return_tensors="pt").to(device)

    generated = model.generate(
        **encoded,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_length=5000,
        num_return_sequences=1,
        num_beams=5,
        no_repeat_ngram_size=4,  # repetition blocking works better if this number is below num_beams
        renormalize_logits=True,  # recompute token probabilities after banning the repetitions
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)
# Font fallback chain for the theme, tried in order. "tauhu-oo" is presumably
# defined by the stylesheet imported through `css` below (verify against
# tauhu.tw).
_font_stack = (
    "tauhu-oo",
    gr.themes.GoogleFont("Source Sans Pro"),
    "ui-sans-serif",
    "system-ui",
    "sans-serif",
)
_theme = gr.themes.Default(font=_font_stack)

# Top-level Blocks app; components are attached to it in the `with demo:`
# section that follows.
demo = gr.Blocks(
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=_theme,
    title="族語基礎翻譯系統beta",
)
def _language_dropdown_kwargs(ethnicity: str) -> dict:
    """Build the "語別" dropdown settings for *ethnicity*.

    Looks the language list up once and returns keyword arguments for
    ``gr.Dropdown``: the ``(label, code)`` choices, the first code as the
    selected value, and ``interactive`` set only when there is more than one
    choice to pick from.
    """
    languages = get_languages_by_ethnicity(ethnicity)
    return {
        "choices": languages,
        "value": languages[0][1],
        "interactive": len(languages) > 1,
    }


def _refresh_language_dropdown(ethnicity: str):
    """Event handler: return the dropdown update for the newly chosen *ethnicity*."""
    return gr.Dropdown(**_language_dropdown_kwargs(ethnicity))


with demo:
    # The literal is kept flush-left so the Markdown text is exactly the
    # original, with no leading indentation added.
    gr.Markdown(
        """
# 族語基礎翻譯系統beta
這是「族語華語對譯系統」,請按照下方步驟操作,或查看操作手冊及操作影片。本系統為初步開發測試版,翻譯結果可能出現錯誤,目前仍在持續優化中。試用時請務必謹慎檢視翻譯結果,切勿直接作為正式或關鍵資訊使用,感謝您的理解與支持,並請不吝留下系統回報與建議。
"""
    )

    # Tab 1: Formosan language -> Chinese.
    with gr.Tab("族語 ⮕ 華語"):
        to_zh_ethnicity = gr.Dropdown(
            label="族別",
            choices=ETHNICITIES,
            value="阿美",
            filterable=False,
        )
        to_zh_src_lang = gr.Dropdown(
            label="語別",
            filterable=False,
            **_language_dropdown_kwargs(to_zh_ethnicity.value),
        )
        # Hidden, fixed field so the target code reaches translate() as a
        # regular event input.
        to_zh_tgt_lang = gr.Text(value="zho_Hant", visible=False, interactive=False)
        to_zh_input_text = gr.Textbox(label="原文", lines=6)
        to_zh_btn = gr.Button("翻譯", variant="primary")
        to_zh_output = gr.Textbox(label="翻譯結果", lines=6)

        # Changing the ethnicity repopulates the language dropdown.
        to_zh_ethnicity.change(
            _refresh_language_dropdown,
            inputs=to_zh_ethnicity,
            outputs=to_zh_src_lang,
        )
        to_zh_btn.click(
            translate,
            inputs=[to_zh_input_text, to_zh_src_lang, to_zh_tgt_lang],
            outputs=to_zh_output,
        )

    # Tab 2: Chinese -> Formosan language.
    with gr.Tab("華語 ⮕ 族語"):
        # Hidden, fixed field so the source code reaches translate() as a
        # regular event input.
        to_formosan_src_lang = gr.Text(
            value="zho_Hant", visible=False, interactive=False
        )
        to_formosan_ethnicity = gr.Dropdown(
            label="族別",
            choices=ETHNICITIES,
            value="阿美",
            filterable=False,
        )
        to_formosan_tgt_lang = gr.Dropdown(
            label="語別",
            filterable=False,
            **_language_dropdown_kwargs(to_formosan_ethnicity.value),
        )
        to_formosan_input_text = gr.Textbox(label="原文", lines=6)
        to_formosan_btn = gr.Button("翻譯", variant="primary")
        to_formosan_output = gr.Textbox(label="翻譯結果", lines=6)

        # Changing the ethnicity repopulates the language dropdown.
        to_formosan_ethnicity.change(
            _refresh_language_dropdown,
            inputs=to_formosan_ethnicity,
            outputs=to_formosan_tgt_lang,
        )
        to_formosan_btn.click(
            translate,
            inputs=[to_formosan_input_text, to_formosan_src_lang, to_formosan_tgt_lang],
            outputs=to_formosan_output,
        )

demo.launch()