Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import spaces | |
| import torch | |
| from transformers import AutoModelForSeq2SeqLM, AutoTokenizer | |
# Display label -> language code for every supported Formosan variety.
#
# Labels have the form "<people>_<dialect>"; peoples with a single variety use
# the bare people name with no underscore. The codes are the values carried by
# the language dropdowns and handed to ``translate`` as source/target language.
#
# Insertion order matters: the first entry listed for a people is the one the
# UI preselects when that people is chosen.
FORMOSAN_LANGUAGES_MAP = {
    # Amis
    "阿美_海岸": "ami_Coas",
    "阿美_恆春": "ami_Heng",
    "阿美_馬蘭": "ami_Mala",
    "阿美_南勢": "ami_Sout",
    "阿美_秀姑巒": "ami_Xiug",
    # Atayal
    "泰雅_四季": "tay_Four",
    "泰雅_賽考利克": "tay_Seko",
    "泰雅_萬大": "tay_Wand",
    "泰雅_汶水": "tay_Wens",
    "泰雅_宜蘭澤敖利": "tay_Yzea",
    "泰雅_澤敖利": "tay_Zeao",
    # Bunun
    "布農_郡群": "bnn_Junq",
    "布農_卡群": "bnn_Kaqu",
    "布農_巒群": "bnn_Luan",
    "布農_丹群": "bnn_Tanq",
    "布農_卓群": "bnn_Zhuo",
    # Single-variety peoples
    "卡那卡那富": "xnb_Kana",
    "噶瑪蘭": "ckv_Kava",
    # Paiwan
    "排灣_中": "pwn_Cent",
    "排灣_東": "pwn_East",
    "排灣_北": "pwn_Nrth",
    "排灣_南": "pwn_Sout",
    # Puyuma
    "卑南_建和": "pyu_Jian",
    "卑南_南王": "pyu_Nanw",
    "卑南_西群": "pyu_Xiqu",
    "卑南_知本": "pyu_Zhib",
    # Rukai
    "魯凱_大武": "dru_Dawu",
    "魯凱_多納": "dru_Dona",
    "魯凱_東": "dru_East",
    "魯凱_茂林": "dru_Maol",
    "魯凱_萬山": "dru_Wans",
    "魯凱_霧台": "dru_Wuta",
    # Single-variety peoples
    "拉阿魯哇": "sxr_Saar",
    "賽夏": "xsy_Sais",
    "撒奇萊雅": "szy_Saki",
    # Seediq
    "賽德克_德鹿谷": "trv_Delu",
    "賽德克_都達": "trv_Duda",
    "賽德克_德固達雅": "trv_Tegu",
    # Single-variety peoples
    "邵": "ssf_Thao",
    "太魯閣": "trv_Truk",
    "鄒": "tsu_Tsou",
    "雅美": "tao_Yami",
}

# Unique people names (the label part before the first "_"), sorted for a
# stable dropdown order.
ETHNICITIES = sorted({label.split("_")[0] for label in FORMOSAN_LANGUAGES_MAP})
# Checkpoint identifier used for both the tokenizer and the seq2seq model.
MODEL_NAME = "ithuan/nllb-600m-formosan-all-finetune-v2"

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Both objects are loaded once at import time and shared by every request.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model = model.to(device)
def get_languages_by_ethnicity(ethnicity: str):
    """Return the ``(label, code)`` pairs that belong to *ethnicity*.

    A label belongs to a people when the text before its first underscore
    equals *ethnicity* (labels without an underscore are compared whole).
    Pairs keep the order they have in ``FORMOSAN_LANGUAGES_MAP``; an unknown
    *ethnicity* yields an empty list.
    """
    matches = []
    for label, code in FORMOSAN_LANGUAGES_MAP.items():
        people, _, _ = label.partition("_")
        if people == ethnicity:
            matches.append((label, code))
    return matches
def translate(text: str, src_lang: str, tgt_lang: str) -> str:
    """Translate *text* from ``src_lang`` into ``tgt_lang``.

    Args:
        text: Source text to translate.
        src_lang: Language code assigned to the tokenizer as source language
            (for example ``"zho_Hant"`` or one of the codes in
            ``FORMOSAN_LANGUAGES_MAP``).
        tgt_lang: Language code of the desired output; its token id is forced
            as the first generated token.

    Returns:
        The decoded translation with special tokens removed, or an empty
        string when *text* is empty or contains only whitespace.
    """
    # NOTE(review): ``spaces`` is imported at the top of the file but no
    # ``@spaces.GPU`` decorator is applied here; if this app is deployed on
    # ZeroGPU hardware the decorator is presumably required -- confirm.

    # Nothing to translate: skip the (expensive) beam search entirely instead
    # of letting the model produce output for an empty prompt.
    if not text or not text.strip():
        return ""

    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    # Keep the full encoding (input ids and attention mask) and move it to
    # the model's device in one step.
    encoded = tokenizer(text, return_tensors="pt").to(device)

    generated = model.generate(
        **encoded,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_length=5000,
        num_return_sequences=1,
        num_beams=5,
        no_repeat_ngram_size=4,  # repetition blocking works better if this number is below num_beams
        renormalize_logits=True,  # recompute token probabilities after banning the repetitions
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)
# Font fallback chain for the UI: the web font pulled in by the CSS import
# below comes first, followed by a Google font and generic system families.
_font_stack = (
    "tauhu-oo",
    gr.themes.GoogleFont("Source Sans Pro"),
    "ui-sans-serif",
    "system-ui",
    "sans-serif",
)
_theme = gr.themes.Default(font=_font_stack)

# Top-level Blocks container; components are attached to it further down.
demo = gr.Blocks(
    title="族語基礎翻譯系統beta",
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=_theme,
)
def _language_dropdown_config(ethnicity: str) -> dict:
    """Build the choices/value/interactive settings for a language dropdown.

    The language list is looked up once. The first language of the people is
    preselected, and the dropdown is only interactive when there is more than
    one language to pick from. An ethnicity without any language yields an
    empty, non-interactive dropdown instead of raising ``IndexError``.
    """
    languages = get_languages_by_ethnicity(ethnicity)
    return {
        "choices": languages,
        "value": languages[0][1] if languages else None,
        "interactive": len(languages) > 1,
    }


def _update_language_dropdown(ethnicity: str):
    """Event handler: refresh a language dropdown after the people changes."""
    return gr.Dropdown(**_language_dropdown_config(ethnicity))


with demo:
    gr.Markdown(
        """
        # 族語基礎翻譯系統beta
        這是「族語華語對譯系統」,請按照下方步驟操作,或查看操作手冊及操作影片。本系統為初步開發測試版,翻譯結果可能出現錯誤,目前仍在持續優化中。試用時請務必謹慎檢視翻譯結果,切勿直接作為正式或關鍵資訊使用,感謝您的理解與支持,並請不吝留下系統回報與建議。
        """
    )

    # Tab 1: Formosan language -> Mandarin.
    with gr.Tab("族語 ⮕ 華語"):
        to_zh_ethnicity = gr.Dropdown(
            label="族別",
            choices=ETHNICITIES,
            value="阿美",
            filterable=False,
        )
        to_zh_src_lang = gr.Dropdown(
            label="語別",
            filterable=False,
            **_language_dropdown_config(to_zh_ethnicity.value),
        )
        # Hidden, fixed target language so it can be wired in as an input.
        to_zh_tgt_lang = gr.Text(value="zho_Hant", visible=False, interactive=False)
        to_zh_input_text = gr.Textbox(label="原文", lines=6)
        to_zh_btn = gr.Button("翻譯", variant="primary")
        to_zh_output = gr.Textbox(label="翻譯結果", lines=6)

        to_zh_ethnicity.change(
            _update_language_dropdown,
            inputs=to_zh_ethnicity,
            outputs=to_zh_src_lang,
        )
        to_zh_btn.click(
            translate,
            inputs=[to_zh_input_text, to_zh_src_lang, to_zh_tgt_lang],
            outputs=to_zh_output,
        )

    # Tab 2: Mandarin -> Formosan language.
    with gr.Tab("華語 ⮕ 族語"):
        # Hidden, fixed source language so it can be wired in as an input.
        to_formosan_src_lang = gr.Text(
            value="zho_Hant", visible=False, interactive=False
        )
        to_formosan_ethnicity = gr.Dropdown(
            label="族別",
            choices=ETHNICITIES,
            value="阿美",
            filterable=False,
        )
        to_formosan_tgt_lang = gr.Dropdown(
            label="語別",
            filterable=False,
            **_language_dropdown_config(to_formosan_ethnicity.value),
        )
        to_formosan_input_text = gr.Textbox(label="原文", lines=6)
        to_formosan_btn = gr.Button("翻譯", variant="primary")
        to_formosan_output = gr.Textbox(label="翻譯結果", lines=6)

        to_formosan_ethnicity.change(
            _update_language_dropdown,
            inputs=to_formosan_ethnicity,
            outputs=to_formosan_tgt_lang,
        )
        to_formosan_btn.click(
            translate,
            inputs=[to_formosan_input_text, to_formosan_src_lang, to_formosan_tgt_lang],
            outputs=to_formosan_output,
        )

demo.launch()