Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import re | |
| import sys | |
| import threading | |
| import unicodedata | |
| from dataclasses import dataclass | |
| from typing import Dict, Tuple | |
| import gradio as gr | |
| import torch | |
| from transformers import AutoModelForSeq2SeqLM, NllbTokenizer | |
# sacremoses is treated as an optional dependency: if importing it fails for
# any reason, the name is bound to None so later code can test for it.
try:
    from sacremoses import MosesPunctNormalizer
except Exception:
    MosesPunctNormalizer = None
# `spaces` is treated as optional. When it imports, `gpu` is whatever
# spaces.GPU(duration=60) returns (presumably the Hugging Face Spaces GPU
# decorator - confirm against the `spaces` package docs). Otherwise `gpu`
# falls back to an identity decorator.
try:
    import spaces

    gpu = spaces.GPU(duration=60)
except Exception:

    def gpu(fn):
        """Return ``fn`` unchanged; fallback used when ``spaces`` is unavailable."""
        return fn
# Model repository IDs, one checkpoint per translation direction.
F2EN_MODEL_ID = "FormosanBank/nllb200-formosan-en-spm8k"
EN2F_MODEL_ID = "FormosanBank/nllb200-en-formosan-spm8k"
F2ZH_MODEL_ID = "FormosanBank/nllb200-formosan-zh-spm8k"
ZH2F_MODEL_ID = "FormosanBank/nllb200-zh-formosan-spm8k"

# Language-ID tokens used for the English and Chinese side of a translation.
ENGLISH_LID = "eng_Latn"
CHINESE_LID = "zho_Hant"

# Passed as the tokenizer's max_length (with truncation) when encoding input.
MAX_INPUT_LENGTH = 384
# (display name, language code) pairs, in the order shown in the UI dropdown.
_FORMOSAN_CODES = (
    ("Amis", "ami"),
    ("Bunun", "bnn"),
    ("Kavalan", "ckv"),
    ("Rukai", "dru"),
    ("Paiwan", "pwn"),
    ("Puyuma", "pyu"),
    ("Thao", "ssf"),
    ("Saaroa", "sxr"),
    ("Sakizaya", "szy"),
    ("Tao / Yami", "tao"),
    ("Atayal", "tay"),
    ("Seediq", "trv"),
    ("Tsou", "tsu"),
    ("Kanakanavu", "xnb"),
    ("Saisiyat", "xsy"),
)

# Display name -> (language code, language-ID token). Every language-ID token
# is the language code followed by "_Latn".
FORMOSAN_LANGS: Dict[str, Tuple[str, str]] = {
    display_name: (code, f"{code}_Latn") for display_name, code in _FORMOSAN_CODES
}
# UI label for each translation direction -> short direction key used by the
# model-selection and prompt-building code.
DIRECTION_LABELS = {
    "Formosan → English": "f2en",
    "English → Formosan": "en2f",
    "Formosan → Chinese": "f2zh",
    "Chinese → Formosan": "zh2f",
}
# UI label for each source/domain bucket -> value interpolated into the
# "<dom_...>" control tag when the prompt is built.
DOMAIN_CHOICES = {
    "Unknown / general": "unknown",
    "Dictionary": "dictionary",
    "Learning vocabulary": "learning_vocab",
    "Classroom context": "classroom_context",
    "Picture story": "picture_story",
    "Picture book": "picture_book",
    "Essays": "essays",
    "Reading / writing": "reading_writing",
    "Culture": "culture",
    "Nine-level materials": "nine_level",
    "YouTube": "youtube",
    "NTU": "ntu",
    "Presidential apology": "presidential_apology",
    "Formosan ePark": "formosan_epark",
    "Formosan 100 Paiwan Texts": "formosan_100_paiwan_texts",
    "Formosan Amis Myths and Customs": "formosan_amis_myths_and_customs",
    "Formosan Old Texts": "formosan_old_texts",
    "Formosan Paiwan Stories": "formosan_paiwanstories",
    "Formosan Rik Bunun": "formosan_rik_bunun",
    "Formosan SEALS": "formosan_seals",
    "Formosan Wilang Yutas Videos": "formosan_wilang_yutas_videos",
    "Formosan Yeddas Blog": "formosan_yeddas_blog",
    "Formosan Zheng Data": "formosan_zheng_data",
    "Formosan GitBook translations": "formosan_gitbook_translations",
}
# Dialect labels whose tag value is the label lower-cased with spaces removed,
# in the order they appear in the UI dropdown (after the default entry).
_DIALECT_LABELS = (
    "Unknown",
    "Central",
    "Coastal",
    "Dawu",
    "Delu Valley",
    "Dona",
    "Duda",
    "Eastern",
    "Four Seasons",
    "Hengchun",
    "Jianhe",
    "Junqun",
    "Kaqun",
    "Luanqun",
    "Malan",
    "Maolin",
    "Nanwang",
    "Northern",
    "Sekolik",
    "Southern",
    "Tanqun",
    "Tegudaya",
    "Truku",
    "Wanda",
    "Wanshan",
    "Wenshui",
    "Wutai",
    "Xiqun",
    "Xiuguluan",
)

# UI label -> value interpolated into the "<dialect_...>" control tag.
# The default entry is the only one whose value is not derived from its label.
DIALECT_CHOICES = {"Default / unknown": "default"}
DIALECT_CHOICES.update(
    (label, label.replace(" ", "").lower()) for label in _DIALECT_LABELS
)
# Values shared by every preset: domain label, dialect label, max new tokens,
# beam size and repetition penalty.
_PRESET_COMMON = ("Unknown / general", "Default / unknown", 96, 4, 1.15)

# (preset title, input text, direction label, Formosan language) per example.
_PRESET_SPECS = (
    (
        "English → Amis: He revealed what he was doing.",
        "He revealed what he was doing.",
        "English → Formosan",
        "Amis",
    ),
    (
        "English → Seediq: beetles in the forest",
        "There are many beetles in the forest.",
        "English → Formosan",
        "Seediq",
    ),
    (
        "Amis → English: Pa'araw cingra...",
        "Pa'araw cingra to demak nira.",
        "Formosan → English",
        "Amis",
    ),
    (
        "Paiwan → English: abonai aravac...",
        "abonai aravac a sapoi.",
        "Formosan → English",
        "Paiwan",
    ),
    (
        "Chinese → Amis: 他回家了。",
        "他回家了。",
        "Chinese → Formosan",
        "Amis",
    ),
    (
        "Amis → Chinese: Pa'araw cingra...",
        "Pa'araw cingra to demak nira.",
        "Formosan → Chinese",
        "Amis",
    ),
)

# Preset title -> 8-tuple in the order of the UI inputs it fills: text,
# direction, language, domain, dialect, max new tokens, beams, repetition
# penalty.
EXAMPLE_PRESETS = {
    title: (sample_text, direction_label, language, *_PRESET_COMMON)
    for title, sample_text, direction_label, language in _PRESET_SPECS
}
# English punctuation normalizer; stays None when sacremoses was not importable.
mpn_english = MosesPunctNormalizer(lang="en") if MosesPunctNormalizer is not None else None
if mpn_english is not None:
    # Replace each (pattern, replacement) pair with (compiled pattern,
    # replacement) so the patterns are compiled once here.
    mpn_english.substitutions = [
        (re.compile(regex), replacement)
        for regex, replacement in mpn_english.substitutions
    ]
class _NonPrintingTable(dict):
    """Lazy ``str.translate`` table that replaces non-printing characters.

    Code points are classified the first time they are looked up and the
    result is cached in the dict, so only characters that actually occur in
    translated text are ever examined. This avoids scanning the whole Unicode
    range up front and holding an entry for every control, format, surrogate,
    private-use and unassigned code point.
    """

    # Same category set as the eager implementation ("C" is never returned by
    # unicodedata.category but is kept so the membership test is unchanged).
    _CATEGORIES = frozenset({"C", "Cc", "Cf", "Cs", "Co", "Cn"})

    def __init__(self, replace_by: str) -> None:
        super().__init__()
        self._replace_by = replace_by

    def __missing__(self, codepoint: int):
        """Classify ``codepoint``, cache the mapping, and return it."""
        if unicodedata.category(chr(codepoint)) in self._CATEGORIES:
            mapped = self._replace_by
        else:
            # Mapping an ordinal to itself leaves the character unchanged.
            mapped = codepoint
        self[codepoint] = mapped
        return mapped


def get_non_printing_char_replacer(replace_by: str = " "):
    """Return a function that replaces non-printing characters in a string.

    A character is replaced when its Unicode general category is one of
    Cc, Cf, Cs, Co or Cn. Note that this includes newline and tab (both Cc).

    Args:
        replace_by: Replacement string for each non-printing character;
            an empty string removes them.

    Returns:
        A callable taking one string and returning the cleaned string.
    """
    table = _NonPrintingTable(replace_by)
    return lambda line: line.translate(table)


# Module-wide replacer that turns each non-printing character into a space.
replace_nonprint = get_non_printing_char_replacer(" ")
def preproc_english(text: str) -> str:
    """Normalize English input text before it is placed in the prompt.

    Applies the pre-compiled punctuation substitutions from ``mpn_english``
    in order (skipped when it is ``None``), replaces non-printing characters
    via ``replace_nonprint``, applies NFKC normalization, and strips
    surrounding whitespace.
    """
    substitutions = () if mpn_english is None else mpn_english.substitutions
    normalized = text
    for regex, replacement in substitutions:
        normalized = regex.sub(replacement, normalized)
    printable = replace_nonprint(normalized)
    return unicodedata.normalize("NFKC", printable).strip()
def preproc_formosan(text: str) -> str:
    """Normalize Formosan-language input text.

    Replaces non-printing characters via ``replace_nonprint``, applies NFKC
    normalization, and strips surrounding whitespace.
    """
    printable = replace_nonprint(text)
    normalized = unicodedata.normalize("NFKC", printable)
    return normalized.strip()
def preproc_chinese(text: str) -> str:
    """Normalize Chinese input text.

    Replaces non-printing characters via ``replace_nonprint``, applies NFKC
    normalization, and strips surrounding whitespace.
    """
    without_nonprinting = replace_nonprint(text)
    nfkc_text = unicodedata.normalize("NFKC", without_nonprinting)
    return nfkc_text.strip()
@dataclass
class ModelBundle:
    """A loaded tokenizer/model pair together with the repo it came from.

    Declared as a dataclass so it can be built with keyword arguments
    (``ModelBundle(tokenizer=..., model=..., repo_id=...)``); a plain class
    with only annotations has no such constructor and raises ``TypeError``.
    """

    # Tokenizer loaded from ``repo_id``.
    tokenizer: NllbTokenizer
    # Sequence-to-sequence model loaded from ``repo_id``.
    model: AutoModelForSeq2SeqLM
    # Model repository ID both objects were loaded from.
    repo_id: str
# Loaded bundles keyed by direction key ("f2en", "en2f", "f2zh", "zh2f").
MODEL_CACHE: Dict[str, ModelBundle] = {}
# Re-entrant lock held by load_bundle while it reads or modifies MODEL_CACHE.
MODEL_LOCK = threading.RLock()
def active_device() -> torch.device:
    """Return the ``cuda`` device when CUDA is available, otherwise ``cpu``."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")
def model_id_for(direction_key: str) -> str:
    """Return the model repository ID for a direction key.

    Args:
        direction_key: One of ``"f2en"``, ``"en2f"``, ``"f2zh"``, ``"zh2f"``.

    Raises:
        KeyError: If ``direction_key`` is not one of the four known keys.
    """
    repo_by_direction = {
        "f2en": F2EN_MODEL_ID,
        "en2f": EN2F_MODEL_ID,
        "f2zh": F2ZH_MODEL_ID,
        "zh2f": ZH2F_MODEL_ID,
    }
    return repo_by_direction[direction_key]
def _offload_cached_models(keep_key=None) -> None:
    """Move cached models that sit on CUDA back to the CPU, then free CUDA cache.

    The bundle stored under ``keep_key`` (if any) is left where it is.
    Callers must hold ``MODEL_LOCK``.
    """
    for key, cached in MODEL_CACHE.items():
        if key == keep_key:
            continue
        if next(cached.model.parameters()).device.type == "cuda":
            cached.model.to("cpu")
    torch.cuda.empty_cache()


def load_bundle(direction_key: str) -> ModelBundle:
    """Return the tokenizer/model bundle for ``direction_key``, loading it if needed.

    The bundle is cached in ``MODEL_CACHE``. When the active device is CUDA,
    every *other* cached model is moved to the CPU before the requested model
    is loaded or moved onto the GPU, so at most one model is resident on the
    GPU at a time.

    Args:
        direction_key: One of ``"f2en"``, ``"en2f"``, ``"f2zh"``, ``"zh2f"``.

    Returns:
        The cached ``ModelBundle`` with its model on the active device.

    Raises:
        KeyError: If ``direction_key`` is not a known direction (raised by
            ``model_id_for``).
    """
    repo_id = model_id_for(direction_key)
    device = active_device()
    with MODEL_LOCK:
        bundle = MODEL_CACHE.get(direction_key)
        if device.type == "cuda":
            # Evict first: moving the requested model onto the GPU before the
            # others have left would briefly hold two models in GPU memory.
            _offload_cached_models(keep_key=direction_key)
        if bundle is None:
            tokenizer = NllbTokenizer.from_pretrained(repo_id)
            # Half precision on GPU, full precision on CPU.
            dtype = torch.float16 if device.type == "cuda" else torch.float32
            model = AutoModelForSeq2SeqLM.from_pretrained(repo_id, torch_dtype=dtype)
            # Decoding starts from the tokenizer's EOS token; set it on both
            # the model config and the generation config.
            model.config.decoder_start_token_id = tokenizer.eos_token_id
            model.generation_config.decoder_start_token_id = tokenizer.eos_token_id
            model.to(device)
            model.eval()
            bundle = ModelBundle(tokenizer=tokenizer, model=model, repo_id=repo_id)
            MODEL_CACHE[direction_key] = bundle
        elif next(bundle.model.parameters()).device.type != device.type:
            # Compare device *types*: a parameter reports "cuda:0" while
            # active_device() returns index-less "cuda", which never compare
            # equal as torch.device objects.
            bundle.model.to(device)
            bundle.model.eval()
        return bundle
def known_tag(tokenizer: NllbTokenizer, tag: str, fallback: str) -> str:
    """Return ``tag`` if the tokenizer has a dedicated token for it, else ``fallback``.

    A tag counts as unknown when ``convert_tokens_to_ids`` returns ``None``
    or the tokenizer's ``unk_token_id``.
    """
    token_id = tokenizer.convert_tokens_to_ids(tag)
    is_known = token_id is not None and token_id != tokenizer.unk_token_id
    return tag if is_known else fallback
def format_prompt(
    tokenizer: NllbTokenizer,
    text: str,
    direction_key: str,
    lang_code: str,
    domain_value: str,
    dialect_value: str,
) -> str:
    """Prefix ``text`` with the control tags for the chosen direction.

    The result has the form
    ``<to_TARGET> <src_SOURCE> <dom_...> <dialect_...> text``.
    The English side is written as ``eng`` and the Chinese side as ``zh``;
    the Formosan side uses ``lang_code``. Domain and dialect tags the
    tokenizer does not know fall back to ``<dom_unknown>`` and
    ``<dialect_default>``. Any direction key other than ``f2en``, ``en2f``
    or ``f2zh`` is treated as Chinese -> Formosan.
    """
    domain_tag = known_tag(tokenizer, f"<dom_{domain_value}>", "<dom_unknown>")
    dialect_tag = known_tag(tokenizer, f"<dialect_{dialect_value}>", "<dialect_default>")

    if direction_key == "f2en":
        target, source = "eng", lang_code
    elif direction_key == "en2f":
        target, source = lang_code, "eng"
    elif direction_key == "f2zh":
        target, source = "zh", lang_code
    else:
        target, source = lang_code, "zh"

    return f"<to_{target}> <src_{source}> {domain_tag} {dialect_tag} {text}"
# `gpu` is the module-level decorator (spaces.GPU(duration=60) when `spaces`
# is importable, identity otherwise). It has to wrap the function that runs
# inference; defining it without applying it has no effect.
@gpu
def translate(
    text: str,
    direction_label: str,
    formosan_language: str,
    source_domain: str,
    dialect: str,
    max_new_tokens: int,
    num_beams: int,
    repetition_penalty: float,
) -> Tuple[str, str]:
    """Translate ``text`` and return the translation plus a Markdown summary.

    Args:
        text: Raw user input. ``None`` or whitespace-only input short-circuits.
        direction_label: A key of ``DIRECTION_LABELS``.
        formosan_language: A key of ``FORMOSAN_LANGS``.
        source_domain: A key of ``DOMAIN_CHOICES``.
        dialect: A key of ``DIALECT_CHOICES``.
        max_new_tokens: Generation length limit (coerced to ``int``).
        num_beams: Beam size (coerced to ``int``).
        repetition_penalty: Repetition penalty (coerced to ``float``).

    Returns:
        ``(translation, metadata_markdown)``. For empty input the pair is
        ``("", "Enter text to translate.")``.

    Raises:
        KeyError: If any of the label arguments is not a known choice.
        gr.Error: If the tokenizer has no token for the target language ID.
    """
    # Guard against None as well as empty/whitespace-only input.
    raw_text = (text or "").strip()
    if not raw_text:
        return "", "Enter text to translate."

    direction_key = DIRECTION_LABELS[direction_label]
    lang_code, lang_lid = FORMOSAN_LANGS[formosan_language]
    domain_value = DOMAIN_CHOICES[source_domain]
    dialect_value = DIALECT_CHOICES[dialect]

    bundle = load_bundle(direction_key)
    tokenizer = bundle.tokenizer
    model = bundle.model

    # Pick source language ID, matching preprocessing, and target language ID.
    # NOTE(review): src_lang is set on the cached, shared tokenizer without a
    # lock; confirm requests for the same direction cannot run concurrently.
    if direction_key == "f2en":
        tokenizer.src_lang = lang_lid
        clean_text = preproc_formosan(raw_text)
        target_lid = ENGLISH_LID
    elif direction_key == "en2f":
        tokenizer.src_lang = ENGLISH_LID
        clean_text = preproc_english(raw_text)
        target_lid = lang_lid
    elif direction_key == "f2zh":
        tokenizer.src_lang = lang_lid
        clean_text = preproc_formosan(raw_text)
        target_lid = CHINESE_LID
    else:
        tokenizer.src_lang = CHINESE_LID
        clean_text = preproc_chinese(raw_text)
        target_lid = lang_lid

    prompt = format_prompt(tokenizer, clean_text, direction_key, lang_code, domain_value, dialect_value)

    # The target language token is forced as the first generated token.
    forced_bos = tokenizer.convert_tokens_to_ids(target_lid)
    if forced_bos is None or forced_bos == tokenizer.unk_token_id:
        raise gr.Error(f"Unknown target language token: {target_lid}")

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_INPUT_LENGTH,
    ).to(model.device)

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            forced_bos_token_id=forced_bos,
            decoder_start_token_id=tokenizer.eos_token_id,
            max_new_tokens=int(max_new_tokens),
            num_beams=int(num_beams),
            no_repeat_ngram_size=3,
            repetition_penalty=float(repetition_penalty),
            length_penalty=1.0,
            early_stopping=True,
        )

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    translation = decoded[0].strip() if decoded else ""

    # Show at most the first 220 characters of the tagged prompt.
    prefix_preview = f"{prompt[:220]}{'...' if len(prompt) > 220 else ''}"
    # Two trailing spaces before "\n" make a Markdown hard line break; with a
    # single space the three lines are rendered as one run-on paragraph.
    meta = (
        f"Model: `{bundle.repo_id}`  \n"
        f"Source: `{tokenizer.src_lang}` → Target: `{target_lid}`  \n"
        f"Hidden prefix: `{prefix_preview}`"
    )
    return translation, meta
def swap_placeholder(direction_label: str, formosan_language: str) -> gr.Textbox:
    """Build the input textbox label and placeholder for the current selection.

    When the source side is Formosan (``f2en``/``f2zh``) the label names the
    Formosan language; otherwise it names English or Traditional Chinese.

    Raises:
        KeyError: If ``direction_label`` is not a key of ``DIRECTION_LABELS``.
    """
    direction_key = DIRECTION_LABELS[direction_label]

    if direction_key not in {"f2en", "f2zh"}:
        # Source is English or Chinese; target is the Formosan language.
        source = "English" if direction_key == "en2f" else "Traditional Chinese"
        hint = f"Enter {source} text to translate into {formosan_language}."
        return gr.Textbox(placeholder=hint, label=f"{source} input")

    # Source is the Formosan language.
    if direction_key == "f2en":
        target = "English"
    else:
        target = "Traditional Chinese"
    hint = f"Enter text in {formosan_language}. The app will translate it into {target}."
    return gr.Textbox(placeholder=hint, label=f"{formosan_language} input")
def load_example(example_name: str):
    """Return the UI values for a preset, followed by cleared output fields.

    Falls back to the first preset when ``example_name`` is not found. The
    result is the preset's values plus an empty translation and the initial
    metadata message.
    """
    preset = EXAMPLE_PRESETS.get(example_name)
    if not preset:
        preset = next(iter(EXAMPLE_PRESETS.values()))
    return tuple(preset) + ("", "Model metadata will appear after translation.")
# Gradio UI: layout first, then event wiring.
# NOTE(review): the nesting of the layout contexts below was reconstructed
# because the source indentation was lost - confirm against the running app.
with gr.Blocks(title="FormosanBank MT") as demo:
    gr.Markdown(
        """
        # Formosan ↔ English / Chinese MT
        Translate between 15 Formosan languages and English or Traditional Chinese using directional NLLB-200 checkpoints.
        The app adds the training control tags internally; users only choose direction and language.
        """
    )
    with gr.Row():
        # Wider column: input text, translate button, output and metadata.
        with gr.Column(scale=2):
            # Initial label/placeholder match the default direction
            # ("English → Formosan"); swap_placeholder updates them later.
            input_text = gr.Textbox(
                label="English input",
                placeholder="Enter English text to translate into a Formosan language.",
                lines=5,
                max_lines=10,
            )
            translate_btn = gr.Button("Translate", variant="primary", size="lg")
            output_text = gr.Textbox(
                label="Translation",
                lines=5,
                max_lines=10,
                show_copy_button=True,
                interactive=False,
            )
            metadata = gr.Markdown("Model metadata will appear after translation.")
        # Narrower column: direction, language, tag and generation controls.
        with gr.Column(scale=1):
            direction = gr.Radio(
                label="Direction",
                choices=list(DIRECTION_LABELS),
                value="English → Formosan",
            )
            formosan_language = gr.Dropdown(
                label="Formosan language",
                choices=list(FORMOSAN_LANGS),
                value="Amis",
            )
            with gr.Accordion("Advanced metadata tags", open=False):
                source_domain = gr.Dropdown(
                    label="Source/domain bucket",
                    choices=list(DOMAIN_CHOICES),
                    value="Unknown / general",
                    info="Most users should leave this as Unknown / general.",
                )
                dialect = gr.Dropdown(
                    label="Dialect tag",
                    choices=list(DIALECT_CHOICES),
                    value="Default / unknown",
                    info="Use a specific dialect only if you know it.",
                )
            with gr.Accordion("Generation controls", open=False):
                max_new_tokens = gr.Slider(
                    label="Max new tokens",
                    minimum=24,
                    maximum=256,
                    value=128,
                    step=8,
                )
                num_beams = gr.Slider(
                    label="Beam size",
                    minimum=1,
                    maximum=8,
                    value=4,
                    step=1,
                )
                repetition_penalty = gr.Slider(
                    label="Repetition penalty",
                    minimum=1.0,
                    maximum=1.5,
                    value=1.15,
                    step=0.05,
                )
            with gr.Group():
                # The dropdown starts on the first preset.
                example_select = gr.Dropdown(
                    label="Example preset",
                    choices=list(EXAMPLE_PRESETS),
                    value=next(iter(EXAMPLE_PRESETS)),
                )
                load_example_btn = gr.Button("Load example", variant="secondary", size="sm")
            gr.Markdown(
                """
                **Current hard-split scores**
                Formosan→English: BLEU 8.23 / chrF2 27.35
                English→Formosan: BLEU 5.77 / chrF2 30.24
                Formosan→Chinese: BLEU 9.79 / chrF2 11.77
                Chinese→Formosan: BLEU 7.65 / chrF2 32.97
                """
            )
    gr.Markdown(
        """
        ## Notes
        This is a research demo, not an authoritative translation service. Outputs can be wrong, incomplete,
        or culturally inappropriate, especially when translating from English into a Formosan language.
        Use fluent-speaker review for community-facing, ceremonial, legal, medical, or other high-stakes use.
        Model cards and evaluation details are available at:
        - [`FormosanBank/nllb200-formosan-en-spm8k`](https://huggingface.co/FormosanBank/nllb200-formosan-en-spm8k)
        - [`FormosanBank/nllb200-en-formosan-spm8k`](https://huggingface.co/FormosanBank/nllb200-en-formosan-spm8k)
        - [`FormosanBank/nllb200-formosan-zh-spm8k`](https://huggingface.co/FormosanBank/nllb200-formosan-zh-spm8k)
        - [`FormosanBank/nllb200-zh-formosan-spm8k`](https://huggingface.co/FormosanBank/nllb200-zh-formosan-spm8k)
        """
    )
    # Changing the direction or the language refreshes the input box's label
    # and placeholder.
    direction.change(swap_placeholder, inputs=[direction, formosan_language], outputs=input_text)
    formosan_language.change(swap_placeholder, inputs=[direction, formosan_language], outputs=input_text)
    # The outputs are in the same order as the tuple load_example returns:
    # the eight preset values, then the cleared translation and metadata.
    load_example_btn.click(
        load_example,
        inputs=[example_select],
        outputs=[
            input_text,
            direction,
            formosan_language,
            source_domain,
            dialect,
            max_new_tokens,
            num_beams,
            repetition_penalty,
            output_text,
            metadata,
        ],
    )
    # The inputs are in the same order as translate's parameters.
    translate_btn.click(
        translate,
        inputs=[
            input_text,
            direction,
            formosan_language,
            source_domain,
            dialect,
            max_new_tokens,
            num_beams,
            repetition_penalty,
        ],
        outputs=[output_text, metadata],
    )
# Launch the app only when this file is run as a script. The request queue is
# capped at 16 pending entries and launch() is called with ssr_mode=False.
if __name__ == "__main__":
    demo.queue(max_size=16).launch(ssr_mode=False)