from __future__ import annotations import re import sys import threading import unicodedata from dataclasses import dataclass from typing import Dict, Tuple import gradio as gr import torch from transformers import AutoModelForSeq2SeqLM, NllbTokenizer try: from sacremoses import MosesPunctNormalizer except Exception: MosesPunctNormalizer = None try: import spaces gpu = spaces.GPU(duration=60) except Exception: def gpu(fn): return fn F2EN_MODEL_ID = "FormosanBank/nllb200-formosan-en-spm8k" EN2F_MODEL_ID = "FormosanBank/nllb200-en-formosan-spm8k" F2ZH_MODEL_ID = "FormosanBank/nllb200-formosan-zh-spm8k" ZH2F_MODEL_ID = "FormosanBank/nllb200-zh-formosan-spm8k" ENGLISH_LID = "eng_Latn" CHINESE_LID = "zho_Hant" MAX_INPUT_LENGTH = 384 FORMOSAN_LANGS: Dict[str, Tuple[str, str]] = { "Amis": ("ami", "ami_Latn"), "Bunun": ("bnn", "bnn_Latn"), "Kavalan": ("ckv", "ckv_Latn"), "Rukai": ("dru", "dru_Latn"), "Paiwan": ("pwn", "pwn_Latn"), "Puyuma": ("pyu", "pyu_Latn"), "Thao": ("ssf", "ssf_Latn"), "Saaroa": ("sxr", "sxr_Latn"), "Sakizaya": ("szy", "szy_Latn"), "Tao / Yami": ("tao", "tao_Latn"), "Atayal": ("tay", "tay_Latn"), "Seediq": ("trv", "trv_Latn"), "Tsou": ("tsu", "tsu_Latn"), "Kanakanavu": ("xnb", "xnb_Latn"), "Saisiyat": ("xsy", "xsy_Latn"), } DIRECTION_LABELS = { "Formosan → English": "f2en", "English → Formosan": "en2f", "Formosan → Chinese": "f2zh", "Chinese → Formosan": "zh2f", } DOMAIN_CHOICES = { "Unknown / general": "unknown", "Dictionary": "dictionary", "Learning vocabulary": "learning_vocab", "Classroom context": "classroom_context", "Picture story": "picture_story", "Picture book": "picture_book", "Essays": "essays", "Reading / writing": "reading_writing", "Culture": "culture", "Nine-level materials": "nine_level", "YouTube": "youtube", "NTU": "ntu", "Presidential apology": "presidential_apology", "Formosan ePark": "formosan_epark", "Formosan 100 Paiwan Texts": "formosan_100_paiwan_texts", "Formosan Amis Myths and Customs": "formosan_amis_myths_and_customs", "Formosan Old Texts": "formosan_old_texts", "Formosan Paiwan Stories": "formosan_paiwanstories", "Formosan Rik Bunun": "formosan_rik_bunun", "Formosan SEALS": "formosan_seals", "Formosan Wilang Yutas Videos": "formosan_wilang_yutas_videos", "Formosan Yeddas Blog": "formosan_yeddas_blog", "Formosan Zheng Data": "formosan_zheng_data", "Formosan GitBook translations": "formosan_gitbook_translations", } DIALECT_CHOICES = { "Default / unknown": "default", "Unknown": "unknown", "Central": "central", "Coastal": "coastal", "Dawu": "dawu", "Delu Valley": "deluvalley", "Dona": "dona", "Duda": "duda", "Eastern": "eastern", "Four Seasons": "fourseasons", "Hengchun": "hengchun", "Jianhe": "jianhe", "Junqun": "junqun", "Kaqun": "kaqun", "Luanqun": "luanqun", "Malan": "malan", "Maolin": "maolin", "Nanwang": "nanwang", "Northern": "northern", "Sekolik": "sekolik", "Southern": "southern", "Tanqun": "tanqun", "Tegudaya": "tegudaya", "Truku": "truku", "Wanda": "wanda", "Wanshan": "wanshan", "Wenshui": "wenshui", "Wutai": "wutai", "Xiqun": "xiqun", "Xiuguluan": "xiuguluan", } EXAMPLE_PRESETS = { "English → Amis: He revealed what he was doing.": ( "He revealed what he was doing.", "English → Formosan", "Amis", "Unknown / general", "Default / unknown", 96, 4, 1.15, ), "English → Seediq: beetles in the forest": ( "There are many beetles in the forest.", "English → Formosan", "Seediq", "Unknown / general", "Default / unknown", 96, 4, 1.15, ), "Amis → English: Pa'araw cingra...": ( "Pa'araw cingra to demak nira.", "Formosan → English", "Amis", "Unknown / general", "Default / unknown", 96, 4, 1.15, ), "Paiwan → English: abonai aravac...": ( "abonai aravac a sapoi.", "Formosan → English", "Paiwan", "Unknown / general", "Default / unknown", 96, 4, 1.15, ), "Chinese → Amis: 他回家了。": ( "他回家了。", "Chinese → Formosan", "Amis", "Unknown / general", "Default / unknown", 96, 4, 1.15, ), "Amis → Chinese: Pa'araw cingra...": ( "Pa'araw cingra to demak nira.", "Formosan → Chinese", "Amis", "Unknown / general", "Default / unknown", 96, 4, 1.15, ), } if MosesPunctNormalizer is not None: mpn_english = MosesPunctNormalizer(lang="en") mpn_english.substitutions = [(re.compile(pattern), sub) for pattern, sub in mpn_english.substitutions] else: mpn_english = None def get_non_printing_char_replacer(replace_by: str = " "): non_printable_map = { ord(c): replace_by for c in (chr(i) for i in range(sys.maxunicode + 1)) if unicodedata.category(c) in {"C", "Cc", "Cf", "Cs", "Co", "Cn"} } return lambda line: line.translate(non_printable_map) replace_nonprint = get_non_printing_char_replacer(" ") def preproc_english(text: str) -> str: clean = text if mpn_english is not None: for pattern, sub in mpn_english.substitutions: clean = pattern.sub(sub, clean) clean = replace_nonprint(clean) return unicodedata.normalize("NFKC", clean).strip() def preproc_formosan(text: str) -> str: return unicodedata.normalize("NFKC", replace_nonprint(text)).strip() def preproc_chinese(text: str) -> str: return unicodedata.normalize("NFKC", replace_nonprint(text)).strip() @dataclass class ModelBundle: tokenizer: NllbTokenizer model: AutoModelForSeq2SeqLM repo_id: str MODEL_CACHE: Dict[str, ModelBundle] = {} MODEL_LOCK = threading.RLock() def active_device() -> torch.device: return torch.device("cuda" if torch.cuda.is_available() else "cpu") def model_id_for(direction_key: str) -> str: return { "f2en": F2EN_MODEL_ID, "en2f": EN2F_MODEL_ID, "f2zh": F2ZH_MODEL_ID, "zh2f": ZH2F_MODEL_ID, }[direction_key] def load_bundle(direction_key: str) -> ModelBundle: repo_id = model_id_for(direction_key) device = active_device() with MODEL_LOCK: if direction_key not in MODEL_CACHE: if device.type == "cuda": for bundle in MODEL_CACHE.values(): if next(bundle.model.parameters()).device.type == "cuda": bundle.model.to("cpu") torch.cuda.empty_cache() tokenizer = NllbTokenizer.from_pretrained(repo_id) dtype = torch.float16 if device.type == "cuda" else torch.float32 model = AutoModelForSeq2SeqLM.from_pretrained(repo_id, torch_dtype=dtype) model.config.decoder_start_token_id = tokenizer.eos_token_id model.generation_config.decoder_start_token_id = tokenizer.eos_token_id model.to(device) model.eval() MODEL_CACHE[direction_key] = ModelBundle(tokenizer=tokenizer, model=model, repo_id=repo_id) else: bundle = MODEL_CACHE[direction_key] model_device = next(bundle.model.parameters()).device if model_device != device: bundle.model.to(device) bundle.model.eval() if device.type == "cuda": for key, bundle in MODEL_CACHE.items(): if key != direction_key and next(bundle.model.parameters()).device.type == "cuda": bundle.model.to("cpu") torch.cuda.empty_cache() return MODEL_CACHE[direction_key] def known_tag(tokenizer: NllbTokenizer, tag: str, fallback: str) -> str: token_id = tokenizer.convert_tokens_to_ids(tag) if token_id is None or token_id == tokenizer.unk_token_id: return fallback return tag def format_prompt( tokenizer: NllbTokenizer, text: str, direction_key: str, lang_code: str, domain_value: str, dialect_value: str, ) -> str: domain_tag = known_tag(tokenizer, f"", "") dialect_tag = known_tag(tokenizer, f"", "") if direction_key == "f2en": return f" {domain_tag} {dialect_tag} {text}" if direction_key == "en2f": return f" {domain_tag} {dialect_tag} {text}" if direction_key == "f2zh": return f" {domain_tag} {dialect_tag} {text}" return f" {domain_tag} {dialect_tag} {text}" @gpu def translate( text: str, direction_label: str, formosan_language: str, source_domain: str, dialect: str, max_new_tokens: int, num_beams: int, repetition_penalty: float, ) -> Tuple[str, str]: raw_text = text.strip() if not raw_text: return "", "Enter text to translate." direction_key = DIRECTION_LABELS[direction_label] lang_code, lang_lid = FORMOSAN_LANGS[formosan_language] domain_value = DOMAIN_CHOICES[source_domain] dialect_value = DIALECT_CHOICES[dialect] bundle = load_bundle(direction_key) tokenizer = bundle.tokenizer model = bundle.model if direction_key == "f2en": tokenizer.src_lang = lang_lid clean_text = preproc_formosan(raw_text) target_lid = ENGLISH_LID elif direction_key == "en2f": tokenizer.src_lang = ENGLISH_LID clean_text = preproc_english(raw_text) target_lid = lang_lid elif direction_key == "f2zh": tokenizer.src_lang = lang_lid clean_text = preproc_formosan(raw_text) target_lid = CHINESE_LID else: tokenizer.src_lang = CHINESE_LID clean_text = preproc_chinese(raw_text) target_lid = lang_lid prompt = format_prompt(tokenizer, clean_text, direction_key, lang_code, domain_value, dialect_value) forced_bos = tokenizer.convert_tokens_to_ids(target_lid) if forced_bos is None or forced_bos == tokenizer.unk_token_id: raise gr.Error(f"Unknown target language token: {target_lid}") inputs = tokenizer( prompt, return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_LENGTH, ).to(model.device) with torch.inference_mode(): outputs = model.generate( **inputs, forced_bos_token_id=forced_bos, decoder_start_token_id=tokenizer.eos_token_id, max_new_tokens=int(max_new_tokens), num_beams=int(num_beams), no_repeat_ngram_size=3, repetition_penalty=float(repetition_penalty), length_penalty=1.0, early_stopping=True, ) decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True) translation = decoded[0].strip() if decoded else "" meta = ( f"Model: `{bundle.repo_id}` \n" f"Source: `{tokenizer.src_lang}` → Target: `{target_lid}` \n" f"Hidden prefix: `{prompt[:220]}{'...' if len(prompt) > 220 else ''}`" ) return translation, meta def swap_placeholder(direction_label: str, formosan_language: str) -> gr.Textbox: direction_key = DIRECTION_LABELS[direction_label] if direction_key in {"f2en", "f2zh"}: target = "English" if direction_key == "f2en" else "Traditional Chinese" return gr.Textbox( placeholder=f"Enter text in {formosan_language}. The app will translate it into {target}.", label=f"{formosan_language} input", ) source = "English" if direction_key == "en2f" else "Traditional Chinese" return gr.Textbox( placeholder=f"Enter {source} text to translate into {formosan_language}.", label=f"{source} input", ) def load_example(example_name: str): values = EXAMPLE_PRESETS.get(example_name) or next(iter(EXAMPLE_PRESETS.values())) return (*values, "", "Model metadata will appear after translation.") with gr.Blocks(title="FormosanBank MT") as demo: gr.Markdown( """ # Formosan ↔ English / Chinese MT Translate between 15 Formosan languages and English or Traditional Chinese using directional NLLB-200 checkpoints. The app adds the training control tags internally; users only choose direction and language. """ ) with gr.Row(): with gr.Column(scale=2): input_text = gr.Textbox( label="English input", placeholder="Enter English text to translate into a Formosan language.", lines=5, max_lines=10, ) translate_btn = gr.Button("Translate", variant="primary", size="lg") output_text = gr.Textbox( label="Translation", lines=5, max_lines=10, show_copy_button=True, interactive=False, ) metadata = gr.Markdown("Model metadata will appear after translation.") with gr.Column(scale=1): direction = gr.Radio( label="Direction", choices=list(DIRECTION_LABELS), value="English → Formosan", ) formosan_language = gr.Dropdown( label="Formosan language", choices=list(FORMOSAN_LANGS), value="Amis", ) with gr.Accordion("Advanced metadata tags", open=False): source_domain = gr.Dropdown( label="Source/domain bucket", choices=list(DOMAIN_CHOICES), value="Unknown / general", info="Most users should leave this as Unknown / general.", ) dialect = gr.Dropdown( label="Dialect tag", choices=list(DIALECT_CHOICES), value="Default / unknown", info="Use a specific dialect only if you know it.", ) with gr.Accordion("Generation controls", open=False): max_new_tokens = gr.Slider( label="Max new tokens", minimum=24, maximum=256, value=128, step=8, ) num_beams = gr.Slider( label="Beam size", minimum=1, maximum=8, value=4, step=1, ) repetition_penalty = gr.Slider( label="Repetition penalty", minimum=1.0, maximum=1.5, value=1.15, step=0.05, ) with gr.Group(): example_select = gr.Dropdown( label="Example preset", choices=list(EXAMPLE_PRESETS), value=next(iter(EXAMPLE_PRESETS)), ) load_example_btn = gr.Button("Load example", variant="secondary", size="sm") gr.Markdown( """ **Current hard-split scores** Formosan→English: BLEU 8.23 / chrF2 27.35 English→Formosan: BLEU 5.77 / chrF2 30.24 Formosan→Chinese: BLEU 9.79 / chrF2 11.77 Chinese→Formosan: BLEU 7.65 / chrF2 32.97 """ ) gr.Markdown( """ ## Notes This is a research demo, not an authoritative translation service. Outputs can be wrong, incomplete, or culturally inappropriate, especially when translating from English into a Formosan language. Use fluent-speaker review for community-facing, ceremonial, legal, medical, or other high-stakes use. Model cards and evaluation details are available at: - [`FormosanBank/nllb200-formosan-en-spm8k`](https://huggingface.co/FormosanBank/nllb200-formosan-en-spm8k) - [`FormosanBank/nllb200-en-formosan-spm8k`](https://huggingface.co/FormosanBank/nllb200-en-formosan-spm8k) - [`FormosanBank/nllb200-formosan-zh-spm8k`](https://huggingface.co/FormosanBank/nllb200-formosan-zh-spm8k) - [`FormosanBank/nllb200-zh-formosan-spm8k`](https://huggingface.co/FormosanBank/nllb200-zh-formosan-spm8k) """ ) direction.change(swap_placeholder, inputs=[direction, formosan_language], outputs=input_text) formosan_language.change(swap_placeholder, inputs=[direction, formosan_language], outputs=input_text) load_example_btn.click( load_example, inputs=[example_select], outputs=[ input_text, direction, formosan_language, source_domain, dialect, max_new_tokens, num_beams, repetition_penalty, output_text, metadata, ], ) translate_btn.click( translate, inputs=[ input_text, direction, formosan_language, source_domain, dialect, max_new_tokens, num_beams, repetition_penalty, ], outputs=[output_text, metadata], ) if __name__ == "__main__": demo.queue(max_size=16).launch(ssr_mode=False)