# formosan-mt / app.py
# hunterschep's picture
# Add Formosan-Chinese directional models to MT demo
# 5df181a verified
from __future__ import annotations
import re
import sys
import threading
import unicodedata
from dataclasses import dataclass
from typing import Dict, Tuple
import gradio as gr
import torch
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
# sacremoses is optional: when it cannot be imported the name stays None and
# English preprocessing skips Moses punctuation normalization.
MosesPunctNormalizer = None
try:
    from sacremoses import MosesPunctNormalizer
except Exception:
    # Deliberate best-effort import; the None sentinel above remains in place.
    pass
def _run_undecorated(fn):
    """Fallback decorator used when ``spaces`` is unavailable: return ``fn`` unchanged."""
    return fn


# ``gpu`` decorates the inference entry point. It is ``spaces.GPU(duration=60)``
# when the ``spaces`` package imports and configures cleanly, otherwise a no-op.
try:
    import spaces

    gpu = spaces.GPU(duration=60)
except Exception:
    gpu = _run_undecorated
# Repository IDs of the four directional checkpoints (one per translation
# direction); each is passed to ``from_pretrained`` when its direction is used.
F2EN_MODEL_ID = "FormosanBank/nllb200-formosan-en-spm8k"
EN2F_MODEL_ID = "FormosanBank/nllb200-en-formosan-spm8k"
F2ZH_MODEL_ID = "FormosanBank/nllb200-formosan-zh-spm8k"
ZH2F_MODEL_ID = "FormosanBank/nllb200-zh-formosan-spm8k"
# Language-ID tokens for the non-Formosan side; used as the tokenizer source
# language or as the forced first target token, depending on direction.
ENGLISH_LID = "eng_Latn"
CHINESE_LID = "zho_Hant"
# Maximum token length of the tokenized prompt; longer inputs are truncated.
MAX_INPUT_LENGTH = 384
# Display name -> (language code used in control tags, language-ID token).
# Every language-ID token is the code followed by "_Latn", so it is derived
# here instead of being spelled out twice per entry.
FORMOSAN_LANGS: Dict[str, Tuple[str, str]] = {
    display_name: (iso_code, f"{iso_code}_Latn")
    for display_name, iso_code in (
        ("Amis", "ami"),
        ("Bunun", "bnn"),
        ("Kavalan", "ckv"),
        ("Rukai", "dru"),
        ("Paiwan", "pwn"),
        ("Puyuma", "pyu"),
        ("Thao", "ssf"),
        ("Saaroa", "sxr"),
        ("Sakizaya", "szy"),
        ("Tao / Yami", "tao"),
        ("Atayal", "tay"),
        ("Seediq", "trv"),
        ("Tsou", "tsu"),
        ("Kanakanavu", "xnb"),
        ("Saisiyat", "xsy"),
    )
}
# UI label of each translation direction -> internal direction key.
# The keys select the checkpoint and the control-tag layout of the prompt.
DIRECTION_LABELS = {
    # English pair
    "Formosan → English": "f2en",
    "English → Formosan": "en2f",
    # Chinese pair
    "Formosan → Chinese": "f2zh",
    "Chinese → Formosan": "zh2f",
}
# UI label of each source/domain bucket -> value interpolated into the
# "<dom_...>" control tag of the prompt.
DOMAIN_CHOICES = {
    "Unknown / general": "unknown",
    # Generic material types
    "Dictionary": "dictionary",
    "Learning vocabulary": "learning_vocab",
    "Classroom context": "classroom_context",
    "Picture story": "picture_story",
    "Picture book": "picture_book",
    "Essays": "essays",
    "Reading / writing": "reading_writing",
    "Culture": "culture",
    "Nine-level materials": "nine_level",
    "YouTube": "youtube",
    "NTU": "ntu",
    "Presidential apology": "presidential_apology",
    # "formosan_*" buckets
    "Formosan ePark": "formosan_epark",
    "Formosan 100 Paiwan Texts": "formosan_100_paiwan_texts",
    "Formosan Amis Myths and Customs": "formosan_amis_myths_and_customs",
    "Formosan Old Texts": "formosan_old_texts",
    "Formosan Paiwan Stories": "formosan_paiwanstories",
    "Formosan Rik Bunun": "formosan_rik_bunun",
    "Formosan SEALS": "formosan_seals",
    "Formosan Wilang Yutas Videos": "formosan_wilang_yutas_videos",
    "Formosan Yeddas Blog": "formosan_yeddas_blog",
    "Formosan Zheng Data": "formosan_zheng_data",
    "Formosan GitBook translations": "formosan_gitbook_translations",
}
# UI label of each dialect -> value interpolated into the "<dialect_...>"
# control tag of the prompt.
DIALECT_CHOICES = {
    # Non-specific choices
    "Default / unknown": "default",
    "Unknown": "unknown",
    # Named dialects, alphabetical by label
    "Central": "central",
    "Coastal": "coastal",
    "Dawu": "dawu",
    "Delu Valley": "deluvalley",
    "Dona": "dona",
    "Duda": "duda",
    "Eastern": "eastern",
    "Four Seasons": "fourseasons",
    "Hengchun": "hengchun",
    "Jianhe": "jianhe",
    "Junqun": "junqun",
    "Kaqun": "kaqun",
    "Luanqun": "luanqun",
    "Malan": "malan",
    "Maolin": "maolin",
    "Nanwang": "nanwang",
    "Northern": "northern",
    "Sekolik": "sekolik",
    "Southern": "southern",
    "Tanqun": "tanqun",
    "Tegudaya": "tegudaya",
    "Truku": "truku",
    "Wanda": "wanda",
    "Wanshan": "wanshan",
    "Wenshui": "wenshui",
    "Wutai": "wutai",
    "Xiqun": "xiqun",
    "Xiuguluan": "xiuguluan",
}
# Settings shared by every preset, in UI order:
# (domain label, dialect label, max new tokens, beam size, repetition penalty).
_PRESET_SHARED_SETTINGS = ("Unknown / general", "Default / unknown", 96, 4, 1.15)

# Preset label -> values for the eight input controls, in the order
# (text, direction, language, domain, dialect, max tokens, beams, repetition penalty).
EXAMPLE_PRESETS = {
    preset_label: (sample_text, direction_choice, language_choice, *_PRESET_SHARED_SETTINGS)
    for preset_label, sample_text, direction_choice, language_choice in (
        (
            "English → Amis: He revealed what he was doing.",
            "He revealed what he was doing.",
            "English → Formosan",
            "Amis",
        ),
        (
            "English → Seediq: beetles in the forest",
            "There are many beetles in the forest.",
            "English → Formosan",
            "Seediq",
        ),
        (
            "Amis → English: Pa'araw cingra...",
            "Pa'araw cingra to demak nira.",
            "Formosan → English",
            "Amis",
        ),
        (
            "Paiwan → English: abonai aravac...",
            "abonai aravac a sapoi.",
            "Formosan → English",
            "Paiwan",
        ),
        (
            "Chinese → Amis: 他回家了。",
            "他回家了。",
            "Chinese → Formosan",
            "Amis",
        ),
        (
            "Amis → Chinese: Pa'araw cingra...",
            "Pa'araw cingra to demak nira.",
            "Formosan → Chinese",
            "Amis",
        ),
    )
}
# English punctuation normalizer, or None when sacremoses is unavailable.
# Its substitution patterns are compiled once here so that preprocessing can
# call ``.sub`` on them directly.
mpn_english = None
if MosesPunctNormalizer is not None:
    mpn_english = MosesPunctNormalizer(lang="en")
    mpn_english.substitutions = [
        (re.compile(regex_source), replacement)
        for regex_source, replacement in mpn_english.substitutions
    ]
def get_non_printing_char_replacer(replace_by: str = " "):
    """Build a function that replaces non-printing characters in a string.

    A character counts as non-printing when its Unicode general category is
    one of the "Other" categories (Cc, Cf, Cs, Co, Cn), i.e. the category
    code starts with "C".

    The translation table is filled lazily: a code point is classified the
    first time it is seen and the verdict is cached. This avoids scanning the
    whole Unicode range and keeping a very large dict alive from import time,
    while producing exactly the same output as an eagerly built table.

    Args:
        replace_by: Replacement text for each non-printing character. An
            empty string deletes the character.

    Returns:
        A callable ``line -> str`` that applies the replacement.
    """

    class _LazyNonPrintingTable(dict):
        """``str.translate`` table that classifies code points on first use."""

        def __missing__(self, codepoint: int):
            if unicodedata.category(chr(codepoint)).startswith("C"):
                verdict = replace_by
            else:
                # Mapping a code point to itself leaves the character unchanged.
                verdict = codepoint
            self[codepoint] = verdict
            return verdict

    table = _LazyNonPrintingTable()
    return lambda line: line.translate(table)


# Shared replacer used by the preprocessing helpers: non-printing -> space.
replace_nonprint = get_non_printing_char_replacer(" ")
def preproc_english(text: str) -> str:
    """Clean English input before tokenization.

    Applies the Moses punctuation substitutions when the normalizer is
    available, replaces non-printing characters, then NFKC-normalizes and
    strips surrounding whitespace.
    """
    normalized = text
    rules = mpn_english.substitutions if mpn_english is not None else ()
    for regex, replacement in rules:
        normalized = regex.sub(replacement, normalized)
    printable = replace_nonprint(normalized)
    return unicodedata.normalize("NFKC", printable).strip()
def preproc_formosan(text: str) -> str:
    """Clean Formosan input: replace non-printing characters, NFKC-normalize, strip."""
    printable = replace_nonprint(text)
    composed = unicodedata.normalize("NFKC", printable)
    return composed.strip()
def preproc_chinese(text: str) -> str:
    """Clean Chinese input: replace non-printing characters, NFKC-normalize, strip."""
    printable = replace_nonprint(text)
    composed = unicodedata.normalize("NFKC", printable)
    return composed.strip()
@dataclass
class ModelBundle:
    """A loaded checkpoint together with its tokenizer.

    Attributes:
        tokenizer: Tokenizer loaded from ``repo_id``.
        model: Seq2seq model loaded from ``repo_id``.
        repo_id: Repository ID both objects were loaded from.
    """

    tokenizer: NllbTokenizer
    model: AutoModelForSeq2SeqLM
    repo_id: str
# Loaded bundles keyed by direction key ("f2en", "en2f", "f2zh", "zh2f");
# populated on first use by load_bundle.
MODEL_CACHE: Dict[str, ModelBundle] = {}
# Re-entrant lock held by load_bundle while it reads or mutates MODEL_CACHE
# and moves models between devices.
MODEL_LOCK = threading.RLock()
def active_device() -> torch.device:
    """Return the CUDA device when CUDA is currently available, otherwise the CPU device."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")
def model_id_for(direction_key: str) -> str:
    """Return the checkpoint repository ID for a direction key.

    Raises:
        KeyError: If ``direction_key`` is not one of "f2en", "en2f", "f2zh", "zh2f".
    """
    repo_by_direction = {
        "f2en": F2EN_MODEL_ID,
        "en2f": EN2F_MODEL_ID,
        "f2zh": F2ZH_MODEL_ID,
        "zh2f": ZH2F_MODEL_ID,
    }
    return repo_by_direction[direction_key]
def _offload_other_models(keep_key: str) -> None:
    """Move every cached model except ``keep_key`` from the GPU to the CPU.

    The CUDA allocator cache is emptied afterwards, but only when at least
    one model was actually moved. Callers must hold ``MODEL_LOCK``.
    """
    moved_any = False
    for cached_key, cached in MODEL_CACHE.items():
        if cached_key == keep_key:
            continue
        if next(cached.model.parameters()).device.type == "cuda":
            cached.model.to("cpu")
            moved_any = True
    if moved_any:
        torch.cuda.empty_cache()


def load_bundle(direction_key: str) -> ModelBundle:
    """Return the model bundle for ``direction_key``, placed on the active device.

    The bundle is loaded on first use and cached in ``MODEL_CACHE``. When the
    active device is CUDA, every other cached model is moved to the CPU
    *before* the requested one is loaded or moved onto the GPU, so two models
    do not occupy GPU memory at the same time.

    Args:
        direction_key: One of "f2en", "en2f", "f2zh", "zh2f".

    Returns:
        The cached ``ModelBundle`` with its model in eval mode.

    Raises:
        KeyError: If ``direction_key`` is not a known direction.
    """
    repo_id = model_id_for(direction_key)
    device = active_device()
    with MODEL_LOCK:
        if device.type == "cuda":
            # Evict first: freeing memory after the move would leave a window
            # in which both models sit on the GPU.
            _offload_other_models(direction_key)

        bundle = MODEL_CACHE.get(direction_key)
        if bundle is None:
            tokenizer = NllbTokenizer.from_pretrained(repo_id)
            dtype = torch.float16 if device.type == "cuda" else torch.float32
            model = AutoModelForSeq2SeqLM.from_pretrained(repo_id, torch_dtype=dtype)
            # The decoder is started from the EOS token, in both the model
            # config and the generation config.
            model.config.decoder_start_token_id = tokenizer.eos_token_id
            model.generation_config.decoder_start_token_id = tokenizer.eos_token_id
            model.to(device)
            model.eval()
            bundle = ModelBundle(tokenizer=tokenizer, model=model, repo_id=repo_id)
            MODEL_CACHE[direction_key] = bundle
        elif next(bundle.model.parameters()).device != device:
            bundle.model.to(device)
            bundle.model.eval()
        return bundle
def known_tag(tokenizer: NllbTokenizer, tag: str, fallback: str) -> str:
    """Return ``tag`` when the tokenizer maps it to a real token ID, else ``fallback``.

    A tag is treated as unknown when the tokenizer returns ``None`` or its
    unknown-token ID for it.
    """
    resolved_id = tokenizer.convert_tokens_to_ids(tag)
    is_known = resolved_id is not None and resolved_id != tokenizer.unk_token_id
    return tag if is_known else fallback
def format_prompt(
    tokenizer: NllbTokenizer,
    text: str,
    direction_key: str,
    lang_code: str,
    domain_value: str,
    dialect_value: str,
) -> str:
    """Prefix ``text`` with the control tags for the requested direction.

    The result has the layout
    ``<to_X> <src_Y> <dom_...> <dialect_...> text``. Domain and dialect tags
    that the tokenizer does not know fall back to ``<dom_unknown>`` and
    ``<dialect_default>``. Any direction key other than "f2en", "en2f" or
    "f2zh" is formatted as Chinese -> Formosan.
    """
    domain_tag = known_tag(tokenizer, f"<dom_{domain_value}>", "<dom_unknown>")
    dialect_tag = known_tag(tokenizer, f"<dialect_{dialect_value}>", "<dialect_default>")

    from_formosan = f"<src_{lang_code}>"
    into_formosan = f"<to_{lang_code}>"
    routing = {
        "f2en": ("<to_eng>", from_formosan),
        "en2f": (into_formosan, "<src_eng>"),
        "f2zh": ("<to_zh>", from_formosan),
    }
    target_tag, source_tag = routing.get(direction_key, (into_formosan, "<src_zh>"))
    return f"{target_tag} {source_tag} {domain_tag} {dialect_tag} {text}"
@gpu
def translate(
    text: str,
    direction_label: str,
    formosan_language: str,
    source_domain: str,
    dialect: str,
    max_new_tokens: int,
    num_beams: int,
    repetition_penalty: float,
) -> Tuple[str, str]:
    """Translate ``text`` with the checkpoint for the selected direction.

    Args:
        text: Raw user input; ``None`` is treated as empty.
        direction_label: A key of ``DIRECTION_LABELS``.
        formosan_language: A key of ``FORMOSAN_LANGS``.
        source_domain: A key of ``DOMAIN_CHOICES``.
        dialect: A key of ``DIALECT_CHOICES``.
        max_new_tokens: Generation length limit (coerced to ``int``).
        num_beams: Beam size (coerced to ``int``).
        repetition_penalty: Repetition penalty (coerced to ``float``).

    Returns:
        ``(translation, metadata_markdown)``. When there is nothing to
        translate, the translation is empty and the metadata is a hint.

    Raises:
        gr.Error: If a selection is not one of the offered choices, or the
            target language token is missing from the tokenizer.
    """
    nothing_to_translate = ("", "Enter text to translate.")

    raw_text = (text or "").strip()
    if not raw_text:
        return nothing_to_translate

    try:
        direction_key = DIRECTION_LABELS[direction_label]
        lang_code, lang_lid = FORMOSAN_LANGS[formosan_language]
        domain_value = DOMAIN_CHOICES[source_domain]
        dialect_value = DIALECT_CHOICES[dialect]
    except (KeyError, TypeError) as err:
        # Surface a readable message instead of a raw lookup failure.
        raise gr.Error(
            "Please choose the direction, language, domain and dialect from the offered options."
        ) from err

    # Per direction: (source language ID, target language ID, preprocessor).
    # Any other key is handled as Chinese -> Formosan.
    if direction_key == "f2en":
        source_lid, target_lid, preprocess = lang_lid, ENGLISH_LID, preproc_formosan
    elif direction_key == "en2f":
        source_lid, target_lid, preprocess = ENGLISH_LID, lang_lid, preproc_english
    elif direction_key == "f2zh":
        source_lid, target_lid, preprocess = lang_lid, CHINESE_LID, preproc_formosan
    else:
        source_lid, target_lid, preprocess = CHINESE_LID, lang_lid, preproc_chinese

    clean_text = preprocess(raw_text)
    if not clean_text:
        # Input consisted only of characters removed by preprocessing (for
        # example zero-width characters); a tags-only prompt is not worth sending.
        return nothing_to_translate

    bundle = load_bundle(direction_key)
    tokenizer = bundle.tokenizer
    model = bundle.model
    tokenizer.src_lang = source_lid

    prompt = format_prompt(tokenizer, clean_text, direction_key, lang_code, domain_value, dialect_value)
    forced_bos = tokenizer.convert_tokens_to_ids(target_lid)
    if forced_bos is None or forced_bos == tokenizer.unk_token_id:
        raise gr.Error(f"Unknown target language token: {target_lid}")

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_INPUT_LENGTH,
    ).to(model.device)

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            forced_bos_token_id=forced_bos,
            decoder_start_token_id=tokenizer.eos_token_id,
            max_new_tokens=int(max_new_tokens),
            num_beams=int(num_beams),
            no_repeat_ngram_size=3,
            repetition_penalty=float(repetition_penalty),
            length_penalty=1.0,
            early_stopping=True,
        )

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    translation = decoded[0].strip() if decoded else ""

    # Report the local source ID rather than re-reading the tokenizer, whose
    # ``src_lang`` attribute is shared mutable state.
    meta = (
        f"Model: `{bundle.repo_id}`  \n"
        f"Source: `{source_lid}` → Target: `{target_lid}`  \n"
        f"Hidden prefix: `{prompt[:220]}{'...' if len(prompt) > 220 else ''}`"
    )
    return translation, meta
def swap_placeholder(direction_label: str, formosan_language: str) -> gr.Textbox:
    """Return a Textbox update whose label and placeholder match the direction.

    For Formosan-source directions the box is labelled with the Formosan
    language; otherwise it is labelled with English or Traditional Chinese.
    """
    direction_key = DIRECTION_LABELS[direction_label]
    other_language = "English" if direction_key in {"f2en", "en2f"} else "Traditional Chinese"

    if direction_key in {"f2en", "f2zh"}:
        hint = f"Enter text in {formosan_language}. The app will translate it into {other_language}."
        title = f"{formosan_language} input"
    else:
        hint = f"Enter {other_language} text to translate into {formosan_language}."
        title = f"{other_language} input"

    return gr.Textbox(placeholder=hint, label=title)
def load_example(example_name: str):
    """Return the control values of a preset plus cleared output fields.

    An unknown preset name falls back to the first preset. The result is the
    preset's eight values followed by an empty translation and the default
    metadata text.
    """
    preset = EXAMPLE_PRESETS.get(example_name)
    if not preset:
        preset = next(iter(EXAMPLE_PRESETS.values()))
    return (*preset, "", "Model metadata will appear after translation.")
# Gradio UI: layout first, event wiring last (both inside the Blocks context).
# NOTE(review): the copy under review had lost its indentation, so the
# component nesting below is reconstructed from the statement order — confirm
# in particular that the score table belongs in the side column.
with gr.Blocks(title="FormosanBank MT") as demo:
    gr.Markdown(
        """
# Formosan ↔ English / Chinese MT
Translate between 15 Formosan languages and English or Traditional Chinese using directional NLLB-200 checkpoints.
The app adds the training control tags internally; users only choose direction and language.
"""
    )

    with gr.Row():
        # Main column: input, action button, output, and run metadata.
        with gr.Column(scale=2):
            input_text = gr.Textbox(
                label="English input",
                placeholder="Enter English text to translate into a Formosan language.",
                lines=5,
                max_lines=10,
            )
            translate_btn = gr.Button("Translate", variant="primary", size="lg")
            output_text = gr.Textbox(
                label="Translation",
                lines=5,
                max_lines=10,
                show_copy_button=True,
                interactive=False,
            )
            metadata = gr.Markdown("Model metadata will appear after translation.")

        # Side column: direction/language selection and optional controls.
        with gr.Column(scale=1):
            direction = gr.Radio(
                label="Direction",
                choices=list(DIRECTION_LABELS),
                value="English → Formosan",
            )
            formosan_language = gr.Dropdown(
                label="Formosan language",
                choices=list(FORMOSAN_LANGS),
                value="Amis",
            )

            with gr.Accordion("Advanced metadata tags", open=False):
                source_domain = gr.Dropdown(
                    label="Source/domain bucket",
                    choices=list(DOMAIN_CHOICES),
                    value="Unknown / general",
                    info="Most users should leave this as Unknown / general.",
                )
                dialect = gr.Dropdown(
                    label="Dialect tag",
                    choices=list(DIALECT_CHOICES),
                    value="Default / unknown",
                    info="Use a specific dialect only if you know it.",
                )

            with gr.Accordion("Generation controls", open=False):
                max_new_tokens = gr.Slider(
                    label="Max new tokens",
                    minimum=24,
                    maximum=256,
                    value=128,
                    step=8,
                )
                num_beams = gr.Slider(
                    label="Beam size",
                    minimum=1,
                    maximum=8,
                    value=4,
                    step=1,
                )
                repetition_penalty = gr.Slider(
                    label="Repetition penalty",
                    minimum=1.0,
                    maximum=1.5,
                    value=1.15,
                    step=0.05,
                )

            with gr.Group():
                example_select = gr.Dropdown(
                    label="Example preset",
                    choices=list(EXAMPLE_PRESETS),
                    value=next(iter(EXAMPLE_PRESETS)),
                )
                load_example_btn = gr.Button("Load example", variant="secondary", size="sm")

            gr.Markdown(
                """
**Current hard-split scores**
Formosan→English: BLEU 8.23 / chrF2 27.35
English→Formosan: BLEU 5.77 / chrF2 30.24
Formosan→Chinese: BLEU 9.79 / chrF2 11.77
Chinese→Formosan: BLEU 7.65 / chrF2 32.97
"""
            )

    gr.Markdown(
        """
## Notes
This is a research demo, not an authoritative translation service. Outputs can be wrong, incomplete,
or culturally inappropriate, especially when translating from English into a Formosan language.
Use fluent-speaker review for community-facing, ceremonial, legal, medical, or other high-stakes use.
Model cards and evaluation details are available at:
- [`FormosanBank/nllb200-formosan-en-spm8k`](https://huggingface.co/FormosanBank/nllb200-formosan-en-spm8k)
- [`FormosanBank/nllb200-en-formosan-spm8k`](https://huggingface.co/FormosanBank/nllb200-en-formosan-spm8k)
- [`FormosanBank/nllb200-formosan-zh-spm8k`](https://huggingface.co/FormosanBank/nllb200-formosan-zh-spm8k)
- [`FormosanBank/nllb200-zh-formosan-spm8k`](https://huggingface.co/FormosanBank/nllb200-zh-formosan-spm8k)
"""
    )

    # --- Event wiring ---
    # The eight controls, in the positional order ``translate`` expects and
    # in which each example preset stores its values.
    control_inputs = [
        input_text,
        direction,
        formosan_language,
        source_domain,
        dialect,
        max_new_tokens,
        num_beams,
        repetition_penalty,
    ]
    placeholder_inputs = [direction, formosan_language]

    # Relabel the input box whenever the direction or the language changes.
    direction.change(swap_placeholder, inputs=placeholder_inputs, outputs=input_text)
    formosan_language.change(swap_placeholder, inputs=placeholder_inputs, outputs=input_text)

    # Loading a preset fills every control and resets both output fields.
    load_example_btn.click(
        load_example,
        inputs=[example_select],
        outputs=[*control_inputs, output_text, metadata],
    )
    translate_btn.click(
        translate,
        inputs=control_inputs,
        outputs=[output_text, metadata],
    )
# Script entry point: enable the request queue (at most 16 waiting requests)
# and start the server with server-side rendering turned off.
if __name__ == "__main__":
    queued_demo = demo.queue(max_size=16)
    queued_demo.launch(ssr_mode=False)