hunterschep committed on
Commit
312055d
·
verified ·
1 Parent(s): 61dc1b2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +270 -0
app.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
3
+ import torch
4
+ from sacremoses import MosesPunctNormalizer
5
+ import re
6
+ import unicodedata
7
+ import sys
8
+
9
+ # -------------------------------------------------------------------
10
+ # Device & model loading
11
+ # -------------------------------------------------------------------
12
+
13
# Run on the GPU when torch can see one; fall back to the CPU otherwise.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Checkpoint identifier handed to `from_pretrained`, and the language code
# used for the English side of every translation.
MODEL_ID = "FormosonBankDemos/nllb200-formosan-en"
ENGLISH_CODE = "eng_Latn"

# Formosan languages exposed in the UI.
# Maps the dropdown label ("<name> (<code>)") to the language code.
FORMOSAN_LANGS = {
    f"{name} ({code})": code
    for name, code in (
        ("Amis", "ami_Latn"),
        ("Bunun", "bnn_Latn"),
        ("Kavalan", "ckv_Latn"),
        ("Rukai", "dru_Latn"),
        ("Paiwan", "pwn_Latn"),
        ("Puyuma", "pyu_Latn"),
        ("Thao", "ssf_Latn"),
        ("Saaroa", "sxr_Latn"),
        ("Sakizaya", "szy_Latn"),
        ("Tao / Yami", "tao_Latn"),
        ("Atayal", "tay_Latn"),
        ("Seediq", "trv_Latn"),
        ("Tsou", "tsu_Latn"),
        ("Kanakanavu", "xnb_Latn"),
        ("Saisiyat", "xsy_Latn"),
    )
}
37
+
38
# Load the tokenizer and the seq2seq model from the same checkpoint, then
# move the model onto the selected device.
tokenizer = NllbTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
model = model.to(device)
41
+
42
+
43
+ # -------------------------------------------------------------------
44
+ # Normalization / preprocessing helpers
45
+ # -------------------------------------------------------------------
46
+
47
# English punctuation normalizer. Its substitution rules are replaced by
# (compiled pattern, replacement) pairs so each pattern is compiled once here
# instead of on every call.
mpn_english = MosesPunctNormalizer(lang="en")
mpn_english.substitutions = [
    (re.compile(rule[0]), rule[1]) for rule in mpn_english.substitutions
]
51
+
52
+
53
def get_non_printing_char_replacer(replace_by: str = " "):
    """Build a function that replaces non-printing characters in a string.

    Every code point whose Unicode general category is in the "Other" group
    (Cc, Cf, Cs, Co, Cn) is mapped to *replace_by*. The table is computed
    once, by scanning the whole code-point range, when this factory is called.

    Args:
        replace_by: Replacement text for each non-printing character.

    Returns:
        A callable taking a string and returning the translated string.
    """
    other_code_points = (
        code_point
        for code_point in range(sys.maxunicode + 1)
        # All "Other" categories, and only those, start with the letter C.
        if unicodedata.category(chr(code_point)).startswith("C")
    )
    translation_table = dict.fromkeys(other_code_points, replace_by)

    def replace_non_printing(line):
        return line.translate(translation_table)

    return replace_non_printing


# Shared replacer that turns each non-printing character into a single space.
replace_nonprint = get_non_printing_char_replacer(" ")
63
+
64
+
65
def preproc_english(text: str) -> str:
    """Normalize English input before tokenization.

    Applies the precompiled Moses punctuation substitutions in order, replaces
    non-printing characters via ``replace_nonprint``, and finally applies
    Unicode NFKC normalization.
    """
    normalized = text
    for regex, replacement in mpn_english.substitutions:
        normalized = regex.sub(replacement, normalized)
    return unicodedata.normalize("NFKC", replace_nonprint(normalized))
71
+
72
+
73
def preproc_generic(text: str) -> str:
    """Normalize non-English input before tokenization.

    Replaces non-printing characters via ``replace_nonprint`` and then applies
    Unicode NFKC normalization; no punctuation rules are applied.
    """
    return unicodedata.normalize("NFKC", replace_nonprint(text))
76
+
77
+
78
+ # -------------------------------------------------------------------
79
+ # Core translation function (Formosan <-> English)
80
+ # -------------------------------------------------------------------
81
+
82
# Direction labels. These exact strings are shown in the Radio control, used
# in the example rows, and interpreted by translate(), so they are defined
# once here to keep the three in sync.
DIRECTION_FORMOSAN_TO_ENGLISH = "Formosan → English (eng_Latn)"
DIRECTION_ENGLISH_TO_FORMOSAN = "English (eng_Latn) → Formosan"


def translate(
    text: str,
    direction: str,
    form_lang_label: str,
    max_new_tokens: int,
    num_beams: int,
) -> str:
    """Translate *text* between English and the selected Formosan language.

    Args:
        text: Text to translate. ``None``, empty, or whitespace-only input
            yields an empty string without touching the model.
        direction: Direction label from the UI. Any value starting with
            "Formosan" selects Formosan → English; every other value selects
            English → Formosan.
        form_lang_label: A key of ``FORMOSAN_LANGS`` (the dropdown label).
        max_new_tokens: Generation length limit; coerced with ``int()``.
        num_beams: Beam-search width; coerced with ``int()``.

    Returns:
        The first decoded sequence, or ``""`` when there is nothing to
        translate or nothing was decoded.

    Raises:
        KeyError: If *form_lang_label* is not a key of ``FORMOSAN_LANGS``.
    """
    # A cleared textbox or an API call may deliver None instead of "".
    text = (text or "").strip()
    if not text:
        return ""

    # Resolve language codes
    form_code = FORMOSAN_LANGS[form_lang_label]

    # Only the leading word is inspected, so the decision does not depend on
    # how the arrow glyph in the label happens to be encoded.
    if direction.startswith("Formosan"):
        src_code = form_code
        tgt_code = ENGLISH_CODE
        src_text = preproc_generic(text)
    else:  # English → Formosan
        src_code = ENGLISH_CODE
        tgt_code = form_code
        src_text = preproc_english(text)

    # Set source language on tokenizer.
    # NOTE(review): this mutates the shared module-level tokenizer; if requests
    # can run concurrently they could interleave here — confirm queue settings.
    tokenizer.src_lang = src_code

    # Tokenize; inputs longer than 512 tokens are truncated.
    inputs = tokenizer(
        src_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).to(model.device)

    # NOTE(review): the target-language token is used both as the forced first
    # token and as the decoder start token; stock NLLB checkpoints normally
    # start decoding from EOS — confirm this matches how the checkpoint was
    # fine-tuned before changing it.
    forced_bos = tokenizer.convert_tokens_to_ids(tgt_code)

    # Generate
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            forced_bos_token_id=forced_bos,
            decoder_start_token_id=forced_bos,
            max_new_tokens=int(max_new_tokens),
            num_beams=int(num_beams),
            no_repeat_ngram_size=3,
            repetition_penalty=1.2,
            length_penalty=1.05,
            early_stopping=True,
        )

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return decoded[0] if decoded else ""


# -------------------------------------------------------------------
# Gradio UI
# -------------------------------------------------------------------

# Markdown is kept at column 0 so rendering does not depend on dedenting.
_INTRO_MARKDOWN = """
# 🌏 Formosan ↔ English Translation (NLLB-200, Multilingual)

This Space uses **`FormosonBankDemos/nllb200-formosan-en`**, a multilingual NLLB-200 model
fine-tuned on **15 Formosan languages ↔ English (`eng_Latn`)**.

- Choose the **direction** and the **Formosan language**.
- Paste text in the input box and click **Translate**.
- This is a **research / demo** tool — translations should be reviewed by fluent speakers.
"""

_EXAMPLES_MARKDOWN = """
### 🔍 Example sentences

Click an example to load it into the interface:
"""

_NOTES_MARKDOWN = """
---

### ℹ️ Notes

- Model: **`FormosonBankDemos/nllb200-formosan-en`** (based on `facebook/nllb-200-distilled-600M`).
- Directions supported: **Formosan ↔ English (`eng_Latn`)**.
- Some directions (especially English → Formosan) are still challenging and may produce rough or partially incorrect translations.
- Please treat outputs as **drafts**, not final translations, especially for sensitive or ceremonial content.

If you are a speaker or researcher and notice systematic issues, feedback is very welcome!
"""

with gr.Blocks() as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        with gr.Column(scale=1):
            direction = gr.Radio(
                label="Direction",
                choices=[
                    DIRECTION_FORMOSAN_TO_ENGLISH,
                    DIRECTION_ENGLISH_TO_FORMOSAN,
                ],
                value=DIRECTION_ENGLISH_TO_FORMOSAN,
            )
            form_lang = gr.Dropdown(
                label="Formosan language",
                choices=list(FORMOSAN_LANGS.keys()),
                value="Amis (ami_Latn)",
            )
            max_new_tokens = gr.Slider(
                label="Max new tokens",
                minimum=16,
                maximum=256,
                value=64,
                step=8,
            )
            num_beams = gr.Slider(
                label="Beam size",
                minimum=1,
                maximum=8,
                value=4,
                step=1,
            )

        with gr.Column(scale=2):
            input_text = gr.Textbox(
                label="Input text",
                placeholder="Enter text in English or the selected Formosan language...",
                lines=5,
            )
            translate_btn = gr.Button("Translate ✅", variant="primary")
            output_text = gr.Textbox(
                label="Translated text",
                lines=5,
                interactive=False,
            )

    # The examples and the button feed translate() the same components, in
    # the order of its parameters.
    translate_inputs = [input_text, direction, form_lang, max_new_tokens, num_beams]

    gr.Markdown(_EXAMPLES_MARKDOWN)

    gr.Examples(
        examples=[
            # English → Amis
            [
                "There are many beetles in the forest.",
                DIRECTION_ENGLISH_TO_FORMOSAN,
                "Amis (ami_Latn)",
                64,
                4,
            ],
            # Amis → English
            [
                "Adihay ko 'adadongac i kilakilangan.",
                DIRECTION_FORMOSAN_TO_ENGLISH,
                "Amis (ami_Latn)",
                64,
                4,
            ],
            # English → Seediq
            [
                "Many of your relatives live near Fonglin.",
                DIRECTION_ENGLISH_TO_FORMOSAN,
                "Seediq (trv_Latn)",
                64,
                4,
            ],
            # Paiwan → English
            [
                "abonai aravac a sapoi.",
                DIRECTION_FORMOSAN_TO_ENGLISH,
                "Paiwan (pwn_Latn)",
                64,
                4,
            ],
        ],
        inputs=translate_inputs,
        outputs=output_text,
        fn=translate,
        cache_examples=False,
    )

    gr.Markdown(_NOTES_MARKDOWN)

    translate_btn.click(
        translate,
        inputs=translate_inputs,
        outputs=output_text,
    )

if __name__ == "__main__":
    demo.launch()