hunterschep committed on
Commit
5df181a
·
verified ·
1 Parent(s): 207fa5d

Add Formosan-Chinese directional models to MT demo

Browse files
Files changed (2) hide show
  1. README.md +7 -3
  2. app.py +72 -13
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Formosan ↔ English MT
3
  emoji: 🌿
4
  colorFrom: yellow
5
  colorTo: green
@@ -8,10 +8,12 @@ sdk_version: 5.49.1
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-nc-4.0
11
- short_description: Formosan-English NLLB-200 translation demo
12
  models:
13
  - FormosanBank/nllb200-formosan-en-spm8k
14
  - FormosanBank/nllb200-en-formosan-spm8k
 
 
15
  tags:
16
  - translation
17
  - nllb
@@ -20,12 +22,14 @@ tags:
20
  - endangered-languages
21
  ---
22
 
23
- # Formosan ↔ English Machine Translation
24
 
25
  This Space is a research demo for FormosanBank directional NLLB-200 models:
26
 
27
  - [`FormosanBank/nllb200-formosan-en-spm8k`](https://huggingface.co/FormosanBank/nllb200-formosan-en-spm8k)
28
  - [`FormosanBank/nllb200-en-formosan-spm8k`](https://huggingface.co/FormosanBank/nllb200-en-formosan-spm8k)
 
 
29
 
30
  The interface hides the metadata control tags used during training. Users only choose direction, language, and optionally source/domain and dialect metadata.
31
 
 
1
  ---
2
+ title: Formosan ↔ English / Chinese MT
3
  emoji: 🌿
4
  colorFrom: yellow
5
  colorTo: green
 
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-nc-4.0
11
+ short_description: Formosan-English/Chinese NLLB-200 translation demo
12
  models:
13
  - FormosanBank/nllb200-formosan-en-spm8k
14
  - FormosanBank/nllb200-en-formosan-spm8k
15
+ - FormosanBank/nllb200-formosan-zh-spm8k
16
+ - FormosanBank/nllb200-zh-formosan-spm8k
17
  tags:
18
  - translation
19
  - nllb
 
22
  - endangered-languages
23
  ---
24
 
25
+ # Formosan ↔ English / Chinese Machine Translation
26
 
27
  This Space is a research demo for FormosanBank directional NLLB-200 models:
28
 
29
  - [`FormosanBank/nllb200-formosan-en-spm8k`](https://huggingface.co/FormosanBank/nllb200-formosan-en-spm8k)
30
  - [`FormosanBank/nllb200-en-formosan-spm8k`](https://huggingface.co/FormosanBank/nllb200-en-formosan-spm8k)
31
+ - [`FormosanBank/nllb200-formosan-zh-spm8k`](https://huggingface.co/FormosanBank/nllb200-formosan-zh-spm8k)
32
+ - [`FormosanBank/nllb200-zh-formosan-spm8k`](https://huggingface.co/FormosanBank/nllb200-zh-formosan-spm8k)
33
 
34
  The interface hides the metadata control tags used during training. Users only choose direction, language, and optionally source/domain and dialect metadata.
35
 
app.py CHANGED
@@ -27,7 +27,10 @@ except Exception:
27
 
28
  F2EN_MODEL_ID = "FormosanBank/nllb200-formosan-en-spm8k"
29
  EN2F_MODEL_ID = "FormosanBank/nllb200-en-formosan-spm8k"
 
 
30
  ENGLISH_LID = "eng_Latn"
 
31
  MAX_INPUT_LENGTH = 384
32
 
33
 
@@ -52,6 +55,8 @@ FORMOSAN_LANGS: Dict[str, Tuple[str, str]] = {
52
  DIRECTION_LABELS = {
53
  "Formosan → English": "f2en",
54
  "English → Formosan": "en2f",
 
 
55
  }
56
 
57
  DOMAIN_CHOICES = {
@@ -155,6 +160,26 @@ EXAMPLE_PRESETS = {
155
  4,
156
  1.15,
157
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  }
159
 
160
 
@@ -190,6 +215,10 @@ def preproc_formosan(text: str) -> str:
190
  return unicodedata.normalize("NFKC", replace_nonprint(text)).strip()
191
 
192
 
 
 
 
 
193
  @dataclass
194
  class ModelBundle:
195
  tokenizer: NllbTokenizer
@@ -206,7 +235,12 @@ def active_device() -> torch.device:
206
 
207
 
208
  def model_id_for(direction_key: str) -> str:
209
- return F2EN_MODEL_ID if direction_key == "f2en" else EN2F_MODEL_ID
 
 
 
 
 
210
 
211
 
212
  def load_bundle(direction_key: str) -> ModelBundle:
@@ -214,6 +248,11 @@ def load_bundle(direction_key: str) -> ModelBundle:
214
  device = active_device()
215
  with MODEL_LOCK:
216
  if direction_key not in MODEL_CACHE:
 
 
 
 
 
217
  tokenizer = NllbTokenizer.from_pretrained(repo_id)
218
  dtype = torch.float16 if device.type == "cuda" else torch.float32
219
  model = AutoModelForSeq2SeqLM.from_pretrained(repo_id, torch_dtype=dtype)
@@ -255,7 +294,11 @@ def format_prompt(
255
  dialect_tag = known_tag(tokenizer, f"<dialect_{dialect_value}>", "<dialect_default>")
256
  if direction_key == "f2en":
257
  return f"<to_eng> <src_{lang_code}> {domain_tag} {dialect_tag} {text}"
258
- return f"<to_{lang_code}> <src_eng> {domain_tag} {dialect_tag} {text}"
 
 
 
 
259
 
260
 
261
  @gpu
@@ -285,10 +328,18 @@ def translate(
285
  tokenizer.src_lang = lang_lid
286
  clean_text = preproc_formosan(raw_text)
287
  target_lid = ENGLISH_LID
288
- else:
289
  tokenizer.src_lang = ENGLISH_LID
290
  clean_text = preproc_english(raw_text)
291
  target_lid = lang_lid
 
 
 
 
 
 
 
 
292
 
293
  prompt = format_prompt(tokenizer, clean_text, direction_key, lang_code, domain_value, dialect_value)
294
  forced_bos = tokenizer.convert_tokens_to_ids(target_lid)
@@ -327,14 +378,17 @@ def translate(
327
 
328
 
329
  def swap_placeholder(direction_label: str, formosan_language: str) -> gr.Textbox:
330
- if DIRECTION_LABELS[direction_label] == "f2en":
 
 
331
  return gr.Textbox(
332
- placeholder=f"Enter text in {formosan_language}. The app will add the hidden NLLB/control tags.",
333
  label=f"{formosan_language} input",
334
  )
 
335
  return gr.Textbox(
336
- placeholder=f"Enter English text to translate into {formosan_language}.",
337
- label="English input",
338
  )
339
 
340
 
@@ -346,9 +400,9 @@ def load_example(example_name: str):
346
  with gr.Blocks(title="FormosanBank MT") as demo:
347
  gr.Markdown(
348
  """
349
- # Formosan ↔ English MT
350
 
351
- Translate between English and 15 Formosan languages using directional NLLB-200 checkpoints.
352
  The app adds the training control tags internally; users only choose direction and language.
353
  """
354
  )
@@ -428,7 +482,9 @@ The app adds the training control tags internally; users only choose direction a
428
  **Current hard-split scores**
429
 
430
  Formosan→English: BLEU 8.23 / chrF2 27.35
431
- English→Formosan: BLEU 5.77 / chrF2 30.24
 
 
432
  """
433
  )
434
 
@@ -440,9 +496,12 @@ This is a research demo, not an authoritative translation service. Outputs can b
440
  or culturally inappropriate, especially when translating from English into a Formosan language.
441
  Use fluent-speaker review for community-facing, ceremonial, legal, medical, or other high-stakes use.
442
 
443
- The model cards and evaluation details are available at
444
- [`FormosanBank/nllb200-formosan-en-spm8k`](https://huggingface.co/FormosanBank/nllb200-formosan-en-spm8k)
445
- and [`FormosanBank/nllb200-en-formosan-spm8k`](https://huggingface.co/FormosanBank/nllb200-en-formosan-spm8k).
 
 
 
446
  """
447
  )
448
 
 
27
 
28
  F2EN_MODEL_ID = "FormosanBank/nllb200-formosan-en-spm8k"
29
  EN2F_MODEL_ID = "FormosanBank/nllb200-en-formosan-spm8k"
30
+ F2ZH_MODEL_ID = "FormosanBank/nllb200-formosan-zh-spm8k"
31
+ ZH2F_MODEL_ID = "FormosanBank/nllb200-zh-formosan-spm8k"
32
  ENGLISH_LID = "eng_Latn"
33
+ CHINESE_LID = "zho_Hant"
34
  MAX_INPUT_LENGTH = 384
35
 
36
 
 
55
  DIRECTION_LABELS = {
56
  "Formosan → English": "f2en",
57
  "English → Formosan": "en2f",
58
+ "Formosan → Chinese": "f2zh",
59
+ "Chinese → Formosan": "zh2f",
60
  }
61
 
62
  DOMAIN_CHOICES = {
 
160
  4,
161
  1.15,
162
  ),
163
+ "Chinese → Amis: 他回家了。": (
164
+ "他回家了。",
165
+ "Chinese → Formosan",
166
+ "Amis",
167
+ "Unknown / general",
168
+ "Default / unknown",
169
+ 96,
170
+ 4,
171
+ 1.15,
172
+ ),
173
+ "Amis → Chinese: Pa'araw cingra...": (
174
+ "Pa'araw cingra to demak nira.",
175
+ "Formosan → Chinese",
176
+ "Amis",
177
+ "Unknown / general",
178
+ "Default / unknown",
179
+ 96,
180
+ 4,
181
+ 1.15,
182
+ ),
183
  }
184
 
185
 
 
215
  return unicodedata.normalize("NFKC", replace_nonprint(text)).strip()
216
 
217
 
218
+ def preproc_chinese(text: str) -> str:
219
+ return unicodedata.normalize("NFKC", replace_nonprint(text)).strip()
220
+
221
+
222
  @dataclass
223
  class ModelBundle:
224
  tokenizer: NllbTokenizer
 
235
 
236
 
237
  def model_id_for(direction_key: str) -> str:
238
+ return {
239
+ "f2en": F2EN_MODEL_ID,
240
+ "en2f": EN2F_MODEL_ID,
241
+ "f2zh": F2ZH_MODEL_ID,
242
+ "zh2f": ZH2F_MODEL_ID,
243
+ }[direction_key]
244
 
245
 
246
  def load_bundle(direction_key: str) -> ModelBundle:
 
248
  device = active_device()
249
  with MODEL_LOCK:
250
  if direction_key not in MODEL_CACHE:
251
+ if device.type == "cuda":
252
+ for bundle in MODEL_CACHE.values():
253
+ if next(bundle.model.parameters()).device.type == "cuda":
254
+ bundle.model.to("cpu")
255
+ torch.cuda.empty_cache()
256
  tokenizer = NllbTokenizer.from_pretrained(repo_id)
257
  dtype = torch.float16 if device.type == "cuda" else torch.float32
258
  model = AutoModelForSeq2SeqLM.from_pretrained(repo_id, torch_dtype=dtype)
 
294
  dialect_tag = known_tag(tokenizer, f"<dialect_{dialect_value}>", "<dialect_default>")
295
  if direction_key == "f2en":
296
  return f"<to_eng> <src_{lang_code}> {domain_tag} {dialect_tag} {text}"
297
+ if direction_key == "en2f":
298
+ return f"<to_{lang_code}> <src_eng> {domain_tag} {dialect_tag} {text}"
299
+ if direction_key == "f2zh":
300
+ return f"<to_zh> <src_{lang_code}> {domain_tag} {dialect_tag} {text}"
301
+ return f"<to_{lang_code}> <src_zh> {domain_tag} {dialect_tag} {text}"
302
 
303
 
304
  @gpu
 
328
  tokenizer.src_lang = lang_lid
329
  clean_text = preproc_formosan(raw_text)
330
  target_lid = ENGLISH_LID
331
+ elif direction_key == "en2f":
332
  tokenizer.src_lang = ENGLISH_LID
333
  clean_text = preproc_english(raw_text)
334
  target_lid = lang_lid
335
+ elif direction_key == "f2zh":
336
+ tokenizer.src_lang = lang_lid
337
+ clean_text = preproc_formosan(raw_text)
338
+ target_lid = CHINESE_LID
339
+ else:
340
+ tokenizer.src_lang = CHINESE_LID
341
+ clean_text = preproc_chinese(raw_text)
342
+ target_lid = lang_lid
343
 
344
  prompt = format_prompt(tokenizer, clean_text, direction_key, lang_code, domain_value, dialect_value)
345
  forced_bos = tokenizer.convert_tokens_to_ids(target_lid)
 
378
 
379
 
380
  def swap_placeholder(direction_label: str, formosan_language: str) -> gr.Textbox:
381
+ direction_key = DIRECTION_LABELS[direction_label]
382
+ if direction_key in {"f2en", "f2zh"}:
383
+ target = "English" if direction_key == "f2en" else "Traditional Chinese"
384
  return gr.Textbox(
385
+ placeholder=f"Enter text in {formosan_language}. The app will translate it into {target}.",
386
  label=f"{formosan_language} input",
387
  )
388
+ source = "English" if direction_key == "en2f" else "Traditional Chinese"
389
  return gr.Textbox(
390
+ placeholder=f"Enter {source} text to translate into {formosan_language}.",
391
+ label=f"{source} input",
392
  )
393
 
394
 
 
400
  with gr.Blocks(title="FormosanBank MT") as demo:
401
  gr.Markdown(
402
  """
403
+ # Formosan ↔ English / Chinese MT
404
 
405
+ Translate between 15 Formosan languages and English or Traditional Chinese using directional NLLB-200 checkpoints.
406
  The app adds the training control tags internally; users only choose direction and language.
407
  """
408
  )
 
482
  **Current hard-split scores**
483
 
484
  Formosan→English: BLEU 8.23 / chrF2 27.35
485
+ English→Formosan: BLEU 5.77 / chrF2 30.24
486
+ Formosan→Chinese: BLEU 9.79 / chrF2 11.77
487
+ Chinese→Formosan: BLEU 7.65 / chrF2 32.97
488
  """
489
  )
490
 
 
496
  or culturally inappropriate, especially when translating from English into a Formosan language.
497
  Use fluent-speaker review for community-facing, ceremonial, legal, medical, or other high-stakes use.
498
 
499
+ Model cards and evaluation details are available at:
500
+
501
+ - [`FormosanBank/nllb200-formosan-en-spm8k`](https://huggingface.co/FormosanBank/nllb200-formosan-en-spm8k)
502
+ - [`FormosanBank/nllb200-en-formosan-spm8k`](https://huggingface.co/FormosanBank/nllb200-en-formosan-spm8k)
503
+ - [`FormosanBank/nllb200-formosan-zh-spm8k`](https://huggingface.co/FormosanBank/nllb200-formosan-zh-spm8k)
504
+ - [`FormosanBank/nllb200-zh-formosan-spm8k`](https://huggingface.co/FormosanBank/nllb200-zh-formosan-spm8k)
505
  """
506
  )
507