| | import gradio as gr |
| | from transformers import NllbTokenizer |
| | import ctranslate2 |
| | from huggingface_hub import snapshot_download |
| |
|
| |
|
| | MODEL_ID = "Tamazight-NLP/NLLB-200-600M-Tamazight-All-Data-3-epoch-ct2-int8" |
| |
|
| | snapshot_download(MODEL_ID, local_dir=MODEL_ID.split('/')[1]) |
| |
|
| | translator = ctranslate2.Translator(MODEL_ID.split('/')[1]) |
| | nllb_tokenizer = NllbTokenizer.from_pretrained(MODEL_ID.split('/')[1]) |
| |
|
| | NLLB_LANG_MAPPING = { |
| | "English": "eng_Latn", |
| | "Standard Moroccan Tamazight": "tzm_Tfng", |
| | "Tachelhit/Central Atlas Tamazight": "taq_Tfng", |
| | "Tachelhit/Central Atlas Tamazight (Latin)": "taq_Latn", |
| | "Tarifit": "kab_Tfng", |
| | "Tarifit (Latin)": "kab_Latn", |
| | "Moroccan Darija": "ary_Arab", |
| | "Modern Standard Arabic": "arb_Arab", |
| | "Catalan": "cat_Latn", |
| | "Spanish": "spa_Latn", |
| | "French": "fra_Latn", |
| | "German": "deu_Latn", |
| | "Dutch": "nld_Latn", |
| | "Russian": "rus_Cyrl", |
| | "Italian": "ita_Latn", |
| | "Turkish": "tur_Latn", |
| | "Esperanto": "epo_Latn" |
| | } |
| |
|
| |
|
| | def translate(text, source_lang="English", target_lang="Tachelhit/Central Atlas Tamazight", |
| | max_length=237, num_beams=4, repetition_penalty=1.0): |
| | """ |
| | Translate multi-line text while preserving line breaks. |
| | Each line is translated independently. |
| | """ |
| | translations = [] |
| | for line in text.split("\n"): |
| | if line.strip() == "": |
| | translations.append("") |
| | else: |
| | nllb_tokenizer.src_lang = NLLB_LANG_MAPPING[source_lang] |
| | source = nllb_tokenizer.convert_ids_to_tokens(nllb_tokenizer.encode(line)) |
| | target_prefix = [NLLB_LANG_MAPPING[target_lang]] |
| | results = translator.translate_batch( |
| | [source], |
| | target_prefix=[target_prefix], |
| | max_decoding_length=max_length, |
| | beam_size=num_beams, |
| | repetition_penalty=repetition_penalty, |
| | ) |
| | target = results[0].hypotheses[0][1:] |
| | |
| | translation = nllb_tokenizer.decode(nllb_tokenizer.convert_tokens_to_ids(target), skip_special_tokens=True) |
| |
|
| | translations.append(translation) |
| |
|
| | return "\n".join(translations) |
| |
|
| |
|
| | gradio_ui= gr.Interface( |
| | fn=translate, |
| | title="NLLB Tamazight Translation Demo", |
| | inputs= [ |
| | gr.components.Textbox(label="Text", lines=4, placeholder="ⵙⵙⴽⵛⵎ ⴰⴹⵕⵉⵚ...\nEnter text to translate..."), |
| | gr.components.Dropdown(label="Source Language", choices=list(NLLB_LANG_MAPPING.keys()), value="English"), |
| | gr.components.Dropdown(label="Target Language", choices=list(NLLB_LANG_MAPPING.keys()), value="Standard Moroccan Tamazight"), |
| | gr.components.Slider(1, 400, value=237, step=10, label="Max Length (in tokens). Increase in case the output looks truncated."), |
| | gr.components.Slider(1, 25, value=4, step=1, label="Number of beams. Higher values might improve translation accuracy at the cost of speed."), |
| | gr.components.Slider(1, 10, value=1.0, step=0.1, label="Repetition penalty."), |
| | ], |
| | outputs=gr.components.Textbox(label="Translated text", lines=4) |
| | ) |
| |
|
| | gradio_ui.launch() |