| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig |
| from sacremoses import MosesPunctNormalizer |
| from flores import code_mapping |
| import gradio as gr |
| import platform |
|
|
# --- Device & model setup ----------------------------------------------------
# Apple Silicon Macs have no CUDA; fall back to CPU there, use the GPU elsewhere.
# NOTE(review): the original immediately overwrote this with device = "cpu",
# making the platform check dead code — and its cuda branch was an empty `pass`
# that left `model` undefined (NameError on any CUDA machine).
device = "cpu" if platform.system() == "Darwin" else "cuda"

# Local directory holding the (pre-quantized) NLLB-200 distilled 600M checkpoint.
MODEL_DIR = "./nllb-600M-quantized"

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

if device == "cuda":
    # Load in 8-bit via bitsandbytes to keep VRAM usage low; device_map places
    # the weights on the GPU. (BitsAndBytesConfig was imported but unused before.)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        MODEL_DIR,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto",
    )
else:
    # CPU path: plain load — the checkpoint itself is already quantized.
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR)

# Normalizes punctuation (smart quotes, dashes, spacing) before tokenization.
punct_normalizer = MosesPunctNormalizer(lang="en")
|
|
| |
# Display-name -> NLLB (FLORES-200) language-code lookup.
# NOTE(review): `langs` is rebound to list(code_mapping.keys()) further down,
# before the UI is built, so this five-entry mapping is shadowed — confirm
# whether it is still needed.
langs = dict(
    Hindi="hin_Deva",
    French="fra_Latn",
    Spanish="spa_Latn",
    German="deu_Latn",
    Arabic="arb_Arab",
)
|
|
def translate(text: str, src_lang: str, tgt_lang: str) -> str:
    """Translate *text* from *src_lang* to *tgt_lang*.

    Args:
        text: Raw input text to translate.
        src_lang: Source language display name; must be a key of ``code_mapping``.
        tgt_lang: Target language display name; must be a key of ``code_mapping``.

    Returns:
        The translated text as a single string ("" for empty input).

    Raises:
        KeyError: If either language name is not present in ``code_mapping``.
    """
    # Nothing to translate — skip the model call entirely.
    if not text or not text.strip():
        return ""

    src_code = code_mapping[src_lang]
    tgt_code = code_mapping[tgt_lang]

    # NLLB tokenizers prepend the source-language tag based on src_lang;
    # tgt_lang is set for symmetry (generation is steered by forced_bos below).
    tokenizer.src_lang = src_code
    tokenizer.tgt_lang = tgt_code

    # Canonicalize quotes/dashes/spacing so the model sees normalized input.
    text = punct_normalizer.normalize(text)

    inputs = tokenizer(text, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        # Force the decoder to start with the target-language tag token.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_code),
        num_beams=3,
        no_repeat_ngram_size=2,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
|
|
|
# Offer every language known to the FLORES code mapping in the dropdowns.
# (This rebinding shadows the small `langs` dict defined earlier.)
langs = list(code_mapping.keys())

# Widgets: free-text input, two language pickers, and a read-only output box.
input_widgets = [
    gr.Textbox(lines=10, label="Input Text"),
    gr.Dropdown(langs, label="Source Language"),
    gr.Dropdown(langs, label="Target Language"),
]
output_widget = gr.Textbox(lines=30, label="Translated Text")

iface = gr.Interface(
    fn=translate,
    inputs=input_widgets,
    outputs=output_widget,
    title="🌍 Language Translation (CPU-friendly)",
)

# share=True publishes a temporary public Gradio link in addition to localhost.
iface.launch(share=True)