Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import AutoTokenizer, pipeline | |
# Checkpoint identifier shared by the tokenizer and the translation pipeline.
model_id = "Saugat212/ne-en-nllb-model"

# Language codes forwarded to the pipeline on every call
# (source is labelled "Nepali" and target "English" in the UI).
SOURCE_LANG = "npi_Deva"
TARGET_LANG = "eng_Latn"

# Stand-alone tokenizer: used only to count and slice tokens when chunking.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Translation pipeline built from the same checkpoint.
translator = pipeline(task="translation", model=model_id, device_map="auto")
# ---- CHUNKING FUNCTION (TOKEN-BASED) ----
def chunk_by_tokens(text, max_tokens=1024):
    """Split ``text`` into pieces of at most ``max_tokens`` tokenizer tokens.

    The text is encoded with the module-level ``tokenizer`` (special tokens
    are not added), the token ids are cut into consecutive windows of
    ``max_tokens`` ids, and each window is decoded back into a string.
    Cuts fall on token boundaries, not on sentence boundaries.

    Args:
        text: Text to split.
        max_tokens: Maximum number of token ids per chunk; must be >= 1.

    Returns:
        A list of chunk strings in input order. The list is empty when
        ``text`` encodes to no tokens. Text that already fits into a single
        chunk is returned unchanged as the only element.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    # A step of 0 would make range() fail with an unrelated message, and a
    # negative step would silently yield no chunks (dropping the whole text).
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be >= 1, got {max_tokens!r}")

    tokens = tokenizer.encode(text, add_special_tokens=False)
    if not tokens:
        return []

    if len(tokens) <= max_tokens:
        # No split needed: hand back the caller's text as-is rather than a
        # decode(encode(text)) round trip, which is not guaranteed to
        # reproduce the input exactly.
        return [text]

    return [
        tokenizer.decode(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]
# ---- TRANSLATION FUNCTION WITH TOKEN CHUNKING ----
def translate_text(text):
    """Translate ``text`` from ``SOURCE_LANG`` to ``TARGET_LANG``.

    The input is split with ``chunk_by_tokens`` into chunks of at most 1024
    tokens, each chunk is sent through the module-level ``translator``
    pipeline, and the per-chunk translations are joined with single spaces.

    Args:
        text: Text to translate. ``None``, empty and whitespace-only input
            is accepted.

    Returns:
        The translated text, or ``""`` when there is nothing to translate.
    """
    # Nothing to translate: skip tokenisation and the model call entirely.
    if not text or not text.strip():
        return ""

    max_chunk_tokens = 1024

    translated_parts = []
    for chunk in chunk_by_tokens(text, max_tokens=max_chunk_tokens):
        result = translator(
            chunk,
            src_lang=SOURCE_LANG,
            tgt_lang=TARGET_LANG,
            # Give generation an explicit budget sized to the chunk. Without
            # it, output length is capped by the pipeline/model default,
            # which can be far shorter than a full chunk and would silently
            # cut the translation off.
            # NOTE(review): confirm the default limit for this checkpoint.
            max_new_tokens=max_chunk_tokens,
        )
        translated_parts.append(result[0]["translation_text"])

    return " ".join(translated_parts)
# ---- GRADIO UI ----
# Single-function web UI: one text box in, one text box out.
iface = gr.Interface(
    fn=translate_text,
    inputs=gr.Textbox(lines=5, label=f"Nepali ({SOURCE_LANG})"),
    outputs=gr.Textbox(lines=5, label=f"English ({TARGET_LANG})"),
    # The direction arrow had been corrupted into "β" by a bad re-encoding;
    # restored so the title reads as a source-to-target direction.
    title="Nepali → English (NLLB Long-Text Translator)",
)

# Start the app when the script runs (kept at module level, as before).
iface.launch()