# Saugat212's picture
# tokens
# 0394171 verified
import gradio as gr
from transformers import AutoTokenizer, pipeline
# Hub repository that provides both the tokenizer and the translation model.
model_id = "Saugat212/ne-en-nllb-model"

# Language codes forwarded to the pipeline as src_lang / tgt_lang.
SOURCE_LANG = "npi_Deva"
TARGET_LANG = "eng_Latn"

# Load tokenizer (also used directly for token-based chunking).
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load translator. The tokenizer instance created above is handed to the
# pipeline so it is not loaded a second time and so chunking and translation
# are guaranteed to tokenize with the very same object.
translator = pipeline(
    "translation",
    model=model_id,
    tokenizer=tokenizer,
    device_map="auto",
)
# ---- CHUNKING FUNCTION (TOKEN-BASED) ----
def chunk_by_tokens(text, max_tokens=1024):
    """Split ``text`` into pieces of at most ``max_tokens`` tokenizer tokens.

    The text is encoded with the module-level ``tokenizer`` (without special
    tokens), the resulting id sequence is cut into consecutive windows of
    ``max_tokens`` ids, and every window is decoded back into a string.

    Args:
        text: Input string to split. ``None`` or an empty string yields ``[]``.
        max_tokens: Maximum number of token ids per chunk; must be >= 1.

    Returns:
        A list of decoded chunk strings in their original order.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    # A non-positive window would either make range() fail with an unrelated
    # message (0) or silently produce no chunks at all (negative values).
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be >= 1, got {max_tokens!r}")

    # Nothing to split; avoid calling the tokenizer at all.
    if not text:
        return []

    token_ids = tokenizer.encode(text, add_special_tokens=False)

    # NOTE(review): windows are cut at fixed token offsets, so a chunk can
    # end in the middle of a sentence or word.
    return [
        tokenizer.decode(token_ids[start:start + max_tokens])
        for start in range(0, len(token_ids), max_tokens)
    ]
# ---- TRANSLATION FUNCTION WITH TOKEN CHUNKING ----
def translate_text(text):
    """Translate ``text`` from ``SOURCE_LANG`` to ``TARGET_LANG`` chunk by chunk.

    The input is split with ``chunk_by_tokens``; each chunk is sent through
    the module-level ``translator`` pipeline and the translated pieces are
    joined with single spaces.

    Args:
        text: Text to translate. ``None``, empty, or whitespace-only input
            returns an empty string without invoking the model.

    Returns:
        The concatenated translation of all chunks.
    """
    # Skip the model entirely when there is nothing to translate.
    if not text or not text.strip():
        return ""

    # Token budget per input chunk; reused as the generation length cap.
    chunk_budget = 1024

    # NOTE(review): without an explicit max_length the pipeline falls back to
    # the checkpoint's generation default (NLLB configs commonly use 200 --
    # confirm for this model), which would cut off the output of long chunks.
    translated_parts = [
        translator(
            chunk,
            src_lang=SOURCE_LANG,
            tgt_lang=TARGET_LANG,
            max_length=chunk_budget,
        )[0]["translation_text"]
        for chunk in chunk_by_tokens(text, max_tokens=chunk_budget)
    ]
    return " ".join(translated_parts)
# ---- GRADIO UI ----
# One input textbox and one output textbox wired to translate_text; the
# labels show the language codes used for translation.
iface = gr.Interface(
    fn=translate_text,
    inputs=gr.Textbox(lines=5, label=f"Nepali ({SOURCE_LANG})"),
    outputs=gr.Textbox(lines=5, label=f"English ({TARGET_LANG})"),
    # The arrow was previously stored as mis-decoded bytes and rendered as
    # garbage in the page title.
    title="Nepali → English (NLLB Long-Text Translator)",
)

# Start the web server as soon as the script runs.
iface.launch()