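# NOTE: the next three lines are not Python code (they appear to be page/commit
# metadata); they are kept as comments so the file parses.
# kasimali's picture
# Update app.py
# ef77679 verified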
# Indictrans2final
# Step 1: Clean environment and clone repo fresh (no code for this step is present in this file)
# Step 2: Install dependencies with pinned transformers and stable indic-transliteration (no code for this step is present in this file)
# Step 3: Add IndicTransToolkit source to system path
import sys
sys.path.insert(0, '/content/IndicTrans2/src')
# Step 4: Import all required packages
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IndicTransToolkit.processor import IndicProcessor
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
import torch
# Step 5: Load models and tokenizer
# Checkpoint used for translation into English; inference runs on the CPU.
model_name = "ai4bharat/indictrans2-indic-en-dist-200M"
device = torch.device("cpu")

# trust_remote_code=True is passed for both the tokenizer and the model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
model = model.to(device)

# Pre-/post-processor used around tokenization, created in inference mode.
ip = IndicProcessor(inference=True)
# Step 6: Language codes for transliteration and translation
# Maps each UI language name to:
#   "xlit"       - the sanscript scheme used as the transliteration target
#   "indictrans" - the language tag passed to the IndicTrans2 processor
LANG_CODES = {
    language: {"xlit": script, "indictrans": tag}
    for language, script, tag in (
        ("Hindi", sanscript.DEVANAGARI, "hin_Deva"),
        ("Tamil", sanscript.TAMIL, "tam_Taml"),
        ("Bengali", sanscript.BENGALI, "ben_Beng"),
        ("Telugu", sanscript.TELUGU, "tel_Telu"),
        ("Kannada", sanscript.KANNADA, "kan_Knda"),
        ("Malayalam", sanscript.MALAYALAM, "mal_Mlym"),
        ("Gujarati", sanscript.GUJARATI, "guj_Gujr"),
        ("Punjabi", sanscript.GURMUKHI, "pan_Guru"),
        ("Marathi", sanscript.DEVANAGARI, "mar_Deva"),
    )
}
# Step 7: Native translation function
def translate_native_script(text, lang):
    """Translate native-script Indic text to English with IndicTrans2.

    Args:
        text: Source text written in the native script of ``lang``.
        lang: Display name of the source language; expected to be a key of
            ``LANG_CODES`` (for example ``"Hindi"``).

    Returns:
        The English translation as a string. If ``text`` is empty or only
        whitespace, or ``lang`` is not a key of ``LANG_CODES``, a short prompt
        message is returned instead.
    """
    if not text or not text.strip():
        return "Please enter text."
    # A missing or unknown language would otherwise surface as a KeyError;
    # answer in the output box instead, like the empty-text case above.
    if lang not in LANG_CODES:
        return "Please select a supported language."

    src_lang = LANG_CODES[lang]["indictrans"]
    tgt_lang = "eng_Latn"

    batch = ip.preprocess_batch([text], src_lang=src_lang, tgt_lang=tgt_lang)
    inputs = tokenizer(batch, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, num_beams=5, max_length=256)
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # The decoded text is English, so post-processing must be told the target
    # language; passing the source language applied Indic post-processing
    # to English output.
    return ip.postprocess_batch(decoded, lang=tgt_lang)[0]
# Step 8: Romanized translation function (transliterate then translate)
def translate_roman_script(text, lang):
    """Translate romanized (Latin-script) Indic text to English.

    The input is read as ITRANS romanization, transliterated into the script
    configured for ``lang`` in ``LANG_CODES``, and then handed to
    ``translate_native_script``.

    Args:
        text: Romanized source text.
        lang: Display name of the source language; expected to be a key of
            ``LANG_CODES`` (for example ``"Hindi"``).

    Returns:
        The English translation as a string. If ``text`` is empty or only
        whitespace, or ``lang`` is not a key of ``LANG_CODES``, a short prompt
        message is returned instead.
    """
    if not text or not text.strip():
        return "Please enter text."
    # Guard the lookup below: a missing or unknown language would otherwise
    # raise KeyError instead of giving the user a readable message.
    if lang not in LANG_CODES:
        return "Please select a supported language."

    target_script = LANG_CODES[lang]["xlit"]
    native_text = transliterate(text, sanscript.ITRANS, target_script)
    return translate_native_script(native_text, lang)
# Step 9: Gradio UI with separate tabs
with gr.Blocks() as demo:
    # Heading and short description shown above the tabs.
    gr.Markdown("## IndicTrans2: Multilingual Translation")
    gr.Markdown("Translate native and romanized Indian languages to English using specialized high-accuracy workflows.")

    # Tab 1: text already written in the language's own script.
    with gr.Tab("Native Script to English"):
        native_input = gr.Textbox(
            lines=5,
            label="Native Indian Language Text",
            placeholder="यहाँ अपना पाठ दर्ज करें...",
        )
        native_lang = gr.Dropdown(list(LANG_CODES), label="Select Language", value="Hindi")
        native_output = gr.Textbox(label="English Translation")
        native_button = gr.Button("Translate")
        native_button.click(
            fn=translate_native_script,
            inputs=[native_input, native_lang],
            outputs=native_output,
        )

    # Tab 2: romanized text, transliterated before translation.
    with gr.Tab("Romanized Script to English"):
        roman_input = gr.Textbox(
            lines=5,
            label="Romanized Indian Language Text",
            placeholder="Aap kaise hain?",
        )
        roman_lang = gr.Dropdown(list(LANG_CODES), label="Select Language", value="Hindi")
        roman_output = gr.Textbox(label="English Translation")
        roman_button = gr.Button("Translate")
        roman_button.click(
            fn=translate_roman_script,
            inputs=[roman_input, roman_lang],
            outputs=roman_output,
        )

# Announce start-up on stdout, then launch the app with share=True.
print("🚀 Launching the IndicTrans2 translation app...")
demo.launch(share=True)