# Indictrans2final
#
# Step 1: Clean environment and clone repo fresh
# Step 2: Install dependencies with pinned transformers and stable indic-transliteration
# NOTE(review): the commands for steps 1-2 are not part of this file; presumably
# they were run separately (e.g. as notebook shell cells) -- TODO confirm.

# Step 3: Add IndicTransToolkit source to system path
import sys

sys.path.insert(0, '/content/IndicTrans2/src')

# Step 4: Import all required packages
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IndicTransToolkit.processor import IndicProcessor
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
import torch

# Step 5: Load models and tokenizer
device = torch.device("cpu")
model_name = "ai4bharat/indictrans2-indic-en-dist-200M"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name, trust_remote_code=True
).to(device)
ip = IndicProcessor(inference=True)

# Every translation in this app targets English.
TARGET_LANG = "eng_Latn"

# Step 6: Language codes for transliteration and translation
# "xlit" is the indic_transliteration script constant used to convert romanized
# input; "indictrans" is the language tag handed to IndicProcessor.
LANG_CODES = {
    "Hindi": {"xlit": sanscript.DEVANAGARI, "indictrans": "hin_Deva"},
    "Tamil": {"xlit": sanscript.TAMIL, "indictrans": "tam_Taml"},
    "Bengali": {"xlit": sanscript.BENGALI, "indictrans": "ben_Beng"},
    "Telugu": {"xlit": sanscript.TELUGU, "indictrans": "tel_Telu"},
    "Kannada": {"xlit": sanscript.KANNADA, "indictrans": "kan_Knda"},
    "Malayalam": {"xlit": sanscript.MALAYALAM, "indictrans": "mal_Mlym"},
    "Gujarati": {"xlit": sanscript.GUJARATI, "indictrans": "guj_Gujr"},
    "Punjabi": {"xlit": sanscript.GURMUKHI, "indictrans": "pan_Guru"},
    "Marathi": {"xlit": sanscript.DEVANAGARI, "indictrans": "mar_Deva"},
}


# Step 7: Native translation function
def translate_native_script(text: str, lang: str) -> str:
    """Translate native-script Indic text to English.

    Args:
        text: Text written in the native script of ``lang``.
        lang: A key of ``LANG_CODES`` (e.g. ``"Hindi"``).

    Returns:
        The English translation, or a short prompt message when ``text`` is
        empty/blank or ``lang`` is not a key of ``LANG_CODES``.
    """
    if not text or not text.strip():
        return "Please enter text."
    # A cleared or unexpected dropdown value would otherwise raise KeyError.
    if lang not in LANG_CODES:
        return "Please select a supported language."

    src_lang = LANG_CODES[lang]["indictrans"]
    processed_text = ip.preprocess_batch(
        [text], src_lang=src_lang, tgt_lang=TARGET_LANG
    )
    # truncation=True clips over-long input to the tokenizer's configured
    # maximum length instead of feeding an oversized sequence to the model.
    inputs = tokenizer(
        processed_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(**inputs, num_beams=5, max_length=256)

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # Post-processing is applied to the generated text, so it must be told the
    # target language (English), not the source language.
    return ip.postprocess_batch(decoded, lang=TARGET_LANG)[0]


# Step 8: Romanized translation function (transliterate then translate)
def translate_roman_script(text: str, lang: str) -> str:
    """Translate romanized Indic text to English.

    The input is interpreted as ITRANS romanization, converted to the script
    registered for ``lang`` in ``LANG_CODES``, and then passed to
    ``translate_native_script``.

    Args:
        text: Romanized text.
        lang: A key of ``LANG_CODES`` (e.g. ``"Hindi"``).

    Returns:
        The English translation, or a short prompt message when ``text`` is
        empty/blank or ``lang`` is not a key of ``LANG_CODES``.
    """
    if not text or not text.strip():
        return "Please enter text."
    if lang not in LANG_CODES:
        return "Please select a supported language."

    translit_script = LANG_CODES[lang]["xlit"]
    native_text = transliterate(text, sanscript.ITRANS, translit_script)
    return translate_native_script(native_text, lang)


# Step 9: Gradio UI with separate tabs
with gr.Blocks() as demo:
    gr.Markdown("## IndicTrans2: Multilingual Translation")
    gr.Markdown(
        "Translate native and romanized Indian languages to English using "
        "specialized high-accuracy workflows."
    )

    with gr.Tab("Native Script to English"):
        native_input = gr.Textbox(
            lines=5,
            label="Native Indian Language Text",
            placeholder="यहाँ अपना पाठ दर्ज करें...",
        )
        native_lang = gr.Dropdown(
            list(LANG_CODES), label="Select Language", value="Hindi"
        )
        native_output = gr.Textbox(label="English Translation")
        gr.Button("Translate").click(
            translate_native_script,
            inputs=[native_input, native_lang],
            outputs=native_output,
        )

    with gr.Tab("Romanized Script to English"):
        roman_input = gr.Textbox(
            lines=5,
            label="Romanized Indian Language Text",
            placeholder="Aap kaise hain?",
        )
        roman_lang = gr.Dropdown(
            list(LANG_CODES), label="Select Language", value="Hindi"
        )
        roman_output = gr.Textbox(label="English Translation")
        gr.Button("Translate").click(
            translate_roman_script,
            inputs=[roman_input, roman_lang],
            outputs=roman_output,
        )

print("🚀 Launching the IndicTrans2 translation app...")
demo.launch(share=True)