Spaces:

kasimali
/

demonstrateindiclidtrans2

Runtime error

App Files Files Community

kasimali committed on Oct 8, 2025

Commit

bad5076

verified ·

1 Parent(s): 6dd899c

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

README.md +3 -8
app.py +232 -0
requirements.txt +5 -0

README.md CHANGED Viewed

@@ -1,12 +1,7 @@
 ---
-title: Demonstrateindiclidtrans2
-emoji: 📊
-colorFrom: pink
-colorTo: green
 sdk: gradio
-sdk_version: 5.49.0
-app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: demonstrateindiclidtrans2
+emoji: 🚀
 sdk: gradio
 ---
+# demonstrateindiclidtrans2

app.py ADDED Viewed

	@@ -0,0 +1,232 @@

+# demonstrateindiclidtrans2
+print("--- 1. Installing All Libraries ---")
+print("✅ Libraries installed.")
+print("\n--- 2. Cloning IndicLID Repository ---")
+# Using your proven method of changing directories
+print("✅ Repository cloned.")
+# Navigate into the correct directory structure
+print("\n--- 3. Downloading and Unzipping IndicLID Models ---")
+print("✅ Download commands executed. Unzipping now...")
+print("✅ Unzip commands executed.")
+import os
+import sys
+import torch
+print("--- Applying your original add_safe_globals fix... ---")
+if "/content/IndicLID/Inference" not in sys.path:
+    sys.path.append("/content/IndicLID/Inference")
+from transformers.models.bert.modeling_bert import (
+    BertModel, BertPreTrainedModel, BertForSequenceClassification,
+    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
+    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput
+)
+from transformers.models.bert.configuration_bert import BertConfig
+import torch.nn as nn
+from torch.nn.modules.sparse import Embedding
+from torch.nn.modules.container import ModuleList
+from torch.nn.modules.linear import Linear
+from torch.nn.modules.normalization import LayerNorm
+from torch.nn.modules.dropout import Dropout
+torch.serialization.add_safe_globals([
+    BertModel, BertPreTrainedModel, BertForSequenceClassification,
+    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
+    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput, BertConfig,
+    Embedding, ModuleList, Linear, LayerNorm, Dropout,
+])
+print("✅ Comprehensive safe globals added successfully.")
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from IndicTransToolkit.processor import IndicProcessor
+from ai4bharat.IndicLID import IndicLID
+print("--- Loading all models into memory... ---")
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")
+lid = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
+print("✅ IndicLID model loaded successfully.")
+MODEL_ID = "ai4bharat/indictrans2-indic-en-1B"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, trust_remote_code=True).to(device)
+ip = IndicProcessor(inference=True)
+import gradio as gr
+import pandas as pd
+from indic_transliteration import sanscript
+from indic_transliteration.sanscript import transliterate
+# Language mapping for translation
+LID_TO_TRANSLATE = {
+    "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
+    "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
+    "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
+    "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
+    "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
+    "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
+    "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
+    "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
+    "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
+    "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
+    "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
+    "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
+    "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
+    "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
+    "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
+    "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
+    "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
+    "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
+    "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
+    "urd_Arab": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
+    "urd_Latn": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
+    # Common misdetections mapped to supported languages
+    "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
+    "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
+    "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
+    "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
+    "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
+    "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
+    "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
+}
+def enhanced_transliterate_robust(text, target_script):
+    try:
+        cleaned_text = text.lower().strip()
+        replacements = {
+            'kh': 'kh', 'ch': 'ch', 'th': 'th', 'ph': 'ph',
+            'bh': 'bh', 'dh': 'dh', 'gh': 'gh', 'jh': 'jh',
+            'aa': 'A', 'ee': 'I', 'oo': 'U', 'ou': 'au'
+        }
+        for old, new in replacements.items():
+            cleaned_text = cleaned_text.replace(old, new)
+        result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
+        return result if result else text
+    except Exception as e:
+        return text
+def detect_and_translate_single(text):
+    """
+    Detect language and translate single text input
+    """
+    try:
+        # Language detection
+        preds = lid.batch_predict([text], 1)
+        item = preds[0]
+        if isinstance(item, dict):
+            detected_lang = item.get("lang", item.get("pred_lang", ""))
+            score = float(item.get("score", 0.0))
+            model_name = item.get("model", "")
+        else:
+            _, detected_lang, score, model_name = item
+        is_romanized = detected_lang.endswith("_Latn")
+        script_type = "Romanized" if is_romanized else "Native Script"
+        # Translation
+        if detected_lang not in LID_TO_TRANSLATE:
+            translation = f"Language '{detected_lang}' not supported for translation"
+            method = "Unsupported"
+        else:
+            try:
+                lang_info = LID_TO_TRANSLATE[detected_lang]
+                src_code = lang_info["it_code"]
+                if is_romanized:
+                    # Enhanced transliteration for romanized text
+                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
+                    method = "Transliteration + IndicTrans2"
+                else:
+                    native_text = text
+                    method = "IndicTrans2"
+                # Translate with IndicTrans2
+                pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn")
+                inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
+                with torch.no_grad():
+                    out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
+                dec = tokenizer.batch_decode(out, skip_special_tokens=True)
+                post = ip.postprocess_batch(dec, lang=src_code)
+                translation = post[0]
+            except Exception as e:
+                translation = f"Translation error: {str(e)}"
+                method = "Error"
+        return detected_lang, script_type, f"{score:.3f}", method, translation
+    except Exception as e:
+        return "Error", "Error", "0.000", "Error", f"Detection error: {str(e)}"
+# Gradio Interface
+def gradio_interface(input_text):
+    if not input_text.strip():
+        return "Please enter some text", "", "", "", ""
+    detected_lang, script_type, confidence, method, translation = detect_and_translate_single(input_text)
+    return detected_lang, script_type, confidence, method, translation
+# Create Gradio app
+# Layout: an input row (textbox + button), a row with two columns of
+# read-only result boxes, the translation box, then clickable examples.
+with gr.Blocks(title="Indian Language Detection & Translation") as app:
+    gr.Markdown("# 🇮🇳 Indian Language Detector & Translator")
+    gr.Markdown("Enter text in any Indian language (native script or romanized) to detect the language and get English translation.")
+    with gr.Row():
+        with gr.Column():
+            input_text = gr.Textbox(
+                label="Input Text",
+                placeholder="Enter text in Hindi, Tamil, Bengali, etc...",
+                lines=3
+            )
+            translate_btn = gr.Button("🔍 Detect & Translate", variant="primary")
+    # Result fields are display-only (interactive=False).
+    with gr.Row():
+        with gr.Column():
+            detected_lang = gr.Textbox(label="🎯 Detected Language", interactive=False)
+            script_type = gr.Textbox(label="📝 Script Type", interactive=False)
+        with gr.Column():
+            confidence = gr.Textbox(label="🎯 Confidence Score", interactive=False)
+            method = gr.Textbox(label="⚙️ Translation Method", interactive=False)
+    translation_output = gr.Textbox(
+        label="🌍 English Translation",
+        interactive=False,
+        lines=2
+    )
+    # Examples
+    # Sample inputs, alternating native-script and romanized text; each one
+    # is a single-item row targeting the input textbox.
+    gr.Examples(
+        examples=[
+            ["तुम कैसे हो?"],
+            ["tum kaise ho"],
+            ["நீங்கள் எப்படி இருக்கிறீர்கள்?"],
+            ["neenga epdi irukeenga"],
+            ["আমি ভালো আছি।"],
+            ["ami bhalo achi"],
+            ["ನೀವು ಹೇಗಿದ್ದೀರಾ?"],
+            ["neevu hegiddira"]
+        ],
+        inputs=input_text,
+        label="📚 Try these examples:"
+    )
+    # The five values returned by gradio_interface fill the five output
+    # boxes in the order listed here.
+    translate_btn.click(
+        fn=gradio_interface,
+        inputs=[input_text],
+        outputs=[detected_lang, script_type, confidence, method, translation_output]
+    )
+# Launch the app
+# NOTE(review): share=True requests a public share link; that is presumably
+# unnecessary when the app is hosted on Spaces -- confirm and drop if so.
+app.launch(share=True, debug=False)

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+# NOTE(review): app.py also imports ai4bharat.IndicLID, which is loaded from a
+# local checkout of the IndicLID repository rather than from a package listed
+# here -- confirm how it (and its own dependencies) get installed.
+gradio
+IndicTransToolkit
+indic-transliteration
+pandas
+sentencepiece
+torch
+transformers