kasimali commited on
Commit
3d5d9f5
·
verified ·
1 Parent(s): 8962a42

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +3 -8
  2. app.py +694 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,12 +1,7 @@
1
  ---
2
- title: Final
3
- emoji: 📊
4
- colorFrom: blue
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 5.49.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: final
3
+ emoji: 🚀
 
 
4
  sdk: gradio
 
 
 
5
  ---
6
 
7
+ # final
app.py ADDED
@@ -0,0 +1,694 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# final

# ================================================================
# = STEP 1: SETUP AND DOWNLOAD (YOUR PROVEN METHOD) =
# ================================================================
# NOTE(review): this file is a notebook export — the original shell cells
# (pip installs, git clone, wget/unzip) were stripped, leaving only their
# status prints. Nothing is actually installed/cloned/downloaded here.
import os

print("--- 1. Installing All Libraries ---")
print("✅ Libraries installed.")

print("\n--- 2. Cloning IndicLID Repository ---")
# Using your proven method of changing directories
print("✅ Repository cloned.")

# Navigate into the correct directory structure

print("\n--- 3. Downloading and Unzipping IndicLID Models ---")
print("✅ Download commands executed. Unzipping now...")
print("✅ Unzip commands executed.")

print("\n🎉🎉🎉 SETUP COMPLETE. You can now proceed to Step 2. 🎉🎉🎉")


import shutil
import os

# Source folder path
# WARNING(review): this MOVES the installed `transformers` package out of
# site-packages (Colab-specific path) into the cloned IndicLID tree so that
# IndicLID's relative imports resolve. It is destructive: a second run will
# fail because the source directory no longer exists, and the path is wrong
# outside Colab/python3.12 — confirm before reuse. shutil.copytree would be
# the non-destructive alternative.
source = "/usr/local/lib/python3.12/dist-packages/transformers"

# Destination folder path
destination = "/content/IndicLID/Inference/ai4bharat/"

# Ensure destination directory exists
os.makedirs(destination, exist_ok=True)

# Move folder
moved_path = shutil.move(source, destination)

print(f"Folder moved to: {moved_path}")
43
# =========================
# = STEP 2: INITIALIZE MODELS (EXACTLY AS YOUR OLD CODE) =
# =========================
import os
import sys
import torch
print("--- Applying your original add_safe_globals fix... ---")

# Make the cloned IndicLID inference code importable (Colab-specific path).
if "/content/IndicLID/Inference" not in sys.path:
    sys.path.append("/content/IndicLID/Inference")

from transformers.models.bert.modeling_bert import (
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput
)
from transformers.models.bert.configuration_bert import BertConfig
import torch.nn as nn
from torch.nn.modules.sparse import Embedding
from torch.nn.modules.container import ModuleList
from torch.nn.modules.linear import Linear
from torch.nn.modules.normalization import LayerNorm
from torch.nn.modules.dropout import Dropout

# Allow-list every class stored in the pickled IndicLID checkpoint so that
# torch.load can deserialize it under the restrictive weights-only loader.
torch.serialization.add_safe_globals([
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput, BertConfig,
    Embedding, ModuleList, Linear, LayerNorm, Dropout,
])
print("✅ Comprehensive safe globals added successfully.")

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IndicTransToolkit.processor import IndicProcessor
from ai4bharat.IndicLID import IndicLID

print("--- Loading all models into memory... ---")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Language identification (handles both native-script and romanized input).
lid = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
print("✅ IndicLID model loaded successfully.")

# Indic -> English translation model + its pre/post-processor.
MODEL_ID = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, trust_remote_code=True).to(device)
ip = IndicProcessor(inference=True)
print("✅ IndicTrans2 1B model loaded.")

print("🎉 ALL MODELS ARE LOADED. Proceed to direct batch prediction tests.")


import sys
print(sys.path)

# BUG FIX: `pip show transformers` is a shell command, not Python. As a bare
# statement it is a SyntaxError that prevents this entire file from being
# imported or run. Run it from a shell (or `!pip show transformers` in a
# notebook) instead.
# pip show transformers
99
+
100
+
101
+
102
# ================================================================
# = STEP 2.5: LOAD ROMANSETU (COMPATIBLE WITH 4.40.2) =
# ================================================================

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

print("--- Loading RomanSetu model compatible with transformers 4.40.2... ---")

# Try smaller, more compatible models first
model_options = [
    "ai4bharat/romansetu-cpt-roman-100m",
    "ai4bharat/romansetu-cpt-roman-200m"
]

rs_model = None
rs_tokenizer = None

# Load the first checkpoint that works; leave rs_model as None when all fail
# so the translation function below can fall back to transliteration.
for model_id in model_options:
    try:
        print(f"Trying model: {model_id}")
        rs_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        rs_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(device)
        print(f"✅ {model_id} loaded successfully.")
        break
    except Exception as e:
        print(f"❌ {model_id} failed: {e}")
        continue

if rs_model is None:
    print("❌ All RomanSetu models failed. Continuing with transliteration-based approach.")

def translate_with_romansetu(text, max_new_tokens=50):
    """Translate romanized Indian-language text to English (best effort).

    Uses the RomanSetu causal LM when one loaded; otherwise falls back to
    ITRANS -> Devanagari transliteration followed by IndicTrans2.
    NOTE(review): the fallback hard-codes Hindi ("hin_Deva") — confirm this
    is acceptable for non-Hindi romanized input.

    Args:
        text: Romanized input sentence.
        max_new_tokens: Generation budget for the RomanSetu path.

    Returns:
        The English translation, or the original `text` on any failure.
    """
    if rs_model is None:
        # Fallback: use enhanced transliteration + IndicTrans2
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate
        try:
            # Try to transliterate and then translate with IndicTrans2
            native_text = transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI)
            pre = ip.preprocess_batch([native_text], src_lang="hin_Deva", tgt_lang="eng_Latn")
            inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
            with torch.no_grad():
                out = model.generate(**inputs, num_beams=3, max_length=100)
            dec = tokenizer.batch_decode(out, skip_special_tokens=True)
            post = ip.postprocess_batch(dec, lang="hin_Deva")
            return post[0]
        except Exception:
            # Deliberate best-effort: never crash, return the input unchanged.
            return text

    try:
        prompt = f"Translate this romanized Indian text to English: {text}"
        inputs = rs_tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = rs_model.generate(
                inputs.input_ids,
                max_new_tokens=max_new_tokens,
                num_beams=2,
                temperature=0.7,
                do_sample=True,
                pad_token_id=rs_tokenizer.eos_token_id
            )

        # BUG FIX: generate() returns a 2-D (batch, sequence) tensor;
        # tokenizer.decode() expects a single sequence of token ids, so the
        # original `decode(outputs, ...)` raised/garbled — decode row 0.
        full_response = rs_tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Strip the echoed prompt; keep the continuation only.
        translation = full_response.replace(prompt, "").strip()
        return translation if translation and len(translation) > 2 else text

    except Exception as e:
        return text
175
+
176
+
177
# ================================================================
# = STEP 2.6: LOAD INDICXLIT FOR BETTER TRANSLITERATION (CORRECTED) =
# ================================================================

print("--- Installing and loading IndicXlit for better romanized text handling ---")

# Install IndicXlit (compatible with your transformers==4.40.2)
# NOTE(review): the actual `pip install` shell cell was stripped during the
# notebook export — the ai4bharat-transliteration package must already be
# installed for the import below to succeed.

from ai4bharat.transliteration import XlitEngine
import torch

try:
    # Load IndicXlit engines for different languages (based on official docs).
    # Each engine transliterates romanized text into one target language;
    # beam_width/rescore trade speed for quality. Loading ten engines up
    # front is memory-heavy — they are all kept resident for the app's life.
    xlit_engines = {
        "hindi": XlitEngine("hi", beam_width=4, rescore=True),
        "bengali": XlitEngine("bn", beam_width=4, rescore=True),
        "tamil": XlitEngine("ta", beam_width=4, rescore=True),
        "telugu": XlitEngine("te", beam_width=4, rescore=True),
        "gujarati": XlitEngine("gu", beam_width=4, rescore=True),
        "kannada": XlitEngine("kn", beam_width=4, rescore=True),
        "malayalam": XlitEngine("ml", beam_width=4, rescore=True),
        "punjabi": XlitEngine("pa", beam_width=4, rescore=True),
        "marathi": XlitEngine("mr", beam_width=4, rescore=True),
        "urdu": XlitEngine("ur", beam_width=4, rescore=True),
    }
    print("✅ Multiple IndicXlit engines loaded successfully.")

except Exception as e:
    # On any load failure fall back to an empty dict; the function below
    # detects this and uses rule-based transliteration instead.
    print(f"❌ Error loading IndicXlit: {e}")
    print("💡 Falling back to basic transliteration.")
    xlit_engines = {}
209
def enhanced_transliterate_with_xlit(text, target_lang):
    """Transliterate romanized *text* into the native script of *target_lang*.

    Prefers the neural IndicXlit engines loaded above; when no engine exists
    for the language (or none loaded at all), falls back to rule-based
    ITRANS transliteration. Returns the input unchanged if the engine fails.
    """
    lang_key = target_lang.lower()

    # --- Fallback path: no IndicXlit engine available for this language ---
    if not xlit_engines or lang_key not in xlit_engines:
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate
        script_map = {
            "hindi": sanscript.DEVANAGARI, "bengali": sanscript.BENGALI,
            "tamil": sanscript.TAMIL, "telugu": sanscript.TELUGU,
            "kannada": sanscript.KANNADA, "malayalam": sanscript.MALAYALAM,
            "gujarati": sanscript.GUJARATI, "punjabi": sanscript.GURMUKHI,
            "marathi": sanscript.DEVANAGARI, "urdu": 'urdu'
        }
        target_script = script_map.get(lang_key, sanscript.DEVANAGARI)
        return transliterate(text, sanscript.ITRANS, target_script)

    # --- IndicXlit path ---
    # The engine keys its result dicts by ISO-639 code; hoisted here so the
    # word and sentence branches share one table.
    iso_by_name = {"hindi": "hi", "bengali": "bn", "tamil": "ta", "telugu": "te",
                   "gujarati": "gu", "kannada": "kn", "malayalam": "ml",
                   "punjabi": "pa", "marathi": "mr", "urdu": "ur"}
    iso = iso_by_name.get(lang_key, "hi")

    try:
        engine = xlit_engines[lang_key]
        if ' ' not in text:
            # Single word: translit_word returns {iso: [top-k candidates]}.
            word_candidates = engine.translit_word(text, topk=1)
            return word_candidates.get(iso, [text])[0]
        # Whole sentence: translit_sentence returns {iso: transliteration}.
        sentence_result = engine.translit_sentence(text)
        return sentence_result.get(iso, text)
    except Exception as e:
        print(f"IndicXlit error for '{text}': {e}")
        # Fallback if IndicXlit fails
        return text

print("✅ Enhanced transliteration function defined.")
print("🎉 INDICXLIT SETUP COMPLETE.")
257
+
258
+
259
import pandas as pd
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# EXPANDED language mapping to handle misdetections.
# Maps an IndicLID language tag -> {display name, target Indic script for
# romanized->native transliteration, IndicTrans2 source-language code}.
# NOTE(review): closely-related languages (Maithili, Assamese, and the
# "additional" entries at the bottom) are deliberately coerced to
# Hindi/Bengali because IndicLID often confuses them — translations for
# those inputs are approximations by design.
LID_TO_TRANSLATE = {
    # Hindi variants
    "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},

    # Maithili (often confused with Hindi) - map to Hindi
    "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},

    # Bengali variants
    "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},

    # Assamese (often confused with Bengali) - map to Bengali
    "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},

    # Tamil variants
    # NOTE(review): "tam_Tamil" looks like a nonstandard tag (the BCP-47
    # script code is "Taml") — confirm IndicLID ever emits it.
    "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},

    # Telugu variants
    "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},

    # Kannada variants
    "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},

    # Malayalam variants
    "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},

    # Gujarati variants
    "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},

    # Punjabi variants
    "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},

    # Marathi variants
    "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},

    # Urdu variants ('urdu' is a sanscript scheme name, not a constant)
    "urd_Arab": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
    "urd_Latn": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},

    # Additional commonly misdetected languages
    "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Sindhi → Hindi
    "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Nepali → Hindi
    "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Konkani → Hindi
    "gom_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Goan Konkani → Hindi
    "brx_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Bodo → Hindi
}
321
+
322
def enhanced_transliterate_robust(text, target_script):
    """
    Convert romanized Indian-language text to *target_script* via ITRANS.

    Args:
        text: Romanized (Latin-script) input.
        target_script: An indic_transliteration scheme constant/name
            (e.g. sanscript.DEVANAGARI).

    Returns:
        The transliterated string, or the original `text` on any failure
        (deliberate best-effort — callers rely on never raising).
    """
    try:
        # Preprocess text for better transliteration
        cleaned_text = text.lower().strip()

        # Map informal long-vowel romanizations onto ITRANS conventions.
        # FIX: dropped the identity entries the original carried
        # ('kh'->'kh', 'ch'->'ch', ...) — replacing a string with itself is
        # dead code, and the digraph consonants are already valid ITRANS.
        replacements = {
            'aa': 'A', 'ee': 'I', 'oo': 'U', 'ou': 'au'
        }

        for old, new in replacements.items():
            cleaned_text = cleaned_text.replace(old, new)

        # Transliterate using the indic_transliteration library.
        result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
        return result if result else text

    except Exception as e:
        print(f"Transliteration error: {e}")
        return text
347
+
348
def detect_and_translate_robust(texts, batch_size=64):
    """
    Detect the language of each input and translate it to English.

    Pipeline per text: IndicLID language ID -> (if romanized) rule-based
    transliteration to native script -> IndicTrans2 translation.

    Args:
        texts: list[str] of input sentences (native or romanized script).
        batch_size: batch size passed to IndicLID's batch_predict.

    Returns:
        pandas.DataFrame with one row per input: original_text,
        detected_lang, script_type, confidence, translation_method,
        english_translation.

    Relies on module globals: lid, LID_TO_TRANSLATE,
    enhanced_transliterate_robust, ip, tokenizer, model, device.
    """
    results = []
    preds = lid.batch_predict(texts, batch_size)

    for item in preds:
        # batch_predict's item shape varies by IndicLID version: either a
        # dict or a (text, lang, score, model) tuple — handle both.
        if isinstance(item, dict):
            text = item.get("text", "")
            lang_code = item.get("lang", item.get("pred_lang", ""))
            score = float(item.get("score", 0.0))
            model_name = item.get("model", "")
        else:
            text, lang_code, score, model_name = item

        # IndicLID tags romanized input with a "_Latn" script suffix.
        is_romanized = lang_code.endswith("_Latn")

        if lang_code not in LID_TO_TRANSLATE:
            translation = f"Language '{lang_code}' not supported for translation"
            method = "Unsupported"
        else:
            try:
                lang_info = LID_TO_TRANSLATE[lang_code]
                src_code = lang_info["it_code"]

                if is_romanized:
                    # Use enhanced transliteration to recover native script
                    # before translating.
                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
                    method = f"Enhanced Transliteration + IndicTrans2 (detected as {lang_code})"
                    print(f"Enhanced: '{text}' → '{native_text}' (detected: {lang_code})")
                else:
                    native_text = text
                    method = f"IndicTrans2 (detected as {lang_code})"

                # Translate with IndicTrans2 (single-item batch).
                pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn")
                inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
                with torch.no_grad():
                    out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
                dec = tokenizer.batch_decode(out, skip_special_tokens=True)
                post = ip.postprocess_batch(dec, lang=src_code)
                translation = post[0]

            except Exception as e:
                # Record the failure in the output row instead of aborting
                # the whole batch.
                translation = f"Translation error: {str(e)}"
                method = "Error"

        results.append({
            "original_text": text,
            "detected_lang": lang_code,
            "script_type": "Romanized" if is_romanized else "Native",
            "confidence": f"{score:.3f}",
            "translation_method": method,
            "english_translation": translation
        })

    return pd.DataFrame(results)
406
+
407
print("✅ Robust translation function with expanded language mapping defined")

# Test with the same samples
sample_texts = [
    "यहाँ कितने लोग हैं?",
    "tum kaha ho",
    "aaj mausam suhana hai",
    "aap kaise hain",
    "আমি ভালো আছি।",
    "ami bhalo achi",
    "mera naam rahul hai",
    "main office jaa raha hun"
]

print(f"🔍 Testing robust approach with expanded language mapping...")
df_results = detect_and_translate_robust(sample_texts, batch_size=16)
# BUG FIX: `display` is an IPython/Jupyter-only builtin and raises NameError
# when this file runs as a plain script (e.g. a Gradio Space); fall back to
# printing the full frame.
try:
    display(df_results)
except NameError:
    print(df_results.to_string())
424
+
425
+
426
# ================================================================
# = COMPLETE TEST CODE FOR ALL 22 INDIAN LANGUAGES =
# ================================================================

import pandas as pd
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Official 22 Indian languages sample sentences (native + romanized).
# Each value is a (native-script, romanized) pair meaning roughly
# "How are you?" in that language.
sample_sentences = {
    "Assamese": ("আপুনি কেনেকৈ আছেন?", "apuni kenekoi asen?"),
    "Bengali": ("তুমি কেমন আছো?", "tumi kemon acho?"),
    "Bodo": ("नांगनि फाथै खौ?", "nangni phathai kho?"),
    "Dogri": ("तुसीं केहे हो?", "tusi kehe ho?"),
    "Gujarati": ("તમે કેમ છો?", "tame kem cho?"),
    "Hindi": ("तुम कैसे हो?", "tum kaise ho?"),
    "Kannada": ("ನೀವು ಹೇಗಿದ್ದೀರಾ?", "neevu hegiddira?"),
    "Kashmiri": ("तुस की छै?", "tus ki chhai?"),
    "Konkani": ("तुम कशें आसा?", "tum kashen asa?"),
    "Maithili": ("अहाँ कथी छी?", "ahaan kathi chhi?"),
    "Malayalam": ("സുഖമായിരോ?", "sukhamaayiro?"),
    "Manipuri": ("नमस्कार, नखोंगबा तौ?", "namaskaar, nakhongba tau?"),
    "Marathi": ("तू कसा आहेस?", "tu kasa ahes?"),
    "Nepali": ("तिमी कस्तो छौ?", "timi kasto chau?"),
    "Odia": ("ତୁମେ କେମିତି ଅଛ?", "tume kemiti achha?"),
    "Punjabi": ("ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ?", "tusi kiven ho?"),
    "Sanskrit": ("भवतः कथम् अस्ति?", "bhavatah katham asti?"),
    "Santali": ("ᱥᱟᱱᱛᱟᱲᱤ ᱠᱚᱱᱛᱮᱞᱤ ᱟᱹᱲᱤ?", "santalii konteli adii?"),
    "Sindhi": ("توهان ڪيئن آهيو؟", "tohan kayn aahiyo?"),
    "Tamil": ("நீங்கள் எப்படி இருக்கிறீர்கள்?", "neenga epdi irukeenga?"),
    "Telugu": ("మీరు ఎలా ఉన్నారు?", "meeru ela unnaru?"),
    "Urdu": ("آپ کیسے ہیں؟", "aap kaise hain?")
}

# Expanded language mapping (covers common misdetections).
# NOTE(review): this re-definition shadows the LID_TO_TRANSLATE declared
# earlier in the file — the two copies should be consolidated; keep them in
# sync until then. Entries coerce frequently-misdetected languages onto
# Hindi/Bengali on purpose.
LID_TO_TRANSLATE = {
    # Hindi variants
    "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Maithili→Hindi
    "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Nepali→Hindi
    "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Sindhi→Hindi
    "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Konkani→Hindi
    "brx_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Bodo→Hindi

    # Bengali variants
    "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},  # Assamese→Bengali
    "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},

    # Tamil variants
    "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},

    # Telugu variants
    "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},

    # Kannada variants
    "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},

    # Malayalam variants
    "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},

    # Gujarati variants
    "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},

    # Punjabi variants
    "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},

    # Marathi variants
    "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},

    # Urdu variants
    "urd_Arab": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
    "urd_Latn": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
}
511
+
512
def enhanced_transliterate_robust(text, target_script):
    """Romanized -> native-script conversion via ITRANS (best effort).

    Duplicate of the earlier definition (notebook export); returns the
    original `text` unchanged on any failure instead of raising.
    """
    try:
        cleaned_text = text.lower().strip()
        # FIX: removed the identity mappings the original carried
        # ('kh'->'kh', 'ch'->'ch', ...) — they were dead code; only the
        # long-vowel normalizations below change anything.
        replacements = {
            'aa': 'A', 'ee': 'I', 'oo': 'U', 'ou': 'au'
        }
        for old, new in replacements.items():
            cleaned_text = cleaned_text.replace(old, new)
        result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
        return result if result else text
    except Exception as e:
        print(f"Transliteration error: {e}")
        return text
528
+
529
def test_all_22_languages(texts, batch_size=32):
    """
    Run the full detect -> transliterate -> translate pipeline over *texts*.

    Same pipeline as detect_and_translate_robust (IndicLID -> optional
    transliteration -> IndicTrans2), with a slightly different result schema
    used for the 22-language evaluation below.

    Args:
        texts: list[str] of native-script and romanized sentences.
        batch_size: batch size passed to IndicLID's batch_predict.

    Returns:
        pandas.DataFrame with columns: language (truncated preview of the
        text), original_text, detected_lang, script_type, confidence,
        method, english_translation.

    Relies on module globals: lid, LID_TO_TRANSLATE,
    enhanced_transliterate_robust, ip, tokenizer, model, device.
    """
    results = []
    preds = lid.batch_predict(texts, batch_size)

    for item in preds:
        # Prediction item shape varies by IndicLID version: dict or tuple.
        if isinstance(item, dict):
            text = item.get("text", "")
            lang_code = item.get("lang", item.get("pred_lang", ""))
            score = float(item.get("score", 0.0))
            model_name = item.get("model", "")
        else:
            text, lang_code, score, model_name = item

        # "_Latn" suffix marks romanized input.
        is_romanized = lang_code.endswith("_Latn")

        if lang_code not in LID_TO_TRANSLATE:
            translation = f"Language '{lang_code}' not supported"
            method = "Unsupported"
        else:
            try:
                lang_info = LID_TO_TRANSLATE[lang_code]
                src_code = lang_info["it_code"]

                if is_romanized:
                    # Recover native script before translating.
                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
                    method = f"Transliteration+IndicTrans2 (detected: {lang_code})"
                    print(f"Romanized: '{text}' → '{native_text}'")
                else:
                    native_text = text
                    method = f"IndicTrans2 (detected: {lang_code})"

                # Translate with IndicTrans2 (single-item batch).
                pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn")
                inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
                with torch.no_grad():
                    out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
                dec = tokenizer.batch_decode(out, skip_special_tokens=True)
                post = ip.postprocess_batch(dec, lang=src_code)
                translation = post[0]

            except Exception as e:
                # Record per-row failure; never abort the whole evaluation.
                translation = f"Translation error: {str(e)}"
                method = "Error"

        results.append({
            # Truncated preview used as a row label in the results table.
            "language": text[:20] + "..." if len(text) > 20 else text,
            "original_text": text,
            "detected_lang": lang_code,
            "script_type": "Romanized" if is_romanized else "Native",
            "confidence": f"{score:.3f}",
            "method": method,
            "english_translation": translation
        })

    return pd.DataFrame(results)
585
+
586
# Create test dataset with all 44 samples (22 native + 22 romanized)
print("🔍 Creating test dataset for all 22 official Indian languages...")
all_test_texts = []
for lang, (native, roman) in sample_sentences.items():
    all_test_texts.append(native)
    all_test_texts.append(roman)

print(f"📊 Testing {len(all_test_texts)} samples ({len(sample_sentences)} languages × 2 scripts)...")

# Run the complete test
df_results = test_all_22_languages(all_test_texts, batch_size=32)

# Display results
print("\n🎯 COMPLETE TEST RESULTS:")
# BUG FIX: `display` only exists inside IPython/Jupyter; running this file
# as a plain script raised NameError here. Fall back to printing the frame.
try:
    display(df_results)
except NameError:
    print(df_results.to_string())

# Summary statistics
print(f"\n📈 SUMMARY STATISTICS:")
print(f"Total samples tested: {len(df_results)}")
print(f"Languages detected: {df_results['detected_lang'].nunique()}")
print(f"Native script samples: {len(df_results[df_results['script_type'] == 'Native'])}")
print(f"Romanized samples: {len(df_results[df_results['script_type'] == 'Romanized'])}")
print(f"Successfully translated: {len(df_results[~df_results['english_translation'].str.contains('error|not supported', case=False)])}")
609
+
610
+
611
import pandas as pd

def detailed_translation_summary(df_results):
    """
    Generate a comprehensive summary of translation results.

    Args:
        df_results: DataFrame with columns 'original_text', 'detected_lang',
            'script_type' ('Native'/'Romanized'), 'confidence' (numeric
            string) and 'english_translation'. A boolean
            'successful_translation' column is added to it in place.

    Returns:
        Tuple (lang_summary, script_summary, error_df):
        per-detected-language stats, per-script stats, and the rows whose
        translation failed.
    """
    # Flag successful translations: anything whose output does not look like
    # an error / unsupported-language message.
    df_results['successful_translation'] = ~df_results['english_translation'].str.contains('error|not supported', case=False, na=False)

    print("\n=========== OVERALL SUMMARY ===========")
    print(f"Total samples tested: {len(df_results)}")
    print(f"Languages detected: {df_results['detected_lang'].nunique()}")
    print(f"Native script samples: {df_results[df_results['script_type'] == 'Native'].shape[0]}")
    # BUG FIX: the original used `.shape` (no index) here, printing the whole
    # (rows, cols) tuple instead of the row count used on the line above.
    print(f"Romanized samples: {df_results[df_results['script_type'] == 'Romanized'].shape[0]}")
    print(f"Successfully translated: {df_results['successful_translation'].sum()}")

    overall_success_rate = (df_results['successful_translation'].sum() / len(df_results) * 100)
    print(f"Overall success rate: {overall_success_rate:.1f}%")

    print("\n=========== DETAILED LANGUAGE BREAKDOWN ===========")
    # Per-language analysis via named aggregation.
    lang_summary = df_results.groupby('detected_lang').agg(
        total_samples=('original_text', 'count'),
        native_count=('script_type', lambda x: (x == 'Native').sum()),
        romanized_count=('script_type', lambda x: (x == 'Romanized').sum()),
        # 'confidence' is stored as a string — coerce before averaging.
        mean_confidence=('confidence', lambda x: pd.to_numeric(x, errors='coerce').mean()),
        success=('successful_translation', 'sum'),
        error_count=('successful_translation', lambda x: (~x).sum())
    ).reset_index().sort_values('total_samples', ascending=False)

    lang_summary['success_rate'] = (lang_summary['success'] / lang_summary['total_samples'] * 100).round(1)
    print(lang_summary)

    print("\n=========== TOP PERFORMING LANGUAGES ===========")
    top_performers = lang_summary[lang_summary['success_rate'] >= 90].sort_values('success_rate', ascending=False)
    if len(top_performers) > 0:
        print(top_performers[['detected_lang', 'total_samples', 'success_rate']])
    else:
        print("No languages with 90%+ success rate")

    print("\n=========== CHALLENGING LANGUAGES ===========")
    challenging = lang_summary[lang_summary['success_rate'] < 50].sort_values('success_rate')
    if len(challenging) > 0:
        print(challenging[['detected_lang', 'total_samples', 'success_rate']])
    else:
        print("No languages with <50% success rate")

    print("\n=========== ERROR ANALYSIS ===========")
    error_df = df_results[~df_results['successful_translation']]
    print(f"Total errors: {len(error_df)}")
    if len(error_df) > 0:
        print("\nError samples:")
        print(error_df[['original_text', 'detected_lang', 'script_type', 'confidence', 'english_translation']])
    else:
        print("No errors found!")

    print("\n=========== SUCCESS BREAKDOWN BY SCRIPT ===========")
    script_summary = df_results.groupby('script_type').agg(
        total_samples=('original_text', 'count'),
        successful=('successful_translation', 'sum'),
        success_rate=('successful_translation', lambda x: x.mean() * 100)
    ).round(1)
    print(script_summary)

    print("\n=========== DETECTION CONFIDENCE ANALYSIS ===========")
    confidence_summary = lang_summary[['detected_lang', 'mean_confidence']].sort_values('mean_confidence', ascending=False)
    print("Top 10 most confident detections:")
    print(confidence_summary.head(10))

    return lang_summary, script_summary, error_df
681
+
682
# ===== HOW TO USE =====
print("✅ Detailed summary function defined")
print("\n📋 To run on your test results:")
print("   lang_summary, script_summary, error_df = detailed_translation_summary(df_results)")
print("   display(lang_summary)")
print("   display(error_df)")


lang_summary, script_summary, error_df = detailed_translation_summary(df_results)


# BUG FIX: `display` is an IPython/Jupyter-only builtin and raises NameError
# when this file runs as a plain script; fall back to printing the frames.
try:
    display(lang_summary)
    display(error_df)
except NameError:
    print(lang_summary.to_string())
    print(error_df.to_string())
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ pandas
3
+ sentencepiece
4
+ torch
5
+ transformers