|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

# --- Notebook setup scaffolding (status prints only) ---
# NOTE(review): the actual install / git-clone / wget / unzip shell commands
# that belong between these prints are not present in this flattened file;
# presumably they were "!" shell cells in the original notebook — confirm
# before running this as a plain script.

print("--- 1. Installing All Libraries ---")

print("✅ Libraries installed.")

print("\n--- 2. Cloning IndicLID Repository ---")

print("✅ Repository cloned.")

print("\n--- 3. Downloading and Unzipping IndicLID Models ---")

print("✅ Download commands executed. Unzipping now...")

print("✅ Unzip commands executed.")

print("\n🎉🎉🎉 SETUP COMPLETE. You can now proceed to Step 2. 🎉🎉🎉")
|
|
|
|
|
|
|
|
import shutil
import os

# Relocate the installed `transformers` package into the IndicLID inference
# tree so it sits next to the ai4bharat code.
# NOTE(review): this removes `transformers` from site-packages, which may
# break later `from transformers import ...` statements in this file unless
# the package is reinstalled or re-resolved via sys.path — confirm the
# intended layout against the IndicLID setup instructions.
source = "/usr/local/lib/python3.12/dist-packages/transformers"

destination = "/content/IndicLID/Inference/ai4bharat/"

os.makedirs(destination, exist_ok=True)

# Guard so a re-run (after the folder was already moved) does not raise
# FileNotFoundError from shutil.move.
if os.path.isdir(source):
    moved_path = shutil.move(source, destination)
    print(f"Folder moved to: {moved_path}")
else:
    print(f"Source not found (already moved?): {source}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import sys
import torch

print("--- Applying your original add_safe_globals fix... ---")

# Make the cloned IndicLID inference package (ai4bharat.IndicLID) importable;
# the membership check keeps re-runs from appending duplicates.
if "/content/IndicLID/Inference" not in sys.path:
    sys.path.append("/content/IndicLID/Inference")
|
|
|
|
|
# Import every class that needs to be allow-listed for torch's safe
# (weights_only) unpickling — presumably the IndicLID checkpoint pickles
# whole BERT module objects rather than plain state dicts; confirm against
# the checkpoint if this list ever needs extending.
from transformers.models.bert.modeling_bert import (
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput
)
from transformers.models.bert.configuration_bert import BertConfig
import torch.nn as nn
from torch.nn.modules.sparse import Embedding
from torch.nn.modules.container import ModuleList
from torch.nn.modules.linear import Linear
from torch.nn.modules.normalization import LayerNorm
from torch.nn.modules.dropout import Dropout

# Register the full BERT module tree plus the bare torch.nn layer classes as
# safe globals so torch.load can deserialize the checkpoint.
torch.serialization.add_safe_globals([
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput, BertConfig,
    Embedding, ModuleList, Linear, LayerNorm, Dropout,
])

print("✅ Comprehensive safe globals added successfully.")
|
|
|
|
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IndicTransToolkit.processor import IndicProcessor
from ai4bharat.IndicLID import IndicLID

print("--- Loading all models into memory... ---")
# Prefer GPU when available; every model below is moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Language identification model. NOTE(review): the two thresholds presumably
# gate routing between the native-script and romanized classifiers — confirm
# exact semantics against the IndicLID documentation.
lid = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
print("✅ IndicLID model loaded successfully.")

# IndicTrans2 Indic→English 1B translation model (custom code on the Hub,
# hence trust_remote_code=True) plus its pre/post-processor.
MODEL_ID = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, trust_remote_code=True).to(device)
ip = IndicProcessor(inference=True)
print("✅ IndicTrans2 1B model loaded.")

print("🎉 ALL MODELS ARE LOADED. Proceed to direct batch prediction tests.")
|
|
|
|
|
|
|
|
import sys

# Diagnostic: show the module search path used to resolve `transformers`.
print(sys.path)

# NOTE(review): the original line here was `pip show transformers`, which is
# IPython/Colab "automagic" shell syntax and a SyntaxError in a plain Python
# file. Run the equivalent through the current interpreter's pip module so
# this works both in notebooks and as a script.
import subprocess

subprocess.run([sys.executable, "-m", "pip", "show", "transformers"], check=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

print("--- Loading RomanSetu model compatible with transformers 4.40.2... ---")

# Candidate RomanSetu causal-LM checkpoints, tried in order until one loads.
model_options = [
    "ai4bharat/romansetu-cpt-roman-100m",
    "ai4bharat/romansetu-cpt-roman-200m"
]

# Populated by the loop below; both stay None if every candidate fails,
# which translate_with_romansetu checks to pick its fallback path.
rs_model = None
rs_tokenizer = None

for model_id in model_options:
    try:
        print(f"Trying model: {model_id}")
        rs_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        # fp16 halves memory; `device` is the global set during model loading.
        rs_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(device)
        print(f"✅ {model_id} loaded successfully.")
        break
    except Exception as e:
        # Any load failure (missing repo, OOM, version mismatch) moves on to
        # the next candidate rather than aborting the notebook.
        print(f"❌ {model_id} failed: {e}")
        continue

if rs_model is None:
    print("❌ All RomanSetu models failed. Continuing with transliteration-based approach.")
|
|
def translate_with_romansetu(text, max_new_tokens=50):
    """Translate romanized Indian text to English.

    Uses the RomanSetu causal LM when one was loaded (global ``rs_model``);
    otherwise falls back to ITRANS→Devanagari transliteration followed by
    IndicTrans2 (globals ``ip``, ``tokenizer``, ``model``, ``device``).

    Args:
        text: Romanized input string.
        max_new_tokens: Generation budget for the RomanSetu path.

    Returns:
        The English translation, or ``text`` unchanged on any failure.
    """
    if rs_model is None:
        # Fallback path: transliterate to Devanagari, then run IndicTrans2
        # as if the input were Hindi.
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate
        try:
            native_text = transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI)
            pre = ip.preprocess_batch([native_text], src_lang="hin_Deva", tgt_lang="eng_Latn")
            inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
            with torch.no_grad():
                out = model.generate(**inputs, num_beams=3, max_length=100)
            dec = tokenizer.batch_decode(out, skip_special_tokens=True)
            post = ip.postprocess_batch(dec, lang="hin_Deva")
            return post[0]
        except Exception:
            # FIX: was a bare `except:` which also swallowed SystemExit /
            # KeyboardInterrupt; best-effort behavior (return input) is kept.
            return text

    try:
        prompt = f"Translate this romanized Indian text to English: {text}"
        inputs = rs_tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = rs_model.generate(
                inputs.input_ids,
                max_new_tokens=max_new_tokens,
                num_beams=2,
                temperature=0.7,
                do_sample=True,
                pad_token_id=rs_tokenizer.eos_token_id
            )

        # BUG FIX: generate() returns a 2-D (batch, seq) tensor; decode the
        # first sequence — `decode(outputs, ...)` on the whole batch tensor
        # is invalid (batch_decode is the batch API).
        full_response = rs_tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Strip the echoed prompt to keep only the generated continuation.
        translation = full_response.replace(prompt, "").strip()
        # Very short outputs are treated as generation failures.
        return translation if translation and len(translation) > 2 else text

    except Exception as e:
        return text
|
|
|
|
|
# Status markers for the notebook cell above.
print("✅ RomanSetu/fallback translation function defined.")

print("🎉 SETUP COMPLETE with fallback mechanism.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print("--- Installing and loading IndicXlit for better romanized text handling ---")

from ai4bharat.transliteration import XlitEngine
import torch

try:
    # One roman→native transliteration engine per supported language.
    # NOTE(review): constructing ten engines eagerly is heavy on time and
    # memory — consider lazy construction if startup cost matters.
    xlit_engines = {
        "hindi": XlitEngine("hi", beam_width=4, rescore=True),
        "bengali": XlitEngine("bn", beam_width=4, rescore=True),
        "tamil": XlitEngine("ta", beam_width=4, rescore=True),
        "telugu": XlitEngine("te", beam_width=4, rescore=True),
        "gujarati": XlitEngine("gu", beam_width=4, rescore=True),
        "kannada": XlitEngine("kn", beam_width=4, rescore=True),
        "malayalam": XlitEngine("ml", beam_width=4, rescore=True),
        "punjabi": XlitEngine("pa", beam_width=4, rescore=True),
        "marathi": XlitEngine("mr", beam_width=4, rescore=True),
        "urdu": XlitEngine("ur", beam_width=4, rescore=True),
    }
    print("✅ Multiple IndicXlit engines loaded successfully.")

except Exception as e:
    print(f"❌ Error loading IndicXlit: {e}")
    print("💡 Falling back to basic transliteration.")
    # Empty dict signals enhanced_transliterate_with_xlit to use the
    # indic_transliteration fallback path instead.
    xlit_engines = {}
|
|
|
|
|
def enhanced_transliterate_with_xlit(text, target_lang):
    """
    Enhanced transliteration using IndicXlit (based on official API).

    Converts romanized ``text`` into the native script of ``target_lang``
    (English language name, e.g. "hindi"). Uses the preloaded global
    ``xlit_engines``; falls back to indic_transliteration's ITRANS mapping
    when no engine is available. Returns ``text`` unchanged on any error.
    """
    lang_key = target_lang.lower()

    # ISO 639-1 codes used to key both XlitEngine results and the fallback.
    # FIX: this literal was previously duplicated verbatim in the word and
    # sentence branches below — hoisted to a single definition.
    lang_codes = {"hindi": "hi", "bengali": "bn", "tamil": "ta", "telugu": "te",
                  "gujarati": "gu", "kannada": "kn", "malayalam": "ml",
                  "punjabi": "pa", "marathi": "mr", "urdu": "ur"}

    if not xlit_engines or lang_key not in xlit_engines:
        # Fallback: plain rule-based ITRANS transliteration.
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate
        script_map = {
            "hindi": sanscript.DEVANAGARI, "bengali": sanscript.BENGALI,
            "tamil": sanscript.TAMIL, "telugu": sanscript.TELUGU,
            "kannada": sanscript.KANNADA, "malayalam": sanscript.MALAYALAM,
            "gujarati": sanscript.GUJARATI, "punjabi": sanscript.GURMUKHI,
            "marathi": sanscript.DEVANAGARI, "urdu": 'urdu'
        }
        return transliterate(text, sanscript.ITRANS, script_map.get(lang_key, sanscript.DEVANAGARI))

    try:
        engine = xlit_engines[lang_key]
        lang_code = lang_codes.get(lang_key, "hi")

        if ' ' in text:
            # translit_sentence returns {lang_code: sentence}.
            result = engine.translit_sentence(text)
            return result.get(lang_code, text)
        else:
            # translit_word returns {lang_code: [candidates]}; guard against
            # an empty candidate list before taking the top hit.
            result = engine.translit_word(text, topk=1)
            candidates = result.get(lang_code, [text])
            return candidates[0] if candidates else text

    except Exception as e:
        print(f"IndicXlit error for '{text}': {e}")

    # Reached only when the engine path raised — best-effort passthrough.
    return text
|
|
|
|
|
# Status markers for the notebook cell above.
print("✅ Enhanced transliteration function defined.")

print("🎉 INDICXLIT SETUP COMPLETE.")
|
|
|
|
|
|
|
|
import pandas as pd |
|
|
from indic_transliteration import sanscript |
|
|
from indic_transliteration.sanscript import transliterate |
|
|
|
|
|
|
|
|
# Maps IndicLID prediction tags → translation config:
#   name:    human-readable language name
#   script:  indic_transliteration target script for romanized input
#   it_code: IndicTrans2 source-language code
# Low-resource tags (Maithili, Assamese, Sindhi, Nepali, Konkani, Bodo) are
# deliberately approximated by a closely related major language.
LID_TO_TRANSLATE = {

    # Hindi (native + romanized)
    "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},

    # Maithili → approximated as Hindi
    "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},

    # Bengali
    "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},

    # Assamese → approximated as Bengali
    "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},

    # Tamil. NOTE(review): "tam_Tamil" is a non-standard tag kept alongside
    # the standard "tam_Taml" — presumably defensive; confirm which tag
    # IndicLID actually emits.
    "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},

    # Telugu
    "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},

    # Kannada
    "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},

    # Malayalam
    "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},

    # Gujarati
    "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},

    # Punjabi
    "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},

    # Marathi
    "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},

    # Urdu ('urdu' is not a sanscript constant — passed through as a plain
    # string; handled specially downstream or expected to fail gracefully)
    "urd_Arab": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
    "urd_Latn": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},

    # Other romanized low-resource tags → approximated as Hindi
    "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "gom_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "brx_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
}
|
|
|
|
|
def enhanced_transliterate_robust(text, target_script):
    """
    Enhanced transliteration with better romanization handling.

    Normalizes common informal long-vowel spellings toward ITRANS, then
    converts via indic_transliteration. Returns ``text`` unchanged on any
    failure (including an unknown ``target_script``).
    """
    try:
        cleaned_text = text.lower().strip()

        # Map informal long-vowel spellings to their ITRANS forms.
        # FIX: the original table also "replaced" kh/ch/th/ph/bh/dh/gh/jh
        # with themselves — pure no-ops, removed. Behavior is unchanged.
        replacements = {
            'aa': 'A', 'ee': 'I', 'oo': 'U', 'ou': 'au'
        }

        for old, new in replacements.items():
            cleaned_text = cleaned_text.replace(old, new)

        result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
        return result if result else text

    except Exception as e:
        print(f"Transliteration error: {e}")
        return text
|
|
|
|
|
def detect_and_translate_robust(texts, batch_size=64):
    """
    Robust detection and translation with expanded language mapping.

    Pipeline per input string:
      1. IndicLID predicts a language+script tag (e.g. "hin_Latn").
      2. Romanized inputs ("*_Latn") are converted to native script via
         enhanced_transliterate_robust before translation.
      3. IndicTrans2 translates the native-script text to English.

    Relies on module-level globals: lid, ip, tokenizer, model, device,
    LID_TO_TRANSLATE, enhanced_transliterate_robust, pd, torch.

    Args:
        texts: list of input strings.
        batch_size: batch size passed to IndicLID's batch_predict.

    Returns:
        pandas.DataFrame with one row per prediction.
    """
    results = []
    preds = lid.batch_predict(texts, batch_size)

    for item in preds:
        # IndicLID may return dicts or (text, lang, score, model) tuples
        # depending on version — handle both shapes.
        if isinstance(item, dict):
            text = item.get("text", "")
            lang_code = item.get("lang", item.get("pred_lang", ""))
            score = float(item.get("score", 0.0))
            model_name = item.get("model", "")
        else:
            text, lang_code, score, model_name = item

        # "_Latn" suffix marks romanized (Latin-script) input.
        is_romanized = lang_code.endswith("_Latn")

        if lang_code not in LID_TO_TRANSLATE:
            translation = f"Language '{lang_code}' not supported for translation"
            method = "Unsupported"
        else:
            try:
                lang_info = LID_TO_TRANSLATE[lang_code]
                src_code = lang_info["it_code"]

                if is_romanized:
                    # Romanized → native script first.
                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
                    method = f"Enhanced Transliteration + IndicTrans2 (detected as {lang_code})"
                    print(f"Enhanced: '{text}' → '{native_text}' (detected: {lang_code})")
                else:
                    native_text = text
                    method = f"IndicTrans2 (detected as {lang_code})"

                # IndicTrans2: preprocess → generate → decode → postprocess.
                pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn")
                inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
                with torch.no_grad():
                    out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
                dec = tokenizer.batch_decode(out, skip_special_tokens=True)
                post = ip.postprocess_batch(dec, lang=src_code)
                translation = post[0]

            except Exception as e:
                # Per-item failures are recorded in the result row rather
                # than aborting the whole batch.
                translation = f"Translation error: {str(e)}"
                method = "Error"

        results.append({
            "original_text": text,
            "detected_lang": lang_code,
            "script_type": "Romanized" if is_romanized else "Native",
            "confidence": f"{score:.3f}",
            "translation_method": method,
            "english_translation": translation
        })

    return pd.DataFrame(results)


print("✅ Robust translation function with expanded language mapping defined")
|
|
|
|
|
|
|
|
# Mixed native-script and romanized test inputs (Hindi and Bengali).
sample_texts = [
    "यहाँ कितने लोग हैं?",
    "tum kaha ho",
    "aaj mausam suhana hai",
    "aap kaise hain",
    "আমি ভালো আছি।",
    "ami bhalo achi",
    "mera naam rahul hai",
    "main office jaa raha hun"
]

print(f"🔍 Testing robust approach with expanded language mapping...")
df_results = detect_and_translate_robust(sample_texts, batch_size=16)
# NOTE(review): display() is an IPython/Jupyter builtin — use
# print(df_results) when running as a plain script.
display(df_results)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd |
|
|
from indic_transliteration import sanscript |
|
|
from indic_transliteration.sanscript import transliterate |
|
|
|
|
|
|
|
|
# (native-script, romanized) sentence pairs for the 22 scheduled Indian
# languages — all roughly "How are you?".
# NOTE(review): several native samples look dubious (e.g. the Bodo, Dogri,
# Kashmiri, Manipuri and Santali lines read like Devanagari/Hindi-flavored or
# garbled text rather than the actual language/script) — verify with native
# speakers before treating detection failures on them as model errors.
sample_sentences = {
    "Assamese": ("আপুনি কেনেকৈ আছেন?", "apuni kenekoi asen?"),
    "Bengali": ("তুমি কেমন আছো?", "tumi kemon acho?"),
    "Bodo": ("नांगनि फाथै खौ?", "nangni phathai kho?"),
    "Dogri": ("तुसीं केहे हो?", "tusi kehe ho?"),
    "Gujarati": ("તમે કેમ છો?", "tame kem cho?"),
    "Hindi": ("तुम कैसे हो?", "tum kaise ho?"),
    "Kannada": ("ನೀವು ಹೇಗಿದ್ದೀರಾ?", "neevu hegiddira?"),
    "Kashmiri": ("तुस की छै?", "tus ki chhai?"),
    "Konkani": ("तुम कशें आसा?", "tum kashen asa?"),
    "Maithili": ("अहाँ कथी छी?", "ahaan kathi chhi?"),
    "Malayalam": ("സുഖമായിരോ?", "sukhamaayiro?"),
    "Manipuri": ("नमस्कार, नखोंगबा तौ?", "namaskaar, nakhongba tau?"),
    "Marathi": ("तू कसा आहेस?", "tu kasa ahes?"),
    "Nepali": ("तिमी कस्तो छौ?", "timi kasto chau?"),
    "Odia": ("ତୁମେ କେମିତି ଅଛ?", "tume kemiti achha?"),
    "Punjabi": ("ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ?", "tusi kiven ho?"),
    "Sanskrit": ("भवतः कथम् अस्ति?", "bhavatah katham asti?"),
    "Santali": ("ᱥᱟᱱᱛᱟᱲᱤ ᱠᱚᱱᱛᱮᱞᱤ ᱟᱹᱲᱤ?", "santalii konteli adii?"),
    "Sindhi": ("توهان ڪيئن آهيو؟", "tohan kayn aahiyo?"),
    "Tamil": ("நீங்கள் எப்படி இருக்கிறீர்கள்?", "neenga epdi irukeenga?"),
    "Telugu": ("మీరు ఎలా ఉన్నారు?", "meeru ela unnaru?"),
    "Urdu": ("آپ کیسے ہیں؟", "aap kaise hain?")
}
|
|
|
|
|
|
|
|
# Maps IndicLID prediction tags → translation config (script: transliteration
# target; it_code: IndicTrans2 source code).
# NOTE: this re-assignment shadows the LID_TO_TRANSLATE defined earlier in
# the file (normal in a notebook, where each cell re-runs). Low-resource tags
# are deliberately approximated by a closely related major language.
LID_TO_TRANSLATE = {

    # Hindi, plus tags approximated as Hindi (Maithili, Nepali, Sindhi,
    # Konkani, Bodo — romanized variants).
    "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "brx_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},

    # Bengali, plus Assamese approximated as Bengali.
    "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},

    # Tamil. NOTE(review): "tam_Tamil" is a non-standard tag kept alongside
    # the standard "tam_Taml" — confirm which tag IndicLID actually emits.
    "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},

    # Telugu
    "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},

    # Kannada
    "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},

    # Malayalam
    "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},

    # Gujarati
    "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},

    # Punjabi
    "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},

    # Marathi
    "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},

    # Urdu ('urdu' is not a sanscript constant — passed as a plain string).
    "urd_Arab": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
    "urd_Latn": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
}
|
|
|
|
|
def enhanced_transliterate_robust(text, target_script):
    """Enhanced transliteration with better romanization handling.

    Redefinition of the earlier helper (notebook cell re-run). Normalizes
    informal long-vowel spellings toward ITRANS, then converts with
    indic_transliteration; returns ``text`` unchanged on any failure.
    """
    try:
        cleaned_text = text.lower().strip()
        # FIX: dropped the no-op entries ('kh':'kh', 'ch':'ch', ...) the
        # original table carried — they replaced substrings with themselves.
        replacements = {
            'aa': 'A', 'ee': 'I', 'oo': 'U', 'ou': 'au'
        }
        for old, new in replacements.items():
            cleaned_text = cleaned_text.replace(old, new)
        result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
        return result if result else text
    except Exception as e:
        print(f"Transliteration error: {e}")
        return text
|
|
|
|
|
def test_all_22_languages(texts, batch_size=32):
    """Run LID + English translation over *texts* and tabulate the results.

    For each input: IndicLID predicts a language/script tag, romanized
    ("*_Latn") inputs are transliterated to native script, and IndicTrans2
    produces the English translation. Uses the module-level lid, ip,
    tokenizer, model, device, LID_TO_TRANSLATE and
    enhanced_transliterate_robust. Returns a pandas DataFrame.
    """
    rows = []

    for pred in lid.batch_predict(texts, batch_size):
        # Normalize the two possible prediction shapes (dict vs tuple).
        if isinstance(pred, dict):
            text = pred.get("text", "")
            lang_code = pred.get("lang", pred.get("pred_lang", ""))
            score = float(pred.get("score", 0.0))
            model_name = pred.get("model", "")
        else:
            text, lang_code, score, model_name = pred

        is_romanized = lang_code.endswith("_Latn")

        if lang_code not in LID_TO_TRANSLATE:
            translation = f"Language '{lang_code}' not supported"
            method = "Unsupported"
        else:
            try:
                lang_info = LID_TO_TRANSLATE[lang_code]
                src_code = lang_info["it_code"]

                if is_romanized:
                    # Roman → native script before translation.
                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
                    method = f"Transliteration+IndicTrans2 (detected: {lang_code})"
                    print(f"Romanized: '{text}' → '{native_text}'")
                else:
                    native_text = text
                    method = f"IndicTrans2 (detected: {lang_code})"

                # IndicTrans2 pipeline: preprocess → generate → decode →
                # postprocess, single-item batch.
                pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn")
                inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
                with torch.no_grad():
                    out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
                dec = tokenizer.batch_decode(out, skip_special_tokens=True)
                translation = ip.postprocess_batch(dec, lang=src_code)[0]

            except Exception as e:
                # Record the failure in the row instead of aborting the run.
                translation = f"Translation error: {str(e)}"
                method = "Error"

        rows.append({
            "language": text if len(text) <= 20 else text[:20] + "...",
            "original_text": text,
            "detected_lang": lang_code,
            "script_type": "Romanized" if is_romanized else "Native",
            "confidence": f"{score:.3f}",
            "method": method,
            "english_translation": translation
        })

    return pd.DataFrame(rows)
|
|
|
|
|
|
|
|
print("🔍 Creating test dataset for all 22 official Indian languages...")
# Flatten each (native, roman) pair in insertion order — native first,
# romanized second, exactly as the original append loop produced.
all_test_texts = [variant for pair in sample_sentences.values() for variant in pair]

print(f"📊 Testing {len(all_test_texts)} samples ({len(sample_sentences)} languages × 2 scripts)...")

df_results = test_all_22_languages(all_test_texts, batch_size=32)
|
|
|
|
|
|
|
|
print("\n🎯 COMPLETE TEST RESULTS:")
# NOTE(review): display() is an IPython/Jupyter builtin — use print() when
# running as a plain script.
display(df_results)

print(f"\n📈 SUMMARY STATISTICS:")
print(f"Total samples tested: {len(df_results)}")
print(f"Languages detected: {df_results['detected_lang'].nunique()}")
print(f"Native script samples: {len(df_results[df_results['script_type'] == 'Native'])}")
print(f"Romanized samples: {len(df_results[df_results['script_type'] == 'Romanized'])}")
# 'error|not supported' is a case-insensitive regex alternation matched
# against the translation text to count failed rows.
print(f"Successfully translated: {len(df_results[~df_results['english_translation'].str.contains('error|not supported', case=False)])}")
|
|
|
|
|
|
|
|
import pandas as pd |
|
|
|
|
|
def detailed_translation_summary(df_results):
    """
    Generate comprehensive detailed summary of translation results.

    Prints overall, per-language, per-script and error breakdowns for a
    DataFrame produced by the translation test functions.

    Args:
        df_results: DataFrame with columns 'original_text', 'detected_lang',
            'script_type', 'confidence', 'english_translation'.

    Returns:
        (lang_summary, script_summary, error_df) tuple of DataFrames.

    Side effect: adds a boolean 'successful_translation' column to
    ``df_results`` (mutates the caller's DataFrame in place).
    """
    # A row counts as successful unless its translation embeds an
    # error / unsupported marker (case-insensitive regex; NaN-safe).
    df_results['successful_translation'] = ~df_results['english_translation'].str.contains('error|not supported', case=False, na=False)

    print("\n=========== OVERALL SUMMARY ===========")
    print(f"Total samples tested: {len(df_results)}")
    print(f"Languages detected: {df_results['detected_lang'].nunique()}")
    print(f"Native script samples: {df_results[df_results['script_type'] == 'Native'].shape[0]}")
    # BUG FIX: previously printed `.shape` (the full (rows, cols) tuple) for
    # the romanized count; use `.shape[0]` to match the Native line.
    print(f"Romanized samples: {df_results[df_results['script_type'] == 'Romanized'].shape[0]}")
    print(f"Successfully translated: {df_results['successful_translation'].sum()}")

    overall_success_rate = (df_results['successful_translation'].sum() / len(df_results) * 100)
    print(f"Overall success rate: {overall_success_rate:.1f}%")

    print("\n=========== DETAILED LANGUAGE BREAKDOWN ===========")

    # Named aggregation keeps the output column names explicit.
    lang_summary = df_results.groupby('detected_lang').agg(
        total_samples=('original_text', 'count'),
        native_count=('script_type', lambda x: (x == 'Native').sum()),
        romanized_count=('script_type', lambda x: (x == 'Romanized').sum()),
        # 'confidence' is stored as a formatted string — coerce back to float.
        mean_confidence=('confidence', lambda x: pd.to_numeric(x, errors='coerce').mean()),
        success=('successful_translation', 'sum'),
        error_count=('successful_translation', lambda x: (~x).sum())
    ).reset_index().sort_values('total_samples', ascending=False)

    lang_summary['success_rate'] = (lang_summary['success'] / lang_summary['total_samples'] * 100).round(1)
    print(lang_summary)

    print("\n=========== TOP PERFORMING LANGUAGES ===========")
    top_performers = lang_summary[lang_summary['success_rate'] >= 90].sort_values('success_rate', ascending=False)
    if len(top_performers) > 0:
        print(top_performers[['detected_lang', 'total_samples', 'success_rate']])
    else:
        print("No languages with 90%+ success rate")

    print("\n=========== CHALLENGING LANGUAGES ===========")
    challenging = lang_summary[lang_summary['success_rate'] < 50].sort_values('success_rate')
    if len(challenging) > 0:
        print(challenging[['detected_lang', 'total_samples', 'success_rate']])
    else:
        print("No languages with <50% success rate")

    print("\n=========== ERROR ANALYSIS ===========")
    error_df = df_results[~df_results['successful_translation']]
    print(f"Total errors: {len(error_df)}")
    if len(error_df) > 0:
        print("\nError samples:")
        print(error_df[['original_text', 'detected_lang', 'script_type', 'confidence', 'english_translation']])
    else:
        print("No errors found!")

    print("\n=========== SUCCESS BREAKDOWN BY SCRIPT ===========")
    script_summary = df_results.groupby('script_type').agg(
        total_samples=('original_text', 'count'),
        successful=('successful_translation', 'sum'),
        success_rate=('successful_translation', lambda x: x.mean() * 100)
    ).round(1)
    print(script_summary)

    print("\n=========== DETECTION CONFIDENCE ANALYSIS ===========")
    confidence_summary = lang_summary[['detected_lang', 'mean_confidence']].sort_values('mean_confidence', ascending=False)
    print("Top 10 most confident detections:")
    print(confidence_summary.head(10))

    return lang_summary, script_summary, error_df
|
|
|
|
|
|
|
|
print("✅ Detailed summary function defined")
print("\n📋 To run on your test results:")
print(" lang_summary, script_summary, error_df = detailed_translation_summary(df_results)")
print(" display(lang_summary)")
print(" display(error_df)")

# Run the summary on the 22-language test results produced above.
lang_summary, script_summary, error_df = detailed_translation_summary(df_results)

# NOTE(review): display() is an IPython/Jupyter builtin — use print() when
# running as a plain script.
display(lang_summary)
display(error_df)
|
|
|