|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

# --- Notebook setup scaffolding (status prints only) ---
# NOTE(review): the actual install / git-clone / wget / unzip shell commands
# that belong between these prints are not present in this flattened file;
# presumably they were "!" shell cells in the original notebook — confirm
# before running this as a plain script.

print("--- 1. Installing All Libraries ---")

print("✅ Libraries installed.")

print("\n--- 2. Cloning IndicLID Repository ---")

print("✅ Repository cloned.")

print("\n--- 3. Downloading and Unzipping IndicLID Models ---")

print("✅ Download commands executed. Unzipping now...")

print("✅ Unzip commands executed.")

print("\n🎉🎉🎉 SETUP COMPLETE. You can now proceed to Step 2. 🎉🎉🎉")
|
|
|
|
|
|
|
|
import shutil
import os

# Relocate the installed `transformers` package into the IndicLID inference
# tree so it sits next to the ai4bharat code.
# NOTE(review): this removes `transformers` from site-packages, which may
# break later `from transformers import ...` statements in this file unless
# the package is reinstalled or re-resolved via sys.path — confirm the
# intended layout against the IndicLID setup instructions.
source = "/usr/local/lib/python3.12/dist-packages/transformers"

destination = "/content/IndicLID/Inference/ai4bharat/"

os.makedirs(destination, exist_ok=True)

# Guard so a re-run (after the folder was already moved) does not raise
# FileNotFoundError from shutil.move.
if os.path.isdir(source):
    moved_path = shutil.move(source, destination)
    print(f"Folder moved to: {moved_path}")
else:
    print(f"Source not found (already moved?): {source}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import sys
import torch

print("--- Applying your original add_safe_globals fix... ---")

# Make the cloned IndicLID inference package (ai4bharat.IndicLID) importable;
# the membership check keeps re-runs from appending duplicates.
if "/content/IndicLID/Inference" not in sys.path:
    sys.path.append("/content/IndicLID/Inference")
|
|
|
|
|
# Import every class that needs to be allow-listed for torch's safe
# (weights_only) unpickling — presumably the IndicLID checkpoint pickles
# whole BERT module objects rather than plain state dicts; confirm against
# the checkpoint if this list ever needs extending.
from transformers.models.bert.modeling_bert import (
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput
)
from transformers.models.bert.configuration_bert import BertConfig
import torch.nn as nn
from torch.nn.modules.sparse import Embedding
from torch.nn.modules.container import ModuleList
from torch.nn.modules.linear import Linear
from torch.nn.modules.normalization import LayerNorm
from torch.nn.modules.dropout import Dropout

# Register the full BERT module tree plus the bare torch.nn layer classes as
# safe globals so torch.load can deserialize the checkpoint.
torch.serialization.add_safe_globals([
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput, BertConfig,
    Embedding, ModuleList, Linear, LayerNorm, Dropout,
])

print("✅ Comprehensive safe globals added successfully.")
|
|
|
|
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IndicTransToolkit.processor import IndicProcessor
from ai4bharat.IndicLID import IndicLID

print("--- Loading all models into memory... ---")
# Prefer GPU when available; every model below is moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Language identification model. NOTE(review): the two thresholds presumably
# gate routing between the native-script and romanized classifiers — confirm
# exact semantics against the IndicLID documentation.
lid = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
print("✅ IndicLID model loaded successfully.")

# IndicTrans2 Indic→English 1B translation model (custom code on the Hub,
# hence trust_remote_code=True) plus its pre/post-processor.
MODEL_ID = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, trust_remote_code=True).to(device)
ip = IndicProcessor(inference=True)
print("✅ IndicTrans2 1B model loaded.")

print("🎉 ALL MODELS ARE LOADED. Proceed to direct batch prediction tests.")
|
|
|
|
|
|
|
|
import sys

# Diagnostic: show the module search path used to resolve `transformers`.
print(sys.path)

# NOTE(review): the original line here was `pip show transformers`, which is
# IPython/Colab "automagic" shell syntax and a SyntaxError in a plain Python
# file. Run the equivalent through the current interpreter's pip module so
# this works both in notebooks and as a script.
import subprocess

subprocess.run([sys.executable, "-m", "pip", "show", "transformers"], check=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

print("--- Loading RomanSetu model compatible with transformers 4.40.2... ---")

# Candidate RomanSetu causal-LM checkpoints, tried in order until one loads.
model_options = [
    "ai4bharat/romansetu-cpt-roman-100m",
    "ai4bharat/romansetu-cpt-roman-200m"
]

# Populated by the loop below; both stay None if every candidate fails,
# which translate_with_romansetu checks to pick its fallback path.
rs_model = None
rs_tokenizer = None

for model_id in model_options:
    try:
        print(f"Trying model: {model_id}")
        rs_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        # fp16 halves memory; `device` is the global set during model loading.
        rs_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(device)
        print(f"✅ {model_id} loaded successfully.")
        break
    except Exception as e:
        # Any load failure (missing repo, OOM, version mismatch) moves on to
        # the next candidate rather than aborting the notebook.
        print(f"❌ {model_id} failed: {e}")
        continue

if rs_model is None:
    print("❌ All RomanSetu models failed. Continuing with transliteration-based approach.")
|
|
def translate_with_romansetu(text, max_new_tokens=50):
    """Translate romanized Indian text to English.

    Uses the RomanSetu causal LM when one was loaded (global ``rs_model``);
    otherwise falls back to ITRANS→Devanagari transliteration followed by
    IndicTrans2 (globals ``ip``, ``tokenizer``, ``model``, ``device``).

    Args:
        text: Romanized input string.
        max_new_tokens: Generation budget for the RomanSetu path.

    Returns:
        The English translation, or ``text`` unchanged on any failure.
    """
    if rs_model is None:
        # Fallback path: transliterate to Devanagari, then run IndicTrans2
        # as if the input were Hindi.
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate
        try:
            native_text = transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI)
            pre = ip.preprocess_batch([native_text], src_lang="hin_Deva", tgt_lang="eng_Latn")
            inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
            with torch.no_grad():
                out = model.generate(**inputs, num_beams=3, max_length=100)
            dec = tokenizer.batch_decode(out, skip_special_tokens=True)
            post = ip.postprocess_batch(dec, lang="hin_Deva")
            return post[0]
        except Exception:
            # FIX: was a bare `except:` which also swallowed SystemExit /
            # KeyboardInterrupt; best-effort behavior (return input) is kept.
            return text

    try:
        prompt = f"Translate this romanized Indian text to English: {text}"
        inputs = rs_tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = rs_model.generate(
                inputs.input_ids,
                max_new_tokens=max_new_tokens,
                num_beams=2,
                temperature=0.7,
                do_sample=True,
                pad_token_id=rs_tokenizer.eos_token_id
            )

        # BUG FIX: generate() returns a 2-D (batch, seq) tensor; decode the
        # first sequence — `decode(outputs, ...)` on the whole batch tensor
        # is invalid (batch_decode is the batch API).
        full_response = rs_tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Strip the echoed prompt to keep only the generated continuation.
        translation = full_response.replace(prompt, "").strip()
        # Very short outputs are treated as generation failures.
        return translation if translation and len(translation) > 2 else text

    except Exception as e:
        return text
|
|
|
|
|
# Status markers for the notebook cell above.
print("✅ RomanSetu/fallback translation function defined.")

print("🎉 SETUP COMPLETE with fallback mechanism.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print("--- Installing and loading IndicXlit for better romanized text handling ---")

from ai4bharat.transliteration import XlitEngine
import torch

try:
    # One roman→native transliteration engine per supported language.
    # NOTE(review): constructing ten engines eagerly is heavy on time and
    # memory — consider lazy construction if startup cost matters.
    xlit_engines = {
        "hindi": XlitEngine("hi", beam_width=4, rescore=True),
        "bengali": XlitEngine("bn", beam_width=4, rescore=True),
        "tamil": XlitEngine("ta", beam_width=4, rescore=True),
        "telugu": XlitEngine("te", beam_width=4, rescore=True),
        "gujarati": XlitEngine("gu", beam_width=4, rescore=True),
        "kannada": XlitEngine("kn", beam_width=4, rescore=True),
        "malayalam": XlitEngine("ml", beam_width=4, rescore=True),
        "punjabi": XlitEngine("pa", beam_width=4, rescore=True),
        "marathi": XlitEngine("mr", beam_width=4, rescore=True),
        "urdu": XlitEngine("ur", beam_width=4, rescore=True),
    }
    print("✅ Multiple IndicXlit engines loaded successfully.")

except Exception as e:
    print(f"❌ Error loading IndicXlit: {e}")
    print("💡 Falling back to basic transliteration.")
    # Empty dict signals enhanced_transliterate_with_xlit to use the
    # indic_transliteration fallback path instead.
    xlit_engines = {}
|
|
|
|
|
def enhanced_transliterate_with_xlit(text, target_lang):
    """
    Enhanced transliteration using IndicXlit (based on official API).

    Converts romanized ``text`` into the native script of ``target_lang``
    (English language name, e.g. "hindi"). Uses the preloaded global
    ``xlit_engines``; falls back to indic_transliteration's ITRANS mapping
    when no engine is available. Returns ``text`` unchanged on any error.
    """
    lang_key = target_lang.lower()

    # ISO 639-1 codes used to key both XlitEngine results and the fallback.
    # FIX: this literal was previously duplicated verbatim in the word and
    # sentence branches below — hoisted to a single definition.
    lang_codes = {"hindi": "hi", "bengali": "bn", "tamil": "ta", "telugu": "te",
                  "gujarati": "gu", "kannada": "kn", "malayalam": "ml",
                  "punjabi": "pa", "marathi": "mr", "urdu": "ur"}

    if not xlit_engines or lang_key not in xlit_engines:
        # Fallback: plain rule-based ITRANS transliteration.
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate
        script_map = {
            "hindi": sanscript.DEVANAGARI, "bengali": sanscript.BENGALI,
            "tamil": sanscript.TAMIL, "telugu": sanscript.TELUGU,
            "kannada": sanscript.KANNADA, "malayalam": sanscript.MALAYALAM,
            "gujarati": sanscript.GUJARATI, "punjabi": sanscript.GURMUKHI,
            "marathi": sanscript.DEVANAGARI, "urdu": 'urdu'
        }
        return transliterate(text, sanscript.ITRANS, script_map.get(lang_key, sanscript.DEVANAGARI))

    try:
        engine = xlit_engines[lang_key]
        lang_code = lang_codes.get(lang_key, "hi")

        if ' ' in text:
            # translit_sentence returns {lang_code: sentence}.
            result = engine.translit_sentence(text)
            return result.get(lang_code, text)
        else:
            # translit_word returns {lang_code: [candidates]}; guard against
            # an empty candidate list before taking the top hit.
            result = engine.translit_word(text, topk=1)
            candidates = result.get(lang_code, [text])
            return candidates[0] if candidates else text

    except Exception as e:
        print(f"IndicXlit error for '{text}': {e}")

    # Reached only when the engine path raised — best-effort passthrough.
    return text
|
|
|
|
|
# Status markers for the notebook cell above.
print("✅ Enhanced transliteration function defined.")

print("🎉 INDICXLIT SETUP COMPLETE.")
|
|
|
|
|
|
|
|
import pandas as pd |
|
|
from indic_transliteration import sanscript |
|
|
from indic_transliteration.sanscript import transliterate |
|
|
|
|
|
|
|
|
# Maps IndicLID prediction tags → translation config:
#   name:    human-readable language name
#   script:  indic_transliteration target script for romanized input
#   it_code: IndicTrans2 source-language code
# Low-resource tags (Maithili, Assamese, Sindhi, Nepali, Konkani, Bodo) are
# deliberately approximated by a closely related major language.
LID_TO_TRANSLATE = {

    # Hindi (native + romanized)
    "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},

    # Maithili → approximated as Hindi
    "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},

    # Bengali
    "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},

    # Assamese → approximated as Bengali
    "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},

    # Tamil. NOTE(review): "tam_Tamil" is a non-standard tag kept alongside
    # the standard "tam_Taml" — presumably defensive; confirm which tag
    # IndicLID actually emits.
    "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},

    # Telugu
    "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},

    # Kannada
    "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},

    # Malayalam
    "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},

    # Gujarati
    "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},

    # Punjabi
    "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},

    # Marathi
    "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},

    # Urdu ('urdu' is not a sanscript constant — passed through as a plain
    # string; handled specially downstream or expected to fail gracefully)
    "urd_Arab": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
    "urd_Latn": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},

    # Other romanized low-resource tags → approximated as Hindi
    "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "gom_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "brx_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
}
|
|
|
|
|
def enhanced_transliterate_robust(text, target_script):
    """
    Enhanced transliteration with better romanization handling.

    Normalizes common informal long-vowel spellings toward ITRANS, then
    converts via indic_transliteration. Returns ``text`` unchanged on any
    failure (including an unknown ``target_script``).
    """
    try:
        cleaned_text = text.lower().strip()

        # Map informal long-vowel spellings to their ITRANS forms.
        # FIX: the original table also "replaced" kh/ch/th/ph/bh/dh/gh/jh
        # with themselves — pure no-ops, removed. Behavior is unchanged.
        replacements = {
            'aa': 'A', 'ee': 'I', 'oo': 'U', 'ou': 'au'
        }

        for old, new in replacements.items():
            cleaned_text = cleaned_text.replace(old, new)

        result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
        return result if result else text

    except Exception as e:
        print(f"Transliteration error: {e}")
        return text
|
|
|
|
|
def detect_and_translate_robust(texts, batch_size=64):
    """
    Robust detection and translation with expanded language mapping.

    Pipeline per input string:
      1. IndicLID predicts a language+script tag (e.g. "hin_Latn").
      2. Romanized inputs ("*_Latn") are converted to native script via
         enhanced_transliterate_robust before translation.
      3. IndicTrans2 translates the native-script text to English.

    Relies on module-level globals: lid, ip, tokenizer, model, device,
    LID_TO_TRANSLATE, enhanced_transliterate_robust, pd, torch.

    Args:
        texts: list of input strings.
        batch_size: batch size passed to IndicLID's batch_predict.

    Returns:
        pandas.DataFrame with one row per prediction.
    """
    results = []
    preds = lid.batch_predict(texts, batch_size)

    for item in preds:
        # IndicLID may return dicts or (text, lang, score, model) tuples
        # depending on version — handle both shapes.
        if isinstance(item, dict):
            text = item.get("text", "")
            lang_code = item.get("lang", item.get("pred_lang", ""))
            score = float(item.get("score", 0.0))
            model_name = item.get("model", "")
        else:
            text, lang_code, score, model_name = item

        # "_Latn" suffix marks romanized (Latin-script) input.
        is_romanized = lang_code.endswith("_Latn")

        if lang_code not in LID_TO_TRANSLATE:
            translation = f"Language '{lang_code}' not supported for translation"
            method = "Unsupported"
        else:
            try:
                lang_info = LID_TO_TRANSLATE[lang_code]
                src_code = lang_info["it_code"]

                if is_romanized:
                    # Romanized → native script first.
                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
                    method = f"Enhanced Transliteration + IndicTrans2 (detected as {lang_code})"
                    print(f"Enhanced: '{text}' → '{native_text}' (detected: {lang_code})")
                else:
                    native_text = text
                    method = f"IndicTrans2 (detected as {lang_code})"

                # IndicTrans2: preprocess → generate → decode → postprocess.
                pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn")
                inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
                with torch.no_grad():
                    out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
                dec = tokenizer.batch_decode(out, skip_special_tokens=True)
                post = ip.postprocess_batch(dec, lang=src_code)
                translation = post[0]

            except Exception as e:
                # Per-item failures are recorded in the result row rather
                # than aborting the whole batch.
                translation = f"Translation error: {str(e)}"
                method = "Error"

        results.append({
            "original_text": text,
            "detected_lang": lang_code,
            "script_type": "Romanized" if is_romanized else "Native",
            "confidence": f"{score:.3f}",
            "translation_method": method,
            "english_translation": translation
        })

    return pd.DataFrame(results)


print("✅ Robust translation function with expanded language mapping defined")
|
|
|
|
|
|
|
|
# Mixed native-script and romanized test inputs (Hindi and Bengali).
sample_texts = [
    "यहाँ कितने लोग हैं?",
    "tum kaha ho",
    "aaj mausam suhana hai",
    "aap kaise hain",
    "আমি ভালো আছি।",
    "ami bhalo achi",
    "mera naam rahul hai",
    "main office jaa raha hun"
]

print(f"🔍 Testing robust approach with expanded language mapping...")
df_results = detect_and_translate_robust(sample_texts, batch_size=16)
# NOTE(review): display() is an IPython/Jupyter builtin — use
# print(df_results) when running as a plain script.
display(df_results)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd |
|
|
from indic_transliteration import sanscript |
|
|
from indic_transliteration.sanscript import transliterate |
|
|
|
|
|
|
|
|
# (native-script, romanized) sentence pairs for the 22 scheduled Indian
# languages — all roughly "How are you?".
# NOTE(review): several native samples look dubious (e.g. the Bodo, Dogri,
# Kashmiri, Manipuri and Santali lines read like Devanagari/Hindi-flavored or
# garbled text rather than the actual language/script) — verify with native
# speakers before treating detection failures on them as model errors.
sample_sentences = {
    "Assamese": ("আপুনি কেনেকৈ আছেন?", "apuni kenekoi asen?"),
    "Bengali": ("তুমি কেমন আছো?", "tumi kemon acho?"),
    "Bodo": ("नांगनि फाथै खौ?", "nangni phathai kho?"),
    "Dogri": ("तुसीं केहे हो?", "tusi kehe ho?"),
    "Gujarati": ("તમે કેમ છો?", "tame kem cho?"),
    "Hindi": ("तुम कैसे हो?", "tum kaise ho?"),
    "Kannada": ("ನೀವು ಹೇಗಿದ್ದೀರಾ?", "neevu hegiddira?"),
    "Kashmiri": ("तुस की छै?", "tus ki chhai?"),
    "Konkani": ("तुम कशें आसा?", "tum kashen asa?"),
    "Maithili": ("अहाँ कथी छी?", "ahaan kathi chhi?"),
    "Malayalam": ("സുഖമായിരോ?", "sukhamaayiro?"),
    "Manipuri": ("नमस्कार, नखोंगबा तौ?", "namaskaar, nakhongba tau?"),
    "Marathi": ("तू कसा आहेस?", "tu kasa ahes?"),
    "Nepali": ("तिमी कस्तो छौ?", "timi kasto chau?"),
    "Odia": ("ତୁମେ କେମିତି ଅଛ?", "tume kemiti achha?"),
    "Punjabi": ("ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ?", "tusi kiven ho?"),
    "Sanskrit": ("भवतः कथम् अस्ति?", "bhavatah katham asti?"),
    "Santali": ("ᱥᱟᱱᱛᱟᱲᱤ ᱠᱚᱱᱛᱮᱞᱤ ᱟᱹᱲᱤ?", "santalii konteli adii?"),
    "Sindhi": ("توهان ڪيئن آهيو؟", "tohan kayn aahiyo?"),
    "Tamil": ("நீங்கள் எப்படி இருக்கிறீர்கள்?", "neenga epdi irukeenga?"),
    "Telugu": ("మీరు ఎలా ఉన్నారు?", "meeru ela unnaru?"),
    "Urdu": ("آپ کیسے ہیں؟", "aap kaise hain?")
}
|
|
|
|
|
|
|
|
# Maps IndicLID prediction tags → translation config (script: transliteration
# target; it_code: IndicTrans2 source code).
# NOTE: this re-assignment shadows the LID_TO_TRANSLATE defined earlier in
# the file (normal in a notebook, where each cell re-runs). Low-resource tags
# are deliberately approximated by a closely related major language.
LID_TO_TRANSLATE = {

    # Hindi, plus tags approximated as Hindi (Maithili, Nepali, Sindhi,
    # Konkani, Bodo — romanized variants).
    "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "brx_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},

    # Bengali, plus Assamese approximated as Bengali.
    "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},

    # Tamil. NOTE(review): "tam_Tamil" is a non-standard tag kept alongside
    # the standard "tam_Taml" — confirm which tag IndicLID actually emits.
    "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},

    # Telugu
    "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},

    # Kannada
    "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},

    # Malayalam
    "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},

    # Gujarati
    "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},

    # Punjabi
    "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},

    # Marathi
    "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},

    # Urdu ('urdu' is not a sanscript constant — passed as a plain string).
    "urd_Arab": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
    "urd_Latn": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
}
|
|
|
|
|
def enhanced_transliterate_robust(text, target_script):
    """Enhanced transliteration with better romanization handling.

    Redefinition of the earlier helper (notebook cell re-run). Normalizes
    informal long-vowel spellings toward ITRANS, then converts with
    indic_transliteration; returns ``text`` unchanged on any failure.
    """
    try:
        cleaned_text = text.lower().strip()
        # FIX: dropped the no-op entries ('kh':'kh', 'ch':'ch', ...) the
        # original table carried — they replaced substrings with themselves.
        replacements = {
            'aa': 'A', 'ee': 'I', 'oo': 'U', 'ou': 'au'
        }
        for old, new in replacements.items():
            cleaned_text = cleaned_text.replace(old, new)
        result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
        return result if result else text
    except Exception as e:
        print(f"Transliteration error: {e}")
        return text
|
|
|
|
|
def test_all_22_languages(texts, batch_size=32):
    """Run LID + English translation over *texts* and tabulate the results.

    For each input: IndicLID predicts a language/script tag, romanized
    ("*_Latn") inputs are transliterated to native script, and IndicTrans2
    produces the English translation. Uses the module-level lid, ip,
    tokenizer, model, device, LID_TO_TRANSLATE and
    enhanced_transliterate_robust. Returns a pandas DataFrame.
    """
    rows = []

    for pred in lid.batch_predict(texts, batch_size):
        # Normalize the two possible prediction shapes (dict vs tuple).
        if isinstance(pred, dict):
            text = pred.get("text", "")
            lang_code = pred.get("lang", pred.get("pred_lang", ""))
            score = float(pred.get("score", 0.0))
            model_name = pred.get("model", "")
        else:
            text, lang_code, score, model_name = pred

        is_romanized = lang_code.endswith("_Latn")

        if lang_code not in LID_TO_TRANSLATE:
            translation = f"Language '{lang_code}' not supported"
            method = "Unsupported"
        else:
            try:
                lang_info = LID_TO_TRANSLATE[lang_code]
                src_code = lang_info["it_code"]

                if is_romanized:
                    # Roman → native script before translation.
                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
                    method = f"Transliteration+IndicTrans2 (detected: {lang_code})"
                    print(f"Romanized: '{text}' → '{native_text}'")
                else:
                    native_text = text
                    method = f"IndicTrans2 (detected: {lang_code})"

                # IndicTrans2 pipeline: preprocess → generate → decode →
                # postprocess, single-item batch.
                pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn")
                inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
                with torch.no_grad():
                    out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
                dec = tokenizer.batch_decode(out, skip_special_tokens=True)
                translation = ip.postprocess_batch(dec, lang=src_code)[0]

            except Exception as e:
                # Record the failure in the row instead of aborting the run.
                translation = f"Translation error: {str(e)}"
                method = "Error"

        rows.append({
            "language": text if len(text) <= 20 else text[:20] + "...",
            "original_text": text,
            "detected_lang": lang_code,
            "script_type": "Romanized" if is_romanized else "Native",
            "confidence": f"{score:.3f}",
            "method": method,
            "english_translation": translation
        })

    return pd.DataFrame(rows)
|
|
|
|
|
|
|
|
print("🔍 Creating test dataset for all 22 official Indian languages...")
# Flatten each (native, roman) pair in insertion order — native first,
# romanized second, exactly as the original append loop produced.
all_test_texts = [variant for pair in sample_sentences.values() for variant in pair]

print(f"📊 Testing {len(all_test_texts)} samples ({len(sample_sentences)} languages × 2 scripts)...")

df_results = test_all_22_languages(all_test_texts, batch_size=32)
|
|
|
|
|
|
|
|
print("\n🎯 COMPLETE TEST RESULTS:")
# NOTE(review): display() is an IPython/Jupyter builtin — use print() when
# running as a plain script.
display(df_results)

print(f"\n📈 SUMMARY STATISTICS:")
print(f"Total samples tested: {len(df_results)}")
print(f"Languages detected: {df_results['detected_lang'].nunique()}")
print(f"Native script samples: {len(df_results[df_results['script_type'] == 'Native'])}")
print(f"Romanized samples: {len(df_results[df_results['script_type'] == 'Romanized'])}")
# 'error|not supported' is a case-insensitive regex alternation matched
# against the translation text to count failed rows.
print(f"Successfully translated: {len(df_results[~df_results['english_translation'].str.contains('error|not supported', case=False)])}")
|
|
|
|
|
|
|
|
import pandas as pd |
|
|
|
|
|
def detailed_translation_summary(df_results):
    """
    Generate comprehensive detailed summary of translation results.

    Prints overall, per-language, per-script and error breakdowns for a
    DataFrame produced by the translation test functions.

    Args:
        df_results: DataFrame with columns 'original_text', 'detected_lang',
            'script_type', 'confidence', 'english_translation'.

    Returns:
        (lang_summary, script_summary, error_df) tuple of DataFrames.

    Side effect: adds a boolean 'successful_translation' column to
    ``df_results`` (mutates the caller's DataFrame in place).
    """
    # A row counts as successful unless its translation embeds an
    # error / unsupported marker (case-insensitive regex; NaN-safe).
    df_results['successful_translation'] = ~df_results['english_translation'].str.contains('error|not supported', case=False, na=False)

    print("\n=========== OVERALL SUMMARY ===========")
    print(f"Total samples tested: {len(df_results)}")
    print(f"Languages detected: {df_results['detected_lang'].nunique()}")
    print(f"Native script samples: {df_results[df_results['script_type'] == 'Native'].shape[0]}")
    # BUG FIX: previously printed `.shape` (the full (rows, cols) tuple) for
    # the romanized count; use `.shape[0]` to match the Native line.
    print(f"Romanized samples: {df_results[df_results['script_type'] == 'Romanized'].shape[0]}")
    print(f"Successfully translated: {df_results['successful_translation'].sum()}")

    overall_success_rate = (df_results['successful_translation'].sum() / len(df_results) * 100)
    print(f"Overall success rate: {overall_success_rate:.1f}%")

    print("\n=========== DETAILED LANGUAGE BREAKDOWN ===========")

    # Named aggregation keeps the output column names explicit.
    lang_summary = df_results.groupby('detected_lang').agg(
        total_samples=('original_text', 'count'),
        native_count=('script_type', lambda x: (x == 'Native').sum()),
        romanized_count=('script_type', lambda x: (x == 'Romanized').sum()),
        # 'confidence' is stored as a formatted string — coerce back to float.
        mean_confidence=('confidence', lambda x: pd.to_numeric(x, errors='coerce').mean()),
        success=('successful_translation', 'sum'),
        error_count=('successful_translation', lambda x: (~x).sum())
    ).reset_index().sort_values('total_samples', ascending=False)

    lang_summary['success_rate'] = (lang_summary['success'] / lang_summary['total_samples'] * 100).round(1)
    print(lang_summary)

    print("\n=========== TOP PERFORMING LANGUAGES ===========")
    top_performers = lang_summary[lang_summary['success_rate'] >= 90].sort_values('success_rate', ascending=False)
    if len(top_performers) > 0:
        print(top_performers[['detected_lang', 'total_samples', 'success_rate']])
    else:
        print("No languages with 90%+ success rate")

    print("\n=========== CHALLENGING LANGUAGES ===========")
    challenging = lang_summary[lang_summary['success_rate'] < 50].sort_values('success_rate')
    if len(challenging) > 0:
        print(challenging[['detected_lang', 'total_samples', 'success_rate']])
    else:
        print("No languages with <50% success rate")

    print("\n=========== ERROR ANALYSIS ===========")
    error_df = df_results[~df_results['successful_translation']]
    print(f"Total errors: {len(error_df)}")
    if len(error_df) > 0:
        print("\nError samples:")
        print(error_df[['original_text', 'detected_lang', 'script_type', 'confidence', 'english_translation']])
    else:
        print("No errors found!")

    print("\n=========== SUCCESS BREAKDOWN BY SCRIPT ===========")
    script_summary = df_results.groupby('script_type').agg(
        total_samples=('original_text', 'count'),
        successful=('successful_translation', 'sum'),
        success_rate=('successful_translation', lambda x: x.mean() * 100)
    ).round(1)
    print(script_summary)

    print("\n=========== DETECTION CONFIDENCE ANALYSIS ===========")
    confidence_summary = lang_summary[['detected_lang', 'mean_confidence']].sort_values('mean_confidence', ascending=False)
    print("Top 10 most confident detections:")
    print(confidence_summary.head(10))

    return lang_summary, script_summary, error_df
|
|
|
|
|
|
|
|
print("✅ Detailed summary function defined")
print("\n📋 To run on your test results:")
print(" lang_summary, script_summary, error_df = detailed_translation_summary(df_results)")
print(" display(lang_summary)")
print(" display(error_df)")

# Run the summary on the 22-language test results produced above.
lang_summary, script_summary, error_df = detailed_translation_summary(df_results)

# NOTE(review): display() is an IPython/Jupyter builtin — use print() when
# running as a plain script.
display(lang_summary)
display(error_df)
|
|
|