kasimali commited on
Commit
3d5d9f5
·
verified ·
1 Parent(s): 8962a42

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +3 -8
  2. app.py +694 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,12 +1,7 @@
1
  ---
2
- title: Final
3
- emoji: 📊
4
- colorFrom: blue
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 5.49.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: final
3
+ emoji: 🚀
 
 
4
  sdk: gradio
 
 
 
5
  ---
6
 
7
+ # final
app.py ADDED
@@ -0,0 +1,694 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# final

# ================================================================
# = STEP 1: SETUP AND DOWNLOAD (YOUR PROVEN METHOD) =
# ================================================================
# NOTE(review): this file is a notebook export — the original shell cells
# (pip installs, git clone, wget/unzip) were stripped, leaving only their
# status prints. Nothing is actually installed/cloned/downloaded here.
import os

print("--- 1. Installing All Libraries ---")
print("✅ Libraries installed.")

print("\n--- 2. Cloning IndicLID Repository ---")
# Using your proven method of changing directories
print("✅ Repository cloned.")

# Navigate into the correct directory structure

print("\n--- 3. Downloading and Unzipping IndicLID Models ---")
print("✅ Download commands executed. Unzipping now...")
print("✅ Unzip commands executed.")

print("\n🎉🎉🎉 SETUP COMPLETE. You can now proceed to Step 2. 🎉🎉🎉")


import shutil
import os

# Source folder path
# WARNING(review): this MOVES the installed `transformers` package out of
# site-packages (Colab-specific path) into the cloned IndicLID tree so that
# IndicLID's relative imports resolve. It is destructive: a second run will
# fail because the source directory no longer exists, and the path is wrong
# outside Colab/python3.12 — confirm before reuse. shutil.copytree would be
# the non-destructive alternative.
source = "/usr/local/lib/python3.12/dist-packages/transformers"

# Destination folder path
destination = "/content/IndicLID/Inference/ai4bharat/"

# Ensure destination directory exists
os.makedirs(destination, exist_ok=True)

# Move folder
moved_path = shutil.move(source, destination)

print(f"Folder moved to: {moved_path}")
43
# =========================
# = STEP 2: INITIALIZE MODELS (EXACTLY AS YOUR OLD CODE) =
# =========================
import os
import sys
import torch
print("--- Applying your original add_safe_globals fix... ---")

# Make the cloned IndicLID inference code importable (Colab-specific path).
if "/content/IndicLID/Inference" not in sys.path:
    sys.path.append("/content/IndicLID/Inference")

from transformers.models.bert.modeling_bert import (
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput
)
from transformers.models.bert.configuration_bert import BertConfig
import torch.nn as nn
from torch.nn.modules.sparse import Embedding
from torch.nn.modules.container import ModuleList
from torch.nn.modules.linear import Linear
from torch.nn.modules.normalization import LayerNorm
from torch.nn.modules.dropout import Dropout

# Allow-list every class stored in the pickled IndicLID checkpoint so that
# torch.load can deserialize it under the restrictive weights-only loader.
torch.serialization.add_safe_globals([
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput, BertConfig,
    Embedding, ModuleList, Linear, LayerNorm, Dropout,
])
print("✅ Comprehensive safe globals added successfully.")

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IndicTransToolkit.processor import IndicProcessor
from ai4bharat.IndicLID import IndicLID

print("--- Loading all models into memory... ---")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Language identification (handles both native-script and romanized input).
lid = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
print("✅ IndicLID model loaded successfully.")

# Indic -> English translation model + its pre/post-processor.
MODEL_ID = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, trust_remote_code=True).to(device)
ip = IndicProcessor(inference=True)
print("✅ IndicTrans2 1B model loaded.")

print("🎉 ALL MODELS ARE LOADED. Proceed to direct batch prediction tests.")


import sys
print(sys.path)

# BUG FIX: `pip show transformers` is a shell command, not Python. As a bare
# statement it is a SyntaxError that prevents this entire file from being
# imported or run. Run it from a shell (or `!pip show transformers` in a
# notebook) instead.
# pip show transformers
99
+
100
+
101
+
102
# ================================================================
# = STEP 2.5: LOAD ROMANSETU (COMPATIBLE WITH 4.40.2) =
# ================================================================

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

print("--- Loading RomanSetu model compatible with transformers 4.40.2... ---")

# Try smaller, more compatible models first
model_options = [
    "ai4bharat/romansetu-cpt-roman-100m",
    "ai4bharat/romansetu-cpt-roman-200m"
]

rs_model = None
rs_tokenizer = None

# Load the first checkpoint that works; leave rs_model as None when all fail
# so the translation function below can fall back to transliteration.
for model_id in model_options:
    try:
        print(f"Trying model: {model_id}")
        rs_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        rs_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(device)
        print(f"✅ {model_id} loaded successfully.")
        break
    except Exception as e:
        print(f"❌ {model_id} failed: {e}")
        continue

if rs_model is None:
    print("❌ All RomanSetu models failed. Continuing with transliteration-based approach.")

def translate_with_romansetu(text, max_new_tokens=50):
    """Translate romanized Indian-language text to English (best effort).

    Uses the RomanSetu causal LM when one loaded; otherwise falls back to
    ITRANS -> Devanagari transliteration followed by IndicTrans2.
    NOTE(review): the fallback hard-codes Hindi ("hin_Deva") — confirm this
    is acceptable for non-Hindi romanized input.

    Args:
        text: Romanized input sentence.
        max_new_tokens: Generation budget for the RomanSetu path.

    Returns:
        The English translation, or the original `text` on any failure.
    """
    if rs_model is None:
        # Fallback: use enhanced transliteration + IndicTrans2
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate
        try:
            # Try to transliterate and then translate with IndicTrans2
            native_text = transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI)
            pre = ip.preprocess_batch([native_text], src_lang="hin_Deva", tgt_lang="eng_Latn")
            inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
            with torch.no_grad():
                out = model.generate(**inputs, num_beams=3, max_length=100)
            dec = tokenizer.batch_decode(out, skip_special_tokens=True)
            post = ip.postprocess_batch(dec, lang="hin_Deva")
            return post[0]
        except Exception:
            # Deliberate best-effort: never crash, return the input unchanged.
            return text

    try:
        prompt = f"Translate this romanized Indian text to English: {text}"
        inputs = rs_tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = rs_model.generate(
                inputs.input_ids,
                max_new_tokens=max_new_tokens,
                num_beams=2,
                temperature=0.7,
                do_sample=True,
                pad_token_id=rs_tokenizer.eos_token_id
            )

        # BUG FIX: generate() returns a 2-D (batch, sequence) tensor;
        # tokenizer.decode() expects a single sequence of token ids, so the
        # original `decode(outputs, ...)` raised/garbled — decode row 0.
        full_response = rs_tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Strip the echoed prompt; keep the continuation only.
        translation = full_response.replace(prompt, "").strip()
        return translation if translation and len(translation) > 2 else text

    except Exception as e:
        return text
175
+
176
+
177
# ================================================================
# = STEP 2.6: LOAD INDICXLIT FOR BETTER TRANSLITERATION (CORRECTED) =
# ================================================================

print("--- Installing and loading IndicXlit for better romanized text handling ---")

# Install IndicXlit (compatible with your transformers==4.40.2)
# NOTE(review): the actual `pip install` shell cell was stripped during the
# notebook export — the ai4bharat-transliteration package must already be
# installed for the import below to succeed.

from ai4bharat.transliteration import XlitEngine
import torch

try:
    # Load IndicXlit engines for different languages (based on official docs).
    # Each engine transliterates romanized text into one target language;
    # beam_width/rescore trade speed for quality. Loading ten engines up
    # front is memory-heavy — they are all kept resident for the app's life.
    xlit_engines = {
        "hindi": XlitEngine("hi", beam_width=4, rescore=True),
        "bengali": XlitEngine("bn", beam_width=4, rescore=True),
        "tamil": XlitEngine("ta", beam_width=4, rescore=True),
        "telugu": XlitEngine("te", beam_width=4, rescore=True),
        "gujarati": XlitEngine("gu", beam_width=4, rescore=True),
        "kannada": XlitEngine("kn", beam_width=4, rescore=True),
        "malayalam": XlitEngine("ml", beam_width=4, rescore=True),
        "punjabi": XlitEngine("pa", beam_width=4, rescore=True),
        "marathi": XlitEngine("mr", beam_width=4, rescore=True),
        "urdu": XlitEngine("ur", beam_width=4, rescore=True),
    }
    print("✅ Multiple IndicXlit engines loaded successfully.")

except Exception as e:
    # On any load failure fall back to an empty dict; the function below
    # detects this and uses rule-based transliteration instead.
    print(f"❌ Error loading IndicXlit: {e}")
    print("💡 Falling back to basic transliteration.")
    xlit_engines = {}
209
def enhanced_transliterate_with_xlit(text, target_lang):
    """Transliterate romanized *text* into the native script of *target_lang*.

    Prefers the neural IndicXlit engines loaded above; when no engine exists
    for the language (or none loaded at all), falls back to rule-based
    ITRANS transliteration. Returns the input unchanged if the engine fails.
    """
    lang_key = target_lang.lower()

    # --- Fallback path: no IndicXlit engine available for this language ---
    if not xlit_engines or lang_key not in xlit_engines:
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate
        script_map = {
            "hindi": sanscript.DEVANAGARI, "bengali": sanscript.BENGALI,
            "tamil": sanscript.TAMIL, "telugu": sanscript.TELUGU,
            "kannada": sanscript.KANNADA, "malayalam": sanscript.MALAYALAM,
            "gujarati": sanscript.GUJARATI, "punjabi": sanscript.GURMUKHI,
            "marathi": sanscript.DEVANAGARI, "urdu": 'urdu'
        }
        target_script = script_map.get(lang_key, sanscript.DEVANAGARI)
        return transliterate(text, sanscript.ITRANS, target_script)

    # --- IndicXlit path ---
    # The engine keys its result dicts by ISO-639 code; hoisted here so the
    # word and sentence branches share one table.
    iso_by_name = {"hindi": "hi", "bengali": "bn", "tamil": "ta", "telugu": "te",
                   "gujarati": "gu", "kannada": "kn", "malayalam": "ml",
                   "punjabi": "pa", "marathi": "mr", "urdu": "ur"}
    iso = iso_by_name.get(lang_key, "hi")

    try:
        engine = xlit_engines[lang_key]
        if ' ' not in text:
            # Single word: translit_word returns {iso: [top-k candidates]}.
            word_candidates = engine.translit_word(text, topk=1)
            return word_candidates.get(iso, [text])[0]
        # Whole sentence: translit_sentence returns {iso: transliteration}.
        sentence_result = engine.translit_sentence(text)
        return sentence_result.get(iso, text)
    except Exception as e:
        print(f"IndicXlit error for '{text}': {e}")
        # Fallback if IndicXlit fails
        return text

print("✅ Enhanced transliteration function defined.")
print("🎉 INDICXLIT SETUP COMPLETE.")
257
+
258
+
259
import pandas as pd
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# EXPANDED language mapping to handle misdetections.
# Maps an IndicLID language tag -> {display name, target Indic script for
# romanized->native transliteration, IndicTrans2 source-language code}.
# NOTE(review): closely-related languages (Maithili, Assamese, and the
# "additional" entries at the bottom) are deliberately coerced to
# Hindi/Bengali because IndicLID often confuses them — translations for
# those inputs are approximations by design.
LID_TO_TRANSLATE = {
    # Hindi variants
    "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},

    # Maithili (often confused with Hindi) - map to Hindi
    "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},

    # Bengali variants
    "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},

    # Assamese (often confused with Bengali) - map to Bengali
    "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},

    # Tamil variants
    # NOTE(review): "tam_Tamil" looks like a nonstandard tag (the BCP-47
    # script code is "Taml") — confirm IndicLID ever emits it.
    "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},

    # Telugu variants
    "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},

    # Kannada variants
    "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},

    # Malayalam variants
    "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},

    # Gujarati variants
    "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},

    # Punjabi variants
    "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},

    # Marathi variants
    "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},

    # Urdu variants ('urdu' is a sanscript scheme name, not a constant)
    "urd_Arab": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
    "urd_Latn": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},

    # Additional commonly misdetected languages
    "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Sindhi → Hindi
    "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Nepali → Hindi
    "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Konkani → Hindi
    "gom_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Goan Konkani → Hindi
    "brx_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Bodo → Hindi
}
321
+
322
def enhanced_transliterate_robust(text, target_script):
    """
    Convert romanized Indian-language text to *target_script* via ITRANS.

    Args:
        text: Romanized (Latin-script) input.
        target_script: An indic_transliteration scheme constant/name
            (e.g. sanscript.DEVANAGARI).

    Returns:
        The transliterated string, or the original `text` on any failure
        (deliberate best-effort — callers rely on never raising).
    """
    try:
        # Preprocess text for better transliteration
        cleaned_text = text.lower().strip()

        # Map informal long-vowel romanizations onto ITRANS conventions.
        # FIX: dropped the identity entries the original carried
        # ('kh'->'kh', 'ch'->'ch', ...) — replacing a string with itself is
        # dead code, and the digraph consonants are already valid ITRANS.
        replacements = {
            'aa': 'A', 'ee': 'I', 'oo': 'U', 'ou': 'au'
        }

        for old, new in replacements.items():
            cleaned_text = cleaned_text.replace(old, new)

        # Transliterate using the indic_transliteration library.
        result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
        return result if result else text

    except Exception as e:
        print(f"Transliteration error: {e}")
        return text
347
+
348
def detect_and_translate_robust(texts, batch_size=64):
    """
    Detect the language of each input and translate it to English.

    Pipeline per text: IndicLID language ID -> (if romanized) rule-based
    transliteration to native script -> IndicTrans2 translation.

    Args:
        texts: list[str] of input sentences (native or romanized script).
        batch_size: batch size passed to IndicLID's batch_predict.

    Returns:
        pandas.DataFrame with one row per input: original_text,
        detected_lang, script_type, confidence, translation_method,
        english_translation.

    Relies on module globals: lid, LID_TO_TRANSLATE,
    enhanced_transliterate_robust, ip, tokenizer, model, device.
    """
    results = []
    preds = lid.batch_predict(texts, batch_size)

    for item in preds:
        # batch_predict's item shape varies by IndicLID version: either a
        # dict or a (text, lang, score, model) tuple — handle both.
        if isinstance(item, dict):
            text = item.get("text", "")
            lang_code = item.get("lang", item.get("pred_lang", ""))
            score = float(item.get("score", 0.0))
            model_name = item.get("model", "")
        else:
            text, lang_code, score, model_name = item

        # IndicLID tags romanized input with a "_Latn" script suffix.
        is_romanized = lang_code.endswith("_Latn")

        if lang_code not in LID_TO_TRANSLATE:
            translation = f"Language '{lang_code}' not supported for translation"
            method = "Unsupported"
        else:
            try:
                lang_info = LID_TO_TRANSLATE[lang_code]
                src_code = lang_info["it_code"]

                if is_romanized:
                    # Use enhanced transliteration to recover native script
                    # before translating.
                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
                    method = f"Enhanced Transliteration + IndicTrans2 (detected as {lang_code})"
                    print(f"Enhanced: '{text}' → '{native_text}' (detected: {lang_code})")
                else:
                    native_text = text
                    method = f"IndicTrans2 (detected as {lang_code})"

                # Translate with IndicTrans2 (single-item batch).
                pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn")
                inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
                with torch.no_grad():
                    out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
                dec = tokenizer.batch_decode(out, skip_special_tokens=True)
                post = ip.postprocess_batch(dec, lang=src_code)
                translation = post[0]

            except Exception as e:
                # Record the failure in the output row instead of aborting
                # the whole batch.
                translation = f"Translation error: {str(e)}"
                method = "Error"

        results.append({
            "original_text": text,
            "detected_lang": lang_code,
            "script_type": "Romanized" if is_romanized else "Native",
            "confidence": f"{score:.3f}",
            "translation_method": method,
            "english_translation": translation
        })

    return pd.DataFrame(results)
406
+
407
print("✅ Robust translation function with expanded language mapping defined")

# Test with the same samples
sample_texts = [
    "यहाँ कितने लोग हैं?",
    "tum kaha ho",
    "aaj mausam suhana hai",
    "aap kaise hain",
    "আমি ভালো আছি।",
    "ami bhalo achi",
    "mera naam rahul hai",
    "main office jaa raha hun"
]

print(f"🔍 Testing robust approach with expanded language mapping...")
df_results = detect_and_translate_robust(sample_texts, batch_size=16)
# BUG FIX: `display` is an IPython/Jupyter-only builtin and raises NameError
# when this file runs as a plain script (e.g. a Gradio Space); fall back to
# printing the full frame.
try:
    display(df_results)
except NameError:
    print(df_results.to_string())
424
+
425
+
426
# ================================================================
# = COMPLETE TEST CODE FOR ALL 22 INDIAN LANGUAGES =
# ================================================================

import pandas as pd
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Official 22 Indian languages sample sentences (native + romanized).
# Each value is a (native-script, romanized) pair meaning roughly
# "How are you?" in that language.
sample_sentences = {
    "Assamese": ("আপুনি কেনেকৈ আছেন?", "apuni kenekoi asen?"),
    "Bengali": ("তুমি কেমন আছো?", "tumi kemon acho?"),
    "Bodo": ("नांगनि फाथै खौ?", "nangni phathai kho?"),
    "Dogri": ("तुसीं केहे हो?", "tusi kehe ho?"),
    "Gujarati": ("તમે કેમ છો?", "tame kem cho?"),
    "Hindi": ("तुम कैसे हो?", "tum kaise ho?"),
    "Kannada": ("ನೀವು ಹೇಗಿದ್ದೀರಾ?", "neevu hegiddira?"),
    "Kashmiri": ("तुस की छै?", "tus ki chhai?"),
    "Konkani": ("तुम कशें आसा?", "tum kashen asa?"),
    "Maithili": ("अहाँ कथी छी?", "ahaan kathi chhi?"),
    "Malayalam": ("സുഖമായിരോ?", "sukhamaayiro?"),
    "Manipuri": ("नमस्कार, नखोंगबा तौ?", "namaskaar, nakhongba tau?"),
    "Marathi": ("तू कसा आहेस?", "tu kasa ahes?"),
    "Nepali": ("तिमी कस्तो छौ?", "timi kasto chau?"),
    "Odia": ("ତୁମେ କେମିତି ଅଛ?", "tume kemiti achha?"),
    "Punjabi": ("ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ?", "tusi kiven ho?"),
    "Sanskrit": ("भवतः कथम् अस्ति?", "bhavatah katham asti?"),
    "Santali": ("ᱥᱟᱱᱛᱟᱲᱤ ᱠᱚᱱᱛᱮᱞᱤ ᱟᱹᱲᱤ?", "santalii konteli adii?"),
    "Sindhi": ("توهان ڪيئن آهيو؟", "tohan kayn aahiyo?"),
    "Tamil": ("நீங்கள் எப்படி இருக்கிறீர்கள்?", "neenga epdi irukeenga?"),
    "Telugu": ("మీరు ఎలా ఉన్నారు?", "meeru ela unnaru?"),
    "Urdu": ("آپ کیسے ہیں؟", "aap kaise hain?")
}

# Expanded language mapping (covers common misdetections).
# NOTE(review): this re-definition shadows the LID_TO_TRANSLATE declared
# earlier in the file — the two copies should be consolidated; keep them in
# sync until then. Entries coerce frequently-misdetected languages onto
# Hindi/Bengali on purpose.
LID_TO_TRANSLATE = {
    # Hindi variants
    "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Maithili→Hindi
    "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Nepali→Hindi
    "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Sindhi→Hindi
    "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Konkani→Hindi
    "brx_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Bodo→Hindi

    # Bengali variants
    "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},  # Assamese→Bengali
    "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},

    # Tamil variants
    "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},

    # Telugu variants
    "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},

    # Kannada variants
    "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},

    # Malayalam variants
    "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},

    # Gujarati variants
    "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},

    # Punjabi variants
    "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},

    # Marathi variants
    "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},

    # Urdu variants
    "urd_Arab": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
    "urd_Latn": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
}
511
+
512
def enhanced_transliterate_robust(text, target_script):
    """Romanized -> native-script conversion via ITRANS (best effort).

    Duplicate of the earlier definition (notebook export); returns the
    original `text` unchanged on any failure instead of raising.
    """
    try:
        cleaned_text = text.lower().strip()
        # FIX: removed the identity mappings the original carried
        # ('kh'->'kh', 'ch'->'ch', ...) — they were dead code; only the
        # long-vowel normalizations below change anything.
        replacements = {
            'aa': 'A', 'ee': 'I', 'oo': 'U', 'ou': 'au'
        }
        for old, new in replacements.items():
            cleaned_text = cleaned_text.replace(old, new)
        result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
        return result if result else text
    except Exception as e:
        print(f"Transliteration error: {e}")
        return text
528
+
529
def test_all_22_languages(texts, batch_size=32):
    """
    Run the full detect -> transliterate -> translate pipeline over *texts*.

    Same pipeline as detect_and_translate_robust (IndicLID -> optional
    transliteration -> IndicTrans2), with a slightly different result schema
    used for the 22-language evaluation below.

    Args:
        texts: list[str] of native-script and romanized sentences.
        batch_size: batch size passed to IndicLID's batch_predict.

    Returns:
        pandas.DataFrame with columns: language (truncated preview of the
        text), original_text, detected_lang, script_type, confidence,
        method, english_translation.

    Relies on module globals: lid, LID_TO_TRANSLATE,
    enhanced_transliterate_robust, ip, tokenizer, model, device.
    """
    results = []
    preds = lid.batch_predict(texts, batch_size)

    for item in preds:
        # Prediction item shape varies by IndicLID version: dict or tuple.
        if isinstance(item, dict):
            text = item.get("text", "")
            lang_code = item.get("lang", item.get("pred_lang", ""))
            score = float(item.get("score", 0.0))
            model_name = item.get("model", "")
        else:
            text, lang_code, score, model_name = item

        # "_Latn" suffix marks romanized input.
        is_romanized = lang_code.endswith("_Latn")

        if lang_code not in LID_TO_TRANSLATE:
            translation = f"Language '{lang_code}' not supported"
            method = "Unsupported"
        else:
            try:
                lang_info = LID_TO_TRANSLATE[lang_code]
                src_code = lang_info["it_code"]

                if is_romanized:
                    # Recover native script before translating.
                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
                    method = f"Transliteration+IndicTrans2 (detected: {lang_code})"
                    print(f"Romanized: '{text}' → '{native_text}'")
                else:
                    native_text = text
                    method = f"IndicTrans2 (detected: {lang_code})"

                # Translate with IndicTrans2 (single-item batch).
                pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn")
                inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
                with torch.no_grad():
                    out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
                dec = tokenizer.batch_decode(out, skip_special_tokens=True)
                post = ip.postprocess_batch(dec, lang=src_code)
                translation = post[0]

            except Exception as e:
                # Record per-row failure; never abort the whole evaluation.
                translation = f"Translation error: {str(e)}"
                method = "Error"

        results.append({
            # Truncated preview used as a row label in the results table.
            "language": text[:20] + "..." if len(text) > 20 else text,
            "original_text": text,
            "detected_lang": lang_code,
            "script_type": "Romanized" if is_romanized else "Native",
            "confidence": f"{score:.3f}",
            "method": method,
            "english_translation": translation
        })

    return pd.DataFrame(results)
585
+
586
# Create test dataset with all 44 samples (22 native + 22 romanized)
print("🔍 Creating test dataset for all 22 official Indian languages...")
all_test_texts = []
for lang, (native, roman) in sample_sentences.items():
    all_test_texts.append(native)
    all_test_texts.append(roman)

print(f"📊 Testing {len(all_test_texts)} samples ({len(sample_sentences)} languages × 2 scripts)...")

# Run the complete test
df_results = test_all_22_languages(all_test_texts, batch_size=32)

# Display results
print("\n🎯 COMPLETE TEST RESULTS:")
# BUG FIX: `display` only exists inside IPython/Jupyter; running this file
# as a plain script raised NameError here. Fall back to printing the frame.
try:
    display(df_results)
except NameError:
    print(df_results.to_string())

# Summary statistics
print(f"\n📈 SUMMARY STATISTICS:")
print(f"Total samples tested: {len(df_results)}")
print(f"Languages detected: {df_results['detected_lang'].nunique()}")
print(f"Native script samples: {len(df_results[df_results['script_type'] == 'Native'])}")
print(f"Romanized samples: {len(df_results[df_results['script_type'] == 'Romanized'])}")
print(f"Successfully translated: {len(df_results[~df_results['english_translation'].str.contains('error|not supported', case=False)])}")
609
+
610
+
611
import pandas as pd

def detailed_translation_summary(df_results):
    """
    Generate a comprehensive summary of translation results.

    Args:
        df_results: DataFrame with columns 'original_text', 'detected_lang',
            'script_type' ('Native'/'Romanized'), 'confidence' (numeric
            string) and 'english_translation'. A boolean
            'successful_translation' column is added to it in place.

    Returns:
        Tuple (lang_summary, script_summary, error_df):
        per-detected-language stats, per-script stats, and the rows whose
        translation failed.
    """
    # Flag successful translations: anything whose output does not look like
    # an error / unsupported-language message.
    df_results['successful_translation'] = ~df_results['english_translation'].str.contains('error|not supported', case=False, na=False)

    print("\n=========== OVERALL SUMMARY ===========")
    print(f"Total samples tested: {len(df_results)}")
    print(f"Languages detected: {df_results['detected_lang'].nunique()}")
    print(f"Native script samples: {df_results[df_results['script_type'] == 'Native'].shape[0]}")
    # BUG FIX: the original used `.shape` (no index) here, printing the whole
    # (rows, cols) tuple instead of the row count used on the line above.
    print(f"Romanized samples: {df_results[df_results['script_type'] == 'Romanized'].shape[0]}")
    print(f"Successfully translated: {df_results['successful_translation'].sum()}")

    overall_success_rate = (df_results['successful_translation'].sum() / len(df_results) * 100)
    print(f"Overall success rate: {overall_success_rate:.1f}%")

    print("\n=========== DETAILED LANGUAGE BREAKDOWN ===========")
    # Per-language analysis via named aggregation.
    lang_summary = df_results.groupby('detected_lang').agg(
        total_samples=('original_text', 'count'),
        native_count=('script_type', lambda x: (x == 'Native').sum()),
        romanized_count=('script_type', lambda x: (x == 'Romanized').sum()),
        # 'confidence' is stored as a string — coerce before averaging.
        mean_confidence=('confidence', lambda x: pd.to_numeric(x, errors='coerce').mean()),
        success=('successful_translation', 'sum'),
        error_count=('successful_translation', lambda x: (~x).sum())
    ).reset_index().sort_values('total_samples', ascending=False)

    lang_summary['success_rate'] = (lang_summary['success'] / lang_summary['total_samples'] * 100).round(1)
    print(lang_summary)

    print("\n=========== TOP PERFORMING LANGUAGES ===========")
    top_performers = lang_summary[lang_summary['success_rate'] >= 90].sort_values('success_rate', ascending=False)
    if len(top_performers) > 0:
        print(top_performers[['detected_lang', 'total_samples', 'success_rate']])
    else:
        print("No languages with 90%+ success rate")

    print("\n=========== CHALLENGING LANGUAGES ===========")
    challenging = lang_summary[lang_summary['success_rate'] < 50].sort_values('success_rate')
    if len(challenging) > 0:
        print(challenging[['detected_lang', 'total_samples', 'success_rate']])
    else:
        print("No languages with <50% success rate")

    print("\n=========== ERROR ANALYSIS ===========")
    error_df = df_results[~df_results['successful_translation']]
    print(f"Total errors: {len(error_df)}")
    if len(error_df) > 0:
        print("\nError samples:")
        print(error_df[['original_text', 'detected_lang', 'script_type', 'confidence', 'english_translation']])
    else:
        print("No errors found!")

    print("\n=========== SUCCESS BREAKDOWN BY SCRIPT ===========")
    script_summary = df_results.groupby('script_type').agg(
        total_samples=('original_text', 'count'),
        successful=('successful_translation', 'sum'),
        success_rate=('successful_translation', lambda x: x.mean() * 100)
    ).round(1)
    print(script_summary)

    print("\n=========== DETECTION CONFIDENCE ANALYSIS ===========")
    confidence_summary = lang_summary[['detected_lang', 'mean_confidence']].sort_values('mean_confidence', ascending=False)
    print("Top 10 most confident detections:")
    print(confidence_summary.head(10))

    return lang_summary, script_summary, error_df
681
+
682
# ===== HOW TO USE =====
print("✅ Detailed summary function defined")
print("\n📋 To run on your test results:")
print("   lang_summary, script_summary, error_df = detailed_translation_summary(df_results)")
print("   display(lang_summary)")
print("   display(error_df)")


lang_summary, script_summary, error_df = detailed_translation_summary(df_results)


# BUG FIX: `display` is an IPython/Jupyter-only builtin and raises NameError
# when this file runs as a plain script; fall back to printing the frames.
try:
    display(lang_summary)
    display(error_df)
except NameError:
    print(lang_summary.to_string())
    print(error_df.to_string())
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ pandas
3
+ sentencepiece
4
+ torch
5
+ transformers