kasimali committed on
Commit
bad5076
·
verified ·
1 Parent(s): 6dd899c

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +3 -8
  2. app.py +232 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,12 +1,7 @@
1
  ---
2
- title: Demonstrateindiclidtrans2
3
- emoji: 📊
4
- colorFrom: pink
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 5.49.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: demonstrateindiclidtrans2
3
+ emoji: 🚀
 
 
4
  sdk: gradio
 
 
 
5
  ---
6
 
7
+ # demonstrateindiclidtrans2
app.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# demonstrateindiclidtrans2


# NOTE(review): the three numbered steps below only print status messages; no
# install, clone, download, or unzip command is present in this file.
# Presumably the notebook shell commands were dropped when the script was
# exported -- confirm that the deployment obtains IndicLID and its model
# files some other way.
print("--- 1. Installing All Libraries ---")
print("✅ Libraries installed.")

print("\n--- 2. Cloning IndicLID Repository ---")
# Using your proven method of changing directories
print("✅ Repository cloned.")

# Navigate into the correct directory structure

print("\n--- 3. Downloading and Unzipping IndicLID Models ---")
print("✅ Download commands executed. Unzipping now...")
print("✅ Unzip commands executed.")
16
+
17
+
18
+
19
import os
import sys
import torch
print("--- Applying your original add_safe_globals fix... ---")

# Make the IndicLID inference package importable; this must run before the
# `from ai4bharat.IndicLID import IndicLID` line further down.
# NOTE(review): "/content/..." looks like a Colab path -- confirm it exists in
# the environment this script is deployed to.
if "/content/IndicLID/Inference" not in sys.path:
    sys.path.append("/content/IndicLID/Inference")

from transformers.models.bert.modeling_bert import (
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput
)
from transformers.models.bert.configuration_bert import BertConfig
import torch.nn as nn
from torch.nn.modules.sparse import Embedding
from torch.nn.modules.container import ModuleList
from torch.nn.modules.linear import Linear
from torch.nn.modules.normalization import LayerNorm
from torch.nn.modules.dropout import Dropout

# Register the BERT and torch.nn classes with torch's safe-globals allowlist.
# Presumably needed so the IndicLID checkpoint can be unpickled by torch.load
# -- TODO confirm which checkpoint requires these entries.
torch.serialization.add_safe_globals([
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput, BertConfig,
    Embedding, ModuleList, Linear, LayerNorm, Dropout,
])
print("✅ Comprehensive safe globals added successfully.")

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IndicTransToolkit.processor import IndicProcessor
from ai4bharat.IndicLID import IndicLID

print("--- Loading all models into memory... ---")
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Language-identification model, used by detect_and_translate_single.
lid = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
print("✅ IndicLID model loaded successfully.")

# Translation model and tokenizer; both loaders are called with
# trust_remote_code=True, and the model is moved to `device`.
MODEL_ID = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, trust_remote_code=True).to(device)
# Pre/post-processor applied around tokenization in detect_and_translate_single.
ip = IndicProcessor(inference=True)
64
+
65
import gradio as gr
# NOTE(review): `pd` is not referenced anywhere in the visible part of this file.
import pandas as pd
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Language mapping for translation.
# Keys are the language codes looked up with the detected language in
# detect_and_translate_single. Each entry holds:
#   "name"    - human-readable language name,
#   "script"  - target scheme handed to enhanced_transliterate_robust when the
#               detected code ends in "_Latn",
#   "it_code" - code passed as `src_lang` to IndicProcessor.preprocess_batch.
LID_TO_TRANSLATE = {
    "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    # Both "tam_Tamil" and "tam_Taml" spellings are accepted for Tamil.
    "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    # NOTE(review): the Urdu entries use the plain string 'urdu' instead of a
    # sanscript constant -- confirm sanscript recognises this scheme name. If
    # it does not, enhanced_transliterate_robust catches the failure and the
    # romanized text is sent to the translator unchanged.
    "urd_Arab": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
    "urd_Latn": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
    # Common misdetections mapped to supported languages
    "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
}
102
+
103
def enhanced_transliterate_robust(text, target_script):
    """Best-effort transliteration of romanized text into ``target_script``.

    The input is lower-cased and stripped, a few doubled-vowel spellings are
    rewritten ('aa' -> 'A', 'ee' -> 'I', 'oo' -> 'U', 'ou' -> 'au'), and the
    result is passed to ``transliterate`` with ``sanscript.ITRANS`` as the
    source scheme.

    Args:
        text: Romanized input string.
        target_script: Target scheme forwarded to ``transliterate``.

    Returns:
        The transliterated string, or ``text`` unchanged when the input is
        blank or not a string, when transliteration produces an empty result,
        or when any step raises.
    """
    # Nothing to transliterate; hand the caller's value straight back.
    if not isinstance(text, str) or not text.strip():
        return text

    # Only substitutions that actually change the text are listed, in the
    # order they must be applied. Digraphs such as 'kh' or 'bh' need no
    # rewriting because they are passed through as-is.
    vowel_rewrites = (("aa", "A"), ("ee", "I"), ("oo", "U"), ("ou", "au"))

    try:
        cleaned_text = text.lower().strip()
        for spelling, replacement in vowel_rewrites:
            cleaned_text = cleaned_text.replace(spelling, replacement)
        result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
    except Exception:
        # Deliberately broad: transliteration is an optional enhancement, so
        # any failure (e.g. an unknown target scheme) falls back to the input.
        return text

    return result if result else text
117
+
118
def detect_and_translate_single(text):
    """Detect the language of ``text`` and translate it to English.

    Steps: run ``lid.batch_predict`` on the single input, look the detected
    code up in ``LID_TO_TRANSLATE``, transliterate romanized input with
    ``enhanced_transliterate_robust``, then translate with the IndicTrans2
    model via ``ip``/``tokenizer``/``model``.

    Args:
        text: Input text in native script or romanized form.

    Returns:
        A 5-tuple ``(detected_lang, script_type, score, method, translation)``
        where ``score`` is formatted with three decimals. When the detected
        language is not in ``LID_TO_TRANSLATE`` the method is "Unsupported";
        when translation raises, the method is "Error" and the translation
        field carries the error text; when detection itself raises, the tuple
        is ``("Error", "Error", "0.000", "Error", "Detection error: ...")``.
    """
    tgt_code = "eng_Latn"

    try:
        # Language detection
        preds = lid.batch_predict([text], 1)
        item = preds[0]

        # Predictions are handled both as dicts and as 4-element sequences.
        if isinstance(item, dict):
            detected_lang = item.get("lang", item.get("pred_lang", ""))
            score = float(item.get("score", 0.0))
        else:
            _, detected_lang, score, _ = item

        is_romanized = detected_lang.endswith("_Latn")
        script_type = "Romanized" if is_romanized else "Native Script"

        # Translation
        lang_info = LID_TO_TRANSLATE.get(detected_lang)
        if lang_info is None:
            translation = f"Language '{detected_lang}' not supported for translation"
            method = "Unsupported"
        else:
            try:
                src_code = lang_info["it_code"]

                if is_romanized:
                    # Enhanced transliteration for romanized text
                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
                    method = "Transliteration + IndicTrans2"
                else:
                    native_text = text
                    method = "IndicTrans2"

                # Translate with IndicTrans2
                pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang=tgt_code)
                # truncation=True keeps over-long inputs within the model's limit.
                inputs = tokenizer(
                    pre, return_tensors="pt", padding=True, truncation=True
                ).to(device)
                with torch.no_grad():
                    out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
                dec = tokenizer.batch_decode(out, skip_special_tokens=True)
                # The decoded text is English, so post-processing must be told
                # the target language, not the source language.
                post = ip.postprocess_batch(dec, lang=tgt_code)
                translation = post[0]

            except Exception as e:
                translation = f"Translation error: {str(e)}"
                method = "Error"

        return detected_lang, script_type, f"{score:.3f}", method, translation

    except Exception as e:
        return "Error", "Error", "0.000", "Error", f"Detection error: {str(e)}"
171
+
172
+ # Gradio Interface
173
def gradio_interface(input_text):
    """Gradio callback: validate the textbox value and run detection/translation.

    Args:
        input_text: Text from the input box; may be ``None`` or blank.

    Returns:
        A 5-tuple matching the output components
        ``(detected_lang, script_type, confidence, method, translation)``.
        For missing or whitespace-only input the first field carries the
        prompt "Please enter some text" and the remaining fields are empty.
    """
    # Guard against None as well as blank strings so the callback never
    # raises AttributeError on a missing value.
    if input_text is None or not input_text.strip():
        return "Please enter some text", "", "", "", ""

    return detect_and_translate_single(input_text)
180
+
181
# Create Gradio app
# Layout: an input row (textbox + button), a result row with two columns of
# read-only fields, the translation box, clickable examples, and the click
# wiring that feeds gradio_interface's 5-tuple into the five output boxes.
with gr.Blocks(title="Indian Language Detection & Translation") as app:
    gr.Markdown("# 🇮🇳 Indian Language Detector & Translator")
    gr.Markdown("Enter text in any Indian language (native script or romanized) to detect the language and get English translation.")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter text in Hindi, Tamil, Bengali, etc...",
                lines=3
            )
            translate_btn = gr.Button("🔍 Detect & Translate", variant="primary")

    with gr.Row():
        with gr.Column():
            detected_lang = gr.Textbox(label="🎯 Detected Language", interactive=False)
            script_type = gr.Textbox(label="📝 Script Type", interactive=False)
        with gr.Column():
            confidence = gr.Textbox(label="🎯 Confidence Score", interactive=False)
            method = gr.Textbox(label="⚙️ Translation Method", interactive=False)

    translation_output = gr.Textbox(
        label="🌍 English Translation",
        interactive=False,
        lines=2
    )

    # Examples: native-script and romanized samples that fill the input box.
    gr.Examples(
        examples=[
            ["तुम कैसे हो?"],
            ["tum kaise ho"],
            ["நீங்கள் எப்படி இருக்கிறீர்கள்?"],
            ["neenga epdi irukeenga"],
            ["আমি ভালো আছি।"],
            ["ami bhalo achi"],
            ["ನೀವು ಹೇಗಿದ್ದೀರಾ?"],
            ["neevu hegiddira"]
        ],
        inputs=input_text,
        label="📚 Try these examples:"
    )

    # Output order must match the tuple returned by gradio_interface.
    translate_btn.click(
        fn=gradio_interface,
        inputs=[input_text],
        outputs=[detected_lang, script_type, confidence, method, translation_output]
    )

# Launch the app
# NOTE(review): share=True requests a public share link -- confirm this is
# wanted (and supported) on the hosting platform this app is deployed to.
app.launch(share=True, debug=False)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
gradio
pandas
sentencepiece
torch
transformers
# Imported at module level by app.py but previously not listed:
IndicTransToolkit
indic-transliteration