kasimali commited on
Commit
571d55d
·
verified ·
1 Parent(s): 496a913

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +3 -8
  2. app.py +262 -0
  3. requirements.txt +2 -0
README.md CHANGED
@@ -1,12 +1,7 @@
1
  ---
2
- title: Xlit Testing
3
- emoji: πŸ‘
4
- colorFrom: green
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.49.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: XLIT-TESTING
3
+ emoji: 🚀
 
 
4
  sdk: gradio
 
 
 
5
  ---
6
 
7
+ # XLIT-TESTING
app.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # XLIT-TESTING
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+ import requests
6
+ from typing import List, Dict, Union, Optional
7
+ import io
8
+
9
+ # YOUR EXACT IndicXlit API Code (no changes)
10
class IndicXlitClient:
    """Minimal HTTP client for an IndicXlit transliteration service.

    Every method is best-effort: failures are reported (returned or printed)
    rather than raised, so callers always receive a value of the documented
    type.
    """

    def __init__(
        self,
        api_url: str = "https://awake-blowfish-liberal.ngrok-free.app",
        timeout: float = 30.0,
    ):
        """Create a client bound to ``api_url``.

        Args:
            api_url: Base URL of the service; trailing slashes are stripped.
            timeout: Seconds to wait on each HTTP request. ``requests`` waits
                indefinitely when no timeout is given, which would hang the
                whole script if the tunnel is down.
        """
        self.api_url = api_url.rstrip('/')
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'Content-Type': 'application/json',
            'Accept': 'application/json',
        })

    def health_check(self) -> dict:
        """Return the JSON body of ``GET /health``.

        On any failure a dict with ``"error"`` and ``"status": "unhealthy"``
        is returned instead of raising.
        """
        try:
            response = self.session.get(
                f"{self.api_url}/health", timeout=self.timeout
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:  # deliberate best-effort boundary
            return {"error": str(e), "status": "unhealthy"}

    def get_supported_languages(self) -> List[str]:
        """Return the ``supported_languages`` list from ``GET /languages``.

        Returns an empty list (and prints the error) on any failure.
        """
        try:
            response = self.session.get(
                f"{self.api_url}/languages", timeout=self.timeout
            )
            response.raise_for_status()
            data = response.json()
            return data.get("supported_languages", [])
        except Exception as e:  # deliberate best-effort boundary
            print(f"Error getting languages: {e}")
            return []

    def english_to_indic(
        self,
        text: str,
        target_languages: Union[str, List[str]],
        beam_width: int = 4,
    ) -> Dict[str, str]:
        """Transliterate Roman-script ``text`` via ``POST /transliterate/en-to-indic``.

        Args:
            text: Input text, sent as the ``text`` field.
            target_languages: One code or a list of codes, sent unchanged as
                ``target_languages``.
            beam_width: Sent unchanged as ``beam_width``.

        Returns:
            The ``results`` object of a response whose ``success`` field is
            truthy; otherwise an empty dict (the problem is printed).
        """
        try:
            payload = {
                "text": text,
                "target_languages": target_languages,
                "beam_width": beam_width,
            }
            response = self.session.post(
                f"{self.api_url}/transliterate/en-to-indic",
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
            result = response.json()

            if result.get("success"):
                return result.get("results", {})
            print(f"API Error: {result}")
            return {}
        except Exception as e:  # deliberate best-effort boundary
            print(f"Error transliterating: {e}")
            return {}
63
+
64
# Module-level client instance, created once with the default API URL.
client = IndicXlitClient()
66
+
67
+ # Convenience functions
68
def transliterate_from_en(text: str, target_languages: Union[str, List[str]]) -> Dict[str, str]:
    """Transliterate ``text`` with the module-level ``client``.

    Delegates to ``client.english_to_indic`` using its default beam width and
    returns whatever that call returns.
    """
    return client.english_to_indic(text, target_languages)
70
+
71
def get_supported_languages() -> List[str]:
    """Return the language list reported by the module-level ``client``."""
    return client.get_supported_languages()
73
+
74
def check_api_health() -> bool:
    """Return True only when the health endpoint reports ``status == "healthy"``."""
    health = client.health_check()
    # health_check passes through whatever JSON the server sent, so guard
    # against a non-dict body instead of failing on ``.get``.
    return isinstance(health, dict) and health.get("status") == "healthy"
77
+
78
# Probe the API once at start-up and report which languages it advertises.
print("🔄 Testing IndicXlit API connectivity...")
if check_api_health():
    print("✅ IndicXlit API is healthy and ready!")
    supported_langs = get_supported_languages()
    print(f"📋 Supported languages: {supported_langs}")
    print(f"📊 Total supported languages: {len(supported_langs)}")
else:
    print("⚠️ IndicXlit API is not available")
    print("❌ Please check your API URL or connection")

# Printed in both cases: it marks the end of this setup step, not API health.
print("✅ IndicXlit API setup completed!")
90
+
91
+
92
# Master language mapping for IndicXlit model testing:
# lower-case language name -> language code sent to the IndicXlit API.
INDICXLIT_LANGUAGE_MAPPING = {
    'assamese': 'as',
    'bengali': 'bn',
    'bodo': 'brx',
    'gujarati': 'gu',
    'hindi': 'hi',
    'kannada': 'kn',
    'kashmiri': 'ks',
    'konkani': 'gom',  # IndicXlit uses 'gom' for Konkani
    'maithili': 'mai',
    'malayalam': 'ml',
    'marathi': 'mr',
    'manipuri': 'mni',
    'nepali': 'ne',
    'odia': 'or',
    'punjabi': 'pa',
    'sanskrit': 'sa',
    'sindhi': 'sd',
    'tamil': 'ta',
    'telugu': 'te',
    'urdu': 'ur',
}

# Languages NOT supported by IndicXlit (based on previous testing).
UNSUPPORTED_LANGUAGES = ['dogri', 'santali']

print("📋 IndicXlit Language Mapping:")
for lang_name, code in INDICXLIT_LANGUAGE_MAPPING.items():
    print(f" {lang_name.capitalize()}: {code}")

print(f"\n⚠️ Unsupported languages: {', '.join(UNSUPPORTED_LANGUAGES)}")
print(f"✅ Total mappings loaded: {len(INDICXLIT_LANGUAGE_MAPPING)}")
126
+
127
+
128
try:
    from google.colab import files
except ImportError:
    # Outside Google Colab (for example in a Hugging Face Space) the upload
    # widget does not exist; callers must pass ``file_path`` instead.
    files = None
import pandas as pd

# Column order of the results sheet; also keeps the columns present when the
# input sheet has no rows.
_RESULT_COLUMNS = [
    'Language',
    'Roman_Script_Input',
    'Native_Script_Ground_Truth',
    'English_Translation_Ground_Truth',
    'IndicXlit_Native_Output',
    'Processing_Status',
    'IndicXlit_Code',
]


def _identify_columns(columns) -> dict:
    """Map the roles language/roman/native/english to actual column names.

    Matching is a case-insensitive substring test; for each column the first
    matching role (in that order) wins, and a later column overrides an
    earlier one for the same role.
    """
    roles = ('language', 'roman', 'native', 'english')
    column_mapping = {}
    for col in columns:
        # Headers may be non-strings (numbers, dates); str() avoids a crash.
        col_lower = str(col).lower().strip()
        for role in roles:
            if role in col_lower:
                column_mapping[role] = col
                break
    return column_mapping


def _cell_text(value) -> str:
    """Return a cell as stripped text, treating missing values as empty."""
    # str(NaN) would otherwise produce the literal text "nan".
    if pd.isna(value):
        return ""
    return str(value).strip()


def _transliterate_sample(language: str, roman_text: str):
    """Return ``(output, status, code)`` for one sample."""
    if language in UNSUPPORTED_LANGUAGES:
        return "NOT_SUPPORTED", "UNSUPPORTED_LANGUAGE", "N/A"
    if language not in INDICXLIT_LANGUAGE_MAPPING:
        return "LANGUAGE_NOT_MAPPED", "UNKNOWN_LANGUAGE", "N/A"

    target_code = INDICXLIT_LANGUAGE_MAPPING[language]
    if not roman_text:
        # Nothing to transliterate; do not spend an API call on it.
        return "", "EMPTY_INPUT", target_code

    try:
        api_results = transliterate_from_en(roman_text, target_code)
    except Exception as e:
        # Fall back to the original text; the status column flags the failure.
        return roman_text, f"ERROR: {str(e)}", target_code

    if api_results and target_code in api_results:
        return api_results[target_code], "SUCCESS", target_code
    return roman_text, "API_FAILED", target_code


def _print_summary(df_results) -> None:
    """Print overall and per-language success counts."""
    is_success = df_results['Processing_Status'] == 'SUCCESS'

    print("\n📊 Processing Summary:")
    print(f"Total samples processed: {len(df_results)}")
    print(f"Successful translations: {int(is_success.sum())}")
    print(f"Failed translations: {int((~is_success).sum())}")

    print("\n📈 Language-wise breakdown:")
    totals = df_results['Language'].value_counts()
    successes = df_results.loc[is_success, 'Language'].value_counts()
    for lang, count in totals.items():
        print(f" {lang}: {count} total, {int(successes.get(lang, 0))} successful")


def process_excel_dataset_with_indicxlit(file_path: Optional[str] = None):
    """Process an Excel dataset using ONLY the IndicXlit model.

    Input: Excel file with columns - Language, Roman Script, Native Script,
    English Translation (matched by case-insensitive substring).
    Output: Excel file ``indicxlit_excel_results_with_ground_truth.xlsx`` with
    all ground-truth columns plus the IndicXlit native output.

    Args:
        file_path: Path of the Excel file to process. When omitted, the file
            is requested through the Google Colab upload widget and only the
            first uploaded file is processed.

    Returns:
        The results DataFrame, or ``None`` when no input is available, the
        required columns cannot be identified, or processing fails (the
        reason is printed).
    """
    if file_path is None:
        if files is None:
            print("❌ google.colab is not available; pass file_path to process a local Excel file")
            return None
        print("📁 Please upload your Excel file containing the dataset...")
        uploaded = files.upload()
        if not uploaded:
            print("❌ No file was uploaded")
            return None
        filename = next(iter(uploaded))
    else:
        filename = file_path

    print(f"📄 Processing file: {filename}")

    try:
        df_input = pd.read_excel(filename)
        print(f"✅ Successfully loaded Excel with {len(df_input)} rows")

        # Display column names to verify structure
        print(f"📋 Columns found: {list(df_input.columns)}")

        column_mapping = _identify_columns(df_input.columns)
        print(f"🔍 Column mapping: {column_mapping}")

        if len(column_mapping) < 4:
            print("❌ Could not identify all required columns (Language, Roman, Native, English)")
            return None

        results = []
        total = len(df_input)
        print(f"🔄 Processing {total} samples with IndicXlit model...")

        # Count rows positionally: index labels are not guaranteed to be 0..n-1.
        for done, (_, row) in enumerate(df_input.iterrows(), start=1):
            language = _cell_text(row[column_mapping['language']]).lower()
            roman_text = _cell_text(row[column_mapping['roman']])
            native_ground_truth = _cell_text(row[column_mapping['native']])
            english_text = _cell_text(row[column_mapping['english']])

            output, status, target_code = _transliterate_sample(language, roman_text)

            results.append({
                'Language': language.capitalize(),
                'Roman_Script_Input': roman_text,
                'Native_Script_Ground_Truth': native_ground_truth,
                'English_Translation_Ground_Truth': english_text,
                'IndicXlit_Native_Output': output,
                'Processing_Status': status,
                'IndicXlit_Code': target_code,
            })

            if done % 50 == 0:
                print(f"✅ Processed {done}/{total} samples...")

        df_results = pd.DataFrame(results, columns=_RESULT_COLUMNS)
        _print_summary(df_results)

        # Save to Excel
        output_filename = "indicxlit_excel_results_with_ground_truth.xlsx"
        df_results.to_excel(output_filename, index=False, engine='openpyxl')
        print(f"\n💾 Results saved to: {output_filename}")

        # Display first few rows
        print("\n📋 Sample Results:")
        print(df_results.head())

        return df_results

    except Exception as e:
        print(f"❌ Error processing Excel file: {str(e)}")
        return None
254
+
255
# Announce the expected input format, then run the processing immediately.
print("🚀 Ready to process Excel dataset with IndicXlit model")
print("📊 Expected Excel columns: Language, Roman Script, Native Script, English Translation")
print("👆 Execute the function below to start:")
print("df_results = process_excel_dataset_with_indicxlit()")


df_results = process_excel_dataset_with_indicxlit()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio
2
+ pandas