# xlit-testing / app.py
# kasimali's picture
# Upload folder using huggingface_hub
# 571d55d verified
# XLIT-TESTING
import gradio as gr
import pandas as pd
import requests
from typing import List, Dict, Union, Optional
import io
# YOUR EXACT IndicXlit API Code (no changes)
class IndicXlitClient:
    """Simple client for IndicXlit Transliteration API.

    All public methods are best-effort: any exception raised while talking to
    the API (connection failure, timeout, HTTP error status, invalid JSON) is
    caught and turned into an "empty" return value instead of propagating.
    """

    def __init__(
        self,
        api_url: str = "https://awake-blowfish-liberal.ngrok-free.app",
        timeout: Optional[float] = 30.0,
    ):
        """Create a client bound to ``api_url``.

        Args:
            api_url: Base URL of the API; a trailing ``/`` is stripped.
            timeout: Seconds to wait on each HTTP request before giving up.
                ``None`` disables the timeout (wait indefinitely).
        """
        self.api_url = api_url.rstrip('/')
        # Without a timeout, requests waits forever on a stalled server.
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'Content-Type': 'application/json',
            'Accept': 'application/json'
        })

    def health_check(self) -> dict:
        """Return the JSON body of ``GET /health``.

        On any failure, returns ``{"error": <message>, "status": "unhealthy"}``.
        """
        try:
            response = self.session.get(
                f"{self.api_url}/health",
                timeout=self.timeout,
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:
            return {"error": str(e), "status": "unhealthy"}

    def get_supported_languages(self) -> List[str]:
        """Return the ``supported_languages`` list from ``GET /languages``.

        Returns an empty list when the key is missing or the request fails
        (the error is printed).
        """
        try:
            response = self.session.get(
                f"{self.api_url}/languages",
                timeout=self.timeout,
            )
            response.raise_for_status()
            data = response.json()
            return data.get("supported_languages", [])
        except Exception as e:
            print(f"Error getting languages: {e}")
            return []

    def english_to_indic(self, text: str, target_languages: Union[str, List[str]], beam_width: int = 4) -> Dict[str, str]:
        """Transliterate ``text`` via ``POST /transliterate/en-to-indic``.

        Args:
            text: Input text to transliterate.
            target_languages: One language code or a list of codes; sent to
                the API unchanged.
            beam_width: Forwarded to the API as ``beam_width``.

        Returns:
            The ``results`` object of the response when its ``success`` field
            is truthy; otherwise an empty dict (the problem is printed).
        """
        try:
            payload = {
                "text": text,
                "target_languages": target_languages,
                "beam_width": beam_width
            }
            response = self.session.post(
                f"{self.api_url}/transliterate/en-to-indic",
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
            result = response.json()
            if result.get("success"):
                return result.get("results", {})
            else:
                print(f"API Error: {result}")
                return {}
        except Exception as e:
            print(f"Error transliterating: {e}")
            return {}
# Create global client instance
# Built with IndicXlitClient's default arguments; the module-level convenience
# functions delegate to this single shared object.
client = IndicXlitClient()
# Convenience functions
def transliterate_from_en(text: str, target_languages: Union[str, List[str]]) -> Dict[str, str]:
    """Transliterate ``text`` through the shared ``client``.

    Delegates to ``client.english_to_indic`` (leaving its beam width at the
    default) and hands back whatever mapping that call returns.
    """
    transliterations = client.english_to_indic(text, target_languages)
    return transliterations
def get_supported_languages() -> List[str]:
    """Return the language list reported by the shared ``client``."""
    languages = client.get_supported_languages()
    return languages
def check_api_health() -> bool:
    """Return True only when the health endpoint reports status "healthy"."""
    reported_status = client.health_check().get("status")
    return reported_status == "healthy"
# Probe the API once at import time and report what it offers.
print("πŸ”„ Testing IndicXlit API connectivity...")
api_is_healthy = check_api_health()
if not api_is_healthy:
    print("⚠️ IndicXlit API is not available")
    print("❌ Please check your API URL or connection")
else:
    print("βœ… IndicXlit API is healthy and ready!")
    # Only queried (and only defined) when the health check passed.
    supported_langs = get_supported_languages()
    print(f"πŸ“‹ Supported languages: {supported_langs}")
    print(f"πŸ“Š Total supported languages: {len(supported_langs)}")
# Printed regardless of the health-check outcome.
print("βœ… IndicXlit API setup completed!")
# Master table for IndicXlit model testing: lower-case language name -> the
# language code sent to the IndicXlit API. Insertion order is kept as listed.
INDICXLIT_LANGUAGE_MAPPING = dict((
    ("assamese", "as"),
    ("bengali", "bn"),
    ("bodo", "brx"),
    ("gujarati", "gu"),
    ("hindi", "hi"),
    ("kannada", "kn"),
    ("kashmiri", "ks"),
    ("konkani", "gom"),  # IndicXlit uses 'gom' for Konkani
    ("maithili", "mai"),
    ("malayalam", "ml"),
    ("marathi", "mr"),
    ("manipuri", "mni"),
    ("nepali", "ne"),
    ("odia", "or"),
    ("punjabi", "pa"),
    ("sanskrit", "sa"),
    ("sindhi", "sd"),
    ("tamil", "ta"),
    ("telugu", "te"),
    ("urdu", "ur"),
))

# Languages IndicXlit does not support (per the author's earlier testing).
UNSUPPORTED_LANGUAGES = list(("dogri", "santali"))
# Echo the mapping table, the unsupported languages and the table size.
print("πŸ“‹ IndicXlit Language Mapping:")
for lang_name in INDICXLIT_LANGUAGE_MAPPING:
    code = INDICXLIT_LANGUAGE_MAPPING[lang_name]
    print(f" {lang_name.capitalize()}: {code}")
_unsupported_listing = ', '.join(UNSUPPORTED_LANGUAGES)
print(f"\n⚠️ Unsupported languages: {_unsupported_listing}")
_mapping_total = len(INDICXLIT_LANGUAGE_MAPPING)
print(f"βœ… Total mappings loaded: {_mapping_total}")
from google.colab import files
import pandas as pd
# Column order of the results sheet. Passing it to the DataFrame constructor
# keeps the columns present even when the input sheet has zero rows.
_RESULT_COLUMNS = [
    'Language',
    'Roman_Script_Input',
    'Native_Script_Ground_Truth',
    'English_Translation_Ground_Truth',
    'IndicXlit_Native_Output',
    'Processing_Status',
    'IndicXlit_Code',
]


def _identify_columns(df_input):
    """Map the roles 'language'/'roman'/'native'/'english' to input column labels.

    Matching is a case-insensitive substring test in that priority order; when
    several columns match the same role, the last one wins.
    """
    column_mapping = {}
    for col in df_input.columns:
        # str() so a non-string header (e.g. a number) cannot abort the run.
        col_lower = str(col).lower().strip()
        if 'language' in col_lower:
            column_mapping['language'] = col
        elif 'roman' in col_lower:
            column_mapping['roman'] = col
        elif 'native' in col_lower:
            column_mapping['native'] = col
        elif 'english' in col_lower:
            column_mapping['english'] = col
    return column_mapping


def _transliterate_row(language, roman_text):
    """Return ``(native_output, status, api_code)`` for one sample."""
    if language in UNSUPPORTED_LANGUAGES:
        return "NOT_SUPPORTED", "UNSUPPORTED_LANGUAGE", "N/A"
    if language not in INDICXLIT_LANGUAGE_MAPPING:
        return "LANGUAGE_NOT_MAPPED", "UNKNOWN_LANGUAGE", "N/A"

    target_code = INDICXLIT_LANGUAGE_MAPPING[language]
    try:
        # Use IndicXlit API for transliteration
        api_results = transliterate_from_en(roman_text, target_code)
        if api_results and target_code in api_results:
            return api_results[target_code], "SUCCESS", target_code
        # Fallback to the original text when the API gave nothing usable.
        return roman_text, "API_FAILED", target_code
    except Exception as e:
        return roman_text, f"ERROR: {str(e)}", target_code


def _print_summary(df_results):
    """Print overall and per-language success counts for the results frame."""
    is_success = df_results['Processing_Status'] == 'SUCCESS'
    print("\nπŸ“Š Processing Summary:")
    print(f"Total samples processed: {len(df_results)}")
    print(f"Successful translations: {int(is_success.sum())}")
    print(f"Failed translations: {int((~is_success).sum())}")
    print(f"\nπŸ“ˆ Language-wise breakdown:")
    for lang, count in df_results['Language'].value_counts().items():
        success_count = int(((df_results['Language'] == lang) & is_success).sum())
        print(f" {lang}: {count} total, {success_count} successful")


def process_excel_dataset_with_indicxlit() -> Optional[pd.DataFrame]:
    """
    Process Excel dataset using ONLY IndicXlit model
    Input: Excel file with columns - Language, Roman Script, Native Script, English Translation
    Output: Excel with all ground truth columns + IndicXlit Native Output

    The file is obtained interactively through ``files.upload()``. Only the
    first uploaded file is processed, because the function returns from inside
    the loop over uploads. Results are written to
    ``indicxlit_excel_results_with_ground_truth.xlsx`` in the working directory.

    Returns:
        The results DataFrame, or ``None`` when nothing was uploaded, the
        required columns could not be identified, or processing raised.
    """
    print("πŸ“ Please upload your Excel file containing the dataset...")
    uploaded = files.upload()
    for filename in uploaded.keys():
        print(f"πŸ“„ Processing file: {filename}")
        try:
            # Read the Excel file
            df_input = pd.read_excel(filename)
            total_rows = len(df_input)
            print(f"βœ… Successfully loaded Excel with {total_rows} rows")
            # Display column names to verify structure
            print(f"πŸ“‹ Columns found: {list(df_input.columns)}")

            column_mapping = _identify_columns(df_input)
            print(f"πŸ” Column mapping: {column_mapping}")
            # Check if all required columns are found
            if len(column_mapping) < 4:
                print("❌ Could not identify all required columns (Language, Roman, Native, English)")
                return None

            results = []
            print(f"πŸ”„ Processing {total_rows} samples with IndicXlit model...")
            # enumerate() gives the row position; the index label from
            # iterrows() is not guaranteed to be a 0-based integer.
            for position, (_, row) in enumerate(df_input.iterrows()):
                # NOTE(review): str() turns an empty (NaN) cell into the text
                # "nan", which is then processed like any other value.
                language = str(row[column_mapping['language']]).lower().strip()
                roman_text = str(row[column_mapping['roman']]).strip()
                native_ground_truth = str(row[column_mapping['native']]).strip()
                english_text = str(row[column_mapping['english']]).strip()

                indicxlit_native_output, status, target_code = _transliterate_row(
                    language, roman_text
                )

                # Create result row with all ground truth + IndicXlit output
                results.append({
                    'Language': language.capitalize(),
                    'Roman_Script_Input': roman_text,
                    'Native_Script_Ground_Truth': native_ground_truth,
                    'English_Translation_Ground_Truth': english_text,
                    'IndicXlit_Native_Output': indicxlit_native_output,
                    'Processing_Status': status,
                    'IndicXlit_Code': target_code
                })
                if (position + 1) % 50 == 0:
                    print(f"βœ… Processed {position + 1}/{total_rows} samples...")

            # Create results DataFrame
            df_results = pd.DataFrame(results, columns=_RESULT_COLUMNS)
            _print_summary(df_results)

            # Save to Excel
            output_filename = "indicxlit_excel_results_with_ground_truth.xlsx"
            df_results.to_excel(output_filename, index=False, engine='openpyxl')
            print(f"\nπŸ’Ύ Results saved to: {output_filename}")

            # Display first few rows
            print("\nπŸ“‹ Sample Results:")
            print(df_results.head())
            return df_results
        except Exception as e:
            print(f"❌ Error processing Excel file: {str(e)}")
            return None
    # Nothing was uploaded.
    return None
# Announce what is expected, then start the interactive processing run.
for _banner_line in (
    "πŸš€ Ready to process Excel dataset with IndicXlit model",
    "πŸ“Š Expected Excel columns: Language, Roman Script, Native Script, English Translation",
    "πŸ‘† Execute the function below to start:",
    "df_results = process_excel_dataset_with_indicxlit()",
):
    print(_banner_line)
df_results = process_excel_dataset_with_indicxlit()