Spaces:

kasimali
/

xlit-testing

Runtime error

App Files Files Community

xlit-testing / app.py

kasimali

Upload folder using huggingface_hub

571d55d verified 4 months ago

raw

history blame contribute delete

9.85 kB

	# XLIT-TESTING

	import gradio as gr
	import pandas as pd
	import requests
	from typing import List, Dict, Union, Optional
	import io

	# IndicXlit transliteration API client
	class IndicXlitClient:
	    """Simple client for the IndicXlit Transliteration API.

	    All public methods are best-effort: any error raised while talking to
	    the service is caught and converted into a fallback return value
	    instead of propagating to the caller.
	    """

	    def __init__(self, api_url: str = "https://awake-blowfish-liberal.ngrok-free.app", timeout: float = 30.0):
	        """Create a session that sends and accepts JSON.

	        Args:
	            api_url: Base URL of the service; a trailing slash is stripped.
	            timeout: Seconds to wait for each request. Without an explicit
	                timeout, requests waits indefinitely on a stalled server.
	        """
	        self.api_url = api_url.rstrip('/')
	        self.timeout = timeout
	        self.session = requests.Session()
	        self.session.headers.update({
	            'Content-Type': 'application/json',
	            'Accept': 'application/json'
	        })

	    def health_check(self) -> dict:
	        """Return the JSON body of GET /health.

	        Returns:
	            The decoded payload, or ``{"error": ..., "status": "unhealthy"}``
	            when the request fails or the payload is not a JSON object.
	        """
	        try:
	            response = self.session.get(f"{self.api_url}/health", timeout=self.timeout)
	            response.raise_for_status()
	            payload = response.json()
	        except Exception as e:
	            return {"error": str(e), "status": "unhealthy"}

	        # Callers use dict access on the result, so never hand back a list/str/None.
	        if not isinstance(payload, dict):
	            return {"error": f"Unexpected health payload: {payload!r}", "status": "unhealthy"}
	        return payload

	    def get_supported_languages(self) -> List[str]:
	        """Return the "supported_languages" list from GET /languages.

	        Returns:
	            The list reported by the service, or ``[]`` on any error.
	        """
	        try:
	            response = self.session.get(f"{self.api_url}/languages", timeout=self.timeout)
	            response.raise_for_status()
	            data = response.json()
	            return data.get("supported_languages", [])
	        except Exception as e:
	            print(f"Error getting languages: {e}")
	            return []

	    def english_to_indic(self, text: str, target_languages: Union[str, List[str]], beam_width: int = 4) -> Dict[str, str]:
	        """POST `text` to /transliterate/en-to-indic.

	        Args:
	            text: Text to transliterate.
	            target_languages: One language code or a list of codes, sent as-is.
	            beam_width: Forwarded unchanged in the request payload.

	        Returns:
	            The "results" object of a response whose "success" field is
	            truthy; ``{}`` when the API reports failure or any error occurs.
	        """
	        try:
	            payload = {
	                "text": text,
	                "target_languages": target_languages,
	                "beam_width": beam_width
	            }

	            response = self.session.post(
	                f"{self.api_url}/transliterate/en-to-indic",
	                json=payload,
	                timeout=self.timeout
	            )
	            response.raise_for_status()
	            result = response.json()

	            if result.get("success"):
	                # A null "results" field must still yield a dict for callers.
	                return result.get("results") or {}

	            print(f"API Error: {result}")
	            return {}

	        except Exception as e:
	            print(f"Error transliterating: {e}")
	            return {}

	# Module-level client instance, constructed with the default api_url;
	# the convenience functions in this file delegate to it.
	client = IndicXlitClient()

	# Convenience functions
	def transliterate_from_en(text: str, target_languages: Union[str, List[str]]) -> Dict[str, str]:
	    """Transliterate `text` through the module-level client.

	    Delegates to ``client.english_to_indic`` with its default beam width
	    and returns that result unchanged.
	    """
	    transliterations = client.english_to_indic(text, target_languages)
	    return transliterations

	def get_supported_languages() -> List[str]:
	    """Return the language list reported by the module-level client."""
	    languages = client.get_supported_languages()
	    return languages

	def check_api_health() -> bool:
	    """Return True only when the health payload's "status" equals "healthy"."""
	    reported_status = client.health_check().get("status")
	    return reported_status == "healthy"

	# Probe the API once at start-up and report what it offers.
	print("🔄 Testing IndicXlit API connectivity...")
	if not check_api_health():
	    print("⚠️ IndicXlit API is not available")
	    print("❌ Please check your API URL or connection")
	else:
	    print("✅ IndicXlit API is healthy and ready!")
	    supported_langs = get_supported_languages()
	    print(f"📋 Supported languages: {supported_langs}")
	    print(f"📊 Total supported languages: {len(supported_langs)}")

	# Printed on both paths, whether or not the health check passed.
	print("✅ IndicXlit API setup completed!")


	# Lower-case language name -> IndicXlit API language code.
	INDICXLIT_LANGUAGE_MAPPING = dict(
	    assamese='as',
	    bengali='bn',
	    bodo='brx',
	    gujarati='gu',
	    hindi='hi',
	    kannada='kn',
	    kashmiri='ks',
	    konkani='gom',  # Konkani is addressed as 'gom' by IndicXlit
	    maithili='mai',
	    malayalam='ml',
	    marathi='mr',
	    manipuri='mni',
	    nepali='ne',
	    odia='or',
	    punjabi='pa',
	    sanskrit='sa',
	    sindhi='sd',
	    tamil='ta',
	    telugu='te',
	    urdu='ur',
	)

	# Languages recorded as NOT supported by IndicXlit (from earlier testing).
	UNSUPPORTED_LANGUAGES = ['dogri', 'santali']

	# Echo the configuration so a run log shows which codes were in effect.
	print("📋 IndicXlit Language Mapping:")
	for lang_name, code in INDICXLIT_LANGUAGE_MAPPING.items():
	    print(f" {lang_name.capitalize()}: {code}")

	print(f"\n⚠️ Unsupported languages: {', '.join(UNSUPPORTED_LANGUAGES)}")
	print(f"✅ Total mappings loaded: {len(INDICXLIT_LANGUAGE_MAPPING)}")


	# google.colab only exists inside Colab runtimes. An unguarded import raises
	# ModuleNotFoundError everywhere else and prevents the module from loading.
	try:
	    from google.colab import files
	except ImportError:
	    files = None
	import pandas as pd

	# Column order of the result workbook; passing it to DataFrame() also keeps
	# an empty result well-formed so the summary below does not fail.
	_RESULT_COLUMNS = [
	    'Language',
	    'Roman_Script_Input',
	    'Native_Script_Ground_Truth',
	    'English_Translation_Ground_Truth',
	    'IndicXlit_Native_Output',
	    'Processing_Status',
	    'IndicXlit_Code',
	]


	def _identify_dataset_columns(columns):
	    """Map 'language'/'roman'/'native'/'english' to the matching column names.

	    Matching is a case-insensitive substring test, checked in that priority
	    order per column; a later matching column overrides an earlier one.
	    """
	    column_mapping = {}
	    for col in columns:
	        # Headers may be non-strings (e.g. numbers), so normalise via str().
	        col_lower = str(col).lower().strip()
	        for key in ('language', 'roman', 'native', 'english'):
	            if key in col_lower:
	                column_mapping[key] = col
	                break
	    return column_mapping


	def _cell_to_text(value):
	    """Return a stripped string for a cell; missing scalars (NaN/None) become ''."""
	    if pd.api.types.is_scalar(value) and pd.isna(value):
	        return ""
	    return str(value).strip()


	def _transliterate_sample(language, roman_text):
	    """Return (native_output, status, indicxlit_code) for one dataset row."""
	    if language in UNSUPPORTED_LANGUAGES:
	        return "NOT_SUPPORTED", "UNSUPPORTED_LANGUAGE", "N/A"

	    target_code = INDICXLIT_LANGUAGE_MAPPING.get(language)
	    if target_code is None:
	        return "LANGUAGE_NOT_MAPPED", "UNKNOWN_LANGUAGE", "N/A"

	    # Nothing to transliterate; avoid sending an empty string to the API.
	    if not roman_text:
	        return "", "EMPTY_INPUT", target_code

	    try:
	        api_results = transliterate_from_en(roman_text, target_code)
	    except Exception as e:
	        return roman_text, f"ERROR: {str(e)}", target_code  # fall back to the input

	    if api_results and target_code in api_results:
	        return api_results[target_code], "SUCCESS", target_code
	    return roman_text, "API_FAILED", target_code  # fall back to the input


	def process_excel_dataset_with_indicxlit(input_path: Optional[str] = None):
	    """
	    Process an Excel dataset using ONLY the IndicXlit model.

	    Input: Excel file with columns - Language, Roman Script, Native Script, English Translation
	    Output: Excel with all ground truth columns + IndicXlit Native Output

	    Args:
	        input_path: Path of a local Excel file. When omitted, the file is
	            requested through the Colab upload widget, which is only
	            available inside Colab.

	    Returns:
	        The results DataFrame, or None when no input file is available, the
	        four required columns cannot be identified, or processing fails.
	        On success the results are also written to
	        "indicxlit_excel_results_with_ground_truth.xlsx".
	    """
	    if input_path is not None:
	        filenames = [input_path]
	    elif files is not None:
	        print("📁 Please upload your Excel file containing the dataset...")
	        filenames = list(files.upload().keys())
	    else:
	        print("❌ google.colab is not available here; pass input_path to process a local Excel file")
	        return None

	    if not filenames:
	        print("❌ No file was provided")
	        return None
	    if len(filenames) > 1:
	        # Results go to a single fixed output file, so one input is handled per call.
	        print(f"⚠️ {len(filenames)} files provided; only the first one is processed")

	    filename = filenames[0]
	    print(f"📄 Processing file: {filename}")

	    try:
	        df_input = pd.read_excel(filename)
	        total_rows = len(df_input)
	        print(f"✅ Successfully loaded Excel with {total_rows} rows")

	        # Display column names to verify structure
	        print(f"📋 Columns found: {list(df_input.columns)}")

	        column_mapping = _identify_dataset_columns(df_input.columns)
	        print(f"🔍 Column mapping: {column_mapping}")

	        if len(column_mapping) < 4:
	            print("❌ Could not identify all required columns (Language, Roman, Native, English)")
	            return None

	        results = []
	        print(f"🔄 Processing {total_rows} samples with IndicXlit model...")

	        # Progress is counted positionally instead of relying on index labels.
	        for position, (_, row) in enumerate(df_input.iterrows(), start=1):
	            language = _cell_to_text(row[column_mapping['language']]).lower()
	            roman_text = _cell_to_text(row[column_mapping['roman']])
	            native_ground_truth = _cell_to_text(row[column_mapping['native']])
	            english_text = _cell_to_text(row[column_mapping['english']])

	            indicxlit_native_output, status, target_code = _transliterate_sample(language, roman_text)

	            # Result row: all ground truth columns + IndicXlit output
	            results.append({
	                'Language': language.capitalize(),
	                'Roman_Script_Input': roman_text,
	                'Native_Script_Ground_Truth': native_ground_truth,
	                'English_Translation_Ground_Truth': english_text,
	                'IndicXlit_Native_Output': indicxlit_native_output,
	                'Processing_Status': status,
	                'IndicXlit_Code': target_code
	            })

	            if position % 50 == 0:
	                print(f"✅ Processed {position}/{total_rows} samples...")

	        df_results = pd.DataFrame(results, columns=_RESULT_COLUMNS)
	        success_mask = df_results['Processing_Status'] == 'SUCCESS'
	        success_total = int(success_mask.sum())

	        # Display summary
	        print("\n📊 Processing Summary:")
	        print(f"Total samples processed: {len(df_results)}")
	        print(f"Successful translations: {success_total}")
	        print(f"Failed translations: {len(df_results) - success_total}")

	        # Language-wise breakdown (successes counted once, not re-filtered per language)
	        print(f"\n📈 Language-wise breakdown:")
	        success_by_language = df_results.loc[success_mask, 'Language'].value_counts()
	        for lang, count in df_results['Language'].value_counts().items():
	            success_count = int(success_by_language.get(lang, 0))
	            print(f" {lang}: {count} total, {success_count} successful")

	        # Save to Excel. No download is triggered; the workbook stays in the
	        # current working directory.
	        output_filename = "indicxlit_excel_results_with_ground_truth.xlsx"
	        df_results.to_excel(output_filename, index=False, engine='openpyxl')

	        print(f"\n💾 Results saved to: {output_filename}")

	        # Display first few rows
	        print("\n📋 Sample Results:")
	        print(df_results.head())

	        return df_results

	    except Exception as e:
	        # Boundary handler: report the failure and signal it with None.
	        print(f"❌ Error processing Excel file: {str(e)}")
	        return None

	# Announce what is expected, then run the processing function right away.
	print("\n".join((
	    "🚀 Ready to process Excel dataset with IndicXlit model",
	    "📊 Expected Excel columns: Language, Roman Script, Native Script, English Translation",
	    "👆 Execute the function below to start:",
	    "df_results = process_excel_dataset_with_indicxlit()",
	)))


	df_results = process_excel_dataset_with_indicxlit()