# Source: Hugging Face Space "kasimali/xlit-testing"
# Space status when captured: Runtime error
# File size: 9,849 Bytes
# Revision: 571d55d

# XLIT-TESTING

import gradio as gr
import pandas as pd
import requests
from typing import List, Dict, Union, Optional
import io

# YOUR EXACT IndicXlit API Code (no changes)
class IndicXlitClient:
    """Simple client for IndicXlit Transliteration API.

    Every public method is best-effort: any exception raised while talking to
    the service (connection failure, timeout, non-2xx status, invalid JSON)
    is caught and turned into an "empty" return value instead of propagating.

    Args:
        api_url: Base URL of the service; a trailing ``/`` is stripped.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout a stalled tunnel would block the caller
            indefinitely; pass ``None`` to restore that unbounded behaviour.
    """

    def __init__(self, api_url: str = "https://awake-blowfish-liberal.ngrok-free.app",
                 timeout: Optional[float] = 30.0):
        self.api_url = api_url.rstrip('/')
        self.timeout = timeout
        # One shared session so the JSON headers are sent with every request.
        self.session = requests.Session()
        self.session.headers.update({
            'Content-Type': 'application/json',
            'Accept': 'application/json'
        })

    def health_check(self) -> dict:
        """Query ``GET /health`` and return the decoded JSON body.

        Returns:
            The service's JSON response, or
            ``{"error": <message>, "status": "unhealthy"}`` if the request
            fails for any reason.
        """
        try:
            response = self.session.get(
                f"{self.api_url}/health",
                timeout=self.timeout,
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:
            return {"error": str(e), "status": "unhealthy"}

    def get_supported_languages(self) -> List[str]:
        """Query ``GET /languages`` and return its ``supported_languages`` list.

        Returns:
            The value stored under ``"supported_languages"`` in the response
            (``[]`` if the key is absent), or ``[]`` after printing the error
            if the request fails.
        """
        try:
            response = self.session.get(
                f"{self.api_url}/languages",
                timeout=self.timeout,
            )
            response.raise_for_status()
            data = response.json()
            return data.get("supported_languages", [])
        except Exception as e:
            print(f"Error getting languages: {e}")
            return []

    def english_to_indic(self, text: str, target_languages: Union[str, List[str]], beam_width: int = 4) -> Dict[str, str]:
        """POST ``text`` to ``/transliterate/en-to-indic`` and return the results.

        Args:
            text: Input text to transliterate.
            target_languages: One language code or a list of codes; forwarded
                to the API unchanged.
            beam_width: Forwarded to the API as ``beam_width``.

        Returns:
            The ``"results"`` mapping from the response when its ``"success"``
            field is truthy (presumably keyed by language code -- confirm
            against the API). ``{}`` is returned, after printing a
            diagnostic, when the API reports failure or the request raises.
        """
        payload = {
            "text": text,
            "target_languages": target_languages,
            "beam_width": beam_width
        }

        try:
            response = self.session.post(
                f"{self.api_url}/transliterate/en-to-indic",
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
            result = response.json()

            if result.get("success"):
                return result.get("results", {})

            print(f"API Error: {result}")
            return {}

        except Exception as e:
            print(f"Error transliterating: {e}")
            return {}

# Module-level client, built with the default API URL, that the convenience
# functions in this file delegate to.
client = IndicXlitClient()

# Convenience functions
def transliterate_from_en(text: str, target_languages: Union[str, List[str]]) -> Dict[str, str]:
    """Transliterate ``text`` through the module-level ``client``.

    Thin wrapper around ``client.english_to_indic`` that leaves
    ``beam_width`` at that method's default.
    """
    transliterations = client.english_to_indic(
        text=text,
        target_languages=target_languages,
    )
    return transliterations

def get_supported_languages() -> List[str]:
    """Return the language list reported by the module-level ``client``."""
    languages = client.get_supported_languages()
    return languages

def check_api_health() -> bool:
    """Return True only if the health endpoint reports status "healthy"."""
    return client.health_check().get("status") == "healthy"

# Probe the API once when the module runs and report what was found.
print("🔄 Testing IndicXlit API connectivity...")
if not check_api_health():
    print("⚠️ IndicXlit API is not available")
    print("❌ Please check your API URL or connection")
else:
    print("✅ IndicXlit API is healthy and ready!")
    supported_langs = get_supported_languages()
    print(f"📋 Supported languages: {supported_langs}")
    print(f"📊 Total supported languages: {len(supported_langs)}")

# Printed on both branches: it marks the end of setup, not a successful probe.
print("✅ IndicXlit API setup completed!")


# Lower-case language name -> IndicXlit API language code.
# Insertion order is kept as-is because the mapping is printed in this order.
INDICXLIT_LANGUAGE_MAPPING = {
    "assamese": "as",
    "bengali": "bn",
    "bodo": "brx",
    "gujarati": "gu",
    "hindi": "hi",
    "kannada": "kn",
    "kashmiri": "ks",
    "konkani": "gom",  # Konkani is addressed as 'gom' by IndicXlit
    "maithili": "mai",
    "malayalam": "ml",
    "marathi": "mr",
    "manipuri": "mni",
    "nepali": "ne",
    "odia": "or",
    "punjabi": "pa",
    "sanskrit": "sa",
    "sindhi": "sd",
    "tamil": "ta",
    "telugu": "te",
    "urdu": "ur",
}

# Language names IndicXlit does not handle (per the author's earlier testing).
UNSUPPORTED_LANGUAGES = ["dogri", "santali"]

# Echo the configured mapping so the run log records what was loaded.
print("📋 IndicXlit Language Mapping:")
for lang_name in INDICXLIT_LANGUAGE_MAPPING:
    code = INDICXLIT_LANGUAGE_MAPPING[lang_name]
    print("  " + lang_name.capitalize() + ": " + code)

print("\n⚠️ Unsupported languages: " + ", ".join(UNSUPPORTED_LANGUAGES))
print("✅ Total mappings loaded: " + str(len(INDICXLIT_LANGUAGE_MAPPING)))


from google.colab import files
import pandas as pd

def _identify_dataset_columns(columns) -> Dict[str, object]:
    """Map the logical fields (language/roman/native/english) to column labels.

    Each header is matched case-insensitively by substring; for one header the
    keywords are tried in the order language, roman, native, english and the
    first hit wins. If several headers match the same field, the last one wins.
    """
    keywords = ('language', 'roman', 'native', 'english')
    column_mapping = {}
    for col in columns:
        # Headers can be non-strings (e.g. numeric), so normalise with str().
        col_lower = str(col).lower().strip()
        for keyword in keywords:
            if keyword in col_lower:
                column_mapping[keyword] = col
                break
    return column_mapping


def _cell_to_text(value) -> str:
    """Return the stripped text of a cell, mapping missing values to ''."""
    try:
        is_missing = bool(pd.isna(value))
    except (TypeError, ValueError):
        # Non-scalar cell (e.g. duplicated column labels): just stringify it.
        is_missing = False
    # Without this check an empty cell would become the literal text "nan".
    return "" if is_missing else str(value).strip()


def _transliterate_sample(language: str, roman_text: str):
    """Return ``(native_output, status, indicxlit_code)`` for one sample."""
    if language in UNSUPPORTED_LANGUAGES:
        return "NOT_SUPPORTED", "UNSUPPORTED_LANGUAGE", "N/A"

    target_code = INDICXLIT_LANGUAGE_MAPPING.get(language)
    if target_code is None:
        return "LANGUAGE_NOT_MAPPED", "UNKNOWN_LANGUAGE", "N/A"

    if not roman_text:
        # Nothing to transliterate; do not spend an API call on a blank cell.
        return "", "EMPTY_INPUT", target_code

    try:
        api_results = transliterate_from_en(roman_text, target_code)
    except Exception as e:
        return roman_text, f"ERROR: {str(e)}", target_code

    if api_results and target_code in api_results:
        return api_results[target_code], "SUCCESS", target_code

    # Fall back to the untouched input when the API yields nothing usable.
    return roman_text, "API_FAILED", target_code


def process_excel_dataset_with_indicxlit(excel_path: Optional[str] = None):
    """
    Process Excel dataset using ONLY IndicXlit model
    Input: Excel file with columns - Language, Roman Script, Native Script, English Translation
    Output: Excel with all ground truth columns + IndicXlit Native Output

    Columns are located by case-insensitive substring match on "language",
    "roman", "native" and "english". Each row's Roman text is transliterated
    into the row's language, and the results are written to
    ``indicxlit_excel_results_with_ground_truth.xlsx`` in the working
    directory (no browser download is triggered).

    Args:
        excel_path: Path of the workbook to process. When ``None`` (default)
            a Colab upload dialog is opened with ``files.upload()`` and the
            first uploaded file is processed.

    Returns:
        A DataFrame with the columns ``Language``, ``Roman_Script_Input``,
        ``Native_Script_Ground_Truth``, ``English_Translation_Ground_Truth``,
        ``IndicXlit_Native_Output``, ``Processing_Status`` and
        ``IndicXlit_Code``; or ``None`` when no file is supplied, a required
        column cannot be identified, or any error occurs while processing.
    """
    if excel_path is None:
        print("📁 Please upload your Excel file containing the dataset...")
        uploaded = files.upload()
        if not uploaded:
            print("❌ No file was uploaded")
            return None
        uploaded_names = list(uploaded)
        filename = uploaded_names[0]
        if len(uploaded_names) > 1:
            # Only one result set is returned, so be explicit about the rest.
            print(f"⚠️ Multiple files uploaded; only processing: {filename}")
    else:
        filename = excel_path

    print(f"📄 Processing file: {filename}")

    try:
        df_input = pd.read_excel(filename)
        print(f"✅ Successfully loaded Excel with {len(df_input)} rows")

        # Display column names to verify structure
        print(f"📋 Columns found: {list(df_input.columns)}")

        column_mapping = _identify_dataset_columns(df_input.columns)
        print(f"🔍 Column mapping: {column_mapping}")

        # Check if all required columns are found
        if len(column_mapping) < 4:
            print("❌ Could not identify all required columns (Language, Roman, Native, English)")
            return None

        total_rows = len(df_input)
        results = []
        print(f"🔄 Processing {total_rows} samples with IndicXlit model...")

        # Count by position: index labels are not guaranteed to be 0..n-1.
        for position, (_, row) in enumerate(df_input.iterrows(), start=1):
            language = _cell_to_text(row[column_mapping['language']]).lower()
            roman_text = _cell_to_text(row[column_mapping['roman']])
            native_ground_truth = _cell_to_text(row[column_mapping['native']])
            english_text = _cell_to_text(row[column_mapping['english']])

            indicxlit_native_output, status, target_code = _transliterate_sample(
                language, roman_text
            )

            # Create result row with all ground truth + IndicXlit output
            results.append({
                'Language': language.capitalize(),
                'Roman_Script_Input': roman_text,
                'Native_Script_Ground_Truth': native_ground_truth,
                'English_Translation_Ground_Truth': english_text,
                'IndicXlit_Native_Output': indicxlit_native_output,
                'Processing_Status': status,
                'IndicXlit_Code': target_code
            })

            if position % 50 == 0:
                print(f"✅ Processed {position}/{total_rows} samples...")

        # Explicit columns keep the summary working when the sheet has no rows.
        result_columns = [
            'Language',
            'Roman_Script_Input',
            'Native_Script_Ground_Truth',
            'English_Translation_Ground_Truth',
            'IndicXlit_Native_Output',
            'Processing_Status',
            'IndicXlit_Code',
        ]
        df_results = pd.DataFrame(results, columns=result_columns)

        # Display summary
        success_mask = df_results['Processing_Status'] == 'SUCCESS'
        success_total = int(success_mask.sum())
        print("\n📊 Processing Summary:")
        print(f"Total samples processed: {len(df_results)}")
        print(f"Successful translations: {success_total}")
        print(f"Failed translations: {len(df_results) - success_total}")

        # Language-wise breakdown
        print(f"\n📈 Language-wise breakdown:")
        lang_summary = df_results['Language'].value_counts()
        for lang, count in lang_summary.items():
            success_count = int((success_mask & (df_results['Language'] == lang)).sum())
            print(f"  {lang}: {count} total, {success_count} successful")

        # Save to Excel
        output_filename = "indicxlit_excel_results_with_ground_truth.xlsx"
        df_results.to_excel(output_filename, index=False, engine='openpyxl')

        print(f"\n💾 Results saved to: {output_filename}")

        # Display first few rows
        print("\n📋 Sample Results:")
        print(df_results.head())

        return df_results

    except Exception as e:
        # Deliberate catch-all: report the failure instead of raising.
        print(f"❌ Error processing Excel file: {str(e)}")
        return None

# Announce what is expected, then start processing straight away.
print("\n".join([
    "🚀 Ready to process Excel dataset with IndicXlit model",
    "📊 Expected Excel columns: Language, Roman Script, Native Script, English Translation",
    "👆 Execute the function below to start:",
    "df_results = process_excel_dataset_with_indicxlit()",
]))


# Result of the run, kept at module level (None when processing fails).
df_results = process_excel_dataset_with_indicxlit()