Spaces:
Runtime error
Runtime error
| # XLIT-TESTING | |
| import gradio as gr | |
| import pandas as pd | |
| import requests | |
| from typing import List, Dict, Union, Optional | |
| import io | |
| # YOUR EXACT IndicXlit API Code (no changes) | |
class IndicXlitClient:
    """Minimal HTTP client for the IndicXlit transliteration API.

    All public methods are best-effort: any failure (network error, HTTP
    error status, malformed JSON, unexpected payload) is reported through
    the return value and never raised to the caller.
    """

    def __init__(
        self,
        api_url: str = "https://awake-blowfish-liberal.ngrok-free.app",
        timeout: Optional[float] = 30.0,
    ):
        """Create a client bound to *api_url*.

        Args:
            api_url: Base URL of the API; a trailing slash is stripped.
            timeout: Seconds to wait for each HTTP request. ``None`` waits
                indefinitely (the behaviour before this parameter existed).
        """
        self.api_url = api_url.rstrip('/')
        # Without a timeout `requests` blocks forever on a stalled server,
        # which freezes the whole script when the tunnel is down.
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'Content-Type': 'application/json',
            'Accept': 'application/json'
        })

    def health_check(self) -> dict:
        """Return the JSON body of ``GET /health``.

        On any failure, returns ``{"error": <message>, "status": "unhealthy"}``.
        """
        try:
            response = self.session.get(
                f"{self.api_url}/health",
                timeout=self.timeout,
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:  # deliberate: this method must never raise
            return {"error": str(e), "status": "unhealthy"}

    def get_supported_languages(self) -> List[str]:
        """Return the ``supported_languages`` list from ``GET /languages``.

        Returns an empty list when the key is missing or the request fails.
        """
        try:
            response = self.session.get(
                f"{self.api_url}/languages",
                timeout=self.timeout,
            )
            response.raise_for_status()
            data = response.json()
            return data.get("supported_languages", [])
        except Exception as e:  # deliberate: this method must never raise
            print(f"Error getting languages: {e}")
            return []

    def english_to_indic(
        self,
        text: str,
        target_languages: Union[str, List[str]],
        beam_width: int = 4,
    ) -> Dict[str, str]:
        """Transliterate romanised *text* via ``POST /transliterate/en-to-indic``.

        Args:
            text: Roman-script input.
            target_languages: One language code or a list of codes.
            beam_width: Forwarded unchanged to the API as ``beam_width``.

        Returns:
            The ``results`` object of the response when its ``success`` field
            is truthy; otherwise an empty dict (also on any request failure).
        """
        try:
            payload = {
                "text": text,
                "target_languages": target_languages,
                "beam_width": beam_width
            }
            response = self.session.post(
                f"{self.api_url}/transliterate/en-to-indic",
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
            result = response.json()
            if result.get("success"):
                return result.get("results", {})
            print(f"API Error: {result}")
            return {}
        except Exception as e:  # deliberate: this method must never raise
            print(f"Error transliterating: {e}")
            return {}
# Module-level client instance used by the convenience wrappers in this file.
client: IndicXlitClient = IndicXlitClient()
| # Convenience functions | |
def transliterate_from_en(text: str, target_languages: Union[str, List[str]]) -> Dict[str, str]:
    """Transliterate romanised *text* through the module-level ``client``.

    Delegates to ``client.english_to_indic`` and returns its result unchanged.
    """
    transliterations = client.english_to_indic(text, target_languages)
    return transliterations
def get_supported_languages() -> List[str]:
    """Return whatever ``client.get_supported_languages()`` reports."""
    languages = client.get_supported_languages()
    return languages
def check_api_health() -> bool:
    """Return True only when the health payload's ``status`` is ``"healthy"``."""
    return client.health_check().get("status") == "healthy"
# Probe the API once and report what it offers; the failure branch is handled
# first so the success path reads straight through.
print("π Testing IndicXlit API connectivity...")
if not check_api_health():
    print("β οΈ IndicXlit API is not available")
    print("β Please check your API URL or connection")
else:
    print("β IndicXlit API is healthy and ready!")
    supported_langs = get_supported_languages()
    print(f"π Supported languages: {supported_langs}")
    print(f"π Total supported languages: {len(supported_langs)}")
print("β IndicXlit API setup completed!")
# Lower-case language name -> language code sent to the IndicXlit API.
# Insertion order is the order used when the mapping is printed below.
INDICXLIT_LANGUAGE_MAPPING = {
    "assamese": "as",
    "bengali": "bn",
    "bodo": "brx",
    "gujarati": "gu",
    "hindi": "hi",
    "kannada": "kn",
    "kashmiri": "ks",
    "konkani": "gom",  # IndicXlit uses 'gom' for Konkani
    "maithili": "mai",
    "malayalam": "ml",
    "marathi": "mr",
    "manipuri": "mni",
    "nepali": "ne",
    "odia": "or",
    "punjabi": "pa",
    "sanskrit": "sa",
    "sindhi": "sd",
    "tamil": "ta",
    "telugu": "te",
    "urdu": "ur",
}

# Languages treated as unsupported by IndicXlit (per earlier manual testing).
UNSUPPORTED_LANGUAGES = ["dogri", "santali"]

# Echo the configuration so a run log shows which mapping was in effect.
print("π IndicXlit Language Mapping:")
for lang_name, code in INDICXLIT_LANGUAGE_MAPPING.items():
    print(f" {lang_name.capitalize()}: {code}")
print(f"\nβ οΈ Unsupported languages: {', '.join(UNSUPPORTED_LANGUAGES)}")
print(f"β Total mappings loaded: {len(INDICXLIT_LANGUAGE_MAPPING)}")
| from google.colab import files | |
| import pandas as pd | |
# Columns of the results workbook, in output order. Passing them explicitly to
# the DataFrame constructor keeps the schema intact even when the input sheet
# has zero rows.
_RESULT_COLUMNS = [
    'Language',
    'Roman_Script_Input',
    'Native_Script_Ground_Truth',
    'English_Translation_Ground_Truth',
    'IndicXlit_Native_Output',
    'Processing_Status',
    'IndicXlit_Code',
]


def _identify_columns(columns):
    """Map the roles 'language', 'roman', 'native', 'english' to sheet headers.

    Matching is a case-insensitive substring test, checked in that priority
    order per header; when several headers match one role, the last one wins.
    """
    column_mapping = {}
    for col in columns:
        # Headers are not guaranteed to be strings (e.g. numeric headers).
        col_lower = str(col).lower().strip()
        if 'language' in col_lower:
            column_mapping['language'] = col
        elif 'roman' in col_lower:
            column_mapping['roman'] = col
        elif 'native' in col_lower:
            column_mapping['native'] = col
        elif 'english' in col_lower:
            column_mapping['english'] = col
    return column_mapping


def _transliterate_sample(language, roman_text):
    """Return ``(native_output, status, target_code)`` for one sample."""
    if language in UNSUPPORTED_LANGUAGES:
        return "NOT_SUPPORTED", "UNSUPPORTED_LANGUAGE", "N/A"
    if language not in INDICXLIT_LANGUAGE_MAPPING:
        return "LANGUAGE_NOT_MAPPED", "UNKNOWN_LANGUAGE", "N/A"

    target_code = INDICXLIT_LANGUAGE_MAPPING[language]
    try:
        api_results = transliterate_from_en(roman_text, target_code)
        if api_results and target_code in api_results:
            return api_results[target_code], "SUCCESS", target_code
        # Fall back to the untouched input so the row is still comparable.
        return roman_text, "API_FAILED", target_code
    except Exception as e:  # one bad sample must not abort the whole run
        return roman_text, f"ERROR: {str(e)}", target_code


def process_excel_dataset_with_indicxlit():
    """Transliterate an uploaded Excel dataset with the IndicXlit API.

    Prompts for an upload through ``files.upload()``, reads the first uploaded
    workbook, transliterates each row's Roman-script text into the row's
    language, prints a summary and writes the results to
    ``indicxlit_excel_results_with_ground_truth.xlsx``.

    Input: Excel file with columns - Language, Roman Script, Native Script,
    English Translation (matched by case-insensitive substring).
    Output: all ground-truth columns plus the IndicXlit output, a processing
    status and the language code used.

    Returns:
        The results ``DataFrame``, or ``None`` when nothing was uploaded, the
        required columns cannot be identified, or processing fails.
    """
    print("π Please upload your Excel file containing the dataset...")
    uploaded = files.upload()
    if not uploaded:
        print("No file was uploaded; nothing to process.")
        return None

    # Only the first uploaded file is processed; the function returns as soon
    # as that file has been handled.
    filename = next(iter(uploaded))
    print(f"π Processing file: {filename}")
    try:
        df_input = pd.read_excel(filename)
        total_rows = len(df_input)
        print(f"β Successfully loaded Excel with {total_rows} rows")
        print(f"π Columns found: {list(df_input.columns)}")

        column_mapping = _identify_columns(df_input.columns)
        print(f"π Column mapping: {column_mapping}")
        if len(column_mapping) < 4:
            print("β Could not identify all required columns (Language, Roman, Native, English)")
            return None

        results = []
        print(f"π Processing {total_rows} samples with IndicXlit model...")
        # Count positions with enumerate: the iterrows() label is the index
        # value, which need not be a 0-based integer.
        for position, (_, row) in enumerate(df_input.iterrows(), start=1):
            # NOTE(review): empty cells become the literal string "nan" here;
            # confirm whether such rows should be skipped instead.
            language = str(row[column_mapping['language']]).lower().strip()
            roman_text = str(row[column_mapping['roman']]).strip()
            native_ground_truth = str(row[column_mapping['native']]).strip()
            english_text = str(row[column_mapping['english']]).strip()

            indicxlit_native_output, status, target_code = _transliterate_sample(
                language, roman_text
            )
            results.append({
                'Language': language.capitalize(),
                'Roman_Script_Input': roman_text,
                'Native_Script_Ground_Truth': native_ground_truth,
                'English_Translation_Ground_Truth': english_text,
                'IndicXlit_Native_Output': indicxlit_native_output,
                'Processing_Status': status,
                'IndicXlit_Code': target_code
            })
            if position % 50 == 0:
                print(f"β Processed {position}/{total_rows} samples...")

        df_results = pd.DataFrame(results, columns=_RESULT_COLUMNS)

        success_mask = df_results['Processing_Status'] == 'SUCCESS'
        success_total = int(success_mask.sum())
        print("\nπ Processing Summary:")
        print(f"Total samples processed: {len(df_results)}")
        print(f"Successful translations: {success_total}")
        print(f"Failed translations: {len(df_results) - success_total}")

        print(f"\nπ Language-wise breakdown:")
        success_by_language = df_results.loc[success_mask, 'Language'].value_counts()
        for lang, count in df_results['Language'].value_counts().items():
            success_count = int(success_by_language.get(lang, 0))
            print(f" {lang}: {count} total, {success_count} successful")

        output_filename = "indicxlit_excel_results_with_ground_truth.xlsx"
        df_results.to_excel(output_filename, index=False, engine='openpyxl')
        print(f"\nπΎ Results saved to: {output_filename}")

        print("\nπ Sample Results:")
        print(df_results.head())
        return df_results
    except Exception as e:  # top-level boundary: report the failure and return None
        print(f"β Error processing Excel file: {str(e)}")
        return None
# Announce what is about to happen, then run the processing immediately.
print(
    "π Ready to process Excel dataset with IndicXlit model",
    "π Expected Excel columns: Language, Roman Script, Native Script, English Translation",
    "π Execute the function below to start:",
    "df_results = process_excel_dataset_with_indicxlit()",
    sep="\n",
)
df_results = process_excel_dataset_with_indicxlit()