# Real AI-Powered Multi-Lingual Product Catalog Translator # Hugging Face Spaces Deployment with IndicTrans2 import streamlit as st import os import sys import torch import logging from typing import Dict, List, Optional import time import warnings # Suppress warnings warnings.filterwarnings("ignore", category=UserWarning) warnings.filterwarnings("ignore", category=FutureWarning) # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Set environment variable for model type os.environ.setdefault("MODEL_TYPE", "indictrans2") os.environ.setdefault("DEVICE", "cuda" if torch.cuda.is_available() else "cpu") try: from transformers import AutoTokenizer, AutoModelForSeq2SeqLM TRANSFORMERS_AVAILABLE = True except ImportError: TRANSFORMERS_AVAILABLE = False logger.warning("Transformers not available, falling back to mock mode") # Streamlit page config st.set_page_config( page_title="Multi-Lingual Catalog Translator - Real AI", page_icon="🌐", layout="wide", initial_sidebar_state="expanded" ) # Language mappings for IndicTrans2 SUPPORTED_LANGUAGES = { "en": "English", "hi": "Hindi", "bn": "Bengali", "gu": "Gujarati", "kn": "Kannada", "ml": "Malayalam", "mr": "Marathi", "or": "Odia", "pa": "Punjabi", "ta": "Tamil", "te": "Telugu", "ur": "Urdu", "as": "Assamese", "ne": "Nepali", "sa": "Sanskrit" } # Flores language codes for IndicTrans2 FLORES_CODES = { "en": "eng_Latn", "hi": "hin_Deva", "bn": "ben_Beng", "gu": "guj_Gujr", "kn": "kan_Knda", "ml": "mal_Mlym", "mr": "mar_Deva", "or": "ory_Orya", "pa": "pan_Guru", "ta": "tam_Taml", "te": "tel_Telu", "ur": "urd_Arab", "as": "asm_Beng", "ne": "npi_Deva", "sa": "san_Deva" } class IndicTrans2Service: """Real IndicTrans2 Translation Service for Hugging Face Spaces""" def __init__(self): self.en_indic_model = None self.indic_en_model = None self.en_indic_tokenizer = None self.indic_en_tokenizer = None self.device = "cuda" if torch.cuda.is_available() else "cpu" logger.info(f"Using device: {self.device}") @st.cache_resource def load_models(_self): """Load IndicTrans2 models with caching""" if not TRANSFORMERS_AVAILABLE: logger.error("Transformers library not available") return False try: with st.spinner("🔄 Loading IndicTrans2 AI models... This may take a few minutes on first run."): # Load English to Indic model logger.info("Loading English to Indic model...") _self.en_indic_tokenizer = AutoTokenizer.from_pretrained( "ai4bharat/indictrans2-en-indic-1B", trust_remote_code=True ) _self.en_indic_model = AutoModelForSeq2SeqLM.from_pretrained( "ai4bharat/indictrans2-en-indic-1B", trust_remote_code=True, torch_dtype=torch.float16 if _self.device == "cuda" else torch.float32 ) _self.en_indic_model.to(_self.device) _self.en_indic_model.eval() # Load Indic to English model logger.info("Loading Indic to English model...") _self.indic_en_tokenizer = AutoTokenizer.from_pretrained( "ai4bharat/indictrans2-indic-en-1B", trust_remote_code=True ) _self.indic_en_model = AutoModelForSeq2SeqLM.from_pretrained( "ai4bharat/indictrans2-indic-en-1B", trust_remote_code=True, torch_dtype=torch.float16 if _self.device == "cuda" else torch.float32 ) _self.indic_en_model.to(_self.device) _self.indic_en_model.eval() logger.info("✅ Models loaded successfully!") return True except Exception as e: logger.error(f"❌ Error loading models: {e}") st.error(f"Failed to load AI models: {e}") return False def translate_text(self, text: str, source_lang: str, target_lang: str) -> Dict: """Translate text using real IndicTrans2 models""" try: logger.info(f"Translation request: '{text[:50]}...' from {source_lang} to {target_lang}") # Validate language codes if source_lang not in FLORES_CODES: logger.error(f"Unsupported source language: {source_lang}") return {"error": f"Unsupported source language: {source_lang}"} if target_lang not in FLORES_CODES: logger.error(f"Unsupported target language: {target_lang}") return {"error": f"Unsupported target language: {target_lang}"} if not self.load_models(): return {"error": "Failed to load translation models"} start_time = time.time() # Determine translation direction if source_lang == "en" and target_lang in FLORES_CODES: # English to Indic model = self.en_indic_model tokenizer = self.en_indic_tokenizer src_code = FLORES_CODES[source_lang] tgt_code = FLORES_CODES[target_lang] elif source_lang in FLORES_CODES and target_lang == "en": # Indic to English model = self.indic_en_model tokenizer = self.indic_en_tokenizer src_code = FLORES_CODES[source_lang] tgt_code = FLORES_CODES[target_lang] else: return {"error": f"Translation not supported: {source_lang} → {target_lang}"} # Prepare input text with correct IndicTrans2 format input_text = f"{src_code} {tgt_code} {text}" # Tokenize inputs = tokenizer( input_text, return_tensors="pt", padding=True, truncation=True, max_length=512 ).to(self.device) # Generate translation with torch.no_grad(): outputs = model.generate( **inputs, max_length=512, num_beams=4, length_penalty=0.6, early_stopping=True ) # Decode translation translation = tokenizer.decode(outputs[0], skip_special_tokens=True) # Calculate processing time processing_time = time.time() - start_time # Calculate confidence (simplified scoring) confidence = min(0.95, max(0.75, 1.0 - (processing_time / 10))) return { "translated_text": translation, "source_language": source_lang, "target_language": target_lang, "confidence_score": confidence, "processing_time": processing_time, "model_info": "IndicTrans2-1B by AI4Bharat" } except Exception as e: logger.error(f"Translation error: {e}") return {"error": f"Translation failed: {str(e)}"} # Initialize translation service @st.cache_resource def get_translation_service(): return IndicTrans2Service() def main(): """Main Streamlit application with real AI translation""" # Header st.title("🌐 Multi-Lingual Product Catalog Translator") st.markdown("### Powered by IndicTrans2 by AI4Bharat") # Real AI banner st.success(""" 🤖 **Real AI Translation** This version uses actual IndicTrans2 neural machine translation models (1B parameters) for state-of-the-art translation quality between English and Indian languages. ✨ Features: Neural translation • 15+ languages • High accuracy • GPU acceleration """) # Initialize translation service translator = get_translation_service() # Sidebar with st.sidebar: st.header("🎯 Translation Settings") # Language selection source_lang = st.selectbox( "Source Language", options=list(SUPPORTED_LANGUAGES.keys()), format_func=lambda x: f"{SUPPORTED_LANGUAGES[x]} ({x})", index=0 # Default to English ) target_lang = st.selectbox( "Target Language", options=list(SUPPORTED_LANGUAGES.keys()), format_func=lambda x: f"{SUPPORTED_LANGUAGES[x]} ({x})", index=1 # Default to Hindi ) st.info(f"🔄 Translating: {SUPPORTED_LANGUAGES[source_lang]} → {SUPPORTED_LANGUAGES[target_lang]}") # Model info st.header("🤖 AI Model Info") st.markdown(""" **Model**: IndicTrans2-1B **Developer**: AI4Bharat **Parameters**: 1 Billion **Type**: Neural Machine Translation **Specialization**: Indian Languages """) # Main content col1, col2 = st.columns(2) with col1: st.header("📝 Product Details") # Product form product_name = st.text_input( "Product Name", placeholder="e.g., Wireless Bluetooth Headphones" ) product_description = st.text_area( "Product Description", placeholder="e.g., Premium quality headphones with noise cancellation...", height=100 ) product_features = st.text_area( "Key Features", placeholder="e.g., Long battery life, comfortable fit, premium sound quality", height=80 ) # Translation button if st.button("🚀 Translate with AI", type="primary", use_container_width=True): if product_name or product_description or product_features: with st.spinner("🤖 AI translation in progress..."): translations = {} # Translate each field if product_name: result = translator.translate_text(product_name, source_lang, target_lang) translations["name"] = result if product_description: result = translator.translate_text(product_description, source_lang, target_lang) translations["description"] = result if product_features: result = translator.translate_text(product_features, source_lang, target_lang) translations["features"] = result # Store in session state st.session_state.translations = translations else: st.warning("⚠️ Please enter at least one product detail to translate.") with col2: st.header("🎯 AI Translation Results") if hasattr(st.session_state, 'translations') and st.session_state.translations: translations = st.session_state.translations # Display translations for field, result in translations.items(): if "error" not in result: st.markdown(f"**{field.title()}:**") st.success(result.get("translated_text", "")) # Show confidence and timing col_conf, col_time = st.columns(2) with col_conf: confidence = result.get("confidence_score", 0) st.metric("Confidence", f"{confidence:.1%}") with col_time: time_taken = result.get("processing_time", 0) st.metric("Time", f"{time_taken:.1f}s") else: st.error(f"Translation error for {field}: {result['error']}") # Export option if st.button("📥 Export Translations", use_container_width=True): export_data = {} for field, result in translations.items(): if "error" not in result: export_data[f"{field}_original"] = st.session_state.get(f"original_{field}", "") export_data[f"{field}_translated"] = result.get("translated_text", "") st.download_button( label="Download as JSON", data=str(export_data), file_name=f"translation_{source_lang}_{target_lang}.json", mime="application/json" ) else: st.info("👆 Enter product details and click translate to see AI-powered results") # Statistics st.header("📊 Translation Analytics") col1, col2, col3, col4 = st.columns(4) with col1: st.metric("Languages Supported", "15+") with col2: st.metric("Model Parameters", "1B") with col3: st.metric("Translation Quality", "State-of-art") with col4: device_type = "GPU" if torch.cuda.is_available() else "CPU" st.metric("Processing", device_type) # Footer st.markdown("---") st.markdown("""
🤖 Powered by IndicTrans2 by AI4Bharat
🚀 Deployed on Hugging Face Spaces with real neural machine translation