# Source: Hugging Face Spaces page for "Athena1621/Translation_app_" (Space status when captured: Sleeping)
# File size: 14,147 Bytes
# Revision: 67f25fb

# Real AI-Powered Multi-Lingual Product Catalog Translator
# Hugging Face Spaces Deployment with IndicTrans2

import json
import logging
import os
import sys
import time
import warnings
from typing import Dict, List, Optional

import streamlit as st
import torch

# Suppress warnings
# Only UserWarning and FutureWarning are silenced; other categories still surface.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set environment variable for model type
# setdefault keeps any value already provided by the environment.
os.environ.setdefault("MODEL_TYPE", "indictrans2")
os.environ.setdefault("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")

# transformers is optional at import time: the flag below is checked before
# any model is loaded, so the UI can still start and report a clear error.
try:
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    # There is no mock translator in this file; without transformers every
    # translation request ends in an error, so say exactly that.
    logger.warning("Transformers not available; translation requests will fail until it is installed")

# Streamlit page config
# Sets the browser-tab title and icon, uses the wide layout, and starts with
# the sidebar expanded.
# NOTE(review): Streamlit is understood to require this call before other
# st.* UI calls — keep it ahead of any UI code; confirm against the docs.
st.set_page_config(
    page_title="Multi-Lingual Catalog Translator - Real AI",
    page_icon="🌐",
    layout="wide",
    initial_sidebar_state="expanded"
)

# One row per language offered by the app:
# (short language code, display name, Flores code passed to IndicTrans2).
# Row order is the order shown in the language drop-downs.
_LANGUAGE_TABLE = (
    ("en", "English", "eng_Latn"),
    ("hi", "Hindi", "hin_Deva"),
    ("bn", "Bengali", "ben_Beng"),
    ("gu", "Gujarati", "guj_Gujr"),
    ("kn", "Kannada", "kan_Knda"),
    ("ml", "Malayalam", "mal_Mlym"),
    ("mr", "Marathi", "mar_Deva"),
    ("or", "Odia", "ory_Orya"),
    ("pa", "Punjabi", "pan_Guru"),
    ("ta", "Tamil", "tam_Taml"),
    ("te", "Telugu", "tel_Telu"),
    ("ur", "Urdu", "urd_Arab"),
    ("as", "Assamese", "asm_Beng"),
    ("ne", "Nepali", "npi_Deva"),
    ("sa", "Sanskrit", "san_Deva"),
)

# Language mappings for IndicTrans2 (short code -> display name)
SUPPORTED_LANGUAGES = {code: label for code, label, _ in _LANGUAGE_TABLE}

# Flores language codes for IndicTrans2 (short code -> Flores code)
FLORES_CODES = {code: flores for code, _, flores in _LANGUAGE_TABLE}

class IndicTrans2Service:
    """Real IndicTrans2 Translation Service for Hugging Face Spaces.

    Wraps two checkpoints: one for English -> Indic and one for
    Indic -> English. Pairs where neither side (or both sides) is English
    are rejected with an error result.
    """

    # Hub identifiers of the two direction-specific checkpoints.
    EN_INDIC_CHECKPOINT = "ai4bharat/indictrans2-en-indic-1B"
    INDIC_EN_CHECKPOINT = "ai4bharat/indictrans2-indic-en-1B"

    def __init__(self):
        self.en_indic_model = None
        self.indic_en_model = None
        self.en_indic_tokenizer = None
        self.indic_en_tokenizer = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

    @staticmethod
    def _load_checkpoint(checkpoint: str, device: str):
        """Load one checkpoint and return ``(tokenizer, model)`` ready for inference."""
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
        model = AutoModelForSeq2SeqLM.from_pretrained(
            checkpoint,
            trust_remote_code=True,
            # Half precision only on GPU; CPU stays in float32.
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        )
        model.to(device)
        model.eval()
        return tokenizer, model

    @staticmethod
    @st.cache_resource
    def _load_shared_models(device: str):
        """Load both checkpoints once per process and cache the objects themselves.

        Any failure propagates as an exception, so a failed attempt is not
        stored in the cache and the next call tries again.

        Returns:
            Dict with keys ``"en_indic"`` and ``"indic_en"``, each holding a
            ``(tokenizer, model)`` tuple.
        """
        with st.spinner("🔄 Loading IndicTrans2 AI models... This may take a few minutes on first run."):
            # Load English to Indic model
            logger.info("Loading English to Indic model...")
            en_indic = IndicTrans2Service._load_checkpoint(
                IndicTrans2Service.EN_INDIC_CHECKPOINT, device
            )

            # Load Indic to English model
            logger.info("Loading Indic to English model...")
            indic_en = IndicTrans2Service._load_checkpoint(
                IndicTrans2Service.INDIC_EN_CHECKPOINT, device
            )

        logger.info("✅ Models loaded successfully!")
        return {"en_indic": en_indic, "indic_en": indic_en}

    def _models_ready(self) -> bool:
        """Return True when both models and both tokenizers are attached to this instance."""
        return None not in (
            self.en_indic_model,
            self.en_indic_tokenizer,
            self.indic_en_model,
            self.indic_en_tokenizer,
        )

    def load_models(self):
        """Load IndicTrans2 models with caching.

        The heavy objects are cached by ``_load_shared_models``; this method
        only attaches them to the instance. It is safe to call repeatedly.

        Returns:
            True when the models are available on this instance, False when
            the transformers library is missing or loading failed (the
            failure is also shown in the UI via ``st.error``).
        """
        if self._models_ready():
            return True

        if not TRANSFORMERS_AVAILABLE:
            logger.error("Transformers library not available")
            return False

        try:
            bundle = self._load_shared_models(self.device)
        except Exception as e:
            # Top-level boundary for model loading: report and let the caller
            # turn this into an error result instead of crashing the app.
            logger.exception("❌ Error loading models: %s", e)
            st.error(f"Failed to load AI models: {e}")
            return False

        self.en_indic_tokenizer, self.en_indic_model = bundle["en_indic"]
        self.indic_en_tokenizer, self.indic_en_model = bundle["indic_en"]
        return True

    def translate_text(self, text: str, source_lang: str, target_lang: str) -> Dict:
        """Translate text using real IndicTrans2 models.

        Args:
            text: Text to translate. Input is truncated to 512 tokens.
            source_lang: Short language code; must be a key of FLORES_CODES.
            target_lang: Short language code; must be a key of FLORES_CODES.

        Returns:
            On success, a dict with ``translated_text``, ``source_language``,
            ``target_language``, ``confidence_score``, ``processing_time``
            and ``model_info``. On any failure, a dict with a single
            ``error`` key. This method does not raise.
        """
        try:
            if not text or not text.strip():
                return {"error": "No text provided for translation"}

            logger.info(f"Translation request: '{text[:50]}...' from {source_lang} to {target_lang}")

            # Validate language codes
            if source_lang not in FLORES_CODES:
                logger.error(f"Unsupported source language: {source_lang}")
                return {"error": f"Unsupported source language: {source_lang}"}
            if target_lang not in FLORES_CODES:
                logger.error(f"Unsupported target language: {target_lang}")
                return {"error": f"Unsupported target language: {target_lang}"}

            # Determine translation direction before touching the models, so
            # an unsupported pair never triggers the expensive model load.
            # Exactly one side must be English; en -> en is rejected too.
            if source_lang == "en" and target_lang != "en":
                english_is_source = True
            elif source_lang != "en" and target_lang == "en":
                english_is_source = False
            else:
                return {"error": f"Translation not supported: {source_lang} → {target_lang}"}

            if not self.load_models():
                return {"error": "Failed to load translation models"}

            if english_is_source:
                # English to Indic
                model = self.en_indic_model
                tokenizer = self.en_indic_tokenizer
            else:
                # Indic to English
                model = self.indic_en_model
                tokenizer = self.indic_en_tokenizer

            src_code = FLORES_CODES[source_lang]
            tgt_code = FLORES_CODES[target_lang]

            # Timing starts after model loading, so it covers inference only.
            start_time = time.time()

            # Prepare input text with correct IndicTrans2 format
            # NOTE(review): the IndicTrans2 model card appears to route text
            # through IndicTransToolkit's IndicProcessor before tokenizing and
            # after decoding; here only the language tags are prepended.
            # Confirm output script/quality, especially for non-Devanagari targets.
            input_text = f"{src_code} {tgt_code} {text}"

            # Tokenize
            inputs = tokenizer(
                input_text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(self.device)

            # Generate translation
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_length=512,
                    num_beams=4,
                    length_penalty=0.6,
                    early_stopping=True
                )

            # Decode translation
            translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Calculate processing time
            processing_time = time.time() - start_time

            # Calculate confidence (simplified scoring)
            # This is a latency-based placeholder clamped to [0.75, 0.95];
            # it is not derived from model probabilities.
            confidence = min(0.95, max(0.75, 1.0 - (processing_time / 10)))

            return {
                "translated_text": translation,
                "source_language": source_lang,
                "target_language": target_lang,
                "confidence_score": confidence,
                "processing_time": processing_time,
                "model_info": "IndicTrans2-1B by AI4Bharat"
            }

        except Exception as e:
            # Boundary for the UI: keep the traceback in the log and hand the
            # caller an error result instead of raising.
            logger.exception("Translation error: %s", e)
            return {"error": f"Translation failed: {str(e)}"}

# Shared translation service
@st.cache_resource
def get_translation_service():
    """Return the IndicTrans2Service instance held in Streamlit's resource cache."""
    service = IndicTrans2Service()
    return service

def main():
    """Main Streamlit application with real AI translation.

    Renders the language selectors in the sidebar, a product form on the
    left, and translation results with a JSON export on the right.
    Translation results, the original texts and the language pair used are
    kept in ``st.session_state`` so they survive Streamlit reruns.
    """

    # Header
    st.title("🌐 Multi-Lingual Product Catalog Translator")
    st.markdown("### Powered by IndicTrans2 by AI4Bharat")

    # Real AI banner
    st.success("""
    🤖 **Real AI Translation**
    
    This version uses actual IndicTrans2 neural machine translation models (1B parameters) 
    for state-of-the-art translation quality between English and Indian languages.
    
    ✨ Features: Neural translation • 15+ languages • High accuracy • GPU acceleration
    """)

    # Initialize translation service
    translator = get_translation_service()

    # Sidebar
    with st.sidebar:
        st.header("🎯 Translation Settings")

        # Language selection
        source_lang = st.selectbox(
            "Source Language",
            options=list(SUPPORTED_LANGUAGES.keys()),
            format_func=lambda x: f"{SUPPORTED_LANGUAGES[x]} ({x})",
            index=0  # Default to English
        )

        target_lang = st.selectbox(
            "Target Language", 
            options=list(SUPPORTED_LANGUAGES.keys()),
            format_func=lambda x: f"{SUPPORTED_LANGUAGES[x]} ({x})",
            index=1  # Default to Hindi
        )

        st.info(f"🔄 Translating: {SUPPORTED_LANGUAGES[source_lang]} → {SUPPORTED_LANGUAGES[target_lang]}")

        # Model info
        st.header("🤖 AI Model Info")
        st.markdown("""
        **Model**: IndicTrans2-1B  
        **Developer**: AI4Bharat  
        **Parameters**: 1 Billion  
        **Type**: Neural Machine Translation  
        **Specialization**: Indian Languages
        """)

    # Main content
    col1, col2 = st.columns(2)

    with col1:
        st.header("📝 Product Details")

        # Product form
        product_name = st.text_input(
            "Product Name",
            placeholder="e.g., Wireless Bluetooth Headphones"
        )

        product_description = st.text_area(
            "Product Description", 
            placeholder="e.g., Premium quality headphones with noise cancellation...",
            height=100
        )

        product_features = st.text_area(
            "Key Features",
            placeholder="e.g., Long battery life, comfortable fit, premium sound quality",
            height=80
        )

        # Field key -> text entered; the order here is the display/export order.
        form_fields = (
            ("name", product_name),
            ("description", product_description),
            ("features", product_features),
        )

        # Translation button
        if st.button("🚀 Translate with AI", type="primary", use_container_width=True):
            # Only fields the user actually filled in are translated.
            pending = {field: value for field, value in form_fields if value}
            if pending:
                with st.spinner("🤖 AI translation in progress..."):
                    # Translate each field
                    translations = {
                        field: translator.translate_text(value, source_lang, target_lang)
                        for field, value in pending.items()
                    }

                    # Store in session state
                    st.session_state.translations = translations
                    # The export reads "original_<field>"; store the source
                    # text here (and blank out fields not in this run) so the
                    # export pairs each translation with the text it came from.
                    for field, _ in form_fields:
                        st.session_state[f"original_{field}"] = pending.get(field, "")
                    # Remember which pair produced these results; the sidebar
                    # selection may change before the user exports.
                    st.session_state.translation_pair = (source_lang, target_lang)
            else:
                st.warning("⚠️ Please enter at least one product detail to translate.")

    with col2:
        st.header("🎯 AI Translation Results")

        translations = st.session_state.get("translations")
        if translations:
            # Display translations
            for field, result in translations.items():
                if "error" not in result:
                    st.markdown(f"**{field.title()}:**")
                    st.success(result.get("translated_text", ""))

                    # Show confidence and timing
                    col_conf, col_time = st.columns(2)
                    with col_conf:
                        confidence = result.get("confidence_score", 0)
                        st.metric("Confidence", f"{confidence:.1%}")
                    with col_time:
                        time_taken = result.get("processing_time", 0)
                        st.metric("Time", f"{time_taken:.1f}s")
                else:
                    st.error(f"Translation error for {field}: {result['error']}")

            # Export option
            if st.button("📥 Export Translations", use_container_width=True):
                export_data = {}
                for field, result in translations.items():
                    if "error" not in result:
                        export_data[f"{field}_original"] = st.session_state.get(f"original_{field}", "")
                        export_data[f"{field}_translated"] = result.get("translated_text", "")

                # Fall back to the current selection if no pair was stored.
                export_src, export_tgt = st.session_state.get(
                    "translation_pair", (source_lang, target_lang)
                )

                st.download_button(
                    label="Download as JSON",
                    # Real JSON (str(dict) would emit a Python repr); keep
                    # non-ASCII translations readable instead of \u-escaped.
                    data=json.dumps(export_data, ensure_ascii=False, indent=2),
                    file_name=f"translation_{export_src}_{export_tgt}.json",
                    mime="application/json"
                )
        else:
            st.info("👆 Enter product details and click translate to see AI-powered results")

    # Statistics
    st.header("📊 Translation Analytics")
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric("Languages Supported", "15+")
    with col2:
        st.metric("Model Parameters", "1B")
    with col3:
        st.metric("Translation Quality", "State-of-art")
    with col4:
        device_type = "GPU" if torch.cuda.is_available() else "CPU"
        st.metric("Processing", device_type)

    # Footer
    st.markdown("---")
    st.markdown("""
    <div style='text-align: center'>
        <p>🤖 Powered by <strong>IndicTrans2</strong> by <strong>AI4Bharat</strong></p>
        <p>🚀 Deployed on <strong>Hugging Face Spaces</strong> with real neural machine translation</p>
    </div>
    """, unsafe_allow_html=True)

# Run the app when this file is executed as a script (not when it is imported).
if __name__ == "__main__":
    main()