Athena1621's picture
feat: Implement Multi-Lingual Product Catalog Translator frontend with Streamlit
67f25fb
# Real AI-Powered Multi-Lingual Product Catalog Translator
# Hugging Face Spaces Deployment with IndicTrans2
import json
import logging
import os
import sys
import time
import warnings
from typing import Dict, List, Optional

import streamlit as st
import torch
# Silence the two warning categories that would otherwise flood the app logs.
for _noisy_category in (UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_noisy_category)

# Module-wide logger; INFO is requested for the root logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Publish defaults for the model type and compute device. setdefault() leaves
# any value already present in the environment untouched.
_default_device = "cuda" if torch.cuda.is_available() else "cpu"
os.environ.setdefault("MODEL_TYPE", "indictrans2")
os.environ.setdefault("DEVICE", _default_device)

# transformers is optional at import time; TRANSFORMERS_AVAILABLE records
# whether the model classes could be imported.
# NOTE(review): the warning mentions a "mock mode", but no mock code path is
# visible in this file - confirm whether one exists elsewhere.
try:
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    logger.warning("Transformers not available, falling back to mock mode")
else:
    TRANSFORMERS_AVAILABLE = True
# Streamlit page config: browser-tab title/icon, wide layout, sidebar opened.
_PAGE_SETTINGS = {
    "page_title": "Multi-Lingual Catalog Translator - Real AI",
    "page_icon": "🌐",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)
# Language mappings for IndicTrans2: short language code -> display name.
# Order matters: the language pickers default to positions 0 (English) and
# 1 (Hindi).
_LANGUAGE_NAMES = (
    ("en", "English"),
    ("hi", "Hindi"),
    ("bn", "Bengali"),
    ("gu", "Gujarati"),
    ("kn", "Kannada"),
    ("ml", "Malayalam"),
    ("mr", "Marathi"),
    ("or", "Odia"),
    ("pa", "Punjabi"),
    ("ta", "Tamil"),
    ("te", "Telugu"),
    ("ur", "Urdu"),
    ("as", "Assamese"),
    ("ne", "Nepali"),
    ("sa", "Sanskrit"),
)
SUPPORTED_LANGUAGES = dict(_LANGUAGE_NAMES)
# Flores language codes for IndicTrans2: short language code -> tag of the
# form "<language>_<script>" (e.g. "hin_Deva"), which is what gets prepended
# to the text handed to the model.
_FLORES_PARTS = (
    ("en", "eng", "Latn"),
    ("hi", "hin", "Deva"),
    ("bn", "ben", "Beng"),
    ("gu", "guj", "Gujr"),
    ("kn", "kan", "Knda"),
    ("ml", "mal", "Mlym"),
    ("mr", "mar", "Deva"),
    ("or", "ory", "Orya"),
    ("pa", "pan", "Guru"),
    ("ta", "tam", "Taml"),
    ("te", "tel", "Telu"),
    ("ur", "urd", "Arab"),
    ("as", "asm", "Beng"),
    ("ne", "npi", "Deva"),
    ("sa", "san", "Deva"),
)
FLORES_CODES = {
    short_code: f"{language}_{script}"
    for short_code, language, script in _FLORES_PARTS
}
class IndicTrans2Service:
    """Real IndicTrans2 Translation Service for Hugging Face Spaces.

    Two checkpoints are used, one per direction (English -> Indic and
    Indic -> English). Translation is only offered when exactly one side of
    the pair is English; Indic <-> Indic pairs are rejected.
    """

    EN_INDIC_CHECKPOINT = "ai4bharat/indictrans2-en-indic-1B"
    INDIC_EN_CHECKPOINT = "ai4bharat/indictrans2-indic-en-1B"

    def __init__(self):
        # Models and tokenizers are populated lazily by load_models().
        self.en_indic_model = None
        self.indic_en_model = None
        self.en_indic_tokenizer = None
        self.indic_en_tokenizer = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

    @staticmethod
    @st.cache_resource(show_spinner=False)
    def _load_checkpoint(checkpoint: str, device: str):
        """Load one tokenizer/model pair, cached per (checkpoint, device).

        The cached function *returns* the resources instead of mutating an
        instance, so a cache hit still hands the objects to the caller.
        Failures propagate as exceptions, which Streamlit does not cache, so
        a later call can retry.

        Returns:
            A ``(tokenizer, model)`` tuple with the model moved to ``device``
            and switched to eval mode.
        """
        tokenizer = AutoTokenizer.from_pretrained(
            checkpoint,
            trust_remote_code=True,
        )
        model = AutoModelForSeq2SeqLM.from_pretrained(
            checkpoint,
            trust_remote_code=True,
            # Half precision only on GPU; CPU stays in float32.
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        )
        model.to(device)
        model.eval()
        return tokenizer, model

    def _models_ready(self) -> bool:
        """Return True when both directions have a model and a tokenizer."""
        return None not in (
            self.en_indic_model,
            self.en_indic_tokenizer,
            self.indic_en_model,
            self.indic_en_tokenizer,
        )

    def load_models(self) -> bool:
        """Load IndicTrans2 models with caching.

        Returns:
            True when both translation directions are ready to use, False
            when ``transformers`` is missing or loading failed (the failure
            is logged and shown in the UI). A failed attempt is not
            remembered, so calling again retries the load.
        """
        if self._models_ready():
            return True
        if not TRANSFORMERS_AVAILABLE:
            logger.error("Transformers library not available")
            return False
        try:
            with st.spinner("🔄 Loading IndicTrans2 AI models... This may take a few minutes on first run."):
                # Load English to Indic model
                logger.info("Loading English to Indic model...")
                self.en_indic_tokenizer, self.en_indic_model = self._load_checkpoint(
                    self.EN_INDIC_CHECKPOINT, self.device
                )
                # Load Indic to English model
                logger.info("Loading Indic to English model...")
                self.indic_en_tokenizer, self.indic_en_model = self._load_checkpoint(
                    self.INDIC_EN_CHECKPOINT, self.device
                )
            logger.info("✅ Models loaded successfully!")
            return True
        except Exception as e:
            # UI boundary: report the problem instead of crashing the app.
            logger.exception(f"❌ Error loading models: {e}")
            st.error(f"Failed to load AI models: {e}")
            return False

    def translate_text(self, text: str, source_lang: str, target_lang: str) -> Dict:
        """Translate text using real IndicTrans2 models.

        Args:
            text: Text to translate; must contain non-whitespace characters.
            source_lang: Short code of the input language (key of FLORES_CODES).
            target_lang: Short code of the output language (key of FLORES_CODES).

        Returns:
            On success, a dict with ``translated_text``, ``source_language``,
            ``target_language``, ``confidence_score``, ``processing_time``
            (seconds) and ``model_info``. On any failure, a dict with a
            single ``error`` message. This method does not raise.
        """
        try:
            if not isinstance(text, str) or not text.strip():
                return {"error": "No text provided for translation"}
            logger.info(f"Translation request: '{text[:50]}...' from {source_lang} to {target_lang}")

            # Validate language codes
            if source_lang not in FLORES_CODES:
                logger.error(f"Unsupported source language: {source_lang}")
                return {"error": f"Unsupported source language: {source_lang}"}
            if target_lang not in FLORES_CODES:
                logger.error(f"Unsupported target language: {target_lang}")
                return {"error": f"Unsupported target language: {target_lang}"}

            # Validate the direction BEFORE touching the (multi-GB) models so
            # unsupported pairs fail fast.
            if source_lang == target_lang:
                return {"error": "Source and target languages must be different"}
            if "en" not in (source_lang, target_lang):
                return {"error": f"Translation not supported: {source_lang} → {target_lang}"}

            if not self.load_models():
                return {"error": "Failed to load translation models"}

            start_time = time.perf_counter()

            # Exactly one side is English at this point.
            if source_lang == "en":
                model, tokenizer = self.en_indic_model, self.en_indic_tokenizer
            else:
                model, tokenizer = self.indic_en_model, self.indic_en_tokenizer
            src_code = FLORES_CODES[source_lang]
            tgt_code = FLORES_CODES[target_lang]

            # Prepare input text with correct IndicTrans2 format
            input_text = f"{src_code} {tgt_code} {text}"

            # Tokenize
            inputs = tokenizer(
                input_text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            ).to(self.device)

            # Generate translation
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_length=512,
                    num_beams=4,
                    length_penalty=0.6,
                    early_stopping=True,
                )

            # Decode translation
            translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

            processing_time = time.perf_counter() - start_time

            # Placeholder score derived from latency only, clamped to
            # [0.75, 0.95]; it is NOT a model probability.
            confidence = min(0.95, max(0.75, 1.0 - (processing_time / 10)))

            return {
                "translated_text": translation,
                "source_language": source_lang,
                "target_language": target_lang,
                "confidence_score": confidence,
                "processing_time": processing_time,
                "model_info": "IndicTrans2-1B by AI4Bharat",
            }
        except Exception as e:
            # UI boundary: convert any failure into an error payload.
            logger.exception(f"Translation error: {e}")
            return {"error": f"Translation failed: {str(e)}"}
# Initialize translation service
@st.cache_resource
def get_translation_service():
    """Build the IndicTrans2Service used by the app.

    Wrapped in ``st.cache_resource``, so the constructed service object is
    cached and handed back on subsequent calls instead of being rebuilt.
    """
    service = IndicTrans2Service()
    return service
# Static page copy, kept at module level so the triple-quoted text is not
# affected by function indentation.
_AI_BANNER_MARKDOWN = """
🤖 **Real AI Translation**
This version uses actual IndicTrans2 neural machine translation models (1B parameters)
for state-of-the-art translation quality between English and Indian languages.
✨ Features: Neural translation • 15+ languages • High accuracy • GPU acceleration
"""

_MODEL_INFO_MARKDOWN = """
**Model**: IndicTrans2-1B
**Developer**: AI4Bharat
**Parameters**: 1 Billion
**Type**: Neural Machine Translation
**Specialization**: Indian Languages
"""

_FOOTER_HTML = """
<div style='text-align: center'>
<p>🤖 Powered by <strong>IndicTrans2</strong> by <strong>AI4Bharat</strong></p>
<p>🚀 Deployed on <strong>Hugging Face Spaces</strong> with real neural machine translation</p>
</div>
"""


def build_export_json(translations, originals) -> str:
    """Serialize the successful translations as a JSON document.

    Args:
        translations: Mapping of field name -> result dict as produced by
            ``translate_text``; entries containing an ``error`` key are
            skipped.
        originals: Mapping that may hold the source text of each field under
            the key ``original_<field>``; missing keys export as "".

    Returns:
        Pretty-printed JSON text with ``<field>_original`` and
        ``<field>_translated`` entries. Non-ASCII characters are written
        as-is so translated text stays readable.
    """
    export_data = {}
    for field, result in translations.items():
        if "error" in result:
            continue
        export_data[f"{field}_original"] = originals.get(f"original_{field}", "")
        export_data[f"{field}_translated"] = result.get("translated_text", "")
    return json.dumps(export_data, ensure_ascii=False, indent=2)


def _language_label(code: str) -> str:
    """Format a language code for the pickers, e.g. ``Hindi (hi)``."""
    return f"{SUPPORTED_LANGUAGES[code]} ({code})"


def _render_sidebar():
    """Draw the sidebar and return the chosen ``(source_lang, target_lang)``."""
    language_codes = list(SUPPORTED_LANGUAGES)
    with st.sidebar:
        st.header("🎯 Translation Settings")
        source_lang = st.selectbox(
            "Source Language",
            options=language_codes,
            format_func=_language_label,
            index=0,  # Default to English
        )
        target_lang = st.selectbox(
            "Target Language",
            options=language_codes,
            format_func=_language_label,
            index=1,  # Default to Hindi
        )
        st.info(
            f"🔄 Translating: {SUPPORTED_LANGUAGES[source_lang]} → {SUPPORTED_LANGUAGES[target_lang]}"
        )
        st.header("🤖 AI Model Info")
        st.markdown(_MODEL_INFO_MARKDOWN)
    return source_lang, target_lang


def _render_product_form(translator, source_lang: str, target_lang: str) -> None:
    """Draw the input form and, on click, translate every non-empty field.

    Results are stored in ``st.session_state.translations``; the text that
    was translated is stored under ``original_<field>`` so the export can
    include it.
    """
    st.header("📝 Product Details")
    fields = {
        "name": st.text_input(
            "Product Name",
            placeholder="e.g., Wireless Bluetooth Headphones",
        ),
        "description": st.text_area(
            "Product Description",
            placeholder="e.g., Premium quality headphones with noise cancellation...",
            height=100,
        ),
        "features": st.text_area(
            "Key Features",
            placeholder="e.g., Long battery life, comfortable fit, premium sound quality",
            height=80,
        ),
    }

    if not st.button("🚀 Translate with AI", type="primary", use_container_width=True):
        return

    entered = {field: text for field, text in fields.items() if text}
    if not entered:
        st.warning("⚠️ Please enter at least one product detail to translate.")
        return

    with st.spinner("🤖 AI translation in progress..."):
        translations = {}
        for field, text in entered.items():
            translations[field] = translator.translate_text(text, source_lang, target_lang)
            # Remember the source text; the export reads it back later.
            st.session_state[f"original_{field}"] = text
    st.session_state.translations = translations


def _render_results(source_lang: str, target_lang: str) -> None:
    """Show stored translations (or a hint) and offer a JSON export."""
    st.header("🎯 AI Translation Results")
    translations = st.session_state.get("translations")
    if not translations:
        st.info("👆 Enter product details and click translate to see AI-powered results")
        return

    for field, result in translations.items():
        if "error" in result:
            st.error(f"Translation error for {field}: {result['error']}")
            continue
        st.markdown(f"**{field.title()}:**")
        st.success(result.get("translated_text", ""))
        # Show confidence and timing
        col_conf, col_time = st.columns(2)
        with col_conf:
            st.metric("Confidence", f"{result.get('confidence_score', 0):.1%}")
        with col_time:
            st.metric("Time", f"{result.get('processing_time', 0):.1f}s")

    if st.button("📥 Export Translations", use_container_width=True):
        st.download_button(
            label="Download as JSON",
            # Real JSON, not str(dict): the file is served as application/json.
            data=build_export_json(translations, st.session_state),
            file_name=f"translation_{source_lang}_{target_lang}.json",
            mime="application/json",
        )


def _render_statistics() -> None:
    """Draw the four headline metrics below the main columns."""
    st.header("📊 Translation Analytics")
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Languages Supported", "15+")
    with col2:
        st.metric("Model Parameters", "1B")
    with col3:
        st.metric("Translation Quality", "State-of-art")
    with col4:
        device_type = "GPU" if torch.cuda.is_available() else "CPU"
        st.metric("Processing", device_type)


def main():
    """Main Streamlit application with real AI translation.

    Renders the header, sidebar language pickers, the product form with its
    results column, summary metrics and the footer.
    """
    st.title("🌐 Multi-Lingual Product Catalog Translator")
    st.markdown("### Powered by IndicTrans2 by AI4Bharat")
    st.success(_AI_BANNER_MARKDOWN)

    translator = get_translation_service()
    source_lang, target_lang = _render_sidebar()

    input_col, results_col = st.columns(2)
    with input_col:
        _render_product_form(translator, source_lang, target_lang)
    with results_col:
        _render_results(source_lang, target_lang)

    _render_statistics()

    st.markdown("---")
    st.markdown(_FOOTER_HTML, unsafe_allow_html=True)
# Run the app only when this file is executed as the main script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()