Spaces:

Athena1621
/

Translation_app_

Sleeping

App Files Files Community

Translation_app_ / app.py

Athena1621

feat: Implement Multi-Lingual Product Catalog Translator frontend with Streamlit

67f25fb 6 months ago

raw

history blame contribute delete

14.1 kB

	# Real AI-Powered Multi-Lingual Product Catalog Translator
	# Hugging Face Spaces Deployment with IndicTrans2

	import streamlit as st
	import os
	import sys
	import torch
	import logging
	from typing import Dict, List, Optional
	import time
	import warnings

	# Silence noisy library warnings (user-facing and deprecation notices)
	for _ignored_category in (UserWarning, FutureWarning):
	    warnings.filterwarnings("ignore", category=_ignored_category)

	# Module-wide logger, INFO level and above
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Publish runtime defaults through the environment; values that are
	# already set are left untouched by setdefault.
	_default_device = "cuda" if torch.cuda.is_available() else "cpu"
	os.environ.setdefault("MODEL_TYPE", "indictrans2")
	os.environ.setdefault("DEVICE", _default_device)

	# Optional dependency: record whether the Hugging Face transformers
	# classes could be imported so the rest of the app can degrade gracefully.
	try:
	    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
	except ImportError:
	    TRANSFORMERS_AVAILABLE = False
	    logger.warning("Transformers not available, falling back to mock mode")
	else:
	    TRANSFORMERS_AVAILABLE = True

	# Streamlit page config: browser tab title/icon, wide layout, sidebar open
	_PAGE_SETTINGS = {
	    "page_title": "Multi-Lingual Catalog Translator - Real AI",
	    "page_icon": "🌐",
	    "layout": "wide",
	    "initial_sidebar_state": "expanded",
	}
	st.set_page_config(**_PAGE_SETTINGS)

	# Two-letter language code -> display name shown in the UI.
	# Insertion order matters: the selectboxes default to index 0 (English)
	# and index 1 (Hindi).
	SUPPORTED_LANGUAGES = dict([
	    ("en", "English"),
	    ("hi", "Hindi"),
	    ("bn", "Bengali"),
	    ("gu", "Gujarati"),
	    ("kn", "Kannada"),
	    ("ml", "Malayalam"),
	    ("mr", "Marathi"),
	    ("or", "Odia"),
	    ("pa", "Punjabi"),
	    ("ta", "Tamil"),
	    ("te", "Telugu"),
	    ("ur", "Urdu"),
	    ("as", "Assamese"),
	    ("ne", "Nepali"),
	    ("sa", "Sanskrit"),
	])

	# Two-letter language code -> FLORES-style "<lang>_<Script>" tag that is
	# prepended to the text handed to the IndicTrans2 tokenizer.
	FLORES_CODES = dict((
	    ("en", "eng_Latn"),
	    ("hi", "hin_Deva"),
	    ("bn", "ben_Beng"),
	    ("gu", "guj_Gujr"),
	    ("kn", "kan_Knda"),
	    ("ml", "mal_Mlym"),
	    ("mr", "mar_Deva"),
	    ("or", "ory_Orya"),
	    ("pa", "pan_Guru"),
	    ("ta", "tam_Taml"),
	    ("te", "tel_Telu"),
	    ("ur", "urd_Arab"),
	    ("as", "asm_Beng"),
	    ("ne", "npi_Deva"),
	    ("sa", "san_Deva"),
	))

	class IndicTrans2Service:
	    """Translate between English and the languages listed in FLORES_CODES.

	    Two IndicTrans2 checkpoints are used: one for English -> Indic and one
	    for Indic -> English. Direct Indic -> Indic translation and
	    same-language requests are rejected with an error result.
	    """

	    # Hugging Face checkpoint identifiers for the two translation directions.
	    EN_INDIC_MODEL_NAME = "ai4bharat/indictrans2-en-indic-1B"
	    INDIC_EN_MODEL_NAME = "ai4bharat/indictrans2-indic-en-1B"

	    def __init__(self):
	        """Create an empty service; models are loaded lazily by load_models()."""
	        self.en_indic_model = None
	        self.indic_en_model = None
	        self.en_indic_tokenizer = None
	        self.indic_en_tokenizer = None
	        self.device = "cuda" if torch.cuda.is_available() else "cpu"
	        logger.info(f"Using device: {self.device}")

	    @staticmethod
	    @st.cache_resource
	    def _load_pair(model_name: str, device: str):
	        """Load one (tokenizer, model) pair, cached per (model_name, device).

	        Failures are raised rather than returned, so a transient download
	        error is not stored in the cache and the next call retries.
	        """
	        tokenizer = AutoTokenizer.from_pretrained(
	            model_name,
	            trust_remote_code=True,
	        )
	        model = AutoModelForSeq2SeqLM.from_pretrained(
	            model_name,
	            trust_remote_code=True,
	            # Half precision only on GPU; CPU stays in float32.
	            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
	        )
	        model.to(device)
	        model.eval()
	        return tokenizer, model

	    def load_models(self) -> bool:
	        """Ensure both translation models are attached to this instance.

	        Returns:
	            True when both models and tokenizers are available, False when
	            the transformers library is missing or loading failed (the
	            failure is logged and shown with st.error).
	        """
	        loaded_parts = (
	            self.en_indic_model,
	            self.en_indic_tokenizer,
	            self.indic_en_model,
	            self.indic_en_tokenizer,
	        )
	        if all(part is not None for part in loaded_parts):
	            return True

	        if not TRANSFORMERS_AVAILABLE:
	            logger.error("Transformers library not available")
	            return False

	        try:
	            with st.spinner("🔄 Loading IndicTrans2 AI models... This may take a few minutes on first run."):
	                logger.info("Loading English to Indic model...")
	                en_indic_tokenizer, en_indic_model = self._load_pair(
	                    self.EN_INDIC_MODEL_NAME, self.device
	                )

	                logger.info("Loading Indic to English model...")
	                indic_en_tokenizer, indic_en_model = self._load_pair(
	                    self.INDIC_EN_MODEL_NAME, self.device
	                )
	        except Exception as e:
	            # Boundary handler: report the failure in the UI instead of
	            # crashing the app; the traceback goes to the log.
	            logger.exception(f"❌ Error loading models: {e}")
	            st.error(f"Failed to load AI models: {e}")
	            return False

	        # Assign only after both directions loaded, so the instance is never
	        # left half-initialised.
	        self.en_indic_tokenizer = en_indic_tokenizer
	        self.en_indic_model = en_indic_model
	        self.indic_en_tokenizer = indic_en_tokenizer
	        self.indic_en_model = indic_en_model

	        logger.info("✅ Models loaded successfully!")
	        return True

	    def translate_text(self, text: str, source_lang: str, target_lang: str) -> Dict:
	        """Translate text between English and one supported Indic language.

	        Args:
	            text: Text to translate; must be a non-blank string.
	            source_lang: Key of FLORES_CODES for the input language.
	            target_lang: Key of FLORES_CODES for the output language.

	        Returns:
	            On success a dict with the keys "translated_text",
	            "source_language", "target_language", "confidence_score",
	            "processing_time" and "model_info". On any failure a dict with
	            the single key "error". This method does not raise.
	        """
	        try:
	            if not isinstance(text, str) or not text.strip():
	                return {"error": "No text provided for translation"}

	            logger.info(f"Translation request: '{text[:50]}...' from {source_lang} to {target_lang}")

	            # Validate language codes
	            if source_lang not in FLORES_CODES:
	                logger.error(f"Unsupported source language: {source_lang}")
	                return {"error": f"Unsupported source language: {source_lang}"}
	            if target_lang not in FLORES_CODES:
	                logger.error(f"Unsupported target language: {target_lang}")
	                return {"error": f"Unsupported target language: {target_lang}"}

	            # Exactly one side must be English. This rejects Indic -> Indic
	            # and same-language pairs (including en -> en) before any model
	            # is loaded.
	            if source_lang == target_lang or "en" not in (source_lang, target_lang):
	                return {"error": f"Translation not supported: {source_lang} → {target_lang}"}

	            if not self.load_models():
	                return {"error": "Failed to load translation models"}

	            start_time = time.time()

	            # Pick the checkpoint for the requested direction
	            if source_lang == "en":
	                model = self.en_indic_model
	                tokenizer = self.en_indic_tokenizer
	            else:
	                model = self.indic_en_model
	                tokenizer = self.indic_en_tokenizer
	            src_code = FLORES_CODES[source_lang]
	            tgt_code = FLORES_CODES[target_lang]

	            # Prefix the text with the source and target language tags.
	            # NOTE(review): the IndicTrans2 model card pre- and post-processes
	            # text with IndicProcessor (IndicTransToolkit); only the tags are
	            # added here. Confirm the output script for non-Devanagari targets.
	            input_text = f"{src_code} {tgt_code} {text}"

	            # Tokenize
	            inputs = tokenizer(
	                input_text,
	                return_tensors="pt",
	                padding=True,
	                truncation=True,
	                max_length=512,
	            ).to(self.device)

	            # Generate translation with beam search; no gradients needed
	            with torch.no_grad():
	                outputs = model.generate(
	                    **inputs,
	                    max_length=512,
	                    num_beams=4,
	                    length_penalty=0.6,
	                    early_stopping=True,
	                )

	            # Decode translation
	            translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

	            processing_time = time.time() - start_time

	            # Heuristic derived only from elapsed time and clamped to
	            # [0.75, 0.95]; it is not a model probability.
	            confidence = min(0.95, max(0.75, 1.0 - (processing_time / 10)))

	            return {
	                "translated_text": translation,
	                "source_language": source_lang,
	                "target_language": target_lang,
	                "confidence_score": confidence,
	                "processing_time": processing_time,
	                "model_info": "IndicTrans2-1B by AI4Bharat",
	            }

	        except Exception as e:
	            # Boundary handler: callers expect an error dict, never a raise.
	            logger.exception(f"Translation error: {e}")
	            return {"error": f"Translation failed: {str(e)}"}

	# Shared translation service, created once and reused by Streamlit's
	# resource cache
	@st.cache_resource
	def get_translation_service():
	    """Return the cached IndicTrans2Service instance, creating it on first use."""
	    service = IndicTrans2Service()
	    return service

	def main():
	    """Render the Streamlit UI: language settings, product form, results, export.

	    Translations are kept in st.session_state["translations"] and the text
	    that produced them in st.session_state["original_<field>"], so results
	    and the export survive Streamlit reruns.
	    """
	    # Local import: only the export path needs it.
	    import json

	    # Header
	    st.title("🌐 Multi-Lingual Product Catalog Translator")
	    st.markdown("### Powered by IndicTrans2 by AI4Bharat")

	    # Real AI banner
	    st.success("""
	🤖 Real AI Translation

	This version uses actual IndicTrans2 neural machine translation models (1B parameters)
	for state-of-the-art translation quality between English and Indian languages.

	✨ Features: Neural translation • 15+ languages • High accuracy • GPU acceleration
	""")

	    # Initialize translation service
	    translator = get_translation_service()

	    # Sidebar
	    with st.sidebar:
	        st.header("🎯 Translation Settings")

	        # Language selection
	        source_lang = st.selectbox(
	            "Source Language",
	            options=list(SUPPORTED_LANGUAGES.keys()),
	            format_func=lambda x: f"{SUPPORTED_LANGUAGES[x]} ({x})",
	            index=0  # Default to English
	        )

	        target_lang = st.selectbox(
	            "Target Language",
	            options=list(SUPPORTED_LANGUAGES.keys()),
	            format_func=lambda x: f"{SUPPORTED_LANGUAGES[x]} ({x})",
	            index=1  # Default to Hindi
	        )

	        st.info(f"🔄 Translating: {SUPPORTED_LANGUAGES[source_lang]} → {SUPPORTED_LANGUAGES[target_lang]}")

	        # Model info
	        st.header("🤖 AI Model Info")
	        st.markdown("""
	Model: IndicTrans2-1B
	Developer: AI4Bharat
	Parameters: 1 Billion
	Type: Neural Machine Translation
	Specialization: Indian Languages
	""")

	    # Main content
	    col1, col2 = st.columns(2)

	    with col1:
	        st.header("📝 Product Details")

	        # Product form
	        product_name = st.text_input(
	            "Product Name",
	            placeholder="e.g., Wireless Bluetooth Headphones"
	        )

	        product_description = st.text_area(
	            "Product Description",
	            placeholder="e.g., Premium quality headphones with noise cancellation...",
	            height=100
	        )

	        product_features = st.text_area(
	            "Key Features",
	            placeholder="e.g., Long battery life, comfortable fit, premium sound quality",
	            height=80
	        )

	        # Translation button
	        if st.button("🚀 Translate with AI", type="primary", use_container_width=True):
	            form_values = {
	                "name": product_name,
	                "description": product_description,
	                "features": product_features,
	            }
	            # Only the fields the user actually filled in are translated.
	            filled_fields = {key: value for key, value in form_values.items() if value}

	            if filled_fields:
	                with st.spinner("🤖 AI translation in progress..."):
	                    translations = {}
	                    for key, value in filled_fields.items():
	                        translations[key] = translator.translate_text(value, source_lang, target_lang)
	                        # Keep the source text so the export can pair it
	                        # with its translation.
	                        st.session_state[f"original_{key}"] = value

	                    # Store in session state
	                    st.session_state.translations = translations
	            else:
	                st.warning("⚠️ Please enter at least one product detail to translate.")

	    with col2:
	        st.header("🎯 AI Translation Results")

	        if "translations" in st.session_state and st.session_state.translations:
	            translations = st.session_state.translations

	            # Display translations
	            for field, result in translations.items():
	                if "error" not in result:
	                    st.markdown(f"{field.title()}:")
	                    st.success(result.get("translated_text", ""))

	                    # Show confidence and timing
	                    col_conf, col_time = st.columns(2)
	                    with col_conf:
	                        confidence = result.get("confidence_score", 0)
	                        st.metric("Confidence", f"{confidence:.1%}")
	                    with col_time:
	                        time_taken = result.get("processing_time", 0)
	                        st.metric("Time", f"{time_taken:.1f}s")
	                else:
	                    st.error(f"Translation error for {field}: {result['error']}")

	            # Export option
	            if st.button("📥 Export Translations", use_container_width=True):
	                export_data = {}
	                for field, result in translations.items():
	                    if "error" in result:
	                        continue
	                    export_data[f"{field}_original"] = st.session_state.get(f"original_{field}", "")
	                    export_data[f"{field}_translated"] = result.get("translated_text", "")

	                st.download_button(
	                    label="Download as JSON",
	                    # Real JSON, not a Python repr; ensure_ascii=False keeps
	                    # Indic-script text readable in the file.
	                    data=json.dumps(export_data, ensure_ascii=False, indent=2),
	                    file_name=f"translation_{source_lang}_{target_lang}.json",
	                    mime="application/json"
	                )
	        else:
	            st.info("👆 Enter product details and click translate to see AI-powered results")

	    # Statistics
	    st.header("📊 Translation Analytics")
	    col1, col2, col3, col4 = st.columns(4)

	    with col1:
	        st.metric("Languages Supported", "15+")
	    with col2:
	        st.metric("Model Parameters", "1B")
	    with col3:
	        st.metric("Translation Quality", "State-of-art")
	    with col4:
	        device_type = "GPU" if torch.cuda.is_available() else "CPU"
	        st.metric("Processing", device_type)

	    # Footer
	    st.markdown("---")
	    st.markdown("""
	<div style='text-align: center'>
	<p>🤖 Powered by <strong>IndicTrans2</strong> by <strong>AI4Bharat</strong></p>
	<p>🚀 Deployed on <strong>Hugging Face Spaces</strong> with real neural machine translation</p>
	</div>
	""", unsafe_allow_html=True)

	# Script entry point: build the UI when this file is executed directly.
	if __name__ == "__main__":
	    main()