File size: 14,147 Bytes
67f25fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
# Real AI-Powered Multi-Lingual Product Catalog Translator
# Hugging Face Spaces Deployment with IndicTrans2

import json
import logging
import os
import sys
import time
import warnings
from typing import Dict, List, Optional

import streamlit as st
import torch

# Suppress warnings
# Only UserWarning and FutureWarning are silenced; other categories still surface.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Configure logging
# Root logger at INFO; `logger` is the module-level logger used throughout this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set environment variable for model type
# setdefault() keeps any value already present in the environment.
# NOTE(review): nothing in this file reads MODEL_TYPE or DEVICE back; presumably
# they are consumed by other code or tooling - confirm before removing.
os.environ.setdefault("MODEL_TYPE", "indictrans2")
os.environ.setdefault("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")

# Optional dependency: record whether `transformers` imported so the rest of the
# module can check TRANSFORMERS_AVAILABLE instead of failing at import time.
try:
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    # NOTE(review): the message mentions a "mock mode", but no mock translator is
    # defined in this file; when the flag is False, load_models() just returns False.
    logger.warning("Transformers not available, falling back to mock mode")

# Streamlit page config
# This is the first Streamlit call in the module; keep it ahead of any other
# `st.*` call when editing this section.
st.set_page_config(
    page_title="Multi-Lingual Catalog Translator - Real AI",
    page_icon="🌐",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Language mappings for IndicTrans2: short language code -> display name.
# Insertion order matters: the UI select boxes pick their defaults by position
# (index 0 = English, index 1 = Hindi), so keep "en" and "hi" first.
SUPPORTED_LANGUAGES = {
    # Default source / target pair
    "en": "English",
    "hi": "Hindi",
    # Remaining languages, in the order they appear in the pickers
    "bn": "Bengali",
    "gu": "Gujarati",
    "kn": "Kannada",
    "ml": "Malayalam",
    "mr": "Marathi",
    "or": "Odia",
    "pa": "Punjabi",
    "ta": "Tamil",
    "te": "Telugu",
    "ur": "Urdu",
    "as": "Assamese",
    "ne": "Nepali",
    "sa": "Sanskrit",
}

# Flores language codes for IndicTrans2: short language code -> "<lang>_<Script>" tag.
# The key set mirrors SUPPORTED_LANGUAGES; the translator validates requests
# against this table and prepends these tags to the model input.
FLORES_CODES = {
    # Latin script
    "en": "eng_Latn",
    # Devanagari script
    "hi": "hin_Deva",
    # Bengali script
    "bn": "ben_Beng",
    # Scripts used by a single language in this table
    "gu": "guj_Gujr",
    "kn": "kan_Knda",
    "ml": "mal_Mlym",
    # Devanagari script
    "mr": "mar_Deva",
    "or": "ory_Orya",
    "pa": "pan_Guru",
    "ta": "tam_Taml",
    "te": "tel_Telu",
    "ur": "urd_Arab",
    # Bengali script
    "as": "asm_Beng",
    # Devanagari script
    "ne": "npi_Deva",
    "sa": "san_Deva",
}

class IndicTrans2Service:
    """Real IndicTrans2 translation service for Hugging Face Spaces.

    Wraps the two AI4Bharat IndicTrans2 1B checkpoints (English -> Indic and
    Indic -> English) behind a single ``translate_text`` call. Only language
    pairs with English on exactly one side are supported; Indic -> Indic and
    same-language requests are rejected with an error result.

    The tokenizer/model attributes stay ``None`` until ``load_models`` has
    succeeded for this instance.
    """

    # Hugging Face Hub checkpoints, one per translation direction.
    EN_INDIC_MODEL_NAME = "ai4bharat/indictrans2-en-indic-1B"
    INDIC_EN_MODEL_NAME = "ai4bharat/indictrans2-indic-en-1B"

    def __init__(self):
        self.en_indic_model = None
        self.indic_en_model = None
        self.en_indic_tokenizer = None
        self.indic_en_tokenizer = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

    @staticmethod
    @st.cache_resource(show_spinner=False)
    def _load_model_pair(model_name: str, device: str):
        """Load and cache the ``(tokenizer, model)`` pair for one checkpoint.

        The cache holds the loaded objects themselves, keyed on the checkpoint
        name and device, so every service instance gets the same objects back.
        Errors propagate to the caller, so a failed load is retried on the
        next call rather than being remembered.

        Args:
            model_name: Hugging Face Hub repository id of the checkpoint.
            device: ``"cuda"`` or ``"cpu"``; selects float16 vs float32 weights.

        Returns:
            A ``(tokenizer, model)`` tuple with the model moved to ``device``
            and switched to eval mode.
        """
        # NOTE: trust_remote_code=True executes Python shipped in the Hub
        # repository; only use it with repositories you trust.
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
        )
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        )
        model.to(device)
        model.eval()
        return tokenizer, model

    def _models_loaded(self) -> bool:
        """Return True when both tokenizers and both models are set on this instance."""
        return None not in (
            self.en_indic_tokenizer,
            self.en_indic_model,
            self.indic_en_tokenizer,
            self.indic_en_model,
        )

    def load_models(self) -> bool:
        """Load both IndicTrans2 models onto this instance (idempotent).

        The previous implementation wrapped this method in
        ``st.cache_resource``, which cached only the boolean result: a
        transient failure was remembered as ``False`` for the life of the
        process, and a cache hit on a different instance returned ``True``
        while that instance's model attributes were still ``None``. The cache
        now lives on ``_load_model_pair`` and this method always assigns the
        attributes it reports on.

        Returns:
            True if all models and tokenizers are available, False if the
            ``transformers`` import failed or loading raised. Loading errors
            are logged and shown via ``st.error``.
        """
        if not TRANSFORMERS_AVAILABLE:
            logger.error("Transformers library not available")
            return False

        # Fast path: skip the spinner entirely once this instance is ready.
        if self._models_loaded():
            return True

        try:
            with st.spinner("🔄 Loading IndicTrans2 AI models... This may take a few minutes on first run."):
                # Load English to Indic model
                logger.info("Loading English to Indic model...")
                self.en_indic_tokenizer, self.en_indic_model = self._load_model_pair(
                    self.EN_INDIC_MODEL_NAME, self.device
                )

                # Load Indic to English model
                logger.info("Loading Indic to English model...")
                self.indic_en_tokenizer, self.indic_en_model = self._load_model_pair(
                    self.INDIC_EN_MODEL_NAME, self.device
                )

            logger.info("✅ Models loaded successfully!")
            return True

        except Exception as e:
            logger.exception(f"❌ Error loading models: {e}")
            st.error(f"Failed to load AI models: {e}")
            return False

    def translate_text(self, text: str, source_lang: str, target_lang: str) -> Dict:
        """Translate ``text`` between English and one Indic language.

        Args:
            text: Text to translate. Empty or whitespace-only text is rejected.
            source_lang: Short source language code; must be a key of FLORES_CODES.
            target_lang: Short target language code; must be a key of FLORES_CODES.

        Returns:
            On success, a dict with ``translated_text``, ``source_language``,
            ``target_language``, ``confidence_score``, ``processing_time``
            (seconds) and ``model_info``. On any failure, a dict with a single
            ``error`` key. This method does not raise; exceptions are logged
            and converted into an error result.
        """
        try:
            if not text or not text.strip():
                return {"error": "No text provided for translation"}

            logger.info(f"Translation request: '{text[:50]}...' from {source_lang} to {target_lang}")

            # Validate language codes
            if source_lang not in FLORES_CODES:
                logger.error(f"Unsupported source language: {source_lang}")
                return {"error": f"Unsupported source language: {source_lang}"}
            if target_lang not in FLORES_CODES:
                logger.error(f"Unsupported target language: {target_lang}")
                return {"error": f"Unsupported target language: {target_lang}"}

            # Exactly one side must be English. Checked before loading so an
            # unsupported pair (including en -> en) never triggers a model load.
            if source_lang == target_lang or "en" not in (source_lang, target_lang):
                return {"error": f"Translation not supported: {source_lang} → {target_lang}"}

            if not self.load_models():
                return {"error": "Failed to load translation models"}

            start_time = time.time()

            # Determine translation direction
            if source_lang == "en":
                # English to Indic
                model = self.en_indic_model
                tokenizer = self.en_indic_tokenizer
            else:
                # Indic to English
                model = self.indic_en_model
                tokenizer = self.indic_en_tokenizer
            src_code = FLORES_CODES[source_lang]
            tgt_code = FLORES_CODES[target_lang]

            # Prepare input text: source tag, target tag, then the sentence.
            # NOTE(review): the upstream IndicTrans2 examples run text through
            # an IndicProcessor pre/post-processing step around the model;
            # confirm whether raw tagged input/output is sufficient for every
            # target script.
            input_text = f"{src_code} {tgt_code} {text}"

            # Tokenize (inputs longer than 512 tokens are truncated)
            inputs = tokenizer(
                input_text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(self.device)

            # Generate translation
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_length=512,
                    num_beams=4,
                    length_penalty=0.6,
                    early_stopping=True
                )

            # Decode translation
            translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Calculate processing time
            processing_time = time.time() - start_time

            # Heuristic display value only: derived from elapsed time and
            # clamped to [0.75, 0.95]; it is not a probability from the model.
            confidence = min(0.95, max(0.75, 1.0 - (processing_time / 10)))

            return {
                "translated_text": translation,
                "source_language": source_lang,
                "target_language": target_lang,
                "confidence_score": confidence,
                "processing_time": processing_time,
                "model_info": "IndicTrans2-1B by AI4Bharat"
            }

        except Exception as e:
            logger.exception(f"Translation error: {e}")
            return {"error": f"Translation failed: {str(e)}"}

# Initialize translation service
@st.cache_resource
def get_translation_service():
    """Build the IndicTrans2Service used by the app.

    Decorated with ``st.cache_resource`` so the instance is created once and
    handed back on later calls instead of being rebuilt.
    """
    service = IndicTrans2Service()
    return service

def main():
    """Main Streamlit application with real AI translation.

    Renders, in order: the page header and banner, the sidebar (language
    selection and model info), a two-column body with the product form on the
    left and translation results plus JSON export on the right, a row of
    summary metrics, and the footer.

    Session state written by this function:
        translations: dict mapping field name ("name", "description",
            "features") to the result dict returned by ``translate_text``.
        original_<field>: the source text that was submitted for each
            translated field, so the export can pair originals with
            translations.
    """
    # Header
    st.title("🌐 Multi-Lingual Product Catalog Translator")
    st.markdown("### Powered by IndicTrans2 by AI4Bharat")

    # Real AI banner
    st.success("""
    🤖 **Real AI Translation**

    This version uses actual IndicTrans2 neural machine translation models (1B parameters)
    for state-of-the-art translation quality between English and Indian languages.

    ✨ Features: Neural translation • 15+ languages • High accuracy • GPU acceleration
    """)

    # Initialize translation service
    translator = get_translation_service()

    # Sidebar
    with st.sidebar:
        st.header("🎯 Translation Settings")

        language_codes = list(SUPPORTED_LANGUAGES.keys())

        # Language selection
        source_lang = st.selectbox(
            "Source Language",
            options=language_codes,
            format_func=lambda x: f"{SUPPORTED_LANGUAGES[x]} ({x})",
            index=0  # Default to English
        )

        target_lang = st.selectbox(
            "Target Language",
            options=language_codes,
            format_func=lambda x: f"{SUPPORTED_LANGUAGES[x]} ({x})",
            index=1  # Default to Hindi
        )

        st.info(f"🔄 Translating: {SUPPORTED_LANGUAGES[source_lang]} → {SUPPORTED_LANGUAGES[target_lang]}")

        # Model info
        st.header("🤖 AI Model Info")
        # Markdown hard line breaks ("two spaces + newline") are spelled out
        # explicitly so they survive editors that strip trailing whitespace.
        st.markdown("  \n".join([
            "**Model**: IndicTrans2-1B",
            "**Developer**: AI4Bharat",
            "**Parameters**: 1 Billion",
            "**Type**: Neural Machine Translation",
            "**Specialization**: Indian Languages",
        ]))

    # Main content
    form_col, results_col = st.columns(2)

    with form_col:
        st.header("📝 Product Details")

        # Product form
        product_name = st.text_input(
            "Product Name",
            placeholder="e.g., Wireless Bluetooth Headphones"
        )

        product_description = st.text_area(
            "Product Description",
            placeholder="e.g., Premium quality headphones with noise cancellation...",
            height=100
        )

        product_features = st.text_area(
            "Key Features",
            placeholder="e.g., Long battery life, comfortable fit, premium sound quality",
            height=80
        )

        # Translation button
        if st.button("🚀 Translate with AI", type="primary", use_container_width=True):
            # Keep only fields that contain something other than whitespace.
            submitted = {
                field: value
                for field, value in (
                    ("name", product_name),
                    ("description", product_description),
                    ("features", product_features),
                )
                if value and value.strip()
            }

            if submitted:
                with st.spinner("🤖 AI translation in progress..."):
                    translations = {}

                    # Translate each field
                    for field, value in submitted.items():
                        translations[field] = translator.translate_text(value, source_lang, target_lang)
                        # Remember the source text; the export reads these keys.
                        st.session_state[f"original_{field}"] = value

                    # Store in session state
                    st.session_state.translations = translations
            else:
                st.warning("⚠️ Please enter at least one product detail to translate.")

    with results_col:
        st.header("🎯 AI Translation Results")

        translations = st.session_state.get("translations")
        if translations:
            # Display translations
            for field, result in translations.items():
                if "error" in result:
                    st.error(f"Translation error for {field}: {result['error']}")
                    continue

                st.markdown(f"**{field.title()}:**")
                st.success(result.get("translated_text", ""))

                # Show confidence and timing
                col_conf, col_time = st.columns(2)
                with col_conf:
                    confidence = result.get("confidence_score", 0)
                    st.metric("Confidence", f"{confidence:.1%}")
                with col_time:
                    time_taken = result.get("processing_time", 0)
                    st.metric("Time", f"{time_taken:.1f}s")

            # Export option
            if st.button("📥 Export Translations", use_container_width=True):
                export_data = {}
                for field, result in translations.items():
                    if "error" in result:
                        continue
                    export_data[f"{field}_original"] = st.session_state.get(f"original_{field}", "")
                    export_data[f"{field}_translated"] = result.get("translated_text", "")

                st.download_button(
                    label="Download as JSON",
                    # json.dumps (not str()) so the file is valid JSON;
                    # ensure_ascii=False keeps Indic text readable in the file.
                    data=json.dumps(export_data, ensure_ascii=False, indent=2),
                    file_name=f"translation_{source_lang}_{target_lang}.json",
                    mime="application/json"
                )
        else:
            st.info("👆 Enter product details and click translate to see AI-powered results")

    # Statistics
    st.header("📊 Translation Analytics")
    stat_langs, stat_params, stat_quality, stat_device = st.columns(4)

    with stat_langs:
        st.metric("Languages Supported", "15+")
    with stat_params:
        st.metric("Model Parameters", "1B")
    with stat_quality:
        st.metric("Translation Quality", "State-of-art")
    with stat_device:
        device_type = "GPU" if torch.cuda.is_available() else "CPU"
        st.metric("Processing", device_type)

    # Footer
    st.markdown("---")
    st.markdown("""
    <div style='text-align: center'>
        <p>🤖 Powered by <strong>IndicTrans2</strong> by <strong>AI4Bharat</strong></p>
        <p>🚀 Deployed on <strong>Hugging Face Spaces</strong> with real neural machine translation</p>
    </div>
    """, unsafe_allow_html=True)

# Script entry point: render the app only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()