"""Talk or Text Translator – Streamlit app for English/Hindi/Hinglish text translation."""

import streamlit as st
from transformers import MarianMTModel, MarianTokenizer
import speech_recognition as sr
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
import torch
import re
import unicodedata
import pandas as pd
import requests
import json
from pathlib import Path
import pickle
from datasets import load_dataset
import numpy as np

st.set_page_config(page_title="Talk or Text Translator", page_icon="🌍")

tab2 = st.tabs(["Text Translation"])[0]


@st.cache_resource
def load_translation_model(model_name):
    """Load and cache a MarianMT tokenizer/model pair from the Hugging Face Hub."""
    try:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        return tokenizer, model
    except Exception as e:
        st.error(f"Error loading model {model_name}: {e}")
        return None, None


@st.cache_data
def load_professional_datasets():
    """Metadata about the public transliteration/translation corpora used or referenced."""
    datasets_info = {
        "Dakshina": {
            "size": "1.7M pairs",
            "description": "Google Research - Roman to Native script for 12 South Asian languages",
            "huggingface": "vrclc/dakshina-lexicons-ml",
            "languages": ["Hindi", "Bengali", "Tamil", "Telugu", "Malayalam", "Gujarati",
                          "Punjabi", "Kannada", "Marathi", "Odia", "Assamese", "Urdu"],
        },
        "Aksharantar": {
            "size": "26M pairs",
            "description": "AI4Bharat - Largest Indic transliteration dataset (21x larger than existing)",
            "github": "AI4Bharat/IndicXlit",
            "languages": ["21 Indic languages", "3 language families", "12 scripts"],
        },
        "Samanantar": {
            "size": "49M pairs",
            "description": "Largest Indic-English parallel corpus",
            "url": "https://indicnlp.ai4bharat.org/samanantar/",
            "languages": ["11 Indic languages to English"],
        },
        "FIRE Hinglish": {
            "size": "Large corpus",
            "description": "Code-mixed Hinglish datasets from FIRE workshop",
            "domain": "Social media, informal text",
        },
    }
    return datasets_info


@st.cache_data
def download_dakshina_sample():
    """Download a 1,000-entry sample of the Dakshina lexicon and build a romanized->native mapping."""
    try:
        dataset = load_dataset("vrclc/dakshina-lexicons-ml", split="train[:1000]")  # sample 1000 entries
        df = pd.DataFrame(dataset)
        if 'romanized' in df.columns and 'native' in df.columns:
            hindi_pairs = df[df['language'] == 'hi'] if 'language' in df.columns else df
            mapping_dict = dict(zip(hindi_pairs['romanized'].str.lower(), hindi_pairs['native']))
            return mapping_dict, len(mapping_dict)
        return {}, 0
    except Exception as e:
        st.warning(f"Could not download Dakshina: {e}")
        return {}, 0


@st.cache_data
def load_enhanced_hinglish_dataset():
    """Merge the Dakshina sample, a local CSV (if present) and curated patterns into one mapping."""
    hinglish_dict = {}
    sources_loaded = []
    try:
        dakshina_dict, dakshina_count = download_dakshina_sample()
        if dakshina_count > 0:
            hinglish_dict.update(dakshina_dict)
            sources_loaded.append(f"Dakshina ({dakshina_count} pairs)")

        enhanced_csv = Path("enhanced_hinglish_mapping.csv")
        if enhanced_csv.exists():
            df = pd.read_csv(enhanced_csv)
            local_dict = dict(zip(df['hinglish'].str.lower(), df['hindi']))
            hinglish_dict.update(local_dict)
            sources_loaded.append(f"Local enhanced ({len(local_dict)} pairs)")

        research_patterns = get_research_based_patterns()
        hinglish_dict.update(research_patterns)
        sources_loaded.append(f"Research patterns ({len(research_patterns)} pairs)")

        return hinglish_dict, sources_loaded
    except Exception as e:
        st.error(f"Error loading enhanced datasets: {e}")
        return get_basic_mappings(), ["Basic fallback"]


def get_research_based_patterns():
    """Curated Hinglish-to-Devanagari word mappings for common code-mixed vocabulary."""
    return {
        'kya': 'क्या', 'hai': 'है', 'hain': 'हैं', 'kar': 'कर', 'karo': 'करो',
        'ja': 'जा', 'jao': 'जाओ', 'aa': 'आ', 'aao': 'आओ', 'de': 'दे', 'le': 'ले',
        'yaar': 'यार', 'dost': 'दोस्त', 'bhai': 'भाई', 'behen': 'बहन',
        'ghar': 'घर', 'paani': 'पानी', 'khana': 'खाना',
        'time': 'टाइम', 'phone': 'फोन', 'call': 'कॉल', 'message': 'मैसेज',
        'photo': 'फोटो', 'video': 'वीडियो', 'music': 'म्यूजिक', 'movie': 'मूवी',
        'book': 'बुक', 'school': 'स्कूल', 'college': 'कॉलेज', 'office': 'ऑफिस',
        'work': 'वर्क', 'maal': 'माल', 'scene': 'सीन', 'tension': 'टेंशन',
        'problem': 'प्रॉब्लम', 'solution': 'सोल्यूशन', 'idea': 'आइडिया',
        'plan': 'प्लान', 'party': 'पार्टी',
        'achha': 'अच्छा', 'bura': 'बुरा', 'naya': 'नया', 'purana': 'पुराना',
        'bada': 'बड़ा', 'chota': 'छोटा', 'thoda': 'थोड़ा', 'jyada': 'ज्यादा',
        'sab': 'सब', 'kuch': 'कुछ', 'koi': 'कोई', 'yahan': 'यहाँ', 'wahan': 'वहाँ',
        'kal': 'कल', 'aaj': 'आज', 'abhi': 'अभी', 'baad': 'बाद', 'pehle': 'पहले',
    }


def get_basic_mappings():
    """Fallback mapping used when the enhanced datasets cannot be loaded."""
    return get_research_based_patterns()


def get_model(input_lang, output_lang):
    """Return the Helsinki-NLP OPUS-MT checkpoint name for a supported language pair."""
    models = {
        ("Hindi", "English"): "Helsinki-NLP/opus-mt-hi-en",
        ("English", "Hindi"): "Helsinki-NLP/opus-mt-en-hi",
    }
    return models.get((input_lang, output_lang))


def translate_text(text, input_lang, output_lang):
    """Translate text with the appropriate MarianMT model, with pre- and post-processing."""
    if not text or not text.strip():
        return "No text to translate"

    model_name = get_model(input_lang, output_lang)
    if not model_name:
        return "Translation pair not supported"

    tokenizer, model = load_translation_model(model_name)
    if tokenizer is None or model is None:
        return "Failed to load translation model"

    try:
        text = preprocess_text(text.strip(), input_lang)
        inputs = tokenizer([text], return_tensors="pt", padding=True,
                           truncation=True, max_length=512)
        with torch.no_grad():
            translated_tokens = model.generate(
                **inputs,
                max_length=512,
                num_beams=6,
                length_penalty=0.8,
                early_stopping=True,
                do_sample=False,
            )
        translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
        return postprocess_text(translated_text, output_lang)
    except Exception as e:
        return f"Translation error: {str(e)}"


def preprocess_text(text, lang):
    """Normalize Unicode, drop control characters and, for Hindi, non-Devanagari characters."""
    text = unicodedata.normalize('NFC', text)
    text = ''.join(ch for ch in text if unicodedata.category(ch)[0] != 'C')
    if lang == "Hindi":
        text = re.sub(r'[^\u0900-\u097F\s]', '', text)
    return text.strip()


def postprocess_text(text, lang):
    """Collapse whitespace and strip stray nukta signs from Hindi output."""
    text = re.sub(r'\s+', ' ', text).strip()
    if lang == "Hindi":
        text = re.sub(r'\u093C', '', text)
    return text


def professional_hinglish_to_hindi(text, hinglish_dict):
    """Convert romanized Hinglish to Devanagari word by word.

    Dictionary hits count as confidence 1.0; otherwise fall back to rule-based
    transliteration (ITRANS, Harvard-Kyoto, IAST) with confidence 0.7.
    """
    words = text.lower().split()
    converted_words = []
    confidence_scores = []

    for word in words:
        clean_word = re.sub(r'[^\w]', '', word)
        confidence = 0.0
        if clean_word in hinglish_dict:
            converted_words.append(hinglish_dict[clean_word])
            confidence = 1.0
        else:
            schemes = [sanscript.ITRANS, sanscript.HK, sanscript.IAST]
            best_result = word
            for scheme in schemes:
                try:
                    result = transliterate(clean_word, scheme, sanscript.DEVANAGARI)
                    if result != clean_word:
                        best_result = result
                        confidence = 0.7
                        break
                except Exception:
                    continue
            converted_words.append(best_result)
        confidence_scores.append(confidence)

    avg_confidence = np.mean(confidence_scores) if confidence_scores else 0.0
    return ' '.join(converted_words), avg_confidence


datasets_info = load_professional_datasets()
hinglish_dict, sources_loaded = load_enhanced_hinglish_dataset()

with tab2:
    st.subheader("📝 Text Translation")
    option = st.radio("Translation Type:",
                      ["English ➝ Hindi", "Hindi ➝ English", "Hinglish ➝ English"])
    input_text = st.text_area("Enter text:", height=150, max_chars=2000)
    if input_text:
        st.caption(f"Characters: {len(input_text)}/2000")

    if st.button("🔄 Translate", type="primary"):
        if input_text.strip():
            with st.spinner("🌐 Processing with models..."):
                try:
                    if option == "English ➝ Hindi":
                        result = translate_text(input_text, "English", "Hindi")
                    elif option == "Hindi ➝ English":
                        result = translate_text(input_text, "Hindi", "English")
                    elif option == "Hinglish ➝ English":
                        hindi_text, confidence = professional_hinglish_to_hindi(input_text, hinglish_dict)
                        st.info(f"🔤 **Converted to Hindi:** {hindi_text}")
                        st.caption(f"Confidence: {confidence:.2%}")
                        result = translate_text(hindi_text, "Hindi", "English")

                    if result and not result.startswith(("Translation error:", "Failed")):
                        st.success("🌐 **Translation:**")
                        st.code(result, language=None)
                    else:
                        st.error(f"❌ {result}")
                except Exception as e:
                    st.error(f"❌ Error: {str(e)}")