Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from transformers import MarianMTModel, MarianTokenizer | |
| import speech_recognition as sr | |
| from indic_transliteration import sanscript | |
| from indic_transliteration.sanscript import transliterate | |
| import torch | |
| import re | |
| import pandas as pd | |
| import requests | |
| import json | |
| from pathlib import Path | |
| import pickle | |
| from datasets import load_dataset | |
| import numpy as np | |
# Set the page title and icon for this app.
st.set_page_config(page_icon="🌍", page_title="Talk or Text Translator")

# Only one tab is created; unpack it so the UI section can use it directly.
(tab2,) = st.tabs(["Text Translation"])
@st.cache_resource
def _cached_translation_model(model_name):
    """Load the Marian tokenizer/model pair for ``model_name`` once and reuse it.

    Exceptions are deliberately allowed to propagate so that only a
    successful load is returned (and therefore cached); the public wrapper
    turns failures into the ``(None, None)`` result callers expect.
    """
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model


def load_translation_model(model_name):
    """Return ``(tokenizer, model)`` for a MarianMT checkpoint.

    The pair is fetched through a cached loader so repeated translations do
    not reload the weights on every call.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        ``(tokenizer, model)`` on success. On any loading error an error
        message is shown with ``st.error`` and ``(None, None)`` is returned.
    """
    try:
        return _cached_translation_model(model_name)
    except Exception as e:
        st.error(f"Error loading model {model_name}: {e}")
        return None, None
def load_professional_datasets():
    """Return descriptive metadata about the reference datasets.

    Returns:
        A new dict keyed by dataset name. Each value is a dict of
        free-form descriptive fields (size, description, source, languages
        or domain); nothing is downloaded here.
    """
    catalog = {}

    catalog["Dakshina"] = {
        "size": "1.7M pairs",
        "description": "Google Research - Roman to Native script for 12 South Asian languages",
        "huggingface": "vrclc/dakshina-lexicons-ml",
        "languages": [
            "Hindi", "Bengali", "Tamil", "Telugu", "Malayalam", "Gujarati",
            "Punjabi", "Kannada", "Marathi", "Odia", "Assamese", "Urdu",
        ],
    }

    catalog["Aksharantar"] = {
        "size": "26M pairs",
        "description": "AI4Bharat - Largest Indic transliteration dataset (21x larger than existing)",
        "github": "AI4Bharat/IndicXlit",
        "languages": ["21 Indic languages", "3 language families", "12 scripts"],
    }

    catalog["Samanantar"] = {
        "size": "49M pairs",
        "description": "Largest Indic-English parallel corpus",
        "url": "https://indicnlp.ai4bharat.org/samanantar/",
        "languages": ["11 Indic languages to English"],
    }

    catalog["FIRE Hinglish"] = {
        "size": "Large corpus",
        "description": "Code-mixed Hinglish datasets from FIRE workshop",
        "domain": "Social media, informal text",
    }

    return catalog
def download_dakshina_sample():
    """Fetch a small sample of the Dakshina lexicon as a lookup table.

    Loads the first 1000 training rows of ``vrclc/dakshina-lexicons-ml`` and,
    when the frame has ``romanized`` and ``native`` columns, maps the
    lower-cased romanized form to the native form. If a ``language`` column
    exists only rows tagged ``'hi'`` are used; otherwise every row is used.

    Returns:
        ``(mapping, count)``. ``({}, 0)`` when the expected columns are
        missing or when anything fails (a warning is shown in that case).
    """
    try:
        # Sample 1000 entries
        sample = load_dataset("vrclc/dakshina-lexicons-ml", split="train[:1000]")
        frame = pd.DataFrame(sample)
        columns = frame.columns

        # Without both columns there is nothing to build a mapping from.
        if 'romanized' not in columns or 'native' not in columns:
            return {}, 0

        if 'language' in columns:
            rows = frame[frame['language'] == 'hi']
        else:
            rows = frame

        roman_keys = rows['romanized'].str.lower()
        pairs = dict(zip(roman_keys, rows['native']))
        return pairs, len(pairs)
    except Exception as e:
        st.warning(f"Could not download Dakshina: {e}")
        return {}, 0
def load_enhanced_hinglish_dataset():
    """Assemble the Hinglish-to-Hindi dictionary from every available source.

    Sources are merged in this order, later ones overriding earlier keys:
    the Dakshina sample, the optional local ``enhanced_hinglish_mapping.csv``
    (columns ``hinglish`` and ``hindi``), then the built-in research patterns.

    Returns:
        ``(mapping, sources)`` where ``sources`` is a list of human-readable
        labels for what was loaded. If anything raises, an error is shown and
        ``(get_basic_mappings(), ["Basic fallback"])`` is returned instead.
    """
    merged = {}
    loaded_from = []
    try:
        sample_pairs, sample_size = download_dakshina_sample()
        if sample_size > 0:
            merged.update(sample_pairs)
            loaded_from.append(f"Dakshina ({sample_size} pairs)")

        csv_path = Path("enhanced_hinglish_mapping.csv")
        if csv_path.exists():
            frame = pd.read_csv(csv_path)
            csv_pairs = dict(zip(frame['hinglish'].str.lower(), frame['hindi']))
            merged.update(csv_pairs)
            loaded_from.append(f"Local enhanced ({len(csv_pairs)} pairs)")

        # Applied last so the built-in entries win over downloaded/local ones.
        curated = get_research_based_patterns()
        merged.update(curated)
        loaded_from.append(f"Research patterns ({len(curated)} pairs)")
    except Exception as e:
        st.error(f"Error loading enhanced datasets: {e}")
        return get_basic_mappings(), ["Basic fallback"]
    else:
        return merged, loaded_from
def get_research_based_patterns():
    """Return the built-in romanized-Hindi to Devanagari word table.

    Returns:
        A freshly built dict mapping lower-case romanized words to their
        Devanagari spelling. A new dict is created on every call, so callers
        may mutate the result freely.
    """
    word_pairs = (
        ('kya', 'क्या'), ('hai', 'है'), ('hain', 'हैं'), ('kar', 'कर'), ('karo', 'करो'),
        ('ja', 'जा'), ('jao', 'जाओ'), ('aa', 'आ'), ('aao', 'आओ'), ('de', 'दे'), ('le', 'ले'),
        ('yaar', 'यार'), ('dost', 'दोस्त'), ('bhai', 'भाई'), ('behen', 'बहन'),
        ('ghar', 'घर'), ('paani', 'पानी'), ('khana', 'खाना'), ('time', 'टाइम'),
        ('phone', 'फोन'), ('call', 'कॉल'), ('message', 'मैसेज'), ('photo', 'फोटो'),
        ('video', 'वीडियो'), ('music', 'म्यूजिक'), ('movie', 'मूवी'), ('book', 'बुक'),
        ('school', 'स्कूल'), ('college', 'कॉलेज'), ('office', 'ऑफिस'), ('work', 'वर्क'),
        ('maal', 'माल'), ('scene', 'सीन'), ('tension', 'टेंशन'), ('problem', 'प्रॉब्लम'),
        ('solution', 'सोल्यूशन'), ('idea', 'आइडिया'), ('plan', 'प्लान'), ('party', 'पार्टी'),
        ('achha', 'अच्छा'), ('bura', 'बुरा'), ('naya', 'नया'), ('purana', 'पुराना'),
        ('bada', 'बड़ा'), ('chota', 'छोटा'), ('thoda', 'थोड़ा'), ('jyada', 'ज्यादा'),
        ('sab', 'सब'), ('kuch', 'कुछ'), ('koi', 'कोई'), ('yahan', 'यहाँ'), ('wahan', 'वहाँ'),
        ('kal', 'कल'), ('aaj', 'आज'), ('abhi', 'अभी'), ('baad', 'बाद'), ('pehle', 'पहले'),
    )
    return dict(word_pairs)
def get_basic_mappings():
    """Return the fallback Hinglish-to-Hindi mapping.

    This is exactly the table produced by ``get_research_based_patterns``.
    """
    fallback = get_research_based_patterns()
    return fallback
def get_model(input_lang, output_lang):
    """Look up the translation checkpoint for a language direction.

    Args:
        input_lang: Source language name, e.g. ``"Hindi"``.
        output_lang: Target language name, e.g. ``"English"``.

    Returns:
        The checkpoint name for the pair, or ``None`` if the pair is not
        in the table.
    """
    supported = dict([
        (("Hindi", "English"), "Helsinki-NLP/opus-mt-hi-en"),
        (("English", "Hindi"), "Helsinki-NLP/opus-mt-en-hi"),
    ])
    direction = (input_lang, output_lang)
    if direction in supported:
        return supported[direction]
    return None
def translate_text(text, input_lang, output_lang):
    """Translate ``text`` between the given languages with a MarianMT model.

    Args:
        text: Text to translate.
        input_lang: Source language name understood by ``get_model``.
        output_lang: Target language name understood by ``get_model``.

    Returns:
        The translated string on success. Failures are reported as strings
        rather than exceptions:
        ``"No text to translate"`` for blank input,
        ``"Translation pair not supported"`` for an unknown direction,
        ``"Failed to load translation model"`` when loading fails, and a
        message starting with ``"Translation error:"`` for anything that
        goes wrong during preprocessing, generation or decoding.
    """
    if not text or not text.strip():
        return "No text to translate"

    model_name = get_model(input_lang, output_lang)
    if not model_name:
        return "Translation pair not supported"

    tokenizer, model = load_translation_model(model_name)
    if tokenizer is None or model is None:
        return "Failed to load translation model"

    try:
        text = preprocess_text(text.strip(), input_lang)
        # Preprocessing can remove every character (for example Latin-only
        # text on the Hindi path). Feeding an empty string to the model only
        # produces meaningless output, so report it instead.
        if not text:
            return "Translation error: no translatable text left after preprocessing"

        # Inputs longer than 512 tokens are truncated by the tokenizer.
        inputs = tokenizer(
            [text],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        with torch.no_grad():
            translated_tokens = model.generate(
                **inputs,
                max_length=512,
                num_beams=6,
                length_penalty=0.8,
                early_stopping=True,
                do_sample=False,
            )
        translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
        return postprocess_text(translated_text, output_lang)
    except Exception as e:
        return f"Translation error: {str(e)}"
def preprocess_text(text, lang):
    """Normalize raw user text before it is handed to the tokenizer.

    Steps, in order:
      1. Unicode NFC normalization.
      2. Every whitespace character (including newlines and tabs) becomes a
         plain space; all other characters in a Unicode "C" category
         (control, format, ...) are dropped.
      3. For ``lang == "Hindi"``, anything that is neither in the Devanagari
         block (U+0900-U+097F) nor whitespace is removed.
      4. Leading/trailing whitespace is stripped.

    Args:
        text: Input string.
        lang: Language name; only ``"Hindi"`` triggers the script filter.

    Returns:
        The cleaned string (possibly empty).
    """
    import unicodedata

    text = unicodedata.normalize('NFC', text)
    # Newlines and tabs are category "Cc". Deleting them outright would glue
    # the words on either side together, so whitespace is mapped to a space
    # before the remaining control/format characters are filtered out.
    text = ''.join(
        ' ' if ch.isspace() else ch
        for ch in text
        if ch.isspace() or unicodedata.category(ch)[0] != 'C'
    )
    if lang == "Hindi":
        text = re.sub(r'[^\u0900-\u097F\s]', '', text)
    return text.strip()
def postprocess_text(text, lang):
    """Tidy model output before it is shown.

    Runs of whitespace are collapsed to a single space and the ends are
    trimmed. For ``lang == "Hindi"`` every nukta sign (U+093C) is removed
    as well.

    Args:
        text: Decoded model output.
        lang: Language name of the output.

    Returns:
        The cleaned string.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    cleaned = collapsed.strip()
    if lang != "Hindi":
        return cleaned
    return re.sub(r'(\u093C)', '', cleaned)
def _transliterate_fallback(clean_word, original_word):
    """Transliterate a word that is missing from the dictionary.

    Tries the ITRANS, HK and IAST schemes in that order and accepts the first
    output that differs from ``clean_word``.

    Returns:
        ``(converted, confidence)``: the transliteration with confidence 0.7,
        or ``(original_word, 0.0)`` when no scheme changes the word.
    """
    # A token made only of punctuation cleans down to an empty string; there
    # is nothing to transliterate, so keep the token as typed.
    if not clean_word:
        return original_word, 0.0

    for scheme in (sanscript.ITRANS, sanscript.HK, sanscript.IAST):
        try:
            candidate = transliterate(clean_word, scheme, sanscript.DEVANAGARI)
        except Exception:
            # Best effort: a scheme that fails on this word is skipped. Only
            # ordinary errors are swallowed, so KeyboardInterrupt/SystemExit
            # still propagate.
            continue
        if candidate != clean_word:
            return candidate, 0.7
    return original_word, 0.0


def professional_hinglish_to_hindi(text, hinglish_dict):
    """Convert romanized Hindi (Hinglish) text to Devanagari, word by word.

    Each whitespace-separated word is lower-cased and stripped of non-word
    characters, then looked up in ``hinglish_dict``. Words not found fall
    back to rule-based transliteration; if that changes nothing the original
    (lower-cased) word is kept.

    Args:
        text: Hinglish input text.
        hinglish_dict: Mapping of lower-case romanized words to Devanagari.

    Returns:
        ``(converted_text, avg_confidence)``. Per-word confidence is 1.0 for
        a dictionary hit, 0.7 for a transliteration and 0.0 otherwise; the
        average is 0.0 for empty input.
    """
    converted_words = []
    confidence_scores = []
    for word in text.lower().split():
        clean_word = re.sub(r'[^\w]', '', word)
        if clean_word in hinglish_dict:
            converted, confidence = hinglish_dict[clean_word], 1.0
        else:
            converted, confidence = _transliterate_fallback(clean_word, word)
        converted_words.append(converted)
        confidence_scores.append(confidence)

    avg_confidence = np.mean(confidence_scores) if confidence_scores else 0.0
    return ' '.join(converted_words), avg_confidence
# Static, in-memory description of the reference datasets.
datasets_info = load_professional_datasets()

# Streamlit re-runs this script on each widget interaction. Building the
# dictionary involves a dataset download, so the result is kept in the
# per-session state and reused on later runs instead of being rebuilt.
# Note that a fallback result is also kept for the rest of the session.
if "_hinglish_lookup" not in st.session_state:
    st.session_state["_hinglish_lookup"] = load_enhanced_hinglish_dataset()
hinglish_dict, sources_loaded = st.session_state["_hinglish_lookup"]
# Text translation tab: pick a direction, enter text, translate on click.
with tab2:
    st.subheader("📝Text Translation")
    option = st.radio(
        "Translation Type:",
        ["English ➝ Hindi", "Hindi ➝ English", "Hinglish ➝ English"],
    )
    input_text = st.text_area("Enter text:", height=150, max_chars=2000)
    if input_text:
        st.caption(f"Characters: {len(input_text)}/2000")

    if st.button("🔄 Translate", type="primary"):
        if not input_text.strip():
            # Give feedback instead of silently ignoring the click.
            st.warning("⚠️ Please enter some text to translate.")
        else:
            with st.spinner("🌐 Processing with models..."):
                try:
                    if option == "English ➝ Hindi":
                        result = translate_text(input_text, "English", "Hindi")
                    elif option == "Hindi ➝ English":
                        result = translate_text(input_text, "Hindi", "English")
                    elif option == "Hinglish ➝ English":
                        # Two-step path: romanized Hindi -> Devanagari -> English.
                        hindi_text, confidence = professional_hinglish_to_hindi(input_text, hinglish_dict)
                        st.info(f"🔤 **Converted to Hindi:** {hindi_text}")
                        st.caption(f"Confidence: {confidence:.2%}")
                        result = translate_text(hindi_text, "Hindi", "English")

                    # translate_text reports failures as strings. The fixed
                    # messages are matched exactly so a genuine translation
                    # that merely begins with "Failed" is not shown as an
                    # error; only "Translation error:" carries a variable
                    # suffix and is matched by prefix.
                    failure_messages = (
                        "No text to translate",
                        "Translation pair not supported",
                        "Failed to load translation model",
                    )
                    failed = (
                        not result
                        or result in failure_messages
                        or result.startswith("Translation error:")
                    )
                    if failed:
                        st.error(f"❌ {result}")
                    else:
                        st.success("🌐 **Translation:**")
                        st.code(result, language=None)
                except Exception as e:
                    st.error(f"❌ Error: {str(e)}")