# talk-text-translator / src/streamlit_app.py
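"""Talk or Text Translator: a Streamlit app for English/Hindi/Hinglish translation.

Translation runs on Helsinki-NLP MarianMT checkpoints. Hinglish input is first
mapped to Devanagari using a dictionary built from a Dakshina sample, an
optional local CSV, and a hand-curated word list, with indic_transliteration
as a per-word fallback.
"""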
import re
import unicodedata
from pathlib import Path

import numpy as np
import pandas as pd
import streamlit as st
import torch
from datasets import load_dataset
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from transformers import MarianMTModel, MarianTokenizer
st.set_page_config(page_title="Talk or Text Translator", page_icon="🌍")
# One "Text Translation" tab; tuple unpacking keeps the layout easy to extend.
(text_tab,) = st.tabs(["Text Translation"])
@st.cache_resource
def load_translation_model(model_name):
    """Load and cache a MarianMT tokenizer/model pair from the Hub."""
    try:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        return tokenizer, model
    except Exception as e:
        st.error(f"Error loading model {model_name}: {e}")
        return None, None
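# Caching note: st.cache_resource holds one live model/tokenizer per process
# (suited to unpicklable objects), while st.cache_data below caches picklable
# return values such as dicts and DataFrames.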
@st.cache_data
def load_professional_datasets():
    """Return metadata describing the transliteration/translation corpora."""
    datasets_info = {
        "Dakshina": {
            "size": "1.7M pairs",
            "description": "Google Research - Roman to Native script for 12 South Asian languages",
            "huggingface": "vrclc/dakshina-lexicons-ml",
            "languages": ["Hindi", "Bengali", "Tamil", "Telugu", "Malayalam", "Gujarati",
                          "Punjabi", "Kannada", "Marathi", "Odia", "Assamese", "Urdu"],
        },
        "Aksharantar": {
            "size": "26M pairs",
            "description": "AI4Bharat - Largest Indic transliteration dataset (21x larger than existing)",
            "github": "AI4Bharat/IndicXlit",
            "languages": ["21 Indic languages", "3 language families", "12 scripts"],
        },
        "Samanantar": {
            "size": "49M pairs",
            "description": "Largest Indic-English parallel corpus",
            "url": "https://indicnlp.ai4bharat.org/samanantar/",
            "languages": ["11 Indic languages to English"],
        },
        "FIRE Hinglish": {
            "size": "Large corpus",
            "description": "Code-mixed Hinglish datasets from FIRE workshop",
            "domain": "Social media, informal text",
        },
    }
    return datasets_info
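# Of the corpora listed above, only the Dakshina sample is actually downloaded
# (see download_dakshina_sample below); the rest are descriptive metadata.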
@st.cache_data
def download_dakshina_sample():
    """Download a 1,000-entry sample of the Dakshina lexicon from the Hub."""
    try:
        dataset = load_dataset("vrclc/dakshina-lexicons-ml", split="train[:1000]")
        df = pd.DataFrame(dataset)
        if 'romanized' in df.columns and 'native' in df.columns:
            # Keep Hindi rows when a language column exists; otherwise use everything.
            hindi_pairs = df[df['language'] == 'hi'] if 'language' in df.columns else df
            mapping_dict = dict(zip(hindi_pairs['romanized'].str.lower(), hindi_pairs['native']))
            return mapping_dict, len(mapping_dict)
        return {}, 0
    except Exception as e:
        st.warning(f"Could not download Dakshina: {e}")
        return {}, 0
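# The 'romanized'/'native' (and optional 'language') column names are assumed
# from the vrclc/dakshina-lexicons-ml schema; if they differ, the guard above
# degrades gracefully to an empty mapping.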
@st.cache_data
def load_enhanced_hinglish_dataset():
    """Merge Hinglish-to-Hindi word mappings from every available source."""
    hinglish_dict = {}
    sources_loaded = []
    try:
        dakshina_dict, dakshina_count = download_dakshina_sample()
        if dakshina_count > 0:
            hinglish_dict.update(dakshina_dict)
            sources_loaded.append(f"Dakshina ({dakshina_count} pairs)")
        enhanced_csv = Path("enhanced_hinglish_mapping.csv")
        if enhanced_csv.exists():
            df = pd.read_csv(enhanced_csv)
            local_dict = dict(zip(df['hinglish'].str.lower(), df['hindi']))
            hinglish_dict.update(local_dict)
            sources_loaded.append(f"Local enhanced ({len(local_dict)} pairs)")
        research_patterns = get_research_based_patterns()
        hinglish_dict.update(research_patterns)
        sources_loaded.append(f"Research patterns ({len(research_patterns)} pairs)")
        return hinglish_dict, sources_loaded
    except Exception as e:
        st.error(f"Error loading enhanced datasets: {e}")
        return get_basic_mappings(), ["Basic fallback"]
def get_research_based_patterns():
    """Hand-curated Hinglish-to-Devanagari mappings for high-frequency words."""
    return {
        'kya': 'क्या', 'hai': 'है', 'hain': 'हैं', 'kar': 'कर', 'karo': 'करो',
        'ja': 'जा', 'jao': 'जाओ', 'aa': 'आ', 'aao': 'आओ', 'de': 'दे', 'le': 'ले',
        'yaar': 'यार', 'dost': 'दोस्त', 'bhai': 'भाई', 'behen': 'बहन',
        'ghar': 'घर', 'paani': 'पानी', 'khana': 'खाना', 'time': 'टाइम',
        'phone': 'फोन', 'call': 'कॉल', 'message': 'मैसेज', 'photo': 'फोटो',
        'video': 'वीडियो', 'music': 'म्यूजिक', 'movie': 'मूवी', 'book': 'बुक',
        'school': 'स्कूल', 'college': 'कॉलेज', 'office': 'ऑफिस', 'work': 'वर्क',
        'maal': 'माल', 'scene': 'सीन', 'tension': 'टेंशन', 'problem': 'प्रॉब्लम',
        'solution': 'सोल्यूशन', 'idea': 'आइडिया', 'plan': 'प्लान', 'party': 'पार्टी',
        'achha': 'अच्छा', 'bura': 'बुरा', 'naya': 'नया', 'purana': 'पुराना',
        'bada': 'बड़ा', 'chota': 'छोटा', 'thoda': 'थोड़ा', 'jyada': 'ज्यादा',
        'sab': 'सब', 'kuch': 'कुछ', 'koi': 'कोई', 'yahan': 'यहाँ', 'wahan': 'वहाँ',
        'kal': 'कल', 'aaj': 'आज', 'abhi': 'अभी', 'baad': 'बाद', 'pehle': 'पहले',
    }
def get_basic_mappings():
    """Fallback mapping set used when dataset loading fails."""
    return get_research_based_patterns()
def get_model(input_lang, output_lang):
    """Map a (source, target) language pair to a MarianMT checkpoint name."""
    models = {
        ("Hindi", "English"): "Helsinki-NLP/opus-mt-hi-en",
        ("English", "Hindi"): "Helsinki-NLP/opus-mt-en-hi",
    }
    return models.get((input_lang, output_lang))
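# Extending coverage is a matter of adding entries above; any matching
# Helsinki-NLP opus-mt checkpoint should work, e.g. (an untested assumption)
# ("English", "French"): "Helsinki-NLP/opus-mt-en-fr".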
def translate_text(text, input_lang, output_lang):
    """Translate text with the MarianMT model for the given language pair."""
    if not text or not text.strip():
        return "No text to translate"
    model_name = get_model(input_lang, output_lang)
    if not model_name:
        return "Translation pair not supported"
    tokenizer, model = load_translation_model(model_name)
    if tokenizer is None or model is None:
        return "Failed to load translation model"
    try:
        text = preprocess_text(text.strip(), input_lang)
        inputs = tokenizer([text], return_tensors="pt", padding=True,
                           truncation=True, max_length=512)
        with torch.no_grad():
            translated_tokens = model.generate(
                **inputs,
                max_length=512,
                num_beams=6,
                length_penalty=0.8,
                early_stopping=True,
                do_sample=False,
            )
        translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
        return postprocess_text(translated_text, output_lang)
    except Exception as e:
        return f"Translation error: {str(e)}"
def preprocess_text(text, lang):
    """Normalize Unicode; drop control characters; for Hindi, keep Devanagari only."""
    text = unicodedata.normalize('NFC', text)
    # Replace control/format characters (Unicode category "C*") with spaces so
    # line breaks do not fuse adjacent words, then collapse runs of whitespace.
    text = ''.join(ch if unicodedata.category(ch)[0] != 'C' else ' ' for ch in text)
    text = re.sub(r'\s+', ' ', text)
    if lang == "Hindi":
        # Keep only Devanagari code points and whitespace.
        text = re.sub(r'[^\u0900-\u097F\s]', '', text)
    return text.strip()
def postprocess_text(text, lang):
    """Collapse whitespace; for Hindi output, strip nukta combining marks."""
    text = re.sub(r'\s+', ' ', text).strip()
    if lang == "Hindi":
        # Remove the nukta combining mark (U+093C) from model output.
        text = re.sub(r'\u093C', '', text)
    return text
def professional_hinglish_to_hindi(text, hinglish_dict):
    """Convert Hinglish to Devanagari word by word; return (text, confidence).

    Dictionary hits score 1.0, indic_transliteration fallbacks score 0.7, and
    words left unchanged score 0.0; the overall confidence is the mean.
    """
    converted_words = []
    confidence_scores = []
    for word in text.lower().split():
        clean_word = re.sub(r'[^\w]', '', word)
        confidence = 0.0
        if clean_word in hinglish_dict:
            converted_words.append(hinglish_dict[clean_word])
            confidence = 1.0
        else:
            # Try common romanization schemes until one yields Devanagari.
            best_result = word
            for scheme in (sanscript.ITRANS, sanscript.HK, sanscript.IAST):
                try:
                    result = transliterate(clean_word, scheme, sanscript.DEVANAGARI)
                    if result != clean_word:
                        best_result = result
                        confidence = 0.7
                        break
                except Exception:
                    continue
            converted_words.append(best_result)
        confidence_scores.append(confidence)
    avg_confidence = np.mean(confidence_scores) if confidence_scores else 0.0
    return ' '.join(converted_words), avg_confidence
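# Worked example, using entries from get_research_based_patterns():
#   professional_hinglish_to_hindi("kya hai", hinglish_dict)
#   -> ("क्या है", 1.0)  # both words are dictionary hits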
datasets_info = load_professional_datasets()
hinglish_dict, sources_loaded = load_enhanced_hinglish_dataset()
with text_tab:
    st.subheader("📝 Text Translation")
    option = st.radio("Translation Type:",
                      ["English ➝ Hindi", "Hindi ➝ English", "Hinglish ➝ English"])
    input_text = st.text_area("Enter text:", height=150, max_chars=2000)
    if input_text:
        st.caption(f"Characters: {len(input_text)}/2000")
    if st.button("🔄 Translate", type="primary"):
        if input_text.strip():
            with st.spinner("🌐 Processing with models..."):
                try:
                    if option == "English ➝ Hindi":
                        result = translate_text(input_text, "English", "Hindi")
                    elif option == "Hindi ➝ English":
                        result = translate_text(input_text, "Hindi", "English")
                    elif option == "Hinglish ➝ English":
                        # Two-stage pipeline: Hinglish -> Hindi, then Hindi -> English.
                        hindi_text, confidence = professional_hinglish_to_hindi(input_text, hinglish_dict)
                        st.info(f"🔤 **Converted to Hindi:** {hindi_text}")
                        st.caption(f"Confidence: {confidence:.2%}")
                        result = translate_text(hindi_text, "Hindi", "English")
                    if result and not result.startswith(("Translation error:", "Failed")):
                        st.success("🌐 **Translation:**")
                        st.code(result, language=None)
                    else:
                        st.error(f"❌ {result}")
                except Exception as e:
                    st.error(f"❌ Error: {str(e)}")
        else:
            st.warning("Please enter some text to translate.")