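# Page-header residue from the file listing (not part of the program):
# author "prachi1507" · commit message "create app.py" · commit 3909dfe (verified)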
import streamlit as st
import whisper
import tempfile
import os
import torch
from datetime import datetime
import warnings
import gc
# Suppress every Python warning for the whole process (no category or module
# filter is given, so nothing is exempt).
warnings.filterwarnings("ignore")
# Configure the Streamlit page: title, icon and a centered layout.
st.set_page_config(
    page_title="Audio Transcriber & Translator",
    page_icon="🎡",
    layout="centered"
)
# Inject the app's stylesheet. The CSS classes defined here are referenced by
# the raw-HTML snippets rendered later in main():
#   .main-header      - gradient banner at the top of the page
#   .result-section   - card around transcription / translation headings
#   .download-section - green card above the download button
#   .language-badge   - pill showing the detected language
#   .warning-box      - yellow notice box
# unsafe_allow_html=True is passed so the <style> tag is emitted as HTML.
st.markdown("""
<style>
.main-header {
text-align: center;
padding: 2rem 0;
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
color: white;
border-radius: 10px;
margin-bottom: 2rem;
}
.result-section {
background: #f8f9fa;
padding: 1.5rem;
border-radius: 10px;
margin: 1rem 0;
border-left: 4px solid #667eea;
}
.download-section {
background: #e8f5e8;
padding: 1.5rem;
border-radius: 10px;
margin-top: 1.5rem;
text-align: center;
}
.language-badge {
background: #667eea;
color: white;
padding: 0.5rem 1rem;
border-radius: 20px;
font-weight: bold;
display: inline-block;
margin-bottom: 1rem;
}
.warning-box {
background: #fff3cd;
border: 1px solid #ffeaa7;
padding: 1rem;
border-radius: 8px;
margin: 1rem 0;
}
</style>
""", unsafe_allow_html=True)
class M2M100Translator:
    """Translate text into English with the ``facebook/m2m100_418M`` model.

    The tokenizer and model are loaded lazily by :meth:`load_model` the first
    time a translation actually needs them. Every public method reports
    problems through its return value (a dict with ``"success": False``)
    instead of raising.
    """

    # Longest slice of text, in characters, handed to the model in one call.
    # The tokenizer below truncates at 200 tokens; keeping each request short
    # is intended to stay under that cap so long transcripts are translated
    # in full rather than cut off after the first ~200 tokens.
    MAX_CHUNK_CHARS = 180

    def __init__(self):
        self.model_name = "facebook/m2m100_418M"
        self.tokenizer = None
        self.model = None

        # Language codes accepted as a translation source, mapped to the
        # display name shown in the UI.
        self.supported_languages = {
            'af': 'Afrikaans', 'ar': 'Arabic', 'bg': 'Bulgarian', 'bn': 'Bengali',
            'ca': 'Catalan', 'cs': 'Czech', 'da': 'Danish', 'de': 'German',
            'el': 'Greek', 'en': 'English', 'es': 'Spanish', 'et': 'Estonian',
            'fa': 'Persian', 'fi': 'Finnish', 'fr': 'French', 'gu': 'Gujarati',
            'he': 'Hebrew', 'hi': 'Hindi', 'hr': 'Croatian', 'hu': 'Hungarian',
            'id': 'Indonesian', 'it': 'Italian', 'ja': 'Japanese', 'ka': 'Georgian',
            'kk': 'Kazakh', 'km': 'Khmer', 'kn': 'Kannada', 'ko': 'Korean',
            'lt': 'Lithuanian', 'lv': 'Latvian', 'mk': 'Macedonian', 'ml': 'Malayalam',
            'mn': 'Mongolian', 'mr': 'Marathi', 'ms': 'Malay', 'my': 'Myanmar',
            'ne': 'Nepali', 'nl': 'Dutch', 'no': 'Norwegian', 'pl': 'Polish',
            'pt': 'Portuguese', 'ro': 'Romanian', 'ru': 'Russian', 'si': 'Sinhala',
            'sk': 'Slovak', 'sl': 'Slovenian', 'sq': 'Albanian', 'sr': 'Serbian',
            'sv': 'Swedish', 'sw': 'Swahili', 'ta': 'Tamil', 'te': 'Telugu',
            'th': 'Thai', 'tl': 'Tagalog', 'tr': 'Turkish', 'uk': 'Ukrainian',
            'ur': 'Urdu', 'vi': 'Vietnamese', 'zh': 'Chinese'
        }

    def load_model(self):
        """Load the tokenizer and model on first use.

        Returns:
            True when the model is available (already loaded or loaded now),
            False when loading failed; the failure is reported in the UI.
        """
        if self.model is None:
            try:
                # Imported here so the app can still start (and transcribe)
                # when transformers is missing or broken.
                from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

                with st.spinner("πŸ”„ Loading M2M100 translation model..."):
                    self.tokenizer = M2M100Tokenizer.from_pretrained(self.model_name)
                    self.model = M2M100ForConditionalGeneration.from_pretrained(
                        self.model_name,
                        torch_dtype=torch.float32  # Use float32 for CPU compatibility
                    )
                st.success("βœ… Translation model loaded successfully!")
            except Exception as e:
                st.error(f"❌ Failed to load translation model: {str(e)}")
                st.info("πŸ’‘ Translation will be skipped. You can still get transcripts.")
                return False
        return True

    def get_language_name(self, lang_code):
        """Return the display name for ``lang_code``, or the upper-cased code if unknown."""
        return self.supported_languages.get(lang_code, lang_code.upper())

    @staticmethod
    def _split_into_chunks(text, max_chars=180):
        """Split ``text`` into pieces of at most ``max_chars`` characters.

        Break points are chosen in this order of preference: just after
        sentence-ending punctuation, at whitespace, and only as a last resort
        in the middle of an unbroken run of characters. Pieces are stripped;
        empty pieces are dropped, so blank input yields an empty list.
        """
        # Local imports keep this block self-contained; the module does not
        # import these names at file level.
        import bisect
        import re

        text = text.strip()
        if not text:
            return []
        max_chars = max(1, int(max_chars))
        if len(text) <= max_chars:
            return [text]

        # Index just past each sentence terminator. Latin-style terminators
        # only count when followed by whitespace (so "3.14" is not split);
        # CJK terminators count on their own because no space follows them.
        boundaries = [
            match.end()
            for match in re.finditer(r"[.!?؟۔।](?=\s)|[。！？]", text)
        ]

        chunks = []
        start, total = 0, len(text)
        while start < total:
            limit = start + max_chars
            if limit >= total:
                end = total
            else:
                idx = bisect.bisect_right(boundaries, limit) - 1
                if idx >= 0 and boundaries[idx] > start:
                    end = boundaries[idx]
                else:
                    # No sentence end in range: back up to the last
                    # whitespace, or cut hard at the limit if there is none.
                    end = next(
                        (i for i in range(limit, start, -1) if text[i].isspace()),
                        limit,
                    )
            piece = text[start:end].strip()
            if piece:
                chunks.append(piece)
            start = end
        return chunks

    def translate_text(self, text, source_language):
        """Translate ``text`` from ``source_language`` into English.

        Args:
            text: Text to translate.
            source_language: Language code of ``text`` (a key of
                ``self.supported_languages``).

        Returns:
            A dict that always contains ``"success"``. On success it also has
            ``"original_text"``, ``"translated_text"`` and
            ``"source_language"`` plus either ``"note"`` (source already
            English, text returned unchanged) or ``"model_used"``. On failure
            it has ``"error"`` and, except for empty input,
            ``"original_text"`` and ``"source_language"``.
        """
        if not text or not text.strip():
            return {"success": False, "error": "Empty text provided"}

        # If already English, return as is
        if source_language == 'en':
            return {
                "success": True,
                "original_text": text,
                "translated_text": text,
                "source_language": source_language,
                "note": "Source is already English"
            }

        # Check if source language is supported
        if source_language not in self.supported_languages:
            return {
                "success": False,
                "error": f"Language '{source_language}' not supported",
                "original_text": text,
                "source_language": source_language
            }

        if not self.load_model():
            return {
                "success": False,
                "error": "Translation model not available",
                "original_text": text,
                "source_language": source_language
            }

        try:
            self.tokenizer.src_lang = source_language
            english_id = self.tokenizer.get_lang_id("en")

            translated_parts = []
            # Translate chunk by chunk: a single call would truncate the
            # input at 200 tokens and drop the rest of a long transcript.
            for chunk in self._split_into_chunks(text, self.MAX_CHUNK_CHARS):
                inputs = self.tokenizer(
                    chunk,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,  # safety net; chunks are meant to fit
                    max_length=200
                )
                with torch.no_grad():
                    generated_tokens = self.model.generate(
                        **inputs,
                        forced_bos_token_id=english_id,
                        max_length=250,
                        num_beams=2,  # Reduced beams for speed
                        early_stopping=True,
                        do_sample=False
                    )
                decoded = self.tokenizer.batch_decode(
                    generated_tokens,
                    skip_special_tokens=True
                )[0].strip()
                if decoded:
                    translated_parts.append(decoded)
                # Release per-chunk tensors before the next iteration.
                del inputs, generated_tokens

            gc.collect()
            return {
                "success": True,
                "original_text": text,
                "translated_text": " ".join(translated_parts),
                "source_language": source_language,
                "model_used": self.model_name
            }
        except Exception as e:
            return {
                "success": False,
                "error": str(e),
                "original_text": text,
                "source_language": source_language
            }
@st.cache_resource
def _load_whisper_model_cached():
    """Load the Whisper ``tiny`` checkpoint and cache it.

    Exceptions are deliberately allowed to propagate: only a successfully
    loaded model should ever be stored in the resource cache.
    """
    # Use tiny model for faster loading and processing on HF Spaces
    return whisper.load_model("tiny")


def load_whisper_model():
    """Return the cached Whisper model, or ``None`` if it cannot be loaded.

    The error handling lives outside the cached function so that a failed
    load is reported in the UI but not cached; the next call tries again
    instead of returning a remembered ``None``.
    """
    try:
        return _load_whisper_model_cached()
    except Exception as e:
        st.error(f"Failed to load Whisper model: {e}")
        return None
@st.cache_resource
def load_translator():
    """Build the translator once and hand back the cached instance afterwards."""
    translator = M2M100Translator()
    return translator
def transcribe_audio(audio_file):
    """Transcribe an uploaded audio file with Whisper.

    Args:
        audio_file: Binary file-like object exposing ``read()``. If it has a
            ``name`` attribute, its extension is reused for the temporary
            file; otherwise ``.wav`` is used.

    Returns:
        ``{"success": True, "transcript": str, "language": str}`` on success,
        or ``{"success": False, "error": str}`` on any failure. Never raises.
    """
    tmp_file_path = None
    try:
        # Resolve the model first so no temporary file is created when
        # transcription cannot run anyway.
        model = load_whisper_model()
        if model is None:
            return {"success": False, "error": "Whisper model not available"}

        # Rewind when possible: the stream may already have been consumed,
        # in which case read() would return nothing.
        seekable = getattr(audio_file, "seekable", None)
        if callable(seekable) and seekable():
            audio_file.seek(0)

        suffix = os.path.splitext(getattr(audio_file, "name", "") or "")[1] or ".wav"

        # delete=False because the file is reopened by path after this
        # handle is closed; it is removed in the finally clause below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            # Record the path before writing so a failed write is still cleaned up.
            tmp_file_path = tmp_file.name
            tmp_file.write(audio_file.read())

        # Transcribe with optimized settings for HF Spaces
        result = model.transcribe(
            tmp_file_path,
            fp16=False,  # Use fp32 for better compatibility
            task="transcribe"
        )
        return {
            "success": True,
            "transcript": result["text"].strip(),
            "language": result["language"]
        }
    except Exception as e:
        return {"success": False, "error": str(e)}
    finally:
        if tmp_file_path is not None:
            try:
                os.unlink(tmp_file_path)
            except OSError:
                # Best-effort cleanup: a file that is already gone or cannot
                # be removed must not mask the transcription result.
                pass
        gc.collect()
# Session-state slot holding the most recent successful processing result, so
# it can be redrawn on reruns in which the "Process Audio" button is not the
# widget that was clicked (for example after pressing a download button).
_RESULT_STATE_KEY = "_last_processing_result"


def _build_full_report(record):
    """Return the downloadable text report containing transcript and translation."""
    file_name = record["file_name"]
    file_size_mb = record["file_size_mb"]
    language_name = record["language_name"]
    detected_language = record["language"]
    processing_time = record["processing_time"]
    processed_at = record["processed_at"]
    transcript = record["transcript"]
    translated_text = record["translation"]["translated_text"]
    return f"""Audio Transcription & Translation
{'='*60}
File: {file_name}
Size: {file_size_mb:.2f} MB
Detected Language: {language_name} ({detected_language})
Processing Time: {processing_time:.1f} seconds
Date: {processed_at}
{'='*60}
ORIGINAL TRANSCRIPT ({language_name}):
{transcript}
ENGLISH TRANSLATION:
{translated_text}
{'='*60}
Processed with Whisper (tiny) + M2M100 on Hugging Face Spaces
"""


def _build_transcript_report(record, include_processing_time):
    """Return the downloadable transcript-only report.

    ``include_processing_time`` adds the "Processing Time" line; it is left
    out of the report offered after a failed translation.
    """
    file_name = record["file_name"]
    language_name = record["language_name"]
    detected_language = record["language"]
    processed_at = record["processed_at"]
    transcript = record["transcript"]
    timing_line = (
        f"Processing Time: {record['processing_time']:.1f} seconds\n"
        if include_processing_time
        else ""
    )
    return f"""Audio Transcription
{'='*50}
File: {file_name}
Language: {language_name} ({detected_language})
{timing_line}Date: {processed_at}
{'='*50}
{transcript}
"""


def _render_transcription(record):
    """Draw the transcription heading, detected-language badge and transcript box."""
    language_name = record["language_name"]
    detected_language = record["language"]
    st.markdown("""
<div class="result-section">
<h3>πŸ“ Transcription Results</h3>
</div>
""", unsafe_allow_html=True)
    st.markdown(f"""
<div class="language-badge">
🌍 Detected: {language_name} ({detected_language})
</div>
""", unsafe_allow_html=True)
    st.text_area(
        "Original Transcript",
        record["transcript"],
        height=150,
        key="transcript"
    )


def _render_outputs(record):
    """Draw the translation (if any), the download button and the timing message."""
    translation = record["translation"]
    base_name = os.path.splitext(record["file_name"])[0]

    if translation is None:
        # Translation was not requested, or the audio is already English.
        st.download_button(
            "πŸ“„ Download Transcript",
            _build_transcript_report(record, include_processing_time=True),
            file_name=f"{base_name}_transcript.txt",
            mime="text/plain",
            use_container_width=True
        )
    elif translation["success"]:
        st.markdown("""
<div class="result-section">
<h3>🌍 English Translation</h3>
</div>
""", unsafe_allow_html=True)
        st.text_area(
            "English Translation",
            translation["translated_text"],
            height=150,
            key="translation"
        )
        st.markdown("""
<div class="download-section">
<h4>πŸ“₯ Download Results</h4>
</div>
""", unsafe_allow_html=True)
        st.download_button(
            "πŸ“„ Download Complete Results",
            _build_full_report(record),
            file_name=f"{base_name}_results.txt",
            mime="text/plain",
            use_container_width=True
        )
    else:
        st.error(f"❌ Translation failed: {translation['error']}")
        # Still offer transcript download
        st.download_button(
            "πŸ“„ Download Transcript",
            _build_transcript_report(record, include_processing_time=False),
            file_name=f"{base_name}_transcript.txt",
            mime="text/plain"
        )

    st.success(f"βœ… Processing completed in {record['processing_time']:.1f} seconds")


def main():
    """Render the app: header, sidebar, upload form, processing and results.

    A successful result is stored in ``st.session_state`` and redrawn on
    later reruns for the same uploaded file, so the transcript, translation
    and download button stay on screen after another widget is used.
    """
    # Header
    st.markdown("""
<div class="main-header">
<h1>🎡 Audio Transcriber & Translator</h1>
<p>Upload audio files and get transcripts with English translation</p>
<small>Optimized for Hugging Face Spaces</small>
</div>
""", unsafe_allow_html=True)

    # HF Spaces notice
    st.markdown("""
<div class="warning-box">
<strong>πŸš€ Hugging Face Spaces Version</strong><br>
β€’ Using Whisper-tiny for faster processing<br>
β€’ File limit: 10MB, Duration: 5 minutes<br>
β€’ Processing may take 1-2 minutes
</div>
""", unsafe_allow_html=True)

    # Show system info in sidebar
    with st.sidebar:
        st.header("πŸ”§ System Info")
        st.info("Running on Hugging Face Spaces")
        st.info(f"PyTorch: {torch.__version__}")
        st.warning("Using CPU (optimized for HF Spaces)")
        st.header("🌍 Models")
        st.info("β€’ Whisper: tiny (fast)")
        st.info("β€’ Translation: M2M100-418M")
        with st.expander("πŸ’‘ Tips"):
            st.caption("β€’ Use shorter audio files (< 5 min)")
            st.caption("β€’ MP3/WAV work best")
            st.caption("β€’ Clear speech gives better results")
            st.caption("β€’ Processing takes 1-2 minutes")

    # File uploader with restrictions for HF Spaces
    uploaded_file = st.file_uploader(
        "🎡 Choose an audio file",
        type=['mp3', 'wav', 'mp4', 'm4a'],
        help="Supported: MP3, WAV, MP4, M4A | Max: 10MB, 5 minutes"
    )

    if uploaded_file is not None:
        # File size check
        file_size_mb = uploaded_file.size / (1024 * 1024)
        if file_size_mb > 10:
            st.error("❌ File too large! Please use files under 10MB for optimal performance on HF Spaces.")
            return
        st.success(f"πŸ“ **{uploaded_file.name}** ({file_size_mb:.2f} MB)")

        # Identifies the upload a stored result belongs to, so results for a
        # previous file are not shown under a new one.
        file_key = (uploaded_file.name, uploaded_file.size)

        # Processing options
        col1, col2 = st.columns(2)
        with col1:
            transcribe_only = st.checkbox("Transcribe only (faster)", value=False)
        with col2:
            if st.button("🧹 Clear Cache", help="Clear models from memory"):
                st.cache_resource.clear()
                st.success("Cache cleared!")

        # Process button
        if st.button("πŸš€ Process Audio", type="primary", use_container_width=True):
            start_time = datetime.now()
            # Forget the previous result; it is replaced below on success.
            st.session_state.pop(_RESULT_STATE_KEY, None)

            # Step 1: Transcription
            with st.spinner("🎀 Transcribing audio... (this may take 1-2 minutes)"):
                transcription_result = transcribe_audio(uploaded_file)

            if transcription_result["success"]:
                transcript = transcription_result["transcript"]
                detected_language = transcription_result["language"]
                translator = load_translator()
                record = {
                    "file_key": file_key,
                    "file_name": uploaded_file.name,
                    "file_size_mb": file_size_mb,
                    "transcript": transcript,
                    "language": detected_language,
                    "language_name": translator.get_language_name(detected_language),
                    "translation": None,
                }

                # Show the transcript before the (slower) translation step.
                _render_transcription(record)

                # Step 2: Translation (if requested)
                if not transcribe_only and detected_language != 'en':
                    with st.spinner("🌍 Translating to English..."):
                        record["translation"] = translator.translate_text(
                            transcript, detected_language
                        )

                finished_at = datetime.now()
                record["processing_time"] = (finished_at - start_time).total_seconds()
                record["processed_at"] = finished_at.strftime('%Y-%m-%d %H:%M:%S')
                st.session_state[_RESULT_STATE_KEY] = record

                _render_outputs(record)
            else:
                st.error(f"❌ Transcription failed: {transcription_result['error']}")
                st.info("πŸ’‘ Try with a different audio file or format")
        else:
            # The button only reports True on the rerun caused by its own
            # click; on every other rerun redraw the stored result instead.
            record = st.session_state.get(_RESULT_STATE_KEY)
            if record is not None and record["file_key"] == file_key:
                _render_transcription(record)
                _render_outputs(record)

    # Footer
    st.markdown("---")
    st.markdown("""
<div style="text-align: center; color: #666; padding: 1rem;">
<p>🎡 Powered by OpenAI Whisper & Facebook M2M100</p>
<p>Running on Hugging Face Spaces πŸ€—</p>
</div>
""", unsafe_allow_html=True)
# Run the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()