import streamlit as st
import whisper
import tempfile
import os
import torch
from datetime import datetime
import warnings
import gc

# Suppress warnings
warnings.filterwarnings("ignore")

# Configure Streamlit page
st.set_page_config(
    page_title="Audio Transcriber & Translator",
    page_icon="๐ŸŽต",
    layout="centered"
)

# Custom CSS for better UI
# NOTE(review): the style content appears to have been lost in a previous
# copy/paste (the literal is empty) — restore the intended CSS here.
st.markdown("""
""", unsafe_allow_html=True)


class M2M100Translator:
    """Lazy wrapper around facebook/m2m100_418M for X -> English translation.

    The heavy transformers model is only loaded on first use (load_model),
    so building an instance is cheap and safe inside @st.cache_resource.
    """

    def __init__(self):
        self.model_name = "facebook/m2m100_418M"
        self.tokenizer = None  # set on first successful load_model()
        self.model = None      # set on first successful load_model()
        # M2M100 language codes -> human-readable names (subset supported here)
        self.supported_languages = {
            'af': 'Afrikaans', 'ar': 'Arabic', 'bg': 'Bulgarian', 'bn': 'Bengali',
            'ca': 'Catalan', 'cs': 'Czech', 'da': 'Danish', 'de': 'German',
            'el': 'Greek', 'en': 'English', 'es': 'Spanish', 'et': 'Estonian',
            'fa': 'Persian', 'fi': 'Finnish', 'fr': 'French', 'gu': 'Gujarati',
            'he': 'Hebrew', 'hi': 'Hindi', 'hr': 'Croatian', 'hu': 'Hungarian',
            'id': 'Indonesian', 'it': 'Italian', 'ja': 'Japanese', 'ka': 'Georgian',
            'kk': 'Kazakh', 'km': 'Khmer', 'kn': 'Kannada', 'ko': 'Korean',
            'lt': 'Lithuanian', 'lv': 'Latvian', 'mk': 'Macedonian', 'ml': 'Malayalam',
            'mn': 'Mongolian', 'mr': 'Marathi', 'ms': 'Malay', 'my': 'Myanmar',
            'ne': 'Nepali', 'nl': 'Dutch', 'no': 'Norwegian', 'pl': 'Polish',
            'pt': 'Portuguese', 'ro': 'Romanian', 'ru': 'Russian', 'si': 'Sinhala',
            'sk': 'Slovak', 'sl': 'Slovenian', 'sq': 'Albanian', 'sr': 'Serbian',
            'sv': 'Swedish', 'sw': 'Swahili', 'ta': 'Tamil', 'te': 'Telugu',
            'th': 'Thai', 'tl': 'Tagalog', 'tr': 'Turkish', 'uk': 'Ukrainian',
            'ur': 'Urdu', 'vi': 'Vietnamese', 'zh': 'Chinese'
        }

    def load_model(self):
        """Load tokenizer + model on first call. Returns True when usable."""
        if self.model is None:
            try:
                # Imported lazily so the app still starts if transformers is absent.
                from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
                with st.spinner("๐Ÿ”„ Loading M2M100 translation model..."):
                    # Load tokenizer and model - simplified for HF Spaces
                    self.tokenizer = M2M100Tokenizer.from_pretrained(self.model_name)
                    self.model = M2M100ForConditionalGeneration.from_pretrained(
                        self.model_name,
                        torch_dtype=torch.float32  # Use float32 for CPU compatibility
                    )
                st.success("โœ… Translation model loaded successfully!")
            except Exception as e:
                st.error(f"โŒ Failed to load translation model: {str(e)}")
                st.info("๐Ÿ’ก Translation will be skipped. You can still get transcripts.")
                return False
        return True

    def get_language_name(self, lang_code):
        """Map an ISO code to a display name; fall back to the upper-cased code."""
        return self.supported_languages.get(lang_code, lang_code.upper())

    def translate_text(self, text, source_language):
        """Translate *text* from *source_language* into English.

        Returns a dict with at least "success"; on success also
        "original_text" and "translated_text". Never raises.
        """
        if not text or not text.strip():
            return {"success": False, "error": "Empty text provided"}

        # If already English, return as is
        if source_language == 'en':
            return {
                "success": True,
                "original_text": text,
                "translated_text": text,
                "source_language": source_language,
                "note": "Source is already English"
            }

        # Check if source language is supported
        if source_language not in self.supported_languages:
            return {
                "success": False,
                "error": f"Language '{source_language}' not supported",
                "original_text": text,
                "source_language": source_language
            }

        if not self.load_model():
            return {
                "success": False,
                "error": "Translation model not available",
                "original_text": text,
                "source_language": source_language
            }

        try:
            # Set source language
            self.tokenizer.src_lang = source_language

            # Tokenize input with length limits for HF Spaces
            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=200  # Reduced for faster processing
            )

            # Generate translation
            with torch.no_grad():
                generated_tokens = self.model.generate(
                    **inputs,
                    forced_bos_token_id=self.tokenizer.get_lang_id("en"),
                    max_length=250,
                    num_beams=2,  # Reduced beams for speed
                    early_stopping=True,
                    do_sample=False
                )

            # Decode translation
            translated_text = self.tokenizer.batch_decode(
                generated_tokens,
                skip_special_tokens=True
            )[0]

            # Clear memory
            del inputs, generated_tokens
            gc.collect()

            return {
                "success": True,
                "original_text": text,
                "translated_text": translated_text.strip(),
                "source_language": source_language,
                "model_used": self.model_name
            }
        except Exception as e:
            return {
                "success": False,
                "error": str(e),
                "original_text": text,
                "source_language": source_language
            }


@st.cache_resource
def load_whisper_model():
    """Load Whisper model with caching - optimized for HF Spaces"""
    try:
        # Use tiny model for faster loading and processing on HF Spaces
        model = whisper.load_model("tiny")
        return model
    except Exception as e:
        st.error(f"Failed to load Whisper model: {e}")
        return None


@st.cache_resource
def load_translator():
    """Load translator with caching"""
    return M2M100Translator()


def transcribe_audio(audio_file):
    """Transcribe an uploaded audio file.

    Writes the upload to a temp file (whisper needs a path), transcribes it,
    and always removes the temp file. Returns a dict with "success" plus
    either "transcript"/"language" or "error". Never raises.
    """
    tmp_file_path = None
    try:
        # Create temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            tmp_file.write(audio_file.read())
            tmp_file_path = tmp_file.name

        model = load_whisper_model()
        if model is None:
            return {"success": False, "error": "Whisper model not available"}

        # Transcribe with optimized settings for HF Spaces
        result = model.transcribe(
            tmp_file_path,
            fp16=False,  # Use fp32 for better compatibility
            task="transcribe"
        )

        return {
            "success": True,
            "transcript": result["text"].strip(),
            "language": result["language"]
        }
    except Exception as e:
        return {"success": False, "error": str(e)}
    finally:
        # Clean up the temp file on every path (success or failure).
        if tmp_file_path is not None:
            try:
                os.unlink(tmp_file_path)
            except OSError:
                pass  # best-effort cleanup; file may already be gone
        gc.collect()


def main():
    """Render the Streamlit UI and drive the transcribe/translate workflow."""
    # Header
    # NOTE(review): the surrounding HTML markup of these markdown blocks was
    # lost in a previous copy/paste; only the text content survives.
    st.markdown("""
๐ŸŽต Audio Transcriber & Translator

Upload audio files and get transcripts with English translation

Optimized for Hugging Face Spaces
""", unsafe_allow_html=True)

    # HF Spaces notice
    st.markdown("""
๐Ÿš€ Hugging Face Spaces Version
โ€ข Using Whisper-tiny for faster processing
โ€ข File limit: 10MB, Duration: 5 minutes
โ€ข Processing may take 1-2 minutes
""", unsafe_allow_html=True)

    # Show system info in sidebar
    with st.sidebar:
        st.header("๐Ÿ”ง System Info")
        st.info("Running on Hugging Face Spaces")
        st.info(f"PyTorch: {torch.__version__}")
        st.warning("Using CPU (optimized for HF Spaces)")

        st.header("๐ŸŒ Models")
        st.info("โ€ข Whisper: tiny (fast)")
        st.info("โ€ข Translation: M2M100-418M")

        with st.expander("๐Ÿ’ก Tips"):
            st.caption("โ€ข Use shorter audio files (< 5 min)")
            st.caption("โ€ข MP3/WAV work best")
            st.caption("โ€ข Clear speech gives better results")
            st.caption("โ€ข Processing takes 1-2 minutes")

    # File uploader with restrictions for HF Spaces
    uploaded_file = st.file_uploader(
        "๐ŸŽต Choose an audio file",
        type=['mp3', 'wav', 'mp4', 'm4a'],
        help="Supported: MP3, WAV, MP4, M4A | Max: 10MB, 5 minutes"
    )

    if uploaded_file is not None:
        # File size check
        file_size_mb = uploaded_file.size / (1024 * 1024)
        if file_size_mb > 10:
            st.error("โŒ File too large! Please use files under 10MB for optimal performance on HF Spaces.")
            return

        st.success(f"๐Ÿ“ **{uploaded_file.name}** ({file_size_mb:.2f} MB)")

        # Processing options
        col1, col2 = st.columns(2)
        with col1:
            transcribe_only = st.checkbox("Transcribe only (faster)", value=False)
        with col2:
            if st.button("๐Ÿงน Clear Cache", help="Clear models from memory"):
                st.cache_resource.clear()
                st.success("Cache cleared!")

        # Process button
        if st.button("๐Ÿš€ Process Audio", type="primary", use_container_width=True):
            start_time = datetime.now()

            # Step 1: Transcription
            with st.spinner("๐ŸŽค Transcribing audio... (this may take 1-2 minutes)"):
                transcription_result = transcribe_audio(uploaded_file)

            if transcription_result["success"]:
                transcript = transcription_result["transcript"]
                detected_language = transcription_result["language"]

                # Get language name
                translator = load_translator()
                language_name = translator.get_language_name(detected_language)

                # Display transcription results
                st.markdown("""
๐Ÿ“ Transcription Results
""", unsafe_allow_html=True)

                # Language badge
                st.markdown(f"""
๐ŸŒ Detected: {language_name} ({detected_language})
""", unsafe_allow_html=True)

                # Transcript
                st.text_area(
                    "Original Transcript",
                    transcript,
                    height=150,
                    key="transcript"
                )

                # Step 2: Translation (if requested)
                if not transcribe_only and detected_language != 'en':
                    with st.spinner("๐ŸŒ Translating to English..."):
                        translation_result = translator.translate_text(transcript, detected_language)

                    if translation_result["success"]:
                        translated_text = translation_result["translated_text"]

                        st.markdown("""
๐ŸŒ English Translation
""", unsafe_allow_html=True)

                        st.text_area(
                            "English Translation",
                            translated_text,
                            height=150,
                            key="translation"
                        )

                        # Download section
                        st.markdown("""
๐Ÿ“ฅ Download Results
""", unsafe_allow_html=True)

                        # Prepare download content
                        full_content = f"""Audio Transcription & Translation
{'='*60}
File: {uploaded_file.name}
Size: {file_size_mb:.2f} MB
Detected Language: {language_name} ({detected_language})
Processing Time: {(datetime.now() - start_time).total_seconds():.1f} seconds
Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
{'='*60}

ORIGINAL TRANSCRIPT ({language_name}):
{transcript}

ENGLISH TRANSLATION:
{translated_text}

{'='*60}
Processed with Whisper (tiny) + M2M100 on Hugging Face Spaces
"""

                        st.download_button(
                            "๐Ÿ“„ Download Complete Results",
                            full_content,
                            file_name=f"{os.path.splitext(uploaded_file.name)[0]}_results.txt",
                            mime="text/plain",
                            use_container_width=True
                        )
                    else:
                        st.error(f"โŒ Translation failed: {translation_result['error']}")

                        # Still offer transcript download
                        transcript_content = f"""Audio Transcription
{'='*50}
File: {uploaded_file.name}
Language: {language_name} ({detected_language})
Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
{'='*50}

{transcript}
"""

                        st.download_button(
                            "๐Ÿ“„ Download Transcript",
                            transcript_content,
                            file_name=f"{os.path.splitext(uploaded_file.name)[0]}_transcript.txt",
                            mime="text/plain"
                        )

                elif transcribe_only or detected_language == 'en':
                    # Transcript only
                    transcript_content = f"""Audio Transcription
{'='*50}
File: {uploaded_file.name}
Language: {language_name} ({detected_language})
Processing Time: {(datetime.now() - start_time).total_seconds():.1f} seconds
Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
{'='*50}

{transcript}
"""

                    st.download_button(
                        "๐Ÿ“„ Download Transcript",
                        transcript_content,
                        file_name=f"{os.path.splitext(uploaded_file.name)[0]}_transcript.txt",
                        mime="text/plain",
                        use_container_width=True
                    )

                # Show processing time
                processing_time = (datetime.now() - start_time).total_seconds()
                st.success(f"โœ… Processing completed in {processing_time:.1f} seconds")
            else:
                st.error(f"โŒ Transcription failed: {transcription_result['error']}")
                st.info("๐Ÿ’ก Try with a different audio file or format")

    # Footer
    st.markdown("---")
    st.markdown("""
๐ŸŽต Powered by OpenAI Whisper & Facebook M2M100

Running on Hugging Face Spaces ๐Ÿค—
""", unsafe_allow_html=True)


if __name__ == "__main__":
    main()