# --- Hugging Face Spaces page chrome (non-code residue captured in the paste) ---
# Spaces:
# Runtime error
# Runtime error
| import streamlit as st | |
| import whisper | |
| import tempfile | |
| import os | |
| import torch | |
| from datetime import datetime | |
| import warnings | |
| import gc | |
# Suppress warnings (library deprecation noise from whisper/transformers would
# otherwise clutter the Spaces log).
warnings.filterwarnings("ignore")

# Configure Streamlit page — must run before any other st.* call.
# NOTE(review): page_icon "π΅" (and the "π…"/"β…" strings elsewhere) look like a
# mojibake'd emoji from an encoding round-trip — confirm and restore (e.g. 🎵).
st.set_page_config(
    page_title="Audio Transcriber & Translator",
    page_icon="π΅",
    layout="centered"
)

# Custom CSS for better UI, injected as raw HTML. Class names here
# (.main-header, .result-section, .download-section, .language-badge,
# .warning-box) are referenced by the st.markdown blocks in main().
st.markdown("""
<style>
    .main-header {
        text-align: center;
        padding: 2rem 0;
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
        color: white;
        border-radius: 10px;
        margin-bottom: 2rem;
    }
    .result-section {
        background: #f8f9fa;
        padding: 1.5rem;
        border-radius: 10px;
        margin: 1rem 0;
        border-left: 4px solid #667eea;
    }
    .download-section {
        background: #e8f5e8;
        padding: 1.5rem;
        border-radius: 10px;
        margin-top: 1.5rem;
        text-align: center;
    }
    .language-badge {
        background: #667eea;
        color: white;
        padding: 0.5rem 1rem;
        border-radius: 20px;
        font-weight: bold;
        display: inline-block;
        margin-bottom: 1rem;
    }
    .warning-box {
        background: #fff3cd;
        border: 1px solid #ffeaa7;
        padding: 1rem;
        border-radius: 8px;
        margin: 1rem 0;
    }
</style>
""", unsafe_allow_html=True)
class M2M100Translator:
    """Lazy wrapper around facebook/m2m100_418M for X -> English translation.

    The heavyweight transformers model is only loaded on first use
    (see load_model), so constructing this object is cheap.
    """

    def __init__(self):
        self.model_name = "facebook/m2m100_418M"
        # Both stay None until load_model() succeeds.
        self.tokenizer = None
        self.model = None
        # ISO 639-1 codes this app accepts as translation sources
        # (subset of M2M100's language inventory).
        self.supported_languages = {
            'af': 'Afrikaans', 'ar': 'Arabic', 'bg': 'Bulgarian', 'bn': 'Bengali',
            'ca': 'Catalan', 'cs': 'Czech', 'da': 'Danish', 'de': 'German',
            'el': 'Greek', 'en': 'English', 'es': 'Spanish', 'et': 'Estonian',
            'fa': 'Persian', 'fi': 'Finnish', 'fr': 'French', 'gu': 'Gujarati',
            'he': 'Hebrew', 'hi': 'Hindi', 'hr': 'Croatian', 'hu': 'Hungarian',
            'id': 'Indonesian', 'it': 'Italian', 'ja': 'Japanese', 'ka': 'Georgian',
            'kk': 'Kazakh', 'km': 'Khmer', 'kn': 'Kannada', 'ko': 'Korean',
            'lt': 'Lithuanian', 'lv': 'Latvian', 'mk': 'Macedonian', 'ml': 'Malayalam',
            'mn': 'Mongolian', 'mr': 'Marathi', 'ms': 'Malay', 'my': 'Myanmar',
            'ne': 'Nepali', 'nl': 'Dutch', 'no': 'Norwegian', 'pl': 'Polish',
            'pt': 'Portuguese', 'ro': 'Romanian', 'ru': 'Russian', 'si': 'Sinhala',
            'sk': 'Slovak', 'sl': 'Slovenian', 'sq': 'Albanian', 'sr': 'Serbian',
            'sv': 'Swedish', 'sw': 'Swahili', 'ta': 'Tamil', 'te': 'Telugu',
            'th': 'Thai', 'tl': 'Tagalog', 'tr': 'Turkish', 'uk': 'Ukrainian',
            'ur': 'Urdu', 'vi': 'Vietnamese', 'zh': 'Chinese'
        }

    def load_model(self):
        """Load tokenizer + model once; return True when ready, False on failure.

        Failures are surfaced in the Streamlit UI rather than raised, so the
        app can fall back to transcript-only output.
        """
        if self.model is not None:
            return True
        try:
            from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
            with st.spinner("π Loading M2M100 translation model..."):
                self.tokenizer = M2M100Tokenizer.from_pretrained(self.model_name)
                # float32 keeps inference compatible with the CPU-only Space.
                self.model = M2M100ForConditionalGeneration.from_pretrained(
                    self.model_name,
                    torch_dtype=torch.float32
                )
            st.success("β Translation model loaded successfully!")
        except Exception as e:
            st.error(f"β Failed to load translation model: {str(e)}")
            st.info("π‘ Translation will be skipped. You can still get transcripts.")
            return False
        return True

    def get_language_name(self, lang_code):
        """Return the human-readable name for a code, or the code uppercased."""
        try:
            return self.supported_languages[lang_code]
        except KeyError:
            return lang_code.upper()

    def translate_text(self, text, source_language):
        """Translate `text` from `source_language` into English.

        Returns a result dict: on success it carries "translated_text"
        (plus the original text and model name); on failure it carries
        an "error" message instead. Never raises.
        """
        if not text or not text.strip():
            return {"success": False, "error": "Empty text provided"}

        # Already English: pass the text straight through.
        if source_language == 'en':
            return {
                "success": True,
                "original_text": text,
                "translated_text": text,
                "source_language": source_language,
                "note": "Source is already English",
            }

        # Common payload for every failure path below.
        failure = {
            "success": False,
            "original_text": text,
            "source_language": source_language,
        }
        if source_language not in self.supported_languages:
            failure["error"] = f"Language '{source_language}' not supported"
            return failure
        if not self.load_model():
            failure["error"] = "Translation model not available"
            return failure

        try:
            # Tell the tokenizer which language the input is in.
            self.tokenizer.src_lang = source_language
            # Cap the input length so CPU processing stays fast on Spaces.
            encoded = self.tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=200,
            )
            with torch.no_grad():
                # forced_bos_token_id pins the output language to English;
                # two beams trade a little quality for speed.
                output_ids = self.model.generate(
                    **encoded,
                    forced_bos_token_id=self.tokenizer.get_lang_id("en"),
                    max_length=250,
                    num_beams=2,
                    early_stopping=True,
                    do_sample=False,
                )
            english = self.tokenizer.batch_decode(
                output_ids,
                skip_special_tokens=True,
            )[0]
            # Free tensors promptly — memory is tight on the free tier.
            del encoded, output_ids
            gc.collect()
            return {
                "success": True,
                "original_text": text,
                "translated_text": english.strip(),
                "source_language": source_language,
                "model_used": self.model_name,
            }
        except Exception as e:
            failure["error"] = str(e)
            return failure
@st.cache_resource
def load_whisper_model():
    """Load and cache the Whisper ASR model (one copy per server process).

    Bug fix: the docstring promised "caching" and the UI's Clear Cache button
    calls st.cache_resource.clear(), but the function was never decorated —
    so every transcription reloaded the model weights from disk. The
    @st.cache_resource decorator makes the caching (and the clear button) real.

    Returns:
        The loaded whisper model, or None if loading failed (an error is
        shown in the Streamlit UI in that case).
    """
    try:
        # "tiny" trades accuracy for load time and CPU speed on HF Spaces.
        return whisper.load_model("tiny")
    except Exception as e:
        st.error(f"Failed to load Whisper model: {e}")
        return None
@st.cache_resource
def load_translator():
    """Create and cache a single M2M100Translator instance.

    Bug fix: the docstring promised "caching", but without @st.cache_resource
    a fresh translator was built on every Streamlit rerun, discarding any
    loaded model — and the UI's Clear Cache button (st.cache_resource.clear())
    had nothing to clear. The decorator makes both behave as intended.
    """
    return M2M100Translator()
def transcribe_audio(audio_file):
    """Transcribe an uploaded audio file with Whisper.

    Args:
        audio_file: a Streamlit UploadedFile (file-like object with .read()).

    Returns:
        dict: {"success": True, "transcript": str, "language": str} on
        success, or {"success": False, "error": str} on failure. Never raises.

    Bug fixes vs. the original:
      - the temp file leaked on the "Whisper model not available" early
        return (cleanup only ran on the success path or in the handler);
        a `finally` block now removes it on every path.
      - the bare `except: pass` around cleanup is narrowed to OSError,
        the only thing os.unlink should raise here.
    """
    tmp_file_path = None
    try:
        # Whisper needs a real filesystem path (it shells out to ffmpeg), so
        # spill the upload to a temp file. ffmpeg sniffs the container format,
        # so the fixed ".wav" suffix is harmless for mp3/mp4/m4a uploads.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            tmp_file.write(audio_file.read())
            tmp_file_path = tmp_file.name

        model = load_whisper_model()
        if model is None:
            return {"success": False, "error": "Whisper model not available"}

        # fp16=False: the CPU-only Space has no usable half precision.
        result = model.transcribe(
            tmp_file_path,
            fp16=False,
            task="transcribe"
        )
        return {
            "success": True,
            "transcript": result["text"].strip(),
            "language": result["language"]
        }
    except Exception as e:
        return {"success": False, "error": str(e)}
    finally:
        # Always remove the temp file, on success or failure.
        if tmp_file_path is not None:
            try:
                os.unlink(tmp_file_path)
            except OSError:
                pass
        gc.collect()
def main():
    """Render the app: upload -> transcribe -> (optionally) translate -> download.

    Flow: the user uploads an audio file (rejected above 10 MB), presses
    Process, Whisper transcribes it, and unless "Transcribe only" is checked
    (or the audio is already English) M2M100 translates the transcript to
    English. Results are offered as a plain-text download.
    """
    # Header (styled by the .main-header CSS injected at module import).
    # NOTE(review): the "π΅"/"π…"/"β…" characters throughout look like
    # mojibake'd emoji from an encoding round-trip — confirm and restore.
    st.markdown("""
    <div class="main-header">
        <h1>π΅ Audio Transcriber & Translator</h1>
        <p>Upload audio files and get transcripts with English translation</p>
        <small>Optimized for Hugging Face Spaces</small>
    </div>
    """, unsafe_allow_html=True)
    # HF Spaces notice: set expectations about size/duration/latency limits.
    st.markdown("""
    <div class="warning-box">
        <strong>π Hugging Face Spaces Version</strong><br>
        β’ Using Whisper-tiny for faster processing<br>
        β’ File limit: 10MB, Duration: 5 minutes<br>
        β’ Processing may take 1-2 minutes
    </div>
    """, unsafe_allow_html=True)
    # Show system info in sidebar.
    with st.sidebar:
        st.header("π§ System Info")
        st.info("Running on Hugging Face Spaces")
        st.info(f"PyTorch: {torch.__version__}")
        st.warning("Using CPU (optimized for HF Spaces)")
        st.header("π Models")
        st.info("β’ Whisper: tiny (fast)")
        st.info("β’ Translation: M2M100-418M")
        with st.expander("π‘ Tips"):
            st.caption("β’ Use shorter audio files (< 5 min)")
            st.caption("β’ MP3/WAV work best")
            st.caption("β’ Clear speech gives better results")
            st.caption("β’ Processing takes 1-2 minutes")
    # File uploader with restrictions for HF Spaces.
    uploaded_file = st.file_uploader(
        "π΅ Choose an audio file",
        type=['mp3', 'wav', 'mp4', 'm4a'],
        help="Supported: MP3, WAV, MP4, M4A | Max: 10MB, 5 minutes"
    )
    if uploaded_file is not None:
        # File size check: reject >10 MB before any expensive work.
        file_size_mb = uploaded_file.size / (1024 * 1024)
        if file_size_mb > 10:
            st.error("β File too large! Please use files under 10MB for optimal performance on HF Spaces.")
            return
        st.success(f"π **{uploaded_file.name}** ({file_size_mb:.2f} MB)")
        # Processing options.
        col1, col2 = st.columns(2)
        with col1:
            transcribe_only = st.checkbox("Transcribe only (faster)", value=False)
        with col2:
            # NOTE(review): this clears st.cache_resource — only effective if
            # the model loaders are actually decorated with @st.cache_resource.
            if st.button("π§Ή Clear Cache", help="Clear models from memory"):
                st.cache_resource.clear()
                st.success("Cache cleared!")
        # Process button.
        if st.button("π Process Audio", type="primary", use_container_width=True):
            start_time = datetime.now()
            # Step 1: Transcription (Whisper also detects the spoken language).
            with st.spinner("π€ Transcribing audio... (this may take 1-2 minutes)"):
                transcription_result = transcribe_audio(uploaded_file)
            if transcription_result["success"]:
                transcript = transcription_result["transcript"]
                detected_language = transcription_result["language"]
                # Get a human-readable language name for display/downloads.
                translator = load_translator()
                language_name = translator.get_language_name(detected_language)
                # Display transcription results.
                st.markdown("""
                <div class="result-section">
                    <h3>π Transcription Results</h3>
                </div>
                """, unsafe_allow_html=True)
                # Language badge.
                st.markdown(f"""
                <div class="language-badge">
                    π Detected: {language_name} ({detected_language})
                </div>
                """, unsafe_allow_html=True)
                # Transcript.
                st.text_area(
                    "Original Transcript",
                    transcript,
                    height=150,
                    key="transcript"
                )
                # Step 2: Translation — skipped when the user opted out or the
                # audio is already English.
                if not transcribe_only and detected_language != 'en':
                    with st.spinner("π Translating to English..."):
                        translation_result = translator.translate_text(transcript, detected_language)
                    if translation_result["success"]:
                        translated_text = translation_result["translated_text"]
                        st.markdown("""
                        <div class="result-section">
                            <h3>π English Translation</h3>
                        </div>
                        """, unsafe_allow_html=True)
                        st.text_area(
                            "English Translation",
                            translated_text,
                            height=150,
                            key="translation"
                        )
                        # Download section.
                        st.markdown("""
                        <div class="download-section">
                            <h4>π₯ Download Results</h4>
                        </div>
                        """, unsafe_allow_html=True)
                        # Prepare download content (transcript + translation).
                        full_content = f"""Audio Transcription & Translation
{'='*60}
File: {uploaded_file.name}
Size: {file_size_mb:.2f} MB
Detected Language: {language_name} ({detected_language})
Processing Time: {(datetime.now() - start_time).total_seconds():.1f} seconds
Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
{'='*60}
ORIGINAL TRANSCRIPT ({language_name}):
{transcript}
ENGLISH TRANSLATION:
{translated_text}
{'='*60}
Processed with Whisper (tiny) + M2M100 on Hugging Face Spaces
"""
                        st.download_button(
                            "π Download Complete Results",
                            full_content,
                            file_name=f"{os.path.splitext(uploaded_file.name)[0]}_results.txt",
                            mime="text/plain",
                            use_container_width=True
                        )
                    else:
                        st.error(f"β Translation failed: {translation_result['error']}")
                        # Still offer the transcript download even though
                        # translation failed.
                        transcript_content = f"""Audio Transcription
{'='*50}
File: {uploaded_file.name}
Language: {language_name} ({detected_language})
Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
{'='*50}
{transcript}
"""
                        st.download_button(
                            "π Download Transcript",
                            transcript_content,
                            file_name=f"{os.path.splitext(uploaded_file.name)[0]}_transcript.txt",
                            mime="text/plain"
                        )
                elif transcribe_only or detected_language == 'en':
                    # Transcript-only download (no translation step ran).
                    transcript_content = f"""Audio Transcription
{'='*50}
File: {uploaded_file.name}
Language: {language_name} ({detected_language})
Processing Time: {(datetime.now() - start_time).total_seconds():.1f} seconds
Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
{'='*50}
{transcript}
"""
                    st.download_button(
                        "π Download Transcript",
                        transcript_content,
                        file_name=f"{os.path.splitext(uploaded_file.name)[0]}_transcript.txt",
                        mime="text/plain",
                        use_container_width=True
                    )
                # Show total processing time (transcription + translation).
                processing_time = (datetime.now() - start_time).total_seconds()
                st.success(f"β Processing completed in {processing_time:.1f} seconds")
            else:
                st.error(f"β Transcription failed: {transcription_result['error']}")
                st.info("π‘ Try with a different audio file or format")
    # Footer.
    st.markdown("---")
    st.markdown("""
    <div style="text-align: center; color: #666; padding: 1rem;">
        <p>π΅ Powered by OpenAI Whisper & Facebook M2M100</p>
        <p>Running on Hugging Face Spaces π€</p>
    </div>
    """, unsafe_allow_html=True)


# Standard script entry-point guard.
if __name__ == "__main__":
    main()