Spaces:

prachi1507
/

AudioTranscriberTranslator15

Runtime error

File size: 16,726 Bytes

3909dfe

import gc
import os
import re
import tempfile
import warnings
from datetime import datetime

import streamlit as st
import torch
import whisper

# Silence every Python warning for the lifetime of the app.
warnings.filterwarnings("ignore")

# Page-level settings; applied ahead of every other st.* call in this file.
_PAGE_SETTINGS = {
    "page_title": "Audio Transcriber & Translator",
    "page_icon": "🎵",
    "layout": "centered",
}
st.set_page_config(**_PAGE_SETTINGS)

# Stylesheet for the banner, result panels, download panel, language badge
# and notice box that the page markup refers to by class name.
_CUSTOM_CSS = """
<style>
    .main-header {
        text-align: center;
        padding: 2rem 0;
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
        color: white;
        border-radius: 10px;
        margin-bottom: 2rem;
    }
    .result-section {
        background: #f8f9fa;
        padding: 1.5rem;
        border-radius: 10px;
        margin: 1rem 0;
        border-left: 4px solid #667eea;
    }
    .download-section {
        background: #e8f5e8;
        padding: 1.5rem;
        border-radius: 10px;
        margin-top: 1.5rem;
        text-align: center;
    }
    .language-badge {
        background: #667eea;
        color: white;
        padding: 0.5rem 1rem;
        border-radius: 20px;
        font-weight: bold;
        display: inline-block;
        margin-bottom: 1rem;
    }
    .warning-box {
        background: #fff3cd;
        border: 1px solid #ffeaa7;
        padding: 1rem;
        border-radius: 8px;
        margin: 1rem 0;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

class M2M100Translator:
    """Translate text into English with the facebook/m2m100_418M model.

    The tokenizer and model are loaded lazily on the first translation that
    actually needs them. Text that exceeds the per-request token budget is
    split into sentence-aligned chunks which are translated one at a time and
    joined, so long transcripts are translated in full instead of being cut
    off at the tokenizer's truncation limit.
    """

    # Hard cap passed to the tokenizer; tokens beyond it are truncated.
    MAX_INPUT_TOKENS = 200
    # Target chunk size, kept below MAX_INPUT_TOKENS to leave headroom for
    # the special tokens the tokenizer adds around the text.
    CHUNK_TOKEN_BUDGET = 180
    # Sentence boundary: whitespace following . ! ? (also the Devanagari
    # danda and Arabic question mark), or the zero-width position right
    # after a full-width CJK terminator, which is normally not followed by
    # a space. Requiring whitespace keeps decimals such as "3.14" intact.
    _SENTENCE_BREAK = re.compile(
        r"(?<=[.!?\u0964\u061f])\s+|(?<=[\u3002\uff01\uff1f])"
    )

    def __init__(self):
        self.model_name = "facebook/m2m100_418M"
        self.tokenizer = None
        self.model = None

        # Language codes accepted as translation sources, with display names.
        self.supported_languages = {
            'af': 'Afrikaans', 'ar': 'Arabic', 'bg': 'Bulgarian', 'bn': 'Bengali',
            'ca': 'Catalan', 'cs': 'Czech', 'da': 'Danish', 'de': 'German',
            'el': 'Greek', 'en': 'English', 'es': 'Spanish', 'et': 'Estonian',
            'fa': 'Persian', 'fi': 'Finnish', 'fr': 'French', 'gu': 'Gujarati',
            'he': 'Hebrew', 'hi': 'Hindi', 'hr': 'Croatian', 'hu': 'Hungarian',
            'id': 'Indonesian', 'it': 'Italian', 'ja': 'Japanese', 'ka': 'Georgian',
            'kk': 'Kazakh', 'km': 'Khmer', 'kn': 'Kannada', 'ko': 'Korean',
            'lt': 'Lithuanian', 'lv': 'Latvian', 'mk': 'Macedonian', 'ml': 'Malayalam',
            'mn': 'Mongolian', 'mr': 'Marathi', 'ms': 'Malay', 'my': 'Myanmar',
            'ne': 'Nepali', 'nl': 'Dutch', 'no': 'Norwegian', 'pl': 'Polish',
            'pt': 'Portuguese', 'ro': 'Romanian', 'ru': 'Russian', 'si': 'Sinhala',
            'sk': 'Slovak', 'sl': 'Slovenian', 'sq': 'Albanian', 'sr': 'Serbian',
            'sv': 'Swedish', 'sw': 'Swahili', 'ta': 'Tamil', 'te': 'Telugu',
            'th': 'Thai', 'tl': 'Tagalog', 'tr': 'Turkish', 'uk': 'Ukrainian',
            'ur': 'Urdu', 'vi': 'Vietnamese', 'zh': 'Chinese'
        }

    def load_model(self):
        """Load the tokenizer and model on first use.

        Returns:
            True when the model is ready, False when loading failed (the
            failure is reported through the Streamlit UI).
        """
        if self.model is not None:
            return True

        try:
            from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

            with st.spinner("🔄 Loading M2M100 translation model..."):
                tokenizer = M2M100Tokenizer.from_pretrained(self.model_name)
                model = M2M100ForConditionalGeneration.from_pretrained(
                    self.model_name,
                    torch_dtype=torch.float32  # Use float32 for CPU compatibility
                )

            # Assign both together so a failure part-way through never leaves
            # a tokenizer without a model.
            self.tokenizer, self.model = tokenizer, model
            st.success("✅ Translation model loaded successfully!")

        except Exception as e:
            st.error(f"❌ Failed to load translation model: {str(e)}")
            st.info("💡 Translation will be skipped. You can still get transcripts.")
            return False
        return True

    def get_language_name(self, lang_code):
        """Return the display name for lang_code, or the upper-cased code if unknown."""
        return self.supported_languages.get(lang_code, lang_code.upper())

    @classmethod
    def _split_text(cls, text, limit, measure=len):
        """Split text into chunks whose measured size stays within limit.

        Args:
            text: Text to split; surrounding whitespace is ignored.
            limit: Largest size a chunk should have, in units of measure.
            measure: Callable returning the size of a string (characters by
                default; translate_text passes a token counter).

        Returns:
            A list of non-empty chunks. Text that already fits is returned
            as a single chunk. Otherwise sentences are packed greedily,
            joined by single spaces; a sentence that is itself too large is
            packed word by word. A single word larger than limit is emitted
            as its own (oversized) chunk.
        """
        text = text.strip()
        if not text:
            return []
        if measure(text) <= limit:
            return [text]

        units = []
        for sentence in cls._SENTENCE_BREAK.split(text):
            sentence = sentence.strip()
            if not sentence:
                continue
            if measure(sentence) <= limit:
                units.append(sentence)
            else:
                # No usable sentence boundary: fall back to word granularity.
                units.extend(sentence.split())

        chunks = []
        current = ""
        for unit in units:
            candidate = f"{current} {unit}" if current else unit
            if current and measure(candidate) > limit:
                chunks.append(current)
                current = unit
            else:
                current = candidate
        if current:
            chunks.append(current)
        return chunks

    def _count_tokens(self, text):
        """Return how many tokens the loaded tokenizer produces for text."""
        return len(self.tokenizer.tokenize(text))

    def _translate_chunk(self, chunk):
        """Translate one chunk into English using the loaded model."""
        inputs = self.tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,  # safety net for a chunk that is still oversized
            max_length=self.MAX_INPUT_TOKENS
        )

        with torch.no_grad():
            generated_tokens = self.model.generate(
                **inputs,
                forced_bos_token_id=self.tokenizer.get_lang_id("en"),
                max_length=250,
                num_beams=2,  # Reduced beams for speed
                early_stopping=True,
                do_sample=False
            )

        translated = self.tokenizer.batch_decode(
            generated_tokens,
            skip_special_tokens=True
        )[0]

        # Drop the tensors before the next chunk is processed.
        del inputs, generated_tokens
        return translated.strip()

    def translate_text(self, text, source_language):
        """Translate text from source_language into English.

        Args:
            text: Text to translate.
            source_language: Language code of text (a key of
                supported_languages).

        Returns:
            A dict with a boolean "success". On success it carries
            "original_text", "translated_text" and "source_language", plus
            "note" when the source is already English or "model_used" when
            the model produced the translation. On failure it carries
            "error" and, except for empty input, "original_text" and
            "source_language".
        """
        if not text or not text.strip():
            return {"success": False, "error": "Empty text provided"}

        # If already English, return as is
        if source_language == 'en':
            return {
                "success": True,
                "original_text": text,
                "translated_text": text,
                "source_language": source_language,
                "note": "Source is already English"
            }

        # Check if source language is supported
        if source_language not in self.supported_languages:
            return {
                "success": False,
                "error": f"Language '{source_language}' not supported",
                "original_text": text,
                "source_language": source_language
            }

        if not self.load_model():
            return {
                "success": False,
                "error": "Translation model not available",
                "original_text": text,
                "source_language": source_language
            }

        try:
            # Set source language
            self.tokenizer.src_lang = source_language

            # Translate chunk by chunk so nothing beyond the tokenizer's
            # truncation limit is lost and peak memory stays small.
            chunks = self._split_text(
                text,
                self.CHUNK_TOKEN_BUDGET,
                measure=self._count_tokens
            )
            translated_parts = [self._translate_chunk(chunk) for chunk in chunks]

            # Clear memory
            gc.collect()

            return {
                "success": True,
                "original_text": text,
                "translated_text": " ".join(
                    part for part in translated_parts if part
                ).strip(),
                "source_language": source_language,
                "model_used": self.model_name
            }

        except Exception as e:
            return {
                "success": False,
                "error": str(e),
                "original_text": text,
                "source_language": source_language
            }

@st.cache_resource
def _load_whisper_model_cached():
    """Load the Whisper "tiny" model; the loaded model is cached by Streamlit.

    Load errors propagate to the caller, so a failed attempt does not leave
    a cached result behind and the next call tries again.
    """
    # Use tiny model for faster loading and processing on HF Spaces
    return whisper.load_model("tiny")


def load_whisper_model():
    """Return the cached Whisper model, or None if it cannot be loaded.

    The error handling lives outside the cached function on purpose: were
    the None fallback returned from inside it, that None would be stored as
    the cached value and served on every later call.

    Returns:
        The loaded Whisper model, or None after reporting the failure
        through the Streamlit UI.
    """
    try:
        return _load_whisper_model_cached()
    except Exception as e:
        st.error(f"Failed to load Whisper model: {e}")
        return None


# Keep the per-function cache handle that the decorated function exposed.
load_whisper_model.clear = _load_whisper_model_cached.clear

@st.cache_resource
def load_translator():
    """Build the M2M100Translator; Streamlit caches the returned instance."""
    translator = M2M100Translator()
    return translator

def transcribe_audio(audio_file):
    """Transcribe an uploaded audio file with the Whisper model.

    The upload is copied to a temporary file (Whisper is given a path), and
    that file is removed again on every exit path.

    Args:
        audio_file: File-like object with a read() method; a "name"
            attribute, when present, supplies the temp file's extension.

    Returns:
        {"success": True, "transcript": str, "language": str} on success,
        otherwise {"success": False, "error": str}.
    """
    tmp_file_path = None
    try:
        # Resolve the model first so nothing is written to disk when it is
        # unavailable.
        model = load_whisper_model()
        if model is None:
            return {"success": False, "error": "Whisper model not available"}

        # The same upload object may already have been read on an earlier
        # run; rewind so the full content is copied.
        if getattr(audio_file, "seekable", lambda: False)():
            audio_file.seek(0)

        # Keep the real extension when known instead of labelling all
        # uploads as WAV.
        upload_name = getattr(audio_file, "name", "") or ""
        suffix = os.path.splitext(upload_name)[1] or ".wav"

        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            tmp_file_path = tmp_file.name
            tmp_file.write(audio_file.read())

        # Transcribe with optimized settings for HF Spaces
        result = model.transcribe(
            tmp_file_path,
            fp16=False,  # Use fp32 for better compatibility
            task="transcribe"
        )

        return {
            "success": True,
            "transcript": result["text"].strip(),
            "language": result["language"]
        }

    except Exception as e:
        return {"success": False, "error": str(e)}

    finally:
        if tmp_file_path is not None:
            try:
                os.unlink(tmp_file_path)
            except OSError:
                # Best effort: a leftover temp file must not mask the result.
                pass
        gc.collect()

def _format_report(title, header_lines, body_lines, rule_width):
    """Assemble a plain-text report for download.

    Layout: title, a rule of "=" characters, the header lines, the rule
    again, a blank line, then the body lines; the text ends with a newline.
    """
    rule = "=" * rule_width
    return "\n".join([title, rule, *header_lines, rule, "", *body_lines]) + "\n"


def _render_intro():
    """Draw the page banner and the Hugging Face Spaces limits notice."""
    st.markdown("""
    <div class="main-header">
        <h1>🎵 Audio Transcriber & Translator</h1>
        <p>Upload audio files and get transcripts with English translation</p>
        <small>Optimized for Hugging Face Spaces</small>
    </div>
    """, unsafe_allow_html=True)

    st.markdown("""
    <div class="warning-box">
        <strong>🚀 Hugging Face Spaces Version</strong><br>
        • Using Whisper-tiny for faster processing<br>
        • File limit: 10MB, Duration: 5 minutes<br>
        • Processing may take 1-2 minutes
    </div>
    """, unsafe_allow_html=True)


def _render_sidebar():
    """Draw the sidebar with system info, model names and usage tips."""
    with st.sidebar:
        st.header("🔧 System Info")
        st.info("Running on Hugging Face Spaces")
        st.info(f"PyTorch: {torch.__version__}")
        st.warning("Using CPU (optimized for HF Spaces)")

        st.header("🌍 Models")
        st.info("• Whisper: tiny (fast)")
        st.info("• Translation: M2M100-418M")

        with st.expander("💡 Tips"):
            st.caption("• Use shorter audio files (< 5 min)")
            st.caption("• MP3/WAV work best")
            st.caption("• Clear speech gives better results")
            st.caption("• Processing takes 1-2 minutes")


def _render_footer():
    """Draw the divider and the attribution footer."""
    st.markdown("---")
    st.markdown("""
    <div style="text-align: center; color: #666; padding: 1rem;">
        <p>🎵 Powered by OpenAI Whisper & Facebook M2M100</p>
        <p>Running on Hugging Face Spaces 🤗</p>
    </div>
    """, unsafe_allow_html=True)


def _handle_upload(uploaded_file):
    """Validate one upload, then transcribe and optionally translate it.

    Returns early (rendering nothing further) when the file exceeds 10 MB,
    when the process button has not been pressed on this run, or when
    transcription fails.

    NOTE(review): everything below the process button is rendered only on
    the run in which that button was clicked. Pressing a download button
    presumably starts a new run in which the process button reads False, so
    the results disappear; persisting them in st.session_state would avoid
    that. Confirm against the deployed Streamlit version.
    """
    file_size_mb = uploaded_file.size / (1024 * 1024)

    if file_size_mb > 10:
        st.error("❌ File too large! Please use files under 10MB for optimal performance on HF Spaces.")
        return

    st.success(f"📁 **{uploaded_file.name}** ({file_size_mb:.2f} MB)")

    # Processing options
    col1, col2 = st.columns(2)
    with col1:
        transcribe_only = st.checkbox("Transcribe only (faster)", value=False)
    with col2:
        if st.button("🧹 Clear Cache", help="Clear models from memory"):
            st.cache_resource.clear()
            st.success("Cache cleared!")

    if not st.button("🚀 Process Audio", type="primary", use_container_width=True):
        return

    start_time = datetime.now()

    def elapsed_seconds():
        return (datetime.now() - start_time).total_seconds()

    def timestamp():
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Step 1: Transcription
    with st.spinner("🎤 Transcribing audio... (this may take 1-2 minutes)"):
        transcription_result = transcribe_audio(uploaded_file)

    if not transcription_result["success"]:
        st.error(f"❌ Transcription failed: {transcription_result['error']}")
        st.info("💡 Try with a different audio file or format")
        return

    transcript = transcription_result["transcript"]
    detected_language = transcription_result["language"]

    translator = load_translator()
    language_name = translator.get_language_name(detected_language)
    language_label = f"{language_name} ({detected_language})"
    base_name = os.path.splitext(uploaded_file.name)[0]

    # The whitespace inside the markup literals below is part of the string
    # handed to st.markdown and is kept as is.
    st.markdown("""
                <div class="result-section">
                    <h3>📝 Transcription Results</h3>
                </div>
                """, unsafe_allow_html=True)

    st.markdown(f"""
                <div class="language-badge">
                    🌍 Detected: {language_name} ({detected_language})
                </div>
                """, unsafe_allow_html=True)

    st.text_area(
        "Original Transcript",
        transcript,
        height=150,
        key="transcript"
    )

    if transcribe_only or detected_language == 'en':
        # Transcript only
        transcript_content = _format_report(
            "Audio Transcription",
            [
                f"File: {uploaded_file.name}",
                f"Language: {language_label}",
                f"Processing Time: {elapsed_seconds():.1f} seconds",
                f"Date: {timestamp()}",
            ],
            [transcript],
            50,
        )
        st.download_button(
            "📄 Download Transcript",
            transcript_content,
            file_name=f"{base_name}_transcript.txt",
            mime="text/plain",
            use_container_width=True
        )
    else:
        # Step 2: Translation
        with st.spinner("🌍 Translating to English..."):
            translation_result = translator.translate_text(transcript, detected_language)

        if translation_result["success"]:
            translated_text = translation_result["translated_text"]

            st.markdown("""
                        <div class="result-section">
                            <h3>🌍 English Translation</h3>
                        </div>
                        """, unsafe_allow_html=True)

            st.text_area(
                "English Translation",
                translated_text,
                height=150,
                key="translation"
            )

            st.markdown("""
                        <div class="download-section">
                            <h4>📥 Download Results</h4>
                        </div>
                        """, unsafe_allow_html=True)

            full_content = _format_report(
                "Audio Transcription & Translation",
                [
                    f"File: {uploaded_file.name}",
                    f"Size: {file_size_mb:.2f} MB",
                    f"Detected Language: {language_label}",
                    f"Processing Time: {elapsed_seconds():.1f} seconds",
                    f"Date: {timestamp()}",
                ],
                [
                    f"ORIGINAL TRANSCRIPT ({language_name}):",
                    transcript,
                    "",
                    "ENGLISH TRANSLATION:",
                    translated_text,
                    "",
                    "=" * 60,
                    "Processed with Whisper (tiny) + M2M100 on Hugging Face Spaces",
                ],
                60,
            )

            st.download_button(
                "📄 Download Complete Results",
                full_content,
                file_name=f"{base_name}_results.txt",
                mime="text/plain",
                use_container_width=True
            )
        else:
            st.error(f"❌ Translation failed: {translation_result['error']}")
            # Still offer transcript download
            transcript_content = _format_report(
                "Audio Transcription",
                [
                    f"File: {uploaded_file.name}",
                    f"Language: {language_label}",
                    f"Date: {timestamp()}",
                ],
                [transcript],
                50,
            )
            st.download_button(
                "📄 Download Transcript",
                transcript_content,
                file_name=f"{base_name}_transcript.txt",
                mime="text/plain"
            )

    # Show processing time
    st.success(f"✅ Processing completed in {elapsed_seconds():.1f} seconds")


def main():
    """Render the app: banner, sidebar, uploader, processing results, footer.

    The footer is drawn on every run, including when an upload is rejected
    for being too large.
    """
    _render_intro()
    _render_sidebar()

    # File uploader with restrictions for HF Spaces
    uploaded_file = st.file_uploader(
        "🎵 Choose an audio file",
        type=['mp3', 'wav', 'mp4', 'm4a'],
        help="Supported: MP3, WAV, MP4, M4A | Max: 10MB, 5 minutes"
    )

    if uploaded_file is not None:
        _handle_upload(uploaded_file)

    _render_footer()


if __name__ == "__main__":
    main()