Spaces:

xTHExBEASTx
/

Whisper-Transcriber

Sleeping

App Files Files Community

Whisper Transcriber Bot committed on Dec 29, 2025

Commit

7f464b5

1 Parent(s): 14efc79

Simplify to minimal clean interface - default HF style

Browse files

Files changed (1) hide show

app.py +44 -166

app.py CHANGED Viewed

@@ -1,8 +1,7 @@
 import gradio as gr
 import os
 import tempfile
-from pathlib import Path
-from typing import Optional, Tuple, List
 import logging
 from utils.audio_processor import AudioProcessor
@@ -32,20 +31,7 @@ class WhisperTranscriberApp:
         enable_diarization: bool,
         progress=gr.Progress()
     ) -> Tuple[str, str, str, str, str]:
-        """
-        Main processing function for transcription
-        Args:
-            file_input: Uploaded file
-            url_input: URL input (YouTube or direct link)
-            model_size: Whisper model size
-            language: Language code
-            enable_diarization: Whether to enable speaker diarization
-            progress: Gradio progress tracker
-        Returns:
-            Tuple of (preview_text, srt_file, vtt_file, txt_file, json_file)
-        """
         temp_files = []
         try:
@@ -53,30 +39,18 @@ class WhisperTranscriberApp:
             progress(0.05, desc="Processing input...")
             if url_input and url_input.strip():
-                # Download from URL
                 audio_file, source_type = MediaDownloader.download_media(
                     url_input,
                     progress_callback=lambda msg: progress(0.1, desc=msg)
                 )
                 temp_files.append(audio_file)
-                logger.info(f"Downloaded from {source_type}: {audio_file}")
             elif file_input is not None:
-                # Use uploaded file
                 audio_file = file_input.name
-                logger.info(f"Using uploaded file: {audio_file}")
             else:
                 raise ValueError("Please provide either a file or a URL")
-            # Step 2: Check file format and extract audio if needed
             progress(0.15, desc="Extracting audio...")
-            if not AudioProcessor.is_supported_file(audio_file):
-                raise ValueError(
-                    f"Unsupported file format. Supported: "
-                    f"{AudioProcessor.SUPPORTED_FORMATS['audio'] + AudioProcessor.SUPPORTED_FORMATS['video']}"
-                )
-            # Extract/convert audio to WAV for processing
             processed_audio = AudioProcessor.extract_audio(
                 audio_file,
                 output_format='wav',
@@ -84,12 +58,9 @@ class WhisperTranscriberApp:
             )
             temp_files.append(processed_audio)
-            # Get file info
             duration = AudioProcessor.get_audio_duration(processed_audio)
-            file_size = AudioProcessor.get_file_size_mb(processed_audio)
-            logger.info(f"Audio duration: {duration:.2f}s, Size: {file_size:.2f}MB")
-            # Step 3: Load Whisper model if needed
             if self.transcriber is None or self.current_model != model_size:
                 progress(0.25, desc=f"Loading Whisper {model_size} model...")
                 self.transcriber = WhisperTranscriber(model_size=model_size)
@@ -98,30 +69,25 @@ class WhisperTranscriberApp:
                 )
                 self.current_model = model_size
-            # Step 4: Chunk audio if necessary
-            progress(0.35, desc="Preparing audio for transcription...")
             chunks = AudioProcessor.chunk_audio(
                 processed_audio,
                 progress_callback=lambda msg: progress(0.4, desc=msg)
             )
-            # Add chunk files to cleanup list
             for chunk_file, _ in chunks:
                 if chunk_file != processed_audio:
                     temp_files.append(chunk_file)
             # Step 5: Transcribe
             progress(0.45, desc="Transcribing audio...")
             if len(chunks) == 1:
-                # Single chunk transcription
                 transcription_result = self.transcriber.transcribe(
                     chunks[0][0],
                     language=language,
                     progress_callback=lambda msg: progress(0.65, desc=msg)
                 )
             else:
-                # Multi-chunk transcription
                 transcription_result = self.transcriber.transcribe_chunks(
                     chunks,
                     language=language,
@@ -130,24 +96,20 @@ class WhisperTranscriberApp:
             progress(0.70, desc="Transcription complete!")
-            # Step 6: Speaker diarization (optional)
             speaker_labels = None
             if enable_diarization:
                 progress(0.75, desc="Performing speaker diarization...")
                 if not SpeakerDiarizer.is_available():
-                    logger.warning("HF_TOKEN not set, skipping diarization")
                     progress(0.75, desc="Skipping diarization (HF_TOKEN not set)")
                 else:
                     try:
                         if self.diarizer is None:
                             self.diarizer = SpeakerDiarizer()
                         diarization_result = self.diarizer.diarize(
                             processed_audio,
                             progress_callback=lambda msg: progress(0.85, desc=msg)
                         )
                         speaker_labels = self.diarizer.align_with_transcription(
                             diarization_result,
                             transcription_result,
@@ -155,11 +117,9 @@ class WhisperTranscriberApp:
                         )
                     except Exception as e:
                         logger.error(f"Diarization failed: {e}")
-                        progress(0.9, desc=f"Diarization failed: {str(e)[:50]}")
-            # Step 7: Generate output files
             progress(0.92, desc="Generating output files...")
             output_prefix = tempfile.mktemp(prefix="whisper_output_")
             outputs = SubtitleFormatter.generate_all_formats(
                 transcription_result,
@@ -167,22 +127,16 @@ class WhisperTranscriberApp:
                 speaker_labels
             )
-            # Step 8: Prepare preview
-            preview_text = f"""
-**Transcription Complete!**
 **Language:** {transcription_result['language']}
 **Duration:** {duration:.2f} seconds
 **Model Used:** {model_size}
-**Diarization:** {'Enabled' if speaker_labels else 'Disabled'}
-**Preview (first 500 characters):**
-{transcription_result['text'][:500]}...
-"""
             progress(1.0, desc="Done!")
-            # Cleanup temporary files
             AudioProcessor.cleanup_temp_files(*temp_files)
             return (
@@ -195,123 +149,47 @@ class WhisperTranscriberApp:
         except Exception as e:
             logger.error(f"Processing failed: {e}")
-            # Cleanup on error
             AudioProcessor.cleanup_temp_files(*temp_files)
             raise gr.Error(f"Processing failed: {str(e)}")
-def create_interface():
-    """Create and configure Gradio interface"""
-    app = WhisperTranscriberApp()
-    # Get available options
-    model_choices = WhisperTranscriber.get_available_models()
-    language_choices = WhisperTranscriber.get_language_list()
-    with gr.Blocks(theme=gr.themes.Soft(), title="Whisper Transcriber") as demo:
-        gr.Markdown(
-            """
-            # 🎤 Whisper Transcriber
-            Generate accurate subtitles and transcripts from audio/video files using OpenAI Whisper.
-            """
-        )
-        with gr.Tab("Transcribe"):
-            with gr.Row():
-                with gr.Column():
-                    file_input = gr.File(
-                        label="📁 Upload Audio/Video File",
-                        file_types=['audio', 'video']
-                    )
-                    url_input = gr.Textbox(
-                        label="🔗 Or Paste URL (YouTube or direct link)",
-                        placeholder="https://www.youtube.com/watch?v=... or https://example.com/audio.mp3"
-                    )
-                    model_size = gr.Dropdown(
-                        choices=model_choices,
-                        value='small',
-                        label="🎯 Model Size"
-                    )
-                    language = gr.Dropdown(
-                        choices=[(f"{v} ({k})", k) for k, v in language_choices.items()],
-                        value='auto',
-                        label="🌍 Language"
-                    )
-                    enable_diarization = gr.Checkbox(
-                        label="👥 Enable Speaker Diarization",
-                        value=False
-                    )
-                    process_btn = gr.Button("🚀 Generate Transcription", variant="primary")
-                with gr.Column():
-                    preview_output = gr.Markdown(label="📄 Preview")
-                    srt_output = gr.File(label="SRT File")
-                    vtt_output = gr.File(label="VTT File")
-                    txt_output = gr.File(label="TXT File")
-                    json_output = gr.File(label="JSON File")
-        with gr.Tab("Help"):
-            gr.Markdown(
-                """
-                ## 📚 How to Use
-                1. **Upload a file** or **paste a URL** (YouTube or direct media link)
-                2. **Select model size**: Tiny (fast), Small (balanced), Medium (accurate)
-                3. **Choose language**: Auto-detect or select manually
-                4. **Enable diarization** (optional): Identifies different speakers
-                5. Click **Generate Transcription**
-                6. **Download** your preferred format(s)
-                ## 📋 Supported Formats
-                **Audio:** MP3, WAV, M4A, FLAC, AAC, OGG, WMA
-                **Video:** MP4, AVI, MKV, MOV, WMV, WebM, FLV
-                ## ⚙️ Features
-                - ✅ Auto language detection (99+ languages)
-                - ✅ Multiple output formats (SRT, VTT, TXT, JSON)
-                - ✅ Word-level timestamps in JSON
-                - ✅ Large file chunking (30-min segments)
-                - ✅ Optional speaker identification
-                - ✅ Public API endpoint
-                ## 💡 Tips
-                - Use **Small model** for most cases
-                - **Diarization** requires HF_TOKEN (Space settings)
-                - Large files are automatically chunked
-                - Processing time varies by model and file length
-                """
             )
-        # Wire up the button
-        process_btn.click(
-            fn=app.process_media,
-            inputs=[
-                file_input,
-                url_input,
-                model_size,
-                language,
-                enable_diarization
-            ],
-            outputs=[
-                preview_output,
-                srt_output,
-                vtt_output,
-                txt_output,
-                json_output
-            ]
-        )
-    return demo
 if __name__ == "__main__":
-    demo = create_interface()
-    demo.queue()  # Enable queuing for better handling of concurrent requests
     demo.launch()

 import gradio as gr
 import os
 import tempfile
+from typing import Optional, Tuple
 import logging
 from utils.audio_processor import AudioProcessor
         enable_diarization: bool,
         progress=gr.Progress()
     ) -> Tuple[str, str, str, str, str]:
+        """Main processing function for transcription"""
         temp_files = []
         try:
             progress(0.05, desc="Processing input...")
             if url_input and url_input.strip():
                 audio_file, source_type = MediaDownloader.download_media(
                     url_input,
                     progress_callback=lambda msg: progress(0.1, desc=msg)
                 )
                 temp_files.append(audio_file)
             elif file_input is not None:
                 audio_file = file_input.name
             else:
                 raise ValueError("Please provide either a file or a URL")
+            # Step 2: Extract audio
             progress(0.15, desc="Extracting audio...")
             processed_audio = AudioProcessor.extract_audio(
                 audio_file,
                 output_format='wav',
             )
             temp_files.append(processed_audio)
             duration = AudioProcessor.get_audio_duration(processed_audio)
+            # Step 3: Load model
             if self.transcriber is None or self.current_model != model_size:
                 progress(0.25, desc=f"Loading Whisper {model_size} model...")
                 self.transcriber = WhisperTranscriber(model_size=model_size)
                 )
                 self.current_model = model_size
+            # Step 4: Chunk audio
+            progress(0.35, desc="Preparing audio...")
             chunks = AudioProcessor.chunk_audio(
                 processed_audio,
                 progress_callback=lambda msg: progress(0.4, desc=msg)
             )
             for chunk_file, _ in chunks:
                 if chunk_file != processed_audio:
                     temp_files.append(chunk_file)
             # Step 5: Transcribe
             progress(0.45, desc="Transcribing audio...")
             if len(chunks) == 1:
                 transcription_result = self.transcriber.transcribe(
                     chunks[0][0],
                     language=language,
                     progress_callback=lambda msg: progress(0.65, desc=msg)
                 )
             else:
                 transcription_result = self.transcriber.transcribe_chunks(
                     chunks,
                     language=language,
             progress(0.70, desc="Transcription complete!")
+            # Step 6: Diarization (optional)
             speaker_labels = None
             if enable_diarization:
                 progress(0.75, desc="Performing speaker diarization...")
                 if not SpeakerDiarizer.is_available():
                     progress(0.75, desc="Skipping diarization (HF_TOKEN not set)")
                 else:
                     try:
                         if self.diarizer is None:
                             self.diarizer = SpeakerDiarizer()
                         diarization_result = self.diarizer.diarize(
                             processed_audio,
                             progress_callback=lambda msg: progress(0.85, desc=msg)
                         )
                         speaker_labels = self.diarizer.align_with_transcription(
                             diarization_result,
                             transcription_result,
                         )
                     except Exception as e:
                         logger.error(f"Diarization failed: {e}")
+            # Step 7: Generate outputs
             progress(0.92, desc="Generating output files...")
             output_prefix = tempfile.mktemp(prefix="whisper_output_")
             outputs = SubtitleFormatter.generate_all_formats(
                 transcription_result,
                 speaker_labels
             )
+            preview_text = f"""**Transcription Complete!**
 **Language:** {transcription_result['language']}
 **Duration:** {duration:.2f} seconds
 **Model Used:** {model_size}
+**Preview:**
+{transcription_result['text'][:500]}..."""
             progress(1.0, desc="Done!")
             AudioProcessor.cleanup_temp_files(*temp_files)
             return (
         except Exception as e:
             logger.error(f"Processing failed: {e}")
             AudioProcessor.cleanup_temp_files(*temp_files)
             raise gr.Error(f"Processing failed: {str(e)}")
+# Create app instance
+app = WhisperTranscriberApp()
+# Get available options
+model_choices = WhisperTranscriber.get_available_models()
+language_choices = WhisperTranscriber.get_language_list()
+# Create interface
+with gr.Blocks(title="Whisper Transcriber") as demo:
+    gr.Markdown("# 🎤 Whisper Transcriber\nGenerate subtitles from audio/video using OpenAI Whisper")
+    with gr.Row():
+        with gr.Column():
+            file_input = gr.File(label="Upload Audio/Video File")
+            url_input = gr.Textbox(label="Or Paste URL", placeholder="YouTube or direct link")
+            model_size = gr.Dropdown(choices=model_choices, value='small', label="Model Size")
+            language = gr.Dropdown(
+                choices=[(f"{v} ({k})", k) for k, v in language_choices.items()],
+                value='auto',
+                label="Language"
             )
+            enable_diarization = gr.Checkbox(label="Enable Speaker Diarization", value=False)
+            btn = gr.Button("Generate Transcription", variant="primary")
+        with gr.Column():
+            preview = gr.Markdown(label="Preview")
+            srt_file = gr.File(label="SRT File")
+            vtt_file = gr.File(label="VTT File")
+            txt_file = gr.File(label="TXT File")
+            json_file = gr.File(label="JSON File")
+    btn.click(
+        fn=app.process_media,
+        inputs=[file_input, url_input, model_size, language, enable_diarization],
+        outputs=[preview, srt_file, vtt_file, txt_file, json_file]
+    )
 if __name__ == "__main__":
+    demo.queue()
     demo.launch()