Spaces:

NoLev
/

mp3transcriber

Sleeping

App Files Files Community

NoLev committed on Oct 11, 2025

Commit

7d6b2b0

verified ·

1 Parent(s): ab63b06

Create app.py

Browse files

Files changed (1) hide show

app.py +144 -0

app.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import gradio as gr
+from transformers import pipeline
+import torch
+# Global cache for pipelines to avoid reloading models.
+# Keys are model ids (strings from MODEL_OPTIONS); values are the pipeline
+# objects created by get_pipeline(). Entries are never evicted.
+pipelines = {}
+# List of available Whisper models (from smallest/fastest to largest/most accurate).
+# Order matters: the UI uses MODEL_OPTIONS[1] as the default dropdown selection.
+MODEL_OPTIONS = [
+    "openai/whisper-tiny",      # ~39M params, fastest but least accurate
+    "openai/whisper-base",      # ~74M params, good balance
+    "openai/whisper-small",     # ~244M params, better accuracy
+    "openai/whisper-medium",    # ~769M params, high accuracy
+    "openai/whisper-large",     # ~1550M params, very high accuracy
+    "openai/whisper-large-v3",  # ~1550M params, latest with improvements
+]
+# Function to get or load a pipeline for a given model
+def get_pipeline(model_id):
+    if model_id not in pipelines:
+        print(f"Loading model: {model_id}...")  # Log for debugging in Spaces
+        pipelines[model_id] = pipeline(
+            "automatic-speech-recognition",
+            model=model_id,
+            device="cuda" if torch.cuda.is_available() else "cpu"  # Use GPU if available
+        )
+    return pipelines[model_id]
+# Transcription function with chunking for long audio
+def transcribe_speech(audio_file, model_id, language="english", return_timestamps=False):
+    if audio_file is None:
+        return "Please upload an audio file."
+    pipe = get_pipeline(model_id)
+    # Generate kwargs for transcription
+    generate_kwargs = {"task": "transcribe", "language": language}
+    if return_timestamps:
+        generate_kwargs["return_timestamps"] = True
+    # Transcribe with chunking for long files
+    output = pipe(
+        audio_file,
+        max_new_tokens=128,  # Per chunk for stability
+        generate_kwargs=generate_kwargs,
+        chunk_length_s=30,
+        stride_length_s=5,  # Overlap for smooth transitions
+        batch_size=8 if "tiny" not in model_id and "base" not in model_id else 16,  # Adjust batch for smaller models
+        return_timestamps=return_timestamps,
+    )
+    if return_timestamps:
+        # Format with timestamps if requested
+        if "chunks" in output:
+            formatted = []
+            for chunk in output["chunks"]:
+                start = f"{chunk['timestamp'][0]:.2f}s" if chunk['timestamp'][0] is not None else "0.00s"
+                end = f"{chunk['timestamp'][1]:.2f}s" if chunk['timestamp'][1] is not None else "?.?s"
+                formatted.append(f"[{start} - {end}] {chunk['text']}")
+            return "\n".join(formatted)
+        else:
+            return output["text"]  # Fallback
+    else:
+        return output["text"]
+# Create the Gradio app with a colorful, responsive theme
+theme = gr.themes.Soft(
+    primary_hue="blue",
+    secondary_hue="purple",
+    neutral_hue="slate",
+    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui"]
+)
+with gr.Blocks(theme=theme, title="MP3 to Text Transcriber") as demo:
+    gr.Markdown(
+        """
+        # 🎤 MP3 to Text Transcription Tool
+        Upload an MP3 (or any audio file) and transcribe it to text using OpenAI's Whisper models.
+        Supports long files up to hours—handles 45+ minutes effortlessly!
+        Choose a model for speed vs. accuracy trade-off.
+        """,
+        elem_classes=["centered"]
+    )
+    with gr.Row(variant="panel", elem_classes=["max-w-4xl mx-auto"]):
+        with gr.Column(scale=1):
+            audio_input = gr.Audio(
+                sources="upload",
+                type="filepath",
+                label="📁 Upload Audio File (MP3/WAV/etc.)",
+                elem_classes=["w-full"]
+            )
+            model_dropdown = gr.Dropdown(
+                choices=MODEL_OPTIONS,
+                value=MODEL_OPTIONS[1],  # Default to base
+                label="🤖 Select Whisper Model",
+                info="Tiny: Fastest | Large-v3: Most accurate (slower on CPU)",
+                elem_classes=["w-full"]
+            )
+            language_dropdown = gr.Dropdown(
+                choices=["english", "french", "german", "spanish", "italian", "portuguese", "dutch", "russian", "swedish", "chinese", "japanese", "korean", "arabic", "hindi"],  # Common languages
+                value="english",
+                label="🌍 Language (for better accuracy)",
+                elem_classes=["w-full"]
+            )
+            timestamps_checkbox = gr.Checkbox(
+                label="⏰ Include Timestamps?",
+                value=False,
+                info="Adds [start - end] tags to the transcript."
+            )
+            transcribe_btn = gr.Button("🚀 Transcribe Audio", variant="primary", size="lg", elem_classes=["w-full"])
+        with gr.Column(scale=1):
+            status_output = gr.Markdown("Ready to transcribe! 💬", elem_classes=["text-center"])
+    transcript_output = gr.Textbox(
+        label="📝 Transcript",
+        lines=15,
+        max_lines=20,
+        placeholder="Your transcription will appear here...",
+        elem_classes=["w-full", "bg-gray-50 dark:bg-gray-800"],
+        show_copy_button=True
+    )
+    # Event handlers
+    def update_status(msg):
+        return gr.Markdown(f"**{msg}**")
+    transcribe_btn.click(
+        fn=transcribe_speech,
+        inputs=[audio_input, model_dropdown, language_dropdown, timestamps_checkbox],
+        outputs=transcript_output,
+        show_progress=True  # Progress bar for long transcriptions
+    ).then(
+        fn=lambda: update_status("Transcription complete! 🎉"),
+        outputs=status_output
+    )
+# Launch the Gradio app only when this file is run as a script, not on import.
+if __name__ == "__main__":
+    demo.launch()