Update app.py

app.py CHANGED
@@ -1,133 +1,188 @@

(The removed side of this hunk is mostly truncated in the diff view. The recoverable removed lines show that the previous version imported numpy, stopped at four models with "Whisper Medium (High Accuracy)": "openai/whisper-medium", hard-coded a ten-entry language_map inside its transcription function, and set generate_kwargs["task"] = "transcribe" in an else branch. It returned "No audio provided. Please upload an audio file or record using the microphone." when called without audio, transcribed with result = asr(data, generate_kwargs=generate_kwargs) against a preloaded pipeline, and wrapped the call in a try/except that returned f"Error during transcription: {str(e)}". Its Examples and click wiring passed only [audio_input, model_choice, task_choice, language_choice].)
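The update drops that input guard and try/except. A minimal sketch of carrying them over, assuming the process_audio signature and two-value return defined in the updated file below:

def safe_process_audio(audio_file, model_choice, task_choice,
                       language_choice, timestamp_choice, beam_size):
    # Input guard kept from the previous version of app.py
    if audio_file is None:
        return ("No audio provided. Please upload an audio file or record "
                "using the microphone."), gr.update(visible=False)
    try:
        return process_audio(audio_file, model_choice, task_choice,
                             language_choice, timestamp_choice, beam_size)
    except Exception as e:
        # Error message format kept from the previous version
        return f"Error during transcription: {str(e)}", gr.update(visible=False)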
from transformers import pipeline
import gradio as gr
import torch  # used below for the CUDA availability checks

# Updated model options with 2 new models
MODEL_OPTIONS = {
    "Whisper Tiny (Fastest)": "openai/whisper-tiny",
    "Whisper Base (Balanced)": "openai/whisper-base",
    "Whisper Small (Better Accuracy)": "openai/whisper-small",
    "Whisper Medium (High Accuracy)": "openai/whisper-medium",
    "Whisper Large (Highest Accuracy)": "openai/whisper-large",  # New model
    "Whisper Large-v2 (Latest)": "openai/whisper-large-v2"  # New model
}

# Language codes for Whisper
LANGUAGE_CODES = {
    "Auto-detect": None,
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Korean": "ko",
    "Arabic": "ar",
    "Hindi": "hi",
    "Dutch": "nl"
}

def transcribe_audio(audio_file, model_choice, task_choice, language_choice):
    # Standalone helper; the Blocks UI below wires up process_audio, which
    # extends this with timestamps and beam search
    # Initialize the pipeline with the selected model
    model_name = MODEL_OPTIONS[model_choice]
    task = "translate" if task_choice == "Translate to English" else "transcribe"
    language = LANGUAGE_CODES[language_choice]

    # Create pipeline
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_name,
        chunk_length_s=30,
        device=0 if torch.cuda.is_available() else -1
    )

    # Generate kwargs for the pipeline
    generate_kwargs = {"task": task}
    if language and task == "transcribe":
        generate_kwargs["language"] = language

    # Process audio file
    result = pipe(
        audio_file,
        generate_kwargs=generate_kwargs,
        return_timestamps=False
    )

    return result["text"]

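# Example use of the helper, with a hypothetical local file name:
#   text = transcribe_audio("sample.wav", "Whisper Tiny (Fastest)",
#                           "Transcribe", "Auto-detect")
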
with gr.Blocks() as demo:
    gr.Markdown("# 🎵 Audio Transcription & Translation")
    gr.Markdown("Upload an audio file or use your microphone to transcribe or translate speech.")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="Audio Input",
                type="filepath",
                # sources is the Gradio 4.x parameter; older releases spell it
                # source="upload", which would disable microphone recording
                sources=["upload", "microphone"]
            )

            # Updated model selection with new models
            model_choice = gr.Dropdown(
                choices=list(MODEL_OPTIONS.keys()),
                value="Whisper Tiny (Fastest)",
                label="Model Selection"
            )

            task_choice = gr.Radio(
                choices=["Transcribe", "Translate to English"],
                value="Transcribe",
                label="Task"
            )

            # Extended language options
            language_choice = gr.Dropdown(
                choices=list(LANGUAGE_CODES.keys()),
                value="Auto-detect",
                label="Language (for transcription)"
            )

            # New features
            timestamp_choice = gr.Checkbox(
                label="Include Timestamps",
                value=False
            )

            beam_size = gr.Slider(
                minimum=1,
                maximum=10,
                value=1,
                step=1,
                label="Beam Size (Higher = Better Accuracy but Slower)"
            )

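            # The slider value feeds generate_kwargs["num_beams"] in
            # process_audio below: beam search with num_beams > 1 keeps several
            # candidate transcriptions during decoding, trading speed for accuracy
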
        with gr.Column():
            text_output = gr.Textbox(
                lines=15,
                label="Transcription",
                interactive=False
            )

            # New output for timestamps
            timestamp_output = gr.Textbox(
                lines=8,
                label="Timestamps (if enabled)",
                interactive=False,
                visible=False
            )

            transcribe_btn = gr.Button("Transcribe Audio", variant="primary")

    # Updated function to handle new features
    def process_audio(audio_file, model_choice, task_choice, language_choice, timestamp_choice, beam_size):
        model_name = MODEL_OPTIONS[model_choice]
        task = "translate" if task_choice == "Translate to English" else "transcribe"
        language = LANGUAGE_CODES[language_choice]

        pipe = pipeline(
            "automatic-speech-recognition",
            model=model_name,
            chunk_length_s=30,
            device=0 if torch.cuda.is_available() else -1
        )

        generate_kwargs = {
            "task": task,
            "num_beams": beam_size
        }
        if language and task == "transcribe":
            generate_kwargs["language"] = language

        # Process with or without timestamps
        if timestamp_choice:
            result = pipe(
                audio_file,
                generate_kwargs=generate_kwargs,
                return_timestamps=True
            )
            # The end timestamp can be None for the final chunk, so format it defensively
            timestamp_lines = []
            for chunk in result.get("chunks", []):
                start, end = chunk["timestamp"]
                end_str = f"{end:.2f}s" if end is not None else "?"
                timestamp_lines.append(f"[{start:.2f}s -> {end_str}] {chunk['text']}")
            timestamp_text = "\n".join(timestamp_lines)
            # Return the timestamp box's value and visibility in a single update
            return result["text"], gr.update(value=timestamp_text, visible=True)
        else:
            result = pipe(
                audio_file,
                generate_kwargs=generate_kwargs,
                return_timestamps=False
            )
            return result["text"], gr.update(value="", visible=False)

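    # For reference: with return_timestamps=True the pipeline returns a dict
    # shaped like {"text": "...", "chunks": [{"timestamp": (0.0, 5.0),
    # "text": "..."}]}, which the loop above formats one chunk per line
    # (illustrative values, not output from a real run)
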
    transcribe_btn.click(
        process_audio,
        inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
        # One output slot per returned value: the transcription text, then the
        # timestamp box update (a component may appear in outputs only once)
        outputs=[text_output, timestamp_output]
    )

    gr.Examples(
        examples=[
            ["example_audio_1.wav", "Whisper Tiny (Fastest)", "Transcribe", "Auto-detect", False, 1],
            ["example_audio_2.wav", "Whisper Base (Balanced)", "Transcribe", "English", False, 1],
            ["example_audio_3.wav", "Whisper Small (Better Accuracy)", "Translate to English", "Auto-detect", False, 1],
            ["example_audio_4.wav", "Whisper Large (Highest Accuracy)", "Transcribe", "Spanish", True, 3]
        ],
        inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
    )

gr.Markdown("### Features")
|
| 180 |
+
gr.Markdown("- **Model Selection**: Choose from 6 different Whisper models with speed/accuracy tradeoffs")
|
| 181 |
gr.Markdown("- **Task Options**: Transcribe audio in original language or translate to English")
|
| 182 |
gr.Markdown("- **Language Selection**: Auto-detect or specify input language for better accuracy")
|
| 183 |
gr.Markdown("- **Multiple Input Methods**: Upload audio files or record with microphone")
|
| 184 |
+
gr.Markdown("- **Timestamps**: Option to include word-level timestamps")
|
| 185 |
+
gr.Markdown("- **Beam Search**: Adjustable beam size for better accuracy")
|
| 186 |
|
| 187 |
gr.Markdown("### Model Information")
|
| 188 |
gr.Markdown("""
|
|
|
|
| 192 |
| Whisper Base | 74M | Fast | Balanced performance |
|
| 193 |
| Whisper Small | 244M | Medium | Better accuracy |
|
| 194 |
| Whisper Medium | 769M | Slow | High accuracy transcriptions |
|
| 195 |
+
| Whisper Large | 1.5B | Slower | Very high accuracy |
|
| 196 |
+
| Whisper Large-v2 | 1.5B | Slower | Latest improvements |
|
| 197 |
""")
|
| 198 |
|
| 199 |
gr.Markdown("- **Supported Formats**: WAV, MP3, M4A, FLAC")
|
| 200 |
+
gr.Markdown("- **Note**: First transcription may take 10-60 seconds (model loading)")
|
| 201 |
|
| 202 |
if __name__ == "__main__":
|
| 203 |
demo.launch()
|
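Both helper functions rebuild the pipeline on every click, so the 10-60 second model-loading cost noted above is paid per request rather than once. A minimal sketch of caching pipelines per model name, assuming the same MODEL_OPTIONS and pipeline call as above:

import functools

import torch
from transformers import pipeline

@functools.lru_cache(maxsize=2)  # keep at most two models in memory
def get_pipe(model_name: str):
    # Built once per model name, then reused across clicks
    return pipeline(
        "automatic-speech-recognition",
        model=model_name,
        chunk_length_s=30,
        device=0 if torch.cuda.is_available() else -1
    )

With this in place, both transcribe functions would call get_pipe(model_name) instead of constructing the pipeline inline; only the first request per model pays the loading cost.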