TristanBehrens commited on
Commit
de6325b
·
verified ·
1 Parent(s): 7f65a48

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +158 -0
  2. packages.txt +1 -0
  3. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import gradio as gr
from transformers import pipeline
import numpy as np
import time
from typing import Tuple
import logging
import torch


# Create a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Fail fast if any required configuration is missing from the environment.
# NOTE: we check for a missing/empty value directly instead of the previous
# sentinel-default trick (os.environ.get(var, "NO") == "NO"), which would
# misfire if a variable were legitimately set to the string "NO" and would
# silently accept an empty string.
required_variables = ["HF_TOKEN", "PASSWORD", "MODEL_NAME"]
for required_variable in required_variables:
    if not os.environ.get(required_variable):
        message = (
            f"Environment variable {required_variable} is not set. "
            "Please set it before running the application."
        )
        logger.error(message)
        raise ValueError(message)


# Create the transcription pipeline.
model_name = os.environ["MODEL_NAME"]
model_name = "openai/whisper-tiny"  # TODO: Remove this hardcoded override.
logger.warning("Using hardcoded model name 'openai/whisper-tiny'.")
# Prefer the GPU when available; fall back to CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Loading model {model_name} with device {device}...")
transcriber = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    device=device
)
logger.info("Model loaded successfully.")
41
+
42
+
43
+ # Start the app.
44
def main():
    """Entry point: build the Gradio interface and start serving it."""
    create_interface().launch()
47
+
48
+
49
+ # Create the Gradio interface for the Whisper transcription service.
50
def create_interface():
    """Build the Gradio Blocks UI for the Whisper transcription service."""
    with gr.Blocks() as interface:
        # Page title.
        gr.Markdown("# Whisper Speech Transcription")

        # Password entry row.
        with gr.Row():
            with gr.Column(scale=2):
                password_input = gr.Textbox(
                    label="Enter Password",
                    placeholder="Enter the password to access the transcription service",
                    type="password",
                )

        # Audio capture / upload row.
        with gr.Row():
            with gr.Column(scale=2):
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="numpy",
                    label="Record or Upload Audio",
                )

        # Manual trigger button.
        with gr.Row():
            transcribe_button = gr.Button("Transcribe", variant="primary")

        # Transcription result display.
        with gr.Row():
            output_text = gr.Textbox(
                label="Transcription Output",
                placeholder="Transcription will appear here...",
                lines=5,
            )

        # Timing / status display (read-only).
        status_text = gr.Textbox(
            label="Status",
            placeholder="Transcription status will appear here...",
            lines=1,
            interactive=False,
        )

        # Both the button click and any change to the audio widget run the
        # same transcription callback with the same wiring.
        event_wiring = dict(
            fn=transcribe_audio,
            inputs=[audio_input, password_input],
            outputs=[output_text, status_text],
        )
        transcribe_button.click(**event_wiring)
        audio_input.change(**event_wiring)

    return interface
110
+
111
+
112
def transcribe_audio(audio: Tuple[int, np.ndarray], password: str = None) -> Tuple[str, str]:
    """Transcribe an audio clip with the module-level Whisper pipeline.

    Args:
        audio: Tuple of (sampling rate in Hz, samples as a numpy array), as
            produced by a gr.Audio component with type="numpy". May be None
            when no audio has been recorded/uploaded yet.
        password: Password entered in the UI; must match the PASSWORD
            environment variable.

    Returns:
        Tuple of (transcription text or error message, status string).
    """
    # Reject requests that do not carry the configured password.
    # NOTE(review): this check was previously commented out with a
    # "TODO: Enable this" — enabled here, since PASSWORD is validated as a
    # required environment variable at startup.
    if password != os.environ.get("PASSWORD"):
        return "Incorrect password. Please try again.", ""

    # If there is no audio, return an error message.
    if audio is None:
        return "No audio detected. Please record some audio.", ""

    # Use the logger instead of the previous bare print() calls.
    logger.debug(f"Received audio of type {type(audio)}")

    # Start measuring the time.
    start_time = time.time()

    # Unpack the audio.
    sr, y = audio

    # Convert to mono if stereo.
    if y.ndim > 1:
        logger.debug(f"Converting {y.shape[1]} channels to mono")
        y = y.mean(axis=1)

    # Normalize audio to float32 in [-1, 1].
    y = y.astype(np.float32)
    max_abs = np.max(np.abs(y))
    if max_abs > 0:  # Avoid division by zero on silent input.
        y /= max_abs

    logger.info(f"Processing audio: {sr}Hz, {len(y)} samples (~{len(y)/sr:.2f}s)")

    # Run transcription in 30s chunks with a 6s left stride for long audio.
    result = transcriber({"sampling_rate": sr, "raw": y}, chunk_length_s=30, stride_length_s=[6, 0])
    logger.info("Transcription completed.")

    # Calculate elapsed time relative to the audio duration.
    elapsed_time = time.time() - start_time
    audio_time = len(y) / sr
    status_string = f"Transcription took {elapsed_time:.2f}s for {audio_time:.2f}s of audio"
    return result["text"], status_string
155
+
156
+
157
# Run the application when executed as a script (not on import).
if __name__ == "__main__":
    main()
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch>=2.0.0
2
+ torchaudio>=2.0.0
3
+ transformers==4.52.3
4
+ gradio==5.10.0
5
+ pydantic==2.10.6
6
+ numpy