Spaces:
Build error
Build error
| import os | |
| import gradio as gr | |
| from transformers import pipeline | |
| import numpy as np | |
| import time | |
| from typing import Tuple | |
| import logging | |
| import torch | |
# Create a logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Fail fast at startup if any required setting is missing. An empty value is
# treated the same as an unset one, and no in-band sentinel is used, so a
# variable that is legitimately set to the string "NO" is accepted.
required_variables = ["HF_TOKEN", "PASSWORD", "MODEL_NAME"]
for required_variable in required_variables:
    if not os.environ.get(required_variable):
        # Build the message once so the log entry and the exception agree.
        message = (
            f"Environment variable {required_variable} is not set. "
            "Please set it before running the application."
        )
        logger.error(message)
        raise ValueError(message)
# Build the speech-recognition pipeline once at module level; the request
# handler reuses this single `transcriber` object for every call.
model_name = os.environ["MODEL_NAME"]
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
logger.info("Loading model %s with device %s...", model_name, device)
transcriber = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    device=device,
)
logger.info("Model loaded successfully.")
# Start the app.
def main():
    """Build the Gradio interface and launch it."""
    create_interface().launch()
# Create the Gradio interface for the Whisper transcription service.
def create_interface():
    """Assemble the Gradio Blocks UI and wire it to ``transcribe_audio``.

    The layout consists of a password field, an audio recorder/uploader, a
    "Transcribe" button, a transcription text box and a read-only status box.
    Transcription is triggered both by the button click and by any change of
    the audio component.

    Returns:
        The constructed ``gr.Blocks`` object; it is not launched here.
    """
    # NOTE(review): the nesting below was reconstructed from a copy of the
    # file that had lost its indentation. In particular, the status box is
    # assumed to sit below the output row rather than inside it -- confirm.
    with gr.Blocks() as interface:
        # Page title.
        gr.Markdown("# Whisper Speech Transcription")

        # Password entry.
        with gr.Row(), gr.Column(scale=2):
            password_box = gr.Textbox(
                label="Enter Password",
                placeholder="Enter the password to access the transcription service",
                type="password",
            )

        # Audio capture: microphone recording or file upload, delivered to
        # the handler as a (sampling_rate, numpy array) pair.
        with gr.Row(), gr.Column(scale=2):
            audio_box = gr.Audio(
                sources=["microphone", "upload"],
                type="numpy",
                label="Record or Upload Audio",
            )

        # Button that starts a transcription explicitly.
        with gr.Row():
            run_button = gr.Button("Transcribe", variant="primary")

        # Transcription result.
        with gr.Row():
            transcript_box = gr.Textbox(
                label="Transcription Output",
                placeholder="Transcription will appear here...",
                lines=5,
            )

        # Read-only status line (timing information from the handler).
        status_box = gr.Textbox(
            label="Status",
            placeholder="Transcription status will appear here...",
            lines=1,
            interactive=False,
        )

        # Register the same handler for the button click and for audio
        # changes (new recording or upload), in that order.
        for register in (run_button.click, audio_box.change):
            register(
                fn=transcribe_audio,
                inputs=[audio_box, password_box],
                outputs=[transcript_box, status_box],
            )

    return interface
def transcribe_audio(audio: Tuple[int, np.ndarray], password: str = None) -> Tuple[str, str]:
    """Check the password, then transcribe one audio clip.

    Args:
        audio: ``(sampling_rate, samples)`` pair, or ``None`` when there is no
            audio. Samples with more than one dimension are averaged over
            axis 1 to obtain a mono signal.
        password: Password supplied by the user; compared against the
            ``PASSWORD`` environment variable.

    Returns:
        A ``(text, status)`` tuple. ``text`` is the transcription, or an error
        message when the password is wrong or no audio was provided (in which
        case ``status`` is the empty string). On success ``status`` reports
        the processing time, the audio duration and the model name.
    """
    # Function-scope import: hmac is not among the file's top-level imports.
    import hmac

    # Fail closed: if PASSWORD is unset (or no password was passed at all),
    # nobody gets in. A plain `!=` would treat None == None as a match.
    expected = os.environ.get("PASSWORD")
    if expected is None or password is None:
        return "Incorrect password. Please try again.", ""
    # Constant-time comparison; compare bytes so non-ASCII passwords work.
    if not hmac.compare_digest(password.encode("utf-8"), expected.encode("utf-8")):
        return "Incorrect password. Please try again.", ""

    # If there is no audio, return an error message.
    if audio is None:
        return "No audio detected. Please record some audio.", ""

    # Start measuring the time.
    start_time = time.time()

    # Unpack the audio.
    sr, y = audio

    # A zero-length clip has no maximum to normalize by and nothing to
    # transcribe; report it like missing audio instead of crashing in np.max.
    if y.size == 0:
        return "No audio detected. Please record some audio.", ""

    # Convert to mono if stereo.
    if y.ndim > 1:
        logger.debug("Converting %s channels to mono", y.shape[1])
        y = y.mean(axis=1)

    # Peak-normalize as float32.
    y = y.astype(np.float32)
    max_abs = np.max(np.abs(y))
    if max_abs > 0:  # Avoid division by zero on pure silence.
        y /= max_abs

    audio_time = len(y) / sr
    logger.info("Processing audio: %sHz, %s samples (~%.2fs)", sr, len(y), audio_time)

    # Run transcription on 30 s chunks with a 6 s left stride and no right
    # stride.
    result = transcriber(
        {"sampling_rate": sr, "raw": y},
        chunk_length_s=30,
        stride_length_s=[6, 0],
    )
    logger.info("Transcription completed.")

    # Calculate elapsed time.
    elapsed_time = time.time() - start_time
    status_string = f"Transcription took {elapsed_time:.2f}s for {audio_time:.2f}s of audio with model {model_name}."
    return result["text"], status_string
# Entrypoint: launch the app only when this file is executed as a script, so
# that importing the module does not start the web server.
if __name__ == "__main__":
    main()