import gradio as gr
import numpy as np
from transformers import pipeline
import librosa

pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")

def audio_to_text(audio):
    # gr.Audio may pass either a (sample_rate, numpy_array) tuple or a file path
    if isinstance(audio, tuple):
        sample_rate, audio_data = audio  # Unpack sample rate and raw samples
        # Down-mix to mono if the audio has more than one channel
        if audio_data.ndim > 1:
            audio_data = np.mean(audio_data, axis=1)
        # Gradio delivers integer PCM; convert to float32 in [-1, 1]
        audio_data = audio_data.astype(np.float32)
        if np.abs(audio_data).max() > 1.0:
            audio_data /= 32768.0  # 16-bit PCM full-scale value
        # Whisper expects 16 kHz input; resample if the source rate differs
        if sample_rate != 16000:
            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
    else:
        # File path: librosa loads as mono float32 and resamples to 16 kHz
        audio_data, sample_rate = librosa.load(audio, sr=16000)

    # Transcribe; with return_timestamps=True the pipeline returns timestamped
    # segments under the "chunks" key alongside the full "text"
    transcription = pipe1(audio_data, return_timestamps=True)
    if "chunks" in transcription:
        # Join the text of each timestamped chunk
        transcription_text = " ".join(chunk["text"].strip() for chunk in transcription["chunks"])
    elif "text" in transcription:
        # Fall back to the full transcription in the "text" field
        transcription_text = transcription["text"]
    else:
        transcription_text = "No transcription available."

    # Print and return the transcription text
    print("Transcription:", transcription_text)
    return transcription_text

demo = gr.Interface(fn=audio_to_text, inputs=gr.Audio(), outputs="text")
demo.launch(share=True)
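
# --- Optional smoke test (a minimal sketch, not part of the original script) ---
# This assumes only numpy and the audio_to_text function defined above. It feeds
# a synthetic 1-second 440 Hz tone, already at 16 kHz, through the same
# (sample_rate, array) path that gr.Audio uses, so you can exercise the function
# without launching the Gradio UI (run it before demo.launch, which blocks).
# Whisper will likely return empty or arbitrary text for a pure tone; the point
# is only to verify the preprocessing and pipeline call run end to end.
#
# tone = 0.5 * np.sin(2 * np.pi * 440 * np.linspace(0, 1, 16000, endpoint=False))
# print(audio_to_text((16000, tone.astype(np.float32))))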