# P1 / app.py
# (Hugging Face Space file header — kept as comments so the module parses)
# RP-Azul — Update app.py — commit 4a6a287 (verified)
import gradio as gr
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline
import torch
import soundfile as sf
import librosa
# Whisper-base ASR pipeline used by audio_to_text; the model is downloaded
# and loaded once at import time so requests don't pay the startup cost.
pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
def audio_to_text(audio):
    """Transcribe an audio input to text with the Whisper ASR pipeline.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | str | None
        Either a ``(sample_rate, samples)`` tuple as produced by
        ``gr.Audio`` (microphone input), or a path to an audio file.

    Returns
    -------
    str
        The transcription, or a fallback message when nothing was produced.
    """
    if audio is None:
        # gr.Audio passes None when the user submits without recording.
        return "No transcription available."

    if isinstance(audio, tuple):
        sample_rate, audio_data = audio
        audio_data = np.asarray(audio_data)
        # gr.Audio delivers integer PCM (typically int16); scale to the
        # [-1, 1] float range Whisper expects instead of casting raw values.
        if np.issubdtype(audio_data.dtype, np.integer):
            audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
        else:
            audio_data = audio_data.astype(np.float32)
        # Mono-mix multi-channel recordings (shape (n_samples, n_channels)).
        if audio_data.ndim > 1:
            audio_data = np.mean(audio_data, axis=1)
        # Whisper models run at 16 kHz; the microphone rate is usually 44.1/48 kHz.
        if sample_rate != 16000:
            audio_data = librosa.resample(
                audio_data, orig_sr=sample_rate, target_sr=16000
            )
    else:
        # File path: librosa loads mono float32 resampled to 16 kHz directly.
        audio_data, sample_rate = librosa.load(audio, sr=16000)

    audio_array = np.asarray(audio_data, dtype=np.float32)

    # With return_timestamps=True the HF ASR pipeline returns
    # {"text": ..., "chunks": [...]} — the key is "chunks", not "segments".
    transcription = pipe1(audio_array, return_timestamps=True)

    if isinstance(transcription, dict) and "text" in transcription:
        transcription_text = transcription["text"]
    elif isinstance(transcription, dict) and "chunks" in transcription:
        transcription_text = " ".join(
            chunk["text"] for chunk in transcription["chunks"]
        )
    else:
        transcription_text = "No transcription available."

    print("Transcription:", transcription_text)
    return transcription_text
# Wire the transcriber into a minimal Gradio UI: audio in (mic or upload), text out.
demo = gr.Interface(fn=audio_to_text, inputs=gr.Audio(), outputs="text")
# share=True requests a public gradio.live tunnel.
# NOTE(review): on Hugging Face Spaces this flag is ignored — confirm it is
# still wanted for local runs.
demo.launch(share=True)