# app.py — Whisper speech-to-text Gradio demo (Hugging Face Space)
# (Space viewer chrome removed: commit c207b4a, 1.68 kB)
import gradio as gr
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline
import torch
import soundfile as sf
import librosa
# Whisper-base ASR pipeline, created once at import time (downloads the
# model on first run; runs on CPU unless a device is configured elsewhere).
pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
def audio_to_text(audio):
    """Transcribe an audio input to text with the Whisper pipeline.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray] | str
        Either a ``(sample_rate, data)`` tuple as produced by ``gr.Audio``
        (numpy mode), or a path to an audio file.

    Returns
    -------
    str
        The transcribed text, or a fallback message if the pipeline
        returned neither ``"text"`` nor ``"chunks"``.
    """
    if isinstance(audio, tuple):
        # Gradio numpy mode: (sample_rate, samples). The rate is the
        # recording device's native rate (often 44.1/48 kHz), NOT 16 kHz.
        sample_rate, audio_data = audio
    else:
        # File path: let librosa decode and resample to Whisper's 16 kHz.
        audio_data, sample_rate = librosa.load(audio, sr=16000)

    audio_array = np.asarray(audio_data)

    # Fix: Gradio delivers integer PCM (typically int16); normalize to
    # [-1, 1] floats as the pipeline expects, instead of a bare astype
    # that would leave values in the +/-32768 range.
    if np.issubdtype(audio_array.dtype, np.integer):
        audio_array = audio_array.astype(np.float32) / np.iinfo(audio_array.dtype).max
    else:
        audio_array = audio_array.astype(np.float32)

    # Down-mix multi-channel audio (shape (samples, channels)) to mono.
    if audio_array.ndim > 1:
        audio_array = audio_array.mean(axis=1)

    # Fix: pass the true sampling rate alongside the samples so the
    # pipeline resamples internally — a bare array is assumed to already
    # be at the model's 16 kHz, which is wrong for microphone input.
    transcription = pipe1(
        {"sampling_rate": sample_rate, "raw": audio_array},
        return_timestamps=True,
    )

    # The HF ASR pipeline returns {"text": ..., "chunks": [...]} when
    # return_timestamps=True (fix: the key is "chunks", not "segments").
    if "text" in transcription:
        transcription_text = transcription["text"]
    elif "chunks" in transcription:
        transcription_text = " ".join(chunk["text"] for chunk in transcription["chunks"])
    else:
        transcription_text = "No transcription available."

    print("Transcription:", transcription_text)
    return transcription_text
# Gradio UI: audio in (microphone or file upload), transcription text out.
demo = gr.Interface(fn=audio_to_text, inputs=gr.Audio(), outputs="text")
# share=True requests a public tunnel URL (ignored when hosted on Spaces).
demo.launch(share=True)