import gradio as gr
import numpy as np
import librosa
from transformers import pipeline
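
# Note: Whisper checkpoints expect 16 kHz mono float32 input, so every code
# path below converts the incoming audio to that format before transcription.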
pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
def audio_to_text(audio):
    # gr.Audio may deliver a (sample_rate, numpy_array) tuple or a file path
    if isinstance(audio, tuple):
        sample_rate, audio_data = audio  # Unpack sample rate and data
        audio_data = np.asarray(audio_data, dtype=np.float32)
        # Convert to mono if the audio has more than one channel
        if audio_data.ndim > 1:
            audio_data = np.mean(audio_data, axis=1)
        # Gradio delivers int16 PCM; scale to the [-1, 1] float range
        if np.abs(audio_data).max() > 1.0:
            audio_data = audio_data / 32768.0
        # Resample to the 16 kHz rate the Whisper model was trained on
        if sample_rate != 16000:
            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
    else:
        # If it's a file path, load as mono and resample to 16 kHz directly
        audio_data, sample_rate = librosa.load(audio, sr=16000, mono=True)
    audio_array = audio_data.astype(np.float32)
    # Transcribe; return_timestamps=True lets Whisper handle clips longer than 30 s
    transcription = pipe1(audio_array, return_timestamps=True)
    # The ASR pipeline returns {"text": ..., "chunks": [...]}; prefer the full text
    if "text" in transcription:
        transcription_text = transcription["text"]
    elif "chunks" in transcription:
        # Fall back to joining the timestamped chunks if only they are present
        transcription_text = " ".join(chunk["text"] for chunk in transcription["chunks"])
    else:
        transcription_text = "No transcription available."
    # Print and return the transcription text
    print("Transcription:", transcription_text)
    return transcription_text
demo = gr.Interface(fn=audio_to_text, inputs=gr.Audio(), outputs="text")
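# share=True serves the app locally and also publishes it on a temporary
# public gradio.live URL.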
demo.launch(share=True)