import gradio as gr
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline
import torch
import soundfile as sf
import librosa

# Speech-to-text: Whisper (base) through the transformers ASR pipeline.
pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
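
# For reference, with return_timestamps=True this pipeline returns a dict of the
# form below (the values are purely illustrative, not real output):
#
#   {"text": " Hello world.",
#    "chunks": [{"timestamp": (0.0, 1.5), "text": " Hello world."}]}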


def audio_to_text(audio):
    # Gradio's Audio component hands us either a (sample_rate, np.ndarray)
    # tuple (numpy mode) or a path to the recorded/uploaded file.
    if isinstance(audio, tuple):
        sample_rate, audio_data = audio
    else:
        audio_data, sample_rate = librosa.load(audio, sr=16000)

    # Whisper expects float32 samples in [-1, 1]; Gradio's numpy mode returns
    # integer PCM, so scale it when needed.
    audio_array = np.asarray(audio_data)
    if np.issubdtype(audio_array.dtype, np.integer):
        audio_array = audio_array.astype(np.float32) / np.iinfo(audio_array.dtype).max
    else:
        audio_array = audio_array.astype(np.float32)

    # Down-mix stereo (samples, channels) to mono.
    if audio_array.ndim > 1:
        audio_array = np.mean(audio_array, axis=1)

    # Resample to the 16 kHz rate the Whisper checkpoint expects.
    if sample_rate != 16000:
        audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)

    transcription = pipe1(audio_array, return_timestamps=True)

    # The pipeline reports timestamped "chunks" (not "segments") alongside the
    # full text.
    if "chunks" in transcription and transcription["chunks"]:
        transcription_text = " ".join(chunk["text"].strip() for chunk in transcription["chunks"])
    elif "text" in transcription:
        transcription_text = transcription["text"].strip()
    else:
        transcription_text = "No transcription available."

    print("Transcription:", transcription_text)
    return transcription_text

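
# Optional sanity check (a sketch, not required for the app): call audio_to_text
# directly with a synthetic (sample_rate, samples) tuple shaped like Gradio's
# numpy audio input. The 440 Hz test tone is a placeholder signal, so whatever
# Whisper transcribes for it is not meaningful; this only exercises the code
# path (including the int16 scaling) before the UI starts.
test_sr = 16000
test_tone = np.sin(2 * np.pi * 440.0 * np.arange(test_sr) / test_sr)
test_tone = (0.1 * test_tone * 32767).astype(np.int16)
audio_to_text((test_sr, test_tone))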

# Minimal Gradio UI: record or upload audio, get the transcription back.
# share=True also exposes a temporary public gradio.live link.
demo = gr.Interface(fn=audio_to_text, inputs=gr.Audio(), outputs="text")
demo.launch(share=True)