import gradio as gr
import numpy as np
import librosa
from transformers import pipeline
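
# Note: Whisper checkpoints expect 16 kHz mono float32 input, so every code
# path below converts the incoming audio to that format before transcription.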
pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
def audio_to_text(audio):
    # gr.Audio may deliver a (sample_rate, numpy_array) tuple or a file path
    if isinstance(audio, tuple):
        sample_rate, audio_data = audio  # Unpack sample rate and data
        audio_data = np.asarray(audio_data, dtype=np.float32)
        # Convert to mono if the audio has more than one channel
        if audio_data.ndim > 1:
            audio_data = np.mean(audio_data, axis=1)
        # Gradio delivers int16 PCM; scale to the [-1, 1] float range
        if np.abs(audio_data).max() > 1.0:
            audio_data = audio_data / 32768.0
        # Resample to the 16 kHz rate the Whisper model was trained on
        if sample_rate != 16000:
            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
    else:
        # If it's a file path, load as mono and resample to 16 kHz directly
        audio_data, sample_rate = librosa.load(audio, sr=16000, mono=True)
    audio_array = audio_data.astype(np.float32)
    # Transcribe; return_timestamps=True lets Whisper handle clips longer than 30 s
    transcription = pipe1(audio_array, return_timestamps=True)
    # The ASR pipeline returns {"text": ..., "chunks": [...]}; prefer the full text
    if "text" in transcription:
        transcription_text = transcription["text"]
    elif "chunks" in transcription:
        # Fall back to joining the timestamped chunks if only they are present
        transcription_text = " ".join(chunk["text"] for chunk in transcription["chunks"])
    else:
        transcription_text = "No transcription available."
    # Print and return the transcription text
    print("Transcription:", transcription_text)
    return transcription_text
demo = gr.Interface(fn=audio_to_text, inputs=gr.Audio(), outputs="text")
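# share=True serves the app locally and also publishes it on a temporary
# public gradio.live URL.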
demo.launch(share=True)