import gradio as gr
import numpy as np
from transformers import pipeline
import librosa

pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")

def audio_to_text(audio):
    # gr.Audio may pass either a (sample_rate, numpy_array) tuple or a file path
    if isinstance(audio, tuple):
        sample_rate, audio_data = audio  # Unpack sample rate and raw samples
        # Down-mix to mono if the audio has more than one channel
        if audio_data.ndim > 1:
            audio_data = np.mean(audio_data, axis=1)
        # Gradio delivers integer PCM; convert to float32 in [-1, 1]
        audio_data = audio_data.astype(np.float32)
        if np.abs(audio_data).max() > 1.0:
            audio_data /= 32768.0  # 16-bit PCM full-scale value
        # Whisper expects 16 kHz input; resample if the source rate differs
        if sample_rate != 16000:
            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
    else:
        # File path: librosa loads as mono float32 and resamples to 16 kHz
        audio_data, sample_rate = librosa.load(audio, sr=16000)

    # Transcribe; with return_timestamps=True the pipeline returns timestamped
    # segments under the "chunks" key alongside the full "text"
    transcription = pipe1(audio_data, return_timestamps=True)
    if "chunks" in transcription:
        # Join the text of each timestamped chunk
        transcription_text = " ".join(chunk["text"].strip() for chunk in transcription["chunks"])
    elif "text" in transcription:
        # Fall back to the full transcription in the "text" field
        transcription_text = transcription["text"]
    else:
        transcription_text = "No transcription available."

    # Print and return the transcription text
    print("Transcription:", transcription_text)
    return transcription_text

demo = gr.Interface(fn=audio_to_text, inputs=gr.Audio(), outputs="text")
demo.launch(share=True)
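
# --- Optional smoke test (a minimal sketch, not part of the original script) ---
# This assumes only numpy and the audio_to_text function defined above. It feeds
# a synthetic 1-second 440 Hz tone, already at 16 kHz, through the same
# (sample_rate, array) path that gr.Audio uses, so you can exercise the function
# without launching the Gradio UI (run it before demo.launch, which blocks).
# Whisper will likely return empty or arbitrary text for a pure tone; the point
# is only to verify the preprocessing and pipeline call run end to end.
#
# tone = 0.5 * np.sin(2 * np.pi * 440 * np.linspace(0, 1, 16000, endpoint=False))
# print(audio_to_text((16000, tone.astype(np.float32))))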