import gradio as gr
from transformers import pipeline
import numpy as np

# Speech-to-text pipeline, constructed once at import time so every call to
# the handler below reuses the same loaded model.
transcriber = pipeline(
	task="automatic-speech-recognition",
	model="openai/whisper-base.en",
)

def _prepare_audio(y):
	"""Return *y* as a mono float32 waveform peak-normalized to 1.0.

	Args:
		y: NumPy array of samples. A 1-D array is treated as mono; for a
			multi-dimensional array, axis 1 is averaged away (assumes the
			``(samples, channels)`` layout Gradio delivers -- confirm if
			another source is used).

	Returns:
		A new float32 array. Silent or empty input is returned unscaled
		instead of being divided by a zero peak.
	"""
	if y.ndim > 1:
		y = y.mean(axis=1)

	# astype() copies, so the in-place division below never mutates the
	# caller's array.
	y = y.astype(np.float32)

	# np.max raises on an empty array, and a zero peak would turn the whole
	# signal into NaN/inf -- skip normalization in both cases.
	peak = np.max(np.abs(y)) if y.size else 0.0
	if peak > 0:
		y /= peak
	return y


def transcribe(audio1, audio2):
	"""Transcribe two audio clips and join the results with " -- ".

	Args:
		audio1: ``(sampling_rate, samples)`` tuple for the first clip, or
			``None`` when no file was provided.
		audio2: ``(sampling_rate, samples)`` tuple for the second clip, or
			``None`` when no file was provided.

	Returns:
		The two transcriptions as one string: ``"<text1> -- <text2>"``.

	Raises:
		gr.Error: If either clip is missing.
	"""
	if audio1 is None or audio2 is None:
		# Fail with a readable message rather than a TypeError on unpacking.
		raise gr.Error("Please upload both audio clips before transcribing.")

	sr1, y1 = audio1
	sr2, y2 = audio2

	# Each clip must be paired with its OWN sampling rate; the two uploads
	# are independent files and may differ.
	text1 = transcriber({"sampling_rate": sr1, "raw": _prepare_audio(y1)})["text"]
	text2 = transcriber({"sampling_rate": sr2, "raw": _prepare_audio(y2)})["text"]
	return f"{text1} -- {text2}"

# Web UI: two upload-sourced audio inputs are passed to `transcribe`, and the
# string it returns is rendered in a single text output.
demo = gr.Interface(
	fn=transcribe,
	inputs=[gr.Audio(sources="upload"), gr.Audio(sources="upload")],
	outputs=["text"],
)

# Start the Gradio server as soon as the script is executed.
demo.launch()