"""Gradio demo: transcribe two uploaded audio clips with Whisper."""

import gradio as gr
import numpy as np
from transformers import pipeline

# English-only Whisper base checkpoint, loaded once at import time and
# shared by every request.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")


def _to_mono_float32(y):
    """Return *y* as a 1-D float32 waveform scaled to a peak of 1.0.

    Args:
        y: NumPy array of samples. A multi-dimensional array is averaged
            over axis 1 (assumed to be the channel axis, i.e. shape
            ``(samples, channels)`` -- confirm against the Gradio version
            in use).

    Returns:
        A new float32 array; the input array is not modified. Silent
        (all-zero) or empty input is returned unscaled instead of being
        divided by zero.
    """
    if y.ndim > 1:
        y = y.mean(axis=1)
    # astype() copies, so the in-place division below never touches the
    # caller's array.
    y = y.astype(np.float32)
    # np.max raises on an empty array, and a zero peak would turn the
    # whole waveform into NaNs, so only normalise when there is signal.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak
    return y


def transcribe(audio1, audio2):
    """Transcribe two audio clips and join the texts with ``" -- "``.

    Args:
        audio1: ``(sample_rate, samples)`` tuple for the first clip, as
            unpacked by this function; ``None`` when nothing was uploaded.
        audio2: Same as *audio1*, for the second clip.

    Returns:
        The two transcriptions concatenated as ``"<text1> -- <text2>"``.

    Raises:
        gr.Error: If either clip is missing.
    """
    if audio1 is None or audio2 is None:
        # Without this guard the tuple unpacking below fails with an
        # opaque TypeError; gr.Error is shown to the user in the UI.
        raise gr.Error("Please upload both audio files before transcribing.")

    # Each clip is transcribed with its OWN sampling rate; the two uploads
    # are independent files and may have been recorded at different rates.
    texts = [
        transcriber({"sampling_rate": sr, "raw": _to_mono_float32(y)})["text"]
        for sr, y in (audio1, audio2)
    ]
    return " -- ".join(texts)


demo = gr.Interface(
    transcribe,
    [gr.Audio(sources="upload"), gr.Audio(sources="upload")],
    ["text"],
)

demo.launch()