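# NOTE: the lines below appear to be web-page header text captured along with
# the file; they are not part of the program.
#   clchan's picture
#   Create app.py
#   4eb6f67 verified
#   raw
#   history blame contribute delete
#   719 Bytes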
import gradio as gr
from transformers import pipeline
import numpy as np
# English-only Whisper speech-recognition pipeline, built once at module level
# so the model is loaded a single time and shared by every call.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base.en",
)
def _to_mono_float32(samples):
    """Return *samples* as a 1-D, peak-normalised ``float32`` array.

    Input with more than one dimension is averaged over axis 1 (assumed to be
    the channel axis, i.e. a ``(samples, channels)`` layout -- confirm if the
    audio ever comes from a source other than the Gradio component).
    """
    mono = np.asarray(samples)
    if mono.ndim > 1:
        mono = mono.mean(axis=1)
    # astype() copies, so the in-place scaling below never touches the
    # caller's array.
    mono = mono.astype(np.float32)
    if mono.size == 0:
        return mono
    peak = np.max(np.abs(mono))
    # A silent clip has a peak of 0; dividing would turn every sample into NaN.
    if peak > 0:
        mono /= peak
    return mono


def _transcribe_clip(audio):
    """Transcribe one ``(sampling_rate, samples)`` pair; return "" if absent.

    ``None`` (nothing supplied for this input) and zero-length clips are
    reported as an empty transcript instead of raising.
    """
    if audio is None:
        return ""
    sampling_rate, samples = audio
    mono = _to_mono_float32(samples)
    if mono.size == 0:
        return ""
    return transcriber({"sampling_rate": sampling_rate, "raw": mono})["text"]


def transcribe(audio1, audio2):
    """Transcribe two audio clips and join the results with " -- ".

    Args:
        audio1: ``(sampling_rate, samples)`` tuple for the first clip, or
            ``None`` when no audio was provided.
        audio2: Same as ``audio1``, for the second clip.

    Returns:
        The two transcripts concatenated as ``"<text1> -- <text2>"``. A
        missing or empty clip contributes an empty string.
    """
    # Each clip is transcribed with its OWN sampling rate; the previous code
    # passed the first clip's rate for both, which mis-describes the second
    # clip whenever the two rates differ.
    text1 = _transcribe_clip(audio1)
    text2 = _transcribe_clip(audio2)
    return text1 + " -- " + text2
# Web UI: two upload-only audio inputs are handed to `transcribe`, and the
# returned string is displayed in a single text output.
demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="upload"),
        gr.Audio(sources="upload"),
    ],
    outputs=["text"],
)

# Start the Gradio server as soon as this file is executed.
demo.launch()