import gradio as gr
from transformers import pipeline
import numpy as np
# Module-level ASR pipeline (loaded once at import time). whisper-base.en is an
# English-only checkpoint. NOTE(review): return_timestamps=True appears intended
# to enable chunked long-form decoding for clips over 30 s — confirm; only the
# final "text" field is consumed below.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", return_timestamps=True)
def transcribe(stream, new_chunk=None):
    """Transcribe an uploaded audio clip to text.

    Parameters:
        stream: ``(sample_rate, samples)`` tuple as produced by ``gr.Audio``,
            or ``None`` when no audio has been provided.
        new_chunk: unused legacy parameter. It now defaults to ``None`` because
            the UI wires this handler with a single input
            (``btn.click(fn=transcribe, inputs=audio, ...)``); with two
            required parameters the call raised ``TypeError`` at runtime.

    Returns:
        The transcribed text, or ``""`` when ``stream`` is ``None``.
    """
    if stream is None:
        return ""
    sr, y = stream
    # The model expects mono float audio: average the channels if stereo.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    # Peak-normalize to [-1, 1]; guard against all-zero (silent) input,
    # which previously produced 0/0 -> NaN samples.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak
    return transcriber({"sampling_rate": sr, "raw": y})["text"]
def clear(audio, transcribed):
    """Reset both bound components (audio input and transcript output).

    The incoming values are ignored; returning ``None`` for each output
    clears the corresponding Gradio component.
    """
    return None, None
# Build the demo UI. Statement order inside the context managers determines the
# on-screen layout. NOTE(review): the original paste lost all indentation; the
# nesting below (buttons inside the upload column, text area as a sibling in the
# outer row) is reconstructed — confirm against the intended layout.
with gr.Blocks() as demo:
    gr.HTML(value="<h1>Transcribe Audio to Text Demo</h1>")
    with gr.Row():
        with gr.Column():
            # Upload-only, non-streaming audio input.
            audio = gr.Audio(sources=["upload"], streaming=False, label="wav")
            with gr.Row():
                clr = gr.Button(value="Clear", variant="huggingface")
                btn = gr.Button(value="Transcribe", variant="primary")
        transcribed = gr.TextArea(label="Transcribed", lines=9)
    # Event wiring (must be registered inside the Blocks context):
    # Transcribe fills the text area; Clear resets both components.
    btn.click(fn=transcribe, inputs=audio, outputs=transcribed)
    clr.click(fn=clear, inputs=[audio, transcribed], outputs=[audio, transcribed])
demo.launch(share=True)