import gradio as gr
from transformers import pipeline
import numpy as np

# Whisper ASR pipeline. return_timestamps=True lets the model transcribe
# audio longer than 30 s (chunked decoding); we only read the "text" field.
transcriber = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",
    return_timestamps=True,
)


def transcribe(stream, new_chunk=None):
    """Transcribe an uploaded audio clip to text.

    Args:
        stream: Gradio audio value — a ``(sample_rate, np.ndarray)`` tuple,
            or ``None`` when no audio has been provided.
        new_chunk: Unused leftover from a streaming version of this demo.
            Kept with a default so the single-input ``btn.click`` wiring
            (which passes only ``stream``) no longer raises TypeError.

    Returns:
        The transcribed text, or ``""`` when there is no audio.
    """
    if stream is None:
        return ""
    sr, y = stream
    # Downmix stereo to mono; Whisper expects a 1-D waveform.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    # Peak-normalize to [-1, 1]; guard against division by zero on
    # silent or empty input (would otherwise produce a NaN array).
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak
    return transcriber({"sampling_rate": sr, "raw": y})["text"]


def clear(audio, transcribed):
    """Reset both the audio input and the transcription output."""
    return None, None


with gr.Blocks() as demo:
    # NOTE(review): the original heading markup was lost in extraction;
    # reconstructed as a plain <h1> carrying the same visible text.
    gr.HTML(value="""
<h1>Transcribe Audio to Text Demo</h1>
""")
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(sources=["upload"], streaming=False, label="wav")
            with gr.Row():
                clr = gr.Button(value="Clear", variant="huggingface")
                btn = gr.Button(value="Transcribe", variant="primary")
        transcribed = gr.TextArea(label="Transcribed", lines=9)
    btn.click(fn=transcribe, inputs=audio, outputs=transcribed)
    clr.click(fn=clear, inputs=[audio, transcribed], outputs=[audio, transcribed])

demo.launch(share=True)