| import gradio as gr |
| from transformers import pipeline |
| import numpy as np |
|
|
# Whisper ASR pipeline (English-only base model). return_timestamps=True —
# presumably enabled to support long-form audio; confirm it is needed when
# only the final "text" field is consumed below.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", return_timestamps=True)
|
|
|
|
def transcribe(stream, new_chunk=None):
    """Transcribe an audio clip to text with the module-level Whisper pipeline.

    Parameters:
        stream: tuple of (sample_rate, samples) as produced by ``gr.Audio``,
            or ``None`` when no audio has been provided.
        new_chunk: unused leftover from a streaming variant of this demo.
            Defaults to ``None`` so single-input callers work — the UI wires
            only one input component to this callback, and a required second
            positional argument would raise a TypeError on every click.

    Returns:
        The transcribed text, or ``""`` when there is no audio.
    """
    if stream is None:
        return ""

    sr, y = stream

    # Collapse multi-channel audio to mono; the ASR pipeline expects 1-D raw samples.
    if y.ndim > 1:
        y = y.mean(axis=1)

    y = y.astype(np.float32)

    # Peak-normalize to [-1, 1]; skip for an all-zero (silent) clip to
    # avoid a divide-by-zero producing NaNs.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    return transcriber({"sampling_rate": sr, "raw": y})["text"]
|
|
|
|
def clear(audio, transcribed):
    """Reset the audio input and the transcription output.

    Always returns a ``(None, None)`` pair so Gradio clears the two
    components bound as outputs; the incoming values are ignored.
    """
    return None, None
|
|
|
|
# UI layout: one row with an audio-input column (uploader + Clear/Transcribe
# buttons) next to a read-out text area for the transcription.
with gr.Blocks() as demo:
    gr.HTML(value="<h1>Transcribe Audio to Text Demo</h1>")
    with gr.Row():
        with gr.Column():
            # Upload-only (streaming disabled), despite the callback's
            # stream/new_chunk parameter names suggesting a streaming origin.
            audio = gr.Audio(sources=["upload"], streaming=False, label="wav")
            with gr.Row():
                clr = gr.Button(value="Clear", variant="huggingface")
                btn = gr.Button(value="Transcribe", variant="primary")
        transcribed = gr.TextArea(label="Transcribed", lines=9)

    # NOTE(review): transcribe() declares two positional parameters but only
    # `audio` is wired as an input, so Gradio calls it with a single argument —
    # confirm the callback signature tolerates one value (e.g. a default for
    # its second parameter).
    btn.click(fn=transcribe, inputs=audio, outputs=transcribed)
    # Clear resets both components by returning (None, None).
    clr.click(fn=clear, inputs=[audio, transcribed], outputs=[audio, transcribed])


# share=True exposes a temporary public Gradio link in addition to localhost.
demo.launch(share=True)
|
|