File size: 1,221 Bytes
d9ca276
b26a457
 
d9ca276
68c743d
b26a457
a2f7538
b26a457
9f08cb3
 
 
 
 
a2f7538
b26a457
 
 
a2f7538
b26a457
 
 
9f08cb3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d9ca276
d2a9540
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import gradio as gr
from transformers import pipeline
import numpy as np

# Module-level ASR pipeline using Whisper base (English-only).
# NOTE(review): return_timestamps=True is presumably enabled so the pipeline
# can chunk and transcribe clips longer than Whisper's 30 s window — confirm.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", return_timestamps=True)


def transcribe(stream, new_chunk=None):
    """Transcribe an uploaded audio clip to text.

    Parameters
    ----------
    stream : tuple[int, numpy.ndarray] | None
        ``(sample_rate, samples)`` as produced by ``gr.Audio``, or ``None``
        when no audio has been provided.
    new_chunk : optional
        Unused; kept for backward compatibility with the original streaming
        signature. It now defaults to ``None`` because the UI wires only one
        input (``inputs=audio``), which previously raised TypeError on click.

    Returns
    -------
    str
        The transcribed text, or ``""`` when there is no audio.
    """
    if stream is None:
        return ""

    sr, y = stream

    # Down-mix stereo/multi-channel to mono.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Whisper expects float32 samples; normalize peak to [-1, 1].
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # guard: an all-silent clip would otherwise divide by zero -> NaN
        y /= peak

    text = transcriber({"sampling_rate": sr, "raw": y})["text"]
    return text


def clear(audio, transcribed):
    """Reset both the audio input and the transcription output.

    The original body rebound the parameters to ``None`` before returning
    them — rebinding locals has no effect on the caller, so this simply
    returns the pair of ``None`` values Gradio uses to clear the components.

    Parameters
    ----------
    audio : Any
        Current value of the audio component (ignored).
    transcribed : Any
        Current value of the text area (ignored).

    Returns
    -------
    tuple[None, None]
        Cleared values for ``(audio, transcribed)``.
    """
    return None, None


# UI layout: audio input with Clear/Transcribe buttons on the left,
# transcription text area on the right.
with gr.Blocks() as demo:
    gr.HTML(value="<h1>Transcribe Audio to Text Demo</h1>")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(sources=["upload"], streaming=False, label="wav")
            with gr.Row():
                clear_button = gr.Button(value="Clear", variant="huggingface")
                transcribe_button = gr.Button(value="Transcribe", variant="primary")
        output_box = gr.TextArea(label="Transcribed", lines=9)

    # Wire the buttons: Transcribe runs ASR on the upload; Clear resets both widgets.
    transcribe_button.click(fn=transcribe, inputs=audio_input, outputs=output_box)
    clear_button.click(
        fn=clear,
        inputs=[audio_input, output_box],
        outputs=[audio_input, output_box],
    )

demo.launch(share=True)