| import gradio as gr |
| from transformers import pipeline |
| import numpy as np |
|
|
# Hugging Face model id for the ASR checkpoint (distilled English-only Whisper).
asr_model = "distil-whisper/distil-medium.en"

# Build the speech-recognition pipeline once at import time so every
# transcribe() call reuses the already-loaded weights instead of reloading.
asr_pipe = pipeline("automatic-speech-recognition", model=asr_model)
|
|
def transcribe(stream, new_chunk):
    """Accumulate streamed microphone audio and return a running transcript.

    Parameters
    ----------
    stream : np.ndarray | None
        Audio accumulated from previous chunks (Gradio "state" input);
        None on the first chunk of a session.
    new_chunk : tuple[int, np.ndarray]
        ``(sampling_rate, samples)`` as delivered by ``gr.Audio(streaming=True)``.

    Returns
    -------
    tuple[np.ndarray, str]
        The updated accumulated stream (fed back into state) and the
        transcription of the *entire* stream so far.
    """
    sr, y = new_chunk
    y = y.astype(np.float32)

    # Peak-normalize to [-1, 1], but guard against an all-zero (silent)
    # chunk: dividing by a 0.0 peak would fill the stream with NaN/inf
    # and corrupt every subsequent transcription.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # Append the new chunk so the model always hears the full utterance.
    stream = y if stream is None else np.concatenate([stream, y])

    return stream, asr_pipe({"sampling_rate": sr, "raw": stream})["text"]
|
|
# NOTE(review): the original code built an *empty* gr.Blocks() and launched it,
# so the Interface below was constructed but never rendered — the app served a
# blank page. Launch the Interface directly instead; keep the name `demo` so
# external tooling (e.g. Hugging Face Spaces auto-discovery) still finds it.
#
# Legacy Gradio 2/3 kwargs (`layout="horizontal"`, string `theme="huggingface"`)
# were dropped: they are not part of the Gradio 4 API this file already uses
# (`sources=["microphone"]` is 4.x syntax).
mic = gr.Interface(
    fn=transcribe,
    # "state" carries the accumulated audio between streaming callbacks.
    inputs=["state", gr.Audio(sources=["microphone"], streaming=True)],
    outputs=["state", "text"],
    title="Whisper & BERT demo - Intent Classification",
    description=(
        "Transcribe audio inputs with Whisper ASR model and detect intention from the text. Use BERT NLP model to classify the intention as one of the commands to command a light."
    ),
    allow_flagging="never",
    # live=True re-runs transcribe on every streamed chunk.
    live=True,
)

demo = mic

if __name__ == "__main__":
    demo.launch()
|
|