import gradio as gr
from transformers import pipeline
import numpy as np
from ner import perform_ner
from intent import perform_intent_classification

# Whisper ASR pipeline used for streaming transcription.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

def transcribe(stream, new_chunk):
    sentence_buffer = ""
    sr, y = new_chunk
    # Convert the chunk to float32 and normalise to [-1, 1], guarding against silent chunks.
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # Accumulate audio across chunks in the Gradio state.
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    # Transcribe the accumulated audio once and reuse the result.
    transcription = transcriber({"sampling_rate": sr, "raw": stream})["text"]
    print(transcription)

    # Treat "." or "?" as a sentence boundary worth analysing.
    sentence_boundary = "." in transcription or "?" in transcription

    ner_result = None
    intent_result = None
    if sentence_boundary:
        # Text up to the last boundary character is the sentence; the remainder is buffered.
        boundary_index = max(transcription.rfind("."), transcription.rfind("?"))
        sentence = sentence_buffer + transcription[:boundary_index + 1]
        print("Sentence Buffer:", sentence_buffer)
        print("Sentence:", sentence)
        ner_result = perform_ner(sentence)
        intent_result = perform_intent_classification(sentence)
        print("NER Result (sentence):", ner_result)
        print("Intent Result (sentence):", intent_result)
        sentence_buffer = transcription[boundary_index + 1:]

    return stream, transcription, ner_result, intent_result

demo = gr.Interface(
    transcribe,
    ["state", gr.Audio(sources=["microphone"], streaming=True)],
    ["state", gr.Text(label="Transcribe"), gr.Text(label="NER"), gr.Text(label="Intent")],
    live=True,
)
demo.launch(share=True)