| | import datetime |
| | import json |
| | import time |
| | import pytz |
| |
|
| | import gradio as gr |
| | from huggingface_hub import snapshot_download |
| | from omegaconf import OmegaConf |
| | from vosk import KaldiRecognizer, Model |
| |
|
# Minimum per-word Vosk confidence required to accept a detected wake phrase.
# THRESHOLD_H is the relaxed threshold applied to the leading "HEY"/"HI" word;
# THRESHOLD_G is the general threshold applied to the other word(s).
THRESHOLD_G = 0.5
THRESHOLD_H = 0.34
| |
|
| |
|
def load_vosk(model_id: str):
    """Fetch a Vosk model from the Hugging Face Hub and instantiate it.

    Args:
        model_id: Hub repository id of the Vosk model to download.

    Returns:
        A loaded ``vosk.Model`` backed by the downloaded snapshot.
    """
    local_dir = snapshot_download(model_id)
    return Model(model_path=local_dir)
| |
|
| |
|
# Make "${load_vosk:...}" interpolations in the YAML config download and
# instantiate a Vosk model while the config is being resolved.
OmegaConf.register_new_resolver("load_vosk", load_vosk)

# Mapping of model id -> config dict; each entry carries a loaded model under
# the "model" key (see its use in automatic_speech_recognition), produced by
# the "load_vosk" resolver during this load.
models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
| |
|
| |
|
def _phrase_confidences(word_results, phrase):
    """Return the rounded confidences of *phrase*'s words within *word_results*.

    *word_results* is the ``"result"`` list of a Vosk recognizer result
    (dicts with at least ``"word"`` and ``"conf"`` keys).  The first
    consecutive run of words matching *phrase* is located and its
    confidences returned; if the phrase only matched as a raw substring
    (no aligned word run), fall back to the first ``len(phrase.split())``
    entries, mirroring the original first-two-words behavior.
    """
    tokens = phrase.split()
    words = [entry["word"].upper() for entry in word_results]
    for start in range(len(words) - len(tokens) + 1):
        if words[start:start + len(tokens)] == tokens:
            return tuple(
                round(word_results[start + offset]["conf"], 3)
                for offset in range(len(tokens))
            )
    # No aligned run found — keep the historical fallback.
    return tuple(round(entry["conf"], 3) for entry in word_results[: len(tokens)])


def automatic_speech_recognition(model_id: str, stream, new_chunk):
    """Run streaming wake-word detection over one microphone chunk.

    Args:
        model_id: Key into ``models_config`` selecting which Vosk model to use.
        stream: Per-session state from the previous call — a
            ``(recognizer, results, start_time)`` tuple — or ``None`` on the
            first chunk of a session.
        new_chunk: ``(sample_rate, audio_array)`` pair as produced by
            ``gr.Audio(type="numpy", streaming=True)``.

    Returns:
        ``((recognizer, results, start_time), output_text)`` where
        ``output_text`` is the newline-joined log of timestamped detections
        accumulated so far.
    """
    model = models_config[model_id]["model"]

    sample_rate, audio_array = new_chunk
    # Down-mix stereo to the first channel; the recognizer is fed mono PCM.
    if audio_array.ndim == 2:
        audio_array = audio_array[:, 0]

    audio_bytes = audio_array.tobytes()

    if stream is None:
        # First chunk of a session: build a fresh recognizer and state.
        rec = KaldiRecognizer(model, sample_rate)
        rec.SetWords(True)  # per-word confidences are needed below
        rec.SetMaxAlternatives(0)
        results = []
        start_time = time.time()
    else:
        rec, results, start_time = stream

    # Flush the recognizer every 10 s so a long utterance/silence cannot
    # grow its internal buffer without bound.
    if time.time() - start_time > 10:
        rec.FinalResult()  # return value intentionally discarded: reset only
        start_time = time.time()

    if rec.AcceptWaveform(audio_bytes):
        result = json.loads(rec.Result())
        text_result = result["text"]
        if text_result not in ("", "<SIL>"):
            for phrase in ("HEY EVAS", "HI EVAS", "EVAS GO", "EVAS STOP"):
                if phrase not in text_result:
                    continue
                print(result)

                # Check the confidences of the matched phrase's own words
                # (the phrase may appear anywhere in the utterance, not just
                # at the start).
                conf_result = _phrase_confidences(result["result"], phrase)
                # "HEY"/"HI" are short words that decode less reliably, so
                # the leading word gets the relaxed threshold.
                first_threshold = (
                    THRESHOLD_H if "HI" in phrase or "HEY" in phrase else THRESHOLD_G
                )
                if (
                    len(conf_result) >= 2
                    and conf_result[0] > first_threshold
                    and conf_result[1] > THRESHOLD_G
                ):
                    timestamp = datetime.datetime.now(
                        pytz.timezone("Asia/Taipei")
                    ).strftime("%H:%M:%S")
                    results.append(timestamp + " " + text_result)
                    rec.FinalResult()  # reset so the phrase is not re-detected
                    start_time = time.time()
                    break

    # "\n".join of an empty list is already "", so no special-casing needed.
    output_text = "\n".join(results)

    return (rec, results, start_time), output_text
| |
|
| |
|
# Top-level Gradio app shell: default theme with a custom font stack, the
# "tauhu-oo" face being loaded via the imported CSS from tauhu.tw.
demo = gr.Blocks(
    title="Wakeword Demo",
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=gr.themes.Default(
        font=(
            "tauhu-oo",
            gr.themes.GoogleFont("Source Sans Pro"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        )
    ),
)
| |
|
with demo:
    # Default to the first model declared in configs/models.yaml.
    default_model_id = list(models_config.keys())[0]
    # Model selector (label is Chinese for "model").
    model_drop_down = gr.Dropdown(
        models_config.keys(),
        value=default_model_id,
        label="模型",
    )

    gr.Markdown(
        """
        # Wakeword Demo
        """
    )
    # Streaming state threaded through automatic_speech_recognition:
    # (recognizer, results, start_time), or None before the first chunk.
    state = gr.State()
    # Microphone capture (label is Chinese for "record"); chunks arrive as
    # (sample_rate, numpy array) pairs.
    audio = gr.Audio(
        label="錄音",
        type="numpy",
        format="wav",
        waveform_options=gr.WaveformOptions(
            # Capture rate forwarded with each chunk; the recognizer is
            # constructed from it in automatic_speech_recognition.
            sample_rate=16000,
        ),
        sources=["microphone"],
        streaming=True,
    )
    # Live interface: state is both an input and an output so recognizer
    # state survives between chunks. Output label is Chinese for "output".
    gr.Interface(
        automatic_speech_recognition,
        inputs=[
            model_drop_down,
            state,
            audio,
        ],
        outputs=[
            state,
            gr.Text(interactive=False, label="輸出"),
        ],
        live=True,
        stream_every=0.25,  # push a chunk to the recognizer every 250 ms
        clear_btn=None,
    )
| |
|
| |
|
# Launch exactly once: launch() blocks while serving, so the duplicated
# calls that used to follow were dead code (and relaunching the same app
# would be a bug in any case).
demo.launch()
| |
|