import datetime
import json
import time

import gradio as gr
import pytz
from huggingface_hub import snapshot_download
from omegaconf import OmegaConf
from vosk import KaldiRecognizer, Model

# Confidence thresholds for accepting a wake phrase.
THRESHOLD_G = 0.5   # general threshold (second word; first word of "EVAS GO"/"EVAS STOP")
THRESHOLD_H = 0.34  # relaxed threshold for the first word of "HI ..."/"HEY ..." phrases

# Flush the recognizer if no wake phrase was accepted within this many seconds,
# so stale audio does not accumulate across a long silent stream.
RECOGNIZER_RESET_SECONDS = 10


def load_vosk(model_id: str) -> Model:
    """Download a Vosk model snapshot from the Hugging Face Hub and load it.

    Args:
        model_id: Hub repository id of the Vosk model.

    Returns:
        A loaded ``vosk.Model`` backed by the downloaded snapshot directory.
    """
    model_dir = snapshot_download(model_id)
    return Model(model_path=model_dir)


# Allow "${load_vosk:...}" interpolations inside configs/models.yaml so each
# configured entry resolves to a ready-to-use Model instance.
OmegaConf.register_new_resolver("load_vosk", load_vosk)

models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))


def automatic_speech_recognition(model_id: str, stream, new_chunk):
    """Process one streamed audio chunk and scan it for wake phrases.

    Args:
        model_id: Key into ``models_config`` selecting the Vosk model.
        stream: Per-session state ``(rec, results, start_time)`` from the
            previous call, or ``None`` on the first chunk.
        new_chunk: ``(sample_rate, audio_array)`` tuple as produced by a
            streaming ``gr.Audio`` component with ``type="numpy"``.

    Returns:
        ``((rec, results, start_time), output_text)`` — the updated session
        state and the newline-joined log of detected wake phrases.
    """
    model = models_config[model_id]["model"]
    sample_rate, audio_array = new_chunk

    # Down-mix stereo to mono by keeping the first channel only.
    if audio_array.ndim == 2:
        audio_array = audio_array[:, 0]
    audio_bytes = audio_array.tobytes()

    if stream is None:
        # First chunk of a new session: create a fresh recognizer.
        rec = KaldiRecognizer(model, sample_rate)
        rec.SetWords(True)
        rec.SetMaxAlternatives(0)
        results = []
        start_time = time.time()
    else:
        rec, results, start_time = stream

    # Periodic flush: discard whatever partial utterance has built up.
    if time.time() - start_time > RECOGNIZER_RESET_SECONDS:
        rec.FinalResult()
        start_time = time.time()

    if rec.AcceptWaveform(audio_bytes):
        result = json.loads(rec.Result())
        text_result = result["text"]
        # BUG FIX: the original tested result["text"] twice in the same
        # condition; a single truthiness check is equivalent.
        if text_result:
            for phrase in ("HEY EVAS", "HI EVAS", "EVAS GO", "EVAS STOP"):
                if phrase not in text_result:
                    continue
                print(result)
                words = result.get("result", [])
                # BUG FIX: guard the [0]/[1] indexing — the original raised
                # IndexError when fewer than two word entries came back.
                if len(words) < 2:
                    continue
                # NOTE(review): confidences are taken from the first two words
                # of the whole utterance, not necessarily the wake-phrase
                # tokens themselves — confirm this is intended.
                conf_first = round(words[0]["conf"], 3)
                conf_second = round(words[1]["conf"], 3)
                # "HI"/"HEY" openers get the relaxed first-word threshold.
                first_threshold = (
                    THRESHOLD_H if ("HI" in phrase or "HEY" in phrase) else THRESHOLD_G
                )
                if conf_first > first_threshold and conf_second > THRESHOLD_G:
                    timestamp = datetime.datetime.now(
                        pytz.timezone("Asia/Taipei")
                    ).strftime("%H:%M:%S")
                    results.append(timestamp + " " + text_result)
                    # Accepted: flush the recognizer and restart the timer.
                    rec.FinalResult()
                    start_time = time.time()
                    break

    # "".join of an empty list is already "" — no length check needed.
    output_text = "\n".join(results)
    return (rec, results, start_time), output_text


demo = gr.Blocks(
    title="Wakeword Demo",
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=gr.themes.Default(
        font=(
            "tauhu-oo",
            gr.themes.GoogleFont("Source Sans Pro"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        )
    ),
)

with demo:
    default_model_id = list(models_config.keys())[0]
    model_drop_down = gr.Dropdown(
        models_config.keys(),
        value=default_model_id,
        label="模型",
    )
    gr.Markdown(
        """
        # Wakeword Demo
        """
    )
    # Session state carried between streamed chunks: (rec, results, start_time).
    state = gr.State()
    audio = gr.Audio(
        label="錄音",
        type="numpy",
        format="wav",
        waveform_options=gr.WaveformOptions(
            sample_rate=16000,
        ),
        sources=["microphone"],
        streaming=True,
    )
    gr.Interface(
        automatic_speech_recognition,
        inputs=[
            model_drop_down,
            state,
            audio,
        ],
        outputs=[
            state,
            gr.Text(interactive=False, label="輸出"),
        ],
        live=True,
        stream_every=0.25,
        clear_btn=None,
    )

# BUG FIX: the original called demo.launch() three times in a row; the second
# and third calls are redundant (and can fail on an already-bound port).
demo.launch()