"""Gradio front-end wiring for the VoiceDirector live speech-reply demo.

The module builds the Gradio ``Blocks`` app (``build_app``) and defines the
event handlers and Markdown/status formatters it uses. Handlers that are
decorated with ``@spaces.GPU`` are module-level functions bound to their
dependencies with ``functools.partial`` inside ``build_app``.
"""

import os
from datetime import datetime, timezone
from functools import partial
from pathlib import Path

try:
    import spaces  # pyright: ignore[reportMissingImports]
except ImportError:  # pragma: no cover - local fallback when not running on HF Spaces.

    class _SpacesShim:
        """Stand-in for the ``spaces`` package whose ``GPU`` decorator is a no-op."""

        @staticmethod
        def GPU(*decorator_args, **decorator_kwargs):
            """Return the decorated function unchanged.

            Supports both the bare form (``@spaces.GPU``) and the called form
            (``@spaces.GPU(...)``).
            """
            # Bare usage: the only positional argument is the function itself.
            if decorator_args and callable(decorator_args[0]) and len(decorator_args) == 1 and not decorator_kwargs:
                return decorator_args[0]

            def _decorator(func):
                return func

            return _decorator

    spaces = _SpacesShim()

from benchmarks.harness import (
    DEFAULT_STAGE1_SMOKE_SCRIPT_IDS,
    evaluate_internal_benchmark,
    evaluate_stage1_integration_smoke,
)
from core.adapters import (
    AVAILABLE_EDGE_TTS_VOICES,
    DEFAULT_EDGE_TTS_VOICE,
    EdgeTTSAdapter,
    FasterWhisperAdapter,
    FunASRSenseVoiceAdapter,
    MockASRAdapter,
    MockTTSAdapter,
)
from core.pipeline import (
    AUTO_REPLY_DELAY_SECONDS,
    create_streaming_turn_state,
    maybe_auto_reply_streaming_turn,
    preview_streaming_turn,
    run_controlled_turn,
    run_streaming_turn,
    stop_streaming_turn_state,
)
from utils.config_loader import (
    DEFAULT_BENCHMARK_CSV_PATH,
    DEFAULT_EVALUATION_CONTRACT_PATH,
    DEFAULT_RULES_PATH,
    TABLE_HEADERS,
    load_evaluation_contract,
    load_rule_catalog,
    rules_from_editor_rows,
    rules_to_table_rows,
)
from utils.tracking import LocalJsonlTracker, RunMetadata, make_run_id, resolve_code_version

MINIMAL_UI_NOTICE = "当前为演示版,优先展示实时接话体验。首次加载模型可能稍慢;麦克风不可用时可改用下方兜底输入。"


def _is_zero_gpu_runtime() -> bool:
    """Return True when the ``SPACES_ZERO_GPU`` environment variable is truthy.

    Accepted values (case-insensitive, surrounding whitespace ignored) are
    ``1``, ``t`` and ``true``.
    """
    return os.getenv("SPACES_ZERO_GPU", "").strip().lower() in {"1", "t", "true"}


def _build_stream_preview_asr_adapters():
    """Build the ASR adapters used by the live-preview (streaming) callback.

    Returns:
        A dict keyed by backend name (``"funasr"``, ``"faster-whisper"``).
        Under ZeroGPU the adapters are constructed with CPU-only device
        orders; otherwise the adapters' own defaults are used.
    """
    # ZeroGPU forbids CUDA init from the main process, so preview callbacks must
    # stay on CPU when they are not wrapped by @spaces.GPU.
    if _is_zero_gpu_runtime():
        return {
            "funasr": FunASRSenseVoiceAdapter(device_order=("cpu",)),
            "faster-whisper": FasterWhisperAdapter(device_order=(("cpu", "int8"),)),
        }
    return {
        "funasr": FunASRSenseVoiceAdapter(),
        "faster-whisper": FasterWhisperAdapter(),
    }


def _auto_reply_played_for_current_match(state) -> bool:
    """Return True when the stored auto-reply audio belongs to the current match.

    Requires a non-empty ``current_match_key``: two empty keys compare equal,
    which must not be reported as "reply already played".
    """
    return bool(
        state.current_match_key
        and state.auto_reply_key == state.current_match_key
        and state.auto_reply_audio_path
    )


@spaces.GPU
def _process_turn_for_space(
    audio_input,
    manual_text,
    asr_backend_key,
    tts_backend_key,
    tts_voice,
    rule_rows,
    *,
    tracker,
    manual_asr,
    asr_adapters,
    tts_adapters,
    code_version,
):
    """Handle the upload / manual-text fallback button.

    Args:
        audio_input: Uploaded audio file path, or a falsy value when absent.
        manual_text: Text typed by the user, used when there is no audio.
        asr_backend_key: Key into ``asr_adapters``.
        tts_backend_key: Key into ``tts_adapters``.
        tts_voice: Voice name forwarded to the TTS adapter.
        rule_rows: Rule-editor table rows for the current session.
        tracker, manual_asr, asr_adapters, tts_adapters, code_version:
            Dependencies bound by ``build_app`` via ``functools.partial``.

    Returns:
        An 8-tuple matching the UI outputs: transcript, partial transcript,
        matched keyword, matched rule id, reply text, TTS status text, reply
        audio path (or ``None``) and the Markdown run summary. On failure the
        text fields are blanked and the status carries the error message.
    """
    audio_path = Path(audio_input) if audio_input else None
    hypothesis_ids = ["H-001", "H-002", "H-003"] if audio_path else ["H-002", "H-004"]
    # Anything other than faster-whisper (including no backend) is logged as R-001.
    route_id = "R-002" if asr_backend_key == "faster-whisper" else "R-001"
    run_id = make_run_id(f"stage1-turn-{asr_backend_key or 'manual'}")
    metadata = RunMetadata(
        run_id=run_id,
        experiment_name="stage1-controlled-turn",
        hypothesis_ids=hypothesis_ids,
        baseline_run_id=None,
        route_id=route_id,
        seed=42,
        code_version=code_version,
        data_version=str(audio_path) if audio_path else "ui-manual-text",
        started_at=datetime.now(timezone.utc).isoformat(),
        status="running",
    )
    try:
        # Rule parsing sits inside the try so an invalid rule table is reported
        # in the UI like any other processing failure instead of raising.
        session_rules = rules_from_editor_rows(rule_rows)
        outcome = run_controlled_turn(
            run_id=run_id,
            audio_path=audio_path,
            manual_text=manual_text,
            rules=session_rules,
            audio_asr=asr_adapters.get(asr_backend_key),
            manual_asr=manual_asr,
            tts_adapter=tts_adapters.get(tts_backend_key),
            tts_voice=tts_voice,
            tracker=tracker,
            metadata=metadata,
        )
    except Exception as exc:  # UI boundary: surface the failure as status text.
        return (
            "",
            "",
            "未命中",
            "未命中",
            "",
            f"处理失败:{exc}",
            None,
            _format_run_summary(
                run_id=run_id,
                input_mode="audio" if audio_path else "manual_text",
                asr_backend=asr_backend_key,
                asr_latency_ms=None,
                rule_id=None,
                matched_keyword=None,
                tts_status="失败",
                runtime_note=None,
            ),
        )

    match = outcome.match_result
    tts_preview = outcome.tts_preview
    reply_text_value = match.reply or ("当前未命中规则。" if not match.matched else "")
    tts_status_text = _format_tts_status(tts_preview, match.matched, finalized=True)
    summary = _format_run_summary(
        run_id=outcome.run_id,
        input_mode=outcome.input_mode,
        asr_backend=outcome.asr_result.backend,
        asr_latency_ms=outcome.asr_result.latency_ms,
        rule_id=match.rule_id,
        matched_keyword=match.matched_keyword,
        tts_status=tts_status_text,
        runtime_note=outcome.asr_result.runtime_note,
    )
    return (
        outcome.asr_result.transcript,
        outcome.asr_result.partial_transcript,
        match.matched_keyword or "未命中",
        match.rule_id or "未命中",
        reply_text_value,
        tts_status_text,
        tts_preview.audio_path if tts_preview is not None else None,
        summary,
    )


def _preview_live_microphone_stream_for_space(
    audio_chunk,
    stream_state,
    asr_backend_key,
    rule_rows,
    *,
    asr_adapters,
):
    """Handle one streamed microphone chunk and refresh the live preview.

    Args:
        audio_chunk: The chunk delivered by the streaming audio component.
        stream_state: Current streaming turn state (may be falsy).
        asr_backend_key: Key into ``asr_adapters``.
        rule_rows: Rule-editor table rows for the current session.
        asr_adapters: Preview ASR adapters bound by ``build_app``.

    Returns:
        A 9-tuple: updated state, transcript, partial transcript, matched
        keyword, matched rule id, reply preview text, status text, auto-reply
        audio path and the Markdown preview summary. On failure the previous
        (or a fresh recording) state is echoed back with the error message.
    """
    try:
        # Rule parsing and adapter lookup are inside the try so that a bad rule
        # table or unknown backend shows up as status text on each tick.
        session_rules = rules_from_editor_rows(rule_rows)
        updated_state, asr_result, match = preview_streaming_turn(
            state=stream_state,
            audio_chunk=audio_chunk,
            rules=session_rules,
            audio_asr=asr_adapters[asr_backend_key],
        )
    except Exception as exc:  # UI boundary: keep the stream alive and report.
        safe_state = stream_state or create_streaming_turn_state(recording_active=True)
        return (
            safe_state,
            safe_state.transcript,
            safe_state.partial_transcript,
            safe_state.matched_keyword or "未命中",
            safe_state.matched_rule_id or "未命中",
            safe_state.matched_reply_text or "",
            f"实时识别失败:{exc}",
            safe_state.auto_reply_audio_path,
            _format_stream_preview_summary(state=safe_state, error=str(exc)),
        )

    reply_preview = match.reply or ("当前尚未命中规则。" if updated_state.transcript else "")
    return (
        updated_state,
        updated_state.transcript,
        updated_state.partial_transcript,
        match.matched_keyword or "未命中",
        match.rule_id or "未命中",
        reply_preview,
        _format_live_stream_status(updated_state, asr_result=asr_result, finalized=False),
        updated_state.auto_reply_audio_path,
        _format_stream_preview_summary(state=updated_state),
    )


def _auto_reply_live_microphone_stream_for_space(
    stream_state,
    tts_backend_key,
    tts_voice,
    *,
    tts_adapters,
):
    """Timer tick handler that lets the pipeline emit an automatic spoken reply.

    Args:
        stream_state: Current streaming turn state (may be falsy).
        tts_backend_key: Key into ``tts_adapters``.
        tts_voice: Voice name forwarded to the TTS adapter.
        tts_adapters: TTS adapters bound by ``build_app``.

    Returns:
        A 4-tuple: updated state, status text, auto-reply audio path and the
        Markdown preview summary. On failure the input state is returned
        unchanged together with the error message.
    """
    state = stream_state or create_streaming_turn_state()
    try:
        updated_state, tts_preview = maybe_auto_reply_streaming_turn(
            state=state,
            tts_adapter=tts_adapters.get(tts_backend_key),
            tts_voice=tts_voice,
        )
    except Exception as exc:  # UI boundary: report instead of breaking the timer.
        return (
            state,
            f"自动播报失败:{exc}",
            state.auto_reply_audio_path,
            _format_stream_preview_summary(state=state, error=str(exc)),
        )

    # A non-None preview means a reply was synthesized on this tick.
    triggered = tts_preview is not None
    return (
        updated_state,
        _format_live_stream_status(updated_state, finalized=False, auto_reply_triggered=triggered),
        updated_state.auto_reply_audio_path,
        _format_stream_preview_summary(state=updated_state, auto_reply_triggered=triggered),
    )


@spaces.GPU
def _finalize_live_microphone_stream_for_space(
    stream_state,
    asr_backend_key,
    tts_backend_key,
    tts_voice,
    rule_rows,
    *,
    tracker,
    manual_asr,
    asr_adapters,
    tts_adapters,
    code_version,
):
    """Finish a live microphone turn when recording stops or pauses.

    Args:
        stream_state: Current streaming turn state (may be falsy).
        asr_backend_key: Key into ``asr_adapters``.
        tts_backend_key: Key into ``tts_adapters``.
        tts_voice: Voice name forwarded to the TTS adapter.
        rule_rows: Rule-editor table rows for the current session.
        tracker, manual_asr, asr_adapters, tts_adapters, code_version:
            Dependencies bound by ``build_app`` via ``functools.partial``.

    Returns:
        A 9-tuple: a fresh streaming state, transcript, partial transcript,
        matched keyword, matched rule id, reply text, status text, reply audio
        path (or ``None``) and the Markdown run summary. The first element is
        always a new state, including on failure.
    """
    safe_state = stop_streaming_turn_state(stream_state or create_streaming_turn_state())
    run_id = make_run_id(f"stage1-live-stream-{asr_backend_key or 'manual'}")
    metadata = RunMetadata(
        run_id=run_id,
        experiment_name="stage1-live-stream-turn",
        hypothesis_ids=["H-001", "H-002", "H-003"],
        baseline_run_id=None,
        route_id="R-001" if asr_backend_key == "funasr" else "R-002",
        seed=42,
        code_version=code_version,
        data_version="ui-live-microphone-stream",
        started_at=datetime.now(timezone.utc).isoformat(),
        status="running",
    )
    # If the auto-reply for the current match was already synthesized, reuse
    # that audio and pass no TTS adapter to the final run.
    reuse_auto_reply = _auto_reply_played_for_current_match(safe_state)
    try:
        # Rule parsing and adapter lookup are inside the try so failures take
        # the same on-screen error path as pipeline failures.
        session_rules = rules_from_editor_rows(rule_rows)
        outcome = run_streaming_turn(
            run_id=run_id,
            state=safe_state,
            rules=session_rules,
            audio_asr=asr_adapters[asr_backend_key],
            manual_asr=manual_asr,
            tts_adapter=None if reuse_auto_reply else tts_adapters.get(tts_backend_key),
            tts_voice=tts_voice,
            tracker=tracker,
            metadata=metadata,
        )
    except Exception as exc:  # UI boundary: reset the state and report.
        return (
            create_streaming_turn_state(),
            safe_state.transcript,
            safe_state.partial_transcript,
            safe_state.matched_keyword or "未命中",
            safe_state.matched_rule_id or "未命中",
            safe_state.matched_reply_text or "",
            f"结束录音时处理失败:{exc}",
            safe_state.auto_reply_audio_path,
            _format_stream_preview_summary(state=safe_state, error=str(exc), finalized=True),
        )

    match = outcome.match_result
    tts_preview = outcome.tts_preview
    reply_text_value = safe_state.auto_reply_text or match.reply or ("当前未命中规则。" if not match.matched else "")
    tts_status_text = _format_live_stream_status(
        safe_state,
        asr_result=outcome.asr_result,
        finalized=True,
        auto_reply_triggered=reuse_auto_reply or tts_preview is not None,
    )
    summary = _format_run_summary(
        run_id=outcome.run_id,
        input_mode=outcome.input_mode,
        asr_backend=outcome.asr_result.backend,
        asr_latency_ms=outcome.asr_result.latency_ms,
        rule_id=match.rule_id,
        matched_keyword=match.matched_keyword,
        tts_status=tts_status_text,
        runtime_note=outcome.asr_result.runtime_note,
    )
    if reuse_auto_reply:
        reply_audio_path = safe_state.auto_reply_audio_path
    else:
        reply_audio_path = tts_preview.audio_path if tts_preview is not None else None
    return (
        create_streaming_turn_state(),
        outcome.asr_result.transcript,
        outcome.asr_result.partial_transcript,
        match.matched_keyword or "未命中",
        match.rule_id or "未命中",
        reply_text_value,
        tts_status_text,
        reply_audio_path,
        summary,
    )


@spaces.GPU
def _run_stage1_smoke_for_space(
    asr_backend_key,
    tts_backend_key,
    tts_voice,
    rule_rows,
    script_ids_text,
    *,
    benchmark_csv_path,
    tracker,
    asr_adapters,
    tts_adapters,
    code_version,
):
    """Run the stage-1 synthetic-audio smoke evaluation and return its Markdown.

    Args:
        asr_backend_key: Key into ``asr_adapters`` (must exist).
        tts_backend_key: Key into ``tts_adapters`` (must exist).
        tts_voice: Voice used for both the synthesized input and the reply.
        rule_rows: Rule-editor table rows for the current session.
        script_ids_text: Comma/newline separated script ids; blank selects
            the default smoke scripts.
        benchmark_csv_path, tracker, asr_adapters, tts_adapters, code_version:
            Dependencies bound by ``build_app`` via ``functools.partial``.

    Returns:
        The evaluation summary rendered with ``to_markdown()``.

    Raises:
        KeyError: If either backend key is not present in its adapter dict.
    """
    session_rules = rules_from_editor_rows(rule_rows)
    selected_script_ids = _parse_script_ids(script_ids_text)
    run_id = make_run_id(f"stage1-smoke-{asr_backend_key}")
    route_id = "R-001" if asr_backend_key == "funasr" else "R-002"
    metadata = RunMetadata(
        run_id=run_id,
        experiment_name="stage1-synthetic-audio-smoke",
        hypothesis_ids=["H-001", "H-002", "H-003", "H-005"],
        baseline_run_id=None,
        route_id=route_id,
        seed=42,
        code_version=code_version,
        data_version=f"{benchmark_csv_path}#synthetic-audio-smoke",
        started_at=datetime.now(timezone.utc).isoformat(),
        status="running",
    )
    # The same TTS adapter is passed twice, in the two positional adapter slots
    # that follow the ASR adapter.
    tts_adapter = tts_adapters[tts_backend_key]
    summary = evaluate_stage1_integration_smoke(
        benchmark_csv_path,
        session_rules,
        asr_adapters[asr_backend_key],
        tts_adapter,
        tts_adapter,
        script_ids=selected_script_ids,
        tracker=tracker,
        metadata=metadata,
        input_voice=tts_voice,
        reply_voice=tts_voice,
    )
    return summary.to_markdown()


def build_app(
    benchmark_csv_path: Path = DEFAULT_BENCHMARK_CSV_PATH,
    rules_path: Path = DEFAULT_RULES_PATH,
    evaluation_contract_path: Path = DEFAULT_EVALUATION_CONTRACT_PATH,
):
    """Construct the Gradio ``Blocks`` application.

    Args:
        benchmark_csv_path: Benchmark CSV used by the preview and smoke panels.
        rules_path: Rule catalog loaded as the initial rule table.
        evaluation_contract_path: Evaluation contract rendered in the UI.

    Returns:
        The configured ``gr.Blocks`` instance (not launched).

    Raises:
        RuntimeError: If Gradio is not installed.
    """
    gr = _require_gradio()
    seed_rules = load_rule_catalog(rules_path)
    evaluation_contract = load_evaluation_contract(evaluation_contract_path)
    tracker = LocalJsonlTracker(Path(__file__).resolve().parents[1] / "results" / "tracking")
    manual_asr = MockASRAdapter()
    asr_adapters = {
        "funasr": FunASRSenseVoiceAdapter(),
        "faster-whisper": FasterWhisperAdapter(),
    }
    # Keep preview ASR instances separate so a ZeroGPU live-preview CPU fallback
    # does not pin the finalize/upload paths to CPU as well.
    stream_preview_asr_adapters = _build_stream_preview_asr_adapters()
    tts_adapters = {
        "edge-tts": EdgeTTSAdapter(),
        "mock": MockTTSAdapter(),
    }
    code_version = resolve_code_version(Path(__file__).resolve().parents[1])

    def apply_rules(rule_rows):
        """Normalize the edited rule table and report how many rules are active."""
        session_rules = rules_from_editor_rows(rule_rows)
        return rules_to_table_rows(session_rules), _format_rule_status(session_rules)

    def start_live_microphone_stream():
        """Activate the auto-reply timer and reset all live outputs."""
        return (
            gr.Timer(value=0.25, active=True),
            create_streaming_turn_state(recording_active=True),
            "",
            "",
            "未命中",
            "未命中",
            "",
            f"实时监听中,命中规则后稳定 {int(AUTO_REPLY_DELAY_SECONDS)} 秒将自动播放回复语音。",
            None,
            _initial_live_stream_placeholder(),
        )

    process_turn = partial(
        _process_turn_for_space,
        tracker=tracker,
        manual_asr=manual_asr,
        asr_adapters=asr_adapters,
        tts_adapters=tts_adapters,
        code_version=code_version,
    )
    preview_live_stream = partial(
        _preview_live_microphone_stream_for_space,
        asr_adapters=stream_preview_asr_adapters,
    )
    auto_reply_live_stream = partial(
        _auto_reply_live_microphone_stream_for_space,
        tts_adapters=tts_adapters,
    )
    finalize_live_stream = partial(
        _finalize_live_microphone_stream_for_space,
        tracker=tracker,
        manual_asr=manual_asr,
        asr_adapters=asr_adapters,
        tts_adapters=tts_adapters,
        code_version=code_version,
    )

    def stop_live_microphone_stream(stream_state, asr_backend_key, tts_backend_key, tts_voice, rule_rows):
        """Finalize the live turn and deactivate the auto-reply timer."""
        result = finalize_live_stream(stream_state, asr_backend_key, tts_backend_key, tts_voice, rule_rows)
        return (gr.Timer(value=0.25, active=False), *result)

    def preview_benchmark(rule_rows):
        """Run the rule-only internal benchmark and return its Markdown summary."""
        session_rules = rules_from_editor_rows(rule_rows)
        run_id = make_run_id("stage1-rule-preview")
        metadata = RunMetadata(
            run_id=run_id,
            experiment_name="stage1-rule-only-benchmark-preview",
            hypothesis_ids=["H-002"],
            baseline_run_id=None,
            route_id="R-001",
            seed=42,
            code_version=code_version,
            data_version=str(benchmark_csv_path),
            started_at=datetime.now(timezone.utc).isoformat(),
            status="running",
        )
        summary = evaluate_internal_benchmark(
            benchmark_csv_path,
            session_rules,
            tracker=tracker,
            metadata=metadata,
        )
        return summary.to_markdown()

    run_stage1_smoke = partial(
        _run_stage1_smoke_for_space,
        benchmark_csv_path=benchmark_csv_path,
        tracker=tracker,
        asr_adapters=asr_adapters,
        tts_adapters=tts_adapters,
        code_version=code_version,
    )

    # NOTE(review): the nesting of the layout containers below was reconstructed
    # from whitespace-collapsed source - confirm against the intended layout.
    with gr.Blocks(title="VoiceDirector 语音场控接话演示") as app:
        live_stream_state = gr.State(value=create_streaming_turn_state())
        auto_reply_timer = gr.Timer(value=0.25, active=False)

        gr.Markdown("# VoiceDirector 语音场控接话演示")
        gr.Markdown("**实时语音接话演示**:连续麦克风转写、自动规则匹配、自动回复语音。")
        gr.Markdown(f"提示:{MINIMAL_UI_NOTICE}")

        with gr.Row():
            with gr.Column(scale=5):
                gr.Markdown(
                    f"### 主路径:麦克风连续流式识别\n点击麦克风开始说话,识别文本会持续刷新;命中规则稳定 {int(AUTO_REPLY_DELAY_SECONDS)} 秒后会自动播放回复语音。"
                )
                live_audio_input = gr.Audio(
                    label="实时麦克风输入(连续流式)",
                    sources=["microphone"],
                    type="numpy",
                    streaming=True,
                )
                asr_backend = gr.Dropdown(
                    choices=[
                        ("FunASR / SenseVoice(主路径)", "funasr"),
                        ("faster-whisper(兜底)", "faster-whisper"),
                    ],
                    value="funasr",
                    label="识别后端",
                )
                tts_backend = gr.Dropdown(
                    choices=[("edge-tts(真实语音)", "edge-tts"), ("mock(调试预览)", "mock")],
                    value="edge-tts",
                    label="回复语音后端",
                )
                tts_voice = gr.Dropdown(
                    choices=[(voice, voice) for voice in AVAILABLE_EDGE_TTS_VOICES],
                    value=DEFAULT_EDGE_TTS_VOICE,
                    label="回复音色",
                )
                with gr.Accordion("上传音频 / 手工输入兜底", open=False):
                    upload_audio_input = gr.Audio(
                        label="上传音频兜底",
                        sources=["upload"],
                        type="filepath",
                        format="wav",
                    )
                    manual_text = gr.Textbox(
                        label="手工文本兜底",
                        placeholder="没有音频时,可直接输入中文文本做规则演示。",
                        lines=3,
                    )
                    run_turn_button = gr.Button("执行上传/手工兜底", variant="secondary")
                recognized_text = gr.Textbox(label="识别文本")
                partial_text = gr.Textbox(label="实时转写预览")
                matched_keyword = gr.Textbox(label="命中关键词")
                matched_rule = gr.Textbox(label="命中规则")
                reply_text = gr.Textbox(label="回复文本", lines=3)
                tts_status = gr.Textbox(label="当前状态")
                reply_audio = gr.Audio(label="回复语音", interactive=False, type="filepath", autoplay=True)
                run_summary = gr.Markdown(value=_initial_live_stream_placeholder())

            with gr.Column(scale=7):
                rule_table = gr.Dataframe(
                    headers=TABLE_HEADERS,
                    datatype=["str", "str", "str", "str"],
                    row_count=(len(seed_rules), "fixed"),
                    column_count=(len(TABLE_HEADERS), "fixed"),
                    value=rules_to_table_rows(seed_rules),
                    label="规则编辑表(当前会话)",
                    interactive=True,
                )
                apply_button = gr.Button("应用规则编辑")
                rule_status = gr.Textbox(label="规则编辑状态", value=_format_rule_status(seed_rules))
                gr.Markdown(_format_contract(evaluation_contract))
                with gr.Accordion("基准与烟测面板", open=False):
                    gr.Markdown(
                        "下面的规则预览保留了内部基准可见性。端到端烟测会使用 edge-tts 生成输入音频来打通真实 ASR/TTS 代码路径。"
                    )
                    benchmark_button = gr.Button("预览规则基准")
                    benchmark_output = gr.Markdown(value=_initial_benchmark_placeholder(benchmark_csv_path))
                    script_ids = gr.Textbox(
                        label="烟测脚本 ID",
                        value=", ".join(DEFAULT_STAGE1_SMOKE_SCRIPT_IDS),
                        lines=2,
                    )
                    smoke_asr_backend = gr.Dropdown(
                        choices=[
                            ("FunASR / SenseVoice(主路径)", "funasr"),
                            ("faster-whisper(兜底)", "faster-whisper"),
                        ],
                        value="funasr",
                        label="烟测识别后端",
                    )
                    smoke_tts_backend = gr.Dropdown(
                        choices=[("edge-tts(真实语音)", "edge-tts"), ("mock(调试预览)", "mock")],
                        value="edge-tts",
                        label="烟测语音后端",
                    )
                    smoke_tts_voice = gr.Dropdown(
                        choices=[(voice, voice) for voice in AVAILABLE_EDGE_TTS_VOICES],
                        value=DEFAULT_EDGE_TTS_VOICE,
                        label="烟测音色",
                    )
                    stage1_smoke_button = gr.Button("执行端到端烟测")
                    stage1_smoke_output = gr.Markdown(value=_initial_stage1_smoke_placeholder(benchmark_csv_path))

        # The eight per-turn output components, in the order every turn handler
        # returns them. Fresh lists are built for each listener below.
        turn_outputs = (
            recognized_text,
            partial_text,
            matched_keyword,
            matched_rule,
            reply_text,
            tts_status,
            reply_audio,
            run_summary,
        )
        stop_inputs = (live_stream_state, asr_backend, tts_backend, tts_voice, rule_table)

        run_turn_button.click(
            process_turn,
            inputs=[upload_audio_input, manual_text, asr_backend, tts_backend, tts_voice, rule_table],
            outputs=[*turn_outputs],
        )
        live_audio_input.start_recording(
            start_live_microphone_stream,
            outputs=[auto_reply_timer, live_stream_state, *turn_outputs],
            queue=False,
            show_progress="hidden",
        )
        stream_event = live_audio_input.stream(
            preview_live_stream,
            inputs=[live_audio_input, live_stream_state, asr_backend, rule_table],
            outputs=[live_stream_state, *turn_outputs],
            show_progress="hidden",
            trigger_mode="always_last",
            concurrency_limit=1,
            concurrency_id="live-stream-session",
            stream_every=0.75,
        )
        auto_reply_event = auto_reply_timer.tick(
            auto_reply_live_stream,
            inputs=[live_stream_state, tts_backend, tts_voice],
            outputs=[live_stream_state, tts_status, reply_audio, run_summary],
            show_progress="hidden",
            trigger_mode="always_last",
            concurrency_limit=1,
            concurrency_id="live-stream-session",
        )
        # Stop and pause both finalize the turn and cancel the in-flight
        # preview / auto-reply events.
        live_audio_input.stop_recording(
            stop_live_microphone_stream,
            inputs=[*stop_inputs],
            outputs=[auto_reply_timer, live_stream_state, *turn_outputs],
            show_progress="minimal",
            cancels=[stream_event, auto_reply_event],
        )
        live_audio_input.pause_recording(
            stop_live_microphone_stream,
            inputs=[*stop_inputs],
            outputs=[auto_reply_timer, live_stream_state, *turn_outputs],
            show_progress="minimal",
            cancels=[stream_event, auto_reply_event],
        )
        apply_button.click(apply_rules, inputs=[rule_table], outputs=[rule_table, rule_status])
        benchmark_button.click(preview_benchmark, inputs=[rule_table], outputs=[benchmark_output])
        stage1_smoke_button.click(
            run_stage1_smoke,
            inputs=[smoke_asr_backend, smoke_tts_backend, smoke_tts_voice, rule_table, script_ids],
            outputs=[stage1_smoke_output],
        )

    return app


def _format_contract(contract: dict) -> str:
    """Render the fixed evaluation constraints from the contract as Markdown.

    Reads ``contract["h003_mos"]`` (``threshold``, ``scale_max``,
    ``minimum_raters``) and ``contract["evaluation_constraints"]``
    (``h004_formal_validation_phase``, ``h002_precision_scope``).

    Raises:
        KeyError: If any of those keys is missing.
    """
    h003 = contract["h003_mos"]
    notes = contract["evaluation_constraints"]
    return "\n".join(
        [
            "## 固定评测约束",
            f"- H-003 MOS 阈值:{h003['threshold']}/{h003['scale_max']}",
            f"- 最少评审人数:{h003['minimum_raters']}",
            f"- H-004 正式验证阶段:{notes['h004_formal_validation_phase']}",
            f"- H-002 适用范围:{notes['h002_precision_scope']}",
        ]
    )


def _format_rule_status(session_rules) -> str:
    """Return the status line stating how many rules the session has loaded."""
    return f"当前会话已加载 {len(session_rules)} 条规则,编辑结果会立即在本次会话生效。"


def _initial_benchmark_placeholder(benchmark_csv_path: Path) -> str:
    """Return the Markdown shown in the benchmark panel before any run."""
    return "\n".join(
        [
            "### 规则基准已就绪",
            f"- 数据集:{benchmark_csv_path}",
            "- 范围:仅内部基准",
            "- 用途:查看规则与文本基准,不代表真实语音识别表现。",
        ]
    )


def _initial_stage1_smoke_placeholder(benchmark_csv_path: Path) -> str:
    """Return the Markdown shown in the smoke-test panel before any run."""
    return "\n".join(
        [
            "### 端到端烟测已就绪",
            f"- 数据集:{benchmark_csv_path}",
            "- 输入音频来源:edge-tts 根据脚本文本合成",
            "- 用途:检查真实 ASR/TTS 链路是否跑通。",
        ]
    )


def _initial_live_stream_placeholder() -> str:
    """Return the Markdown shown in the run summary before live listening starts."""
    return "\n".join(
        [
            "### 实时监听已就绪",
            "- 点击麦克风开始说话,识别文本会持续刷新。",
            f"- 命中规则稳定 {int(AUTO_REPLY_DELAY_SECONDS)} 秒后,会自动播放回复语音。",
            "- 上传音频 / 手工输入兜底仍可用。",
            f"- 说明:{MINIMAL_UI_NOTICE}",
        ]
    )


def _parse_script_ids(raw_value: str | None) -> list[str]:
    """Parse the smoke-test script id textbox into a list of ids.

    Ids may be separated by ASCII commas, full-width commas or newlines;
    surrounding whitespace is stripped and empty entries are dropped.

    Returns:
        The parsed ids, or a copy of ``DEFAULT_STAGE1_SMOKE_SCRIPT_IDS`` when
        the input is empty or contains no ids (whitespace or separators only).
    """
    if not raw_value:
        return list(DEFAULT_STAGE1_SMOKE_SCRIPT_IDS)
    normalized = raw_value.replace("\n", ",").replace(",", ",")
    script_ids = [item.strip() for item in normalized.split(",") if item.strip()]
    # Blank-looking input falls back to the defaults, the same as "" does.
    return script_ids or list(DEFAULT_STAGE1_SMOKE_SCRIPT_IDS)


def _format_tts_status(tts_preview, matched: bool, *, finalized: bool) -> str:
    """Describe whether reply audio was produced for a turn.

    Args:
        tts_preview: TTS result exposing ``latency_ms`` and ``runtime_note``,
            or ``None`` when no audio was generated.
        matched: Whether a rule matched.
        finalized: Whether the turn has ended.
    """
    if tts_preview is None:
        if finalized and not matched:
            return "录音结束,当前未生成回复语音。"
        return "当前未生成回复语音。"
    status = f"已生成回复语音({tts_preview.latency_ms} ms)"
    if tts_preview.runtime_note:
        status = f"{status};{tts_preview.runtime_note}"
    return status


def _format_run_summary(
    *,
    run_id: str,
    input_mode: str,
    asr_backend: str | None,
    asr_latency_ms: int | None,
    rule_id: str | None,
    matched_keyword: str | None,
    tts_status: str,
    runtime_note: str | None,
) -> str:
    """Render the Markdown summary for a completed (or failed) turn.

    ``input_mode`` equal to ``"audio"`` is labelled as audio input; any other
    value is labelled as manual text. ``runtime_note`` is appended as an extra
    bullet only when non-empty.
    """
    latency_text = f"{asr_latency_ms} ms" if asr_latency_ms is not None else "未知"
    input_mode_text = "音频输入" if input_mode == "audio" else "手工文本"
    lines = [
        "### 本轮处理结果",
        f"- 运行 ID:{run_id}",
        f"- 输入方式:{input_mode_text}",
        f"- 识别后端:{asr_backend or '手工文本'}",
        f"- 最新识别延迟:{latency_text}",
        f"- 命中规则:{rule_id or '未命中'}",
        f"- 命中关键词:{matched_keyword or '未命中'}",
        f"- 回复语音状态:{tts_status}",
        f"- 说明:{MINIMAL_UI_NOTICE}",
    ]
    if runtime_note:
        lines.append(f"- 运行提示:{runtime_note}")
    return "\n".join(lines)


def _format_live_stream_status(state, *, asr_result=None, finalized: bool, auto_reply_triggered: bool = False) -> str:
    """Build the one-line status text for the live microphone stream.

    Args:
        state: Streaming turn state; reads ``current_match_key``,
            ``auto_reply_key``, ``auto_reply_audio_path``, ``matched_rule_id``,
            ``transcript``, ``asr_latency_ms`` and ``runtime_note``.
        asr_result: Latest ASR result, if any. When given, its latency and
            runtime note take precedence over the values stored on ``state``.
        finalized: Whether recording has ended.
        auto_reply_triggered: Whether reply audio was produced for this turn.
    """
    if finalized:
        if state.current_match_key and (state.auto_reply_key == state.current_match_key or auto_reply_triggered):
            status = "录音已结束,已保留当前自动回复语音。"
        elif state.current_match_key:
            status = "录音已结束,已完成本轮规则匹配。"
        else:
            status = "录音已结束,当前未命中规则。"
    elif _auto_reply_played_for_current_match(state):
        status = f"已自动播放规则 {state.matched_rule_id or '未命中'} 的回复语音,继续说话可触发新的规则。"
    elif state.current_match_key:
        status = f"已命中规则 {state.matched_rule_id or '未命中'},稳定 {int(AUTO_REPLY_DELAY_SECONDS)} 秒后将自动播放回复语音。"
    elif state.transcript:
        status = "实时转写中,当前尚未命中规则。"
    else:
        status = "实时监听中,请开始说话。"

    if asr_result is not None:
        status = f"{status} 最新识别延迟 {asr_result.latency_ms} ms。"
        runtime_note = asr_result.runtime_note
    else:
        if state.asr_latency_ms is not None:
            status = f"{status} 最新识别延迟 {state.asr_latency_ms} ms。"
        runtime_note = state.runtime_note
    if runtime_note:
        status = f"{status} {runtime_note}"
    return status


def _format_stream_preview_summary(*, state, error: str | None = None, finalized: bool = False, auto_reply_triggered: bool = False) -> str:
    """Render the Markdown live-status panel for the streaming turn.

    Args:
        state: Streaming turn state; reads ``chunk_count``, ``sample_rate``,
            ``asr_backend``, ``asr_latency_ms``, ``matched_rule_id``,
            ``matched_keyword``, ``runtime_note`` and the auto-reply fields.
        error: When non-empty, a short error panel is returned instead.
        finalized: Whether recording has ended.
        auto_reply_triggered: Whether a reply was synthesized just now.
    """
    if error:
        return "\n".join(
            [
                "### 实时状态",
                f"- 已接收音频片段:{state.chunk_count}",
                f"- 错误:{error}",
                f"- 说明:{MINIMAL_UI_NOTICE}",
            ]
        )

    if _auto_reply_played_for_current_match(state):
        auto_reply_status = "已自动播放"
    elif state.current_match_key:
        auto_reply_status = f"等待稳定 {int(AUTO_REPLY_DELAY_SECONDS)} 秒"
    else:
        auto_reply_status = "当前未触发"
    # Later conditions deliberately override earlier ones.
    if auto_reply_triggered:
        auto_reply_status = "刚刚完成自动播放"
    if finalized and state.current_match_key:
        auto_reply_status = "录音结束,已完成本轮处理"

    latency_text = f"{state.asr_latency_ms} ms" if state.asr_latency_ms is not None else "未知"
    lines = [
        "### 实时状态",
        f"- 已接收音频片段:{state.chunk_count}",
        f"- 当前采样率:{state.sample_rate or '未知'} Hz",
        f"- 识别后端:{state.asr_backend or '尚未开始'}",
        f"- 最新识别延迟:{latency_text}",
        f"- 当前命中规则:{state.matched_rule_id or '未命中'}",
        f"- 当前命中关键词:{state.matched_keyword or '未命中'}",
        f"- 自动播报状态:{auto_reply_status}",
        f"- 说明:{MINIMAL_UI_NOTICE}",
    ]
    if state.runtime_note:
        lines.append(f"- 运行提示:{state.runtime_note}")
    return "\n".join(lines)


def _require_gradio():
    """Import and return the ``gradio`` module.

    Raises:
        RuntimeError: If Gradio is not installed (chained from ``ImportError``).
    """
    try:
        import gradio as gr
    except ImportError as exc:
        raise RuntimeError("未安装 Gradio,请先执行 `pip install -r requirements.txt`。") from exc
    return gr