Spaces:
Paused
Paused
| import os | |
| from datetime import datetime, timezone | |
| from functools import partial | |
| from pathlib import Path | |
try:
    import spaces  # pyright: ignore[reportMissingImports]
except ImportError:  # pragma: no cover - local fallback when not running on HF Spaces.

    class _SpacesShim:
        """Local stand-in used when the ``spaces`` package cannot be imported.

        It only provides a no-op ``GPU`` decorator so decorated functions keep
        working unchanged outside Hugging Face Spaces.
        """

        @staticmethod
        def GPU(*decorator_args, **decorator_kwargs):
            """Return the decorated function untouched.

            Supports both the bare form (``@spaces.GPU``) and the configured
            form (``@spaces.GPU(...)``).

            It must be a staticmethod: ``spaces`` is bound to an *instance* of
            this class, so a plain function would receive that instance as the
            first positional argument and the bare-decorator check below would
            never match.
            """
            is_bare_decorator = (
                len(decorator_args) == 1
                and callable(decorator_args[0])
                and not decorator_kwargs
            )
            if is_bare_decorator:
                return decorator_args[0]

            def _decorator(func):
                return func

            return _decorator

    spaces = _SpacesShim()
| from benchmarks.harness import ( | |
| DEFAULT_STAGE1_SMOKE_SCRIPT_IDS, | |
| evaluate_internal_benchmark, | |
| evaluate_stage1_integration_smoke, | |
| ) | |
| from core.adapters import ( | |
| AVAILABLE_EDGE_TTS_VOICES, | |
| DEFAULT_EDGE_TTS_VOICE, | |
| EdgeTTSAdapter, | |
| FasterWhisperAdapter, | |
| FunASRSenseVoiceAdapter, | |
| MockASRAdapter, | |
| MockTTSAdapter, | |
| ) | |
| from core.pipeline import ( | |
| AUTO_REPLY_DELAY_SECONDS, | |
| create_streaming_turn_state, | |
| maybe_auto_reply_streaming_turn, | |
| preview_streaming_turn, | |
| run_controlled_turn, | |
| run_streaming_turn, | |
| stop_streaming_turn_state, | |
| ) | |
| from utils.config_loader import ( | |
| DEFAULT_BENCHMARK_CSV_PATH, | |
| DEFAULT_EVALUATION_CONTRACT_PATH, | |
| DEFAULT_RULES_PATH, | |
| TABLE_HEADERS, | |
| load_evaluation_contract, | |
| load_rule_catalog, | |
| rules_from_editor_rows, | |
| rules_to_table_rows, | |
| ) | |
| from utils.tracking import LocalJsonlTracker, RunMetadata, make_run_id, resolve_code_version | |
# User-facing notice (Chinese) reused by the page header and by every
# status/summary panel rendered in this module.
MINIMAL_UI_NOTICE = (
    "当前为演示版,优先展示实时接话体验。"
    "首次加载模型可能稍慢;麦克风不可用时可改用下方兜底输入。"
)
def _is_zero_gpu_runtime() -> bool:
    """Report whether the ``SPACES_ZERO_GPU`` environment flag is switched on.

    The flag counts as on when its value, compared case-insensitively, is one
    of ``1``, ``t`` or ``true``. An unset variable counts as off.
    """
    truthy_values = {"1", "t", "true"}
    raw_flag = os.getenv("SPACES_ZERO_GPU", "")
    return raw_flag.lower() in truthy_values
def _build_stream_preview_asr_adapters():
    """Create the ASR adapters used by the live-preview streaming callback.

    ZeroGPU forbids CUDA init from the main process, so preview callbacks that
    are not wrapped by ``@spaces.GPU`` must stay on CPU. On any other runtime
    the adapters are built with their default arguments.

    Returns:
        A dict mapping the backend keys ``"funasr"`` and ``"faster-whisper"``
        to adapter instances.
    """
    if not _is_zero_gpu_runtime():
        return {
            "funasr": FunASRSenseVoiceAdapter(),
            "faster-whisper": FasterWhisperAdapter(),
        }

    # ZeroGPU: pin both preview adapters to CPU.
    cpu_funasr = FunASRSenseVoiceAdapter(device_order=("cpu",))
    cpu_whisper = FasterWhisperAdapter(device_order=(("cpu", "int8"),))
    return {"funasr": cpu_funasr, "faster-whisper": cpu_whisper}
def _process_turn_for_space(
    audio_input,
    manual_text,
    asr_backend_key,
    tts_backend_key,
    tts_voice,
    rule_rows,
    *,
    tracker,
    manual_asr,
    asr_adapters,
    tts_adapters,
    code_version,
):
    """Run one controlled turn from an audio file path or manual text.

    Positional parameters are the UI values (audio path or falsy, manual text,
    ASR/TTS backend keys, TTS voice, rule-table rows). Keyword-only parameters
    are the shared tracker, adapters and code version.

    Returns:
        An 8-tuple: transcript, partial transcript, matched keyword, matched
        rule id, reply text, TTS status text, reply audio path (or ``None``)
        and a Markdown run summary. If ``run_controlled_turn`` raises, the
        tuple carries empty/"未命中" placeholders plus the error message.
    """
    not_matched = "未命中"
    session_rules = rules_from_editor_rows(rule_rows)

    audio_path = Path(audio_input) if audio_input else None
    has_audio = audio_path is not None
    if has_audio:
        hypothesis_ids = ["H-001", "H-002", "H-003"]
    else:
        hypothesis_ids = ["H-002", "H-004"]
    # Only faster-whisper maps to R-002; every other key (including none) is R-001.
    route_id = "R-002" if asr_backend_key == "faster-whisper" else "R-001"

    run_id = make_run_id(f"stage1-turn-{asr_backend_key or 'manual'}")
    metadata = RunMetadata(
        run_id=run_id,
        experiment_name="stage1-controlled-turn",
        hypothesis_ids=hypothesis_ids,
        baseline_run_id=None,
        route_id=route_id,
        seed=42,
        code_version=code_version,
        data_version=str(audio_path) if has_audio else "ui-manual-text",
        started_at=datetime.now(timezone.utc).isoformat(),
        status="running",
    )

    try:
        outcome = run_controlled_turn(
            run_id=run_id,
            audio_path=audio_path,
            manual_text=manual_text,
            rules=session_rules,
            audio_asr=asr_adapters.get(asr_backend_key),
            manual_asr=manual_asr,
            tts_adapter=tts_adapters.get(tts_backend_key),
            tts_voice=tts_voice,
            tracker=tracker,
            metadata=metadata,
        )
    except Exception as exc:
        failure_summary = _format_run_summary(
            run_id=run_id,
            input_mode="audio" if has_audio else "manual_text",
            asr_backend=asr_backend_key,
            asr_latency_ms=None,
            rule_id=None,
            matched_keyword=None,
            tts_status="失败",
            runtime_note=None,
        )
        return ("", "", not_matched, not_matched, "", f"处理失败:{exc}", None, failure_summary)

    match = outcome.match_result
    tts_preview = outcome.tts_preview
    asr_result = outcome.asr_result

    reply_text_value = match.reply
    if not reply_text_value:
        reply_text_value = "" if match.matched else "当前未命中规则。"

    tts_status_text = _format_tts_status(tts_preview, match.matched, finalized=True)
    summary = _format_run_summary(
        run_id=outcome.run_id,
        input_mode=outcome.input_mode,
        asr_backend=asr_result.backend,
        asr_latency_ms=asr_result.latency_ms,
        rule_id=match.rule_id,
        matched_keyword=match.matched_keyword,
        tts_status=tts_status_text,
        runtime_note=asr_result.runtime_note,
    )
    reply_audio_path = None if tts_preview is None else tts_preview.audio_path
    return (
        asr_result.transcript,
        asr_result.partial_transcript,
        match.matched_keyword or not_matched,
        match.rule_id or not_matched,
        reply_text_value,
        tts_status_text,
        reply_audio_path,
        summary,
    )
def _preview_live_microphone_stream_for_space(
    audio_chunk,
    stream_state,
    asr_backend_key,
    rule_rows,
    *,
    asr_adapters,
):
    """Feed one streamed microphone chunk through the live preview.

    Returns:
        A 9-tuple: streaming state, transcript, partial transcript, matched
        keyword, matched rule id, reply preview text, status text, auto-reply
        audio path and a Markdown status summary. If the preview call raises,
        the previous state (or a fresh recording state) is echoed back with
        the error message.
    """
    not_matched = "未命中"
    session_rules = rules_from_editor_rows(rule_rows)

    try:
        next_state, asr_result, match = preview_streaming_turn(
            state=stream_state,
            audio_chunk=audio_chunk,
            rules=session_rules,
            audio_asr=asr_adapters[asr_backend_key],
        )
    except Exception as exc:
        fallback_state = stream_state if stream_state else create_streaming_turn_state(recording_active=True)
        return (
            fallback_state,
            fallback_state.transcript,
            fallback_state.partial_transcript,
            fallback_state.matched_keyword or not_matched,
            fallback_state.matched_rule_id or not_matched,
            fallback_state.matched_reply_text or "",
            f"实时识别失败:{exc}",
            fallback_state.auto_reply_audio_path,
            _format_stream_preview_summary(state=fallback_state, error=str(exc)),
        )

    reply_preview = match.reply
    if not reply_preview:
        reply_preview = "当前尚未命中规则。" if next_state.transcript else ""

    status_text = _format_live_stream_status(next_state, asr_result=asr_result, finalized=False)
    summary = _format_stream_preview_summary(state=next_state)
    return (
        next_state,
        next_state.transcript,
        next_state.partial_transcript,
        match.matched_keyword or not_matched,
        match.rule_id or not_matched,
        reply_preview,
        status_text,
        next_state.auto_reply_audio_path,
        summary,
    )
def _auto_reply_live_microphone_stream_for_space(
    stream_state,
    tts_backend_key,
    tts_voice,
    *,
    tts_adapters,
):
    """Timer callback that lets the pipeline emit an automatic spoken reply.

    Returns:
        A 4-tuple: streaming state, status text, auto-reply audio path and a
        Markdown status summary. If the pipeline call raises, the incoming
        state is returned unchanged together with the error message.
    """
    current_state = stream_state if stream_state else create_streaming_turn_state()

    try:
        next_state, tts_preview = maybe_auto_reply_streaming_turn(
            state=current_state,
            tts_adapter=tts_adapters.get(tts_backend_key),
            tts_voice=tts_voice,
        )
    except Exception as exc:
        error_summary = _format_stream_preview_summary(state=current_state, error=str(exc))
        return (
            current_state,
            f"自动播报失败:{exc}",
            current_state.auto_reply_audio_path,
            error_summary,
        )

    # A non-None preview means a reply was produced on this tick.
    just_triggered = tts_preview is not None
    status_text = _format_live_stream_status(
        next_state,
        finalized=False,
        auto_reply_triggered=just_triggered,
    )
    summary = _format_stream_preview_summary(state=next_state, auto_reply_triggered=just_triggered)
    return (next_state, status_text, next_state.auto_reply_audio_path, summary)
def _finalize_live_microphone_stream_for_space(
    stream_state,
    asr_backend_key,
    tts_backend_key,
    tts_voice,
    rule_rows,
    *,
    tracker,
    manual_asr,
    asr_adapters,
    tts_adapters,
    code_version,
):
    """Close the live microphone turn and run the final streaming pass.

    If an auto-reply was already produced for the current match, no TTS
    adapter is passed to the final pass and the existing audio is reused.

    Returns:
        A 9-tuple: a fresh streaming state, transcript, partial transcript,
        matched keyword, matched rule id, reply text, status text, reply audio
        path and a Markdown summary. If the final pass raises, values from the
        stopped state are returned together with the error message.
    """
    not_matched = "未命中"
    stopped_state = stop_streaming_turn_state(stream_state or create_streaming_turn_state())
    session_rules = rules_from_editor_rows(rule_rows)

    run_id = make_run_id(f"stage1-live-stream-{asr_backend_key or 'manual'}")
    route_id = "R-001" if asr_backend_key == "funasr" else "R-002"
    metadata = RunMetadata(
        run_id=run_id,
        experiment_name="stage1-live-stream-turn",
        hypothesis_ids=["H-001", "H-002", "H-003"],
        baseline_run_id=None,
        route_id=route_id,
        seed=42,
        code_version=code_version,
        data_version="ui-live-microphone-stream",
        started_at=datetime.now(timezone.utc).isoformat(),
        status="running",
    )

    # Reuse the auto-reply audio only when it belongs to the current match.
    match_key = stopped_state.current_match_key
    reuse_auto_reply = (
        bool(match_key)
        and stopped_state.auto_reply_key == match_key
        and bool(stopped_state.auto_reply_audio_path)
    )

    try:
        outcome = run_streaming_turn(
            run_id=run_id,
            state=stopped_state,
            rules=session_rules,
            audio_asr=asr_adapters[asr_backend_key],
            manual_asr=manual_asr,
            tts_adapter=None if reuse_auto_reply else tts_adapters.get(tts_backend_key),
            tts_voice=tts_voice,
            tracker=tracker,
            metadata=metadata,
        )
    except Exception as exc:
        return (
            create_streaming_turn_state(),
            stopped_state.transcript,
            stopped_state.partial_transcript,
            stopped_state.matched_keyword or not_matched,
            stopped_state.matched_rule_id or not_matched,
            stopped_state.matched_reply_text or "",
            f"结束录音时处理失败:{exc}",
            stopped_state.auto_reply_audio_path,
            _format_stream_preview_summary(state=stopped_state, error=str(exc), finalized=True),
        )

    match = outcome.match_result
    tts_preview = outcome.tts_preview
    asr_result = outcome.asr_result

    reply_text_value = stopped_state.auto_reply_text or match.reply
    if not reply_text_value:
        reply_text_value = "" if match.matched else "当前未命中规则。"

    tts_status_text = _format_live_stream_status(
        stopped_state,
        asr_result=asr_result,
        finalized=True,
        auto_reply_triggered=reuse_auto_reply or tts_preview is not None,
    )
    summary = _format_run_summary(
        run_id=outcome.run_id,
        input_mode=outcome.input_mode,
        asr_backend=asr_result.backend,
        asr_latency_ms=asr_result.latency_ms,
        rule_id=match.rule_id,
        matched_keyword=match.matched_keyword,
        tts_status=tts_status_text,
        runtime_note=asr_result.runtime_note,
    )

    if reuse_auto_reply:
        reply_audio_path = stopped_state.auto_reply_audio_path
    elif tts_preview is not None:
        reply_audio_path = tts_preview.audio_path
    else:
        reply_audio_path = None

    return (
        create_streaming_turn_state(),
        asr_result.transcript,
        asr_result.partial_transcript,
        match.matched_keyword or not_matched,
        match.rule_id or not_matched,
        reply_text_value,
        tts_status_text,
        reply_audio_path,
        summary,
    )
def _run_stage1_smoke_for_space(
    asr_backend_key,
    tts_backend_key,
    tts_voice,
    rule_rows,
    script_ids_text,
    *,
    benchmark_csv_path,
    tracker,
    asr_adapters,
    tts_adapters,
    code_version,
):
    """Run the stage-1 integration smoke and return its Markdown report.

    The selected TTS adapter and voice are passed twice to the harness, once
    for the input side and once for the reply side.

    Raises:
        KeyError: if either backend key is missing from its adapter mapping.
    """
    session_rules = rules_from_editor_rows(rule_rows)
    selected_script_ids = _parse_script_ids(script_ids_text)

    run_id = make_run_id(f"stage1-smoke-{asr_backend_key}")
    metadata = RunMetadata(
        run_id=run_id,
        experiment_name="stage1-synthetic-audio-smoke",
        hypothesis_ids=["H-001", "H-002", "H-003", "H-005"],
        baseline_run_id=None,
        route_id="R-001" if asr_backend_key == "funasr" else "R-002",
        seed=42,
        code_version=code_version,
        data_version=f"{benchmark_csv_path}#synthetic-audio-smoke",
        started_at=datetime.now(timezone.utc).isoformat(),
        status="running",
    )

    # Look up ASR before TTS so a bad key fails in the same order as before.
    asr_adapter = asr_adapters[asr_backend_key]
    tts_adapter = tts_adapters[tts_backend_key]
    report = evaluate_stage1_integration_smoke(
        benchmark_csv_path,
        session_rules,
        asr_adapter,
        tts_adapter,
        tts_adapter,
        script_ids=selected_script_ids,
        tracker=tracker,
        metadata=metadata,
        input_voice=tts_voice,
        reply_voice=tts_voice,
    )
    return report.to_markdown()
def build_app(
    benchmark_csv_path: Path = DEFAULT_BENCHMARK_CSV_PATH,
    rules_path: Path = DEFAULT_RULES_PATH,
    evaluation_contract_path: Path = DEFAULT_EVALUATION_CONTRACT_PATH,
):
    """Assemble the Gradio Blocks application for the VoiceDirector demo.

    Loads the seed rules and the evaluation contract, creates the tracker and
    the ASR/TTS adapters, binds them into the module-level handlers, then lays
    out the UI and wires every event.

    Args:
        benchmark_csv_path: Benchmark CSV used by the benchmark/smoke panel.
        rules_path: Rule catalog that seeds the editable rule table.
        evaluation_contract_path: Contract rendered as the constraints note.

    Returns:
        The constructed ``gr.Blocks`` app (not launched).
    """
    gr = _require_gradio()
    seed_rules = load_rule_catalog(rules_path)
    evaluation_contract = load_evaluation_contract(evaluation_contract_path)

    project_root = Path(__file__).resolve().parents[1]
    tracker = LocalJsonlTracker(project_root / "results" / "tracking")
    manual_asr = MockASRAdapter()
    asr_adapters = {
        "funasr": FunASRSenseVoiceAdapter(),
        "faster-whisper": FasterWhisperAdapter(),
    }
    # Keep preview ASR instances separate so a ZeroGPU live-preview CPU fallback
    # does not pin the finalize/upload paths to CPU as well.
    stream_preview_asr_adapters = _build_stream_preview_asr_adapters()
    tts_adapters = {
        "edge-tts": EdgeTTSAdapter(),
        "mock": MockTTSAdapter(),
    }
    code_version = resolve_code_version(project_root)

    timer_interval = 0.25
    asr_backend_choices = (
        ("FunASR / SenseVoice(主路径)", "funasr"),
        ("faster-whisper(兜底)", "faster-whisper"),
    )
    tts_backend_choices = (
        ("edge-tts(真实语音)", "edge-tts"),
        ("mock(调试预览)", "mock"),
    )

    # ---- handlers bound to the shared runtime objects ----
    turn_dependencies = {
        "tracker": tracker,
        "manual_asr": manual_asr,
        "asr_adapters": asr_adapters,
        "tts_adapters": tts_adapters,
        "code_version": code_version,
    }
    process_turn = partial(_process_turn_for_space, **turn_dependencies)
    finalize_live_stream = partial(_finalize_live_microphone_stream_for_space, **turn_dependencies)
    preview_live_stream = partial(
        _preview_live_microphone_stream_for_space,
        asr_adapters=stream_preview_asr_adapters,
    )
    auto_reply_live_stream = partial(
        _auto_reply_live_microphone_stream_for_space,
        tts_adapters=tts_adapters,
    )
    run_stage1_smoke = partial(
        _run_stage1_smoke_for_space,
        benchmark_csv_path=benchmark_csv_path,
        tracker=tracker,
        asr_adapters=asr_adapters,
        tts_adapters=tts_adapters,
        code_version=code_version,
    )

    def apply_rules(rule_rows):
        """Round-trip the edited rows and report the resulting rule count."""
        parsed_rules = rules_from_editor_rows(rule_rows)
        return rules_to_table_rows(parsed_rules), _format_rule_status(parsed_rules)

    def start_live_microphone_stream():
        """Activate the auto-reply timer and reset all live-turn outputs."""
        listening_text = (
            f"实时监听中,命中规则后稳定 {int(AUTO_REPLY_DELAY_SECONDS)} 秒将自动播放回复语音。"
        )
        return (
            gr.Timer(value=timer_interval, active=True),
            create_streaming_turn_state(recording_active=True),
            "",
            "",
            "未命中",
            "未命中",
            "",
            listening_text,
            None,
            _initial_live_stream_placeholder(),
        )

    def stop_live_microphone_stream(stream_state, asr_backend_key, tts_backend_key, tts_voice, rule_rows):
        """Finalize the live turn and deactivate the auto-reply timer."""
        finalized = finalize_live_stream(stream_state, asr_backend_key, tts_backend_key, tts_voice, rule_rows)
        return (gr.Timer(value=timer_interval, active=False), *finalized)

    def preview_benchmark(rule_rows):
        """Evaluate the current session rules against the benchmark CSV."""
        parsed_rules = rules_from_editor_rows(rule_rows)
        preview_run_id = make_run_id("stage1-rule-preview")
        metadata = RunMetadata(
            run_id=preview_run_id,
            experiment_name="stage1-rule-only-benchmark-preview",
            hypothesis_ids=["H-002"],
            baseline_run_id=None,
            route_id="R-001",
            seed=42,
            code_version=code_version,
            data_version=str(benchmark_csv_path),
            started_at=datetime.now(timezone.utc).isoformat(),
            status="running",
        )
        report = evaluate_internal_benchmark(
            benchmark_csv_path,
            parsed_rules,
            tracker=tracker,
            metadata=metadata,
        )
        return report.to_markdown()

    # ---- layout and event wiring ----
    with gr.Blocks(title="VoiceDirector 语音场控接话演示") as app:
        live_stream_state = gr.State(value=create_streaming_turn_state())
        auto_reply_timer = gr.Timer(value=timer_interval, active=False)

        gr.Markdown("# VoiceDirector 语音场控接话演示")
        gr.Markdown("**实时语音接话演示**:连续麦克风转写、自动规则匹配、自动回复语音。")
        gr.Markdown(f"提示:{MINIMAL_UI_NOTICE}")

        with gr.Row():
            with gr.Column(scale=5):
                gr.Markdown(
                    "### 主路径:麦克风连续流式识别\n"
                    f"点击麦克风开始说话,识别文本会持续刷新;命中规则稳定 {int(AUTO_REPLY_DELAY_SECONDS)} 秒后会自动播放回复语音。"
                )
                live_audio_input = gr.Audio(
                    label="实时麦克风输入(连续流式)",
                    sources=["microphone"],
                    type="numpy",
                    streaming=True,
                )
                asr_backend = gr.Dropdown(
                    choices=list(asr_backend_choices),
                    value="funasr",
                    label="识别后端",
                )
                tts_backend = gr.Dropdown(
                    choices=list(tts_backend_choices),
                    value="edge-tts",
                    label="回复语音后端",
                )
                tts_voice = gr.Dropdown(
                    choices=[(voice, voice) for voice in AVAILABLE_EDGE_TTS_VOICES],
                    value=DEFAULT_EDGE_TTS_VOICE,
                    label="回复音色",
                )
                with gr.Accordion("上传音频 / 手工输入兜底", open=False):
                    upload_audio_input = gr.Audio(
                        label="上传音频兜底",
                        sources=["upload"],
                        type="filepath",
                        format="wav",
                    )
                    manual_text = gr.Textbox(
                        label="手工文本兜底",
                        placeholder="没有音频时,可直接输入中文文本做规则演示。",
                        lines=3,
                    )
                    run_turn_button = gr.Button("执行上传/手工兜底", variant="secondary")
                recognized_text = gr.Textbox(label="识别文本")
                partial_text = gr.Textbox(label="实时转写预览")
                matched_keyword = gr.Textbox(label="命中关键词")
                matched_rule = gr.Textbox(label="命中规则")
                reply_text = gr.Textbox(label="回复文本", lines=3)
                tts_status = gr.Textbox(label="当前状态")
                reply_audio = gr.Audio(label="回复语音", interactive=False, type="filepath", autoplay=True)
                run_summary = gr.Markdown(value=_initial_live_stream_placeholder())

            with gr.Column(scale=7):
                rule_table = gr.Dataframe(
                    headers=TABLE_HEADERS,
                    datatype=["str"] * 4,
                    row_count=(len(seed_rules), "fixed"),
                    column_count=(len(TABLE_HEADERS), "fixed"),
                    value=rules_to_table_rows(seed_rules),
                    label="规则编辑表(当前会话)",
                    interactive=True,
                )
                apply_button = gr.Button("应用规则编辑")
                rule_status = gr.Textbox(label="规则编辑状态", value=_format_rule_status(seed_rules))
                gr.Markdown(_format_contract(evaluation_contract))
                with gr.Accordion("基准与烟测面板", open=False):
                    gr.Markdown(
                        "下面的规则预览保留了内部基准可见性。端到端烟测会使用 edge-tts 生成输入音频来打通真实 ASR/TTS 代码路径。"
                    )
                    benchmark_button = gr.Button("预览规则基准")
                    benchmark_output = gr.Markdown(value=_initial_benchmark_placeholder(benchmark_csv_path))
                    script_ids = gr.Textbox(
                        label="烟测脚本 ID",
                        value=", ".join(DEFAULT_STAGE1_SMOKE_SCRIPT_IDS),
                        lines=2,
                    )
                    smoke_asr_backend = gr.Dropdown(
                        choices=list(asr_backend_choices),
                        value="funasr",
                        label="烟测识别后端",
                    )
                    smoke_tts_backend = gr.Dropdown(
                        choices=list(tts_backend_choices),
                        value="edge-tts",
                        label="烟测语音后端",
                    )
                    smoke_tts_voice = gr.Dropdown(
                        choices=[(voice, voice) for voice in AVAILABLE_EDGE_TTS_VOICES],
                        value=DEFAULT_EDGE_TTS_VOICE,
                        label="烟测音色",
                    )
                    stage1_smoke_button = gr.Button("执行端到端烟测")
                    stage1_smoke_output = gr.Markdown(value=_initial_stage1_smoke_placeholder(benchmark_csv_path))

        # Component groups shared by several events; each event gets its own list.
        turn_outputs = (
            recognized_text,
            partial_text,
            matched_keyword,
            matched_rule,
            reply_text,
            tts_status,
            reply_audio,
            run_summary,
        )
        finalize_inputs = (live_stream_state, asr_backend, tts_backend, tts_voice, rule_table)

        run_turn_button.click(
            process_turn,
            inputs=[upload_audio_input, manual_text, asr_backend, tts_backend, tts_voice, rule_table],
            outputs=[*turn_outputs],
        )
        live_audio_input.start_recording(
            start_live_microphone_stream,
            outputs=[auto_reply_timer, live_stream_state, *turn_outputs],
            queue=False,
            show_progress="hidden",
        )
        stream_event = live_audio_input.stream(
            preview_live_stream,
            inputs=[live_audio_input, live_stream_state, asr_backend, rule_table],
            outputs=[live_stream_state, *turn_outputs],
            show_progress="hidden",
            trigger_mode="always_last",
            concurrency_limit=1,
            concurrency_id="live-stream-session",
            stream_every=0.75,
        )
        auto_reply_event = auto_reply_timer.tick(
            auto_reply_live_stream,
            inputs=[live_stream_state, tts_backend, tts_voice],
            outputs=[live_stream_state, tts_status, reply_audio, run_summary],
            show_progress="hidden",
            trigger_mode="always_last",
            concurrency_limit=1,
            concurrency_id="live-stream-session",
        )
        # Stopping and pausing the recording share the same finalize handler.
        for register_finalize in (live_audio_input.stop_recording, live_audio_input.pause_recording):
            register_finalize(
                stop_live_microphone_stream,
                inputs=[*finalize_inputs],
                outputs=[auto_reply_timer, live_stream_state, *turn_outputs],
                show_progress="minimal",
                cancels=[stream_event, auto_reply_event],
            )
        apply_button.click(apply_rules, inputs=[rule_table], outputs=[rule_table, rule_status])
        benchmark_button.click(preview_benchmark, inputs=[rule_table], outputs=[benchmark_output])
        stage1_smoke_button.click(
            run_stage1_smoke,
            inputs=[smoke_asr_backend, smoke_tts_backend, smoke_tts_voice, rule_table, script_ids],
            outputs=[stage1_smoke_output],
        )
    return app
def _format_contract(contract: dict) -> str:
    """Render the fixed evaluation constraints as a Markdown bullet list.

    Reads ``contract["h003_mos"]`` (``threshold``, ``scale_max``,
    ``minimum_raters``) and ``contract["evaluation_constraints"]``
    (``h004_formal_validation_phase``, ``h002_precision_scope``).

    Raises:
        KeyError: if any of those keys is missing.
    """
    mos = contract["h003_mos"]
    constraints = contract["evaluation_constraints"]
    bullets = (
        f"- H-003 MOS 阈值:{mos['threshold']}/{mos['scale_max']}",
        f"- 最少评审人数:{mos['minimum_raters']}",
        f"- H-004 正式验证阶段:{constraints['h004_formal_validation_phase']}",
        f"- H-002 适用范围:{constraints['h002_precision_scope']}",
    )
    return "\n".join(("## 固定评测约束", *bullets))
def _format_rule_status(session_rules) -> str:
    """Return the status line stating how many rules the session has loaded."""
    rule_count = len(session_rules)
    return f"当前会话已加载 {rule_count} 条规则,编辑结果会立即在本次会话生效。"
def _initial_benchmark_placeholder(benchmark_csv_path: Path) -> str:
    """Return the Markdown placeholder for the rule-benchmark output panel.

    The text names the dataset path and states that the preview is an
    internal, text-only benchmark.
    """
    dataset_line = f"- 数据集:{benchmark_csv_path}"
    placeholder_lines = (
        "### 规则基准已就绪",
        dataset_line,
        "- 范围:仅内部基准",
        "- 用途:查看规则与文本基准,不代表真实语音识别表现。",
    )
    return "\n".join(placeholder_lines)
def _initial_stage1_smoke_placeholder(benchmark_csv_path: Path) -> str:
    """Return the Markdown placeholder for the end-to-end smoke output panel.

    The text names the dataset path and notes that the input audio is
    synthesized by edge-tts from the script text.
    """
    dataset_line = f"- 数据集:{benchmark_csv_path}"
    placeholder_lines = (
        "### 端到端烟测已就绪",
        dataset_line,
        "- 输入音频来源:edge-tts 根据脚本文本合成",
        "- 用途:检查真实 ASR/TTS 链路是否跑通。",
    )
    return "\n".join(placeholder_lines)
def _initial_live_stream_placeholder() -> str:
    """Return the Markdown shown in the summary panel before a live turn starts.

    It mentions the auto-reply delay (``AUTO_REPLY_DELAY_SECONDS`` truncated to
    an int) and ends with the shared UI notice.
    """
    delay_line = f"- 命中规则稳定 {int(AUTO_REPLY_DELAY_SECONDS)} 秒后,会自动播放回复语音。"
    notice_line = f"- 说明:{MINIMAL_UI_NOTICE}"
    placeholder_lines = (
        "### 实时监听已就绪",
        "- 点击麦克风开始说话,识别文本会持续刷新。",
        delay_line,
        "- 上传音频 / 手工输入兜底仍可用。",
        notice_line,
    )
    return "\n".join(placeholder_lines)
| def _parse_script_ids(raw_value: str | None) -> list[str]: | |
| if not raw_value: | |
| return list(DEFAULT_STAGE1_SMOKE_SCRIPT_IDS) | |
| return [item.strip() for item in raw_value.replace("\n", ",").split(",") if item.strip()] | |
def _format_tts_status(tts_preview, matched: bool, *, finalized: bool) -> str:
    """Describe whether reply audio was generated for the turn.

    Args:
        tts_preview: Object exposing ``latency_ms`` and ``runtime_note``, or
            ``None`` when no audio was generated.
        matched: Whether a rule matched.
        finalized: Whether the turn has ended.

    Returns:
        A "not generated" message when ``tts_preview`` is ``None`` (with a
        "recording ended" prefix only for a finalized, unmatched turn).
        Otherwise the generation latency, followed by the runtime note if set.
    """
    if tts_preview is None:
        if finalized and not matched:
            return "录音结束,当前未生成回复语音。"
        return "当前未生成回复语音。"

    generated_text = f"已生成回复语音({tts_preview.latency_ms} ms)"
    if not tts_preview.runtime_note:
        return generated_text
    return f"{generated_text};{tts_preview.runtime_note}"
def _format_run_summary(
    *,
    run_id: str,
    input_mode: str,
    asr_backend: str | None,
    asr_latency_ms: int | None,
    rule_id: str | None,
    matched_keyword: str | None,
    tts_status: str,
    runtime_note: str | None,
) -> str:
    """Build the Markdown summary for one completed (or failed) turn.

    Missing values fall back to placeholder text. ``input_mode`` is shown as
    audio input only when it equals ``"audio"``; any other value is shown as
    manual text. A final runtime-hint bullet is added only when
    ``runtime_note`` is truthy.
    """
    if asr_latency_ms is None:
        latency_text = "未知"
    else:
        latency_text = f"{asr_latency_ms} ms"
    input_mode_text = "手工文本" if input_mode != "audio" else "音频输入"

    summary_lines = [
        "### 本轮处理结果",
        f"- 运行 ID:{run_id}",
        f"- 输入方式:{input_mode_text}",
        f"- 识别后端:{asr_backend or '手工文本'}",
        f"- 最新识别延迟:{latency_text}",
        f"- 命中规则:{rule_id or '未命中'}",
        f"- 命中关键词:{matched_keyword or '未命中'}",
        f"- 回复语音状态:{tts_status}",
        f"- 说明:{MINIMAL_UI_NOTICE}",
    ]
    if runtime_note:
        summary_lines += [f"- 运行提示:{runtime_note}"]
    return "\n".join(summary_lines)
def _format_live_stream_status(state, *, asr_result=None, finalized: bool, auto_reply_triggered: bool = False) -> str:
    """Compose the one-line status text for the live microphone turn.

    Args:
        state: Streaming state; reads ``current_match_key``, ``auto_reply_key``,
            ``auto_reply_audio_path``, ``matched_rule_id``, ``transcript``,
            ``asr_latency_ms`` and ``runtime_note``.
        asr_result: Optional latest ASR result; when given, its ``latency_ms``
            and ``runtime_note`` take precedence over the values on ``state``.
        finalized: Whether the recording has ended.
        auto_reply_triggered: Whether a reply was produced in this call.

    Returns:
        The status sentence, followed by the latest ASR latency (if known) and
        the runtime note (if any), separated by spaces.
    """
    has_match = bool(state.current_match_key)
    # "Already replied" requires a current match. Without this guard, two
    # empty keys compare equal and leftover audio is reported as a reply.
    replied_for_match = has_match and state.auto_reply_key == state.current_match_key

    if finalized:
        if has_match and (replied_for_match or auto_reply_triggered):
            status = "录音已结束,已保留当前自动回复语音。"
        elif has_match:
            status = "录音已结束,已完成本轮规则匹配。"
        else:
            status = "录音已结束,当前未命中规则。"
    elif replied_for_match and state.auto_reply_audio_path:
        status = f"已自动播放规则 {state.matched_rule_id or '未命中'} 的回复语音,继续说话可触发新的规则。"
    elif has_match:
        status = f"已命中规则 {state.matched_rule_id or '未命中'},稳定 {int(AUTO_REPLY_DELAY_SECONDS)} 秒后将自动播放回复语音。"
    elif state.transcript:
        status = "实时转写中,当前尚未命中规则。"
    else:
        status = "实时监听中,请开始说话。"

    if asr_result is not None:
        status = f"{status} 最新识别延迟 {asr_result.latency_ms} ms。"
        runtime_note = asr_result.runtime_note
    else:
        if state.asr_latency_ms is not None:
            status = f"{status} 最新识别延迟 {state.asr_latency_ms} ms。"
        runtime_note = state.runtime_note

    if runtime_note:
        status = f"{status} {runtime_note}"
    return status
def _format_stream_preview_summary(*, state, error: str | None = None, finalized: bool = False, auto_reply_triggered: bool = False) -> str:
    """Build the Markdown "live status" panel for the streaming turn.

    Args:
        state: Streaming state; reads ``chunk_count``, ``sample_rate``,
            ``asr_backend``, ``asr_latency_ms``, ``matched_rule_id``,
            ``matched_keyword``, ``current_match_key``, ``auto_reply_key``,
            ``auto_reply_audio_path`` and ``runtime_note``.
        error: When truthy, a short error panel is returned instead.
        finalized: Whether the recording has ended.
        auto_reply_triggered: Whether a reply was produced in this call.

    Returns:
        Markdown text; a runtime-hint bullet is added only when
        ``state.runtime_note`` is truthy.
    """
    notice_line = f"- 说明:{MINIMAL_UI_NOTICE}"
    if error:
        return "\n".join(
            [
                "### 实时状态",
                f"- 已接收音频片段:{state.chunk_count}",
                f"- 错误:{error}",
                notice_line,
            ]
        )

    has_match = bool(state.current_match_key)
    # Require a current match before reporting an auto-played reply. Without
    # it, two empty keys compare equal and leftover audio is misreported.
    if has_match and state.auto_reply_key == state.current_match_key and state.auto_reply_audio_path:
        auto_reply_status = "已自动播放"
    elif has_match:
        auto_reply_status = f"等待稳定 {int(AUTO_REPLY_DELAY_SECONDS)} 秒"
    else:
        auto_reply_status = "当前未触发"
    # Later conditions deliberately override earlier ones.
    if auto_reply_triggered:
        auto_reply_status = "刚刚完成自动播放"
    if finalized and has_match:
        auto_reply_status = "录音结束,已完成本轮处理"

    latency_text = f"{state.asr_latency_ms} ms" if state.asr_latency_ms is not None else "未知"
    lines = [
        "### 实时状态",
        f"- 已接收音频片段:{state.chunk_count}",
        f"- 当前采样率:{state.sample_rate or '未知'} Hz",
        f"- 识别后端:{state.asr_backend or '尚未开始'}",
        f"- 最新识别延迟:{latency_text}",
        f"- 当前命中规则:{state.matched_rule_id or '未命中'}",
        f"- 当前命中关键词:{state.matched_keyword or '未命中'}",
        f"- 自动播报状态:{auto_reply_status}",
        notice_line,
    ]
    if state.runtime_note:
        lines.append(f"- 运行提示:{state.runtime_note}")
    return "\n".join(lines)
def _require_gradio():
    """Import Gradio lazily and return the module.

    Raises:
        RuntimeError: if Gradio is not installed; the original ``ImportError``
            is chained as the cause.
    """
    try:
        import gradio
    except ImportError as import_error:
        message = "未安装 Gradio,请先执行 `pip install -r requirements.txt`。"
        raise RuntimeError(message) from import_error
    else:
        return gradio