# dsa2dsads's picture
# Fix ZeroGPU preview callbacks to stay CPU-only and avoid main-process CUDA init
# 88733e7 verified
import os
from datetime import datetime, timezone
from functools import partial
from pathlib import Path
try:
    import spaces  # pyright: ignore[reportMissingImports]
except ImportError:  # pragma: no cover - local fallback when not running on HF Spaces.

    class _SpacesShim:
        """Minimal stand-in for the ``spaces`` package when it cannot be imported.

        Only ``GPU`` is provided, and it is a pass-through: decorated functions
        are returned unchanged.
        """

        @staticmethod
        def GPU(*decorator_args, **decorator_kwargs):
            """Return the decorated function untouched.

            Works both as a bare decorator (``@spaces.GPU``) and as a
            decorator factory (``@spaces.GPU(...)``).
            """
            used_bare = (
                len(decorator_args) == 1
                and not decorator_kwargs
                and callable(decorator_args[0])
            )
            if used_bare:
                # ``@spaces.GPU`` applied directly: the sole argument is the function.
                return decorator_args[0]
            # ``@spaces.GPU(...)``: hand back an identity decorator.
            return lambda func: func

    spaces = _SpacesShim()
from benchmarks.harness import (
DEFAULT_STAGE1_SMOKE_SCRIPT_IDS,
evaluate_internal_benchmark,
evaluate_stage1_integration_smoke,
)
from core.adapters import (
AVAILABLE_EDGE_TTS_VOICES,
DEFAULT_EDGE_TTS_VOICE,
EdgeTTSAdapter,
FasterWhisperAdapter,
FunASRSenseVoiceAdapter,
MockASRAdapter,
MockTTSAdapter,
)
from core.pipeline import (
AUTO_REPLY_DELAY_SECONDS,
create_streaming_turn_state,
maybe_auto_reply_streaming_turn,
preview_streaming_turn,
run_controlled_turn,
run_streaming_turn,
stop_streaming_turn_state,
)
from utils.config_loader import (
DEFAULT_BENCHMARK_CSV_PATH,
DEFAULT_EVALUATION_CONTRACT_PATH,
DEFAULT_RULES_PATH,
TABLE_HEADERS,
load_evaluation_contract,
load_rule_catalog,
rules_from_editor_rows,
rules_to_table_rows,
)
from utils.tracking import LocalJsonlTracker, RunMetadata, make_run_id, resolve_code_version
# User-facing notice (Chinese) shown under the page title and reused by the
# status/summary formatters further down in this module.
MINIMAL_UI_NOTICE = "当前为演示版,优先展示实时接话体验。首次加载模型可能稍慢;麦克风不可用时可改用下方兜底输入。"
def _is_zero_gpu_runtime() -> bool:
return os.getenv("SPACES_ZERO_GPU", "").lower() in {"1", "t", "true"}
def _build_stream_preview_asr_adapters():
    """Create the ASR adapters used by the live-preview callback.

    Returns:
        A dict keyed by backend name (``"funasr"``, ``"faster-whisper"``).
        Under a ZeroGPU runtime the adapters are constructed with CPU-only
        ``device_order`` values; otherwise they use their own defaults.
    """
    funasr_options = {}
    whisper_options = {}
    # ZeroGPU forbids CUDA init from the main process, so preview callbacks must
    # stay on CPU when they are not wrapped by @spaces.GPU.
    if _is_zero_gpu_runtime():
        funasr_options["device_order"] = ("cpu",)
        whisper_options["device_order"] = (("cpu", "int8"),)
    return {
        "funasr": FunASRSenseVoiceAdapter(**funasr_options),
        "faster-whisper": FasterWhisperAdapter(**whisper_options),
    }
@spaces.GPU
def _process_turn_for_space(
    audio_input,
    manual_text,
    asr_backend_key,
    tts_backend_key,
    tts_voice,
    rule_rows,
    *,
    tracker,
    manual_asr,
    asr_adapters,
    tts_adapters,
    code_version,
):
    """Run one controlled turn from an uploaded audio file or manual text.

    Args:
        audio_input: Path-like value of the uploaded audio, or a falsy value
            when only manual text is supplied.
        manual_text: Text typed by the user as a fallback input.
        asr_backend_key: Key used to look up the audio ASR adapter.
        tts_backend_key: Key used to look up the TTS adapter.
        tts_voice: Voice passed through to the TTS adapter.
        rule_rows: Rule-editor rows converted into the session rules.
        tracker, manual_asr, asr_adapters, tts_adapters, code_version:
            Collaborators bound by the caller and forwarded to the pipeline
            or the run metadata.

    Returns:
        An 8-tuple: transcript, partial transcript, matched keyword, matched
        rule id, reply text, status text, reply audio path (or ``None``) and
        a Markdown run summary. Any exception raised by the pipeline is
        converted into a failure tuple instead of propagating.
    """
    no_match_label = "未命中"
    active_rules = rules_from_editor_rows(rule_rows)
    uploaded_path = Path(audio_input) if audio_input else None
    has_audio = uploaded_path is not None

    if has_audio:
        hypotheses = ["H-001", "H-002", "H-003"]
        data_version = str(uploaded_path)
    else:
        hypotheses = ["H-002", "H-004"]
        data_version = "ui-manual-text"
    # Only faster-whisper maps to R-002; every other backend key uses R-001.
    route = "R-002" if asr_backend_key == "faster-whisper" else "R-001"

    turn_run_id = make_run_id(f"stage1-turn-{asr_backend_key or 'manual'}")
    turn_metadata = RunMetadata(
        run_id=turn_run_id,
        experiment_name="stage1-controlled-turn",
        hypothesis_ids=hypotheses,
        baseline_run_id=None,
        route_id=route,
        seed=42,
        code_version=code_version,
        data_version=data_version,
        started_at=datetime.now(timezone.utc).isoformat(),
        status="running",
    )

    try:
        outcome = run_controlled_turn(
            run_id=turn_run_id,
            audio_path=uploaded_path,
            manual_text=manual_text,
            rules=active_rules,
            audio_asr=asr_adapters.get(asr_backend_key),
            manual_asr=manual_asr,
            tts_adapter=tts_adapters.get(tts_backend_key),
            tts_voice=tts_voice,
            tracker=tracker,
            metadata=turn_metadata,
        )
    except Exception as exc:
        # Surface the failure in the UI fields rather than raising.
        failure_summary = _format_run_summary(
            run_id=turn_run_id,
            input_mode="audio" if has_audio else "manual_text",
            asr_backend=asr_backend_key,
            asr_latency_ms=None,
            rule_id=None,
            matched_keyword=None,
            tts_status="失败",
            runtime_note=None,
        )
        return (
            "",
            "",
            no_match_label,
            no_match_label,
            "",
            f"处理失败:{exc}",
            None,
            failure_summary,
        )

    asr_result = outcome.asr_result
    match = outcome.match_result
    tts_preview = outcome.tts_preview

    reply_value = match.reply
    if not reply_value:
        reply_value = "" if match.matched else "当前未命中规则。"
    status_text = _format_tts_status(tts_preview, match.matched, finalized=True)
    summary = _format_run_summary(
        run_id=outcome.run_id,
        input_mode=outcome.input_mode,
        asr_backend=asr_result.backend,
        asr_latency_ms=asr_result.latency_ms,
        rule_id=match.rule_id,
        matched_keyword=match.matched_keyword,
        tts_status=status_text,
        runtime_note=asr_result.runtime_note,
    )
    reply_audio_path = None if tts_preview is None else tts_preview.audio_path
    return (
        asr_result.transcript,
        asr_result.partial_transcript,
        match.matched_keyword or no_match_label,
        match.rule_id or no_match_label,
        reply_value,
        status_text,
        reply_audio_path,
        summary,
    )
def _preview_live_microphone_stream_for_space(
    audio_chunk,
    stream_state,
    asr_backend_key,
    rule_rows,
    *,
    asr_adapters,
):
    """Feed one microphone chunk into the streaming preview and refresh the UI.

    Args:
        audio_chunk: The newest chunk delivered by the streaming audio input.
        stream_state: Current streaming-turn state (may be falsy).
        asr_backend_key: Key of the preview ASR adapter to use.
        rule_rows: Rule-editor rows converted into the session rules.
        asr_adapters: Mapping of backend key to preview ASR adapter.

    Returns:
        A 9-tuple: state, transcript, partial transcript, matched keyword,
        matched rule id, reply preview, status text, auto-reply audio path
        and a Markdown summary. On any exception the previous state (or a
        fresh recording state) is echoed back together with the error text.
    """
    no_match_label = "未命中"
    active_rules = rules_from_editor_rows(rule_rows)
    try:
        new_state, asr_result, match = preview_streaming_turn(
            state=stream_state,
            audio_chunk=audio_chunk,
            rules=active_rules,
            audio_asr=asr_adapters[asr_backend_key],
        )
    except Exception as exc:
        fallback_state = stream_state or create_streaming_turn_state(recording_active=True)
        error_summary = _format_stream_preview_summary(state=fallback_state, error=str(exc))
        return (
            fallback_state,
            fallback_state.transcript,
            fallback_state.partial_transcript,
            fallback_state.matched_keyword or no_match_label,
            fallback_state.matched_rule_id or no_match_label,
            fallback_state.matched_reply_text or "",
            f"实时识别失败:{exc}",
            fallback_state.auto_reply_audio_path,
            error_summary,
        )

    preview_reply = match.reply
    if not preview_reply:
        # Show the "no match yet" hint only once something has been transcribed.
        preview_reply = "当前尚未命中规则。" if new_state.transcript else ""
    status_text = _format_live_stream_status(new_state, asr_result=asr_result, finalized=False)
    return (
        new_state,
        new_state.transcript,
        new_state.partial_transcript,
        match.matched_keyword or no_match_label,
        match.rule_id or no_match_label,
        preview_reply,
        status_text,
        new_state.auto_reply_audio_path,
        _format_stream_preview_summary(state=new_state),
    )
def _auto_reply_live_microphone_stream_for_space(
    stream_state,
    tts_backend_key,
    tts_voice,
    *,
    tts_adapters,
):
    """Timer callback that lets the pipeline emit an automatic spoken reply.

    Args:
        stream_state: Current streaming-turn state; a fresh one is created
            when it is falsy.
        tts_backend_key: Key used to look up the TTS adapter.
        tts_voice: Voice passed through to the TTS adapter.
        tts_adapters: Mapping of backend key to TTS adapter.

    Returns:
        A 4-tuple: state, status text, auto-reply audio path and a Markdown
        summary. On any exception the incoming state is returned unchanged
        together with the error text.
    """
    current_state = stream_state or create_streaming_turn_state()
    try:
        next_state, tts_preview = maybe_auto_reply_streaming_turn(
            state=current_state,
            tts_adapter=tts_adapters.get(tts_backend_key),
            tts_voice=tts_voice,
        )
    except Exception as exc:
        return (
            current_state,
            f"自动播报失败:{exc}",
            current_state.auto_reply_audio_path,
            _format_stream_preview_summary(state=current_state, error=str(exc)),
        )

    # A non-None preview means a reply was synthesised during this tick.
    just_replied = tts_preview is not None
    status_text = _format_live_stream_status(
        next_state,
        finalized=False,
        auto_reply_triggered=just_replied,
    )
    summary = _format_stream_preview_summary(state=next_state, auto_reply_triggered=just_replied)
    return (next_state, status_text, next_state.auto_reply_audio_path, summary)
@spaces.GPU
def _finalize_live_microphone_stream_for_space(
    stream_state,
    asr_backend_key,
    tts_backend_key,
    tts_voice,
    rule_rows,
    *,
    tracker,
    manual_asr,
    asr_adapters,
    tts_adapters,
    code_version,
):
    """Finish a live microphone turn once recording stops.

    The streaming state is stopped, a tracked streaming turn is run over it,
    and a fresh state is handed back so the next recording starts clean.

    Args:
        stream_state: Streaming-turn state accumulated while recording; a
            fresh one is used when it is falsy.
        asr_backend_key: Key of the ASR adapter to use.
        tts_backend_key: Key used to look up the TTS adapter.
        tts_voice: Voice passed through to the TTS adapter.
        rule_rows: Rule-editor rows converted into the session rules.
        tracker, manual_asr, asr_adapters, tts_adapters, code_version:
            Collaborators bound by the caller and forwarded to the pipeline
            or the run metadata.

    Returns:
        A 9-tuple: new streaming state, transcript, partial transcript,
        matched keyword, matched rule id, reply text, status text, reply
        audio path (or ``None``) and a Markdown summary. Any exception from
        the pipeline is converted into a failure tuple built from the
        stopped state.
    """
    no_match_label = "未命中"
    stopped_state = stop_streaming_turn_state(stream_state or create_streaming_turn_state())
    active_rules = rules_from_editor_rows(rule_rows)
    final_run_id = make_run_id(f"stage1-live-stream-{asr_backend_key or 'manual'}")
    route = "R-001" if asr_backend_key == "funasr" else "R-002"
    final_metadata = RunMetadata(
        run_id=final_run_id,
        experiment_name="stage1-live-stream-turn",
        hypothesis_ids=["H-001", "H-002", "H-003"],
        baseline_run_id=None,
        route_id=route,
        seed=42,
        code_version=code_version,
        data_version="ui-live-microphone-stream",
        started_at=datetime.now(timezone.utc).isoformat(),
        status="running",
    )

    # Reuse the audio already produced by the auto-reply when it belongs to
    # the match that is still current; TTS is then skipped below.
    match_key = stopped_state.current_match_key
    reuse_auto_reply = bool(
        match_key
        and stopped_state.auto_reply_key == match_key
        and stopped_state.auto_reply_audio_path
    )

    try:
        final_tts_adapter = None if reuse_auto_reply else tts_adapters.get(tts_backend_key)
        outcome = run_streaming_turn(
            run_id=final_run_id,
            state=stopped_state,
            rules=active_rules,
            audio_asr=asr_adapters[asr_backend_key],
            manual_asr=manual_asr,
            tts_adapter=final_tts_adapter,
            tts_voice=tts_voice,
            tracker=tracker,
            metadata=final_metadata,
        )
    except Exception as exc:
        return (
            create_streaming_turn_state(),
            stopped_state.transcript,
            stopped_state.partial_transcript,
            stopped_state.matched_keyword or no_match_label,
            stopped_state.matched_rule_id or no_match_label,
            stopped_state.matched_reply_text or "",
            f"结束录音时处理失败:{exc}",
            stopped_state.auto_reply_audio_path,
            _format_stream_preview_summary(state=stopped_state, error=str(exc), finalized=True),
        )

    asr_result = outcome.asr_result
    match = outcome.match_result
    tts_preview = outcome.tts_preview

    reply_value = stopped_state.auto_reply_text or match.reply
    if not reply_value:
        reply_value = "" if match.matched else "当前未命中规则。"
    status_text = _format_live_stream_status(
        stopped_state,
        asr_result=asr_result,
        finalized=True,
        auto_reply_triggered=reuse_auto_reply or tts_preview is not None,
    )
    summary = _format_run_summary(
        run_id=outcome.run_id,
        input_mode=outcome.input_mode,
        asr_backend=asr_result.backend,
        asr_latency_ms=asr_result.latency_ms,
        rule_id=match.rule_id,
        matched_keyword=match.matched_keyword,
        tts_status=status_text,
        runtime_note=asr_result.runtime_note,
    )

    if reuse_auto_reply:
        reply_audio_path = stopped_state.auto_reply_audio_path
    elif tts_preview is not None:
        reply_audio_path = tts_preview.audio_path
    else:
        reply_audio_path = None

    return (
        create_streaming_turn_state(),
        asr_result.transcript,
        asr_result.partial_transcript,
        match.matched_keyword or no_match_label,
        match.rule_id or no_match_label,
        reply_value,
        status_text,
        reply_audio_path,
        summary,
    )
@spaces.GPU
def _run_stage1_smoke_for_space(
    asr_backend_key,
    tts_backend_key,
    tts_voice,
    rule_rows,
    script_ids_text,
    *,
    benchmark_csv_path,
    tracker,
    asr_adapters,
    tts_adapters,
    code_version,
):
    """Run the stage-1 integration smoke evaluation and render it as Markdown.

    Args:
        asr_backend_key: Key of the ASR adapter to exercise.
        tts_backend_key: Key of the TTS adapter; the same adapter and voice
            are passed for both the input-audio and reply roles.
        tts_voice: Voice used for both input and reply synthesis.
        rule_rows: Rule-editor rows converted into the session rules.
        script_ids_text: Raw text listing the script ids to run.
        benchmark_csv_path, tracker, asr_adapters, tts_adapters, code_version:
            Collaborators bound by the caller.

    Returns:
        The evaluation summary rendered with ``to_markdown()``.

    Raises:
        KeyError: If either backend key is missing from its adapter mapping.
    """
    active_rules = rules_from_editor_rows(rule_rows)
    chosen_script_ids = _parse_script_ids(script_ids_text)
    smoke_run_id = make_run_id(f"stage1-smoke-{asr_backend_key}")
    smoke_metadata = RunMetadata(
        run_id=smoke_run_id,
        experiment_name="stage1-synthetic-audio-smoke",
        hypothesis_ids=["H-001", "H-002", "H-003", "H-005"],
        baseline_run_id=None,
        route_id="R-001" if asr_backend_key == "funasr" else "R-002",
        seed=42,
        code_version=code_version,
        data_version=f"{benchmark_csv_path}#synthetic-audio-smoke",
        started_at=datetime.now(timezone.utc).isoformat(),
        status="running",
    )
    asr_adapter = asr_adapters[asr_backend_key]
    # One TTS adapter serves both positional TTS slots of the harness call.
    tts_adapter = tts_adapters[tts_backend_key]
    smoke_summary = evaluate_stage1_integration_smoke(
        benchmark_csv_path,
        active_rules,
        asr_adapter,
        tts_adapter,
        tts_adapter,
        script_ids=chosen_script_ids,
        tracker=tracker,
        metadata=smoke_metadata,
        input_voice=tts_voice,
        reply_voice=tts_voice,
    )
    return smoke_summary.to_markdown()
def build_app(
    benchmark_csv_path: Path = DEFAULT_BENCHMARK_CSV_PATH,
    rules_path: Path = DEFAULT_RULES_PATH,
    evaluation_contract_path: Path = DEFAULT_EVALUATION_CONTRACT_PATH,
):
    """Construct the Gradio Blocks application and wire its event handlers.

    Args:
        benchmark_csv_path: Benchmark CSV handed to the rule-preview and
            smoke-test handlers.
        rules_path: File the initial rule catalog is loaded from.
        evaluation_contract_path: File the evaluation contract is loaded from.

    Returns:
        The ``gr.Blocks`` instance; it is built but not launched here.

    Raises:
        RuntimeError: Propagated from ``_require_gradio`` when Gradio is not
            installed.
    """
    gr = _require_gradio()
    seed_rules = load_rule_catalog(rules_path)
    evaluation_contract = load_evaluation_contract(evaluation_contract_path)
    # Tracking records go to <parent of this file's directory>/results/tracking.
    tracker = LocalJsonlTracker(Path(__file__).resolve().parents[1] / "results" / "tracking")
    manual_asr = MockASRAdapter()
    # Adapters used by the @spaces.GPU-decorated handlers (upload, finalize, smoke).
    asr_adapters = {
        "funasr": FunASRSenseVoiceAdapter(),
        "faster-whisper": FasterWhisperAdapter(),
    }
    # Keep preview ASR instances separate so a ZeroGPU live-preview CPU fallback
    # does not pin the finalize/upload paths to CPU as well.
    stream_preview_asr_adapters = _build_stream_preview_asr_adapters()
    tts_adapters = {
        "edge-tts": EdgeTTSAdapter(),
        "mock": MockTTSAdapter(),
    }
    code_version = resolve_code_version(Path(__file__).resolve().parents[1])

    def apply_rules(rule_rows):
        """Round-trip the edited rows through the rule parser and refresh the status text."""
        session_rules = rules_from_editor_rows(rule_rows)
        return rules_to_table_rows(session_rules), _format_rule_status(session_rules)

    def start_live_microphone_stream():
        """Activate the auto-reply timer and reset every live-stream output field."""
        return (
            gr.Timer(value=0.25, active=True),
            create_streaming_turn_state(recording_active=True),
            "",
            "",
            "未命中",
            "未命中",
            "",
            f"实时监听中,命中规则后稳定 {int(AUTO_REPLY_DELAY_SECONDS)} 秒将自动播放回复语音。",
            None,
            _initial_live_stream_placeholder(),
        )

    # Bind the shared collaborators so each handler only receives UI inputs.
    process_turn = partial(
        _process_turn_for_space,
        tracker=tracker,
        manual_asr=manual_asr,
        asr_adapters=asr_adapters,
        tts_adapters=tts_adapters,
        code_version=code_version,
    )
    # The preview handler gets the separate preview adapter set built above.
    preview_live_stream = partial(
        _preview_live_microphone_stream_for_space,
        asr_adapters=stream_preview_asr_adapters,
    )
    auto_reply_live_stream = partial(
        _auto_reply_live_microphone_stream_for_space,
        tts_adapters=tts_adapters,
    )
    finalize_live_stream = partial(
        _finalize_live_microphone_stream_for_space,
        tracker=tracker,
        manual_asr=manual_asr,
        asr_adapters=asr_adapters,
        tts_adapters=tts_adapters,
        code_version=code_version,
    )

    def stop_live_microphone_stream(stream_state, asr_backend_key, tts_backend_key, tts_voice, rule_rows):
        """Finalize the live turn and prepend a deactivated timer to its outputs."""
        result = finalize_live_stream(stream_state, asr_backend_key, tts_backend_key, tts_voice, rule_rows)
        return (gr.Timer(value=0.25, active=False), *result)

    def preview_benchmark(rule_rows):
        """Evaluate the current session rules on the benchmark CSV and return Markdown."""
        session_rules = rules_from_editor_rows(rule_rows)
        run_id = make_run_id("stage1-rule-preview")
        metadata = RunMetadata(
            run_id=run_id,
            experiment_name="stage1-rule-only-benchmark-preview",
            hypothesis_ids=["H-002"],
            baseline_run_id=None,
            route_id="R-001",
            seed=42,
            code_version=code_version,
            data_version=str(benchmark_csv_path),
            started_at=datetime.now(timezone.utc).isoformat(),
            status="running",
        )
        summary = evaluate_internal_benchmark(
            benchmark_csv_path,
            session_rules,
            tracker=tracker,
            metadata=metadata,
        )
        return summary.to_markdown()

    run_stage1_smoke = partial(
        _run_stage1_smoke_for_space,
        benchmark_csv_path=benchmark_csv_path,
        tracker=tracker,
        asr_adapters=asr_adapters,
        tts_adapters=tts_adapters,
        code_version=code_version,
    )

    # NOTE(review): SOURCE lost its indentation, so the nesting of the layout
    # containers below is reconstructed from context — confirm which widgets
    # belong to which Row/Column/Accordion against the original file.
    with gr.Blocks(title="VoiceDirector 语音场控接话演示") as app:
        # State and timer shared by the live-stream handlers; the timer starts inactive.
        live_stream_state = gr.State(value=create_streaming_turn_state())
        auto_reply_timer = gr.Timer(value=0.25, active=False)
        gr.Markdown("# VoiceDirector 语音场控接话演示")
        gr.Markdown("**实时语音接话演示**:连续麦克风转写、自动规则匹配、自动回复语音。")
        gr.Markdown(f"提示:{MINIMAL_UI_NOTICE}")
        with gr.Row():
            # Narrower column: inputs, backend selectors and per-turn outputs.
            with gr.Column(scale=5):
                gr.Markdown(
                    f"### 主路径:麦克风连续流式识别\n点击麦克风开始说话,识别文本会持续刷新;命中规则稳定 {int(AUTO_REPLY_DELAY_SECONDS)} 秒后会自动播放回复语音。"
                )
                live_audio_input = gr.Audio(
                    label="实时麦克风输入(连续流式)",
                    sources=["microphone"],
                    type="numpy",
                    streaming=True,
                )
                asr_backend = gr.Dropdown(
                    choices=[
                        ("FunASR / SenseVoice(主路径)", "funasr"),
                        ("faster-whisper(兜底)", "faster-whisper"),
                    ],
                    value="funasr",
                    label="识别后端",
                )
                tts_backend = gr.Dropdown(
                    choices=[("edge-tts(真实语音)", "edge-tts"), ("mock(调试预览)", "mock")],
                    value="edge-tts",
                    label="回复语音后端",
                )
                tts_voice = gr.Dropdown(
                    choices=[(voice, voice) for voice in AVAILABLE_EDGE_TTS_VOICES],
                    value=DEFAULT_EDGE_TTS_VOICE,
                    label="回复音色",
                )
                # NOTE(review): assumed that the upload widget, manual text box and
                # its button all sit inside this accordion — confirm.
                with gr.Accordion("上传音频 / 手工输入兜底", open=False):
                    upload_audio_input = gr.Audio(
                        label="上传音频兜底",
                        sources=["upload"],
                        type="filepath",
                        format="wav",
                    )
                    manual_text = gr.Textbox(
                        label="手工文本兜底",
                        placeholder="没有音频时,可直接输入中文文本做规则演示。",
                        lines=3,
                    )
                    run_turn_button = gr.Button("执行上传/手工兜底", variant="secondary")
                # Output fields shared by the upload path and the live-stream path.
                recognized_text = gr.Textbox(label="识别文本")
                partial_text = gr.Textbox(label="实时转写预览")
                matched_keyword = gr.Textbox(label="命中关键词")
                matched_rule = gr.Textbox(label="命中规则")
                reply_text = gr.Textbox(label="回复文本", lines=3)
                tts_status = gr.Textbox(label="当前状态")
                reply_audio = gr.Audio(label="回复语音", interactive=False, type="filepath", autoplay=True)
                run_summary = gr.Markdown(value=_format_stream_summary_placeholder()) if False else gr.Markdown(value=_initial_live_stream_placeholder())
            # Wider column: session rule editor and the fixed evaluation contract.
            with gr.Column(scale=7):
                rule_table = gr.Dataframe(
                    headers=TABLE_HEADERS,
                    datatype=["str", "str", "str", "str"],
                    row_count=(len(seed_rules), "fixed"),
                    column_count=(len(TABLE_HEADERS), "fixed"),
                    value=rules_to_table_rows(seed_rules),
                    label="规则编辑表(当前会话)",
                    interactive=True,
                )
                apply_button = gr.Button("应用规则编辑")
                rule_status = gr.Textbox(label="规则编辑状态", value=_format_rule_status(seed_rules))
                gr.Markdown(_format_contract(evaluation_contract))
        # NOTE(review): assumed this accordion is a direct child of the Blocks
        # root (below the row) rather than of the right-hand column — confirm.
        with gr.Accordion("基准与烟测面板", open=False):
            gr.Markdown(
                "下面的规则预览保留了内部基准可见性。端到端烟测会使用 edge-tts 生成输入音频来打通真实 ASR/TTS 代码路径。"
            )
            benchmark_button = gr.Button("预览规则基准")
            benchmark_output = gr.Markdown(value=_initial_benchmark_placeholder(benchmark_csv_path))
            script_ids = gr.Textbox(
                label="烟测脚本 ID",
                value=", ".join(DEFAULT_STAGE1_SMOKE_SCRIPT_IDS),
                lines=2,
            )
            smoke_asr_backend = gr.Dropdown(
                choices=[
                    ("FunASR / SenseVoice(主路径)", "funasr"),
                    ("faster-whisper(兜底)", "faster-whisper"),
                ],
                value="funasr",
                label="烟测识别后端",
            )
            smoke_tts_backend = gr.Dropdown(
                choices=[("edge-tts(真实语音)", "edge-tts"), ("mock(调试预览)", "mock")],
                value="edge-tts",
                label="烟测语音后端",
            )
            smoke_tts_voice = gr.Dropdown(
                choices=[(voice, voice) for voice in AVAILABLE_EDGE_TTS_VOICES],
                value=DEFAULT_EDGE_TTS_VOICE,
                label="烟测音色",
            )
            stage1_smoke_button = gr.Button("执行端到端烟测")
            stage1_smoke_output = gr.Markdown(value=_initial_stage1_smoke_placeholder(benchmark_csv_path))
        # Upload / manual-text fallback path.
        run_turn_button.click(
            process_turn,
            inputs=[upload_audio_input, manual_text, asr_backend, tts_backend, tts_voice, rule_table],
            outputs=[recognized_text, partial_text, matched_keyword, matched_rule, reply_text, tts_status, reply_audio, run_summary],
        )
        # Recording start: activate the timer and clear the output fields.
        live_audio_input.start_recording(
            start_live_microphone_stream,
            outputs=[auto_reply_timer, live_stream_state, recognized_text, partial_text, matched_keyword, matched_rule, reply_text, tts_status, reply_audio, run_summary],
            queue=False,
            show_progress="hidden",
        )
        # The stream and timer events share concurrency_id "live-stream-session"
        # with concurrency_limit=1; their handles are kept so stop/pause can cancel them.
        stream_event = live_audio_input.stream(
            preview_live_stream,
            inputs=[live_audio_input, live_stream_state, asr_backend, rule_table],
            outputs=[live_stream_state, recognized_text, partial_text, matched_keyword, matched_rule, reply_text, tts_status, reply_audio, run_summary],
            show_progress="hidden",
            trigger_mode="always_last",
            concurrency_limit=1,
            concurrency_id="live-stream-session",
            stream_every=0.75,
        )
        auto_reply_event = auto_reply_timer.tick(
            auto_reply_live_stream,
            inputs=[live_stream_state, tts_backend, tts_voice],
            outputs=[live_stream_state, tts_status, reply_audio, run_summary],
            show_progress="hidden",
            trigger_mode="always_last",
            concurrency_limit=1,
            concurrency_id="live-stream-session",
        )
        # Stop and pause are wired identically: cancel the in-flight preview and
        # timer events, then finalize the turn.
        live_audio_input.stop_recording(
            stop_live_microphone_stream,
            inputs=[live_stream_state, asr_backend, tts_backend, tts_voice, rule_table],
            outputs=[auto_reply_timer, live_stream_state, recognized_text, partial_text, matched_keyword, matched_rule, reply_text, tts_status, reply_audio, run_summary],
            show_progress="minimal",
            cancels=[stream_event, auto_reply_event],
        )
        live_audio_input.pause_recording(
            stop_live_microphone_stream,
            inputs=[live_stream_state, asr_backend, tts_backend, tts_voice, rule_table],
            outputs=[auto_reply_timer, live_stream_state, recognized_text, partial_text, matched_keyword, matched_rule, reply_text, tts_status, reply_audio, run_summary],
            show_progress="minimal",
            cancels=[stream_event, auto_reply_event],
        )
        apply_button.click(apply_rules, inputs=[rule_table], outputs=[rule_table, rule_status])
        benchmark_button.click(preview_benchmark, inputs=[rule_table], outputs=[benchmark_output])
        stage1_smoke_button.click(
            run_stage1_smoke,
            inputs=[smoke_asr_backend, smoke_tts_backend, smoke_tts_voice, rule_table, script_ids],
            outputs=[stage1_smoke_output],
        )
    return app
def _format_contract(contract: dict) -> str:
h003 = contract["h003_mos"]
notes = contract["evaluation_constraints"]
return "\n".join(
[
"## 固定评测约束",
f"- H-003 MOS 阈值:{h003['threshold']}/{h003['scale_max']}",
f"- 最少评审人数:{h003['minimum_raters']}",
f"- H-004 正式验证阶段:{notes['h004_formal_validation_phase']}",
f"- H-002 适用范围:{notes['h002_precision_scope']}",
]
)
def _format_rule_status(session_rules) -> str:
return f"当前会话已加载 {len(session_rules)} 条规则,编辑结果会立即在本次会话生效。"
def _initial_benchmark_placeholder(benchmark_csv_path: Path) -> str:
return "\n".join(
[
"### 规则基准已就绪",
f"- 数据集:{benchmark_csv_path}",
"- 范围:仅内部基准",
"- 用途:查看规则与文本基准,不代表真实语音识别表现。",
]
)
def _initial_stage1_smoke_placeholder(benchmark_csv_path: Path) -> str:
return "\n".join(
[
"### 端到端烟测已就绪",
f"- 数据集:{benchmark_csv_path}",
"- 输入音频来源:edge-tts 根据脚本文本合成",
"- 用途:检查真实 ASR/TTS 链路是否跑通。",
]
)
def _initial_live_stream_placeholder() -> str:
    """Return the Markdown shown in the live-stream summary before any audio arrives.

    The text embeds the auto-reply delay (truncated to whole seconds) and the
    module-wide UI notice.
    """
    delay_seconds = int(AUTO_REPLY_DELAY_SECONDS)
    heading = "### 实时监听已就绪"
    bullets = (
        "- 点击麦克风开始说话,识别文本会持续刷新。",
        f"- 命中规则稳定 {delay_seconds} 秒后,会自动播放回复语音。",
        "- 上传音频 / 手工输入兜底仍可用。",
        f"- 说明:{MINIMAL_UI_NOTICE}",
    )
    return "\n".join((heading, *bullets))
def _parse_script_ids(raw_value: str | None) -> list[str]:
if not raw_value:
return list(DEFAULT_STAGE1_SMOKE_SCRIPT_IDS)
return [item.strip() for item in raw_value.replace("\n", ",").split(",") if item.strip()]
def _format_tts_status(tts_preview, matched: bool, *, finalized: bool) -> str:
if tts_preview is None:
return "录音结束,当前未生成回复语音。" if finalized and not matched else "当前未生成回复语音。"
status = f"已生成回复语音({tts_preview.latency_ms} ms)"
if tts_preview.runtime_note:
status = f"{status}{tts_preview.runtime_note}"
return status
def _format_run_summary(
    *,
    run_id: str,
    input_mode: str,
    asr_backend: str | None,
    asr_latency_ms: int | None,
    rule_id: str | None,
    matched_keyword: str | None,
    tts_status: str,
    runtime_note: str | None,
) -> str:
    """Render the per-turn result panel as Markdown.

    Args:
        run_id: Identifier of the tracked run.
        input_mode: ``"audio"`` is shown as audio input; any other value is
            shown as manual text.
        asr_backend: Backend name, or a falsy value for manual text.
        asr_latency_ms: Recognition latency, or ``None`` when unknown.
        rule_id: Matched rule id, or a falsy value when nothing matched.
        matched_keyword: Matched keyword, or a falsy value when nothing matched.
        tts_status: Already formatted reply-speech status.
        runtime_note: Optional extra note appended as a final bullet.

    Returns:
        A heading plus bullet lines joined with newlines.
    """
    no_match_label = "未命中"
    if asr_latency_ms is None:
        latency_text = "未知"
    else:
        latency_text = f"{asr_latency_ms} ms"
    if input_mode == "audio":
        input_mode_text = "音频输入"
    else:
        input_mode_text = "手工文本"

    summary_lines = [
        "### 本轮处理结果",
        f"- 运行 ID:{run_id}",
        f"- 输入方式:{input_mode_text}",
        f"- 识别后端:{asr_backend or '手工文本'}",
        f"- 最新识别延迟:{latency_text}",
        f"- 命中规则:{rule_id or no_match_label}",
        f"- 命中关键词:{matched_keyword or no_match_label}",
        f"- 回复语音状态:{tts_status}",
        f"- 说明:{MINIMAL_UI_NOTICE}",
    ]
    if runtime_note:
        summary_lines += [f"- 运行提示:{runtime_note}"]
    return "\n".join(summary_lines)
def _format_live_stream_status(state, *, asr_result=None, finalized: bool, auto_reply_triggered: bool = False) -> str:
if finalized:
status = "录音已结束。"
if state.current_match_key and (state.auto_reply_key == state.current_match_key or auto_reply_triggered):
status = "录音已结束,已保留当前自动回复语音。"
elif state.current_match_key:
status = "录音已结束,已完成本轮规则匹配。"
else:
status = "录音已结束,当前未命中规则。"
elif state.auto_reply_key == state.current_match_key and state.auto_reply_audio_path:
status = f"已自动播放规则 {state.matched_rule_id or '未命中'} 的回复语音,继续说话可触发新的规则。"
elif state.current_match_key:
status = f"已命中规则 {state.matched_rule_id or '未命中'},稳定 {int(AUTO_REPLY_DELAY_SECONDS)} 秒后将自动播放回复语音。"
elif state.transcript:
status = "实时转写中,当前尚未命中规则。"
else:
status = "实时监听中,请开始说话。"
effective_asr_result = asr_result
if effective_asr_result is None and state.asr_latency_ms is not None:
latency_text = f"最新识别延迟 {state.asr_latency_ms} ms。"
status = f"{status} {latency_text}"
elif effective_asr_result is not None:
status = f"{status} 最新识别延迟 {effective_asr_result.latency_ms} ms。"
runtime_note = effective_asr_result.runtime_note if effective_asr_result is not None else state.runtime_note
if runtime_note:
status = f"{status} {runtime_note}"
return status
def _format_stream_preview_summary(*, state, error: str | None = None, finalized: bool = False, auto_reply_triggered: bool = False) -> str:
    """Render the live-stream status panel as Markdown.

    Args:
        state: Streaming state exposing ``chunk_count``, ``sample_rate``,
            ``asr_backend``, ``asr_latency_ms``, ``matched_rule_id``,
            ``matched_keyword``, ``current_match_key``, ``auto_reply_key``,
            ``auto_reply_audio_path`` and ``runtime_note``.
        error: Error text; when truthy a short error panel is returned
            instead of the full status panel.
        finalized: Whether recording has ended.
        auto_reply_triggered: Whether a reply was produced for this call.

    Returns:
        A heading plus bullet lines joined with newlines.
    """
    heading = "### 实时状态"
    chunk_line = f"- 已接收音频片段:{state.chunk_count}"
    notice_line = f"- 说明:{MINIMAL_UI_NOTICE}"
    if error:
        return "\n".join([heading, chunk_line, f"- 错误:{error}", notice_line])

    # Highest-priority condition first: finalized-with-match beats a fresh
    # auto reply, which beats the state-derived description.
    if finalized and state.current_match_key:
        auto_reply_status = "录音结束,已完成本轮处理"
    elif auto_reply_triggered:
        auto_reply_status = "刚刚完成自动播放"
    elif state.auto_reply_key == state.current_match_key and state.auto_reply_audio_path:
        auto_reply_status = "已自动播放"
    elif state.current_match_key:
        auto_reply_status = f"等待稳定 {int(AUTO_REPLY_DELAY_SECONDS)} 秒"
    else:
        auto_reply_status = "当前未触发"

    if state.asr_latency_ms is None:
        latency_text = "未知"
    else:
        latency_text = f"{state.asr_latency_ms} ms"

    panel = [
        heading,
        chunk_line,
        f"- 当前采样率:{state.sample_rate or '未知'} Hz",
        f"- 识别后端:{state.asr_backend or '尚未开始'}",
        f"- 最新识别延迟:{latency_text}",
        f"- 当前命中规则:{state.matched_rule_id or '未命中'}",
        f"- 当前命中关键词:{state.matched_keyword or '未命中'}",
        f"- 自动播报状态:{auto_reply_status}",
        notice_line,
    ]
    if state.runtime_note:
        panel += [f"- 运行提示:{state.runtime_note}"]
    return "\n".join(panel)
def _require_gradio():
try:
import gradio as gr
except ImportError as exc:
raise RuntimeError("未安装 Gradio,请先执行 `pip install -r requirements.txt`。") from exc
return gr