jarvis / scripts /run_sim_acceptance.py
Jonathan Haas
feat: complete wave 113 no-hardware reliability expansion
b28a7b2
Raw
History Blame Contribute Delete
7.59 kB
#!/usr/bin/env python
from __future__ import annotations
import argparse
import json
import subprocess
import time
from pathlib import Path
def _phase_commands(profile: str) -> list[tuple[str, list[str]]]:
phases: list[tuple[str, list[str]]] = [
("sim_stack_baseline", ["./scripts/test_sim.sh"]),
(
"voice_loop_edges",
[
"uv",
"run",
"pytest",
"-q",
"tests/test_runtime_conversation.py",
"tests/test_main_audio.py",
"-k",
"barge_in or tts_barge_in_soak_harness_stability",
],
),
(
"voice_repair_confirmation",
[
"uv",
"run",
"pytest",
"-q",
"tests/test_main_lifecycle.py",
"-k",
(
"requires_stt_repair or "
"requires_confirmation_respects_voice_profile_confirmation_mode or "
"followup_carryover"
),
],
),
(
"autonomy_checkpoint_edges",
[
"uv",
"run",
"pytest",
"-q",
"tests/test_tools_services.py",
"-k",
(
"planner_engine_autonomy_cycle_requires_checkpoint_then_executes or "
"home_orchestrator_automation_pipeline_local_apply_and_rollback"
),
],
),
]
if profile == "full":
phases.extend(
[
(
"operator_contract_edges",
[
"uv",
"run",
"pytest",
"-q",
"tests/test_tools_services.py",
"-k",
(
"system_status_contract_reports_expected_fields or "
"system_status_reports_snapshot or "
"identity_guest_session_capability_enforced"
),
],
),
(
"recovery_replay_edges",
[
"uv",
"run",
"pytest",
"-q",
"tests/test_tools_services.py",
"-k",
(
"dead_letter_queue_captures_webhook_failure_and_replays or "
"bind_reconciles_interrupted_recovery_entries"
),
],
),
]
)
return phases
def _run_phase(name: str, command: list[str]) -> dict[str, object]:
started_at = time.time()
started_mono = time.monotonic()
proc = subprocess.run(command, capture_output=True, text=True)
finished_at = time.time()
return {
"phase": name,
"command": command,
"started_at": started_at,
"finished_at": finished_at,
"duration_sec": time.monotonic() - started_mono,
"exit_code": proc.returncode,
"status": "passed" if proc.returncode == 0 else "failed",
"stdout_tail": proc.stdout[-4000:],
"stderr_tail": proc.stderr[-2000:],
}
def _artifact_checks(
results: list[dict[str, object]],
*,
expected_phase_count_per_cycle: int,
repeat: int,
) -> dict[str, object]:
names = [str(row.get("phase", "")) for row in results if str(row.get("phase", "")).strip()]
valid_status = all(str(row.get("status", "")) in {"passed", "failed"} for row in results)
has_timestamps = all(
isinstance(row.get("started_at"), float) and isinstance(row.get("finished_at"), float)
for row in results
)
cycle_phase_counts: dict[int, int] = {}
for row in results:
try:
cycle = int(row.get("cycle", 1))
except (TypeError, ValueError):
cycle = 1
cycle_phase_counts[cycle] = cycle_phase_counts.get(cycle, 0) + 1
return {
"phase_names": names,
"all_status_valid": valid_status,
"all_timestamps_present": has_timestamps,
"expected_phase_count_per_cycle": expected_phase_count_per_cycle,
"expected_total_phase_count": expected_phase_count_per_cycle * max(1, repeat),
"cycle_phase_counts": {str(cycle): count for cycle, count in sorted(cycle_phase_counts.items())},
}
def main() -> int:
parser = argparse.ArgumentParser(description="Run simulation-first acceptance profile checks.")
parser.add_argument("--profile", choices=("fast", "full"), default="fast")
parser.add_argument("--repeat", type=int, default=1, help="Run full phase set this many cycles.")
parser.add_argument(
"--output",
default=".artifacts/quality/sim-acceptance-fast-repeat1.json",
help="JSON artifact path",
)
args = parser.parse_args()
if args.repeat <= 0:
raise SystemExit("--repeat must be >= 1.")
phase_plan = _phase_commands(args.profile)
results: list[dict[str, object]] = []
failed = False
for cycle in range(1, args.repeat + 1):
for phase_index, (name, command) in enumerate(phase_plan, start=1):
result = _run_phase(name, command)
result["cycle"] = cycle
result["cycle_phase_index"] = phase_index
results.append(result)
print(
f"[sim-acceptance] cycle {cycle}/{args.repeat} {name}: "
f"{result['status']} ({result['duration_sec']:.2f}s)"
)
if int(result["exit_code"]) != 0:
failed = True
break
if failed:
break
phase_count_per_cycle = len(phase_plan)
cycle_phase_counts: dict[int, int] = {}
cycle_failed: dict[int, bool] = {}
for row in results:
cycle = int(row.get("cycle", 1) or 1)
cycle_phase_counts[cycle] = cycle_phase_counts.get(cycle, 0) + 1
if int(row.get("exit_code", 1)) != 0:
cycle_failed[cycle] = True
cycles_completed = sum(
1
for cycle in range(1, args.repeat + 1)
if cycle_phase_counts.get(cycle, 0) == phase_count_per_cycle
and not cycle_failed.get(cycle, False)
)
accepted = cycles_completed == args.repeat and all(int(row.get("exit_code", 1)) == 0 for row in results)
summary = {
"profile": args.profile,
"repeat": args.repeat,
"cycles_completed": cycles_completed,
"phase_count": len(results),
"passed_count": sum(1 for row in results if row.get("status") == "passed"),
"failed_count": sum(1 for row in results if row.get("status") != "passed"),
"accepted": accepted,
"expected_phase_count": phase_count_per_cycle,
"artifact_checks": _artifact_checks(
results,
expected_phase_count_per_cycle=phase_count_per_cycle,
repeat=args.repeat,
),
"results": results,
"generated_at": time.time(),
}
out_path = Path(args.output)
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
print(json.dumps(summary, indent=2))
return 0 if bool(summary["accepted"]) else 1
if __name__ == "__main__":
raise SystemExit(main())