Spaces:

EvalOps
/

jarvis

Running

jarvis / scripts /run_sim_acceptance.py

Jonathan Haas

feat: complete wave 113 no-hardware reliability expansion

b28a7b2 4 months ago

7.59 kB

	#!/usr/bin/env python
	from __future__ import annotations

	import argparse
	import json
	import subprocess
	import time
	from pathlib import Path


	def _phase_commands(profile: str) -> list[tuple[str, list[str]]]:
	    """Build the ordered acceptance plan for ``profile``.

	    Each entry is a ``(phase_name, argv)`` pair. Four base phases are always
	    returned; the ``"full"`` profile appends two more. Any other profile
	    value yields only the base phases.
	    """

	    def pytest_phase(label: str, targets: list[str], selector: str) -> tuple[str, list[str]]:
	        # Every pytest phase is a quiet ``uv run pytest`` call over the given
	        # test files, narrowed by a ``-k`` selection expression.
	        return label, ["uv", "run", "pytest", "-q", *targets, "-k", selector]

	    plan: list[tuple[str, list[str]]] = [
	        ("sim_stack_baseline", ["./scripts/test_sim.sh"]),
	        pytest_phase(
	            "voice_loop_edges",
	            ["tests/test_runtime_conversation.py", "tests/test_main_audio.py"],
	            "barge_in or tts_barge_in_soak_harness_stability",
	        ),
	        pytest_phase(
	            "voice_repair_confirmation",
	            ["tests/test_main_lifecycle.py"],
	            "requires_stt_repair or "
	            "requires_confirmation_respects_voice_profile_confirmation_mode or "
	            "followup_carryover",
	        ),
	        pytest_phase(
	            "autonomy_checkpoint_edges",
	            ["tests/test_tools_services.py"],
	            "planner_engine_autonomy_cycle_requires_checkpoint_then_executes or "
	            "home_orchestrator_automation_pipeline_local_apply_and_rollback",
	        ),
	    ]
	    if profile != "full":
	        return plan

	    # Phases that only run under the "full" profile.
	    plan.append(
	        pytest_phase(
	            "operator_contract_edges",
	            ["tests/test_tools_services.py"],
	            "system_status_contract_reports_expected_fields or "
	            "system_status_reports_snapshot or "
	            "identity_guest_session_capability_enforced",
	        )
	    )
	    plan.append(
	        pytest_phase(
	            "recovery_replay_edges",
	            ["tests/test_tools_services.py"],
	            "dead_letter_queue_captures_webhook_failure_and_replays or "
	            "bind_reconciles_interrupted_recovery_entries",
	        )
	    )
	    return plan


	def _run_phase(name: str, command: list[str]) -> dict[str, object]:
	    """Run one acceptance phase as a subprocess and return its result record.

	    Args:
	        name: Phase label stored under ``"phase"`` in the record.
	        command: Argument vector passed to ``subprocess.run`` (no shell).

	    Returns:
	        A dict with the phase name, the command, wall-clock start/finish
	        timestamps, the monotonic duration in seconds, the exit code, a
	        ``"passed"``/``"failed"`` status, and the last 4000 characters of
	        stdout and last 2000 characters of stderr.

	    If the command cannot be launched at all (``OSError``, e.g. the
	    executable is missing or not executable) the phase is reported as failed
	    instead of propagating the exception, so the caller can still emit its
	    artifact. Following shell convention the exit code is 127 for a missing
	    executable and 126 for any other launch error; the error text is placed
	    in ``stderr_tail``.
	    """
	    started_at = time.time()
	    started_mono = time.monotonic()
	    try:
	        # errors="replace" keeps undecodable child output from raising
	        # UnicodeDecodeError while the streams are being collected.
	        proc = subprocess.run(command, capture_output=True, text=True, errors="replace")
	    except OSError as exc:
	        exit_code = 127 if isinstance(exc, FileNotFoundError) else 126
	        stdout = ""
	        stderr = f"{type(exc).__name__}: {exc}"
	    else:
	        exit_code = proc.returncode
	        stdout = proc.stdout
	        stderr = proc.stderr
	    finished_at = time.time()
	    return {
	        "phase": name,
	        "command": command,
	        "started_at": started_at,
	        "finished_at": finished_at,
	        "duration_sec": time.monotonic() - started_mono,
	        "exit_code": exit_code,
	        "status": "passed" if exit_code == 0 else "failed",
	        "stdout_tail": stdout[-4000:],
	        "stderr_tail": stderr[-2000:],
	    }


	def _artifact_checks(
	    results: list[dict[str, object]],
	    *,
	    expected_phase_count_per_cycle: int,
	    repeat: int,
	) -> dict[str, object]:
	    """Summarise structural sanity checks over the recorded phase rows.

	    Args:
	        results: Phase result rows, one dict per executed phase.
	        expected_phase_count_per_cycle: Number of phases planned per cycle.
	        repeat: Requested cycle count; values below 1 are treated as 1 when
	            computing the expected total.

	    Returns:
	        A dict holding the non-blank phase names in row order, whether every
	        status is ``"passed"`` or ``"failed"``, whether every row has float
	        ``started_at``/``finished_at`` values, the expected per-cycle and
	        total phase counts, and the number of rows seen per cycle (keyed by
	        the cycle number as a string, in ascending numeric order).
	    """
	    allowed_statuses = {"passed", "failed"}
	    phase_names: list[str] = []
	    statuses_ok = True
	    timestamps_ok = True
	    rows_per_cycle: dict[int, int] = {}

	    for entry in results:
	        label = str(entry.get("phase", ""))
	        if label.strip():
	            phase_names.append(label)

	        if str(entry.get("status", "")) not in allowed_statuses:
	            statuses_ok = False

	        start_is_float = isinstance(entry.get("started_at"), float)
	        finish_is_float = isinstance(entry.get("finished_at"), float)
	        if not start_is_float or not finish_is_float:
	            timestamps_ok = False

	        # Rows whose cycle is missing or not int-convertible count toward cycle 1.
	        try:
	            cycle_no = int(entry.get("cycle", 1))
	        except (TypeError, ValueError):
	            cycle_no = 1
	        rows_per_cycle[cycle_no] = rows_per_cycle.get(cycle_no, 0) + 1

	    effective_repeat = max(1, repeat)
	    return {
	        "phase_names": phase_names,
	        "all_status_valid": statuses_ok,
	        "all_timestamps_present": timestamps_ok,
	        "expected_phase_count_per_cycle": expected_phase_count_per_cycle,
	        "expected_total_phase_count": expected_phase_count_per_cycle * effective_repeat,
	        "cycle_phase_counts": {str(key): rows_per_cycle[key] for key in sorted(rows_per_cycle)},
	    }


	def main() -> int:
	    """Run the selected acceptance profile and write a JSON summary artifact.

	    Command-line flags: ``--profile`` (``fast`` or ``full``), ``--repeat``
	    (number of cycles, must be >= 1) and ``--output`` (artifact path).
	    Phases run in plan order, cycle by cycle, and execution stops at the
	    first phase whose exit code is non-zero. The summary is written to the
	    output path (parent directories are created) and printed to stdout.

	    Returns:
	        ``0`` when every requested cycle ran all of its phases with exit
	        code 0, otherwise ``1``.

	    Raises:
	        SystemExit: if ``--repeat`` is less than 1.
	    """
	    cli = argparse.ArgumentParser(description="Run simulation-first acceptance profile checks.")
	    cli.add_argument("--profile", default="fast", choices=("fast", "full"))
	    cli.add_argument("--repeat", default=1, type=int, help="Run full phase set this many cycles.")
	    cli.add_argument(
	        "--output",
	        help="JSON artifact path",
	        default=".artifacts/quality/sim-acceptance-fast-repeat1.json",
	    )
	    options = cli.parse_args()
	    repeat = options.repeat
	    if repeat < 1:
	        raise SystemExit("--repeat must be >= 1.")

	    plan = _phase_commands(options.profile)
	    phases_per_cycle = len(plan)
	    records: list[dict[str, object]] = []

	    for cycle_no in range(1, repeat + 1):
	        for position, (label, argv) in enumerate(plan, start=1):
	            record = _run_phase(label, argv)
	            record["cycle"] = cycle_no
	            record["cycle_phase_index"] = position
	            records.append(record)
	            print(
	                f"[sim-acceptance] cycle {cycle_no}/{repeat} {label}: "
	                f"{record['status']} ({record['duration_sec']:.2f}s)"
	            )
	            if int(record["exit_code"]) != 0:
	                break
	        else:
	            # Inner loop finished without a failure: move on to the next cycle.
	            continue
	        # A phase failed: abandon the remaining cycles as well.
	        break

	    # Group the recorded rows by the cycle they belong to.
	    rows_by_cycle: dict[int, list[dict[str, object]]] = {}
	    for record in records:
	        rows_by_cycle.setdefault(int(record.get("cycle", 1) or 1), []).append(record)

	    def cycle_is_clean(number: int) -> bool:
	        # A cycle counts as completed when it ran every planned phase and
	        # none of them exited non-zero.
	        rows = rows_by_cycle.get(number, [])
	        if len(rows) != phases_per_cycle:
	            return False
	        return all(int(row.get("exit_code", 1)) == 0 for row in rows)

	    cycles_completed = len([number for number in range(1, repeat + 1) if cycle_is_clean(number)])
	    any_nonzero_exit = any(int(record.get("exit_code", 1)) != 0 for record in records)
	    accepted = cycles_completed == repeat and not any_nonzero_exit

	    passed_count = sum(1 for record in records if record.get("status") == "passed")
	    checks = _artifact_checks(
	        records,
	        expected_phase_count_per_cycle=phases_per_cycle,
	        repeat=repeat,
	    )
	    summary = {
	        "profile": options.profile,
	        "repeat": repeat,
	        "cycles_completed": cycles_completed,
	        "phase_count": len(records),
	        "passed_count": passed_count,
	        "failed_count": len(records) - passed_count,
	        "accepted": accepted,
	        "expected_phase_count": phases_per_cycle,
	        "artifact_checks": checks,
	        "results": records,
	        "generated_at": time.time(),
	    }

	    # Serialise once; the same text goes to the artifact file and to stdout.
	    rendered = json.dumps(summary, indent=2)
	    artifact_path = Path(options.output)
	    artifact_path.parent.mkdir(parents=True, exist_ok=True)
	    artifact_path.write_text(rendered, encoding="utf-8")
	    print(rendered)

	    if accepted:
	        return 0
	    return 1


	if __name__ == "__main__":
	    # Use main()'s return value (0 when accepted, 1 otherwise) as the process exit status.
	    exit_status = main()
	    raise SystemExit(exit_status)