""" experiments/benchmark_phase2.py ================================ Phase 2 benchmark: 100 CIM simulation trials with full agent loop. Validates: ≥50% measurement reduction vs 64×64 dense baseline (4096 points) ≥90% success rate (reaching target (1,1) charge state) Usage: python experiments/benchmark_phase2.py --n-trials 100 --budget 2048 python experiments/benchmark_phase2.py --fast # 10 trials for quick testing python experiments/benchmark_phase2.py --skip-missing-checkpoints # run without trained models Outputs: - Summary report printed to stdout - Detailed per-trial logs saved to --out directory - CSV with metrics for each trial """ from __future__ import annotations import argparse import json import time from pathlib import Path from typing import List, Dict, Any import numpy as np # Phase 0 types and state from qdot.core.types import ChargeLabel, TuningStage from qdot.core.state import ExperimentState from qdot.core.governance import GovernanceLogger from qdot.core.hitl import HITLManager, HITLOutcome # Phase 0 hardware from qdot.simulator.cim import CIMSimulatorAdapter from qdot.hardware.safety import SafetyCritic # Phase 1 perception from qdot.perception.dqc import DQCGatekeeper from qdot.perception.inspector import InspectionAgent from qdot.perception.classifier import EnsembleCNN from qdot.perception.ood import MahalanobisOOD # Phase 2 agent from qdot.agent.executive import ExecutiveAgent def main(): parser = argparse.ArgumentParser(description="Phase 2 benchmark") parser.add_argument("--n-trials", type=int, default=100, help="Number of simulation trials (default: 100)") parser.add_argument("--budget", type=int, default=8192, help="Measurement budget per trial safety cap (default: 8192)") parser.add_argument("--max-steps", type=int, default=100, help="Max control steps per trial (default: 100)") parser.add_argument("--fast", action="store_true", help="Fast mode: 10 trials, reduced budgets for CI") parser.add_argument("--profile", action="store_true", help="Enable profiling to identify bottlenecks") parser.add_argument("--skip-missing-checkpoints", action="store_true", help="Run without trained InspectionAgent (for CI)") parser.add_argument("--out", type=str, default="results/benchmark_phase2", help="Output directory for detailed logs") parser.add_argument("--seed", type=int, default=42) args = parser.parse_args() if args.fast: args.n_trials = 10 args.budget = 4096 # was 512 — needs headroom for 6-stage pipeline args.max_steps = 50 print("FAST MODE: 10 trials, 4096 budget, 50 max steps") out_dir = Path(args.out) out_dir.mkdir(parents=True, exist_ok=True) print(f"\n{'='*70}") print("PHASE 2 BENCHMARK — Agentic Tuning on CIM Simulator") print(f"{'='*70}\n") print(f"Trials: {args.n_trials}") print(f"Measurement budget: {args.budget} points") print(f"Max steps: {args.max_steps}") print(f"Target: (1,1) charge state") print(f"Dense baseline: 64×64 = 4096 points") print(f"Reduction target: ≥50% (≤2048 measurements)") print(f"Success target: ≥90% of trials\n") # Load Phase 1 components inspector = load_inspector(args.skip_missing_checkpoints) # Start profiling if requested profiler = None if args.profile: import cProfile profiler = cProfile.Profile() profiler.enable() print("⚙️ Profiling enabled\n") # Run trials np.random.seed(args.seed) results = [] for trial_idx in range(args.n_trials): print(f"[{trial_idx+1}/{args.n_trials}] ", end="", flush=True) result = run_trial( trial_idx=trial_idx, inspector=inspector, measurement_budget=args.budget, max_steps=args.max_steps, out_dir=out_dir, ) results.append(result) status = "✓" if result["success"] else "✗" print(f"{status} {result['final_stage']} | {result['total_measurements']} meas | {result['total_steps']} steps") # Aggregate metrics summary = compute_summary(results, args) # Print report print_report(summary) # Save detailed results save_results(summary, results, out_dir) # Stop profiling and print results if profiler is not None: profiler.disable() import pstats print("\n" + "="*70) print("PROFILING RESULTS — Top 20 Time Sinks") print("="*70) stats = pstats.Stats(profiler) stats.sort_stats('cumulative') stats.print_stats(20) # Exit with appropriate code if summary["success_rate"] < 0.90: print(f"\n❌ BENCHMARK FAILED: success rate {summary['success_rate']:.1%} < 90%") return 1 if summary["mean_reduction"] < 0.50: print(f"\n❌ BENCHMARK FAILED: mean reduction {summary['mean_reduction']:.1%} < 50%") return 1 print("\n✅ PHASE 2 BENCHMARK PASSED") return 0 def load_inspector(skip_missing: bool) -> InspectionAgent: """Load trained InspectionAgent or create untrained stub if skipping.""" checkpoint_dir = Path("experiments/checkpoints/phase1") if skip_missing or not checkpoint_dir.exists(): print("⚠️ Running without trained checkpoints (InspectionAgent in stub mode)\n") return InspectionAgent( ensemble=None, # Untrained — predictions will be random ood_detector=None, # OOD detection skipped ) # Load trained ensemble and OOD detector try: ensemble = EnsembleCNN.load(str(checkpoint_dir)) ood = MahalanobisOOD.load(str(checkpoint_dir / "ood_detector.pkl")) print(f"✓ Loaded trained InspectionAgent from {checkpoint_dir}\n") return InspectionAgent(ensemble=ensemble, ood_detector=ood) except Exception as e: print(f"⚠️ Failed to load checkpoints: {e}") print(" Continuing with untrained InspectionAgent\n") return InspectionAgent(ensemble=None, ood_detector=None) def run_trial( trial_idx: int, inspector: InspectionAgent, measurement_budget: int, max_steps: int, out_dir: Path, ) -> Dict[str, Any]: """Run a single tuning trial.""" # Create fresh state device_id = f"cim_trial_{trial_idx:03d}" state = ExperimentState.new( device_id=device_id, target_label=ChargeLabel.DOUBLE_DOT, ) state.config = { "measurement_budget": measurement_budget, "max_steps": max_steps, "trial_idx": trial_idx, } # CIM parameters constrained so the charge transition is within ±3 V. # Transition voltage: V_t = -E_c / lever_arm. # With lever_arm ~ 0.65 and E_c ~ 1.5 meV: # V_t ≈ -2.3 V (range: -1.8 to -2.8 V across the ± perturbations) # This matches GaAs-class device physics and is reachable within the # ±3 V voltage bounds set in ExperimentState (state.py). E_c_base = 2.5 + np.random.uniform(-0.3, 0.3) # was 1.5, lever_arm stays 0.65 t_c_base = 0.3 + np.random.uniform(-0.1, 0.1) adapter = CIMSimulatorAdapter( device_id=device_id, params={ "E_c1": E_c_base, "E_c2": E_c_base + 0.2, "t_c": t_c_base, "T": 0.08, "lever_arm": 0.65, # was 0.55; higher lever → transition closer to 0V "noise_level": 0.015, }, seed=trial_idx + 1000, ) # HITL in auto-approve test mode (no blocking) hitl = HITLManager(enabled=True) hitl.set_test_mode(auto_outcome=HITLOutcome.APPROVED) # Governance logger gov_log_dir = out_dir / "governance" / f"trial_{trial_idx:03d}" governance = GovernanceLogger(run_id=state.run_id, log_dir=str(gov_log_dir)) # Create agent agent = ExecutiveAgent( state=state, adapter=adapter, inspection_agent=inspector, hitl_manager=hitl, governance_logger=governance, max_steps=max_steps, measurement_budget=measurement_budget, ) # Run t_start = time.time() summary = agent.run() duration = time.time() - t_start # Add trial-specific info summary["trial_idx"] = trial_idx summary["duration_s"] = duration summary["device_params"] = { "E_c1": adapter.device.E_c1, "E_c2": adapter.device.E_c2, "t_c": adapter.device.t_c, } return summary def compute_summary(results: List[Dict], args) -> Dict[str, Any]: """Aggregate trial results into summary statistics.""" n = len(results) successes = sum(1 for r in results if r["success"]) measurements = [r["total_measurements"] for r in results] steps = [r["total_steps"] for r in results] reductions = [r["measurement_reduction"] for r in results] backtracks = [r["total_backtracks"] for r in results] hitl_counts = [r["hitl_events"] for r in results] dense_baseline = 64 * 64 # 4096 points return { "n_trials": n, "success_rate": successes / n if n > 0 else 0.0, "mean_measurements": float(np.mean(measurements)), "std_measurements": float(np.std(measurements)), "mean_steps": float(np.mean(steps)), "mean_reduction": float(np.mean(reductions)), "median_reduction": float(np.median(reductions)), "min_reduction": float(np.min(reductions)), "max_reduction": float(np.max(reductions)), "mean_backtracks": float(np.mean(backtracks)), "mean_hitl": float(np.mean(hitl_counts)), "dense_baseline": dense_baseline, "measurement_budget": args.budget, "max_steps": args.max_steps, "targets": { "success_rate_min": 0.90, "reduction_min": 0.50, }, } def print_report(summary: Dict[str, Any]): """Print formatted summary report.""" print(f"\n{'='*70}") print("BENCHMARK RESULTS") print(f"{'='*70}\n") success_pass = "✓" if summary["success_rate"] >= 0.90 else "✗" reduction_pass = "✓" if summary["mean_reduction"] >= 0.50 else "✗" print(f"Success rate: {summary['success_rate']:>6.1%} {success_pass} (target ≥90%)") print(f"Mean reduction: {summary['mean_reduction']:>6.1%} {reduction_pass} (target ≥50%)") print(f"Median reduction: {summary['median_reduction']:>6.1%}") print(f"Reduction range: [{summary['min_reduction']:.1%}, {summary['max_reduction']:.1%}]") print() print(f"Mean measurements: {summary['mean_measurements']:>6.0f} ± {summary['std_measurements']:.0f}") print(f"Mean steps: {summary['mean_steps']:>6.1f}") print(f"Mean backtracks: {summary['mean_backtracks']:>6.1f}") print(f"Mean HITL triggers: {summary['mean_hitl']:>6.1f}") print(f"\n{'='*70}") def save_results(summary: Dict, results: List[Dict], out_dir: Path): """Save detailed results to disk.""" # Summary JSON with open(out_dir / "summary.json", "w") as f: json.dump(summary, f, indent=2) # Per-trial CSV import csv with open(out_dir / "trials.csv", "w", newline="") as f: writer = csv.DictWriter(f, fieldnames=[ "trial_idx", "success", "final_stage", "total_measurements", "total_steps", "measurement_reduction", "total_backtracks", "hitl_events", "duration_s", ]) writer.writeheader() for r in results: writer.writerow({ "trial_idx": r["trial_idx"], "success": r["success"], "final_stage": r["final_stage"], "total_measurements": r["total_measurements"], "total_steps": r["total_steps"], "measurement_reduction": r["measurement_reduction"], "total_backtracks": r["total_backtracks"], "hitl_events": r["hitl_events"], "duration_s": r.get("duration_s", 0.0), }) print(f"\nDetailed results saved to: {out_dir}/") print(f" - summary.json") print(f" - trials.csv") print(f" - governance/trial_XXX/*.jsonl") if __name__ == "__main__": import sys sys.exit(main())