Spaces:

lablab-ai-amd-developer-hackathon
/

simquantum-tuning-lab

Sleeping

App Files Files Community

simquantum-tuning-lab / experiments /benchmark_phase2.py

100enigma

SimQuantum — AMD Developer Hackathon

da98415 about 2 months ago

Raw

History Blame Contribute Delete

12.4 kB

	"""
	experiments/benchmark_phase2.py
	================================
	Phase 2 benchmark: 100 CIM simulation trials with full agent loop.

	Validates:
	≥50% measurement reduction vs 64×64 dense baseline (4096 points)
	≥90% success rate (reaching target (1,1) charge state)

	Usage:
	python experiments/benchmark_phase2.py --n-trials 100 --budget 2048
	python experiments/benchmark_phase2.py --fast # 10 trials for quick testing
	python experiments/benchmark_phase2.py --skip-missing-checkpoints # run without trained models

	Outputs:
	- Summary report printed to stdout
	- Detailed per-trial logs saved to --out directory
	- CSV with metrics for each trial
	"""

	from __future__ import annotations

	import argparse
	import json
	import time
	from pathlib import Path
	from typing import List, Dict, Any

	import numpy as np

	# Phase 0 types and state
	from qdot.core.types import ChargeLabel, TuningStage
	from qdot.core.state import ExperimentState
	from qdot.core.governance import GovernanceLogger
	from qdot.core.hitl import HITLManager, HITLOutcome

	# Phase 0 hardware
	from qdot.simulator.cim import CIMSimulatorAdapter
	from qdot.hardware.safety import SafetyCritic

	# Phase 1 perception
	from qdot.perception.dqc import DQCGatekeeper
	from qdot.perception.inspector import InspectionAgent
	from qdot.perception.classifier import EnsembleCNN
	from qdot.perception.ood import MahalanobisOOD

	# Phase 2 agent
	from qdot.agent.executive import ExecutiveAgent


	def main():
	    """Run the Phase 2 benchmark from the command line.

	    Parses CLI options, runs ``--n-trials`` simulated tuning trials via
	    ``run_trial``, aggregates them with ``compute_summary``, prints the
	    report, and writes the results under ``--out``.

	    Returns:
	        int: process exit code -- 0 when both pass criteria are met
	        (success rate >= 90% and mean measurement reduction >= 50%),
	        otherwise 1.
	    """
	    parser = argparse.ArgumentParser(description="Phase 2 benchmark")
	    parser.add_argument("--n-trials", type=int, default=100,
	                        help="Number of simulation trials (default: 100)")
	    parser.add_argument("--budget", type=int, default=8192,
	                        help="Measurement budget per trial safety cap (default: 8192)")
	    parser.add_argument("--max-steps", type=int, default=100,
	                        help="Max control steps per trial (default: 100)")
	    parser.add_argument("--fast", action="store_true",
	                        help="Fast mode: 10 trials, reduced budgets for CI")
	    parser.add_argument("--profile", action="store_true",
	                        help="Enable profiling to identify bottlenecks")
	    parser.add_argument("--skip-missing-checkpoints", action="store_true",
	                        help="Run without trained InspectionAgent (for CI)")
	    parser.add_argument("--out", type=str, default="results/benchmark_phase2",
	                        help="Output directory for detailed logs")
	    parser.add_argument("--seed", type=int, default=42)
	    args = parser.parse_args()

	    if args.fast:
	        # Fast mode overrides whatever was passed on the command line.
	        args.n_trials = 10
	        args.budget = 4096  # was 512 — needs headroom for 6-stage pipeline
	        args.max_steps = 50
	        print("FAST MODE: 10 trials, 4096 budget, 50 max steps")

	    # With zero trials there is nothing to aggregate; fail early with a
	    # clear usage error instead of crashing later in the statistics code.
	    if args.n_trials < 1:
	        parser.error("--n-trials must be at least 1")

	    out_dir = Path(args.out)
	    out_dir.mkdir(parents=True, exist_ok=True)

	    print(f"\n{'='*70}")
	    print("PHASE 2 BENCHMARK — Agentic Tuning on CIM Simulator")
	    print(f"{'='*70}\n")
	    print(f"Trials: {args.n_trials}")
	    print(f"Measurement budget: {args.budget} points")
	    print(f"Max steps: {args.max_steps}")
	    print("Target: (1,1) charge state")
	    print("Dense baseline: 64×64 = 4096 points")
	    print("Reduction target: ≥50% (≤2048 measurements)")
	    print("Success target: ≥90% of trials\n")

	    # Load Phase 1 components
	    inspector = load_inspector(args.skip_missing_checkpoints)

	    # Start profiling if requested
	    profiler = None
	    if args.profile:
	        import cProfile
	        profiler = cProfile.Profile()
	        profiler.enable()
	        print("⚙️ Profiling enabled\n")

	    # Run trials. The global NumPy RNG is seeded once so the per-trial
	    # device parameters drawn inside run_trial are reproducible.
	    np.random.seed(args.seed)
	    results = []
	    for trial_idx in range(args.n_trials):
	        print(f"[{trial_idx+1}/{args.n_trials}] ", end="", flush=True)
	        result = run_trial(
	            trial_idx=trial_idx,
	            inspector=inspector,
	            measurement_budget=args.budget,
	            max_steps=args.max_steps,
	            out_dir=out_dir,
	        )
	        results.append(result)
	        status = "✓" if result["success"] else "✗"
	        # Plain "|" separators: "\|" is not a valid Python escape sequence.
	        print(f"{status} {result['final_stage']} | {result['total_measurements']} meas | {result['total_steps']} steps")

	    # Aggregate metrics
	    summary = compute_summary(results, args)

	    # Print report
	    print_report(summary)

	    # Save detailed results
	    save_results(summary, results, out_dir)

	    # Stop profiling and print results
	    if profiler is not None:
	        profiler.disable()
	        import pstats
	        print("\n" + "="*70)
	        print("PROFILING RESULTS — Top 20 Time Sinks")
	        print("="*70)
	        stats = pstats.Stats(profiler)
	        stats.sort_stats('cumulative')
	        stats.print_stats(20)

	    # Exit with appropriate code
	    if summary["success_rate"] < 0.90:
	        print(f"\n❌ BENCHMARK FAILED: success rate {summary['success_rate']:.1%} < 90%")
	        return 1
	    if summary["mean_reduction"] < 0.50:
	        print(f"\n❌ BENCHMARK FAILED: mean reduction {summary['mean_reduction']:.1%} < 50%")
	        return 1

	    print("\n✅ PHASE 2 BENCHMARK PASSED")
	    return 0


	def load_inspector(skip_missing: bool) -> InspectionAgent:
	    """Return an InspectionAgent, trained when checkpoints can be loaded.

	    Args:
	        skip_missing: when true, checkpoints are not looked up at all and
	            an InspectionAgent with no ensemble and no OOD detector is
	            returned.

	    Returns:
	        An InspectionAgent built from the checkpoints under
	        ``experiments/checkpoints/phase1``, or one constructed with
	        ``ensemble=None, ood_detector=None`` when the directory is
	        missing, skipping was requested, or loading raised.
	    """
	    ckpt_root = Path("experiments/checkpoints/phase1")

	    # The directory is only probed when the caller did not ask to skip.
	    use_trained = (not skip_missing) and ckpt_root.exists()
	    if not use_trained:
	        print("⚠️ Running without trained checkpoints (InspectionAgent in stub mode)\n")
	        return InspectionAgent(
	            ensemble=None,  # Untrained — predictions will be random
	            ood_detector=None,  # OOD detection skipped
	        )

	    # Best-effort load: any failure falls back to the untrained agent.
	    try:
	        cnn_ensemble = EnsembleCNN.load(str(ckpt_root))
	        ood_model = MahalanobisOOD.load(str(ckpt_root / "ood_detector.pkl"))
	        print(f"✓ Loaded trained InspectionAgent from {ckpt_root}\n")
	        return InspectionAgent(ensemble=cnn_ensemble, ood_detector=ood_model)
	    except Exception as load_error:
	        print(f"⚠️ Failed to load checkpoints: {load_error}")
	        print(" Continuing with untrained InspectionAgent\n")
	        return InspectionAgent(ensemble=None, ood_detector=None)


	def run_trial(
	    trial_idx: int,
	    inspector: InspectionAgent,
	    measurement_budget: int,
	    max_steps: int,
	    out_dir: Path,
	) -> Dict[str, Any]:
	    """Run a single tuning trial and return the agent's summary.

	    Args:
	        trial_idx: zero-based trial number; used for the device id, the
	            simulator seed (``trial_idx + 1000``) and the governance log
	            directory name.
	        inspector: InspectionAgent shared across trials.
	        measurement_budget: measurement cap handed to the agent.
	        max_steps: control-step cap handed to the agent.
	        out_dir: benchmark output directory; governance logs go to
	            ``out_dir / "governance" / "trial_XXX"``.

	    Returns:
	        The dict returned by ``ExecutiveAgent.run()``, extended with
	        ``trial_idx``, ``duration_s`` and ``device_params``.
	    """
	    # Create fresh state
	    device_id = f"cim_trial_{trial_idx:03d}"
	    state = ExperimentState.new(
	        device_id=device_id,
	        target_label=ChargeLabel.DOUBLE_DOT,
	    )
	    state.config = {
	        "measurement_budget": measurement_budget,
	        "max_steps": max_steps,
	        "trial_idx": trial_idx,
	    }

	    # CIM parameters are meant to keep the charge transition within the
	    # ±3 V voltage bounds set in ExperimentState (state.py), using
	    # V_t = -E_c / lever_arm.
	    # NOTE(review): the previous comment derived V_t ≈ -2.3 V from
	    # E_c ~ 1.5 and lever_arm ~ 0.65, but the base value below is 2.5,
	    # for which the same formula gives roughly -3.8 V. Confirm which
	    # figure is intended.
	    # The two draws use the global NumPy RNG (seeded by the caller), in
	    # this order: E_c first, then t_c.
	    E_c_base = 2.5 + np.random.uniform(-0.3, 0.3)  # was 1.5, lever_arm stays 0.65
	    t_c_base = 0.3 + np.random.uniform(-0.1, 0.1)
	    adapter = CIMSimulatorAdapter(
	        device_id=device_id,
	        params={
	            "E_c1": E_c_base,
	            "E_c2": E_c_base + 0.2,
	            "t_c": t_c_base,
	            "T": 0.08,
	            "lever_arm": 0.65,  # was 0.55; higher lever → transition closer to 0V
	            "noise_level": 0.015,
	        },
	        seed=trial_idx + 1000,
	    )

	    # HITL in auto-approve test mode (no blocking)
	    hitl = HITLManager(enabled=True)
	    hitl.set_test_mode(auto_outcome=HITLOutcome.APPROVED)

	    # Governance logger
	    gov_log_dir = out_dir / "governance" / f"trial_{trial_idx:03d}"
	    governance = GovernanceLogger(run_id=state.run_id, log_dir=str(gov_log_dir))

	    # Create agent
	    agent = ExecutiveAgent(
	        state=state,
	        adapter=adapter,
	        inspection_agent=inspector,
	        hitl_manager=hitl,
	        governance_logger=governance,
	        max_steps=max_steps,
	        measurement_budget=measurement_budget,
	    )

	    # Run. perf_counter() is monotonic, so the elapsed time cannot be
	    # skewed by wall-clock adjustments during the trial.
	    t_start = time.perf_counter()
	    summary = agent.run()
	    duration = time.perf_counter() - t_start

	    # Add trial-specific info
	    summary["trial_idx"] = trial_idx
	    summary["duration_s"] = duration
	    summary["device_params"] = {
	        "E_c1": adapter.device.E_c1,
	        "E_c2": adapter.device.E_c2,
	        "t_c": adapter.device.t_c,
	    }

	    return summary


	def compute_summary(results: List[Dict], args) -> Dict[str, Any]:
	    """Aggregate trial results into summary statistics.

	    Args:
	        results: per-trial dicts; each must provide ``success``,
	            ``total_measurements``, ``total_steps``,
	            ``measurement_reduction``, ``total_backtracks`` and
	            ``hitl_events``.
	        args: object exposing ``budget`` and ``max_steps`` attributes
	            (the parsed CLI namespace).

	    Returns:
	        A JSON-serialisable dict of aggregate metrics. When ``results``
	        is empty every statistic is reported as 0.0 instead of raising
	        or producing NaN.

	    Raises:
	        KeyError: if a trial dict lacks one of the required keys.
	    """
	    n = len(results)

	    def _stat(reduce_fn, key: str) -> float:
	        # NumPy reductions on an empty sequence either raise (min/max) or
	        # return NaN with a warning (mean/std/median); report 0.0 instead.
	        if n == 0:
	            return 0.0
	        return float(reduce_fn([r[key] for r in results]))

	    successes = sum(1 for r in results if r["success"])
	    dense_baseline = 64 * 64  # 4096 points

	    return {
	        "n_trials": n,
	        "success_rate": successes / n if n > 0 else 0.0,
	        "mean_measurements": _stat(np.mean, "total_measurements"),
	        "std_measurements": _stat(np.std, "total_measurements"),
	        "mean_steps": _stat(np.mean, "total_steps"),
	        "mean_reduction": _stat(np.mean, "measurement_reduction"),
	        "median_reduction": _stat(np.median, "measurement_reduction"),
	        "min_reduction": _stat(np.min, "measurement_reduction"),
	        "max_reduction": _stat(np.max, "measurement_reduction"),
	        "mean_backtracks": _stat(np.mean, "total_backtracks"),
	        "mean_hitl": _stat(np.mean, "hitl_events"),
	        "dense_baseline": dense_baseline,
	        "measurement_budget": args.budget,
	        "max_steps": args.max_steps,
	        "targets": {
	            "success_rate_min": 0.90,
	            "reduction_min": 0.50,
	        },
	    }


	def print_report(summary: Dict[str, Any]):
	    """Write the formatted benchmark report to stdout.

	    Args:
	        summary: aggregate metrics dict; reads ``success_rate``, the
	            ``*_reduction`` fields, ``mean_measurements``,
	            ``std_measurements``, ``mean_steps``, ``mean_backtracks``
	            and ``mean_hitl``.

	    The success-rate and mean-reduction lines carry a check mark when
	    the value meets its target (>= 90% and >= 50% respectively) and a
	    cross otherwise.
	    """
	    rule = "=" * 70
	    passed, failed = "✓", "✗"

	    if summary["success_rate"] >= 0.90:
	        success_pass = passed
	    else:
	        success_pass = failed
	    if summary["mean_reduction"] >= 0.50:
	        reduction_pass = passed
	    else:
	        reduction_pass = failed

	    # Assemble the whole report, then emit it with a single print; the
	    # resulting text is the same as printing line by line.
	    report_lines = [
	        f"\n{rule}",
	        "BENCHMARK RESULTS",
	        f"{rule}\n",
	        f"Success rate: {summary['success_rate']:>6.1%} {success_pass} (target ≥90%)",
	        f"Mean reduction: {summary['mean_reduction']:>6.1%} {reduction_pass} (target ≥50%)",
	        f"Median reduction: {summary['median_reduction']:>6.1%}",
	        f"Reduction range: [{summary['min_reduction']:.1%}, {summary['max_reduction']:.1%}]",
	        "",
	        f"Mean measurements: {summary['mean_measurements']:>6.0f} ± {summary['std_measurements']:.0f}",
	        f"Mean steps: {summary['mean_steps']:>6.1f}",
	        f"Mean backtracks: {summary['mean_backtracks']:>6.1f}",
	        f"Mean HITL triggers: {summary['mean_hitl']:>6.1f}",
	        f"\n{rule}",
	    ]
	    print("\n".join(report_lines))


	def save_results(summary: Dict, results: List[Dict], out_dir: Path):
	    """Persist the benchmark summary and per-trial metrics under ``out_dir``.

	    Writes ``summary.json`` (the summary dict, indented JSON) and
	    ``trials.csv`` (one row per trial), then prints where the files
	    were saved.

	    Args:
	        summary: aggregate metrics dict; must be JSON-serialisable.
	        results: per-trial dicts. All CSV columns except ``duration_s``
	            are required keys; ``duration_s`` defaults to 0.0.
	        out_dir: existing directory to write into.
	    """
	    import csv

	    # Summary JSON
	    summary_path = out_dir / "summary.json"
	    with open(summary_path, "w") as handle:
	        json.dump(summary, handle, indent=2)

	    # Per-trial CSV: the required columns come first, then the optional one.
	    required_columns = (
	        "trial_idx", "success", "final_stage", "total_measurements",
	        "total_steps", "measurement_reduction", "total_backtracks",
	        "hitl_events",
	    )
	    all_columns = [*required_columns, "duration_s"]
	    trials_path = out_dir / "trials.csv"
	    with open(trials_path, "w", newline="") as handle:
	        table = csv.DictWriter(handle, fieldnames=all_columns)
	        table.writeheader()
	        for trial in results:
	            row = {column: trial[column] for column in required_columns}
	            row["duration_s"] = trial.get("duration_s", 0.0)
	            table.writerow(row)

	    for message in (
	        f"\nDetailed results saved to: {out_dir}/",
	        " - summary.json",
	        " - trials.csv",
	        " - governance/trial_XXX/*.jsonl",
	    ):
	        print(message)


	if __name__ == "__main__":
	    # Script entry point: propagate main()'s return value as the process
	    # exit status.
	    raise SystemExit(main())