"""
experiments/benchmark_phase2.py
================================
Phase 2 benchmark: 100 CIM simulation trials with full agent loop.

Validates:
    ≥50% measurement reduction vs 64×64 dense baseline (4096 points)
    ≥90% success rate (reaching target (1,1) charge state)

Usage:
    python experiments/benchmark_phase2.py --n-trials 100 --budget 2048
    python experiments/benchmark_phase2.py --fast  # 10 trials for quick testing
    python experiments/benchmark_phase2.py --skip-missing-checkpoints  # run without trained models

Outputs:
    - Summary report printed to stdout
    - Detailed per-trial logs saved to --out directory
    - CSV with metrics for each trial
"""

from __future__ import annotations

import argparse
import json
import time
from pathlib import Path
from typing import List, Dict, Any

import numpy as np

# Phase 0 types and state
from qdot.core.types import ChargeLabel, TuningStage
from qdot.core.state import ExperimentState
from qdot.core.governance import GovernanceLogger
from qdot.core.hitl import HITLManager, HITLOutcome

# Phase 0 hardware
from qdot.simulator.cim import CIMSimulatorAdapter
from qdot.hardware.safety import SafetyCritic

# Phase 1 perception
from qdot.perception.dqc import DQCGatekeeper
from qdot.perception.inspector import InspectionAgent
from qdot.perception.classifier import EnsembleCNN
from qdot.perception.ood import MahalanobisOOD

# Phase 2 agent
from qdot.agent.executive import ExecutiveAgent


def main() -> int:
    """Run the Phase 2 benchmark from the command line.

    Parses the CLI options, loads the InspectionAgent, runs the requested
    number of simulated tuning trials, aggregates and prints the results,
    and writes ``summary.json`` / ``trials.csv`` under ``--out``.

    Returns:
        Process exit code: 0 when both targets are met (success rate >= 90%
        and mean measurement reduction >= 50%), 1 otherwise. An invalid
        ``--n-trials`` value terminates through ``parser.error`` (argparse
        exits with status 2).
    """
    parser = argparse.ArgumentParser(description="Phase 2 benchmark")
    parser.add_argument("--n-trials", type=int, default=100,
                        help="Number of simulation trials (default: 100)")
    parser.add_argument("--budget", type=int, default=8192,
                        help="Measurement budget per trial safety cap (default: 8192)")
    parser.add_argument("--max-steps", type=int, default=100,
                        help="Max control steps per trial (default: 100)")
    parser.add_argument("--fast", action="store_true",
                        help="Fast mode: 10 trials, reduced budgets for CI")
    parser.add_argument("--profile", action="store_true",
                        help="Enable profiling to identify bottlenecks")
    parser.add_argument("--skip-missing-checkpoints", action="store_true",
                        help="Run without trained InspectionAgent (for CI)")
    parser.add_argument("--out", type=str, default="results/benchmark_phase2",
                        help="Output directory for detailed logs")
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    # --fast overrides any explicitly supplied trial count, budget and step cap.
    if args.fast:
        args.n_trials = 10
        args.budget = 4096   # was 512 — needs headroom for 6-stage pipeline
        args.max_steps = 50
        print("FAST MODE: 10 trials, 4096 budget, 50 max steps")

    # A benchmark with zero trials has no statistics to aggregate and no
    # thresholds to check; fail fast with a usage error before any output
    # directory is created or checkpoints are loaded.
    if args.n_trials < 1:
        parser.error(f"--n-trials must be at least 1 (got {args.n_trials})")

    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n{'='*70}")
    print("PHASE 2 BENCHMARK — Agentic Tuning on CIM Simulator")
    print(f"{'='*70}\n")
    print(f"Trials:            {args.n_trials}")
    print(f"Measurement budget: {args.budget} points")
    print(f"Max steps:         {args.max_steps}")
    print(f"Target:            (1,1) charge state")
    print(f"Dense baseline:    64×64 = 4096 points")
    print(f"Reduction target:  ≥50% (≤2048 measurements)")
    print(f"Success target:    ≥90% of trials\n")

    # Load Phase 1 components
    inspector = load_inspector(args.skip_missing_checkpoints)

    # Start profiling if requested (imported lazily so normal runs skip it)
    profiler = None
    if args.profile:
        import cProfile
        profiler = cProfile.Profile()
        profiler.enable()
        print("⚙️  Profiling enabled\n")

    # Run trials. The global NumPy RNG is seeded once, so the per-trial
    # device parameters drawn in run_trial depend on --seed and trial order.
    np.random.seed(args.seed)
    results = []
    for trial_idx in range(args.n_trials):
        # Progress prefix is flushed immediately; the outcome is appended
        # to the same line once the trial finishes.
        print(f"[{trial_idx+1}/{args.n_trials}] ", end="", flush=True)
        result = run_trial(
            trial_idx=trial_idx,
            inspector=inspector,
            measurement_budget=args.budget,
            max_steps=args.max_steps,
            out_dir=out_dir,
        )
        results.append(result)
        status = "✓" if result["success"] else "✗"
        print(f"{status} {result['final_stage']} | {result['total_measurements']} meas | {result['total_steps']} steps")

    # Aggregate metrics
    summary = compute_summary(results, args)

    # Print report
    print_report(summary)

    # Save detailed results
    save_results(summary, results, out_dir)

    # Stop profiling and print results
    if profiler is not None:
        profiler.disable()
        import pstats
        print("\n" + "="*70)
        print("PROFILING RESULTS — Top 20 Time Sinks")
        print("="*70)
        stats = pstats.Stats(profiler)
        stats.sort_stats('cumulative')
        stats.print_stats(20)

    # Exit with appropriate code: success rate is checked first, so a run
    # failing both targets reports only the success-rate failure.
    if summary["success_rate"] < 0.90:
        print(f"\n❌ BENCHMARK FAILED: success rate {summary['success_rate']:.1%} < 90%")
        return 1
    if summary["mean_reduction"] < 0.50:
        print(f"\n❌ BENCHMARK FAILED: mean reduction {summary['mean_reduction']:.1%} < 50%")
        return 1

    print("\n✅ PHASE 2 BENCHMARK PASSED")
    return 0


def load_inspector(skip_missing: bool) -> InspectionAgent:
    """Build the InspectionAgent used by every trial.

    Args:
        skip_missing: When true, checkpoints are not looked up at all and an
            agent without ensemble or OOD detector is returned.

    Returns:
        An InspectionAgent backed by the checkpoints found under
        ``experiments/checkpoints/phase1`` (path is relative to the current
        working directory), or one constructed with ``ensemble=None`` and
        ``ood_detector=None`` when checkpoints are skipped, missing, or fail
        to load.
    """
    ckpt_root = Path("experiments/checkpoints/phase1")

    # The directory is only probed when the caller did not ask to skip it.
    use_checkpoints = not skip_missing and ckpt_root.exists()
    if not use_checkpoints:
        print("⚠️  Running without trained checkpoints (InspectionAgent in stub mode)\n")
        return InspectionAgent(ensemble=None, ood_detector=None)

    # Best-effort load: any failure while reading the checkpoints or building
    # the agent falls back to the checkpoint-free agent instead of aborting.
    try:
        cnn_ensemble = EnsembleCNN.load(str(ckpt_root))
        ood_detector = MahalanobisOOD.load(str(ckpt_root / "ood_detector.pkl"))
        print(f"✓ Loaded trained InspectionAgent from {ckpt_root}\n")
        return InspectionAgent(ensemble=cnn_ensemble, ood_detector=ood_detector)
    except Exception as err:
        print(f"⚠️  Failed to load checkpoints: {err}")
        print("   Continuing with untrained InspectionAgent\n")
        return InspectionAgent(ensemble=None, ood_detector=None)


def run_trial(
    trial_idx: int,
    inspector: InspectionAgent,
    measurement_budget: int,
    max_steps: int,
    out_dir: Path,
) -> Dict[str, Any]:
    """Execute one simulated tuning trial and return the agent's summary.

    Builds a fresh ExperimentState, a CIM simulator adapter with randomly
    perturbed device parameters, an auto-approving HITL manager and a
    per-trial governance logger, then runs an ExecutiveAgent.

    Args:
        trial_idx: Trial number; used in the device id, the simulator seed
            (``trial_idx + 1000``) and the governance log folder name.
        inspector: InspectionAgent handed to the agent.
        measurement_budget: Measurement cap passed to the agent and recorded
            in ``state.config``.
        max_steps: Control-step cap passed to the agent and recorded in
            ``state.config``.
        out_dir: Root output directory; governance logs are written under
            ``out_dir/governance/trial_XXX``.

    Returns:
        Whatever ``agent.run()`` returns, extended with ``trial_idx``,
        ``duration_s`` (wall-clock seconds spent in ``agent.run()``) and
        ``device_params`` (``E_c1``, ``E_c2``, ``t_c`` read back from the
        adapter's device).
    """
    trial_tag = f"{trial_idx:03d}"
    device_id = f"cim_trial_{trial_tag}"

    # Fresh experiment state targeting the double-dot charge label.
    state = ExperimentState.new(
        device_id=device_id,
        target_label=ChargeLabel.DOUBLE_DOT,
    )
    state.config = {
        "measurement_budget": measurement_budget,
        "max_steps": max_steps,
        "trial_idx": trial_idx,
    }

    # Device parameters are drawn from NumPy's global RNG, so they depend on
    # the seed set by the caller and on the order of the two draws below.
    # NOTE(review): earlier notes for this block quote the relation
    # V_t = -E_c / lever_arm with E_c ~ 1.5, i.e. V_t ≈ -2.3 V, inside the
    # ±3 V bounds attributed to ExperimentState (state.py). The base value
    # actually used here is 2.5 (± 0.3); with lever_arm = 0.65 that relation
    # gives roughly -3.4 to -4.3 V. Confirm which value is intended before
    # relying on the ±3 V reachability claim.
    charging_energy = 2.5 + np.random.uniform(-0.3, 0.3)   # was 1.5
    tunnel_coupling = 0.3 + np.random.uniform(-0.1, 0.1)
    sim_params = {
        "E_c1": charging_energy,
        "E_c2": charging_energy + 0.2,
        "t_c": tunnel_coupling,
        "T": 0.08,
        "lever_arm": 0.65,   # was 0.55
        "noise_level": 0.015,
    }
    adapter = CIMSimulatorAdapter(
        device_id=device_id,
        params=sim_params,
        seed=trial_idx + 1000,
    )

    # Human-in-the-loop manager in test mode: every request is answered
    # with APPROVED, so the trial is not blocked waiting for an operator.
    hitl = HITLManager(enabled=True)
    hitl.set_test_mode(auto_outcome=HITLOutcome.APPROVED)

    # One governance log directory per trial.
    governance = GovernanceLogger(
        run_id=state.run_id,
        log_dir=str(out_dir / "governance" / f"trial_{trial_tag}"),
    )

    agent = ExecutiveAgent(
        state=state,
        adapter=adapter,
        inspection_agent=inspector,
        hitl_manager=hitl,
        governance_logger=governance,
        max_steps=max_steps,
        measurement_budget=measurement_budget,
    )

    # Time only the agent loop itself.
    started_at = time.time()
    trial_summary = agent.run()
    elapsed = time.time() - started_at

    # Attach trial-specific bookkeeping to the agent's summary.
    trial_summary["trial_idx"] = trial_idx
    trial_summary["duration_s"] = elapsed
    trial_summary["device_params"] = {
        name: getattr(adapter.device, name) for name in ("E_c1", "E_c2", "t_c")
    }

    return trial_summary


def compute_summary(results: List[Dict], args) -> Dict[str, Any]:
    """Aggregate per-trial results into summary statistics.

    Args:
        results: One dict per trial, each providing ``success``,
            ``total_measurements``, ``total_steps``, ``measurement_reduction``,
            ``total_backtracks`` and ``hitl_events``.
        args: Parsed CLI namespace; only ``budget`` and ``max_steps`` are read.

    Returns:
        A dict with the trial count, success rate, mean/std/median/min/max
        statistics (converted to built-in floats), the dense baseline size,
        the configured budget and step cap, and the pass/fail targets.
        When ``results`` is empty every statistic is reported as 0.0.

    Raises:
        KeyError: If a trial dict lacks one of the required keys.
    """
    n = len(results)

    def column(key: str) -> list:
        """Collect one metric across all trials."""
        return [r[key] for r in results]

    def stat(reducer, values: list) -> float:
        """Reduce ``values`` to a float, or 0.0 when there is nothing to reduce."""
        # np.min / np.max raise ValueError on empty input, and np.mean /
        # np.std / np.median return NaN with a RuntimeWarning; an empty
        # benchmark should produce a well-formed (failing) summary instead.
        return float(reducer(values)) if values else 0.0

    successes = sum(1 for r in results if r["success"])
    measurements = column("total_measurements")
    steps = column("total_steps")
    reductions = column("measurement_reduction")
    backtracks = column("total_backtracks")
    hitl_counts = column("hitl_events")

    dense_baseline = 64 * 64  # 4096 points

    return {
        "n_trials": n,
        "success_rate": successes / n if n > 0 else 0.0,
        "mean_measurements": stat(np.mean, measurements),
        "std_measurements": stat(np.std, measurements),
        "mean_steps": stat(np.mean, steps),
        "mean_reduction": stat(np.mean, reductions),
        "median_reduction": stat(np.median, reductions),
        "min_reduction": stat(np.min, reductions),
        "max_reduction": stat(np.max, reductions),
        "mean_backtracks": stat(np.mean, backtracks),
        "mean_hitl": stat(np.mean, hitl_counts),
        "dense_baseline": dense_baseline,
        "measurement_budget": args.budget,
        "max_steps": args.max_steps,
        "targets": {
            "success_rate_min": 0.90,
            "reduction_min": 0.50,
        },
    }


def print_report(summary: Dict[str, Any]):
    """Write the human-readable benchmark report to stdout.

    Args:
        summary: Aggregate metrics; reads ``success_rate``, ``mean_reduction``,
            ``median_reduction``, ``min_reduction``, ``max_reduction``,
            ``mean_measurements``, ``std_measurements``, ``mean_steps``,
            ``mean_backtracks`` and ``mean_hitl``.
    """
    rule = f"{'='*70}"

    # Pass/fail marks for the two headline targets (90% success, 50% reduction).
    if summary["success_rate"] >= 0.90:
        success_mark = "✓"
    else:
        success_mark = "✗"
    if summary["mean_reduction"] >= 0.50:
        reduction_mark = "✓"
    else:
        reduction_mark = "✗"

    # Empty strings stand for the blank separator lines of the report.
    report_lines = [
        "",
        rule,
        "BENCHMARK RESULTS",
        rule,
        "",
        f"Success rate:        {summary['success_rate']:>6.1%}  {success_mark}  (target ≥90%)",
        f"Mean reduction:      {summary['mean_reduction']:>6.1%}  {reduction_mark}  (target ≥50%)",
        f"Median reduction:    {summary['median_reduction']:>6.1%}",
        f"Reduction range:     [{summary['min_reduction']:.1%}, {summary['max_reduction']:.1%}]",
        "",
        f"Mean measurements:   {summary['mean_measurements']:>6.0f} ± {summary['std_measurements']:.0f}",
        f"Mean steps:          {summary['mean_steps']:>6.1f}",
        f"Mean backtracks:     {summary['mean_backtracks']:>6.1f}",
        f"Mean HITL triggers:  {summary['mean_hitl']:>6.1f}",
        "",
        rule,
    ]
    print("\n".join(report_lines))


def save_results(summary: Dict, results: List[Dict], out_dir: Path):
    """Persist the aggregate summary and the per-trial metrics.

    Writes ``summary.json`` (indented JSON of ``summary``) and ``trials.csv``
    (one row per trial) into ``out_dir``, then prints where the files went.

    Args:
        summary: Aggregate metrics to serialise as JSON.
        results: Per-trial dicts; every CSV column except ``duration_s`` must
            be present in each dict, ``duration_s`` defaults to 0.0. Keys not
            listed as columns are ignored.
        out_dir: Existing directory that receives both files.
    """
    import csv

    # Summary JSON
    (out_dir / "summary.json").write_text(json.dumps(summary, indent=2))

    # Per-trial CSV: required columns first, optional duration last.
    required_columns = (
        "trial_idx", "success", "final_stage", "total_measurements",
        "total_steps", "measurement_reduction", "total_backtracks",
        "hitl_events",
    )
    with (out_dir / "trials.csv").open("w", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=[*required_columns, "duration_s"])
        writer.writeheader()
        for trial in results:
            row = {column: trial[column] for column in required_columns}
            row["duration_s"] = trial.get("duration_s", 0.0)
            writer.writerow(row)

    for message in (
        f"\nDetailed results saved to: {out_dir}/",
        f"  - summary.json",
        f"  - trials.csv",
        f"  - governance/trial_XXX/*.jsonl",
    ):
        print(message)


if __name__ == "__main__":
    # main() returns the process exit status (0 = benchmark passed, 1 = failed).
    raise SystemExit(main())