"""Simple evaluation harness for Red + Blue agents. Run N episodes, aggregate metrics, and return a summary dict. """ from __future__ import annotations import logging from typing import TYPE_CHECKING from open_range.agents.episode import run_episode from open_range.agents.protocol import EpisodeResult if TYPE_CHECKING: from open_range.agents.protocol import RangeAgent logger = logging.getLogger(__name__) def _mean(values: list[float]) -> float: """Compute mean, returning 0.0 for empty lists.""" return sum(values) / len(values) if values else 0.0 def evaluate( env: object, red: RangeAgent, blue: RangeAgent, n_episodes: int = 50, max_steps: int = 100, red_model: str = "", blue_model: str = "", ) -> dict: """Run *n_episodes* and compute aggregate metrics. Args: env: A ``RangeEnvironment`` instance (or compatible object). red: Red team agent. blue: Blue team agent. n_episodes: Number of episodes to run. max_steps: Maximum steps per episode. red_model: Model identifier for logging. blue_model: Model identifier for logging. Returns: Dict with aggregate metrics:: { "n_episodes": int, "red_solve_rate": float, "blue_detect_rate": float, "avg_steps": float, "avg_stealth": float, "avg_availability": float, "false_positive_rate": float, "avg_flag_capture_rate": float, "outcomes": {"red_win": int, "blue_win": int, "timeout": int}, "results": [EpisodeResult, ...], } """ results: list[EpisodeResult] = [] for i in range(n_episodes): logger.info("Running episode %d/%d", i + 1, n_episodes) result = run_episode( env=env, red=red, blue=blue, max_steps=max_steps, red_model=red_model, blue_model=blue_model, ) results.append(result) # Aggregate outcomes = {"red_win": 0, "blue_win": 0, "timeout": 0} for r in results: if r.outcome in outcomes: outcomes[r.outcome] += 1 return { "n_episodes": n_episodes, "red_solve_rate": _mean([1.0 if r.outcome == "red_win" else 0.0 for r in results]), "blue_detect_rate": _mean([r.metrics.detection_tp for r in results]), "avg_steps": _mean([float(r.steps) for r in results]), "avg_stealth": _mean([r.metrics.stealth for r in results]), "avg_availability": _mean([r.metrics.availability for r in results]), "false_positive_rate": _mean([r.metrics.false_positives for r in results]), "avg_flag_capture_rate": _mean([r.metrics.flag_capture_rate for r in results]), "outcomes": outcomes, "results": results, }