Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| benchmark.py | |
| ------------- | |
| Local benchmark for cascade-mind v2. | |
| Runs the environment through a range of seeds and produces aggregated | |
| performance statistics without requiring an LLM. Uses a simple heuristic | |
| agent (BFS from changed service) to establish a performance baseline. | |
| Usage: | |
| python benchmark.py # run 30 seeds with heuristic agent | |
| python benchmark.py --seeds 100 # run 100 seeds | |
| python benchmark.py --audit # also run trajectory auditor | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import os | |
| import sys | |
| import time | |
| from typing import Any, Dict, List | |
| # Disable LLM simulator for benchmarking | |
| os.environ.setdefault("LLM_SIMULATOR_ENABLED", "false") | |
| # Imports | |
| try: | |
| from server.service_impact_environment import ServiceImpactEnvironment | |
| from models import ServiceImpactAction | |
| from server.graph_builder import SERVICES | |
| except ImportError: | |
| sys.exit("Run from the cascade-mind root directory.") | |
| try: | |
| from server.trajectory_auditor import TrajectoryAuditor | |
| except ImportError: | |
| TrajectoryAuditor = None | |
def _mentioned_services(message: Any, exclude: str) -> List[str]:
    """Return the known service names occurring in *message*, in SERVICES order.

    Matching is a plain substring test against each entry of ``SERVICES``;
    *exclude* (the changed service itself) is never returned. A missing
    message (``None``) is treated as empty text.
    """
    text = message or ""
    return [name for name in SERVICES if name != exclude and name in text]


def heuristic_agent(env: ServiceImpactEnvironment, seed: int, verbose: bool = False) -> Dict[str, Any]:
    """Run one episode with a fixed, LLM-free strategy and report the outcome.

    The strategy has three phases:

    1. Issue ``query_changelog``, ``query_runbook`` and ``query_monitoring``
       against the changed service and collect every known service name that
       appears in the returned messages.
    2. Breadth-first ``query_dependents`` starting at the changed service,
       enqueueing each newly mentioned service, until the frontier is empty,
       the environment reports ``done``, or the query counter comes within
       two of the environment's maximum.
    3. Submit the sorted candidate set, unless the episode has already ended.

    This establishes a baseline that any LLM agent should beat.

    Args:
        env: Environment instance; it is reset with *seed* before use.
        seed: Seed forwarded to ``env.reset``.
        verbose: When true, print a one-line summary of the episode.

    Returns:
        A dict with keys ``seed``, ``difficulty``, ``reward`` (clamped to
        ``[0.001, 0.999]``; a ``None`` reward counts as ``0.0``),
        ``candidates`` (number of services submitted), ``steps`` and
        ``elapsed_s`` (rounded to milliseconds).
    """
    # Imported here so the function does not depend on an edit to the
    # file-level import block.
    from collections import deque

    obs = env.reset(seed=seed)
    changed = obs.changed_service
    difficulty = env._task_difficulty
    candidates = set()
    steps = 0
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()

    # Phase 1: intel queries (always do all 3).
    for action_type in ("query_changelog", "query_runbook", "query_monitoring"):
        obs = env.step(ServiceImpactAction(
            action_type=action_type,
            service_name=changed,
            affected_services=[],
        ))
        steps += 1
        candidates.update(_mentioned_services(obs.message, changed))

    # Phase 2: BFS with query_dependents, starting from the changed service.
    # `enqueued` holds every service that has ever been placed on the queue,
    # so each service is queried at most once.
    queue = deque([changed])
    enqueued = {changed}
    # Stop two queries short of the limit to leave a buffer for the submit.
    while queue and env._queries_used < env._max_queries - 2:
        svc = queue.popleft()
        obs = env.step(ServiceImpactAction(
            action_type="query_dependents",
            service_name=svc,
            affected_services=[],
        ))
        steps += 1
        for name in _mentioned_services(obs.message, changed):
            if name not in enqueued:
                enqueued.add(name)
                candidates.add(name)
                queue.append(name)
        if obs.done:
            break

    # Phase 3: submit, unless the environment already closed the episode.
    if not env._episode_ended:
        obs = env.step(ServiceImpactAction(
            action_type="submit",
            affected_services=sorted(candidates),
        ))
        steps += 1

    elapsed = time.perf_counter() - start
    reward = obs.reward if obs.reward is not None else 0.0
    # Keep the reported reward strictly inside (0, 1).
    reward = max(0.001, min(0.999, reward))

    if verbose:
        print(f" seed={seed:4d} difficulty={difficulty:<6s} "
              f"reward={reward:.3f} candidates={len(candidates):2d} "
              f"steps={steps:2d} time={elapsed:.2f}s")

    return {
        "seed": seed,
        "difficulty": difficulty,
        "reward": reward,
        "candidates": len(candidates),
        "steps": steps,
        "elapsed_s": round(elapsed, 3),
    }
def main() -> None:
    """Run the heuristic agent over a range of seeds and print a report.

    Command-line options:
        --seeds N        number of consecutive seeds to run (default 30)
        --start-seed S   first seed (default 0)
        --audit          also print the trajectory auditor summary
        --verbose / -v   print one line per seed

    Output is a per-difficulty table, an overall line, the optional audit
    section, and a final ``JSON_BENCHMARK: {...}`` line carrying the mean
    reward, the seed count and the mean reward per difficulty.
    """
    parser = argparse.ArgumentParser(description="cascade-mind v2 benchmark")
    parser.add_argument("--seeds", type=int, default=30, help="Number of seeds to run")
    parser.add_argument("--start-seed", type=int, default=0, help="Starting seed")
    parser.add_argument("--audit", action="store_true", help="Run trajectory auditor")
    parser.add_argument("--verbose", "-v", action="store_true", help="Show per-seed results")
    args = parser.parse_args()

    heavy_rule = "=" * 65
    light_rule = "─" * 65

    print(heavy_rule)
    print(" cascade-mind v2 — Heuristic Agent Benchmark")
    print(f" Seeds: {args.start_seed} to {args.start_seed + args.seeds - 1}")
    print(heavy_rule)

    results: List[Dict[str, Any]] = []
    by_difficulty: Dict[str, List[float]] = {"easy": [], "medium": [], "hard": []}

    # perf_counter is monotonic, so the total is unaffected by clock changes.
    start_total = time.perf_counter()
    for seed in range(args.start_seed, args.start_seed + args.seeds):
        # A fresh environment per seed keeps episodes independent.
        env = ServiceImpactEnvironment()
        result = heuristic_agent(env, seed, verbose=args.verbose)
        results.append(result)
        # setdefault: a difficulty label outside easy/medium/hard gets its
        # own bucket instead of raising KeyError.
        by_difficulty.setdefault(result["difficulty"], []).append(result["reward"])
    total_elapsed = time.perf_counter() - start_total

    # Summary
    print(f"\n{light_rule}")
    all_rewards = [r["reward"] for r in results]
    mean_reward = sum(all_rewards) / len(all_rewards) if all_rewards else 0.0
    # Dict order keeps easy/medium/hard first, followed by any extra labels.
    for diff, scores in by_difficulty.items():
        if scores:
            avg = sum(scores) / len(scores)
            print(f" {diff:<8s} n={len(scores):3d} mean={avg:.3f} "
                  f"min={min(scores):.3f} max={max(scores):.3f}")
        else:
            print(f" {diff:<8s} n= 0")
    print(light_rule)
    print(f" OVERALL n={len(results):3d} mean={mean_reward:.3f} time={total_elapsed:.1f}s")
    print(heavy_rule)

    # Audit
    if args.audit:
        if TrajectoryAuditor is None:
            # The optional import at the top of the file failed; say so
            # rather than silently ignoring the flag.
            print("--audit requested but server.trajectory_auditor could not be "
                  "imported; skipping audit.", file=sys.stderr)
        else:
            print(f"\n{light_rule}")
            print(" Trajectory Audit")
            print(light_rule)
            # NOTE(review): a freshly constructed auditor is summarised here;
            # presumably it loads the recorded trajectories itself — confirm.
            auditor = TrajectoryAuditor()
            summary = auditor.summary()
            print(f" Episodes audited: {summary.get('episodes', 0)}")
            print(f" Mean reward: {summary.get('mean_reward', 0):.3f}")
            print(f" Budget util: {summary.get('mean_budget_utilization', 0):.1%}")
            print(f" Strategies: {json.dumps(summary.get('strategy_distribution', {}))}")
            print(f" Hypothesis used: {summary.get('hypothesis_usage', 0)}")

    # JSON output: a single line carrying the headline numbers.
    payload = {
        "mean_reward": round(mean_reward, 4),
        "seeds": args.seeds,
        "by_difficulty": {
            name: round(sum(scores) / len(scores), 4) if scores else 0
            for name, scores in by_difficulty.items()
        },
    }
    print(f"\nJSON_BENCHMARK: {json.dumps(payload)}")


if __name__ == "__main__":
    main()