# cascade-mind/scripts/benchmark.py
# Rajkamal Rajarshi
# refactor: restructure into cascade_mind package with server subpackages
# 17cb006
#!/usr/bin/env python3
"""
benchmark.py
-------------
Local benchmark for cascade-mind v2.
Runs the environment through a range of seeds and produces aggregated
performance statistics without requiring an LLM. Uses a simple heuristic
agent (BFS from changed service) to establish a performance baseline.

Usage:
    python benchmark.py              # run 30 seeds with heuristic agent
    python benchmark.py --seeds 100  # run 100 seeds
    python benchmark.py --audit      # also run trajectory auditor
"""
from __future__ import annotations

import argparse
import json
import os
import sys
import time
from collections import deque
from typing import Any, Dict, List
# Disable the LLM simulator for benchmarking. ``setdefault`` leaves any value
# the caller already exported untouched. This runs before the project imports
# below (presumably so they see the flag at import time — TODO confirm).
os.environ.setdefault("LLM_SIMULATOR_ENABLED", "false")

# Required project imports. They resolve only when the project root is on the
# import path, hence the hint in the exit message.
try:
    from server.service_impact_environment import ServiceImpactEnvironment
    from models import ServiceImpactAction
    from server.graph_builder import SERVICES
except ImportError as exc:
    # Append the underlying error: an ImportError can also come from a missing
    # dependency inside those modules, and hiding it makes the directory hint
    # misleading.
    sys.exit(f"Run from the cascade-mind root directory. ({exc})")

# The trajectory auditor is optional: when it cannot be imported the name is
# bound to None so callers can test for its availability.
try:
    from server.trajectory_auditor import TrajectoryAuditor
except ImportError:
    TrajectoryAuditor = None
def _mentioned_services(message: Any, exclude: str) -> List[str]:
    """Return the known services named in ``message``, in ``SERVICES`` order.

    Matching is a plain substring test against each entry of ``SERVICES``;
    ``exclude`` (the changed service itself) is never returned. A missing or
    empty message yields an empty list instead of raising.
    """
    text = message or ""
    return [svc for svc in SERVICES if svc != exclude and svc in text]


def heuristic_agent(env: ServiceImpactEnvironment, seed: int, verbose: bool = False) -> Dict[str, Any]:
    """Run one episode with a simple heuristic agent: free intel → BFS → submit.

    This establishes a baseline that any LLM agent should beat.

    Args:
        env: Environment to drive; it is reset with ``seed`` before any step.
        seed: Seed passed to ``env.reset``.
        verbose: When true, print a one-line summary of the episode.

    Returns:
        A dict with the keys ``seed``, ``difficulty``, ``reward`` (clamped to
        the range 0.001–0.999), ``candidates`` (number of candidate services
        collected), ``steps`` (environment steps taken) and ``elapsed_s``.
    """
    obs = env.reset(seed=seed)
    changed = obs.changed_service
    # Difficulty and query counters are read from the environment's private
    # attributes; the observation is not consulted for them.
    difficulty = env._task_difficulty
    candidates = set()
    steps = 0
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments during the episode.
    start = time.perf_counter()

    # Phase 1: Free intel (always do all 3)
    for action_type in ["query_changelog", "query_runbook", "query_monitoring"]:
        obs = env.step(ServiceImpactAction(
            action_type=action_type,
            service_name=changed,
            affected_services=[],
        ))
        steps += 1
        # Every other known service named in the reply becomes a candidate.
        candidates.update(_mentioned_services(obs.message, exclude=changed))

    # Phase 2: BFS with query_dependents, starting from the changed service.
    # ``seen`` holds everything that has ever been enqueued, so each service
    # is queried at most once and membership checks stay O(1).
    frontier = deque([changed])
    seen = {changed}
    while frontier and env._queries_used < env._max_queries - 2:  # leave buffer for submit
        svc = frontier.popleft()
        obs = env.step(ServiceImpactAction(
            action_type="query_dependents",
            service_name=svc,
            affected_services=[],
        ))
        steps += 1
        # Newly named services are both candidates and future BFS nodes.
        for name in _mentioned_services(obs.message, exclude=changed):
            candidates.add(name)
            if name not in seen:
                seen.add(name)
                frontier.append(name)
        if obs.done:
            break

    # Phase 3: Submit (skipped when the episode has already ended).
    if not env._episode_ended:
        obs = env.step(ServiceImpactAction(
            action_type="submit",
            affected_services=sorted(candidates),
        ))
        steps += 1

    elapsed = time.perf_counter() - start
    # A missing reward counts as 0.0 before clamping to [0.001, 0.999].
    reward = obs.reward if obs.reward is not None else 0.0
    reward = max(0.001, min(0.999, reward))

    if verbose:
        print(f" seed={seed:4d} difficulty={difficulty:<6s} "
              f"reward={reward:.3f} candidates={len(candidates):2d} "
              f"steps={steps:2d} time={elapsed:.2f}s")

    return {
        "seed": seed,
        "difficulty": difficulty,
        "reward": reward,
        "candidates": len(candidates),
        "steps": steps,
        "elapsed_s": round(elapsed, 3),
    }
def main() -> None:
    """Run the heuristic benchmark over a range of seeds and print the results.

    Parses ``--seeds``, ``--start-seed``, ``--audit`` and ``--verbose`` from
    the command line, runs ``heuristic_agent`` once per seed on a fresh
    ``ServiceImpactEnvironment``, then prints per-difficulty and overall
    reward statistics followed by one ``JSON_BENCHMARK: {...}`` line.
    """
    parser = argparse.ArgumentParser(description="cascade-mind v2 benchmark")
    parser.add_argument("--seeds", type=int, default=30, help="Number of seeds to run")
    parser.add_argument("--start-seed", type=int, default=0, help="Starting seed")
    parser.add_argument("--audit", action="store_true", help="Run trajectory auditor")
    parser.add_argument("--verbose", "-v", action="store_true", help="Show per-seed results")
    args = parser.parse_args()

    heavy_rule = "=" * 65
    light_rule = "─" * 65

    print(heavy_rule)
    print(" cascade-mind v2 — Heuristic Agent Benchmark")
    print(f" Seeds: {args.start_seed} to {args.start_seed + args.seeds - 1}")
    print(heavy_rule)

    results: List[Dict[str, Any]] = []
    known_difficulties = ["easy", "medium", "hard"]
    by_difficulty: Dict[str, List[float]] = {d: [] for d in known_difficulties}
    start_total = time.perf_counter()
    for seed in range(args.start_seed, args.start_seed + args.seeds):
        # A fresh environment per seed keeps episodes independent.
        env = ServiceImpactEnvironment()
        result = heuristic_agent(env, seed, verbose=args.verbose)
        results.append(result)
        # setdefault: an unexpected difficulty label gets its own bucket
        # instead of aborting the whole run with a KeyError.
        by_difficulty.setdefault(result["difficulty"], []).append(result["reward"])
    total_elapsed = time.perf_counter() - start_total

    # Summary
    print(f"\n{light_rule}")
    all_rewards = [r["reward"] for r in results]
    mean_reward = sum(all_rewards) / len(all_rewards) if all_rewards else 0.0
    # The three standard difficulties always print, in a fixed order; any
    # additional labels seen during the run follow them.
    extra_difficulties = [d for d in by_difficulty if d not in known_difficulties]
    for diff in known_difficulties + extra_difficulties:
        scores = by_difficulty[diff]
        if scores:
            avg = sum(scores) / len(scores)
            print(f" {diff:<8s} n={len(scores):3d} mean={avg:.3f} "
                  f"min={min(scores):.3f} max={max(scores):.3f}")
        else:
            print(f" {diff:<8s} n= 0")
    print(light_rule)
    print(f" OVERALL n={len(results):3d} mean={mean_reward:.3f} time={total_elapsed:.1f}s")
    print(heavy_rule)

    # Audit
    if args.audit:
        if TrajectoryAuditor is None:
            # Say so explicitly rather than silently ignoring the flag.
            print("warning: --audit requested but the trajectory auditor "
                  "could not be imported; skipping audit", file=sys.stderr)
        else:
            print(f"\n{light_rule}")
            print(" Trajectory Audit")
            print(light_rule)
            # NOTE(review): the auditor is created here and summarized without
            # being handed any of the episodes run above — confirm that it
            # obtains them from elsewhere.
            auditor = TrajectoryAuditor()
            summary = auditor.summary()
            print(f" Episodes audited: {summary.get('episodes', 0)}")
            print(f" Mean reward: {summary.get('mean_reward', 0):.3f}")
            print(f" Budget util: {summary.get('mean_budget_utilization', 0):.1%}")
            print(f" Strategies: {json.dumps(summary.get('strategy_distribution', {}))}")
            print(f" Hypothesis used: {summary.get('hypothesis_usage', 0)}")

    # JSON output: a single line prefixed with JSON_BENCHMARK.
    payload = {
        "mean_reward": round(mean_reward, 4),
        "seeds": args.seeds,
        "by_difficulty": {
            d: round(sum(s) / len(s), 4) if s else 0
            for d, s in by_difficulty.items()
        },
    }
    print(f"\nJSON_BENCHMARK: {json.dumps(payload)}")
# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()