#!/usr/bin/env python3
"""
benchmark.py
-------------
Local benchmark for cascade-mind v2.
Runs the environment through a range of seeds and produces aggregated
performance statistics without requiring an LLM. Uses a simple heuristic
agent (BFS from changed service) to establish a performance baseline.
Usage:
python benchmark.py # run 30 seeds with heuristic agent
python benchmark.py --seeds 100 # run 100 seeds
python benchmark.py --audit # also run trajectory auditor
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import time
from typing import Any, Dict, List
# Disable the LLM simulator for benchmarking. setdefault() keeps any value the
# caller already exported. This runs before the project imports below so the
# flag is already set when those modules are loaded.
os.environ.setdefault("LLM_SIMULATOR_ENABLED", "false")

# Required project imports. They resolve relative to the repository root, so
# the usual cause of failure is a wrong working directory; the underlying
# error is appended so that other causes (e.g. a missing dependency inside
# one of these modules) are not hidden behind the hint.
try:
    from server.service_impact_environment import ServiceImpactEnvironment
    from models import ServiceImpactAction
    from server.graph_builder import SERVICES
except ImportError as exc:
    sys.exit(f"Run from the cascade-mind root directory. ({exc})")

# The trajectory auditor is optional: when it cannot be imported the name is
# bound to None so callers can test for its availability.
try:
    from server.trajectory_auditor import TrajectoryAuditor
except ImportError:
    TrajectoryAuditor = None
def _mentioned_services(message: str) -> List[str]:
    """Return the known services whose names occur in *message*, in ``SERVICES`` order.

    Matching is a plain substring test.
    NOTE(review): if one service name is a substring of another, both will
    match - confirm the names in ``SERVICES`` are distinct enough.
    """
    return [svc for svc in SERVICES if svc in message]


def heuristic_agent(env: ServiceImpactEnvironment, seed: int, verbose: bool = False) -> Dict[str, Any]:
    """Simple heuristic agent: free intel → BFS → submit.

    This establishes a baseline that any LLM agent should beat.

    Args:
        env: Environment to drive; it is reset with ``seed`` before use.
        seed: Seed passed to ``env.reset``.
        verbose: When true, print a one-line summary of the episode.

    Returns:
        A dict with the keys ``seed``, ``difficulty``, ``reward`` (clamped to
        ``[0.001, 0.999]``), ``candidates`` (number of candidate services
        collected), ``steps`` (number of ``env.step`` calls) and
        ``elapsed_s`` (wall time rounded to milliseconds).
    """
    from collections import deque  # local import keeps this block self-contained

    obs = env.reset(seed=seed)
    changed = obs.changed_service
    difficulty = env._task_difficulty
    candidates = set()
    steps = 0
    # perf_counter() is monotonic, so the interval cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()

    # Phase 1: Free intel (always do all 3)
    for action_type in ("query_changelog", "query_runbook", "query_monitoring"):
        obs = env.step(ServiceImpactAction(
            action_type=action_type,
            service_name=changed,
            affected_services=[],
        ))
        steps += 1
        # Every other service named in the reply becomes a candidate.
        candidates.update(
            svc for svc in _mentioned_services(obs.message) if svc != changed
        )

    # Phase 2: BFS with query_dependents, starting from the changed service
    queried = set()
    to_query = deque([changed])
    # Stop two queries short of the maximum: leave buffer for submit.
    while to_query and env._queries_used < env._max_queries - 2:
        svc = to_query.popleft()
        if svc in queried:
            continue
        queried.add(svc)
        obs = env.step(ServiceImpactAction(
            action_type="query_dependents",
            service_name=svc,
            affected_services=[],
        ))
        steps += 1
        # Newly mentioned services are candidates and are queued for their
        # own dependents query unless already waiting in the queue.
        for s in _mentioned_services(obs.message):
            if s == changed or s in queried:
                continue
            candidates.add(s)
            if s not in to_query:
                to_query.append(s)
        if obs.done:
            break

    # Phase 3: Submit (skipped when the episode has already ended)
    if not env._episode_ended:
        obs = env.step(ServiceImpactAction(
            action_type="submit",
            affected_services=sorted(candidates),
        ))
        steps += 1

    elapsed = time.perf_counter() - start
    # A missing reward counts as 0.0; the result is clamped into [0.001, 0.999].
    reward = obs.reward if obs.reward is not None else 0.0
    reward = max(0.001, min(0.999, reward))

    if verbose:
        print(f" seed={seed:4d} difficulty={difficulty:<6s} "
              f"reward={reward:.3f} candidates={len(candidates):2d} "
              f"steps={steps:2d} time={elapsed:.2f}s")

    return {
        "seed": seed,
        "difficulty": difficulty,
        "reward": reward,
        "candidates": len(candidates),
        "steps": steps,
        "elapsed_s": round(elapsed, 3),
    }
def main() -> None:
    """Run the heuristic agent over a range of seeds and print aggregate statistics.

    Command-line options:
        --seeds N        number of consecutive seeds to run (default 30)
        --start-seed N   first seed (default 0)
        --audit          also print the trajectory auditor summary
        --verbose / -v   print one line per seed

    Prints a per-difficulty table, an overall line, optionally the audit
    summary, and finally one machine-readable ``JSON_BENCHMARK: {...}`` line.
    """
    parser = argparse.ArgumentParser(description="cascade-mind v2 benchmark")
    parser.add_argument("--seeds", type=int, default=30, help="Number of seeds to run")
    parser.add_argument("--start-seed", type=int, default=0, help="Starting seed")
    parser.add_argument("--audit", action="store_true", help="Run trajectory auditor")
    parser.add_argument("--verbose", "-v", action="store_true", help="Show per-seed results")
    args = parser.parse_args()

    heavy_rule = "=" * 65
    light_rule = "─" * 65

    print(heavy_rule)
    print(" cascade-mind v2 — Heuristic Agent Benchmark")
    print(f" Seeds: {args.start_seed} to {args.start_seed + args.seeds - 1}")
    print(heavy_rule)

    results: List[Dict[str, Any]] = []
    by_difficulty: Dict[str, List[float]] = {"easy": [], "medium": [], "hard": []}
    start_total = time.perf_counter()
    for seed in range(args.start_seed, args.start_seed + args.seeds):
        env = ServiceImpactEnvironment()  # fresh environment for every seed
        result = heuristic_agent(env, seed, verbose=args.verbose)
        results.append(result)
        # setdefault() tolerates a difficulty label outside the three
        # expected ones instead of raising KeyError mid-benchmark.
        by_difficulty.setdefault(str(result["difficulty"]), []).append(result["reward"])
    total_elapsed = time.perf_counter() - start_total

    # Summary
    print(f"\n{light_rule}")
    all_rewards = [r["reward"] for r in results]
    mean_reward = sum(all_rewards) / len(all_rewards) if all_rewards else 0.0
    # Dicts keep insertion order: easy, medium, hard, then any unexpected labels.
    for diff, scores in by_difficulty.items():
        if scores:
            avg = sum(scores) / len(scores)
            print(f" {diff:<8s} n={len(scores):3d} mean={avg:.3f} "
                  f"min={min(scores):.3f} max={max(scores):.3f}")
        else:
            print(f" {diff:<8s} n= 0")
    print(light_rule)
    print(f" OVERALL n={len(results):3d} mean={mean_reward:.3f} time={total_elapsed:.1f}s")
    print(heavy_rule)

    # Audit
    if args.audit:
        if TrajectoryAuditor is None:
            # The optional import failed at module load; say so rather than
            # silently ignoring the flag.
            print(" --audit requested but the trajectory auditor is unavailable; skipping.",
                  file=sys.stderr)
        else:
            print(f"\n{light_rule}")
            print(" Trajectory Audit")
            print(light_rule)
            auditor = TrajectoryAuditor()
            summary = auditor.summary()
            print(f" Episodes audited: {summary.get('episodes', 0)}")
            print(f" Mean reward: {summary.get('mean_reward', 0):.3f}")
            print(f" Budget util: {summary.get('mean_budget_utilization', 0):.1%}")
            print(f" Strategies: {json.dumps(summary.get('strategy_distribution', {}))}")
            print(f" Hypothesis used: {summary.get('hypothesis_usage', 0)}")

    # JSON output: one line, prefixed so it can be picked out of the log.
    payload = {
        "mean_reward": round(mean_reward, 4),
        "seeds": args.seeds,
        "by_difficulty": {
            d: round(sum(s) / len(s), 4) if s else 0
            for d, s in by_difficulty.items()
        },
    }
    print(f"\nJSON_BENCHMARK: {json.dumps(payload)}")
# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|