Spaces:

Rajkamal2819
/

cascade-mind

Sleeping

Rajkamal Rajarshi

refactor: restructure into cascade_mind package with server subpackages

17cb006 about 2 months ago

6.27 kB

	#!/usr/bin/env python3
	"""
	benchmark.py
	-------------
	Local benchmark for cascade-mind v2.

	Runs the environment through a range of seeds and produces aggregated
	performance statistics without requiring an LLM. Uses a simple heuristic
	agent (BFS from changed service) to establish a performance baseline.

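	Usage:
	python benchmark.py # run 30 seeds with heuristic agent
	python benchmark.py --seeds 100 # run 100 seeds
	python benchmark.py --start-seed 50 # start at seed 50 instead of 0
	python benchmark.py --audit # also run trajectory auditor
	python benchmark.py --verbose # also print per-seed results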
	"""
	from __future__ import annotations

	import argparse
	import json
	import os
	import sys
	import time
	from typing import Any, Dict, List

	# Disable the LLM simulator for benchmarking. ``setdefault`` leaves a value
	# that is already present in the environment untouched. It is set ahead of
	# the project imports below in case they read the variable at import time.
	os.environ.setdefault("LLM_SIMULATOR_ENABLED", "false")

	# Project imports. The exit message indicates they are expected to resolve
	# when the script is launched from the cascade-mind root directory.
	try:
	    from server.service_impact_environment import ServiceImpactEnvironment
	    from models import ServiceImpactAction
	    from server.graph_builder import SERVICES
	except ImportError as exc:
	    # Report the underlying error as well: an ImportError can also come from
	    # a missing dependency or a renamed module, in which case the hint about
	    # the working directory alone would be misleading.
	    sys.exit(f"Run from the cascade-mind root directory. (import failed: {exc})")

	# The trajectory auditor is optional; main() checks for None before using it.
	try:
	    from server.trajectory_auditor import TrajectoryAuditor
	except ImportError:
	    TrajectoryAuditor = None


	def heuristic_agent(env: ServiceImpactEnvironment, seed: int, verbose: bool = False) -> Dict[str, Any]:
	    """Run one episode with a fixed, LLM-free heuristic and report the outcome.

	    The agent establishes a baseline that any LLM agent should beat. It
	    works in three phases:

	    1. Issue the three intel queries (changelog, runbook, monitoring) against
	       the changed service and collect every name from ``SERVICES`` that
	       occurs in the returned messages.
	    2. Expand breadth-first with ``query_dependents``, starting from the
	       changed service, for as long as ``env._queries_used`` stays below
	       ``env._max_queries - 2``.
	    3. Submit the sorted candidate set, unless the episode already ended.

	    Service names are detected by plain substring search in ``obs.message``,
	    so a name that is contained in another service's name also matches.

	    Args:
	        env: Environment to drive. It is reset here with ``seed``.
	        seed: Seed forwarded to ``env.reset``.
	        verbose: When true, print a one-line summary of the episode.

	    Returns:
	        A dict with the keys ``seed``, ``difficulty``, ``reward`` (clamped to
	        the range 0.001..0.999, with a missing reward treated as 0.0),
	        ``candidates`` (number of services submitted), ``steps`` and
	        ``elapsed_s`` (seconds, rounded to three decimals).
	    """
	    # Local import so the block does not depend on a file-level import.
	    from collections import deque

	    obs = env.reset(seed=seed)
	    changed = obs.changed_service
	    difficulty = env._task_difficulty

	    candidates = set()
	    steps = 0
	    # perf_counter is the clock intended for measuring short intervals; it is
	    # unaffected by system clock adjustments, unlike time.time().
	    start = time.perf_counter()

	    # Phase 1: intel queries on the changed service (always all three).
	    for action_type in ("query_changelog", "query_runbook", "query_monitoring"):
	        obs = env.step(ServiceImpactAction(
	            action_type=action_type,
	            service_name=changed,
	            affected_services=[],
	        ))
	        steps += 1
	        # Treat a missing message as empty instead of failing on `in None`.
	        message = obs.message or ""
	        candidates.update(
	            svc for svc in SERVICES if svc != changed and svc in message
	        )

	    # Phase 2: BFS with query_dependents, starting from the changed service.
	    queried = set()
	    to_query = deque([changed])  # deque gives O(1) pops from the front
	    # The "- 2" leaves a buffer in the query budget for the final submit.
	    while to_query and env._queries_used < env._max_queries - 2:
	        svc = to_query.popleft()
	        if svc in queried:
	            continue
	        queried.add(svc)

	        obs = env.step(ServiceImpactAction(
	            action_type="query_dependents",
	            service_name=svc,
	            affected_services=[],
	        ))
	        steps += 1

	        # Every newly mentioned service becomes a candidate and is queued
	        # for expansion unless it is already pending.
	        message = obs.message or ""
	        for name in SERVICES:
	            if name in message and name != changed and name not in queried:
	                candidates.add(name)
	                if name not in to_query:
	                    to_query.append(name)

	        if obs.done:
	            break

	    # Phase 3: submit, unless the environment already closed the episode.
	    if not env._episode_ended:
	        obs = env.step(ServiceImpactAction(
	            action_type="submit",
	            affected_services=sorted(candidates),
	        ))
	        steps += 1

	    elapsed = time.perf_counter() - start
	    reward = obs.reward if obs.reward is not None else 0.0
	    reward = max(0.001, min(0.999, reward))

	    if verbose:
	        print(f" seed={seed:4d} difficulty={difficulty:<6s} "
	              f"reward={reward:.3f} candidates={len(candidates):2d} "
	              f"steps={steps:2d} time={elapsed:.2f}s")

	    return {
	        "seed": seed,
	        "difficulty": difficulty,
	        "reward": reward,
	        "candidates": len(candidates),
	        "steps": steps,
	        "elapsed_s": round(elapsed, 3),
	    }


	def main() -> None:
	    """Run the heuristic agent over a range of seeds and print a report.

	    Command-line options:
	        --seeds N        Number of consecutive seeds to run (default 30).
	        --start-seed S   First seed (default 0).
	        --audit          Also print a trajectory-audit summary.
	        --verbose / -v   Print one line per seed.

	    Output on stdout: a banner, per-difficulty statistics, the overall mean
	    reward with total run time, the optional audit section, and a final
	    ``JSON_BENCHMARK: {...}`` line carrying ``mean_reward``, ``seeds`` and
	    ``by_difficulty``. A fresh ``ServiceImpactEnvironment`` is created for
	    every seed.
	    """
	    parser = argparse.ArgumentParser(description="cascade-mind v2 benchmark")
	    parser.add_argument("--seeds", type=int, default=30, help="Number of seeds to run")
	    parser.add_argument("--start-seed", type=int, default=0, help="Starting seed")
	    parser.add_argument("--audit", action="store_true", help="Run trajectory auditor")
	    parser.add_argument("--verbose", "-v", action="store_true", help="Show per-seed results")
	    args = parser.parse_args()

	    heavy_rule = "=" * 65
	    light_rule = "─" * 65

	    print(heavy_rule)
	    print(" cascade-mind v2 — Heuristic Agent Benchmark")
	    print(f" Seeds: {args.start_seed} to {args.start_seed + args.seeds - 1}")
	    print(heavy_rule)

	    results: List[Dict[str, Any]] = []
	    # Pre-seeded so the three standard difficulties are always reported, in
	    # this order, even when no seed produced them.
	    by_difficulty: Dict[str, List[float]] = {"easy": [], "medium": [], "hard": []}

	    start_total = time.perf_counter()
	    for seed in range(args.start_seed, args.start_seed + args.seeds):
	        env = ServiceImpactEnvironment()
	        result = heuristic_agent(env, seed, verbose=args.verbose)
	        results.append(result)
	        # setdefault: a difficulty label outside the three standard ones is
	        # recorded in its own bucket instead of raising KeyError.
	        by_difficulty.setdefault(result["difficulty"], []).append(result["reward"])

	    total_elapsed = time.perf_counter() - start_total

	    # Summary
	    print(f"\n{light_rule}")
	    all_rewards = [r["reward"] for r in results]
	    mean_reward = sum(all_rewards) / len(all_rewards) if all_rewards else 0.0

	    # Dict order is insertion order: easy, medium, hard, then any extra labels.
	    for diff, scores in by_difficulty.items():
	        if scores:
	            avg = sum(scores) / len(scores)
	            print(f" {diff:<8s} n={len(scores):3d} mean={avg:.3f} "
	                  f"min={min(scores):.3f} max={max(scores):.3f}")
	        else:
	            print(f" {diff:<8s} n= 0")

	    print(light_rule)
	    print(f" OVERALL n={len(results):3d} mean={mean_reward:.3f} time={total_elapsed:.1f}s")
	    print(heavy_rule)

	    # Audit
	    if args.audit:
	        if TrajectoryAuditor is None:
	            # Previously --audit was silently ignored in this situation.
	            print(
	                "WARNING: --audit requested but the trajectory auditor "
	                "could not be imported; skipping audit.",
	                file=sys.stderr,
	            )
	        else:
	            print(f"\n{light_rule}")
	            print(" Trajectory Audit")
	            print(light_rule)
	            # NOTE(review): the auditor is created here and nothing from this
	            # run is passed to it explicitly; whether summary() reflects the
	            # episodes above depends on TrajectoryAuditor internals — confirm.
	            auditor = TrajectoryAuditor()
	            summary = auditor.summary()
	            print(f" Episodes audited: {summary.get('episodes', 0)}")
	            print(f" Mean reward: {summary.get('mean_reward', 0):.3f}")
	            print(f" Budget util: {summary.get('mean_budget_utilization', 0):.1%}")
	            print(f" Strategies: {json.dumps(summary.get('strategy_distribution', {}))}")
	            print(f" Hypothesis used: {summary.get('hypothesis_usage', 0)}")

	    # JSON output: single machine-readable line, tagged for easy grepping.
	    payload = {
	        "mean_reward": round(mean_reward, 4),
	        "seeds": args.seeds,
	        "by_difficulty": {
	            d: round(sum(s) / len(s), 4) if s else 0
	            for d, s in by_difficulty.items()
	        },
	    }
	    print(f"\nJSON_BENCHMARK: {json.dumps(payload)}")


	# Script entry point: run the benchmark only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()