Spaces:

Rajkamal2819
/

cascade-mind

Sleeping

File size: 6,271 Bytes

17cb006

#!/usr/bin/env python3
"""
benchmark.py
-------------
Local benchmark for cascade-mind v2.

Runs the environment through a range of seeds and produces aggregated
performance statistics without requiring an LLM. Uses a simple heuristic
agent (BFS from changed service) to establish a performance baseline.

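Usage:
  python benchmark.py                   # run 30 seeds with heuristic agent
  python benchmark.py --seeds 100       # run 100 seeds
  python benchmark.py --start-seed 50   # begin at seed 50 instead of 0
  python benchmark.py --audit           # also run trajectory auditor
  python benchmark.py --verbose         # print one result line per seed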
"""
from __future__ import annotations

import argparse
import json
import os
import sys
import time
from collections import deque
from typing import Any, Dict, List

# Disable the LLM simulator for benchmarking. setdefault() keeps any value the
# caller already exported, so the flag can still be overridden from the shell.
# NOTE(review): this is set before the server imports below, presumably because
# those modules read the flag at import time — confirm.
os.environ.setdefault("LLM_SIMULATOR_ENABLED", "false")

# Required project imports. The usual cause of failure is running the script
# from outside the repository root, but an ImportError can also come from a
# missing dependency inside these modules, so the underlying error is reported
# alongside the hint instead of being discarded.
try:
    from server.service_impact_environment import ServiceImpactEnvironment
    from models import ServiceImpactAction
    from server.graph_builder import SERVICES
except ImportError as exc:
    sys.exit(f"Run from the cascade-mind root directory. (import failed: {exc})")

# Optional import: the trajectory auditor is only needed for --audit, so its
# absence is recorded as None rather than treated as fatal.
try:
    from server.trajectory_auditor import TrajectoryAuditor
except ImportError:
    TrajectoryAuditor = None


def heuristic_agent(env: ServiceImpactEnvironment, seed: int, verbose: bool = False) -> Dict[str, Any]:
    """Run one episode with a fixed heuristic: free intel -> BFS -> submit.

    This establishes a baseline that any LLM agent should beat.

    The agent works in three phases:

    1. Issue the three intel queries (changelog, runbook, monitoring) against
       the changed service and collect every known service name that appears
       in the returned messages.
    2. Breadth-first search with ``query_dependents``, starting at the changed
       service, while at least two queries remain unused under
       ``env._max_queries``.
    3. Submit the sorted candidate set, unless the episode has already ended.

    Note that this function reads private attributes of *env*
    (``_task_difficulty``, ``_queries_used``, ``_max_queries``,
    ``_episode_ended``).

    Args:
        env: Environment instance to drive; it is reset with *seed* here.
        seed: Seed passed to ``env.reset``.
        verbose: When true, print a one-line summary of the episode.

    Returns:
        A dict with the keys ``seed``, ``difficulty``, ``reward`` (clamped to
        the range 0.001..0.999, with a missing reward treated as 0.0),
        ``candidates`` (number of submitted services), ``steps`` (number of
        ``env.step`` calls) and ``elapsed_s`` (seconds, rounded to 3 places).
    """
    obs = env.reset(seed=seed)
    changed = obs.changed_service
    difficulty = env._task_difficulty

    candidates = set()
    steps = 0
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    start = time.perf_counter()

    # Phase 1: Free intel (always do all 3)
    for action_type in ("query_changelog", "query_runbook", "query_monitoring"):
        obs = env.step(ServiceImpactAction(
            action_type=action_type,
            service_name=changed,
            affected_services=[],
        ))
        steps += 1
        candidates.update(
            svc for svc in _services_in_message(obs.message) if svc != changed
        )

    # Phase 2: BFS with query_dependents
    # `seen` holds every service that has been queued or queried, so each
    # service is queried at most once and membership checks stay O(1).
    seen = {changed}
    frontier = deque([changed])  # start from changed service
    while frontier and env._queries_used < env._max_queries - 2:  # leave buffer for submit
        svc = frontier.popleft()

        obs = env.step(ServiceImpactAction(
            action_type="query_dependents",
            service_name=svc,
            affected_services=[],
        ))
        steps += 1

        # Every mentioned service (other than the changed one) is a candidate;
        # only the ones not seen before extend the search frontier.
        for name in _services_in_message(obs.message):
            if name == changed:
                continue
            candidates.add(name)
            if name not in seen:
                seen.add(name)
                frontier.append(name)

        if obs.done:
            break

    # Phase 3: Submit
    if not env._episode_ended:
        obs = env.step(ServiceImpactAction(
            action_type="submit",
            affected_services=sorted(candidates),
        ))
        steps += 1

    elapsed = time.perf_counter() - start
    reward = obs.reward if obs.reward is not None else 0.0
    reward = max(0.001, min(0.999, reward))

    if verbose:
        print(f"  seed={seed:4d}  difficulty={difficulty:<6s}  "
              f"reward={reward:.3f}  candidates={len(candidates):2d}  "
              f"steps={steps:2d}  time={elapsed:.2f}s")

    return {
        "seed": seed,
        "difficulty": difficulty,
        "reward": reward,
        "candidates": len(candidates),
        "steps": steps,
        "elapsed_s": round(elapsed, 3),
    }


def _services_in_message(message: Any) -> List[str]:
    """Return the names from ``SERVICES`` that occur in *message*, in ``SERVICES`` order.

    An empty or ``None`` message yields an empty list.

    NOTE(review): this is a plain substring test, so a service whose name is
    contained in another service's name would also match — confirm that the
    names in ``SERVICES`` cannot collide this way.
    """
    if not message:
        return []
    return [svc for svc in SERVICES if svc in message]


def main() -> None:
    """Run the heuristic agent over a range of seeds and print a summary.

    Command-line options:
        --seeds N        number of consecutive seeds to run (default 30)
        --start-seed S   first seed (default 0)
        --audit          also print the trajectory auditor summary
        --verbose / -v   print one result line per seed

    A fresh ``ServiceImpactEnvironment`` is created for every seed. Rewards
    are grouped by the difficulty label each episode reports; labels other
    than easy/medium/hard are collected as well instead of raising
    ``KeyError``. The final stdout line starts with ``JSON_BENCHMARK:`` and
    carries the overall mean reward, the seed count and the per-difficulty
    means as JSON.
    """
    parser = argparse.ArgumentParser(description="cascade-mind v2 benchmark")
    parser.add_argument("--seeds", type=int, default=30, help="Number of seeds to run")
    parser.add_argument("--start-seed", type=int, default=0, help="Starting seed")
    parser.add_argument("--audit", action="store_true", help="Run trajectory auditor")
    parser.add_argument("--verbose", "-v", action="store_true", help="Show per-seed results")
    args = parser.parse_args()

    banner = "=" * 65
    rule = "─" * 65

    print(banner)
    print("  cascade-mind v2 — Heuristic Agent Benchmark")
    print(f"  Seeds: {args.start_seed} to {args.start_seed + args.seeds - 1}")
    print(banner)

    results: List[Dict[str, Any]] = []
    # Pre-seeded so the three standard difficulties are always reported, in
    # this order, even when no episode of that difficulty was run.
    by_difficulty: Dict[Any, List[float]] = {"easy": [], "medium": [], "hard": []}

    # perf_counter() is monotonic, unlike the wall clock behind time.time().
    start_total = time.perf_counter()
    for seed in range(args.start_seed, args.start_seed + args.seeds):
        env = ServiceImpactEnvironment()
        result = heuristic_agent(env, seed, verbose=args.verbose)
        results.append(result)
        # setdefault: an unexpected difficulty label gets its own bucket
        # rather than aborting the whole benchmark with a KeyError.
        by_difficulty.setdefault(result["difficulty"], []).append(result["reward"])

    total_elapsed = time.perf_counter() - start_total

    # Summary
    print(f"\n{rule}")
    all_rewards = [r["reward"] for r in results]
    mean_reward = sum(all_rewards) / len(all_rewards) if all_rewards else 0.0

    # Dicts keep insertion order: easy, medium, hard, then any extra labels.
    for diff, scores in by_difficulty.items():
        label = str(diff)
        if scores:
            avg = sum(scores) / len(scores)
            print(f"  {label:<8s}  n={len(scores):3d}  mean={avg:.3f}  "
                  f"min={min(scores):.3f}  max={max(scores):.3f}")
        else:
            print(f"  {label:<8s}  n=  0")

    print(rule)
    print(f"  OVERALL   n={len(results):3d}  mean={mean_reward:.3f}  time={total_elapsed:.1f}s")
    print(banner)

    # Audit
    if args.audit and TrajectoryAuditor is None:
        # Written to stderr so the JSON_BENCHMARK line on stdout stays parseable.
        print("  --audit requested but server.trajectory_auditor could not be "
              "imported; skipping audit.", file=sys.stderr)
    elif args.audit:
        print(f"\n{rule}")
        print("  Trajectory Audit")
        print(rule)
        # NOTE(review): nothing from this run is handed to the auditor; it
        # presumably loads recorded trajectories on its own — confirm.
        auditor = TrajectoryAuditor()
        summary = auditor.summary()
        print(f"  Episodes audited: {summary.get('episodes', 0)}")
        print(f"  Mean reward:      {summary.get('mean_reward', 0):.3f}")
        print(f"  Budget util:      {summary.get('mean_budget_utilization', 0):.1%}")
        print(f"  Strategies:       {json.dumps(summary.get('strategy_distribution', {}))}")
        print(f"  Hypothesis used:  {summary.get('hypothesis_usage', 0)}")

    # JSON output
    payload = {
        "mean_reward": round(mean_reward, 4),
        "seeds": args.seeds,
        "by_difficulty": {
            str(diff): round(sum(scores) / len(scores), 4) if scores else 0
            for diff, scores in by_difficulty.items()
        },
    }
    print(f"\nJSON_BENCHMARK: {json.dumps(payload)}")


# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()