# server/multi_agent.py
"""
Multi-Agent Comparison Engine.
Runs multiple agent configurations against the SAME task variant
and produces a side-by-side comparison report.
Agent configurations:
- Deterministic (rule-based, no LLM) — baseline
- Test-first (forces reading tests before anything)
- Search-first (forces search_code before reads)
- LLM-based (if HF_TOKEN provided)
This is the key feature that answers: "Which agent strategy wins?"
"""
import time
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class AgentRunResult:
    """Result of one agent configuration running one episode."""

    agent_name: str
    task: str
    variant_id: str
    final_score: float
    total_steps: int
    cumulative_reward: float
    duration_seconds: float
    action_sequence: List[str]
    files_read: List[str]
    files_written: List[str]
    strategy: str  # Detected strategy label
    strategy_score: float
    failure_type: str
    reliability_index: float
    step_timeline: List[dict]

    def to_dict(self) -> dict:
        return {
            "agent_name": self.agent_name,
            "task": self.task,
            "variant_id": self.variant_id,
            "final_score": round(self.final_score, 3),
            "total_steps": self.total_steps,
            "cumulative_reward": round(self.cumulative_reward, 3),
            "duration_seconds": round(self.duration_seconds, 2),
            "action_sequence": self.action_sequence,
            "files_read": self.files_read,
            "files_written": self.files_written,
            "strategy": self.strategy,
            "strategy_score": round(self.strategy_score, 3),
            "failure_type": self.failure_type,
            "reliability_index": round(self.reliability_index, 3),
            "step_timeline": self.step_timeline,
        }


@dataclass
class ComparisonReport:
    """Side-by-side comparison of multiple agent configurations."""

    task: str
    variant_id: str
    runs: List[AgentRunResult] = field(default_factory=list)

    def to_dict(self) -> dict:
        if not self.runs:
            return {"error": "No runs to compare"}
        # Rank by score (descending), then by step count (ascending)
        ranked = sorted(self.runs, key=lambda r: (-r.final_score, r.total_steps))
        winner = ranked[0]
        return {
            "task": self.task,
            "variant_id": self.variant_id,
            "winner": winner.agent_name,
            "winner_score": winner.final_score,
            "summary_table": [
                {
                    "rank": i + 1,
                    "agent": r.agent_name,
                    "score": round(r.final_score, 3),
                    "steps": r.total_steps,
                    "reward": round(r.cumulative_reward, 3),
                    "strategy": r.strategy,
                    "failure": r.failure_type,
                    "reliability": round(r.reliability_index, 3),
                }
                for i, r in enumerate(ranked)
            ],
            "detailed_runs": [r.to_dict() for r in self.runs],
            "insights": self._generate_insights(ranked),
        }
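
    # Illustrative report shape returned by to_dict() (all values below are
    # hypothetical, shown only to document the structure):
    #     {"task": "task1", "variant_id": "v-abc", "winner": "test-first",
    #      "winner_score": 0.82,
    #      "summary_table": [{"rank": 1, "agent": "test-first", "score": 0.82,
    #                         "steps": 6, "reward": 1.4, "strategy": "...",
    #                         "failure": "...", "reliability": 0.91}, ...],
    #      "detailed_runs": [...], "insights": [...]}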

    def _generate_insights(self, ranked: List[AgentRunResult]) -> List[str]:
        insights = []
        if len(ranked) < 2:
            return insights
        best = ranked[0]
        worst = ranked[-1]
        if best.final_score > worst.final_score + 0.2:
            insights.append(
                f"'{best.agent_name}' significantly outperformed '{worst.agent_name}' "
                f"({best.final_score:.2f} vs {worst.final_score:.2f})"
            )
        # Among successful runs (score >= 0.5), find the one with the fewest
        # steps; failed runs are pushed to the back with an infinite key.
        most_efficient = min(
            ranked,
            key=lambda r: r.total_steps if r.final_score >= 0.5 else float("inf"),
        )
        if most_efficient.final_score >= 0.5:
            insights.append(
                f"Most step-efficient successful agent: '{most_efficient.agent_name}' "
                f"({most_efficient.total_steps} steps)"
            )
        strategies = [r.strategy for r in ranked]
        if len(set(strategies)) > 1:
            insights.append(
                f"Strategy variance observed: {sorted(set(strategies))} — "
                f"'{best.agent_name}' used {best.strategy}, which proved most effective."
            )
        return insights


class MultiAgentComparison:
    """
    Runs multiple deterministic agent strategies against the same environment.

    Usage (in-process, no LLM required):

        from server.environment import CodebaseNavEnvironment
        from server.models import RepoAction

        env = CodebaseNavEnvironment()
        engine = MultiAgentComparison()
        report = engine.compare(env, task="task1")
    """

    # ── Built-in agent strategies ─────────────────────────────────────────

    @staticmethod
    def _agent_test_first(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Read tests before any source file."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))
        test_files = sorted(f for f in tree if f.startswith("tests/"))
        src_files = sorted(f for f in tree if f.startswith("src/") and f.endswith(".py"))
        # Phase 1: Tests first
        for tf in test_files:
            if tf not in files_read:
                return {"action_type": "read_file", "path": tf}
        # Phase 2: Source files
        for sf in src_files:
            if sf not in files_read:
                return {"action_type": "read_file", "path": sf}
        # Phase 3: Run tests
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_search_first(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Use search_code to locate the bug before reading."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))
        failing = obs.get("failing_tests", [])
        # Step 1: Search for the failing test function name
        if step == 1 and failing:
            fn_name = failing[0].split(".")[-1]
            context["searched"] = True
            return {"action_type": "search_code", "query": fn_name}
        # Step 2: Read files surfaced by the search (tests first, then source)
        test_files = sorted(f for f in tree if f.startswith("tests/"))
        src_files = sorted(f for f in tree if f.startswith("src/") and f.endswith(".py"))
        for tf in test_files:
            if tf not in files_read:
                return {"action_type": "read_file", "path": tf}
        for sf in src_files:
            if sf not in files_read:
                return {"action_type": "read_file", "path": sf}
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_minimal(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Minimal effort — read one file, submit immediately."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))
        src_files = [f for f in tree if f.startswith("src/") and f.endswith(".py")]
        if src_files and not files_read:
            return {"action_type": "read_file", "path": src_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_exhaustive(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Read everything, run tests twice, then submit."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))
        all_readable = [f for f in tree if f.endswith(".py") or f.endswith(".md")]
        for f in all_readable:
            if f not in files_read:
                return {"action_type": "read_file", "path": f}
        test_files = [f for f in tree if f.startswith("tests/")]
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        if test_files and context.get("tests_run2", 0) == 0:
            context["tests_run2"] = 1
            return {"action_type": "run_tests"}
        return {"action_type": "submit"}

    # `.__func__` unwraps each staticmethod so the raw function is stored;
    # compare() retrieves these from the dict and calls them directly.
    AGENT_CONFIGS = {
        "test-first": _agent_test_first.__func__,
        "search-first": _agent_search_first.__func__,
        "minimal": _agent_minimal.__func__,
        "exhaustive": _agent_exhaustive.__func__,
    }
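
    # Hypothetical extension point (not part of the shipped config set): any
    # callable with the same (obs, step, context) -> action-dict signature can
    # be registered under a new name and selected via compare(..., agents=[...]):
    #
    #     def give_up(obs, step, context):
    #         return {"action_type": "submit"}
    #
    #     MultiAgentComparison.AGENT_CONFIGS["give-up"] = give_up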

    def compare(
        self,
        env,  # CodebaseNavEnvironment instance
        task: str = "task1",
        agents: Optional[List[str]] = None,
        shared_variant: Optional[str] = None,
    ) -> ComparisonReport:
        """
        Run all (or selected) agents against the same task and compare.

        The environment is reset to the same variant for each agent.
        `shared_variant` is reserved for pinning a specific variant
        explicitly and is currently unused.
        """
        # Deferred imports: the analysis modules are only needed at run time.
        from server.models import RepoAction
        from server.strategy_detector import StrategyDetector
        from server.failure_classifier import FailureClassifier
        from server.advanced_metrics import AdvancedMetricsEngine

        agent_names = agents or list(self.AGENT_CONFIGS.keys())
        strategy_detector = StrategyDetector()
        failure_classifier = FailureClassifier()
        metrics_engine = AdvancedMetricsEngine()

        runs: List[AgentRunResult] = []
        variant_id = None
        for agent_name in agent_names:
            agent_fn = self.AGENT_CONFIGS.get(agent_name)
            if not agent_fn:
                continue
            # Reset environment
            reset_result = env.reset(task=task)
            obs = reset_result.observation
            variant_id = reset_result.info.get("variant_id", "unknown")
            context = {}
            start = time.time()
            max_steps = 15
            files_read = []
            files_written = []
            cumulative_reward = 0.0
            action_sequence = []
            step_timeline = []
            obs_dict = obs.model_dump()

            for step_num in range(1, max_steps + 1):
                if env.done:
                    break
                action_dict = agent_fn(obs_dict, step_num, context)
                action = RepoAction(
                    action_type=action_dict.get("action_type", "submit"),
                    path=action_dict.get("path"),
                    query=action_dict.get("query"),
                    content=action_dict.get("content"),
                )
                result = env.step(action)
                obs = result.observation
                obs_dict = obs.model_dump()
                cumulative_reward += result.reward
                action_sequence.append(action.action_type)
                if action.path and action.action_type == "read_file":
                    files_read.append(action.path)
                if action.path and action.action_type == "write_file":
                    files_written.append(action.path)
                step_timeline.append({
                    "step": step_num,
                    "action": action.action_type,
                    "path": action.path,
                    "reward": round(result.reward, 3),
                })
                if result.done:
                    break

            # Force submit if the episode did not terminate on its own
            if not env.done:
                result = env.step(RepoAction(action_type="submit"))
                cumulative_reward += result.reward
                action_sequence.append("submit")

            duration = time.time() - start
            final_score = env.final_score

            # Get trajectory for analysis
            trajectory = env.get_trajectory()
            traj_steps = trajectory.get("steps", []) if trajectory else []
            variant_meta = {}
            if env.variant:
                variant_meta = env.variant.meta

            # Detect strategy
            strategy_report = strategy_detector.detect(
                traj_steps, task, variant_meta, files_read, final_score
            )
            # Classify failure
            failure_report = failure_classifier.classify(
                episode_id=trajectory.get("episode_id", "") if trajectory else "",
                task=task,
                trajectory_steps=traj_steps,
                variant_meta=variant_meta,
                files_read=files_read,
                files_written=files_written,
                final_score=final_score,
            )
            # Advanced metrics
            adv_metrics = metrics_engine.compute(
                traj_steps, variant_meta, final_score, files_read, files_written
            )

            runs.append(AgentRunResult(
                agent_name=agent_name,
                task=task,
                variant_id=variant_id or "unknown",
                final_score=final_score,
                total_steps=len(action_sequence),
                cumulative_reward=cumulative_reward,
                duration_seconds=duration,
                action_sequence=action_sequence,
                files_read=files_read,
                files_written=files_written,
                strategy=strategy_report.strategy,
                strategy_score=strategy_report.score,
                failure_type=failure_report.primary_failure,
                reliability_index=adv_metrics.reliability_index,
                step_timeline=step_timeline,
            ))

        return ComparisonReport(
            task=task,
            variant_id=variant_id or "unknown",
            runs=runs,
        )
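

# Minimal smoke-test sketch, assuming the environment API used above
# (reset / step / done / final_score / get_trajectory) and the import path
# from the class docstring. Run from the repo root:
#     python -m server.multi_agent
if __name__ == "__main__":
    import json

    from server.environment import CodebaseNavEnvironment

    env = CodebaseNavEnvironment()
    engine = MultiAgentComparison()
    # Compare two of the built-in strategies on the same task.
    report = engine.compare(env, task="task1", agents=["test-first", "minimal"])
    print(json.dumps(report.to_dict(), indent=2))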