Spaces:

Chirag0123
/

codebase-nav-env

Sleeping

File size: 13,935 Bytes

dfbd16e

# server/multi_agent.py
"""
Multi-Agent Comparison Engine.

Runs multiple agent configurations against the SAME task variant
and produces a side-by-side comparison report.

Agent configurations:
  - Deterministic (rule-based, no LLM) — baseline
  - Test-first (forces reading tests before anything)
  - Search-first (forces search_code before reads)
  - LLM-based (if HF_TOKEN provided)

This is the key feature that answers: "Which agent strategy wins?"
"""
import time
import copy
from typing import List, Dict, Any, Optional, Callable
from dataclasses import dataclass, field


@dataclass
class AgentRunResult:
    """Result of one agent configuration running one episode."""
    agent_name: str
    task: str
    variant_id: str
    final_score: float
    total_steps: int
    cumulative_reward: float
    duration_seconds: float
    action_sequence: List[str]
    files_read: List[str]
    files_written: List[str]
    strategy: str              # Detected strategy label
    strategy_score: float
    failure_type: str
    reliability_index: float
    step_timeline: List[dict]

    def to_dict(self) -> dict:
        return {
            "agent_name": self.agent_name,
            "task": self.task,
            "variant_id": self.variant_id,
            "final_score": round(self.final_score, 3),
            "total_steps": self.total_steps,
            "cumulative_reward": round(self.cumulative_reward, 3),
            "duration_seconds": round(self.duration_seconds, 2),
            "action_sequence": self.action_sequence,
            "files_read": self.files_read,
            "files_written": self.files_written,
            "strategy": self.strategy,
            "strategy_score": round(self.strategy_score, 3),
            "failure_type": self.failure_type,
            "reliability_index": round(self.reliability_index, 3),
            "step_timeline": self.step_timeline,
        }


@dataclass
class ComparisonReport:
    """Side-by-side comparison of multiple agent configurations."""
    task: str
    variant_id: str
    runs: List[AgentRunResult] = field(default_factory=list)

    def to_dict(self) -> dict:
        if not self.runs:
            return {"error": "No runs to compare"}

        # Rank by score then steps
        ranked = sorted(self.runs, key=lambda r: (-r.final_score, r.total_steps))
        winner = ranked[0]

        return {
            "task": self.task,
            "variant_id": self.variant_id,
            "winner": winner.agent_name,
            "winner_score": winner.final_score,
            "summary_table": [
                {
                    "rank": i + 1,
                    "agent": r.agent_name,
                    "score": round(r.final_score, 3),
                    "steps": r.total_steps,
                    "reward": round(r.cumulative_reward, 3),
                    "strategy": r.strategy,
                    "failure": r.failure_type,
                    "reliability": round(r.reliability_index, 3),
                }
                for i, r in enumerate(ranked)
            ],
            "detailed_runs": [r.to_dict() for r in self.runs],
            "insights": self._generate_insights(ranked),
        }

    def _generate_insights(self, ranked: List[AgentRunResult]) -> List[str]:
        insights = []
        if len(ranked) < 2:
            return insights

        best = ranked[0]
        worst = ranked[-1]

        if best.final_score > worst.final_score + 0.2:
            insights.append(
                f"'{best.agent_name}' significantly outperformed '{worst.agent_name}' "
                f"({best.final_score:.2f} vs {worst.final_score:.2f})"
            )

        step_diffs = [(r.agent_name, r.total_steps) for r in ranked]
        most_efficient = min(ranked, key=lambda r: r.total_steps if r.final_score >= 0.5 else float('inf'))
        if most_efficient.final_score >= 0.5:
            insights.append(
                f"Most step-efficient successful agent: '{most_efficient.agent_name}' "
                f"({most_efficient.total_steps} steps)"
            )

        strategies = [r.strategy for r in ranked]
        if len(set(strategies)) > 1:
            insights.append(
                f"Strategy variance observed: {set(strategies)} — "
                f"'{best.agent_name}' used {best.strategy} which proved most effective."
            )

        return insights


class MultiAgentComparison:
    """
    Runs multiple deterministic agent strategies against the same environment.

    Usage (in-process, no LLM required):
        from server.environment import CodebaseNavEnvironment
        from server.models import RepoAction

        env = CodebaseNavEnvironment()
        engine = MultiAgentComparison()
        report = engine.compare(env, task="task1")
    """

    # ── Built-in agent strategies ─────────────────────────────────────────────

    @staticmethod
    def _agent_test_first(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Read tests before any source file."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))

        test_files = sorted([f for f in tree if f.startswith("tests/")])
        src_files = sorted([f for f in tree if f.startswith("src/") and f.endswith(".py")])
        spec_files = sorted([f for f in tree if f.endswith(".md")])

        # Phase 1: Tests first
        for tf in test_files:
            if tf not in files_read:
                return {"action_type": "read_file", "path": tf}
        # Phase 2: Source files
        for sf in src_files:
            if sf not in files_read:
                return {"action_type": "read_file", "path": sf}
        # Phase 3: Run tests
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_search_first(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Use search_code to locate the bug before reading."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))
        failing = obs.get("failing_tests", [])

        # Step 1: search for the failing test function name
        if step == 1 and failing:
            fn_name = failing[0].split(".")[-1] if failing else "bug"
            context["searched"] = True
            return {"action_type": "search_code", "query": fn_name}

        # Step 2: Read files based on search
        test_files = sorted([f for f in tree if f.startswith("tests/")])
        src_files = sorted([f for f in tree if f.startswith("src/") and f.endswith(".py")])

        for tf in test_files:
            if tf not in files_read:
                return {"action_type": "read_file", "path": tf}
        for sf in src_files:
            if sf not in files_read:
                return {"action_type": "read_file", "path": sf}
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_minimal(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Minimal effort — read one file, submit immediately."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))
        src_files = [f for f in tree if f.startswith("src/") and f.endswith(".py")]
        if src_files and not files_read:
            return {"action_type": "read_file", "path": src_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_exhaustive(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Read everything, run tests, then submit."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))

        all_readable = [f for f in tree if f.endswith(".py") or f.endswith(".md")]
        for f in all_readable:
            if f not in files_read:
                return {"action_type": "read_file", "path": f}

        test_files = [f for f in tree if f.startswith("tests/")]
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        if test_files and context.get("tests_run2", 0) == 0:
            context["tests_run2"] = 1
            return {"action_type": "run_tests"}
        return {"action_type": "submit"}

    AGENT_CONFIGS = {
        "test-first": _agent_test_first.__func__,
        "search-first": _agent_search_first.__func__,
        "minimal": _agent_minimal.__func__,
        "exhaustive": _agent_exhaustive.__func__,
    }

    def compare(
        self,
        env,  # CodebaseNavEnvironment instance
        task: str = "task1",
        agents: Optional[List[str]] = None,
        shared_variant: Optional[str] = None,
    ) -> ComparisonReport:
        """
        Run all (or selected) agents against the same task and compare.
        The environment is reset to the same variant for each agent.
        """
        from server.models import RepoAction
        from server.strategy_detector import StrategyDetector
        from server.failure_classifier import FailureClassifier
        from server.advanced_metrics import AdvancedMetricsEngine

        agent_names = agents or list(self.AGENT_CONFIGS.keys())
        strategy_detector = StrategyDetector()
        failure_classifier = FailureClassifier()
        metrics_engine = AdvancedMetricsEngine()

        runs: List[AgentRunResult] = []
        variant_id = None

        for agent_name in agent_names:
            agent_fn = self.AGENT_CONFIGS.get(agent_name)
            if not agent_fn:
                continue

            # Reset environment
            reset_result = env.reset(task=task)
            obs = reset_result.observation
            variant_id = reset_result.info.get("variant_id", "unknown")

            context = {}
            start = time.time()
            max_steps = 15
            files_read = []
            files_written = []
            cumulative_reward = 0.0
            action_sequence = []
            step_timeline = []

            obs_dict = obs.model_dump()

            for step_num in range(1, max_steps + 1):
                if env.done:
                    break

                action_dict = agent_fn(obs_dict, step_num, context)
                action = RepoAction(
                    action_type=action_dict.get("action_type", "submit"),
                    path=action_dict.get("path"),
                    query=action_dict.get("query"),
                    content=action_dict.get("content"),
                )

                result = env.step(action)
                obs = result.observation
                obs_dict = obs.model_dump()
                cumulative_reward += result.reward
                action_sequence.append(action.action_type)

                if action.path and action.action_type == "read_file":
                    files_read.append(action.path)
                if action.path and action.action_type == "write_file":
                    files_written.append(action.path)

                step_timeline.append({
                    "step": step_num,
                    "action": action.action_type,
                    "path": action.path,
                    "reward": round(result.reward, 3),
                })

                if result.done:
                    break

            # Force submit if not done
            if not env.done:
                result = env.step(RepoAction(action_type="submit"))
                cumulative_reward += result.reward
                action_sequence.append("submit")

            duration = time.time() - start
            final_score = env.final_score

            # Get trajectory for analysis
            trajectory = env.get_trajectory()
            traj_steps = trajectory.get("steps", []) if trajectory else []
            variant_meta = {}
            if env.variant:
                variant_meta = env.variant.meta

            # Detect strategy
            strategy_report = strategy_detector.detect(
                traj_steps, task, variant_meta, files_read, final_score
            )

            # Classify failure
            failure_report = failure_classifier.classify(
                episode_id=trajectory.get("episode_id", "") if trajectory else "",
                task=task,
                trajectory_steps=traj_steps,
                variant_meta=variant_meta,
                files_read=files_read,
                files_written=files_written,
                final_score=final_score,
            )

            # Advanced metrics
            adv_metrics = metrics_engine.compute(
                traj_steps, variant_meta, final_score, files_read, files_written
            )

            runs.append(AgentRunResult(
                agent_name=agent_name,
                task=task,
                variant_id=variant_id or "unknown",
                final_score=final_score,
                total_steps=len(action_sequence),
                cumulative_reward=cumulative_reward,
                duration_seconds=duration,
                action_sequence=action_sequence,
                files_read=files_read,
                files_written=files_written,
                strategy=strategy_report.strategy,
                strategy_score=strategy_report.score,
                failure_type=failure_report.primary_failure,
                reliability_index=adv_metrics.reliability_index,
                step_timeline=step_timeline,
            ))

        return ComparisonReport(
            task=task,
            variant_id=variant_id or "unknown",
            runs=runs,
        )