Spaces:

Chirag0123
/

codebase-nav-env

Sleeping

File size: 16,531 Bytes

0b0338d

# server/counterfactual_engine.py
"""
Counterfactual Robustness Engine — v4.0

The key scientific question: Is the agent's strategy robust, or is it brittle?

We test this by:
1. Running an episode → recording strategy
2. Enumerating small, semantically-neutral mutations of the repo
   (rename a file, change a constant, add a dummy function, ...)
3. Reasoning about whether the agent's recorded strategy would fail on the mutated repo

IMPORTANT: This does NOT re-run the agent. It analyzes whether the
already-recorded navigation pattern was based on deep structure (robust)
or surface signals like filenames/constants (brittle).

This is completely novel — no benchmark or tool does this.
"""
from __future__ import annotations
import random
import hashlib
from typing import List, Dict, Any, Tuple
from dataclasses import dataclass, field
from enum import Enum


class BrittlenessLevel(str, Enum):
    """Coarse robustness bucket for an analyzed episode.

    Members are ordered from most to least robust. The percentages are the
    share of counterfactual mutations the recorded strategy survives, using
    the cut-offs applied by ``CounterfactualEngine.analyze``.
    """

    # Survives at least 80% of mutations
    ROBUST = "ROBUST"
    # Survives 60-80% of mutations
    MILDLY_BRITTLE = "MILDLY_BRITTLE"
    # Survives 30-60% of mutations
    BRITTLE = "BRITTLE"
    # Survives fewer than 30% of mutations
    FRAGILE = "FRAGILE"


@dataclass
class Mutation:
    """One counterfactual mutation and its predicted effect on the agent.

    Attributes:
        mutation_type: Identifier of the mutation kind (e.g. "FILENAME_RENAME").
        target_file: Path of the file the mutation would touch.
        description: Human-readable summary of the mutation.
        would_break_agent: True when the mutation is predicted to make the
            agent's recorded strategy fail.
        why: Explanation backing the ``would_break_agent`` verdict.
    """

    mutation_type: str
    target_file: str
    description: str
    would_break_agent: bool
    why: str


@dataclass
class CounterfactualReport:
    """Outcome of one counterfactual robustness analysis.

    Attributes:
        episode_id: Identifier of the analyzed episode.
        task: Name of the task the episode belongs to.
        brittleness_level: Bucketed robustness verdict.
        robustness_score: Fraction of mutations survived, in 0.0 – 1.0.
        mutations_tested: Every mutation that was evaluated.
        mutations_survived: Number of mutations predicted not to break the agent.
        mutations_failed: Number of mutations predicted to break the agent.
        surface_dependencies: Surface signals the agent relied on.
        deep_dependencies: Structural signals the agent used correctly.
        explanation: Prose summary of the verdict.
        recommendations: Suggested changes to the agent's strategy.
    """

    episode_id: str
    task: str
    brittleness_level: BrittlenessLevel
    robustness_score: float

    mutations_tested: List[Mutation]
    mutations_survived: int
    mutations_failed: int

    surface_dependencies: List[str]
    deep_dependencies: List[str]

    explanation: str
    recommendations: List[str]

    def to_dict(self) -> dict:
        """Return a plain-dict view of the report.

        ``mutations_tested`` is emitted as a count and the per-mutation
        details go under ``mutations``. The score is rounded to three
        decimals and the brittleness level is reduced to its ``.value``.
        """
        mutation_rows = []
        for mutation in self.mutations_tested:
            mutation_rows.append({
                "type": mutation.mutation_type,
                "file": mutation.target_file,
                "description": mutation.description,
                "would_break_agent": mutation.would_break_agent,
                "why": mutation.why,
            })

        payload = {
            "episode_id": self.episode_id,
            "task": self.task,
            "brittleness_level": self.brittleness_level.value,
            "robustness_score": round(self.robustness_score, 3),
            "mutations_tested": len(self.mutations_tested),
            "mutations_survived": self.mutations_survived,
            "mutations_failed": self.mutations_failed,
            "mutations": mutation_rows,
            "surface_dependencies": self.surface_dependencies,
            "deep_dependencies": self.deep_dependencies,
            "explanation": self.explanation,
            "recommendations": self.recommendations,
        }
        return payload


class CounterfactualEngine:
    """
    Analyzes brittleness by reasoning about what mutations would break the agent.

    We don't need to actually re-run the agent — we analyze the recorded
    trajectory and ask: "If file X was named differently / had a different
    constant, would this agent's navigation pattern still work?"

    Brittle signals:
    - Agent found bug file by pattern-matching on filename (not content search)
    - Agent submitted after reading the same file every run
    - Agent ignored test content and relied on positional heuristics

    Robust signals:
    - Agent used search_code to find function by name
    - Agent read test → traced import → found source
    - Agent ran tests and verified result before submitting
    """

    MUTATION_TEMPLATES = [
        {
            "type": "FILENAME_RENAME",
            "description": "Rename src/X.py to src/X_v2.py (same content)",
            "breaks_if": "agent found file by name pattern, not by search or import tracing",
            "surface_signal": "filename",
            "robust_signal": "import tracing or search_code",
        },
        {
            "type": "CONSTANT_CHANGE",
            "description": "Change a numeric constant by ±1 (semantically neutral for navigation)",
            "breaks_if": "agent hardcoded expected value rather than reading actual code",
            "surface_signal": "constant value pattern matching",
            "robust_signal": "dynamic code reading",
        },
        {
            "type": "DUMMY_FUNCTION",
            "description": "Add a dummy function with a similar name near the bug",
            "breaks_if": "agent used first-match navigation without reading full context",
            "surface_signal": "first result of search or first match in file",
            "robust_signal": "reading complete function signatures before deciding",
        },
        {
            "type": "DIRECTORY_SHUFFLE",
            "description": "Move test file from tests/ to test/ (same content)",
            "breaks_if": "agent hardcoded path prefix tests/ instead of searching",
            "surface_signal": "hardcoded directory prefix",
            "robust_signal": "search or dynamic discovery",
        },
        {
            "type": "DOCSTRING_NOISE",
            "description": "Add misleading docstring claiming a different function causes the bug",
            "breaks_if": "agent read docs instead of tests to understand expected behavior",
            "surface_signal": "docstring content",
            "robust_signal": "test assertions as ground truth",
        },
        {
            "type": "IMPORT_REORDER",
            "description": "Reorder imports in the source file",
            "breaks_if": "agent relied on line numbers instead of function names",
            "surface_signal": "absolute line numbers",
            "robust_signal": "function name search",
        },
    ]

    def analyze(
        self,
        episode_id: str,
        task: str,
        trajectory_steps: List[dict],
        variant_meta: dict,
        files_read: List[str],
        files_written: List[str],
        final_score: float,
    ) -> CounterfactualReport:
        """
        Analyze robustness by simulating mutations and reasoning about
        whether the agent's recorded pattern would survive them.

        Args:
            episode_id: Identifier copied into the report.
            task: Task name copied into the report.
            trajectory_steps: Recorded steps; each is a dict read for the keys
                "action_type", "action_path" and "step_number".
            variant_meta: Dict read for "bug_files" (falling back to
                "files_to_implement") and "test_files".
            files_read: Paths the agent read, used to pick mutation targets.
            files_written: Accepted for interface compatibility; not used by
                the current analysis.
            final_score: Accepted for interface compatibility; not used by
                the current analysis.

        Returns:
            A CounterfactualReport with one Mutation per entry in
            MUTATION_TEMPLATES, the survival ratio as robustness_score, and
            the matching brittleness level, explanation and recommendations.
        """
        # Tolerate missing inputs instead of failing on None.
        steps = list(trajectory_steps or [])
        meta = variant_meta or {}
        read_paths = list(files_read or [])

        action_types = [s.get("action_type", "") for s in steps]

        bug_files = set(meta.get("bug_files", []) or
                        meta.get("files_to_implement", []) or [])
        test_files_meta = set(meta.get("test_files", []) or [])

        # Infer what signals agent used
        used_search = "search_code" in action_types
        used_tests_first = self._tests_read_before_src(steps, test_files_meta, bug_files)
        blind_navigation = not used_search and not used_tests_first
        read_count = action_types.count("read_file")
        # A write within the first three actions counts as a blind fix.
        immediate_write = (
            "write_file" in action_types and action_types.index("write_file") <= 2
        )
        verified_before_submit = self._verified_before_submit(steps)

        # ── Evaluate each mutation ────────────────────────────────────────────
        mutations: List[Mutation] = []

        for tmpl in self.MUTATION_TEMPLATES:
            target_file = self._pick_target_file(tmpl["type"], read_paths, bug_files)
            would_break, why = self._would_break_agent(
                mutation_type=tmpl["type"],
                used_search=used_search,
                used_tests_first=used_tests_first,
                verified_before_submit=verified_before_submit,
                blind_navigation=blind_navigation,
                immediate_write=immediate_write,
                read_count=read_count,
                tmpl=tmpl,
            )
            mutations.append(Mutation(
                mutation_type=tmpl["type"],
                target_file=target_file or "unknown",
                description=tmpl["description"],
                would_break_agent=would_break,
                why=why,
            ))

        survived = sum(1 for m in mutations if not m.would_break_agent)
        failed = len(mutations) - survived

        robustness_score = survived / len(mutations) if mutations else 0.0

        # ── Surface vs deep dependency analysis ──────────────────────────────
        surface_deps = []
        deep_deps = []

        if not used_search:
            surface_deps.append("Filename-based navigation (no search_code used)")
        if not used_tests_first:
            surface_deps.append("Skipped test-informed navigation")
        if immediate_write:
            surface_deps.append("Immediate write after minimal reading (blind fix)")
        if not verified_before_submit:
            surface_deps.append("Submitted without running tests (no verification)")

        if used_search:
            deep_deps.append("Used search_code to find functions by name (content-based)")
        if used_tests_first:
            deep_deps.append("Read tests first — used expected behavior as compass")
        if read_count >= 3:
            deep_deps.append(f"Read {read_count} files — explored structure before committing")
        if verified_before_submit:
            deep_deps.append("Verified fix with run_tests before submitting")

        # ── Brittleness classification ────────────────────────────────────────
        if robustness_score >= 0.80:
            level = BrittlenessLevel.ROBUST
        elif robustness_score >= 0.60:
            level = BrittlenessLevel.MILDLY_BRITTLE
        elif robustness_score >= 0.30:
            level = BrittlenessLevel.BRITTLE
        else:
            level = BrittlenessLevel.FRAGILE

        explanations = {
            BrittlenessLevel.ROBUST: (
                "Agent strategy is robust. It relies on deep structural signals (function names, "
                "test assertions, causal chain traversal) rather than surface patterns. "
                "Minor repo mutations would not break its navigation."
            ),
            BrittlenessLevel.MILDLY_BRITTLE: (
                "Agent strategy is mildly brittle. Some mutations would break its navigation, "
                "particularly those that change surface signals it relied on. "
                "Using search_code and test-first navigation consistently would improve robustness."
            ),
            BrittlenessLevel.BRITTLE: (
                "Agent strategy is brittle. Most mutations would break its navigation. "
                "The agent appears to rely on stable surface patterns (filenames, positions) "
                "rather than understanding the semantic structure of the codebase."
            ),
            BrittlenessLevel.FRAGILE: (
                "Agent strategy is fragile. Almost any perturbation to the repo structure "
                "would cause this agent to fail. This indicates pure pattern-matching on "
                "the specific repo layout rather than generalizable code understanding."
            ),
        }

        recs = []
        if not used_search:
            recs.append("Use search_code to find functions by name — survives filename renames.")
        if not used_tests_first:
            recs.append("Read tests first to anchor your navigation in expected behavior, not filenames.")
        if immediate_write:
            recs.append("Read source files before writing to them — avoid blind writes.")
        if not verified_before_submit:
            recs.append("Run tests after writing — verify your fix holds on the actual behavior.")

        return CounterfactualReport(
            episode_id=episode_id,
            task=task,
            brittleness_level=level,
            robustness_score=robustness_score,
            mutations_tested=mutations,
            mutations_survived=survived,
            mutations_failed=failed,
            surface_dependencies=surface_deps,
            deep_dependencies=deep_deps,
            explanation=explanations[level],
            recommendations=recs,
        )

    # ── Helpers ───────────────────────────────────────────────────────────────

    @staticmethod
    def _step_orders(steps: List[dict]) -> list:
        """Return one ordering key per step, aligned with ``steps``.

        The recorded "step_number" values are used when every step carries a
        numeric one. Otherwise all steps are ordered by list position, so a
        missing number can never tie with or outrank a real one.
        """
        numbers = [s.get("step_number") for s in steps]
        if all(
            isinstance(n, (int, float)) and not isinstance(n, bool)
            for n in numbers
        ):
            return numbers
        return list(range(len(steps)))

    def _tests_read_before_src(
        self, steps: List[dict], test_files: set, bug_files: set
    ) -> bool:
        """Return True when a test file was read strictly before any bug file.

        A "read_file" step matches a file when the file name is a substring
        of the step's "action_path". Returns False when either kind of file
        was never read.
        """
        # Empty names are dropped: "" is a substring of every path and would
        # make every read look like a match.
        test_names = [t for t in (test_files or ()) if isinstance(t, str) and t]
        src_names = [b for b in (bug_files or ()) if isinstance(b, str) and b]

        first_test = None
        first_src = None
        for order, step in zip(self._step_orders(steps), steps):
            if step.get("action_type") != "read_file":
                continue
            path = step.get("action_path") or ""
            if any(name in path for name in test_names):
                first_test = order if first_test is None else min(first_test, order)
            if any(name in path for name in src_names):
                first_src = order if first_src is None else min(first_src, order)

        if first_test is None or first_src is None:
            return False
        return first_test < first_src

    def _verified_before_submit(self, steps: List[dict]) -> bool:
        """Return True when a "run_tests" step precedes the first "submit" step.

        Returns False when the trajectory contains no "submit" step.
        """
        orders = self._step_orders(steps)
        submit_order = next(
            (o for o, s in zip(orders, steps) if s.get("action_type") == "submit"),
            None,
        )
        if submit_order is None:
            return False
        return any(
            s.get("action_type") == "run_tests" and o < submit_order
            for o, s in zip(orders, steps)
        )

    def _pick_target_file(
        self, mutation_type: str, files_read: List[str], bug_files: set
    ) -> str:
        """Choose the file a given mutation would be applied to.

        Source-level mutations target a bug file, falling back to the first
        file read and then to "src/main.py". DIRECTORY_SHUFFLE targets the
        first read path containing "test". Everything else targets the first
        file read, or "unknown" when nothing was read.
        """
        read_paths = list(files_read or [])
        if mutation_type in ("FILENAME_RENAME", "DUMMY_FUNCTION", "IMPORT_REORDER"):
            # Sets have no stable iteration order, so take the smallest name
            # to keep the report reproducible across runs.
            candidates = sorted(f for f in (bug_files or ()) if f)
            if candidates:
                return candidates[0]
            return read_paths[0] if read_paths else "src/main.py"
        if mutation_type == "DIRECTORY_SHUFFLE":
            for f in read_paths:
                if "test" in f.lower():
                    return f
        return read_paths[0] if read_paths else "unknown"

    def _would_break_agent(
        self,
        mutation_type: str,
        used_search: bool,
        used_tests_first: bool,
        verified_before_submit: bool,
        blind_navigation: bool,
        immediate_write: bool,
        read_count: int,
        tmpl: dict,
    ) -> Tuple[bool, str]:
        """
        Return (would_break, explanation) by reasoning about the agent's signals.

        Only ``used_search``, ``used_tests_first``, ``blind_navigation`` and
        ``read_count`` influence the verdict; ``verified_before_submit``,
        ``immediate_write`` and ``tmpl`` are accepted but not consulted.
        Unknown mutation types are treated as neutral.
        """
        if mutation_type == "FILENAME_RENAME":
            if used_search:
                return False, "Agent used search_code — finds function by name, not filename"
            if blind_navigation:
                return True, "Agent navigated by filename without search — rename breaks it"
            return True, "Agent likely relied on filename pattern without search fallback"

        if mutation_type == "CONSTANT_CHANGE":
            # Almost never breaks well-behaved agents
            if read_count >= 2:
                return False, "Agent read files dynamically — adapts to any constant value"
            return True, "Agent may have hardcoded expected value in navigation heuristic"

        if mutation_type == "DUMMY_FUNCTION":
            if used_search and read_count >= 3:
                return False, "Agent searched and read thoroughly — would disambiguate"
            return True, "Agent took first match without thorough reading"

        if mutation_type == "DIRECTORY_SHUFFLE":
            if used_search:
                return False, "search_code finds tests regardless of directory"
            return True, "Agent used hardcoded path prefix — directory change breaks it"

        if mutation_type == "DOCSTRING_NOISE":
            if used_tests_first:
                return False, "Agent used test assertions as ground truth, not docstrings"
            return True, "Agent may have read misleading docstring instead of test"

        if mutation_type == "IMPORT_REORDER":
            # Only brittle if agent relied on line numbers
            if read_count <= 1:
                return True, "Agent skimmed — likely used line numbers for navigation"
            return False, "Agent read full files — import reorder doesn't change function content"

        return False, "Neutral mutation"