Spaces:

Chirag0123
/

codebase-nav-env

Sleeping

File size: 10,678 Bytes

dfbd16e

# server/advanced_metrics.py
"""
Advanced Metrics Engine.

Computes metrics that existing benchmarks (SWE-bench, etc.) completely ignore:
- Exploration vs. exploitation ratio across an episode
- Consistency score across multiple runs of the same task
- Reliability index (weighted aggregate)
- Reasoning efficiency (useful actions / total actions)
- Decision entropy (how predictable/focused the agent is)
"""
import math
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, field


@dataclass
class AdvancedMetricsReport:
    """Container for the advanced metrics of one episode (plus cross-run data).

    The six per-episode values are required; the cross-episode values and the
    breakdown containers have defaults so a report can be built from a single
    episode alone.
    """

    # --- Per-episode scalars (required) ---
    reasoning_efficiency: float    # useful steps divided by total steps
    exploration_ratio: float       # share of read/search among read/search/write/test
    decision_entropy: float        # Shannon entropy of the action-type distribution
    reliability_index: float       # weighted composite reliability score
    pivot_rate: float              # strategy switches per 10 steps
    wasteful_ratio: float          # redundant/erroring actions divided by total actions

    # --- Cross-episode values (filled in when score history is available) ---
    consistency_score: float = 0.0   # higher means less spread between runs
    runs_analyzed: int = 0

    # --- Supporting breakdowns ---
    action_distribution: Dict[str, int] = field(default_factory=dict)
    useful_actions: List[str] = field(default_factory=list)
    wasteful_actions: List[str] = field(default_factory=list)
    reliability_breakdown: Dict[str, float] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return the report as a plain dict with float metrics rounded to 3 places.

        The breakdown containers (``action_distribution``, ``useful_actions``,
        ``wasteful_actions``) are passed through as-is; the values of
        ``reliability_breakdown`` are rounded into a new dict.
        """
        rounded_fields = (
            "reasoning_efficiency",
            "exploration_ratio",
            "decision_entropy",
            "reliability_index",
            "pivot_rate",
            "wasteful_ratio",
            "consistency_score",
        )
        payload = {name: round(getattr(self, name), 3) for name in rounded_fields}
        payload["runs_analyzed"] = self.runs_analyzed
        payload["action_distribution"] = self.action_distribution
        payload["useful_actions"] = self.useful_actions
        payload["wasteful_actions"] = self.wasteful_actions
        payload["reliability_breakdown"] = {
            component: round(value, 3)
            for component, value in self.reliability_breakdown.items()
        }
        return payload


class AdvancedMetricsEngine:
    """
    Computes advanced behavioral and reliability metrics from trajectory data.

    The engine also remembers every ``final_score`` passed to ``compute`` so a
    cross-episode consistency score can be derived when the caller does not
    supply its own history.

    Usage:
        engine = AdvancedMetricsEngine()
        report = engine.compute(
            trajectory_steps=[...],
            variant_meta={...},
            final_score=0.7,
            files_read=[...],
            files_written=[...],
            history=[],  # Pass previous episode scores for consistency
        )
    """

    # Action types grouped by intent; drive the exploration ratio and pivots.
    _EXPLORE_ACTIONS = ("read_file", "search_code")
    _EXPLOIT_ACTIONS = ("write_file", "run_tests")

    # variant_meta keys whose file lists mark a read as "relevant".
    _RELEVANT_FILE_KEYS = (
        "bug_files",
        "interface_files",
        "read_first_files",
        "files_to_implement",
    )

    # Weights of the reliability index components (sum to 1.0);
    # correctness matters most.
    _RELIABILITY_WEIGHTS = {
        "correctness": 0.40,
        "efficiency": 0.20,
        "focus": 0.15,
        "verification": 0.15,
        "safety": 0.10,
    }

    # Safety deduction applied per security flag found in the trajectory.
    _SECURITY_FLAG_PENALTY = 0.2

    def __init__(self):
        self._score_history: List[float] = []  # Tracks scores across episodes

    def compute(
        self,
        trajectory_steps: List[dict],
        variant_meta: Dict[str, Any],
        final_score: float,
        files_read: List[str],
        files_written: List[str],
        history: Optional[List[float]] = None,
    ) -> AdvancedMetricsReport:
        """Compute all advanced metrics for one episode.

        Args:
            trajectory_steps: One dict per step. Keys read here are
                ``action_type``, ``action_path``, ``error``,
                ``test_pass_rate`` and ``security_flags``; all are optional.
            variant_meta: Task metadata. The file lists under ``bug_files``,
                ``interface_files``, ``read_first_files`` and
                ``files_to_implement`` define which reads count as relevant.
                Missing keys or ``None`` values are treated as empty.
            final_score: Episode score; recorded in the engine's history and
                used as the "correctness" reliability component.
            files_read: Paths read during the episode.
            files_written: Paths written during the episode.
            history: Scores to use for the consistency score. When empty or
                ``None``, the engine's own recorded history (which already
                includes ``final_score``) is used instead.

        Returns:
            An ``AdvancedMetricsReport``. For an empty trajectory a fixed
            report is returned (zero efficiency/reliability, wasteful ratio
            1.0) and no consistency score is computed.
        """
        # Kept function-local, as in the original, so the module's import
        # block is not required to provide Counter.
        from collections import Counter

        # Record this score in history (even for empty trajectories).
        self._score_history.append(final_score)

        if not trajectory_steps:
            return AdvancedMetricsReport(
                reasoning_efficiency=0.0,
                exploration_ratio=0.5,
                decision_entropy=0.0,
                reliability_index=0.0,
                pivot_rate=0.0,
                wasteful_ratio=1.0,
            )

        action_seq = [s.get("action_type", "unknown") for s in trajectory_steps]
        total = len(action_seq)  # > 0 past the guard above

        # ── Action distribution ───────────────────────────────────────────────
        dist = Counter(action_seq)
        action_distribution = dict(dist)

        # ── Decision entropy (Shannon entropy of action types) ────────────────
        entropy = 0.0
        for count in dist.values():
            p = count / total
            entropy -= p * math.log2(p)
        # Normalize by the max possible entropy (log2 of unique action types).
        # A single action type has zero entropy by definition.
        if len(dist) > 1:
            # Clamp: rounding can push the ratio a hair above 1.0, which would
            # make the "focus" component slightly negative.
            normalized_entropy = min(1.0, entropy / math.log2(len(dist)))
        else:
            normalized_entropy = 0.0

        # ── Exploration vs exploitation ratio ─────────────────────────────────
        explore = sum(dist.get(a, 0) for a in self._EXPLORE_ACTIONS)
        exploit = sum(dist.get(a, 0) for a in self._EXPLOIT_ACTIONS)
        if explore + exploit > 0:
            exploration_ratio = explore / (explore + exploit)
        else:
            exploration_ratio = 0.5  # neutral when neither kind occurred

        # ── Redundancy / wasteful actions ─────────────────────────────────────
        # Each step counts as wasteful at most once, even when it is both a
        # repeated read and an erroring action, so the ratio stays in [0, 1].
        seen_paths = set()
        redundant_reads = 0
        error_actions = 0
        wasteful_steps = 0
        for step in trajectory_steps:
            is_redundant = False
            if step.get("action_type") == "read_file":
                path = step.get("action_path")
                if path:
                    is_redundant = path in seen_paths
                    seen_paths.add(path)
            has_error = bool(step.get("error"))

            redundant_reads += is_redundant
            error_actions += has_error
            if is_redundant or has_error:
                wasteful_steps += 1

        wasteful_ratio = wasteful_steps / total

        wasteful_actions = []
        if redundant_reads > 0:
            wasteful_actions.append(f"{redundant_reads}x redundant file reads")
        if error_actions > 0:
            wasteful_actions.append(f"{error_actions}x actions that produced errors")

        # ── Useful action detection ───────────────────────────────────────────
        useful_actions = []
        meta = variant_meta or {}
        relevant = set()
        for key in self._RELEVANT_FILE_KEYS:
            # `or ()` also covers keys that are present but set to None.
            relevant.update(meta.get(key) or ())
        relevant_reads = [f for f in files_read if f in relevant]
        if relevant_reads:
            useful_actions.append(f"Read {len(relevant_reads)} key files: {relevant_reads[:3]}")

        test_rates = [
            s.get("test_pass_rate")
            for s in trajectory_steps
            if s.get("test_pass_rate") is not None
        ]
        if len(test_rates) >= 2 and test_rates[-1] > test_rates[0]:
            useful_actions.append(
                f"Test pass rate improved from {test_rates[0]:.2f} to {test_rates[-1]:.2f}"
            )

        if files_written:
            useful_actions.append(f"Wrote {len(files_written)} file(s): {files_written[:3]}")

        # ── Reasoning efficiency ──────────────────────────────────────────────
        # One point per relevant read, plus one each for having written
        # anything and for having any test result; capped at 1.0.
        useful_count = len(relevant_reads) + (1 if files_written else 0) + (1 if test_rates else 0)
        reasoning_efficiency = min(1.0, useful_count / total)

        # ── Pivot rate (strategy switches per 10 steps) ───────────────────────
        pivots = 0
        for prev_action, curr_action in zip(action_seq, action_seq[1:]):
            explore_to_exploit = (
                prev_action in self._EXPLORE_ACTIONS
                and curr_action in self._EXPLOIT_ACTIONS
            )
            exploit_to_explore = (
                prev_action in self._EXPLOIT_ACTIONS
                and curr_action in self._EXPLORE_ACTIONS
            )
            if explore_to_exploit or exploit_to_explore:
                pivots += 1
        pivot_rate = (pivots / total) * 10  # per 10 steps

        # ── Reliability index ─────────────────────────────────────────────────
        # `or []` guards against steps that carry an explicit None for flags.
        sec_flags = sum(len(s.get("security_flags") or []) for s in trajectory_steps)
        safety = max(0.0, 1.0 - sec_flags * self._SECURITY_FLAG_PENALTY)

        reliability_breakdown = {
            "correctness": final_score,
            "efficiency": max(0.0, 1.0 - wasteful_ratio),
            "focus": 1.0 - normalized_entropy,  # Low entropy = focused behavior
            "verification": 1.0 if test_rates else 0.0,
            "safety": safety,
        }
        reliability_index = sum(
            reliability_breakdown[name] * weight
            for name, weight in self._RELIABILITY_WEIGHTS.items()
        )

        # ── Consistency score (cross-episode) ────────────────────────────────
        scores_to_use = list(history) if history else self._score_history
        consistency_score = 0.0
        runs_analyzed = len(scores_to_use)

        if runs_analyzed >= 2:
            mean = sum(scores_to_use) / runs_analyzed
            variance = sum((s - mean) ** 2 for s in scores_to_use) / runs_analyzed
            std_dev = math.sqrt(variance)
            # Consistency = 1 - std_dev relative to the mean (higher = more
            # consistent); the 0.01 floor avoids dividing by a ~zero mean.
            consistency_score = max(0.0, 1.0 - (std_dev / max(mean, 0.01)))

        return AdvancedMetricsReport(
            reasoning_efficiency=reasoning_efficiency,
            exploration_ratio=exploration_ratio,
            decision_entropy=normalized_entropy,
            reliability_index=reliability_index,
            pivot_rate=pivot_rate,
            wasteful_ratio=wasteful_ratio,
            consistency_score=consistency_score,
            runs_analyzed=runs_analyzed,
            action_distribution=action_distribution,
            useful_actions=useful_actions,
            wasteful_actions=wasteful_actions,
            reliability_breakdown=reliability_breakdown,
        )

    def get_score_history(self) -> List[float]:
        """Return a copy of the scores recorded by ``compute`` so far."""
        return list(self._score_history)

    def reset_history(self) -> None:
        """Forget all recorded scores."""
        self._score_history = []