Spaces:

Meta-HF-hackathon
/

updated-policy

Sleeping

File size: 9,041 Bytes

"""
Task registry and unified grader.

`TASK_REGISTRY` maps task_name → scenario class.  Pools (A/B/C/D) live
in `pools.py` and reuse the same registry — there's no duplication.

The unified grader is oracle-INDEPENDENT: it consumes only step records
plus terminal artefacts (declared patch, declared no-change), so it can
score a saved trajectory file long after the episode ended.
"""

from __future__ import annotations

from typing import Dict, List, Optional, Type

from .models import StepRecord
from .scenarios.base import BaseScenario
from .scenarios.easy_memory_leak import MemoryLeakScenario
from .scenarios.medium_cascading_failure import CascadingFailureScenario
from .scenarios.hard_distributed_deadlock import DistributedDeadlockScenario
from .scenarios.grader_p2 import (
    grade_patch_quality,
    grade_no_change,
    grade_p2_efficiency,
)


# ──────────────────────────────────────────────────────────────────────
# Registry
# ──────────────────────────────────────────────────────────────────────

# Maps each task name to the scenario class implementing it.  The three
# entries below are always present; further entries are added in place by
# `_lazy_register_phase_b()`.
TASK_REGISTRY: Dict[str, Type[BaseScenario]] = {
    "memory_leak":          MemoryLeakScenario,
    "cascading_failure":    CascadingFailureScenario,
    "distributed_deadlock": DistributedDeadlockScenario,
}

# Phase B scenarios are imported inside `_lazy_register_phase_b()` rather
# than at the top of this module, to avoid import cycles during the initial
# Phase A bring-up.  The flag below is flipped to True on the first call to
# that function so that any later call returns immediately.
_PHASE_B_REGISTERED = False


def _lazy_register_phase_b() -> None:
    """Register every Phase B / Pool D scenario whose module can be imported.

    Each scenario module is imported independently: a module that raises
    ``ImportError`` is skipped and the remaining ones are still attempted,
    so registration stays best-effort.  Skipped modules are reported on this
    module's logger at DEBUG level instead of being dropped without a trace,
    which makes a broken (as opposed to merely absent) scenario module
    discoverable when debugging a missing task.

    The body runs at most once: ``_PHASE_B_REGISTERED`` is set before the
    first import is attempted, so any later or nested call returns
    immediately.

    Side effects:
        Adds entries to the module-level ``TASK_REGISTRY`` in place.
    """
    global _PHASE_B_REGISTERED
    if _PHASE_B_REGISTERED:
        return
    _PHASE_B_REGISTERED = True

    # Imported locally so this function does not depend on a module-level
    # logging import being present.
    import logging

    log = logging.getLogger(__name__)

    def _report_skip(module: str, exc: ImportError) -> None:
        # DEBUG keeps normal start-up silent while still leaving a trail.
        log.debug(
            "Scenario module %r could not be imported; its tasks are not "
            "registered: %s",
            module,
            exc,
        )

    try:
        from .scenarios.aliased_fault import AliasedFaultScenario
    except ImportError as exc:
        _report_skip("aliased_fault", exc)
    else:
        TASK_REGISTRY["aliased_fault"] = AliasedFaultScenario

    try:
        from .scenarios.severity_inversion import SeverityInversionScenario
    except ImportError as exc:
        _report_skip("severity_inversion", exc)
    else:
        TASK_REGISTRY["severity_inversion"] = SeverityInversionScenario

    try:
        from .scenarios.confidence_inversion import ConfidenceInversionScenario
    except ImportError as exc:
        _report_skip("confidence_inversion", exc)
    else:
        TASK_REGISTRY["confidence_inversion"] = ConfidenceInversionScenario

    try:
        from .scenarios.info_ordering import InfoOrderingScenario
    except ImportError as exc:
        _report_skip("info_ordering", exc)
    else:
        TASK_REGISTRY["info_ordering"] = InfoOrderingScenario

    try:
        from .scenarios.circuit_breaker_noop import CircuitBreakerNoopScenario
    except ImportError as exc:
        _report_skip("circuit_breaker_noop", exc)
    else:
        TASK_REGISTRY["circuit_breaker_noop"] = CircuitBreakerNoopScenario

    # Pool D held-out compounds: both classes come from one module, so
    # either both tasks are registered or neither is.
    try:
        from .scenarios.heldout import (
            HeldoutAliasedSeverityScenario,
            HeldoutConfidenceOrderingScenario,
        )
    except ImportError as exc:
        _report_skip("heldout", exc)
    else:
        TASK_REGISTRY["heldout_aliased_severity"] = HeldoutAliasedSeverityScenario
        TASK_REGISTRY["heldout_confidence_ordering"] = HeldoutConfidenceOrderingScenario


# Run the Phase B / Pool D registration once at import time, then snapshot
# the registered task names.  TASK_NAMES is a separate list built from the
# registry's keys at this point; entries added to TASK_REGISTRY afterwards
# do not appear in it.
_lazy_register_phase_b()
TASK_NAMES: List[str] = list(TASK_REGISTRY.keys())


def get_scenario(task_name: str) -> BaseScenario:
    """Build a new scenario instance for ``task_name``.

    The class is looked up in ``TASK_REGISTRY`` and instantiated with no
    arguments, so every call yields a separate object.

    Raises:
        ValueError: if ``task_name`` has no entry in ``TASK_REGISTRY``; the
            message lists the names that are currently registered.
    """
    scenario_cls = TASK_REGISTRY.get(task_name)
    if scenario_cls is not None:
        return scenario_cls()

    available = list(TASK_REGISTRY)
    raise ValueError(f"Unknown task: {task_name}. Available: {available}")


# ──────────────────────────────────────────────────────────────────────
# P1-only grader (legacy)
# ──────────────────────────────────────────────────────────────────────

def grade_trajectory(task_name: str, trajectory: List[StepRecord]) -> float:
    """Score a P1-only trajectory with the scenario's own ``grade`` method.

    A fresh scenario is created for ``task_name`` and its ``grade`` result is
    converted to ``float``.  The score is expected to lie in [0.01, 0.99];
    that range is whatever the scenario produces -- this wrapper does not
    clamp.

    Raises:
        ValueError: if ``task_name`` is not registered (from ``get_scenario``).
    """
    raw_score = get_scenario(task_name).grade(trajectory)
    return float(raw_score)


# ──────────────────────────────────────────────────────────────────────
# Unified grader
# ──────────────────────────────────────────────────────────────────────

#  Component weights.  The four base weights (P1 RCA, P1 efficiency, patch
#  quality, no-change) sum to 1.0.
W_P1_RCA            = 0.25
W_P1_EFFICIENCY     = 0.15
W_PATCH_QUALITY     = 0.35
W_NO_CHANGE         = 0.25
# Note: for is_valid_issue scenarios the "no_change" slot is reallocated to
# P2 efficiency, so W_P2_EFFICIENCY takes the place of W_NO_CHANGE there; it
# is not a fifth share added on top of the four weights above.
W_P2_EFFICIENCY     = 0.25


def grade_trajectory_unified(
    task_name:          str,
    p1_trajectory:      List[StepRecord],
    p2_trajectory:      List[StepRecord],
    declared_patch:     Optional[str],
    declared_no_change: bool,
    p1_belief_history:  Optional[List[dict]] = None,
) -> Dict[str, float]:
    """
    Score a combined P1 + P2 trajectory and return a per-component breakdown.

    Args:
        task_name: key into ``TASK_REGISTRY``.
        p1_trajectory: step records handed to the scenario's P1 graders.
        p2_trajectory: step records; only those with ``phase == 2`` are
            counted towards P2 efficiency.
        declared_patch: the submitted patch; ``None`` is graded as ``""``.
        declared_no_change: whether "no change needed" was declared.
        p1_belief_history: accepted for interface compatibility; not read
            by this function.

    Returns:
        A dict with keys ``final``, ``p1_rca``, ``p1_efficiency``,
        ``patch_quality``, ``no_change_detection`` and ``p2_efficiency``,
        each rounded to 4 decimals.  The component entries are the raw
        (unweighted) grader outputs; ``final`` is the weighted aggregate.
        ``final`` stays within [0, 1] provided every grader returns a value
        in [0, 1] -- that is assumed here, not enforced.

    Aggregation:
        * scenario without a code context: ``final`` is the plain average
          of the two P1 scores and all P2 components are reported as 0.0;
        * valid-issue scenario: P1 RCA, P1 efficiency, patch quality and
          P2 efficiency are combined (weights sum to 1.0);
        * no-change scenario: P1 RCA, P1 efficiency and no-change detection
          are combined.

    NOTE(review): in the no-change case ``p2_efficiency`` is computed and
    reported but does not contribute to ``final``, and the patch-quality
    weight is not reallocated, so ``final`` tops out at 0.65 there.  Confirm
    whether that is intended.

    Raises:
        ValueError: if ``task_name`` is not registered (from ``get_scenario``).
    """
    scenario = get_scenario(task_name)
    code_ctx = scenario.code_context

    # P1 is graded for every scenario.
    rca_score    = scenario.grade_p1_rca(p1_trajectory)
    p1_eff_score = scenario.grade_p1_efficiency(p1_trajectory)

    if code_ctx is None:
        # P1-only scenario: nothing to grade in P2, so the two P1 scores
        # share the whole result equally.
        return {
            "final":               round(rca_score * 0.5 + p1_eff_score * 0.5, 4),
            "p1_rca":              round(rca_score,    4),
            "p1_efficiency":       round(p1_eff_score, 4),
            "patch_quality":       0.0,
            "no_change_detection": 0.0,
            "p2_efficiency":       0.0,
        }

    # Exactly one of patch quality / no-change detection applies; the other
    # is pinned to 0.0.
    if code_ctx.is_valid_issue:
        patch_score     = grade_patch_quality(declared_patch or "", code_ctx)
        no_change_score = 0.0
    else:
        patch_score     = 0.0
        no_change_score = grade_no_change(declared_no_change, code_ctx)

    # P2 efficiency is evaluated the same way in both kinds of scenario.
    phase2_step_count = len([rec for rec in p2_trajectory if rec.phase == 2])
    p2_eff_score = grade_p2_efficiency(
        p2_steps=phase2_step_count,
        expected_steps=code_ctx.expected_p2_steps,
    )

    final = W_P1_RCA * rca_score
    final += W_P1_EFFICIENCY * p1_eff_score
    final += W_PATCH_QUALITY * patch_score
    final += W_NO_CHANGE * no_change_score
    if code_ctx.is_valid_issue:
        # The no-change term above is exactly 0.0 here, so its weight slot
        # is taken by P2 efficiency.
        final += W_P2_EFFICIENCY * p2_eff_score

    return {
        "final":               round(final,           4),
        "p1_rca":              round(rca_score,       4),
        "p1_efficiency":       round(p1_eff_score,    4),
        "patch_quality":       round(patch_score,     4),
        "no_change_detection": round(no_change_score, 4),
        "p2_efficiency":       round(p2_eff_score,    4),
    }


# ──────────────────────────────────────────────────────────────────────
# Counterfactual r_cross
# ──────────────────────────────────────────────────────────────────────

def compute_r_cross(
    task_name:      str,
    declared_patch: Optional[str],
    declared_no_change: bool,
    p2_trajectory:  List[StepRecord],
) -> float:
    """
    Counterfactual cross-phase reward:
        r_cross = max(0, r_code(τ_2 | context(τ_1)) - r_code(τ_2 | context(∅)))

    The with-context score is the patch-quality grade for valid-issue
    scenarios and the no-change grade otherwise.  The null-context baseline
    is read from `CodeContext.null_context_p2_score` (filled in by
    `training/run_pool_b_baseline.py`).  The difference is clamped to ≥0 so
    Phase 1 is never punished for inherently hard bugs that no context
    could have helped.

    Scenarios without a code context yield 0.0.  `p2_trajectory` is accepted
    but not read by this function.

    Raises:
        ValueError: if ``task_name`` is not registered (from ``get_scenario``).
    """
    code_ctx = get_scenario(task_name).code_context
    if code_ctx is None:
        return 0.0

    score_with_context = (
        grade_patch_quality(declared_patch or "", code_ctx)
        if code_ctx.is_valid_issue
        else grade_no_change(declared_no_change, code_ctx)
    )

    improvement = score_with_context - code_ctx.null_context_p2_score
    return max(0.0, improvement)