Spaces:

Meta-HF-hackathon
/

updated-policy

Sleeping

App Files Files Community

updated-policy / tasks.py

srinjoyd

init

19f7f7b 17 days ago

raw

history blame contribute delete

9.04 kB

	"""
	Task registry and unified grader.

	`TASK_REGISTRY` maps task_name → scenario class. Pools (A/B/C/D) live
	in `pools.py` and reuse the same registry — there's no duplication.

	The unified grader is oracle-INDEPENDENT: it consumes only step records
	plus terminal artefacts (declared patch, declared no-change), so it can
	score a saved trajectory file long after the episode ended.
	"""

	from __future__ import annotations

	from typing import Dict, List, Optional, Type

	from .models import StepRecord
	from .scenarios.base import BaseScenario
	from .scenarios.easy_memory_leak import MemoryLeakScenario
	from .scenarios.medium_cascading_failure import CascadingFailureScenario
	from .scenarios.hard_distributed_deadlock import DistributedDeadlockScenario
	from .scenarios.grader_p2 import (
	grade_patch_quality,
	grade_no_change,
	grade_p2_efficiency,
	)


	# ──────────────────────────────────────────────────────────────────────
	# Registry
	# ──────────────────────────────────────────────────────────────────────

	# Always-available scenarios, keyed by the task name that `get_scenario()`
	# looks up. `_lazy_register_phase_b()` adds further entries to this same
	# dict when their modules can be imported.
	TASK_REGISTRY: Dict[str, Type[BaseScenario]] = {
	"memory_leak": MemoryLeakScenario,
	"cascading_failure": CascadingFailureScenario,
	"distributed_deadlock": DistributedDeadlockScenario,
	}

	# Phase B scenarios are registered by `_lazy_register_phase_b()`, which
	# skips any scenario module that cannot be imported (the stated motivation
	# is avoiding import cycles during the initial Phase A bring-up).
	# This flag makes that function one-shot: it is set on the first call and
	# every later call returns immediately.
	# NOTE(review): the function is invoked at module import time, right before
	# TASK_NAMES is built, so registration is not actually deferred.
	_PHASE_B_REGISTERED = False


	def _lazy_register_phase_b() -> None:
	    """Register the optional Phase B / Pool D scenarios that can be imported.

	    One-shot: the module flag `_PHASE_B_REGISTERED` is set on the first call
	    and later calls return immediately. Each scenario module is imported
	    independently, so a module that cannot be imported only skips its own
	    entries. All classes of a group are resolved before any of them is
	    registered, so the two held-out scenarios are added together or not at
	    all.

	    Skipped modules are reported through `logging` instead of being dropped
	    silently: DEBUG when the module simply is not there, WARNING when it
	    exists but fails to import or lacks an expected class.
	    """
	    global _PHASE_B_REGISTERED
	    if _PHASE_B_REGISTERED:
	        return
	    _PHASE_B_REGISTERED = True

	    # Function-scope imports keep this block independent of the file header.
	    import importlib
	    import logging

	    log = logging.getLogger(__name__)

	    # (relative module, ((task name, class name), ...)).
	    # Order matters: it determines the order of TASK_REGISTRY / TASK_NAMES.
	    phase_b_groups = (
	        (".scenarios.aliased_fault",
	         (("aliased_fault", "AliasedFaultScenario"),)),
	        (".scenarios.severity_inversion",
	         (("severity_inversion", "SeverityInversionScenario"),)),
	        (".scenarios.confidence_inversion",
	         (("confidence_inversion", "ConfidenceInversionScenario"),)),
	        (".scenarios.info_ordering",
	         (("info_ordering", "InfoOrderingScenario"),)),
	        (".scenarios.circuit_breaker_noop",
	         (("circuit_breaker_noop", "CircuitBreakerNoopScenario"),)),
	        # Pool D held-out compounds
	        (".scenarios.heldout",
	         (("heldout_aliased_severity", "HeldoutAliasedSeverityScenario"),
	          ("heldout_confidence_ordering", "HeldoutConfidenceOrderingScenario"))),
	    )

	    package = __package__
	    if not package:
	        # Without a parent package no relative import can succeed, so there
	        # is nothing to register (the plain `from .x import y` form would
	        # have raised ImportError for every entry).
	        log.debug("No parent package; skipping Phase B scenario registration")
	        return

	    for module_path, entries in phase_b_groups:
	        try:
	            module = importlib.import_module(module_path, package=package)
	        except ImportError as exc:
	            missing_name = getattr(exc, "name", None) or ""
	            module_absent = (
	                isinstance(exc, ModuleNotFoundError)
	                and missing_name.endswith(module_path)
	            )
	            if module_absent:
	                log.debug("Phase B module %s not present; skipping", module_path)
	            else:
	                # The module exists but one of its own imports failed.
	                log.warning(
	                    "Phase B module %s failed to import: %s", module_path, exc)
	            continue

	        try:
	            resolved = [
	                (task_name, getattr(module, class_name))
	                for task_name, class_name in entries
	            ]
	        except AttributeError as exc:
	            # Same outcome as a failed `from module import Name`: skip the
	            # whole group rather than registering part of it.
	            log.warning(
	                "Phase B module %s lacks an expected class: %s", module_path, exc)
	            continue

	        TASK_REGISTRY.update(resolved)


	# Add the optional scenarios first so the snapshot below lists every task
	# that is registered at import time.
	_lazy_register_phase_b()
	TASK_NAMES = list(TASK_REGISTRY)


	def get_scenario(task_name: str) -> BaseScenario:
	    """Instantiate the scenario registered under *task_name*.

	    Looks the name up in `TASK_REGISTRY` and calls the registered class with
	    no arguments.

	    Raises:
	        ValueError: if *task_name* is not registered; the message lists the
	            names that are.
	    """
	    scenario_cls = TASK_REGISTRY.get(task_name)
	    if scenario_cls is not None:
	        return scenario_cls()

	    registered = list(TASK_REGISTRY)
	    raise ValueError(f"Unknown task: {task_name}. Available: {registered}")


	# ──────────────────────────────────────────────────────────────────────
	# P1-only grader (legacy)
	# ──────────────────────────────────────────────────────────────────────

	def grade_trajectory(task_name: str, trajectory: List[StepRecord]) -> float:
	    """Score a P1-only trajectory with the scenario's own `grade` method.

	    The scenario is created via `get_scenario`, so an unknown *task_name*
	    raises `ValueError`. The result is coerced to `float`; it is expected
	    to lie in [0.01, 0.99], but that range is not enforced here.
	    """
	    raw_score = get_scenario(task_name).grade(trajectory)
	    return float(raw_score)


	# ──────────────────────────────────────────────────────────────────────
	# Unified grader
	# ──────────────────────────────────────────────────────────────────────

	# Component weights for `grade_trajectory_unified`.
	# Only four of the five apply to any one scenario: W_NO_CHANGE and
	# W_P2_EFFICIENCY share a slot, so all five together sum to 1.25, not 1.0.
	W_P1_RCA = 0.25
	W_P1_EFFICIENCY = 0.15
	W_PATCH_QUALITY = 0.35
	W_NO_CHANGE = 0.25
	# For is_valid_issue scenarios the "no_change" slot is reallocated to
	# P2 efficiency, giving 0.25 + 0.15 + 0.35 + 0.25 = 1.0.
	# NOTE(review): for no-change scenarios `grade_trajectory_unified` multiplies
	# W_PATCH_QUALITY by a hard-coded 0.0 and does not add P2 efficiency, so the
	# weights that can contribute there add up to 0.65 — confirm this is intended.
	W_P2_EFFICIENCY = 0.25


	def grade_trajectory_unified(
	    task_name: str,
	    p1_trajectory: List[StepRecord],
	    p2_trajectory: List[StepRecord],
	    declared_patch: Optional[str],
	    declared_no_change: bool,
	    p1_belief_history: Optional[List[dict]] = None,
	) -> Dict[str, float]:
	    """
	    Score a combined Phase 1 + Phase 2 trajectory.

	    Args:
	        task_name: registry key of the scenario to grade against.
	        p1_trajectory: step records passed to the scenario's P1 graders.
	        p2_trajectory: step records; only those with `phase == 2` are
	            counted for P2 efficiency.
	        declared_patch: the submitted patch; `None` is graded as "".
	        declared_no_change: whether the agent declared that no change is
	            needed.
	        p1_belief_history: accepted for interface compatibility; not read.

	    Returns:
	        A dict with keys "final", "p1_rca", "p1_efficiency",
	        "patch_quality", "no_change_detection" and "p2_efficiency", each
	        rounded to 4 decimals. The component graders are expected to
	        return values in [0, 1]; nothing here clamps them.

	    Weighting:
	        * no code context: final = 0.5 * RCA + 0.5 * P1 efficiency.
	        * valid issue: RCA, P1 efficiency, patch quality, P2 efficiency.
	        * no-change scenario: RCA, P1 efficiency, no-change detection.

	    NOTE(review): in the no-change case patch quality is fixed at 0.0 and
	    P2 efficiency is reported but not added, so "final" cannot exceed
	    0.65 when components are at most 1 — confirm this is intended.
	    """
	    scenario = get_scenario(task_name)
	    code_ctx = scenario.code_context

	    # Phase 1 is graded for every scenario.
	    rca_score = scenario.grade_p1_rca(p1_trajectory)
	    p1_eff_score = scenario.grade_p1_efficiency(p1_trajectory)

	    if code_ctx is None:
	        # P1-only scenario: the two P1 components split the whole score.
	        p1_only_total = rca_score * 0.5 + p1_eff_score * 0.5
	        return {
	            "final": round(p1_only_total, 4),
	            "p1_rca": round(rca_score, 4),
	            "p1_efficiency": round(p1_eff_score, 4),
	            "patch_quality": 0.0,
	            "no_change_detection": 0.0,
	            "p2_efficiency": 0.0,
	        }

	    needs_patch = code_ctx.is_valid_issue
	    if needs_patch:
	        patch_score = grade_patch_quality(declared_patch or "", code_ctx)
	        no_change_score = 0.0
	    else:
	        # Declaring "no change" is the right answer here; a patch earns
	        # nothing.
	        patch_score = 0.0
	        no_change_score = grade_no_change(declared_no_change, code_ctx)

	    # P2 efficiency is computed the same way for both kinds of scenario.
	    phase2_step_count = len([rec for rec in p2_trajectory if rec.phase == 2])
	    p2_eff_score = grade_p2_efficiency(
	        p2_steps=phase2_step_count,
	        expected_steps=code_ctx.expected_p2_steps,
	    )

	    total = (
	        W_P1_RCA * rca_score
	        + W_P1_EFFICIENCY * p1_eff_score
	        + W_PATCH_QUALITY * patch_score
	        + W_NO_CHANGE * no_change_score
	    )
	    if needs_patch:
	        # Swap the unused no-change slot for P2 efficiency so the applied
	        # weights sum to 1.0.
	        total += W_P2_EFFICIENCY * p2_eff_score - W_NO_CHANGE * no_change_score

	    return {
	        "final": round(total, 4),
	        "p1_rca": round(rca_score, 4),
	        "p1_efficiency": round(p1_eff_score, 4),
	        "patch_quality": round(patch_score, 4),
	        "no_change_detection": round(no_change_score, 4),
	        "p2_efficiency": round(p2_eff_score, 4),
	    }


	# ──────────────────────────────────────────────────────────────────────
	# Counterfactual r_cross
	# ──────────────────────────────────────────────────────────────────────

	def compute_r_cross(
	    task_name: str,
	    declared_patch: Optional[str],
	    declared_no_change: bool,
	    p2_trajectory: List[StepRecord],
	) -> float:
	    """
	    Counterfactual cross-phase reward.

	        r_cross = max(0, r_code(τ_2 | context(τ_1)) - r_code(τ_2 | context(∅)))

	    The with-context term is the patch-quality score for valid-issue
	    scenarios and the no-change score otherwise. The null-context baseline
	    is read from `CodeContext.null_context_p2_score` (filled in by
	    `training/run_pool_b_baseline.py`). The difference is clamped to >= 0
	    so Phase 1 is never punished for inherently hard bugs that no context
	    could have helped.

	    Args:
	        task_name: registry key of the scenario.
	        declared_patch: the submitted patch; `None` is graded as "".
	        declared_no_change: whether the agent declared that no change is
	            needed.
	        p2_trajectory: accepted for interface compatibility; not read.

	    Returns:
	        0.0 for scenarios without a code context, otherwise the clamped
	        difference described above.
	    """
	    scenario = get_scenario(task_name)
	    ctx = scenario.code_context
	    if ctx is None:
	        # P1-only scenario: there is no Phase 2 outcome to compare.
	        return 0.0

	    if ctx.is_valid_issue:
	        with_ctx = grade_patch_quality(declared_patch or "", ctx)
	    else:
	        with_ctx = grade_no_change(declared_no_change, ctx)

	    # NOTE(review): assumes the baseline has been populated with a number —
	    # confirm against the CodeContext default.
	    return max(0.0, with_ctx - ctx.null_context_p2_score)