from __future__ import annotations from typing import Any from env.graders.deterministic import DeterministicGrader class HiddenTestRunner: """Evaluates whether a fix generalizes across deterministic CI variants.""" def __init__(self, grader: DeterministicGrader | None = None, pass_threshold: float = 0.65): self.grader = grader or DeterministicGrader() self.pass_threshold = pass_threshold def generate_variants(self, config_text: str) -> list[str]: base = config_text or "" variants: list[str] = [] for replacements in self._variant_replacement_sets(): variant = self._apply_replacements(base, replacements) if variant not in variants: variants.append(variant) return variants def evaluate_fix( self, fixed_config: str, task: dict[str, Any] | None = None, expected_config: str | None = None, metadata: dict[str, Any] | None = None, ) -> float: fixed_config = fixed_config or "" task = task or {} metadata = metadata or {} expected = expected_config or str(task.get("expected_config", "")) if not fixed_config.strip() or not expected.strip(): return 0.0 total = 0 passed = 0 for replacements in self._variant_replacement_sets(): fixed_variant = self._apply_replacements(fixed_config, replacements) expected_variant = self._apply_replacements(expected, replacements) score = self.grader.grade(fixed_variant, expected_variant, metadata) total += 1 if score >= self.pass_threshold: passed += 1 if total == 0: return 0.0 return round(passed / total, 4) def _variant_replacement_sets(self) -> list[tuple[tuple[str, str], ...]]: return [ tuple(), (("ubuntu-latest", "windows-latest"),), (("windows-latest", "ubuntu-latest"),), (("node-version: 16", "node-version: 18"),), (("node-version: \"16\"", "node-version: \"18\""),), (("python-version: \"3.10\"", "python-version: \"3.12\""),), (("NODE_ENV=production", "NODE_ENV=development"),), ] def _apply_replacements(self, text: str, replacements: tuple[tuple[str, str], ...]) -> str: output = text for old, new in replacements: output = output.replace(old, new) return output