Spaces:

eastbrick
/

releaseops-env

Sleeping

App Files Files Community

eastbrick Copilot committed on Apr 10

Commit

140d024

1 Parent(s): abb0dea

Unify score normalization and add validator parity checks

Browse files

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Files changed (10) hide show

.github/workflows/validator-parity.yml +32 -0
README.md +12 -2
inference.py +43 -14
releaseops_env/__init__.py +4 -0
releaseops_env/scoring.py +40 -0
scripts/validator_parity_check.py +67 -0
server/releaseops_environment.py +2 -3
server/rubrics.py +2 -7
tests/test_inference_output.py +38 -0
tests/test_scoring.py +28 -0

.github/workflows/validator-parity.yml ADDED Viewed

	@@ -0,0 +1,32 @@

+name: Validator Parity
+on:
+  push:
+    branches: [main]
+  pull_request:
+jobs:
+  validate:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev,baseline]"
+      - name: OpenEnv validate
+        run: openenv validate
+      - name: Run tests
+        run: pytest -q
+      - name: Validator parity checks
+        run: python3 scripts/validator_parity_check.py

README.md CHANGED Viewed

@@ -110,7 +110,7 @@ curl -X POST http://localhost:7860/baseline
 | `rollout_phase` | str | precheck → canary → promoted \| rolled_back |
 | `time_remaining` | int | Steps remaining before timeout |
 | `cumulative_reward` | float | Running reward total |
-| `final_score` | float\|null | Grader score 0.0–1.0 (set on terminal step) |
 ## Grading Formula
@@ -122,7 +122,7 @@ score = 0.35 * evidence_coverage
       - 0.30 * forbidden_penalty
 ```
-Scores clamped to [0.0, 1.0]. Fully deterministic — no LLM judge.
 - **evidence_coverage**: fraction of required evidence sources the agent inspected
 - **risk_signal_discovery**: fraction of required risk signals the environment emitted during the episode (objective — measures what the agent actually observed, not what strings it typed)
@@ -152,6 +152,16 @@ static diff/test inspection cannot reveal.
 Heuristic baseline runs via `curl -X POST http://localhost:7860/baseline` — no LLM required.
 ## Rollout State Machine
 ```

 | `rollout_phase` | str | precheck → canary → promoted \| rolled_back |
 | `time_remaining` | int | Steps remaining before timeout |
 | `cumulative_reward` | float | Running reward total |
+| `final_score` | float\|null | Grader score strictly between 0 and 1 (set on terminal step) |
 ## Grading Formula
       - 0.30 * forbidden_penalty
 ```
+Scores are clamped to [0.001, 0.999], so every score lies strictly inside (0, 1). Fully deterministic — no LLM judge.
 - **evidence_coverage**: fraction of required evidence sources the agent inspected
 - **risk_signal_discovery**: fraction of required risk signals the environment emitted during the episode (objective — measures what the agent actually observed, not what strings it typed)
 Heuristic baseline runs via `curl -X POST http://localhost:7860/baseline` — no LLM required.
+## Validator Parity Checks
+```bash
+openenv validate
+python3 scripts/validator_parity_check.py
+pytest -q
+```
+CI runs the same checks via `.github/workflows/validator-parity.yml` on every push to `main` and on every pull request.
 ## Rollout State Machine
 ```

inference.py CHANGED Viewed

@@ -17,10 +17,11 @@ import json
 import os
 import textwrap
 import time
-from typing import List, Optional
 import requests
 from openai import OpenAI
 # ── Config ──────────────────────────────────────────────────────────────────────
 # Validator injects these - no fallbacks allowed
@@ -49,9 +50,29 @@ def log_end(success: bool, steps: int, score: float, rewards: List[float]):
     print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
-def normalize_score(value: float) -> float:
-    """Force score to validator-safe strict bounds: 0 < score < 1."""
-    return max(0.001, min(0.999, float(value)))
 TASKS       = ["easy_001", "easy_002", "medium_001", "medium_002", "hard_001", "hard_002"]
 MAX_STEPS   = 14
@@ -217,9 +238,11 @@ def run_task(llm: OpenAI, task_id: str) -> dict:
     log_start(task_id, MODEL_NAME)
     rewards: List[float] = []
     step = 0
     success = False
-    score = 0.0
     env = SimpleEnvClient(base_url=ENV_URL)
     try:
@@ -308,14 +331,18 @@ def run_task(llm: OpenAI, task_id: str) -> dict:
         print(f"[DEBUG] Task {task_id} failed with error: {e}", flush=True)
         success = False
         score = 0.001
-    finally:
-        log_end(success, step, score, rewards)
-    return {
-        "task_id": task_id,
-        "final_score": score,
-        "steps_taken": step,
-    }
 # ── Entry point ──────────────────────────────────────────────────────────────────
@@ -328,8 +355,10 @@ def main():
         total = 0.0
         for r in results:
             total += r["final_score"]
-            print(f"  {r['task_id']:15s}  score={r['final_score']:.3f}  steps={r['steps_taken']}")
-        print(f"  {'AVERAGE':15s}  score={total / len(results):.3f}")
         return results
     except Exception as e:
         print(f"[ERROR] Fatal error in main: {e}", flush=True)

 import os
 import textwrap
 import time
+from typing import List, Optional, TypedDict
 import requests
 from openai import OpenAI
+from releaseops_env.scoring import format_score, normalize_score
 # ── Config ──────────────────────────────────────────────────────────────────────
 # Validator injects these - no fallbacks allowed
     print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
+class TaskResult(TypedDict):
+    """Per-task result record, as built by ``make_task_result``."""
+    # Identifier of the task this record describes.
+    task_id: str
+    # Score for the task; ``make_task_result`` stores it after ``normalize_score``.
+    final_score: float
+    # Step count reported for the task.
+    steps_taken: int
+    # Completion flag supplied by the caller.
+    done: bool
+    # Error messages collected for the task; empty when nothing was recorded.
+    errors: List[str]
+def make_task_result(
+    task_id: str, final_score: float, steps_taken: int, done: bool, errors: List[str]
+) -> TaskResult:
+    return {
+        "task_id": task_id,
+        "final_score": normalize_score(final_score),
+        "steps_taken": int(steps_taken),
+        "done": bool(done),
+        "errors": errors,
+    }
+def emit_task_result(result: TaskResult) -> None:
+    """Emit machine-parseable per-task result JSON."""
+    print(json.dumps({"type": "task_result", **result}, sort_keys=True), flush=True)
 TASKS       = ["easy_001", "easy_002", "medium_001", "medium_002", "hard_001", "hard_002"]
 MAX_STEPS   = 14
     log_start(task_id, MODEL_NAME)
     rewards: List[float] = []
+    errors: List[str] = []
     step = 0
+    done = False
     success = False
+    score = 0.001
     env = SimpleEnvClient(base_url=ENV_URL)
     try:
         print(f"[DEBUG] Task {task_id} failed with error: {e}", flush=True)
         success = False
         score = 0.001
+        errors.append(str(e))
+    log_end(success, step, score, rewards)
+    task_result = make_task_result(
+        task_id=task_id,
+        final_score=score,
+        steps_taken=step,
+        done=done,
+        errors=errors,
+    )
+    emit_task_result(task_result)
+    return task_result
 # ── Entry point ──────────────────────────────────────────────────────────────────
         total = 0.0
         for r in results:
             total += r["final_score"]
+            print(
+                f"  {r['task_id']:15s}  score={format_score(r['final_score'])}  steps={r['steps_taken']}"
+            )
+        print(f"  {'AVERAGE':15s}  score={format_score(total / len(results))}")
         return results
     except Exception as e:
         print(f"[ERROR] Fatal error in main: {e}", flush=True)

releaseops_env/__init__.py CHANGED Viewed

@@ -7,6 +7,7 @@ from releaseops_env.models import (
     RiskSignal,
     ToolResult,
 )
 # Client import is deferred to avoid circular imports and to allow
@@ -26,4 +27,7 @@ __all__ = [
     "ReleaseState",
     "RiskSignal",
     "ToolResult",
 ]

     RiskSignal,
     ToolResult,
 )
+from releaseops_env.scoring import format_score, is_strict_score, normalize_score
 # Client import is deferred to avoid circular imports and to allow
     "ReleaseState",
     "RiskSignal",
     "ToolResult",
+    "normalize_score",
+    "format_score",
+    "is_strict_score",
 ]

releaseops_env/scoring.py ADDED Viewed

	@@ -0,0 +1,40 @@

+"""Shared score utilities for strict validator-compatible score handling."""
+from __future__ import annotations
+import math
+from typing import Any
+STRICT_SCORE_MIN = 0.001
+STRICT_SCORE_MAX = 0.999
+def normalize_score(value: Any) -> float:
+    """Return a finite score guaranteed to satisfy 0 < score < 1."""
+    try:
+        score = float(value)
+    except (TypeError, ValueError):
+        return STRICT_SCORE_MIN
+    if math.isnan(score):
+        return STRICT_SCORE_MIN
+    if score == math.inf:
+        return STRICT_SCORE_MAX
+    if score == -math.inf:
+        return STRICT_SCORE_MIN
+    return max(STRICT_SCORE_MIN, min(STRICT_SCORE_MAX, score))
+def format_score(value: Any, decimals: int = 3) -> str:
+    """Format a score after strict normalization."""
+    return f"{normalize_score(value):.{decimals}f}"
+def is_strict_score(value: Any) -> bool:
+    """Return True only when value is finite and strictly between 0 and 1."""
+    try:
+        score = float(value)
+    except (TypeError, ValueError):
+        return False
+    return math.isfinite(score) and 0.0 < score < 1.0

scripts/validator_parity_check.py ADDED Viewed

	@@ -0,0 +1,67 @@

+#!/usr/bin/env python3
+"""Validator-parity checks for score bounds and output contract."""
+from __future__ import annotations
+import json
+from pathlib import Path
+from releaseops_env.models import ReleaseAction
+from releaseops_env.scoring import is_strict_score
+from server.releaseops_environment import ReleaseOpsEnvironment
+TASKS_DIR = Path(__file__).resolve().parents[1] / "tasks"
+def run_reference_episode(task_id: str) -> float:
+    env = ReleaseOpsEnvironment()
+    obs = env.reset(task_id=task_id)
+    with open(TASKS_DIR / task_id / "ground_truth.json") as f:
+        gt = json.load(f)
+    evidence_actions = [
+        ReleaseAction(action_type="inspect_change", section="diff"),
+        ReleaseAction(action_type="inspect_change", section="tests"),
+        ReleaseAction(action_type="inspect_change", section="approvals"),
+        ReleaseAction(action_type="inspect_dependencies"),
+        ReleaseAction(action_type="search_incidents", keywords=["retry", "timeout", "latency"]),
+        ReleaseAction(action_type="check_policy"),
+    ]
+    for action in evidence_actions:
+        obs = env.step(action)
+        if obs.done:
+            break
+    if not obs.done:
+        obs = env.step(
+            ReleaseAction(
+                action_type="submit_decision",
+                final_decision=gt.get("optimal_decision", "block"),
+                reason_codes=gt.get("required_reason_codes", []),
+            )
+        )
+    score = obs.final_score
+    if score is None or not is_strict_score(score):
+        raise SystemExit(f"[FAIL] {task_id}: out-of-range final_score={score}")
+    print(f"[OK] {task_id}: final_score={score:.3f}")
+    return score
+def main() -> None:
+    task_ids = sorted(p.name for p in TASKS_DIR.iterdir() if p.is_dir())
+    if not task_ids:
+        raise SystemExit("[FAIL] No tasks found")
+    scores = [run_reference_episode(task_id) for task_id in task_ids]
+    avg = sum(scores) / len(scores)
+    if not is_strict_score(avg):
+        raise SystemExit(f"[FAIL] Average score out-of-range: {avg}")
+    print(f"[OK] average_score={avg:.3f}")
+    print("[OK] validator parity checks passed")
+if __name__ == "__main__":
+    main()

server/releaseops_environment.py CHANGED Viewed

@@ -16,6 +16,7 @@ from releaseops_env.models import (
     RiskSignal,
     ToolResult,
 )
 TASKS_DIR = Path(__file__).parent.parent / "tasks"
 INCIDENTS_DB = Path(__file__).parent.parent / "data" / "incidents.db"
@@ -880,9 +881,7 @@ class ReleaseOpsEnvironment(Environment):
             + 0.30 * decision_score
             + 0.10 * efficiency
         )
-        score = max(0.0, min(1.0, raw_score - forbidden_penalty))
-        # Keep output strictly inside (0, 1), even after downstream formatting/rounding.
-        score = max(0.001, min(0.999, score))
         return {
             "score": round(score, 3),

     RiskSignal,
     ToolResult,
 )
+from releaseops_env.scoring import normalize_score
 TASKS_DIR = Path(__file__).parent.parent / "tasks"
 INCIDENTS_DB = Path(__file__).parent.parent / "data" / "incidents.db"
             + 0.30 * decision_score
             + 0.10 * efficiency
         )
+        score = normalize_score(raw_score - forbidden_penalty)
         return {
             "score": round(score, 3),

server/rubrics.py CHANGED Viewed

@@ -16,6 +16,7 @@ from __future__ import annotations
 from dataclasses import dataclass
 from typing import Protocol
 # ── Data types ────────────────────────────────────────────────────────────────
@@ -230,13 +231,7 @@ class ReleaseOpsRubric:
         forbidden_penalty = 0.3 if took_forbidden else 0.0
         raw = sum(r.score * r.weight for r in results)
-        final_score = max(0.0, min(1.0, raw - forbidden_penalty))
-        # Clamp to strictly within (0, 1) — validator requires 0 < score < 1
-        if final_score <= 0.0:
-            final_score = 0.001
-        elif final_score >= 1.0:
-            final_score = 0.999
         return {
             "score": round(final_score, 3),

 from dataclasses import dataclass
 from typing import Protocol
+from releaseops_env.scoring import normalize_score
 # ── Data types ────────────────────────────────────────────────────────────────
         forbidden_penalty = 0.3 if took_forbidden else 0.0
         raw = sum(r.score * r.weight for r in results)
+        final_score = normalize_score(raw - forbidden_penalty)
         return {
             "score": round(final_score, 3),

tests/test_inference_output.py ADDED Viewed

	@@ -0,0 +1,38 @@

+"""Tests for inference output contract."""
+import json
+from inference import emit_task_result, make_task_result
+def test_make_task_result_schema_and_bounds():
+    result = make_task_result(
+        task_id="easy_001",
+        final_score=1.5,
+        steps_taken=7,
+        done=True,
+        errors=[],
+    )
+    assert set(result.keys()) == {"task_id", "final_score", "steps_taken", "done", "errors"}
+    assert result["task_id"] == "easy_001"
+    assert result["steps_taken"] == 7
+    assert result["done"] is True
+    assert result["errors"] == []
+    assert 0.0 < result["final_score"] < 1.0
+def test_emit_task_result_is_json_line(capsys):
+    result = make_task_result(
+        task_id="easy_002",
+        final_score=0.0,
+        steps_taken=3,
+        done=False,
+        errors=["sample error"],
+    )
+    emit_task_result(result)
+    line = capsys.readouterr().out.strip()
+    payload = json.loads(line)
+    assert payload["type"] == "task_result"
+    assert payload["task_id"] == "easy_002"
+    assert payload["errors"] == ["sample error"]
+    assert 0.0 < payload["final_score"] < 1.0

tests/test_scoring.py ADDED Viewed

	@@ -0,0 +1,28 @@

+"""Tests for shared strict score utilities."""
+from releaseops_env.scoring import format_score, is_strict_score, normalize_score
+def test_normalize_score_clamps_boundaries():
+    assert normalize_score(0.0) == 0.001
+    assert normalize_score(1.0) == 0.999
+    assert normalize_score(-5.0) == 0.001
+    assert normalize_score(5.0) == 0.999
+def test_normalize_score_handles_non_finite_values():
+    assert normalize_score(float("nan")) == 0.001
+    assert normalize_score(float("-inf")) == 0.001
+    assert normalize_score(float("inf")) == 0.999
+def test_is_strict_score():
+    assert is_strict_score(0.001) is True
+    assert is_strict_score(0.999) is True
+    assert is_strict_score(0.0) is False
+    assert is_strict_score(1.0) is False
+def test_format_score_uses_normalized_value():
+    assert format_score(1.0) == "0.999"
+    assert format_score(0.0) == "0.001"