eastbrick Copilot committed on
Commit
140d024
·
1 Parent(s): abb0dea

Unify score normalization and add validator parity checks

Browse files

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

.github/workflows/validator-parity.yml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Validator Parity
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ validate:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - name: Checkout
13
+ uses: actions/checkout@v4
14
+
15
+ - name: Setup Python
16
+ uses: actions/setup-python@v5
17
+ with:
18
+ python-version: "3.11"
19
+
20
+ - name: Install dependencies
21
+ run: |
22
+ python -m pip install --upgrade pip
23
+ pip install -e ".[dev,baseline]"
24
+
25
+ - name: OpenEnv validate
26
+ run: openenv validate
27
+
28
+ - name: Run tests
29
+ run: pytest -q
30
+
31
+ - name: Validator parity checks
32
+ run: python3 scripts/validator_parity_check.py
README.md CHANGED
@@ -110,7 +110,7 @@ curl -X POST http://localhost:7860/baseline
110
  | `rollout_phase` | str | precheck → canary → promoted \| rolled_back |
111
  | `time_remaining` | int | Steps remaining before timeout |
112
  | `cumulative_reward` | float | Running reward total |
113
- | `final_score` | float\|null | Grader score 0.0–1.0 (set on terminal step) |
114
 
115
  ## Grading Formula
116
 
@@ -122,7 +122,7 @@ score = 0.35 * evidence_coverage
122
  - 0.30 * forbidden_penalty
123
  ```
124
 
125
- Scores clamped to [0.0, 1.0]. Fully deterministic — no LLM judge.
126
 
127
  - **evidence_coverage**: fraction of required evidence sources the agent inspected
128
  - **risk_signal_discovery**: fraction of required risk signals the environment emitted during the episode (objective — measures what the agent actually observed, not what strings it typed)
@@ -152,6 +152,16 @@ static diff/test inspection cannot reveal.
152
 
153
  Heuristic baseline runs via `curl -X POST http://localhost:7860/baseline` — no LLM required.
154
 
 
 
 
 
 
 
 
 
 
 
155
  ## Rollout State Machine
156
 
157
  ```
 
110
  | `rollout_phase` | str | precheck → canary → promoted \| rolled_back |
111
  | `time_remaining` | int | Steps remaining before timeout |
112
  | `cumulative_reward` | float | Running reward total |
113
+ | `final_score` | float\|null | Grader score strictly between 0 and 1 (set on terminal step) |
114
 
115
  ## Grading Formula
116
 
 
122
  - 0.30 * forbidden_penalty
123
  ```
124
 
125
+ Scores normalized to strict bounds (0, 1), i.e. [0.001, 0.999]. Fully deterministic — no LLM judge.
126
 
127
  - **evidence_coverage**: fraction of required evidence sources the agent inspected
128
  - **risk_signal_discovery**: fraction of required risk signals the environment emitted during the episode (objective — measures what the agent actually observed, not what strings it typed)
 
152
 
153
  Heuristic baseline runs via `curl -X POST http://localhost:7860/baseline` — no LLM required.
154
 
155
+ ## Validator Parity Checks
156
+
157
+ ```bash
158
+ openenv validate
159
+ python3 scripts/validator_parity_check.py
160
+ pytest -q
161
+ ```
162
+
163
+ CI runs the same checks in `.github/workflows/validator-parity.yml` on every push/PR.
164
+
165
  ## Rollout State Machine
166
 
167
  ```
inference.py CHANGED
@@ -17,10 +17,11 @@ import json
17
  import os
18
  import textwrap
19
  import time
20
- from typing import List, Optional
21
 
22
  import requests
23
  from openai import OpenAI
 
24
 
25
  # ── Config ──────────────────────────────────────────────────────────────────────
26
  # Validator injects these - no fallbacks allowed
@@ -49,9 +50,29 @@ def log_end(success: bool, steps: int, score: float, rewards: List[float]):
49
  print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
50
 
51
 
52
- def normalize_score(value: float) -> float:
53
- """Force score to validator-safe strict bounds: 0 < score < 1."""
54
- return max(0.001, min(0.999, float(value)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  TASKS = ["easy_001", "easy_002", "medium_001", "medium_002", "hard_001", "hard_002"]
57
  MAX_STEPS = 14
@@ -217,9 +238,11 @@ def run_task(llm: OpenAI, task_id: str) -> dict:
217
  log_start(task_id, MODEL_NAME)
218
 
219
  rewards: List[float] = []
 
220
  step = 0
 
221
  success = False
222
- score = 0.0
223
  env = SimpleEnvClient(base_url=ENV_URL)
224
 
225
  try:
@@ -308,14 +331,18 @@ def run_task(llm: OpenAI, task_id: str) -> dict:
308
  print(f"[DEBUG] Task {task_id} failed with error: {e}", flush=True)
309
  success = False
310
  score = 0.001
311
- finally:
312
- log_end(success, step, score, rewards)
313
 
314
- return {
315
- "task_id": task_id,
316
- "final_score": score,
317
- "steps_taken": step,
318
- }
 
 
 
 
 
319
 
320
 
321
  # ── Entry point ──────────────────────────────────────────────────────────────────
@@ -328,8 +355,10 @@ def main():
328
  total = 0.0
329
  for r in results:
330
  total += r["final_score"]
331
- print(f" {r['task_id']:15s} score={r['final_score']:.3f} steps={r['steps_taken']}")
332
- print(f" {'AVERAGE':15s} score={total / len(results):.3f}")
 
 
333
  return results
334
  except Exception as e:
335
  print(f"[ERROR] Fatal error in main: {e}", flush=True)
 
17
  import os
18
  import textwrap
19
  import time
20
+ from typing import List, Optional, TypedDict
21
 
22
  import requests
23
  from openai import OpenAI
24
+ from releaseops_env.scoring import format_score, normalize_score
25
 
26
  # ── Config ──────────────────────────────────────────────────────────────────────
27
  # Validator injects these - no fallbacks allowed
 
50
  print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
51
 
52
 
53
+ class TaskResult(TypedDict):
54
+ task_id: str
55
+ final_score: float
56
+ steps_taken: int
57
+ done: bool
58
+ errors: List[str]
59
+
60
+
61
+ def make_task_result(
62
+ task_id: str, final_score: float, steps_taken: int, done: bool, errors: List[str]
63
+ ) -> TaskResult:
64
+ return {
65
+ "task_id": task_id,
66
+ "final_score": normalize_score(final_score),
67
+ "steps_taken": int(steps_taken),
68
+ "done": bool(done),
69
+ "errors": errors,
70
+ }
71
+
72
+
73
+ def emit_task_result(result: TaskResult) -> None:
74
+ """Emit machine-parseable per-task result JSON."""
75
+ print(json.dumps({"type": "task_result", **result}, sort_keys=True), flush=True)
76
 
77
  TASKS = ["easy_001", "easy_002", "medium_001", "medium_002", "hard_001", "hard_002"]
78
  MAX_STEPS = 14
 
238
  log_start(task_id, MODEL_NAME)
239
 
240
  rewards: List[float] = []
241
+ errors: List[str] = []
242
  step = 0
243
+ done = False
244
  success = False
245
+ score = 0.001
246
  env = SimpleEnvClient(base_url=ENV_URL)
247
 
248
  try:
 
331
  print(f"[DEBUG] Task {task_id} failed with error: {e}", flush=True)
332
  success = False
333
  score = 0.001
334
+ errors.append(str(e))
 
335
 
336
+ log_end(success, step, score, rewards)
337
+ task_result = make_task_result(
338
+ task_id=task_id,
339
+ final_score=score,
340
+ steps_taken=step,
341
+ done=done,
342
+ errors=errors,
343
+ )
344
+ emit_task_result(task_result)
345
+ return task_result
346
 
347
 
348
  # ── Entry point ──────────────────────────────────────────────────────────────────
 
355
  total = 0.0
356
  for r in results:
357
  total += r["final_score"]
358
+ print(
359
+ f" {r['task_id']:15s} score={format_score(r['final_score'])} steps={r['steps_taken']}"
360
+ )
361
+ print(f" {'AVERAGE':15s} score={format_score(total / len(results))}")
362
  return results
363
  except Exception as e:
364
  print(f"[ERROR] Fatal error in main: {e}", flush=True)
releaseops_env/__init__.py CHANGED
@@ -7,6 +7,7 @@ from releaseops_env.models import (
7
  RiskSignal,
8
  ToolResult,
9
  )
 
10
 
11
 
12
  # Client import is deferred to avoid circular imports and to allow
@@ -26,4 +27,7 @@ __all__ = [
26
  "ReleaseState",
27
  "RiskSignal",
28
  "ToolResult",
 
 
 
29
  ]
 
7
  RiskSignal,
8
  ToolResult,
9
  )
10
+ from releaseops_env.scoring import format_score, is_strict_score, normalize_score
11
 
12
 
13
  # Client import is deferred to avoid circular imports and to allow
 
27
  "ReleaseState",
28
  "RiskSignal",
29
  "ToolResult",
30
+ "normalize_score",
31
+ "format_score",
32
+ "is_strict_score",
33
  ]
releaseops_env/scoring.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared score utilities for strict validator-compatible score handling."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+ from typing import Any
7
+
8
+ STRICT_SCORE_MIN = 0.001
9
+ STRICT_SCORE_MAX = 0.999
10
+
11
+
12
+ def normalize_score(value: Any) -> float:
13
+ """Return a finite score guaranteed to satisfy 0 < score < 1."""
14
+ try:
15
+ score = float(value)
16
+ except (TypeError, ValueError):
17
+ return STRICT_SCORE_MIN
18
+
19
+ if math.isnan(score):
20
+ return STRICT_SCORE_MIN
21
+ if score == math.inf:
22
+ return STRICT_SCORE_MAX
23
+ if score == -math.inf:
24
+ return STRICT_SCORE_MIN
25
+
26
+ return max(STRICT_SCORE_MIN, min(STRICT_SCORE_MAX, score))
27
+
28
+
29
+ def format_score(value: Any, decimals: int = 3) -> str:
30
+ """Format a score after strict normalization."""
31
+ return f"{normalize_score(value):.{decimals}f}"
32
+
33
+
34
+ def is_strict_score(value: Any) -> bool:
35
+ """Return True only when value is finite and strictly between 0 and 1."""
36
+ try:
37
+ score = float(value)
38
+ except (TypeError, ValueError):
39
+ return False
40
+ return math.isfinite(score) and 0.0 < score < 1.0
scripts/validator_parity_check.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Validator-parity checks for score bounds and output contract."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import json
7
+ from pathlib import Path
8
+
9
+ from releaseops_env.models import ReleaseAction
10
+ from releaseops_env.scoring import is_strict_score
11
+ from server.releaseops_environment import ReleaseOpsEnvironment
12
+
13
+ TASKS_DIR = Path(__file__).resolve().parents[1] / "tasks"
14
+
15
+
16
+ def run_reference_episode(task_id: str) -> float:
17
+ env = ReleaseOpsEnvironment()
18
+ obs = env.reset(task_id=task_id)
19
+
20
+ with open(TASKS_DIR / task_id / "ground_truth.json") as f:
21
+ gt = json.load(f)
22
+
23
+ evidence_actions = [
24
+ ReleaseAction(action_type="inspect_change", section="diff"),
25
+ ReleaseAction(action_type="inspect_change", section="tests"),
26
+ ReleaseAction(action_type="inspect_change", section="approvals"),
27
+ ReleaseAction(action_type="inspect_dependencies"),
28
+ ReleaseAction(action_type="search_incidents", keywords=["retry", "timeout", "latency"]),
29
+ ReleaseAction(action_type="check_policy"),
30
+ ]
31
+
32
+ for action in evidence_actions:
33
+ obs = env.step(action)
34
+ if obs.done:
35
+ break
36
+
37
+ if not obs.done:
38
+ obs = env.step(
39
+ ReleaseAction(
40
+ action_type="submit_decision",
41
+ final_decision=gt.get("optimal_decision", "block"),
42
+ reason_codes=gt.get("required_reason_codes", []),
43
+ )
44
+ )
45
+
46
+ score = obs.final_score
47
+ if score is None or not is_strict_score(score):
48
+ raise SystemExit(f"[FAIL] {task_id}: out-of-range final_score={score}")
49
+ print(f"[OK] {task_id}: final_score={score:.3f}")
50
+ return score
51
+
52
+
53
+ def main() -> None:
54
+ task_ids = sorted(p.name for p in TASKS_DIR.iterdir() if p.is_dir())
55
+ if not task_ids:
56
+ raise SystemExit("[FAIL] No tasks found")
57
+
58
+ scores = [run_reference_episode(task_id) for task_id in task_ids]
59
+ avg = sum(scores) / len(scores)
60
+ if not is_strict_score(avg):
61
+ raise SystemExit(f"[FAIL] Average score out-of-range: {avg}")
62
+ print(f"[OK] average_score={avg:.3f}")
63
+ print("[OK] validator parity checks passed")
64
+
65
+
66
+ if __name__ == "__main__":
67
+ main()
server/releaseops_environment.py CHANGED
@@ -16,6 +16,7 @@ from releaseops_env.models import (
16
  RiskSignal,
17
  ToolResult,
18
  )
 
19
 
20
  TASKS_DIR = Path(__file__).parent.parent / "tasks"
21
  INCIDENTS_DB = Path(__file__).parent.parent / "data" / "incidents.db"
@@ -880,9 +881,7 @@ class ReleaseOpsEnvironment(Environment):
880
  + 0.30 * decision_score
881
  + 0.10 * efficiency
882
  )
883
- score = max(0.0, min(1.0, raw_score - forbidden_penalty))
884
- # Keep output strictly inside (0, 1), even after downstream formatting/rounding.
885
- score = max(0.001, min(0.999, score))
886
 
887
  return {
888
  "score": round(score, 3),
 
16
  RiskSignal,
17
  ToolResult,
18
  )
19
+ from releaseops_env.scoring import normalize_score
20
 
21
  TASKS_DIR = Path(__file__).parent.parent / "tasks"
22
  INCIDENTS_DB = Path(__file__).parent.parent / "data" / "incidents.db"
 
881
  + 0.30 * decision_score
882
  + 0.10 * efficiency
883
  )
884
+ score = normalize_score(raw_score - forbidden_penalty)
 
 
885
 
886
  return {
887
  "score": round(score, 3),
server/rubrics.py CHANGED
@@ -16,6 +16,7 @@ from __future__ import annotations
16
 
17
  from dataclasses import dataclass
18
  from typing import Protocol
 
19
 
20
 
21
  # ── Data types ────────────────────────────────────────────────────────────────
@@ -230,13 +231,7 @@ class ReleaseOpsRubric:
230
  forbidden_penalty = 0.3 if took_forbidden else 0.0
231
 
232
  raw = sum(r.score * r.weight for r in results)
233
- final_score = max(0.0, min(1.0, raw - forbidden_penalty))
234
-
235
- # Clamp to strictly within (0, 1) — validator requires 0 < score < 1
236
- if final_score <= 0.0:
237
- final_score = 0.001
238
- elif final_score >= 1.0:
239
- final_score = 0.999
240
 
241
  return {
242
  "score": round(final_score, 3),
 
16
 
17
  from dataclasses import dataclass
18
  from typing import Protocol
19
+ from releaseops_env.scoring import normalize_score
20
 
21
 
22
  # ── Data types ────────────────────────────────────────────────────────────────
 
231
  forbidden_penalty = 0.3 if took_forbidden else 0.0
232
 
233
  raw = sum(r.score * r.weight for r in results)
234
+ final_score = normalize_score(raw - forbidden_penalty)
 
 
 
 
 
 
235
 
236
  return {
237
  "score": round(final_score, 3),
tests/test_inference_output.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for inference output contract."""
2
+
3
+ import json
4
+
5
+ from inference import emit_task_result, make_task_result
6
+
7
+
8
+ def test_make_task_result_schema_and_bounds():
9
+ result = make_task_result(
10
+ task_id="easy_001",
11
+ final_score=1.5,
12
+ steps_taken=7,
13
+ done=True,
14
+ errors=[],
15
+ )
16
+ assert set(result.keys()) == {"task_id", "final_score", "steps_taken", "done", "errors"}
17
+ assert result["task_id"] == "easy_001"
18
+ assert result["steps_taken"] == 7
19
+ assert result["done"] is True
20
+ assert result["errors"] == []
21
+ assert 0.0 < result["final_score"] < 1.0
22
+
23
+
24
+ def test_emit_task_result_is_json_line(capsys):
25
+ result = make_task_result(
26
+ task_id="easy_002",
27
+ final_score=0.0,
28
+ steps_taken=3,
29
+ done=False,
30
+ errors=["sample error"],
31
+ )
32
+ emit_task_result(result)
33
+ line = capsys.readouterr().out.strip()
34
+ payload = json.loads(line)
35
+ assert payload["type"] == "task_result"
36
+ assert payload["task_id"] == "easy_002"
37
+ assert payload["errors"] == ["sample error"]
38
+ assert 0.0 < payload["final_score"] < 1.0
tests/test_scoring.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for shared strict score utilities."""
2
+
3
+ from releaseops_env.scoring import format_score, is_strict_score, normalize_score
4
+
5
+
6
+ def test_normalize_score_clamps_boundaries():
7
+ assert normalize_score(0.0) == 0.001
8
+ assert normalize_score(1.0) == 0.999
9
+ assert normalize_score(-5.0) == 0.001
10
+ assert normalize_score(5.0) == 0.999
11
+
12
+
13
+ def test_normalize_score_handles_non_finite_values():
14
+ assert normalize_score(float("nan")) == 0.001
15
+ assert normalize_score(float("-inf")) == 0.001
16
+ assert normalize_score(float("inf")) == 0.999
17
+
18
+
19
+ def test_is_strict_score():
20
+ assert is_strict_score(0.001) is True
21
+ assert is_strict_score(0.999) is True
22
+ assert is_strict_score(0.0) is False
23
+ assert is_strict_score(1.0) is False
24
+
25
+
26
+ def test_format_score_uses_normalized_value():
27
+ assert format_score(1.0) == "0.999"
28
+ assert format_score(0.0) == "0.001"