Spaces:
Sleeping
Sleeping
Commit ·
3be82d3
1
Parent(s): 11cd1de
feat(discovery): discovery_turn/identified/efficiency metric (additive)
Browse files
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
proteus/game/metrics/metrics.py
CHANGED
|
@@ -145,9 +145,41 @@ def compute_metrics(
|
|
| 145 |
"near_capture_count": near_capture_count,
|
| 146 |
}
|
| 147 |
base.update(_persona_metrics(turns))
|
|
|
|
| 148 |
return base
|
| 149 |
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
def _persona_metrics(turns: list[TurnTrace]) -> dict[str, float]:
|
| 152 |
"""Persona-maintenance metrics (spec §6.3), only when persona fields exist.
|
| 153 |
|
|
|
|
| 145 |
"near_capture_count": near_capture_count,
|
| 146 |
}
|
| 147 |
base.update(_persona_metrics(turns))
|
| 148 |
+
base.update(_discovery_metrics(turns))
|
| 149 |
return base
|
| 150 |
|
| 151 |
|
| 152 |
+
def _discovery_metrics(turns: list[TurnTrace]) -> dict[str, float]:
    """Compute find-your-body discovery metrics (errand_runner).

    Only turns whose ``self_correct`` is not ``None`` (i.e. turns carrying a
    SELF report) are scored. When no such turn exists, ``{}`` is returned so
    the metric-key set is unchanged for non-discovery runs.

    Returned keys:

    - ``discovery_turn``: the ``turn_idx`` of the first scored turn from which
      the self-belief is correct AND stays correct for every later scored
      turn (0.0 = never). ``turn_idx`` is expected to be 1-based: an index of
      0 cannot be distinguished from "never".
    - ``discovery_identified``: 100.0 if identified (``discovery_turn > 0``)
      else 0.0.
    - ``discovery_efficiency``: ``(1 - (discovery_turn - 1) / n) * 100`` when
      identified (earlier = higher), else 0.0; ``n`` = ``len(turns)``, counting
      scored and unscored turns alike.
    """
    scored = [t for t in turns if t.self_correct is not None]
    if not scored:
        return {}

    # The stable discovery point is the start of the trailing run of correct
    # reports. Walking backwards finds it in one pass instead of re-checking
    # the whole suffix for every candidate turn.
    discovery_turn = 0.0
    for t in reversed(scored):
        if not t.self_correct:
            break
        discovery_turn = float(t.turn_idx)

    identified = discovery_turn > 0
    # ``scored`` is a non-empty subset of ``turns``, so ``n`` is at least 1
    # here and the division below cannot hit zero.
    n = len(turns)
    efficiency = (1.0 - (discovery_turn - 1) / n) * 100.0 if identified else 0.0
    return {
        "discovery_turn": discovery_turn,
        "discovery_identified": 100.0 if identified else 0.0,
        "discovery_efficiency": efficiency,
    }
|
| 181 |
+
|
| 182 |
+
|
| 183 |
def _persona_metrics(turns: list[TurnTrace]) -> dict[str, float]:
|
| 184 |
"""Persona-maintenance metrics (spec §6.3), only when persona fields exist.
|
| 185 |
|
tests/runtime/test_discovery_metrics.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Discovery metric: efficiency of find-your-body self-identification."""
|
| 2 |
+
from proteus.game.runtime.trace import TurnTrace
|
| 3 |
+
from proteus.game.metrics.metrics import compute_metrics
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def _turn(idx, self_correct):
    """Build a minimal ``TurnTrace`` for turn ``idx`` carrying a SELF report.

    ``self_belief`` is set to 0 when ``self_correct`` is truthy and 1
    otherwise; every other field is a fixed placeholder.
    """
    origin = (0, 0)
    belief = 0 if self_correct else 1
    return TurnTrace(
        turn_idx=idx,
        observation="",
        action="stay",
        motive_action="stay",
        habit_action="stay",
        is_diagnostic=False,
        was_congruent=True,
        reward=0.0,
        focal_pos=origin,
        predator_pos=origin,
        self_belief=belief,
        self_correct=self_correct,
    )
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def test_no_discovery_keys_when_no_self_reports():
    """A run without SELF reports must not gain any discovery metric key."""
    turns = [TurnTrace(turn_idx=1, observation="", action="stay", motive_action="stay",
                       habit_action="stay", is_diagnostic=False, was_congruent=True,
                       reward=0.0, focal_pos=(0, 0), predator_pos=(0, 0))]
    m = compute_metrics(turns, played_turns=1, play_turns=1, outcome="survived")
    # additive: every discovery key is absent for non-discovery runs
    for key in ("discovery_turn", "discovery_identified", "discovery_efficiency"):
        assert key not in m
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def test_discovery_turn_is_first_stable_correct():
    """Discovery is reported at the first turn that starts a stable correct run."""
    # wrong, wrong, correct, correct, correct -> identified at turn 3
    reports = [False, False, True, True, True]
    turns = [_turn(turn_no, ok) for turn_no, ok in enumerate(reports, start=1)]
    metrics = compute_metrics(turns, played_turns=5, play_turns=5, outcome="survived")
    assert metrics["discovery_turn"] == 3.0
    assert metrics["discovery_identified"] == 100.0
    # efficiency = (1 - (3-1)/5) * 100 = 60
    assert metrics["discovery_efficiency"] == 60.0
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def test_flip_flop_not_counted_until_stable():
    """An early correct report that is later contradicted does not count."""
    # correct then wrong then correct-stable -> first STABLE correct is turn 3
    reports = [True, False, True]
    turns = [_turn(turn_no, ok) for turn_no, ok in enumerate(reports, start=1)]
    metrics = compute_metrics(turns, played_turns=3, play_turns=3, outcome="survived")
    assert metrics["discovery_turn"] == 3.0
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def test_never_identified():
    """When every SELF report is wrong, all discovery metrics are zero."""
    turns = [_turn(turn_no, False) for turn_no in (1, 2)]
    metrics = compute_metrics(turns, played_turns=2, play_turns=2, outcome="survived")
    for key in ("discovery_turn", "discovery_identified", "discovery_efficiency"):
        assert metrics[key] == 0.0
|