Spaces:

irregular6612
/

AgentnessBench

Sleeping

irregular6612 committed on Jun 1

Commit

02f87f7

1 Parent(s): 9248087

feat(cp3): constrain outcome to Literal, default seed, test nested trace roundtrip

Files changed (2) hide show

proteus/runtime/trace.py CHANGED Viewed

@@ -8,6 +8,8 @@ arena measures.
 from __future__ import annotations
 from pydantic import BaseModel, Field
@@ -27,7 +29,9 @@ class TurnTrace(BaseModel):
         was_congruent: Whether ``action == motive_action``.
         reward: Score delta for this turn.
         focal_pos: Focal ``(x, y)`` BEFORE the move.
-        predator_pos: Predator ``(x, y)`` BEFORE the move.
         thinking_tokens: Approximate reasoning-token count, if available.
     """
@@ -66,10 +70,10 @@ class SessionTrace(BaseModel):
     scenario: str
     motive_category: str
-    seed: int | None
     difficulty: str
     model: str
     cut_frames: list[str] = Field(default_factory=list)
     turns: list[TurnTrace] = Field(default_factory=list)
-    outcome: str
     metrics: dict[str, float] = Field(default_factory=dict)

 from __future__ import annotations
+from typing import Literal
 from pydantic import BaseModel, Field
         was_congruent: Whether ``action == motive_action``.
         reward: Score delta for this turn.
         focal_pos: Focal ``(x, y)`` BEFORE the move.
+        predator_pos: Predator ``(x, y)`` BEFORE the move. Both positions
+            serialize to JSON arrays (e.g. ``[3, 3]``) and are coerced back
+            to tuples on load, so raw-JSONL analysis consumers will see arrays.
         thinking_tokens: Approximate reasoning-token count, if available.
     """
     scenario: str
     motive_category: str
+    seed: int | None = None
     difficulty: str
     model: str
     cut_frames: list[str] = Field(default_factory=list)
     turns: list[TurnTrace] = Field(default_factory=list)
+    outcome: Literal["survived", "eliminated"]
     metrics: dict[str, float] = Field(default_factory=dict)

tests/runtime/test_trace.py CHANGED Viewed

@@ -25,6 +25,19 @@ def test_turntrace_roundtrips_json():
 def test_sessiontrace_defaults_and_nesting():
     s = SessionTrace(
         scenario="predator_evade",
         motive_category="survival",
@@ -32,10 +45,11 @@ def test_sessiontrace_defaults_and_nesting():
         difficulty="easy",
         model="fake",
         cut_frames=["....", "..A."],
-        turns=[],
         outcome="survived",
         metrics={"motive_reading_accuracy": 100.0},
     )
     restored = SessionTrace.model_validate_json(s.model_dump_json())
-    assert restored.scenario == "predator_evade"
     assert restored.metrics["motive_reading_accuracy"] == 100.0

 def test_sessiontrace_defaults_and_nesting():
+    turn = TurnTrace(
+        turn_idx=1,
+        observation="grid",
+        reasoning="r",
+        action="up",
+        motive_action="up",
+        habit_action="left",
+        is_diagnostic=True,
+        was_congruent=True,
+        reward=5.0,
+        focal_pos=(3, 3),
+        predator_pos=(5, 3),
+    )
     s = SessionTrace(
         scenario="predator_evade",
         motive_category="survival",
         difficulty="easy",
         model="fake",
         cut_frames=["....", "..A."],
+        turns=[turn],
         outcome="survived",
         metrics={"motive_reading_accuracy": 100.0},
     )
     restored = SessionTrace.model_validate_json(s.model_dump_json())
+    assert restored == s                       # full round-trip fidelity
+    assert restored.turns[0].focal_pos == (3, 3)  # nested tuple coerced back
     assert restored.metrics["motive_reading_accuracy"] == 100.0