AgentnessBench / tests /web /test_memory_modes.py
irregular6612's picture
refactor(scenario): delete predator_evade; template is the canonical scenario
93cd78f
Raw
History Blame Contribute Delete
8.7 kB
"""Web memory-mode selection wired into /session (human) and /spectate (LLM):
default / none / persona[:id] / policy:<id> / generate / latest, plus the raised
action quota."""
from __future__ import annotations
import proteus.game.scenarios # noqa: F401
from proteus.web.local import server
def _reg():
    """Return a new, empty dict; every ``server.handle_request`` call in this
    file passes one as its last argument."""
    fresh = dict()
    return fresh
def test_default_play_quota_is_100():
    """A /session request that omits ``play_turns`` reports a quota of 100."""
    request_body = {"scenario": "template"}
    _, response, _ = server.handle_request("POST", "/session", request_body, _reg())
    state = response["state"]
    assert state["play_turns"] == 100
def test_persona_memory_attached_to_human_session():
    """``memory="persona:risk_averse"`` on /session yields an attached memory
    that names the persona and holds at least one turn."""
    request_body = {
        "scenario": "template",
        "seed": 42,
        "play_turns": 3,
        "memory": "persona:risk_averse",
    }
    _, response, _ = server.handle_request("POST", "/session", request_body, _reg())
    memory_info = response["memory"]
    assert memory_info["attached"] is True
    assert memory_info["persona"] == "risk_averse"
    assert memory_info["turns"] >= 1
def test_none_forces_no_memory_even_when_scenario_has_a_default():
    """``memory="none"`` must switch memory off even though "default" attaches one."""
    # template attaches a persona memory by default, so the "default" request
    # sets the baseline that 'none' has to override.
    shared_fields = {"scenario": "template", "seed": 42, "play_turns": 3}

    _, with_default, _ = server.handle_request(
        "POST", "/session", {**shared_fields, "memory": "default"}, _reg())
    assert with_default["memory"]["attached"] is True  # scenario default = persona

    _, with_none, _ = server.handle_request(
        "POST", "/session", {**shared_fields, "memory": "none"}, _reg())
    assert with_none["memory"]["attached"] is False  # forced off
def test_unknown_persona_is_400():
    """A /session request naming an unknown persona gets a 400 with an ``error`` key."""
    request_body = {"scenario": "template", "memory": "persona:bogus"}
    status, response, _ = server.handle_request(
        "POST", "/session", request_body, _reg())
    assert status == 400
    assert "error" in response
def test_generate_memory_via_fake_model_on_spectate(tmp_path):
    """On /spectate with the ``fake:demo`` model, "generate" attaches a memory
    and a following "latest" request under the same ``memory_root`` attaches one too."""

    def spectate(memory_mode):
        # Both requests are identical apart from the memory mode.
        request_body = {
            "scenario": "template",
            "seed": 42,
            "play_turns": 2,
            "model": "fake:demo",
            "memory": memory_mode,
            "memory_root": str(tmp_path),
        }
        code, response, _ = server.handle_request(
            "POST", "/spectate", request_body, _reg())
        return code, response

    status, generated = spectate("generate")
    assert status == 200
    assert generated["memory"]["attached"] is True
    assert generated["memory"]["turns"] >= 1

    # generate saved a checkpoint under the (tmp) root -> latest can find it.
    status, latest = spectate("latest")
    assert status == 200
    assert latest["memory"]["attached"] is True
def test_generate_without_model_on_human_is_400():
    """"generate" on /session without a model is a 400 whose error mentions
    that it needs a model."""
    request_body = {"scenario": "template", "memory": "generate"}
    status, response, _ = server.handle_request(
        "POST", "/session", request_body, _reg())
    assert status == 400
    assert "needs a model" in response["error"]
def test_response_carries_rendered_memory_block_for_display():
    """The response exposes the rendered memory block, and ``None`` for 'none'."""
    # The web can show the actual memory at the start, so the response includes
    # the rendered block the model is given.
    persona_body = {
        "scenario": "template",
        "seed": 42,
        "play_turns": 3,
        "memory": "persona:risk_averse",
    }
    _, with_persona, _ = server.handle_request(
        "POST", "/session", persona_body, _reg())
    rendered = with_persona["memory"]["block"]
    assert rendered
    assert "MEMORY" in rendered

    # 'none' carries no block.
    none_body = {"scenario": "template", "memory": "none"}
    _, without_memory, _ = server.handle_request(
        "POST", "/session", none_body, _reg())
    assert without_memory["memory"]["block"] is None
def test_memory_payload_includes_replay_frames_for_template():
    """The default template memory ships a non-empty list of replay frames whose
    first frame has the expected keys, a 64x64 grid, and at least one wall cell."""
    request_body = {
        "scenario": "template",
        "seed": 42,
        "play_turns": 3,
        "memory": "default",
    }
    _, response, _ = server.handle_request("POST", "/session", request_body, _reg())

    frames = response["memory"]["frames"]
    assert isinstance(frames, list)
    assert len(frames) >= 1

    first_frame = frames[0]
    # events key also present since Task 5
    assert {"turn_idx", "action", "grid"}.issubset(first_frame)

    # 64x64 field
    assert len(first_frame["grid"]) == 64
    assert len(first_frame["grid"][0]) == 64

    wall_idx = 3
    assert any(wall_idx in row for row in first_frame["grid"])  # walls present
def test_template_legacy_replay_block_sizes_are_3x3_predator_2x2_focal():
    """Regression guard for the legacy single-agent prose replay branch.

    With a prose (non-grid) ``frame_ascii`` the replay must paint the predator
    as a 3x3 block and the focal as a 2x2 block, not the stale 5x5/3x3 sizes
    that pre-dated the template resize.
    """
    from proteus.game.runtime.memory import MemoryCheckpoint, MemoryTurn, memory_frames

    # Positions are well clear of the edges so clipping cannot mask a wrong size.
    predator_anchor = (10, 10)
    focal_anchor = (30, 30)
    predator_idx = 2  # "B" in template legend
    focal_idx = 1  # "A" in template legend

    # A simple prose frame (non-grid) so the legacy paint branch fires.
    prose_frame = (
        "Open field 64x64. You are A (2x2) centered at (31,31). "
        "Predator B (3x3) centered at (11,11). Manhattan distance 20."
    )
    only_turn = MemoryTurn(
        turn_idx=1,
        frame_ascii=prose_frame,
        action="right",
        focal_pos=focal_anchor,
        predator_pos=predator_anchor,
    )
    checkpoint = MemoryCheckpoint(
        model="test", scenario="template", difficulty="easy", seed=1,
        created_at="t", outcome="survived", transparent_prompt="p",
        memory_turns=[only_turn],
    )
    legend = {5: ".", 1: "A", 2: "B", 3: "#", 14: "F"}
    replay = memory_frames(checkpoint, legend=legend, grid_size=(64, 64))
    grid = replay[0]["grid"]

    # Every cell of the 3x3 block starting at the predator anchor is predator.
    px, py = predator_anchor
    for row in range(py, py + 3):
        for col in range(px, px + 3):
            painted = grid[row][col]
            assert painted == predator_idx, (
                f"predator cell ({col},{row}) expected {predator_idx}, "
                f"got {painted}"
            )
    # One step beyond the 3x3 block must NOT be predator colour (catches the
    # old 5x5 bug where cells up to anchor+4 were painted).
    assert grid[py][px + 3] != predator_idx, "predator block wider than 3 (old 5x5 bug)"
    assert grid[py + 3][px] != predator_idx, "predator block taller than 3 (old 5x5 bug)"

    # Every cell of the 2x2 block starting at the focal anchor is focal.
    fx, fy = focal_anchor
    for row in range(fy, fy + 2):
        for col in range(fx, fx + 2):
            painted = grid[row][col]
            assert painted == focal_idx, (
                f"focal cell ({col},{row}) expected {focal_idx}, "
                f"got {painted}"
            )
    # One step beyond the 2x2 block must NOT be focal colour (catches the old
    # 3x3 bug).
    assert grid[fy][fx + 2] != focal_idx, "focal block wider than 2 (old 3x3 bug)"
    assert grid[fy + 2][fx] != focal_idx, "focal block taller than 2 (old 3x3 bug)"
def test_memory_frames_empty_when_no_memory():
    """With ``memory="none"`` nothing is attached and the frame list is empty."""
    request_body = {
        "scenario": "template",
        "seed": 42,
        "play_turns": 3,
        "memory": "none",
    }
    _, response, _ = server.handle_request("POST", "/session", request_body, _reg())
    memory_info = response["memory"]
    assert memory_info["attached"] is False
    assert memory_info["frames"] == []
def test_index_html_has_replay_controls():
    """The index page carries the compact replay control bar and none of the
    removed replay-panel ids."""
    _status, body, _ctype = server.handle_request("GET", "/", None, _reg())
    if isinstance(body, (bytes, bytearray)):
        html = body.decode()
    else:
        html = body

    # Memory replay now shares the single play board (#grid); the separate
    # #memReplayGrid / #memoryPanel region was removed and replaced by a compact
    # control bar with a "start playing" (memReplayDone) handover button.
    for removed_id in ("memReplayGrid", "memoryPanel"):
        assert removed_id not in html

    assert 'id="memReplay"' in html
    for control in ("memReplayDone", "memReplayPrev", "memReplayNext", "memReplayPlay"):
        assert control in html
def test_policy_memory_attached_to_session():
    """``memory="policy:survival_refuge"`` on /session attaches a memory that
    reports its source, has at least one turn, and has non-empty frames."""
    request_body = {
        "scenario": "template",
        "seed": 7,
        "play_turns": 3,
        "memory": "policy:survival_refuge",
    }
    _, response, _ = server.handle_request("POST", "/session", request_body, _reg())
    memory_info = response["memory"]
    assert memory_info["attached"] is True
    assert memory_info["source"] == "policy:survival_refuge"
    assert memory_info["turns"] >= 1
    assert memory_info["frames"]
def test_unknown_policy_is_400():
    """A /session request naming an unknown policy gets a 400 with an ``error`` key."""
    request_body = {"scenario": "template", "memory": "policy:bogus"}
    status, response, _ = server.handle_request(
        "POST", "/session", request_body, _reg())
    assert status == 400
    assert "error" in response
def test_spectate_bad_provider_is_400_not_500():
    """A /spectate request with the ``ollama:x`` model must come back as a 400
    carrying an ``error`` key."""
    # ollama with no key -> ValueError now; must surface as a clean 400, not a 500.
    request_body = {
        "scenario": "template",
        "seed": 7,
        "play_turns": 2,
        "model": "ollama:x",
        "memory": "none",
    }
    status, response, _ = server.handle_request(
        "POST", "/spectate", request_body, _reg())
    assert status == 400
    assert "error" in response