kdemon1011 committed
Commit 15503f9 · verified · 1 Parent(s): 81f5b19

Upload folder using huggingface_hub
.dockerignore ADDED
@@ -0,0 +1,9 @@
+ __pycache__
+ *.pyc
+ .git
+ .env
+ outputs/
+ comparison.md
+ play.html
+ play_server.py
+ *.egg-info
.env.example ADDED
@@ -0,0 +1,29 @@
+ # ── Environment Server Configuration ──
+ OPENENV_PORT=8000
+ MAX_CONCURRENT_ENVS=8
+ ENABLE_WEB_INTERFACE=true
+ RENDER_MODE=svg
+ MAX_BOARD_SIZE=12
+ # VISUAL_MEMORY_SCENARIOS_DIR=  # Optional: override scenario directory path
+
+ # ── LLM Configuration (used by run_eval.py) ──
+ LLM_MODEL=gpt-4o
+ LLM_TEMPERATURE=0.0
+ LLM_MAX_TOKENS=1024
+
+ # ── API Keys ──
+ # Only the key for your chosen --model provider is required.
+
+ # OpenAI (for gpt-4o, gpt-5.4, o3-pro, etc.)
+ OPENAI_API_KEY=
+ OPENAI_API_BASE=https://api.openai.com/v1
+
+ # Anthropic (for claude-sonnet-4-6, claude-opus-4-6, etc.)
+ ANTHROPIC_API_KEY=
+
+ # Google (for gemini-2.5-pro, etc.)
+ GOOGLE_API_KEY=
+
+ # For local models via Ollama — no key needed, just run:
+ #   ollama serve && ollama pull llama3
+ # Then use: --model ollama/llama3
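In practice these values would be loaded via a library like python-dotenv; as a minimal illustration (pure stdlib; `load_env_file` is a hypothetical helper, not part of this repo), parsing `KEY=VALUE` lines of this form amounts to:

```python
import os

def load_env_file(text: str) -> dict:
    """Parse KEY=VALUE lines, skipping blanks and # comments (minimal sketch)."""
    values = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        key, _, value = line.partition("=")
        values[key.strip()] = value.strip()
    return values

sample = """
# ── LLM Configuration ──
LLM_MODEL=gpt-4o
LLM_TEMPERATURE=0.0
OPENAI_API_KEY=
"""
config = load_env_file(sample)
print(config["LLM_MODEL"])   # gpt-4o
os.environ.update(config)    # make the values visible to the process
```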
README.md CHANGED
@@ -7,10 +7,11 @@ sdk: docker
  pinned: false
  license: mit
  app_port: 8000
  tags:
  - openenv
  - rl-environment
- base_path: /web
  ---

  # Visual Memory Gym — *Phantom Grid*
@@ -19,6 +20,26 @@ base_path: /web

  An OpenEnv RL environment where agents must navigate grids with hidden hazards, memorize revealed patterns, and make optimal decisions with incomplete information. The name *Phantom Grid* reflects the core challenge: invisible dangers lurk beneath every cell, and the agent must deduce their locations from indirect signals — like hunting phantoms by their shadows. Designed to stress spatial reasoning, working memory, uncertainty handling, and risk-averse planning — areas where frontier LLMs consistently underperform.

  ## What Is This Gym?

  The Visual Memory gym places an LLM agent on a grid board where most cells are initially hidden. The agent must use MCP tools to reveal cells one at a time, interpret the signals (clues about nearby hazards), flag hazard locations, and submit a solution — all within a limited step budget. Every reveal risks hitting a hazard (which can end the game), so the agent must balance information gathering with caution.
@@ -135,60 +156,233 @@ These look useful but always return errors. Models must learn to avoid them.
  | `peek_hidden_cell` | "View hidden cell without revealing" | Always fails — peeking disabled |
  | `undo_last_action` | "Revert the most recent action" | Always fails — actions are irreversible |

- ## Reward System (3 Layers)

- ### Layer 1 — Environment Step Rewards (built into the gym)
- Per-tool rewards computed inside `memory_environment.py`. Small signals for safe reveals (+0.05), hazard hits (-0.20), correct submissions (+0.50), and distractor use (-0.10).

- ### Layer 2 — Custom Episode Rewards (`rewards/visual_memory_checks.py`)
- Weighted episode-level score computed from the full trajectory:

  | Component | Weight | Description |
  |---|---|---|
- | Final Correctness | 0.35 | F1 score of submitted solution |
- | Safety Score | 0.20 | Fraction of reveals that avoided hazards |
- | Evidence Support | 0.15 | Used recall/inspect before committing |
- | Irreversible Penalty | -0.15 | Deducted for hazard hits |
- | Efficiency | 0.10 | Steps used relative to budget |
- | Unnecessary Guessing | -0.05 | Deducted for trap tool use or repeated reveals |

- ### Layer 3 — OpenEnv Transform Rewards (`rewards/transforms/visual_memory.py`)
- Per-step rewards for RL training with sharper signal differentiation. Safe reveals (+0.15), hazard hits (-0.40), correct flags (+0.20), distractor use (-0.25), correct submission (+1.0).

- ## Running

- ```bash
- # Install for AutoEnv discovery
- pip install -e visual-memory/

- # Build Docker image
- cd visual-memory && docker build -t openenv-visual-memory -f server/Dockerfile .

- # Run container
  docker run -d --name visual-memory -p 8000:8000 openenv-visual-memory

  # Verify
  curl http://localhost:8000/health
- curl http://localhost:8000/metadata
-
- # Evaluate (single model)
- python run_eval.py --gym visual_memory --model gpt-5.4 --save --trajectory

- # Evaluate (parallel, both reward modes)
- python run_eval.py --gym visual_memory \
-     --model gpt-5.4,claude-sonnet-4-6,claude-opus-4-6 \
-     --parallel 3 --reward-mode custom --save --trajectory

- python run_eval.py --gym visual_memory \
-     --model gpt-5.4,claude-sonnet-4-6,claude-opus-4-6 \
      --parallel 3 --reward-mode openenv --save --trajectory

- # Stop
  docker stop visual-memory && docker rm visual-memory
  ```

  ## Configuration (.env)

  | Variable | Default | Description |
  |----------|---------|-------------|
  | `OPENENV_PORT` | `8000` | OpenEnv server port (exposed) |
  pinned: false
  license: mit
  app_port: 8000
+ base_path: /web
  tags:
  - openenv
+ - openenv-0.2.3
  - rl-environment
  ---

  # Visual Memory Gym — *Phantom Grid*

  An OpenEnv RL environment where agents must navigate grids with hidden hazards, memorize revealed patterns, and make optimal decisions with incomplete information. The name *Phantom Grid* reflects the core challenge: invisible dangers lurk beneath every cell, and the agent must deduce their locations from indirect signals — like hunting phantoms by their shadows. Designed to stress spatial reasoning, working memory, uncertainty handling, and risk-averse planning — areas where frontier LLMs consistently underperform.

+ ## Hugging Face Space Deployment
+
+ This Space is built from the OpenEnv environment `visual_memory`.
+
+ - **Space URL**: `https://huggingface.co/spaces/huzzle-labs/visual_memory`
+ - **OpenEnv pinned ref**: `0.2.3`
+ - **Hub tag**: `openenv`
+
+ ### Connecting from Code
+
+ ```python
+ from openenv import AutoEnv
+
+ env = AutoEnv.from_env("visual_memory", base_url="https://huzzle-labs-visual-memory.hf.space")
+ with env:
+     env.reset()
+     tools = env.list_tools()
+     print(f"Available tools: {[t.name for t in tools]}")
+ ```
+
  ## What Is This Gym?

  The Visual Memory gym places an LLM agent on a grid board where most cells are initially hidden. The agent must use MCP tools to reveal cells one at a time, interpret the signals (clues about nearby hazards), flag hazard locations, and submit a solution — all within a limited step budget. Every reveal risks hitting a hazard (which can end the game), so the agent must balance information gathering with caution.

  | `peek_hidden_cell` | "View hidden cell without revealing" | Always fails — peeking disabled |
  | `undo_last_action` | "Revert the most recent action" | Always fails — actions are irreversible |

+ ## Reward System
+
+ This gym ships with **two** reward modes, selectable via `--reward-mode`:
+
+ ### Custom Rewards — Episode-Level (`rewards/checks.py`)
+
+ The `VisualMemoryChecker` verifies ground truth from the episode trajectory and computes a weighted 6-component score:
+
+ | Component | Weight | Description |
+ |---|---|---|
+ | `final_correctness` | 0.35 | Was the submission correct? (F1 for partial) |
+ | `safety_score` | 0.20 | Fraction of reveals that didn't hit hazards |
+ | `evidence_support` | 0.15 | Did the agent gather evidence before submitting? |
+ | `irreversible_penalty` | 0.15 | Hazard hits (0 = no penalty, 2+ = full penalty) |
+ | `efficiency` | 0.10 | Steps used relative to budget |
+ | `unnecessary_guessing` | 0.05 | Trap tool usage + repeated reveals |
+
+ ```python
+ from rewards.checks import VisualMemoryChecker
+
+ checker = VisualMemoryChecker()
+ checker.set_episode(episode)
+ reward = checker.compute_episode_reward()
+ # {'final_correctness': 1.0, 'safety_score': 0.85, ..., 'total': 0.78}
+ ```
+
+ The base `RewardCalculator` (`rewards/base.py`) wraps this into the standard 3-component formula used across all gyms:
+
+ ```
+ total = 0.25 × structural + 0.15 × efficiency + 0.60 × ground_truth + penalty
+ ```
 
191
+ ### OpenEnV Transforms — Per-Step (`rewards/transforms.py`)
 
 
192
 
193
+ The `VisualMemoryStepTransform` provides fine-grained per-step rewards for RL training (GRPO). Each tool call receives a reward based on its outcome:
 
194
 
195
+ | Tool | Success | Failure |
196
+ |---|---|---|
197
+ | `reveal_cell` (safe) | +0.15 | — |
198
+ | `reveal_cell` (hazard) | -0.40 | — |
199
+ | `flag_cell` | +0.20 | -0.10 |
200
+ | `submit_solution` (correct) | +1.0 | -0.50 |
201
+ | `recall_log` | +0.10 | 0.0 |
202
+ | `inspect_region` | +0.08 | -0.10 |
203
+ | `get_board_view` / `get_status` | +0.05 | 0.0 |
204
+ | `move_viewport` | +0.10 | -0.10 |
205
+ | Distractor traps | -0.25 | -0.25 |
206
+
207
+ ```python
208
+ from rewards.transforms import VisualMemoryStepTransform
209
+
210
+ transform = VisualMemoryStepTransform()
211
+ scored_obs = transform(observation)
212
+ print(scored_obs.reward) # e.g., +0.15 for a safe reveal
213
+ ```
214
+
215
+ The `OpenEnvRewardCalculator` (`rewards/base.py`) combines per-step rewards with ground truth into the same weighted formula, using sign-based quality scoring.
216
+
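One plausible reading of "sign-based quality scoring" (an assumption on my part, not the library's exact implementation) is to use the fraction of positive-reward steps as the quality signal:

```python
def sign_based_quality(step_rewards: list) -> float:
    """Fraction of steps with a positive reward (hypothetical sketch)."""
    if not step_rewards:
        return 0.0
    return sum(1 for r in step_rewards if r > 0) / len(step_rewards)

# 3 of 4 steps earned positive rewards
print(sign_based_quality([0.15, -0.40, 0.20, 1.0]))  # 0.75
```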
+ ## Evaluation
+
+ The included `run_eval.py` runs an LLM agent against scenarios and scores results.
+
+ ### Quick Start
+
+ ```bash
+ cd visual-memory
+ pip install -e .
+
+ # Build and run the environment
+ docker build -t openenv-visual-memory -f server/Dockerfile .
  docker run -d --name visual-memory -p 8000:8000 openenv-visual-memory

  # Verify
  curl http://localhost:8000/health

+ # Evaluate (single model, custom rewards)
+ python run_eval.py --model gpt-5.4 --save --trajectory

+ # Evaluate (multiple models, per-step rewards)
+ python run_eval.py --model gpt-5.4,claude-sonnet-4-6,claude-opus-4-6 \
      --parallel 3 --reward-mode openenv --save --trajectory

+ # Evaluate a specific scenario
+ python run_eval.py --model gpt-5.4 --scenario directional_trap_8x8
+
+ # Cleanup
  docker stop visual-memory && docker rm visual-memory
  ```

+ ### Output Paths
+
+ | Output | Path |
+ |---|---|
+ | Results markdown | `outputs/results/<run_id>.md` |
+ | Trajectory JSON | `outputs/trajectories/<run_id>/<model>.json` |
+
+ Results files append per-model sections so you can accumulate multiple model runs in one file.
+
+ ### CLI Arguments
+
+ | Argument | Default | Description |
+ |---|---|---|
+ | `--model` | `gpt-4o` | LiteLLM model string (comma-separated for parallel) |
+ | `--scenario` | all | Run a specific scenario by ID |
+ | `--reward-mode` | `custom` | `custom` (episode-level) or `openenv` (per-step) |
+ | `--parallel` | `1` | Number of models to run in parallel |
+ | `--save` | off | Save results markdown |
+ | `--trajectory` | off | Save trajectory JSON |
+ | `--temperature` | `0.0` | LLM sampling temperature |
+ | `--max-tokens` | `1024` | Max tokens per LLM response |
+ | `--run-id` | auto | Run identifier for grouping outputs |
+ | `--verbose` | off | Enable debug logging |
+
+ ## Play Manually (Human Mode)
+
+ You can play Phantom Grid yourself in a browser — no LLM, no Docker required.
+
+ ### Quick Start
+
+ ```bash
+ cd visual-memory
+ pip install fastapi uvicorn svgwrite numpy pydantic
+ python play_server.py
+ ```
+
+ Then open **http://localhost:8001** in your browser.
+
+ ### How to Play
+
+ 1. **Pick a scenario** from the right panel (e.g. "Directional Trap 8x8")
+ 2. **Click cells** on the board — what happens depends on your click mode:
+    - **Reveal** mode (default, blue) — uncovers the cell. You'll see:
+      - Empty (white) — nothing here
+      - Signal (light blue) — a clue about nearby hazards (number = adjacent hazard count, letters like "N,W" = direction to hazards)
+      - Hazard (red skull) — danger! Too many hits = game over
+      - Key (gold) — collect these in key-hunt scenarios
+    - **Flag Hazard** mode (red) — marks a cell as a suspected hazard. Click a flagged cell again to unflag it.
+ 3. **Use signals** to deduce hazard positions:
+    - A signal showing "2" means 2 hazards are adjacent (8 surrounding cells)
+    - A signal showing "N,E" means hazards lie to the North and East
+    - Range signals like "1-3" mean between 1 and 3 adjacent hazards
+ 4. **Flag all hazards**, then click **SUBMIT SOLUTION** to see your score
+ 5. After game over, click any scenario button to **start a fresh game**
+
+ ### Tips
+
+ - Start by revealing cells in the center — they give the most signal coverage
+ - Use the **Recall Log** button to review all signals you've discovered
+ - In fog-of-war scenarios, use **Move Viewport** to explore — you can only see a small area
+ - Avoid the distractor tools (auto_solve, peek, undo) — they always fail
+ - The play server runs on **port 8001** and is completely separate from the OpenEnv server (port 8000)
+
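The numeric signal described in "How to Play" (count of hazards among the 8 surrounding cells) can be sketched as follows; this is illustrative only, the real logic lives in `server/engine.py`:

```python
def adjacent_hazards(board, row, col):
    """Count hazards ('H') in the 8 cells surrounding (row, col)."""
    count = 0
    for dr in (-1, 0, 1):
        for dc in (-1, 0, 1):
            if dr == 0 and dc == 0:
                continue  # skip the cell itself
            r, c = row + dr, col + dc
            if 0 <= r < len(board) and 0 <= c < len(board[0]) and board[r][c] == "H":
                count += 1
    return count

board = [
    ".H.",
    "...",
    "H..",
]
print(adjacent_hazards(board, 1, 1))  # 2
```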
+ ## Project Structure
+
+ ```
+ visual-memory/
+ ├── __init__.py               # Package exports (env + rewards)
+ ├── client.py                 # OpenEnv client integration
+ ├── models.py                 # Action/Observation data models
+ ├── openenv.yaml              # OpenEnv AutoEnv manifest
+ ├── pyproject.toml            # Dependencies (openenv-core v0.2.3)
+ ├── Dockerfile                # Root Dockerfile for HF Spaces
+ ├── .dockerignore
+ ├── run_eval.py               # LLM evaluation runner
+ ├── play.html                 # Human play mode UI
+ ├── play_server.py            # Human play mode server
+
+ ├── rewards/                  # Reward system (both modes)
+ │   ├── __init__.py
+ │   ├── base.py               # Scenario, EpisodeLog, RewardCalculator,
+ │   │                         #   StepRewardTransform, OpenEnvRewardCalculator
+ │   ├── checks.py             # VisualMemoryChecker (episode-level)
+ │   └── transforms.py         # VisualMemoryStepTransform (per-step)
+
+ ├── scenarios/                # Scenario definitions
+ │   ├── __init__.py
+ │   ├── definitions.py        # 10 Scenario objects (Python)
+ │   └── *.json                # Scenario board configs
+
+ ├── agent/                    # LLM agent runner
+ │   ├── __init__.py
+ │   ├── llm.py                # LiteLLM wrapper
+ │   └── runner.py             # AgentRunner (gym-agnostic)
+
+ ├── server/                   # OpenEnv environment server
+ │   ├── __init__.py
+ │   ├── app.py                # FastAPI + FastMCP server
+ │   ├── memory_environment.py # MCPEnvironment implementation
+ │   ├── engine.py             # Game engine (hidden state)
+ │   ├── renderer.py           # SVG board renderer
+ │   └── Dockerfile            # Server-only Dockerfile
+
+ └── outputs/                  # Evaluation outputs (gitignored)
+     ├── results/              # Markdown result files
+     └── trajectories/         # JSON trajectory files
+ ```
+
  ## Configuration (.env)

+ Copy `.env.example` to `.env` and fill in your API keys:
+
+ ```bash
+ cp .env.example .env
+ # Edit .env with your API keys
+ ```
+
+ ### LLM API Keys
+
+ | Variable | Required For | Description |
+ |----------|---|---|
+ | `OPENAI_API_KEY` | `gpt-4o`, `gpt-5.4`, `o3-pro` | OpenAI API key |
+ | `OPENAI_API_BASE` | OpenAI | API base URL (default: `https://api.openai.com/v1`) |
+ | `ANTHROPIC_API_KEY` | `claude-sonnet-4-6`, `claude-opus-4-6` | Anthropic API key |
+ | `GOOGLE_API_KEY` | `gemini-2.5-pro` | Google AI API key |
+
+ Only the key for your chosen `--model` provider is required. For local models via Ollama, no key is needed.
+
+ ### LLM Defaults
+
+ | Variable | Default | Description |
+ |----------|---------|-------------|
+ | `LLM_MODEL` | `gpt-4o` | Default model when `--model` is not specified |
+ | `LLM_TEMPERATURE` | `0.0` | Default sampling temperature |
+ | `LLM_MAX_TOKENS` | `1024` | Default max tokens per response |
+
+ ### Environment Server
+
  | Variable | Default | Description |
  |----------|---------|-------------|
  | `OPENENV_PORT` | `8000` | OpenEnv server port (exposed) |
__init__.py CHANGED
@@ -10,6 +10,7 @@ from .models import (
  ListToolsAction,
  ListToolsObservation,
  )
+ from .rewards import VisualMemoryChecker, VisualMemoryStepTransform

  __all__ = [
  "VisualMemoryEnv",
@@ -20,4 +21,6 @@ __all__ = [
  "CallToolObservation",
  "ListToolsAction",
  "ListToolsObservation",
+ "VisualMemoryChecker",
+ "VisualMemoryStepTransform",
  ]
agent/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .runner import AgentRunner
+ from .llm import LLMClient
+
+ __all__ = ["AgentRunner", "LLMClient"]
agent/llm.py ADDED
@@ -0,0 +1,114 @@
+ """
+ LLM abstraction layer using LiteLLM.
+
+ Supports any model LiteLLM supports — switch with a single string:
+ - OpenAI: "gpt-4o", "gpt-5.4", "o3-pro"
+ - Anthropic: "claude-opus-4-6", "claude-sonnet-4-6"
+ - Local: "ollama/llama3", "ollama/mistral"
+ - And 100+ more providers
+
+ API keys are read from environment variables (loaded from root .env):
+ OPENAI_API_KEY, ANTHROPIC_API_KEY, etc.
+
+ Usage:
+     from agent.llm import LLMClient
+
+     llm = LLMClient(model="gpt-4o")
+     response = llm.chat(
+         messages=[{"role": "user", "content": "Hello"}],
+         tools=[...],
+     )
+ """
+
+ import json
+ import logging
+ from typing import Any, Dict, List, Optional
+
+ import litellm
+
+ logger = logging.getLogger(__name__)
+
+
+ class LLMClient:
+     """
+     Thin wrapper around LiteLLM for consistent tool-calling across providers.
+
+     The same code works whether you're hitting GPT-4o, Claude, or a local
+     Ollama model — LiteLLM handles the translation.
+     """
+
+     _REASONING_MODELS = {"o3-pro", "o3-mini", "o3", "o1", "o1-mini", "o1-pro", "gpt-5"}
+
+     def __init__(
+         self,
+         model: str,
+         temperature: float = 0.0,
+         max_tokens: int = 1024,
+     ):
+         self.model = model
+
+         if model in self._REASONING_MODELS:
+             self.temperature = 1.0
+             self.max_tokens = max(max_tokens, 4096)
+             if temperature != 1.0:
+                 logger.info(f"Model {model} requires temperature=1.0, overriding from {temperature}")
+         else:
+             self.temperature = temperature
+             self.max_tokens = max_tokens
+
+     def chat(
+         self,
+         messages: List[Dict[str, Any]],
+         tools: Optional[List[Dict[str, Any]]] = None,
+     ) -> Any:
+         """
+         Send messages to the LLM and get a response.
+
+         Args:
+             messages: Conversation history in OpenAI format
+             tools: Optional list of tools in OpenAI function-calling format
+
+         Returns:
+             LiteLLM ModelResponse (same shape as OpenAI ChatCompletion).
+         """
+         kwargs: Dict[str, Any] = {
+             "model": self.model,
+             "messages": messages,
+             "temperature": self.temperature,
+             "max_tokens": self.max_tokens,
+         }
+
+         if tools:
+             kwargs["tools"] = tools
+             kwargs["tool_choice"] = "auto"
+
+         logger.debug(f"LLM request: model={self.model}, messages={len(messages)}, tools={len(tools or [])}")
+         response = litellm.completion(**kwargs)
+         logger.debug(f"LLM response: finish_reason={response.choices[0].finish_reason}")
+
+         return response
+
+     @staticmethod
+     def extract_tool_calls(response) -> List[Dict[str, Any]]:
+         """Extract tool calls from an LLM response."""
+         choice = response.choices[0]
+         if not choice.message.tool_calls:
+             return []
+
+         calls = []
+         for tc in choice.message.tool_calls:
+             args = tc.function.arguments
+             if isinstance(args, str):
+                 args = json.loads(args)
+             calls.append({
+                 "id": tc.id,
+                 "name": tc.function.name,
+                 "arguments": args,
+             })
+         return calls
+
+     @staticmethod
+     def get_text_response(response) -> Optional[str]:
+         """Extract plain text content from an LLM response (if any)."""
+         choice = response.choices[0]
+         return choice.message.content
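The JSON-argument handling in `extract_tool_calls` can be exercised without LiteLLM using stand-in objects (a sketch; the field names mirror the OpenAI response shape the code expects):

```python
import json
from types import SimpleNamespace

def extract_tool_calls(response):
    """Same logic as LLMClient.extract_tool_calls, on any OpenAI-shaped response."""
    choice = response.choices[0]
    if not choice.message.tool_calls:
        return []
    calls = []
    for tc in choice.message.tool_calls:
        args = tc.function.arguments
        if isinstance(args, str):  # providers may return arguments as a JSON string
            args = json.loads(args)
        calls.append({"id": tc.id, "name": tc.function.name, "arguments": args})
    return calls

# Stand-in for a ModelResponse containing one tool call
fake = SimpleNamespace(choices=[SimpleNamespace(message=SimpleNamespace(tool_calls=[
    SimpleNamespace(id="call_1",
                    function=SimpleNamespace(name="reveal_cell",
                                             arguments='{"row": 2, "col": 3}'))
]))])
print(extract_tool_calls(fake)[0]["arguments"])  # {'row': 2, 'col': 3}
```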
agent/runner.py ADDED
@@ -0,0 +1,282 @@
+ """
+ Gym-agnostic Agent Runner — connects an LLM to any OpenEnv environment.
+
+ This module is the CORE of the evaluation platform. It:
+ 1. Receives a pre-connected OpenEnv client (from AutoEnv discovery)
+ 2. Discovers tools via list_tools()
+ 3. Gives the LLM a scenario prompt + available tools
+ 4. Loops: LLM reasons → agent calls env.step() → observation → LLM reasons again
+ 5. Collects an EpisodeLog with timestamps for reward calculation + trajectory logging
+
+ Usage:
+     from openenv import AutoEnv
+     env = AutoEnv.from_env("visual_memory", base_url="http://localhost:8000")
+     runner = AgentRunner(model="gpt-4o", env_client=env)
+     episode, breakdown = runner.run_scenario(scenario, checker)
+ """
+
+ import json
+ import logging
+ import time
+ from datetime import datetime, timezone, timedelta
+ from typing import Any, Dict, List, Tuple
+
+ IST = timezone(timedelta(hours=5, minutes=30))
+
+ from openenv.core.mcp_client import MCPToolClient
+ from openenv.core.env_server.mcp_types import CallToolAction, CallToolObservation, Tool
+
+ from rewards.base import (
+     EpisodeLog,
+     RewardBreakdown,
+     RewardCalculator,
+     Scenario,
+     OpenEnvRewardCalculator,
+ )
+ from .llm import LLMClient
+
+ logger = logging.getLogger(__name__)
+
+
+ SYSTEM_PROMPT = """\
+ You are an AI agent interacting with an environment through tools.
+
+ Your job:
+ 1. Read the task description carefully.
+ 2. Use the available tools to complete the task.
+ 3. Call tools one at a time. Wait for each result before deciding the next step.
+ 4. When the task is complete, respond with a plain text summary of what you did.
+    Do NOT call any more tools after you're done.
+
+ Rules:
+ - Only use tools that are listed as available.
+ - Provide all required arguments for each tool call.
+ - If a tool call fails, read the error and decide how to recover.
+ - Be efficient — complete the task in as few steps as possible.
+ - When you're done, clearly state what you accomplished.
+ """
+
+
+ def mcp_tools_to_openai(tools: List[Tool]) -> List[Dict[str, Any]]:
+     """Convert OpenEnv MCP tool definitions to OpenAI function-calling format."""
+     openai_tools = []
+     for tool in tools:
+         schema = tool.input_schema or {"type": "object", "properties": {}}
+         if "type" not in schema:
+             schema["type"] = "object"
+         if "properties" not in schema:
+             schema["properties"] = {}
+
+         openai_tools.append({
+             "type": "function",
+             "function": {
+                 "name": tool.name,
+                 "description": tool.description or "",
+                 "parameters": schema,
+             },
+         })
+     return openai_tools
+
+
+ def _observation_to_str(step_result) -> str:
+     """Convert an OpenEnv step result to a string the LLM can read."""
+     obs = step_result.observation
+     if isinstance(obs, CallToolObservation):
+         if obs.error:
+             return json.dumps({"error": obs.error.message}, indent=2)
+         result = obs.result
+         if hasattr(result, "data"):
+             result = result.data
+         elif isinstance(result, dict) and "data" in result:
+             result = result["data"]
+         try:
+             return json.dumps(result, indent=2, default=str)
+         except (TypeError, ValueError):
+             return str(result)
+     if hasattr(obs, "metadata") and obs.metadata:
+         return json.dumps(obs.metadata, indent=2, default=str)
+     return str(obs)
+
+
+ class AgentRunner:
+     """
+     Gym-agnostic agent that connects an LLM to any OpenEnv environment.
+
+     Reward modes:
+     - "custom" (default): Episode-level reward via RewardCalculator
+     - "openenv": Per-step reward via Transform + ground truth
+     """
+
+     def __init__(
+         self,
+         model: str,
+         env_client: MCPToolClient,
+         temperature: float = 0.0,
+         max_tokens: int = 1024,
+         reward_mode: str = "custom",
+         transform=None,
+     ):
+         self.llm = LLMClient(
+             model=model,
+             temperature=temperature,
+             max_tokens=max_tokens,
+         )
+         self.env_client = env_client
+         self.reward_mode = reward_mode
+         self.transform = transform
+
+         self.calculator = RewardCalculator()
+
+         if reward_mode == "openenv":
+             self.openenv_calculator = OpenEnvRewardCalculator()
+
+     def run_scenario(
+         self,
+         scenario: Scenario,
+         checker: Any,
+     ) -> Tuple[EpisodeLog, RewardBreakdown]:
+         """Run a single scenario through the LLM agent."""
+         return self._execute(scenario, checker, self.env_client)
+
+     def _execute(
+         self,
+         scenario: Scenario,
+         checker: Any,
+         env: MCPToolClient,
+     ) -> Tuple[EpisodeLog, RewardBreakdown]:
+
+         env.reset()
+
+         session_id = None
+         try:
+             session_result = env.step(
+                 CallToolAction(tool_name="get_session_info", arguments={})
+             )
+             obs = session_result.observation
+             if isinstance(obs, CallToolObservation) and obs.result:
+                 result_data = obs.result
+                 if hasattr(result_data, "data"):
+                     result_data = result_data.data
+                 elif isinstance(result_data, dict) and "data" in result_data:
+                     result_data = result_data["data"]
+                 if isinstance(result_data, dict):
+                     session_id = result_data.get("session_id")
+                 elif isinstance(result_data, str):
+                     import json as _json
+                     try:
+                         parsed = _json.loads(result_data)
+                         session_id = parsed.get("session_id")
+                     except (ValueError, TypeError):
+                         pass
+         except Exception as e:
+             logger.warning(f"Could not get session_id: {e}")
+
+         if session_id and hasattr(checker, "set_session"):
+             checker.set_session(session_id)
+             logger.info(f"Session-scoped checker -> {session_id}")
+
+         if self.transform and hasattr(self.transform, "set_scenario"):
+             self.transform.set_scenario(scenario)
+
+         all_tools = env.list_tools(use_cache=False)
+         tools = [t for t in all_tools if t.name != "get_session_info"]
+         openai_tools = mcp_tools_to_openai(tools)
+         tool_names = [t.name for t in tools]
+         logger.info(f"Discovered {len(tools)} agent tools: {tool_names}")
+
+         messages = [
+             {"role": "system", "content": SYSTEM_PROMPT},
+             {"role": "user", "content": scenario.prompt},
+         ]
+
+         episode = EpisodeLog()
+         step_rewards = []
+         final_answer = None
+
+         for step_num in range(1, scenario.max_steps + 1):
+             logger.info(f"Step {step_num}/{scenario.max_steps}")
+
+             response = self.llm.chat(messages, tools=openai_tools)
+             tool_calls = LLMClient.extract_tool_calls(response)
+
+             if not tool_calls:
+                 final_answer = LLMClient.get_text_response(response)
+                 logger.info(f"Agent done. Final answer: {(final_answer or '')[:100]}...")
+                 break
+
+             messages.append(response.choices[0].message.model_dump())
+
+             for tc in tool_calls:
+                 tool_name = tc["name"]
+                 arguments = tc["arguments"]
+                 call_id = tc["id"]
+
+                 logger.info(f"  Tool: {tool_name}({json.dumps(arguments, default=str)[:100]})")
+
+                 step_ts = datetime.now(IST).isoformat()
+                 step_start = time.time()
+                 error_msg = None
+                 try:
+                     step_result = env.step(
+                         CallToolAction(tool_name=tool_name, arguments=arguments)
+                     )
+                     obs = step_result.observation
+                     is_error = (
+                         isinstance(obs, CallToolObservation)
+                         and obs.error is not None
+                     )
+                     result_str = _observation_to_str(step_result)
+                     if is_error and isinstance(obs, CallToolObservation):
+                         error_msg = obs.error.message
+                 except Exception as exc:
+                     is_error = True
+                     error_msg = str(exc)
+                     result_str = json.dumps({"error": error_msg})
+                     obs = None
+
+                 step_elapsed = time.time() - step_start
+
+                 if self.reward_mode == "openenv" and self.transform and obs is not None:
+                     transformed = self.transform(obs)
+                     step_rewards.append(
+                         transformed.reward if transformed.reward is not None else 0.0
+                     )
+
+                 episode.add_step(
+                     tool_name=tool_name,
+                     arguments=arguments,
+                     success=not is_error,
+                     result=result_str,
+                     error=error_msg,
+                     timestamp=step_ts,
+                     elapsed=step_elapsed,
+                 )
+
+                 logger.info(f"  -> success={not is_error} ({step_elapsed:.2f}s)")
+
+                 messages.append({
+                     "role": "tool",
+                     "tool_call_id": call_id,
+                     "content": result_str,
+                 })
+
+         if hasattr(checker, "set_episode"):
+             checker.set_episode(episode)
+
+         outcome_results = checker.check_all(scenario.outcome_checks)
+
+         if self.reward_mode == "openenv":
+             breakdown = self.openenv_calculator.calculate(
+                 step_rewards=step_rewards,
+                 outcome_results=outcome_results,
+                 max_steps=scenario.max_steps,
+                 actual_steps=len(episode.steps),
+             )
+         else:
+             breakdown = self.calculator.calculate(
+                 episode=episode,
+                 scenario=scenario,
+                 outcome_results=outcome_results,
+             )
+
+         return episode, breakdown
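The schema normalization in `mcp_tools_to_openai` can be checked standalone with a minimal stand-in tool (a sketch; the real `Tool` type comes from `openenv.core.env_server.mcp_types`):

```python
from types import SimpleNamespace

def mcp_tool_to_openai(tool):
    """Mirror of the conversion above: one MCP tool -> OpenAI function-calling dict."""
    schema = tool.input_schema or {"type": "object", "properties": {}}
    schema.setdefault("type", "object")       # fill in missing top-level type
    schema.setdefault("properties", {})       # fill in missing properties map
    return {
        "type": "function",
        "function": {
            "name": tool.name,
            "description": tool.description or "",
            "parameters": schema,
        },
    }

# Stand-in tool with a partial schema and no description
tool = SimpleNamespace(name="flag_cell", description=None,
                       input_schema={"properties": {"row": {"type": "integer"}}})
converted = mcp_tool_to_openai(tool)
print(converted["function"]["name"])  # flag_cell
```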
play.html ADDED
@@ -0,0 +1,669 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+   <meta charset="UTF-8">
+   <meta name="viewport" content="width=device-width, initial-scale=1.0">
+   <title>Phantom Grid — Visual Memory Game</title>
+   <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;600;800&family=Orbitron:wght@500;700;900&display=swap" rel="stylesheet">
+   <style>
+     :root {
+       --bg: #0d0d1a;
+       --panel: #141428;
+       --border: #2a2a50;
+       --accent: #7b61ff;
+       --accent-glow: rgba(123,97,255,0.3);
+       --danger: #ff4d4d;
+       --success: #4dff88;
+       --warning: #ffb84d;
+       --text: #e0e0f0;
+       --text-dim: #8888aa;
+       --text-bright: #ffffff;
+     }
+     * { margin:0; padding:0; box-sizing:border-box; }
+     body {
+       background: var(--bg);
+       color: var(--text);
+       font-family: 'JetBrains Mono', monospace;
+       min-height: 100vh;
+     }
+     .header {
+       text-align: center;
+       padding: 24px 16px 16px;
+       border-bottom: 1px solid var(--border);
+       background: linear-gradient(180deg, #12122a 0%, var(--bg) 100%);
+     }
+     .header h1 {
+       font-family: 'Orbitron', sans-serif;
+       font-weight: 900;
+       font-size: 28px;
+       letter-spacing: 3px;
+       background: linear-gradient(135deg, var(--accent), #ff61a6);
+       -webkit-background-clip: text;
+       -webkit-text-fill-color: transparent;
+       margin-bottom: 4px;
+     }
+     .header p { color: var(--text-dim); font-size: 12px; }
+     .layout {
+       display: flex;
+       gap: 16px;
+       padding: 16px;
+       max-width: 1400px;
+       margin: 0 auto;
+       align-items: flex-start;
+       justify-content: center;
+     }
+     .board-panel {
+       flex: 0 0 auto;
+       background: var(--panel);
+       border: 1px solid var(--border);
+       border-radius: 12px;
+       padding: 16px;
+       min-height: 420px;
+       display: flex;
+       flex-direction: column;
+       align-items: center;
+       justify-content: center;
+       position: relative;
+     }
+     .board-wrap { position: relative; cursor: crosshair; }
+     .board-wrap svg { display: block; }
+     .click-overlay {
+       position: absolute;
+       top: 0;
+       left: 0;
+       width: 100%;
+       height: 100%;
+     }
+     .click-cell {
+       position: absolute;
+       border: 2px solid transparent;
+       transition: border-color 0.15s;
+       cursor: pointer;
+     }
+     .click-cell:hover { border-color: var(--accent); }
+     .click-cell.selected { border-color: var(--warning); background: rgba(255,184,77,0.15); }
+     .selected-label {
+       position: absolute;
+       bottom: -22px;
+       left: 0;
+       right: 0;
+       text-align: center;
+       font-size: 11px;
+       color: var(--warning);
+       font-weight: 600;
+     }
+     .controls {
+       flex: 1;
+       display: flex;
+       flex-direction: column;
+       gap: 12px;
+       min-width: 300px;
+       max-width: 380px;
+     }
+     .card {
+       background: var(--panel);
+       border: 1px solid var(--border);
+       border-radius: 10px;
+       padding: 14px;
+     }
+     .card h3 {
+       font-family: 'Orbitron', sans-serif;
+       font-size: 11px;
+       font-weight: 700;
+       letter-spacing: 2px;
+       text-transform: uppercase;
+       color: var(--accent);
+       margin-bottom: 10px;
+     }
+     .scenario-grid {
+       display: grid;
+       grid-template-columns: 1fr 1fr;
+       gap: 6px;
+     }
+     .scenario-btn {
+       background: transparent;
+       border: 1px solid var(--border);
+       color: var(--text);
+       padding: 8px 6px;
+       border-radius: 6px;
+       font-family: 'JetBrains Mono', monospace;
+       font-size: 10px;
+       cursor: pointer;
+       transition: all 0.2s;
+       text-align: left;
+     }
+     .scenario-btn:hover { border-color: var(--accent); background: var(--accent-glow); }
+     .scenario-btn.active { border-color: var(--accent); background: var(--accent-glow); color: var(--text-bright); }
+     .scenario-btn:disabled { opacity: 0.5; cursor: wait; }
+     .action-row {
+       display: flex;
+       gap: 6px;
+       align-items: center;
+       flex-wrap: wrap;
+     }
+     .btn {
+       padding: 8px 14px;
+       border: 1px solid var(--border);
+       border-radius: 6px;
+       font-family: 'JetBrains Mono', monospace;
+       font-size: 11px;
+       cursor: pointer;
+       transition: all 0.2s;
+       font-weight: 600;
+       white-space: nowrap;
+     }
+     .btn:disabled { opacity: 0.4; cursor: not-allowed; }
+     .btn-reveal { background: #1a3a5c; color: #7bc8ff; border-color: #2a5a8c; }
+     .btn-reveal:hover:not(:disabled) { background: #2a4a6c; }
+     .btn-flag { background: #5c1a1a; color: #ff7b7b; border-color: #8c2a2a; }
+     .btn-flag:hover:not(:disabled) { background: #6c2a2a; }
+     .btn-inspect { background: #1a3a2a; color: #7bffaa; border-color: #2a5a3a; }
+     .btn-inspect:hover:not(:disabled) { background: #2a4a3a; }
+     .btn-viewport { background: #3a2a1a; color: #ffcc7b; border-color: #5a4a2a; }
+     .btn-viewport:hover:not(:disabled) { background: #4a3a2a; }
+     .btn-submit {
+       background: var(--accent);
+       color: white;
+       border-color: var(--accent);
+       width: 100%;
+       padding: 10px;
+       font-size: 13px;
+       letter-spacing: 1px;
+     }
+     .btn-submit:hover:not(:disabled) { filter: brightness(1.2); }
+     .btn-secondary { background: transparent; color: var(--text-dim); border-color: var(--border); }
+     .btn-secondary:hover:not(:disabled) { color: var(--text); border-color: var(--text-dim); }
+     .status-bar { display: grid; grid-template-columns: repeat(4, 1fr); gap: 6px; }
+     .stat { text-align: center; padding: 8px 4px; background: var(--bg); border-radius: 6px; }
+     .stat-val { font-family: 'Orbitron', sans-serif; font-size: 18px; font-weight: 700; color: var(--text-bright); }
+     .stat-label { font-size: 9px; color: var(--text-dim); margin-top: 2px; }
+     .mode-toggle {
+       display: flex;
+       gap: 4px;
+       margin-bottom: 8px;
+     }
+     .mode-btn {
+       flex: 1;
+       padding: 8px 4px;
+       border: 1px solid var(--border);
+       border-radius: 6px;
+       font-family: 'JetBrains Mono', monospace;
+       font-size: 11px;
+       cursor: pointer;
+       text-align: center;
+       transition: all 0.2s;
+       background: transparent;
+       color: var(--text-dim);
+     }
+     .mode-btn.active-reveal { border-color: #2a5a8c; background: #1a3a5c; color: #7bc8ff; }
+     .mode-btn.active-flag { border-color: #8c2a2a; background: #5c1a1a; color: #ff7b7b; }
+     .help-text { font-size: 10px; color: var(--text-dim); line-height: 1.5; margin-top: 6px; }
+     #log {
+       max-height: 200px;
+       overflow-y: auto;
+       font-size: 11px;
+       line-height: 1.6;
+       color: var(--text-dim);
+       background: var(--bg);
+       border-radius: 6px;
+       padding: 8px;
+     }
+     .log-entry { border-bottom: 1px solid #1a1a30; padding: 2px 0; }
+     .log-success { color: var(--success) !important; }
+     .log-danger { color: var(--danger) !important; }
+     .log-warn { color: var(--warning) !important; }
+     .log-info { color: var(--accent) !important; }
+     .empty-board {
+       color: var(--text-dim);
+       font-size: 14px;
+       text-align: center;
+       padding: 80px 20px;
+       line-height: 2;
+     }
+     .flagged-list {
+       font-size: 11px;
+       color: var(--text-dim);
+       background: var(--bg);
+       border-radius: 6px;
+       padding: 8px;
+       min-height: 28px;
+       word-break: break-all;
+     }
+     .game-over-banner {
+       text-align: center;
+       padding: 12px;
+       border-radius: 8px;
+       font-family: 'Orbitron', sans-serif;
+       font-size: 14px;
+       font-weight: 700;
+       letter-spacing: 2px;
+       animation: pulse 1.5s infinite;
+     }
+     @keyframes pulse { 0%,100% { opacity: 1; } 50% { opacity: 0.7; } }
+     .win { background: rgba(77,255,136,0.15); color: var(--success); border: 1px solid var(--success); }
+     .lose { background: rgba(255,77,77,0.15); color: var(--danger); border: 1px solid var(--danger); }
+     .scenario-info {
+       background: var(--bg);
+       border: 1px solid var(--border);
+       border-radius: 8px;
+       padding: 12px 14px;
+       margin-top: 10px;
+       font-size: 12px;
+       line-height: 1.6;
+       color: var(--text);
+       max-width: 540px;
+       width: 100%;
+     }
+     .scenario-info .info-title {
+       font-family: 'Orbitron', sans-serif;
+       font-size: 10px;
+       font-weight: 700;
+       letter-spacing: 2px;
+       text-transform: uppercase;
+       color: var(--success);
+       margin-bottom: 6px;
+     }
+     .scenario-info .info-goal {
+       color: var(--warning);
+       font-weight: 600;
+       margin-bottom: 4px;
+       font-size: 11px;
+     }
+     .scenario-info .info-text {
+       color: var(--text-dim);
+       font-size: 11px;
+     }
+     .legend {
+       display: flex; gap: 12px; flex-wrap: wrap;
+       justify-content: center; margin-top: 10px; font-size: 10px;
+     }
+     .legend-item { display: flex; align-items: center; gap: 4px; }
+     .legend-swatch {
+       width: 14px; height: 14px; border-radius: 3px; border: 1px solid #3d3d5a;
+     }
+   </style>
+ </head>
+ <body>
+   <div class="header">
+     <h1>PHANTOM GRID</h1>
+     <p>Visual Memory Gym — Click cells to reveal or flag. Play server on port 8001.</p>
+   </div>
+   <div class="layout">
+     <div class="board-panel">
+       <div id="board-container">
+         <div class="empty-board">
+           Select a scenario on the right to begin.<br><br>
+           <b>How to play:</b><br>
+           1. Pick a scenario<br>
+           2. Click cells on the board to reveal or flag them<br>
+           3. Use signals (numbers) to deduce hazard locations<br>
+           4. Flag all hazards, then submit your solution
+         </div>
+       </div>
+       <div class="scenario-info" id="scenario-info" style="display:none">
+         <div class="info-title">HOW TO PLAY</div>
+         <div class="info-goal" id="info-goal"></div>
+         <div class="info-text" id="info-text"></div>
+       </div>
+       <div class="legend" id="legend" style="display:none">
+         <div class="legend-item"><div class="legend-swatch" style="background:#2d2d4a"></div> Hidden</div>
+         <div class="legend-item"><div class="legend-swatch" style="background:#d0e8ff"></div> Signal</div>
+         <div class="legend-item"><div class="legend-swatch" style="background:#e8e8f0"></div> Empty</div>
+         <div class="legend-item"><div class="legend-swatch" style="background:#ff4d4d"></div> Hazard</div>
+         <div class="legend-item"><div class="legend-swatch" style="background:#ff6b35"></div> Flagged</div>
+         <div class="legend-item"><div class="legend-swatch" style="background:#ffd700"></div> Key</div>
+         <div class="legend-item"><div class="legend-swatch" style="background:#c8b8e8"></div> Decoy</div>
+         <div class="legend-item"><div class="legend-swatch" style="background:#50fa7b"></div> Goal</div>
+         <div class="legend-item"><div class="legend-swatch" style="background:#111122"></div> Fog</div>
+       </div>
+     </div>
+     <div class="controls">
+       <div class="card">
+         <h3>Scenario</h3>
+         <div class="scenario-grid" id="scenario-grid"></div>
+       </div>
+       <div class="card">
+         <h3>Status</h3>
+         <div class="status-bar">
+           <div class="stat"><div class="stat-val" id="stat-steps">—</div><div class="stat-label">STEPS</div></div>
+           <div class="stat"><div class="stat-val" id="stat-max">—</div><div class="stat-label">MAX</div></div>
+           <div class="stat"><div class="stat-val" id="stat-flags">—</div><div class="stat-label">FLAGS</div></div>
+           <div class="stat"><div class="stat-val" id="stat-revealed">—</div><div class="stat-label">REVEALED</div></div>
+         </div>
+         <div id="game-over-slot" style="margin-top:8px"></div>
+       </div>
+       <div class="card">
+         <h3>Click Mode</h3>
+         <div class="mode-toggle">
+           <button class="mode-btn active-reveal" id="mode-reveal" onclick="setMode('reveal')">
+             Reveal
+           </button>
+           <button class="mode-btn" id="mode-flag" onclick="setMode('flag')">
+             Flag Hazard
+           </button>
+         </div>
+         <p class="help-text" id="mode-help">Click any dark (hidden) cell on the board to reveal it.</p>
+       </div>
+       <div class="card">
+         <h3>Tools</h3>
+         <div style="display:flex;flex-direction:column;gap:8px">
+           <div class="action-row">
+             <button class="btn btn-inspect" onclick="doInspect()">Inspect Region</button>
+             <button class="btn btn-viewport" onclick="doMoveViewport()">Move Viewport</button>
+           </div>
+           <div class="action-row">
+             <button class="btn btn-secondary" onclick="doRecall()">Recall Log</button>
+             <button class="btn btn-secondary" onclick="doGetStatus()">Get Status</button>
+           </div>
+         </div>
+       </div>
+       <div class="card">
+         <h3>Flagged Cells <span id="flag-count" style="color:var(--text-dim)">(0)</span></h3>
+         <div class="flagged-list" id="flagged-list">No cells flagged yet</div>
+       </div>
+       <div class="card">
+         <button class="btn btn-submit" onclick="doSubmit()">SUBMIT SOLUTION</button>
+       </div>
+       <div class="card">
+         <h3>Game Log</h3>
+         <div id="log"></div>
+       </div>
+     </div>
+   </div>
+
+   <script>
+     const API = 'http://localhost:8001';
+     let clickMode = 'reveal';
+     let boardWidth = 0;
+     let boardHeight = 0;
+     let flaggedCells = [];
+     let selectedRow = -1;
+     let selectedCol = -1;
+     let gameOver = false;
+
+     const CELL_SIZE = 48;
+     const PADDING = 24;
+
+     const SCENARIOS = [
+       {id:'flash_fade_minefield_7x7', label:'Flash Fade 7x7', desc:'Pattern memory'},
+       {id:'directional_trap_8x8', label:'Directional Trap 8x8', desc:'1 hit = fatal'},
+       {id:'partial_intel_9x9', label:'Partial Intel 9x9', desc:'Incomplete signals'},
+       {id:'delayed_recall_keys_8x8', label:'Delayed Recall 8x8', desc:'Collect 5 keys'},
+       {id:'ambiguous_cluster_10x10', label:'Ambiguous Cluster 10x10', desc:'Range signals'},
+       {id:'decoy_minefield_8x10', label:'Decoy Minefield 8x10', desc:'4 keys, 8 decoys'},
+       {id:'fog_labyrinth_10x10', label:'Fog Labyrinth 10x10', desc:'Viewport radius 2'},
+       {id:'fog_key_hunt_8x8', label:'Fog Key Hunt 8x8', desc:'Tiny viewport'},
+       {id:'cascading_deduction_11x11', label:'Cascading 11x11', desc:'25 hazards'},
+       {id:'safe_zone_identification_9x9', label:'Safe Zone ID 9x9', desc:'Find safe cells'},
+     ];
+
+     function initScenarios() {
+       const grid = document.getElementById('scenario-grid');
+       SCENARIOS.forEach(s => {
+         const btn = document.createElement('button');
+         btn.className = 'scenario-btn';
+         btn.innerHTML = `${s.label}<br><span style="color:var(--text-dim);font-size:9px">${s.desc}</span>`;
+         btn.dataset.id = s.id;
+         btn.onclick = () => loadScenario(s.id);
+         grid.appendChild(btn);
+       });
+     }
+
+     function setMode(mode) {
+       clickMode = mode;
+       document.getElementById('mode-reveal').className = 'mode-btn' + (mode === 'reveal' ? ' active-reveal' : '');
+       document.getElementById('mode-flag').className = 'mode-btn' + (mode === 'flag' ? ' active-flag' : '');
+       document.getElementById('mode-help').textContent = mode === 'reveal'
+         ? 'Click any dark (hidden) cell on the board to reveal it.'
+         : 'Click any dark (hidden) cell to flag it as a hazard.';
+     }
+
+     function log(msg, cls) {
+       const el = document.getElementById('log');
+       const div = document.createElement('div');
+       div.className = 'log-entry ' + (cls || '');
+       div.textContent = `[${new Date().toLocaleTimeString()}] ${msg}`;
+       el.prepend(div);
+     }
+
+     function updateBoard(svgText) {
+       if (!svgText) return;
+       const container = document.getElementById('board-container');
+       const wrap = document.createElement('div');
+       wrap.className = 'board-wrap';
+       wrap.innerHTML = svgText;
+
+       const overlay = document.createElement('div');
+       overlay.className = 'click-overlay';
+
+       for (let r = 0; r < boardHeight; r++) {
+         for (let c = 0; c < boardWidth; c++) {
+           const cell = document.createElement('div');
+           cell.className = 'click-cell';
+           cell.setAttribute('role', 'button');
+           cell.setAttribute('aria-label', `cell ${r} ${c}`);
+           cell.setAttribute('tabindex', '0');
+           cell.style.left = (PADDING + c * CELL_SIZE) + 'px';
+           cell.style.top = (PADDING + r * CELL_SIZE) + 'px';
+           cell.style.width = CELL_SIZE + 'px';
+           cell.style.height = CELL_SIZE + 'px';
+           cell.dataset.row = r;
+           cell.dataset.col = c;
+           cell.title = `(${r}, ${c})`;
+           cell.onclick = () => onCellClick(r, c);
+           overlay.appendChild(cell);
+         }
+       }
+
+       wrap.appendChild(overlay);
+       container.innerHTML = '';
+       container.appendChild(wrap);
+       document.getElementById('legend').style.display = 'flex';
+     }
+
+     function updateStats(status) {
+       if (!status) return;
+       document.getElementById('stat-steps').textContent = status.step_count ?? '—';
+       document.getElementById('stat-max').textContent = status.max_steps ?? '—';
+       document.getElementById('stat-flags').textContent = status.flags_remaining ?? '—';
+       document.getElementById('stat-revealed').textContent = status.cells_revealed ?? '—';
+       gameOver = !!status.game_over;
+     }
+
+     function updateFlaggedList() {
+       const el = document.getElementById('flagged-list');
+       document.getElementById('flag-count').textContent = `(${flaggedCells.length})`;
+       if (flaggedCells.length === 0) { el.textContent = 'No cells flagged yet'; return; }
+       el.textContent = flaggedCells.map(c => `[${c[0]},${c[1]}]`).join(' ');
+     }
+
+     function showGameOver(won, msg) {
+       document.getElementById('game-over-slot').innerHTML =
+         `<div class="game-over-banner ${won ? 'win' : 'lose'}">${msg}</div>`;
+     }
+
+     async function api(method, path, body) {
+       try {
+         const opts = { method, headers: {'Content-Type': 'application/json'} };
+         if (body !== undefined) opts.body = JSON.stringify(body);
+         const resp = await fetch(API + path, opts);
+         return await resp.json();
+       } catch (e) {
+         log('Connection error: ' + e.message + ' — is play_server.py running on port 8001?', 'log-danger');
+         return { error: e.message };
+       }
+     }
+
+     function processResponse(data) {
+       if (data.error) { log(data.error, 'log-danger'); return; }
+       if (data.board && data.board.svg) {
+         updateBoard(data.board.svg);
+         if (data.board.metadata) {
+           document.getElementById('stat-revealed').textContent = data.board.metadata.cell_counts?.revealed ?? '—';
+         }
+       }
+       if (data.status) updateStats(data.status);
+       return data.action_result;
+     }
+
+     async function loadScenario(id) {
+       document.querySelectorAll('.scenario-btn').forEach(b => { b.classList.remove('active'); b.disabled = true; });
+       const btn = document.querySelector(`[data-id='${id}']`);
+       if (btn) btn.classList.add('active');
+       document.getElementById('game-over-slot').innerHTML = '';
+       flaggedCells = [];
+       updateFlaggedList();
+       document.getElementById('log').innerHTML = '';
+       gameOver = false;
+
+       log(`Loading scenario: ${id}...`, 'log-info');
+       const data = await api('POST', '/load', { scenario_id: id });
+
+       document.querySelectorAll('.scenario-btn').forEach(b => b.disabled = false);
+
+       if (data.error) { log(data.error, 'log-danger'); return; }
+       boardWidth = data.status?.board_size ? parseInt(data.status.board_size.split('x')[0]) : 0;
+       boardHeight = data.status?.board_size ? parseInt(data.status.board_size.split('x')[1]) : 0;
+       processResponse(data);
+
+       const winLabels = {
+         'flag_all_hazards': 'Flag all hazards, then submit.',
+         'collect_keys': 'Find and reveal all keys to win.',
+         'identify_safe_cells': 'Identify all safe (non-hazard) cells, then submit.',
+         'reach_goal': 'Reach the goal cell to win.',
+       };
+       const wc = data.status?.win_condition || '';
+       const goalText = winLabels[wc] || `Win condition: ${wc}`;
+       const howTo = data.how_to_play || '';
+
+       const infoEl = document.getElementById('scenario-info');
+       if (howTo) {
+         document.getElementById('info-goal').textContent = goalText;
+         document.getElementById('info-text').textContent = howTo;
+         infoEl.style.display = 'block';
+       } else {
+         infoEl.style.display = 'none';
+       }
+
+       log(`Loaded ${boardWidth}x${boardHeight} board | Type: ${data.status?.scenario_type} | Win: ${data.status?.win_condition} | Max steps: ${data.status?.max_steps}`, 'log-success');
+     }
+
+     async function onCellClick(row, col) {
+       if (gameOver) { log('Game is over. Load a new scenario to play again.', 'log-warn'); return; }
+       if (boardWidth === 0) { log('Load a scenario first.', 'log-warn'); return; }
+
+       if (clickMode === 'reveal') {
+         log(`Revealing (${row}, ${col})...`);
+         const data = await api('POST', '/reveal', { row, col });
+         const result = processResponse(data);
+         if (!result) return;
+         if (result.error) { log(result.error, 'log-danger'); return; }
+
+         const t = result.type || '';
+         if (t === 'hazard') log(`HAZARD at (${row},${col})!${result.game_over ? ' GAME OVER!' : ''}`, 'log-danger');
+         else if (t === 'key') log(`KEY found at (${row},${col})!`, 'log-success');
+         else if (t === 'signal') log(`Signal at (${row},${col}): ${JSON.stringify(result.value)}`, 'log-info');
+         else if (t === 'decoy') log(`Decoy at (${row},${col})`, 'log-warn');
+         else if (t === 'goal') log(`GOAL reached at (${row},${col})!`, 'log-success');
+         else if (t === 'empty') log(`Empty cell at (${row},${col})`, '');
+         else log(`Cell (${row},${col}): ${JSON.stringify(result).slice(0,120)}`, 'log-info');
+
+         if (result.game_over && result.message) {
+           showGameOver(!!result.message?.includes('win') || !!result.message?.includes('Win'), result.message);
+         }
+
+       } else {
+         const alreadyFlagged = flaggedCells.some(f => f[0] === row && f[1] === col);
+         if (alreadyFlagged) {
+           log(`Unflagging (${row}, ${col})...`);
+           const data = await api('POST', '/unflag', { row, col });
+           const result = processResponse(data);
+           if (!result) return;
+           if (result.error) { log(result.error, 'log-danger'); return; }
+           flaggedCells = flaggedCells.filter(f => !(f[0] === row && f[1] === col));
+           updateFlaggedList();
+           log(`Unflagged (${row},${col})`, 'log-warn');
+         } else {
+           log(`Flagging (${row}, ${col})...`);
+           const data = await api('POST', '/flag', { row, col });
+           const result = processResponse(data);
+           if (!result) return;
+           if (result.error) { log(result.error, 'log-danger'); return; }
+           flaggedCells.push([row, col]);
+           updateFlaggedList();
+           log(`Flagged (${row},${col}) as hazard | ${result.flags_remaining} flags left`, 'log-warn');
+           if (result.game_over && result.message) {
+             showGameOver(true, result.message);
+           }
+         }
+       }
+     }
+
+     async function doInspect() {
+       const row = parseInt(prompt('Center row:', '0'));
+       const col = parseInt(prompt('Center col:', '0'));
+       const radius = parseInt(prompt('Radius (1-3):', '1'));
+       if (isNaN(row) || isNaN(col)) return;
+       log(`Inspecting region around (${row},${col}) r=${radius}...`, 'log-info');
+       const data = await api('POST', '/inspect', { center_row: row, center_col: col, radius: radius || 1 });
+       const result = processResponse(data);
+       if (!result) return;
+       if (result.error) { log(result.error, 'log-danger'); return; }
+       const cells = result.cells || [];
+       log(`Inspected ${cells.length} cells`, 'log-info');
+       cells.forEach(c => {
+         if (c.state !== 'hidden' && c.state !== 'fog') {
+           log(`  (${c.row},${c.col}): ${c.state} ${c.content ? JSON.stringify(c.content) : ''}`, 'log-info');
+         }
+       });
+     }
+
+     async function doMoveViewport() {
+       const row = parseInt(prompt('Viewport center row:', '0'));
+       const col = parseInt(prompt('Viewport center col:', '0'));
+       if (isNaN(row) || isNaN(col)) return;
+       log(`Moving viewport to (${row},${col})...`, 'log-info');
+       const data = await api('POST', '/move_viewport', { row, col });
+       const result = processResponse(data);
+       if (result && result.error) log(result.error, 'log-danger');
+       else log(`Viewport moved to (${row},${col})`, 'log-info');
+     }
+
+     async function doRecall() {
+       const data = await api('GET', '/recall');
+       if (data.error) { log(data.error, 'log-danger'); return; }
+       const sigs = data.discovered_signals || [];
+       const mems = data.memory_events || [];
+       log(`Recall: ${sigs.length} signals, ${mems.length} memory events`, 'log-info');
+       sigs.forEach(s => log(`  Signal (${s.row},${s.col}): ${s.type} = ${JSON.stringify(s.value)}`, 'log-info'));
+       mems.slice(-5).forEach(m => log(`  Memory: step ${m.step} ${m.event} (${m.row},${m.col})`, 'log-info'));
+     }
+
+     async function doGetStatus() {
+       const data = await api('GET', '/status');
+       if (data.error) { log(data.error, 'log-danger'); return; }
+       updateStats(data);
+       log(`Step ${data.step_count}/${data.max_steps} | Flags: ${data.flags_placed}/${data.flags_remaining + data.flags_placed} | Hits: ${data.hazard_hits} | ${data.game_over ? 'GAME OVER' : 'Active'} | Win: ${data.win_condition}`, 'log-info');
+     }
+
+     async function doSubmit() {
+       if (!confirm(`Submit solution with ${flaggedCells.length} flagged cells?`)) return;
+       log('Submitting solution...', 'log-warn');
+       const data = await api('POST', '/submit', {
+         flagged_positions: flaggedCells,
+         safe_positions: [],
+       });
+       const result = processResponse(data);
+       if (!result) return;
+       if (result.error) { log(result.error, 'log-danger'); return; }
+       const won = result.correct === true;
+       const prec = ((result.precision || 0) * 100).toFixed(1);
+       const rec = ((result.recall || 0) * 100).toFixed(1);
+       log(`${won ? 'CORRECT!' : 'INCORRECT'} | Precision: ${prec}% | Recall: ${rec}% | Found: ${result.hazards_found||0}/${result.hazards_total||'?'}`, won ? 'log-success' : 'log-danger');
+       showGameOver(won, won ? 'YOU WIN!' : `INCORRECT — ${prec}% precision, ${rec}% recall`);
+     }
+
+     initScenarios();
+   </script>
+ </body>
+ </html>
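The submit flow above reports `precision` and `recall` over the flagged cells. As an illustrative sketch of that scoring (hypothetical; the actual computation lives inside the game engine, which is not shown in this diff), treating both the flagged set and the true hazard set as sets of `(row, col)` tuples:

```python
def score_flags(flagged, hazards):
    """Precision/recall of flagged cells against the true hazard cells.

    flagged, hazards: iterables of [row, col] pairs (as sent to /submit).
    """
    flagged = set(map(tuple, flagged))
    hazards = set(map(tuple, hazards))
    hits = flagged & hazards  # correctly flagged hazards
    precision = len(hits) / len(flagged) if flagged else 0.0
    recall = len(hits) / len(hazards) if hazards else 0.0
    return precision, recall
```

Under this reading, `correct` corresponds to precision and recall both reaching 1.0, i.e. the flagged set exactly matching the hazard set.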
play_server.py ADDED
@@ -0,0 +1,229 @@
+ """Standalone play server for manual Phantom Grid gameplay.
+
+ Completely separate from the OpenEnv app.py — does NOT affect
+ HuggingFace deployment, Docker builds, or run_eval.py in any way.
+
+ Runs on port 8001 by default. Uses the game engine and renderer directly.
+
+ Usage:
+     cd visual-memory
+     python play_server.py
+     # Then open play.html in a browser
+ """
+
+ from __future__ import annotations
+
+ import json
+ import os
+ import sys
+ from pathlib import Path
+
+ import uvicorn
+ from fastapi import FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import HTMLResponse, FileResponse
+ from pydantic import BaseModel
+
+ sys.path.insert(0, str(Path(__file__).resolve().parent))
+
+ from server.engine import GameEngine
+ from server.renderer import Renderer
+
+ SCENARIOS_DIR = os.path.join(os.path.dirname(__file__), "scenarios")
+
+ app = FastAPI(title="Phantom Grid — Play Server")
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ engine: GameEngine | None = None
+ renderer = Renderer()
+
+
+ def _load_scenario_file(scenario_id: str) -> dict:
+     path = os.path.join(SCENARIOS_DIR, f"{scenario_id}.json")
+     if not os.path.isfile(path):
+         raise FileNotFoundError(f"Scenario '{scenario_id}' not found at {path}")
+     with open(path) as f:
+         return json.load(f)
+
+
+ def _board_response() -> dict:
+     """Build a unified response with board SVG + game status."""
+     if engine is None:
+         return {"error": "No scenario loaded."}
+     bs = engine.get_board_state()
+     view = renderer.get_board_view(
+         bs.visible_cells, bs.board_width, bs.board_height,
+         scenario_type=bs.scenario_type, step_count=bs.step_count,
+     )
+     status = engine.get_status()
+     return {"board": view, "status": status}
+
+
+ @app.get("/")
+ async def index():
+     html_path = os.path.join(os.path.dirname(__file__), "play.html")
+     if os.path.isfile(html_path):
+         return FileResponse(html_path, media_type="text/html")
+     return HTMLResponse("<h1>play.html not found</h1>", status_code=404)
+
+
+ @app.get("/scenarios")
+ async def list_scenarios():
+     results = []
+     for fname in sorted(os.listdir(SCENARIOS_DIR)):
+         if not fname.endswith(".json"):
+             continue
+         try:
+             data = _load_scenario_file(fname.replace(".json", ""))
+             results.append({
+                 "scenario_id": data.get("scenario_id", fname.replace(".json", "")),
+                 "type": data.get("type", "hidden_grid"),
+                 "difficulty": data.get("difficulty", "hard"),
+                 "board_size": f"{data.get('board_width', '?')}x{data.get('board_height', '?')}",
+                 "description": data.get("description", ""),
+             })
+         except Exception:
+             continue
+     return {"scenarios": results}
+
+
+ class LoadReq(BaseModel):
+     scenario_id: str
+
+ @app.post("/load")
+ async def load_scenario(req: LoadReq):
+     global engine
+     try:
+         data = _load_scenario_file(req.scenario_id)
+     except FileNotFoundError as e:
+         return {"error": str(e)}
+     engine = GameEngine(data)
+     resp = _board_response()
+     resp["loaded"] = True
+     resp["how_to_play"] = data.get("how_to_play", "")
+     resp["scenario_description"] = data.get("description", "")
+     return resp
+
+
+ class CellReq(BaseModel):
+     row: int
+     col: int
+
+ @app.post("/reveal")
+ async def reveal(req: CellReq):
+     if engine is None:
+         return {"error": "No scenario loaded."}
+     result = engine.reveal_cell(req.row, req.col)
+     resp = _board_response()
+     resp["action_result"] = result
+     return resp
+
+
+ @app.post("/flag")
+ async def flag(req: CellReq):
+     if engine is None:
+         return {"error": "No scenario loaded."}
+     result = engine.flag_cell(req.row, req.col)
+     resp = _board_response()
+     resp["action_result"] = result
+     return resp
+
+
+ @app.post("/unflag")
+ async def unflag(req: CellReq):
+     if engine is None:
+         return {"error": "No scenario loaded."}
+     result = engine.unflag_cell(req.row, req.col)
+     resp = _board_response()
+     resp["action_result"] = result
+     return resp
+
+
+ @app.post("/move_viewport")
+ async def move_viewport(req: CellReq):
+     if engine is None:
+         return {"error": "No scenario loaded."}
+     result = engine.move_viewport(req.row, req.col)
+     resp = _board_response()
+     resp["action_result"] = result
+     return resp
+
+
+ class InspectReq(BaseModel):
+     center_row: int
+     center_col: int
+     radius: int = 1
+
+ @app.post("/inspect")
+ async def inspect(req: InspectReq):
+     if engine is None:
+         return {"error": "No scenario loaded."}
+     if engine.game_over:
+         return {"error": "Game is already over."}
+     if req.radius < 1 or req.radius > 3:
+         return {"error": "Radius must be between 1 and 3."}
+
+     engine.step_count += 1
+     engine._tick_pattern_memory()
+
+     visible = engine.get_visible_board()
+     region = []
+     for r in range(max(0, req.center_row - req.radius),
+                    min(engine.height, req.center_row + req.radius + 1)):
+         for c in range(max(0, req.center_col - req.radius),
+                        min(engine.width, req.center_col + req.radius + 1)):
+             cell = visible[r][c]
+             region.append({"row": r, "col": c, "state": cell["state"], "content": cell.get("content")})
+
+     resp = _board_response()
+     resp["action_result"] = {"cells": region}
+     return resp
+
+
+ @app.get("/status")
+ async def status():
+     if engine is None:
+         return {"error": "No scenario loaded."}
+     return engine.get_status()
+
+
+ @app.get("/board")
+ async def board():
+     return _board_response()
+
+
+ @app.get("/recall")
+ async def recall():
+     if engine is None:
+         return {"error": "No scenario loaded."}
+     bs = engine.get_board_state()
+     return {
+         "discovered_signals": bs.discovered_signals,
+         "memory_events": bs.memory_events,
+     }
+
+
+ class SubmitReq(BaseModel):
212
+ flagged_positions: list[list[int]] = []
213
+ safe_positions: list[list[int]] = []
214
+
215
+ @app.post("/submit")
216
+ async def submit(req: SubmitReq):
217
+ if engine is None:
218
+ return {"error": "No scenario loaded."}
219
+ result = engine.submit_solution(
220
+ flagged_positions=req.flagged_positions,
221
+ safe_positions=req.safe_positions,
222
+ )
223
+ resp = _board_response()
224
+ resp["action_result"] = result
225
+ return resp
226
+
227
+
228
+ if __name__ == "__main__":
229
+ uvicorn.run(app, host="0.0.0.0", port=8001)
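The `/inspect` endpoint above clamps a square region of radius 1–3 to the board edges. A minimal standalone sketch of that clamping logic (board dimensions and the `region_bounds` helper name are illustrative, not part of the repo):

```python
def region_bounds(center_row, center_col, radius, height, width):
    """Return the (row, col) pairs a radius-r inspect would cover,
    clamped to the board edges exactly as the endpoint's loops do."""
    cells = []
    for r in range(max(0, center_row - radius),
                   min(height, center_row + radius + 1)):
        for c in range(max(0, center_col - radius),
                       min(width, center_col + radius + 1)):
            cells.append((r, c))
    return cells

# A radius-1 inspect at the corner of a 4x4 board covers only 4 cells:
print(region_bounds(0, 0, 1, 4, 4))  # → [(0, 0), (0, 1), (1, 0), (1, 1)]
```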
pyproject.toml CHANGED
@@ -8,7 +8,7 @@ version = "0.1.0"
  description = "Visual Memory environment for OpenEnv — hidden-state visual reasoning and planning under partial observability"
  requires-python = ">=3.10"
  dependencies = [
-     "openenv-core @ git+https://github.com/meta-pytorch/OpenEnv.git@v0.2.1",
+     "openenv-core @ git+https://github.com/meta-pytorch/OpenEnv.git@v0.2.3",
      "fastmcp>=0.2.0",
      "fastapi>=0.115.0",
      "uvicorn>=0.24.0",
@@ -17,6 +17,8 @@ dependencies = [
      "numpy>=1.24.0",
      "svgwrite>=1.4.0",
      "python-dotenv>=1.0.0",
+     "litellm>=1.0.0",
+     "pyyaml>=6.0.0",
  ]
 
  [project.optional-dependencies]
@@ -27,8 +29,20 @@ server = "visual_memory.server.app:main"
 
  [tool.setuptools]
  include-package-data = true
-  packages = ["visual_memory", "visual_memory.server"]
-  package-dir = {"visual_memory" = ".", "visual_memory.server" = "server"}
+  packages = [
+      "visual_memory",
+      "visual_memory.server",
+      "visual_memory.rewards",
+      "visual_memory.scenarios",
+      "visual_memory.agent",
+  ]
+  [tool.setuptools.package-dir]
+  visual_memory = "."
+  "visual_memory.server" = "server"
+  "visual_memory.rewards" = "rewards"
+  "visual_memory.scenarios" = "scenarios"
+  "visual_memory.agent" = "agent"
 
  [tool.setuptools.package-data]
  visual_memory = ["openenv.yaml"]
+  "visual_memory.scenarios" = ["*.json"]
rewards/__init__.py ADDED
@@ -0,0 +1,36 @@
+ """
+ Visual Memory reward system — two reward types for evaluation and RL training.
+
+ Custom Checks (episode-level):
+     from rewards.checks import VisualMemoryChecker
+
+ OpenEnv Transforms (per-step):
+     from rewards.transforms import VisualMemoryStepTransform
+
+ Base utilities:
+     from rewards.base import Scenario, EpisodeLog, RewardCalculator, OpenEnvRewardCalculator
+ """
+
+ from .checks import VisualMemoryChecker
+ from .transforms import VisualMemoryStepTransform
+ from .base import (
+     Scenario,
+     EpisodeLog,
+     StepLog,
+     RewardBreakdown,
+     RewardCalculator,
+     StepRewardTransform,
+     OpenEnvRewardCalculator,
+ )
+
+ __all__ = [
+     "VisualMemoryChecker",
+     "VisualMemoryStepTransform",
+     "Scenario",
+     "EpisodeLog",
+     "StepLog",
+     "RewardBreakdown",
+     "RewardCalculator",
+     "StepRewardTransform",
+     "OpenEnvRewardCalculator",
+ ]
rewards/base.py ADDED
@@ -0,0 +1,313 @@
+ """
+ Base reward infrastructure — data classes, calculators, and transforms.
+
+ Merged from the shared repo-level modules into a self-contained file:
+ - Episode-level: RewardCalculator (custom mode)
+ - Per-step: StepRewardTransform + OpenEnvRewardCalculator (openenv mode)
+
+ Scoring formula (both modes):
+     total = 0.25 * quality/structural + 0.15 * efficiency + 0.60 * ground_truth + penalty
+
+ Usage:
+     from rewards.base import RewardCalculator, Scenario, EpisodeLog
+     calculator = RewardCalculator()
+     breakdown = calculator.calculate(episode, scenario, outcome_results)
+ """
+
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, List, Optional, Set
+
+ from openenv.core.env_server.interfaces import Transform
+ from openenv.core.env_server.mcp_types import CallToolObservation
+ from openenv.core.env_server.types import Observation
+
+
+ # ── Data Classes ──
+
+
+ @dataclass
+ class StepLog:
+     """Record of a single tool call made by the agent."""
+
+     tool_name: str
+     arguments: Dict[str, Any]
+     success: bool
+     result: Any = None
+     error: Optional[str] = None
+     timestamp: Optional[str] = None
+     elapsed: float = 0.0
+
+
+ @dataclass
+ class EpisodeLog:
+     """Record of all tool calls in one episode."""
+
+     steps: List[StepLog] = field(default_factory=list)
+
+     def add_step(
+         self,
+         tool_name: str,
+         arguments: Dict[str, Any],
+         success: bool,
+         result: Any = None,
+         error: Optional[str] = None,
+         timestamp: Optional[str] = None,
+         elapsed: float = 0.0,
+     ) -> None:
+         self.steps.append(
+             StepLog(
+                 tool_name=tool_name,
+                 arguments=arguments,
+                 success=success,
+                 result=result,
+                 error=error,
+                 timestamp=timestamp,
+                 elapsed=elapsed,
+             )
+         )
+
+     @property
+     def tools_used(self) -> List[str]:
+         return [s.tool_name for s in self.steps]
+
+     @property
+     def tools_used_set(self) -> Set[str]:
+         return set(self.tools_used)
+
+
+ @dataclass
+ class Scenario:
+     """Definition of a task for the agent."""
+
+     id: str
+     prompt: str
+     expected_tools: List[str]
+     max_steps: int
+     outcome_checks: List[Dict[str, Any]]
+
+
+ @dataclass
+ class RewardBreakdown:
+     """Detailed reward breakdown — useful for debugging and logging."""
+
+     structural: float = 0.0
+     ground_truth: float = 0.0
+     efficiency: float = 0.0
+     penalty: float = 0.0
+     total: float = 0.0
+     details: Dict[str, Any] = field(default_factory=dict)
+
+     def summary(self) -> str:
+         mode = self.details.get("reward_mode", "custom")
+         qual_label = "Quality" if mode == "openenv" else "Structural"
+         lines = [
+             f"  {qual_label + ':':14s}{self.structural:.2f}  (weight 0.25)",
+             f"  Efficiency:   {self.efficiency:.2f}  (weight 0.15)",
+             f"  Ground Truth: {self.ground_truth:.2f}  (weight 0.60)",
+         ]
+         if self.penalty < 0:
+             lines.append(f"  Penalty:      {self.penalty:.2f}  (hallucination)")
+         lines.append(f"  ────────────────────────")
+         lines.append(f"  TOTAL:        {self.total:.2f}")
+         return "\n".join(lines)
+
+
+ # ── Episode-Level Reward Calculator (custom mode) ──
+
+
+ class RewardCalculator:
+     """
+     Computes episode-level reward from logs + scenario + verification results.
+
+     Weights: structural (0.25), ground_truth (0.60), efficiency (0.15).
+     """
+
+     def __init__(
+         self,
+         w_structural: float = 0.25,
+         w_ground_truth: float = 0.60,
+         w_efficiency: float = 0.15,
+     ):
+         self.w_structural = w_structural
+         self.w_ground_truth = w_ground_truth
+         self.w_efficiency = w_efficiency
+
+     def calculate(
+         self,
+         episode: EpisodeLog,
+         scenario: Scenario,
+         outcome_results: List[float],
+     ) -> RewardBreakdown:
+         breakdown = RewardBreakdown()
+
+         breakdown.structural = self._structural_score(episode, scenario)
+         breakdown.ground_truth = self._ground_truth_score(outcome_results)
+         breakdown.efficiency = self._efficiency_score(episode, scenario)
+         breakdown.penalty = self._hallucination_penalty(episode, outcome_results)
+
+         breakdown.total = (
+             self.w_structural * breakdown.structural
+             + self.w_ground_truth * breakdown.ground_truth
+             + self.w_efficiency * breakdown.efficiency
+             + breakdown.penalty
+         )
+         breakdown.total = max(-1.0, min(1.0, breakdown.total))
+
+         breakdown.details = {
+             "tools_expected": scenario.expected_tools,
+             "tools_used": episode.tools_used,
+             "outcome_checks_score_sum": sum(outcome_results),
+             "outcome_checks_total": len(outcome_results),
+             "outcome_checks_avg": sum(outcome_results) / len(outcome_results) if outcome_results else 0.0,
+             "steps_taken": len(episode.steps),
+             "max_steps": scenario.max_steps,
+         }
+
+         return breakdown
+
+     def _structural_score(self, episode: EpisodeLog, scenario: Scenario) -> float:
+         if not episode.steps:
+             return 0.0
+
+         expected = set(scenario.expected_tools)
+         used = episode.tools_used_set
+
+         intersection = expected & used
+         precision = len(intersection) / len(used) if used else 0.0
+         recall = len(intersection) / len(expected) if expected else 0.0
+         f1 = (
+             2 * precision * recall / (precision + recall)
+             if (precision + recall) > 0
+             else 0.0
+         )
+
+         success_rate = sum(1 for s in episode.steps if s.success) / len(episode.steps)
+
+         unexpected_calls = sum(
+             1 for s in episode.steps if s.tool_name not in expected
+         )
+         unexpected_ratio = unexpected_calls / len(episode.steps)
+
+         return max(0.0, 0.6 * f1 + 0.4 * success_rate - unexpected_ratio * 0.3)
+
+     def _ground_truth_score(self, outcome_results: List[float]) -> float:
+         if not outcome_results:
+             return 0.0
+         return sum(outcome_results) / len(outcome_results)
+
+     def _efficiency_score(self, episode: EpisodeLog, scenario: Scenario) -> float:
+         if not episode.steps:
+             return 0.0
+         return max(0.0, 1.0 - len(episode.steps) / scenario.max_steps)
+
+     def _hallucination_penalty(
+         self, episode: EpisodeLog, outcome_results: List[float]
+     ) -> float:
+         if not episode.steps or not outcome_results:
+             return 0.0
+
+         all_calls_succeeded = all(s.success for s in episode.steps)
+         pass_rate = sum(outcome_results) / len(outcome_results)
+
+         if all_calls_succeeded and pass_rate == 0.0:
+             return -0.5
+         if all_calls_succeeded and pass_rate < 0.3:
+             return -0.2
+
+         return 0.0
+
+
+ # ── Per-Step Reward Transform (openenv mode) ──
+
+
+ class StepRewardTransform(Transform):
+     """
+     Gym-agnostic per-step reward transform.
+
+     Sets observation.reward based on tool call success/failure.
+     Subclass for gym-specific logic (see transforms.py).
+     """
+
+     def __call__(self, observation: Observation) -> Observation:
+         reward = self._compute_reward(observation)
+         observation.reward = reward
+         return observation
+
+     def _compute_reward(self, observation: Observation) -> float:
+         if isinstance(observation, CallToolObservation):
+             if observation.error is not None:
+                 return -0.5
+             return 1.0
+         return 0.0
+
+
+ class OpenEnvRewardCalculator:
+     """
+     Combines per-step transform rewards with ground truth verification.
+
+     Used as the alternative to RewardCalculator when --reward-mode openenv.
+
+     Quality is sign-based: only the sign of per-step rewards matters
+     (positive = productive, negative = harmful, zero = neutral).
+     """
+
+     def __init__(
+         self,
+         w_quality: float = 0.25,
+         w_efficiency: float = 0.15,
+         w_ground_truth: float = 0.60,
+     ):
+         self.w_quality = w_quality
+         self.w_efficiency = w_efficiency
+         self.w_ground_truth = w_ground_truth
+
+     def calculate(
+         self,
+         step_rewards: List[float],
+         outcome_results: List[bool],
+         max_steps: int = 0,
+         actual_steps: int = 0,
+     ) -> RewardBreakdown:
+         productive = sum(1 for r in step_rewards if r > 0)
+         harmful = sum(1 for r in step_rewards if r < 0)
+         active = productive + harmful
+         quality = productive / active if active > 0 else 0.0
+
+         if max_steps > 0 and actual_steps > 0:
+             efficiency = max(0.0, 1.0 - actual_steps / max_steps)
+         else:
+             efficiency = 0.0
+
+         gt_score = sum(outcome_results) / len(outcome_results) if outcome_results else 0.0
+
+         penalty = 0.0
+         if step_rewards and outcome_results:
+             all_positive = all(r > 0 for r in step_rewards)
+             if all_positive and gt_score == 0.0:
+                 penalty = -0.5
+             elif all_positive and gt_score < 0.3:
+                 penalty = -0.2
+
+         total = (
+             self.w_quality * quality
+             + self.w_efficiency * efficiency
+             + self.w_ground_truth * gt_score
+             + penalty
+         )
+         total = max(-1.0, min(1.0, total))
+
+         return RewardBreakdown(
+             structural=quality,
+             ground_truth=gt_score,
+             efficiency=efficiency,
+             penalty=penalty,
+             total=total,
+             details={
+                 "reward_mode": "openenv",
+                 "productive_steps": productive,
+                 "harmful_steps": harmful,
+                 "neutral_steps": len(step_rewards) - active,
+                 "actual_steps": actual_steps,
+                 "max_steps": max_steps,
+             },
+         )
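The openenv-mode scoring in rewards/base.py can be exercised without the openenv dependency. A minimal standalone recreation of the same formula (sign-based quality, step-budget efficiency, ground-truth average, and the hallucination penalty), with illustrative numbers:

```python
def openenv_total(step_rewards, outcome_results, max_steps, actual_steps):
    """Recompute: 0.25 * quality + 0.15 * efficiency + 0.60 * gt + penalty,
    clamped to [-1, 1], mirroring OpenEnvRewardCalculator.calculate."""
    productive = sum(1 for r in step_rewards if r > 0)
    harmful = sum(1 for r in step_rewards if r < 0)
    active = productive + harmful
    quality = productive / active if active else 0.0

    # Efficiency rewards finishing well under the step budget.
    if max_steps > 0 and actual_steps > 0:
        efficiency = max(0.0, 1.0 - actual_steps / max_steps)
    else:
        efficiency = 0.0

    gt = sum(outcome_results) / len(outcome_results) if outcome_results else 0.0

    # Hallucination penalty: every step "succeeded" yet ground truth failed.
    penalty = 0.0
    if step_rewards and outcome_results and all(r > 0 for r in step_rewards):
        if gt == 0.0:
            penalty = -0.5
        elif gt < 0.3:
            penalty = -0.2

    total = 0.25 * quality + 0.15 * efficiency + 0.60 * gt + penalty
    return max(-1.0, min(1.0, total))

# All steps productive, both checks pass, 10 of 40 steps used → ≈ 0.96
print(openenv_total([0.15, 0.2, 1.0], [True, True], 40, 10))
```

Note how a trajectory of all-positive step rewards that still fails every ground-truth check swings from near the top of the scale to negative territory, which is the point of the penalty term.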
rewards/checks.py ADDED
@@ -0,0 +1,283 @@
+ """
+ Visual Memory outcome checks — ground truth verification via episode log.
+
+ No database, no HTTP API. Ground truth is reconstructed from the episode
+ trajectory: each tool call result tells us what happened (reveal outcomes,
+ flag placements, submission verdicts). The checker also receives the
+ scenario JSON to know the correct answer.
+
+ Check types:
+     - solution_correct        : final submit_solution returned correct=True
+     - hazards_flagged_pct     : fraction of true hazards flagged (recall)
+     - flag_precision          : fraction of placed flags that are on real hazards
+     - keys_collected_pct      : fraction of keys collected (for collect_keys scenarios)
+     - safety_score            : fraction of reveals that did NOT hit a hazard
+     - evidence_before_commit  : agent used recall_log or inspect_region before submitting
+     - no_distractor_use       : agent never called auto_solve / peek / undo traps
+     - efficiency              : steps used / max_steps ratio
+     - irreversible_mistakes   : count of hazard hits is below threshold
+
+ Episode-level custom reward (Layer 2):
+     final_correctness     0.35
+     safety_score          0.20
+     evidence_support      0.15
+     irreversible_penalty  0.15
+     efficiency            0.10
+     unnecessary_guessing  0.05
+ """
+
+ import json
+ from typing import Any, Dict, List, Optional
+
+
+ class VisualMemoryChecker:
+     """Verifies outcomes from the episode log + scenario definition."""
+
+     def __init__(self, session_data: Any = None):
+         self._steps: List[dict] = []
+         self._scenario: Optional[dict] = None
+
+     def set_episode(self, episode) -> None:
+         self._steps = []
+         for step in episode.steps:
+             self._steps.append({
+                 "tool_name": step.tool_name,
+                 "arguments": step.arguments,
+                 "success": step.success,
+                 "result": self._parse_result(step.result),
+             })
+
+     def set_scenario(self, scenario: dict) -> None:
+         self._scenario = scenario
+
+     def check_all(self, checks: List[Dict[str, Any]]) -> List[bool]:
+         return [self._run_check(c) for c in checks]
+
+     def compute_episode_reward(self) -> Dict[str, Any]:
+         """Compute the weighted episode-level reward (Layer 2).
+
+         Returns dict with component scores, total, and details.
+         """
+         final = self._final_correctness()
+         safety = self._safety_score()
+         evidence = self._evidence_support()
+         irrev = self._irreversible_penalty()
+         eff = self._efficiency_score()
+         guess = self._unnecessary_guessing_penalty()
+
+         total = (
+             0.35 * final
+             + 0.20 * safety
+             + 0.15 * evidence
+             - 0.15 * irrev
+             + 0.10 * eff
+             - 0.05 * guess
+         )
+         total = max(-1.0, min(1.0, total))
+
+         return {
+             "final_correctness": round(final, 4),
+             "safety_score": round(safety, 4),
+             "evidence_support": round(evidence, 4),
+             "irreversible_penalty": round(irrev, 4),
+             "efficiency": round(eff, 4),
+             "unnecessary_guessing": round(guess, 4),
+             "total": round(total, 4),
+             "weights": {
+                 "final_correctness": 0.35,
+                 "safety_score": 0.20,
+                 "evidence_support": 0.15,
+                 "irreversible_penalty": 0.15,
+                 "efficiency": 0.10,
+                 "unnecessary_guessing": 0.05,
+             },
+         }
+
+     # ── Individual Check Handlers ──
+
+     def _run_check(self, check: Dict[str, Any]) -> bool:
+         check_type = check.get("type", "")
+         handler = getattr(self, f"_check_{check_type}", None)
+         if not handler:
+             raise ValueError(f"Unknown check type: {check_type}")
+         try:
+             return handler(check)
+         except Exception:
+             return False
+
+     def _check_solution_correct(self, check: dict) -> bool:
+         sub = self._get_submission_result()
+         if not sub:
+             return False
+         return sub.get("correct") is True
+
+     def _check_hazards_flagged_pct(self, check: dict) -> bool:
+         sub = self._get_submission_result()
+         if not sub:
+             return False
+         min_pct = check.get("min_pct", 1.0)
+         recall = sub.get("recall", 0.0)
+         return recall >= min_pct
+
+     def _check_flag_precision(self, check: dict) -> bool:
+         sub = self._get_submission_result()
+         if not sub:
+             return False
+         min_pct = check.get("min_pct", 1.0)
+         precision = sub.get("precision", 0.0)
+         return precision >= min_pct
+
+     def _check_keys_collected_pct(self, check: dict) -> bool:
+         sub = self._get_submission_result()
+         if not sub:
+             return False
+         collected = sub.get("keys_collected", 0)
+         required = sub.get("keys_required", 1)
+         min_pct = check.get("min_pct", 1.0)
+         return (collected / required) >= min_pct if required > 0 else False
+
+     def _check_safety_score(self, check: dict) -> bool:
+         min_score = check.get("min_score", 0.8)
+         return self._safety_score() >= min_score
+
+     def _check_evidence_before_commit(self, check: dict) -> bool:
+         return self._evidence_support() > 0.5
+
+     def _check_no_distractor_use(self, check: dict) -> bool:
+         traps = {"auto_solve", "peek_hidden_cell", "undo_last_action"}
+         for step in self._steps:
+             if step["tool_name"] in traps:
+                 return False
+         return True
+
+     def _check_efficiency(self, check: dict) -> bool:
+         max_ratio = check.get("max_ratio", 0.8)
+         return self._efficiency_score() >= (1.0 - max_ratio)
+
+     def _check_irreversible_mistakes(self, check: dict) -> bool:
+         max_hits = check.get("max_hits", 0)
+         hits = self._count_hazard_hits()
+         return hits <= max_hits
+
+     # ── Reward Component Scoring ──
+
+     def _final_correctness(self) -> float:
+         sub = self._get_submission_result()
+         if not sub:
+             return 0.0
+         if sub.get("correct") is True:
+             return 1.0
+         precision = sub.get("precision", 0.0)
+         recall = sub.get("recall", 0.0)
+         if precision + recall > 0:
+             return 2 * precision * recall / (precision + recall)
+         safe_found = sub.get("safe_found", 0)
+         safe_total = sub.get("safe_total", 1)
+         if safe_total > 0 and safe_found > 0:
+             return safe_found / safe_total
+         keys_collected = sub.get("keys_collected", 0)
+         keys_required = sub.get("keys_required", 1)
+         if keys_required > 0:
+             return keys_collected / keys_required
+         return 0.0
+
+     def _safety_score(self) -> float:
+         reveals = [s for s in self._steps if s["tool_name"] == "reveal_cell"]
+         if not reveals:
+             return 1.0
+         safe = sum(1 for s in reveals if not s["result"].get("hazard_hit"))
+         return safe / len(reveals)
+
+     def _evidence_support(self) -> float:
+         submit_idx = None
+         for i, s in enumerate(self._steps):
+             if s["tool_name"] == "submit_solution":
+                 submit_idx = i
+         if submit_idx is None:
+             return 0.0
+
+         evidence_tools = {"recall_log", "inspect_region", "get_board_view", "get_status"}
+         lookback = max(0, submit_idx - 5)
+         used_before = any(
+             self._steps[j]["tool_name"] in evidence_tools
+             for j in range(lookback, submit_idx)
+         )
+
+         total_reveals = sum(1 for s in self._steps if s["tool_name"] == "reveal_cell")
+         total_flags = sum(
+             1 for s in self._steps
+             if s["tool_name"] == "flag_cell" and s["result"].get("flagged")
+         )
+         made_progress = total_reveals >= 3 or total_flags >= 1
+
+         score = 0.0
+         if used_before:
+             score += 0.6
+         if made_progress:
+             score += 0.4
+         return min(1.0, score)
+
+     def _irreversible_penalty(self) -> float:
+         hits = self._count_hazard_hits()
+         if hits == 0:
+             return 0.0
+         if hits == 1:
+             return 0.5
+         return 1.0
+
+     def _efficiency_score(self) -> float:
+         if not self._scenario:
+             return 0.5
+         max_steps = self._scenario.get("max_steps", 50)
+         used = len(self._steps)
+         if used == 0:
+             return 0.0
+         ratio = used / max_steps
+         if ratio <= 0.5:
+             return 1.0
+         if ratio <= 0.8:
+             return 1.0 - (ratio - 0.5) / 0.3
+         return 0.0
+
+     def _unnecessary_guessing_penalty(self) -> float:
+         traps = {"auto_solve", "peek_hidden_cell", "undo_last_action"}
+         trap_count = sum(1 for s in self._steps if s["tool_name"] in traps)
+
+         repeated = 0
+         seen_reveals = set()
+         for s in self._steps:
+             if s["tool_name"] == "reveal_cell":
+                 key = (s["arguments"].get("row"), s["arguments"].get("col"))
+                 if key in seen_reveals:
+                     repeated += 1
+                 seen_reveals.add(key)
+
+         penalty = min(1.0, (trap_count * 0.3 + repeated * 0.2))
+         return penalty
+
+     # ── Helpers ──
+
+     def _get_submission_result(self) -> Optional[dict]:
+         for s in reversed(self._steps):
+             if s["tool_name"] == "submit_solution" and isinstance(s["result"], dict):
+                 return s["result"]
+         return None
+
+     def _count_hazard_hits(self) -> int:
+         return sum(
+             1 for s in self._steps
+             if s["tool_name"] == "reveal_cell" and s["result"].get("hazard_hit")
+         )
+
+     @staticmethod
+     def _parse_result(result: Any) -> dict:
+         if isinstance(result, dict):
+             return result
+         if isinstance(result, str):
+             try:
+                 return json.loads(result)
+             except (json.JSONDecodeError, TypeError):
+                 return {}
+         if hasattr(result, "data") and isinstance(result.data, dict):
+             return result.data
+         return {}
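The `_final_correctness` component above gives partial credit via the F1 of flag precision/recall when the submission is not fully correct. A hypothetical standalone sketch of just that fallback (the `partial_credit` name and the example values are illustrative, not from the repo):

```python
def partial_credit(precision, recall, correct=False):
    """Full credit for a correct submission; otherwise the harmonic
    mean (F1) of flag precision and recall; zero when both are zero."""
    if correct:
        return 1.0
    if precision + recall > 0:
        return 2 * precision * recall / (precision + recall)
    return 0.0

# Flagging every hazard (recall 1.0) with half the flags wasted
# (precision 0.5) earns 2/3 credit rather than zero:
print(partial_credit(0.5, 1.0))
```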
rewards/transforms.py ADDED
@@ -0,0 +1,167 @@
+ """
+ Visual Memory per-step reward transform.
+
+ Extends StepRewardTransform with game-aware scoring. Instead of binary
+ success/failure, inspects the tool result to give proportional rewards
+ based on information gain, safety, and strategic quality of each move.
+
+ Used when: --reward-mode openenv
+
+ Scoring by tool:
+     reveal_cell:
+         Safe reveal (signal/key/empty)  → +0.15
+         Hazard hit                      → -0.40
+         Error (already revealed, etc.)  → -0.10
+
+     flag_cell:
+         Successful flag                 → +0.20
+         Error (already flagged, etc.)   → -0.10
+
+     unflag_cell:
+         Successful unflag               → +0.05  (correcting a mistake is neutral-positive)
+         Error                           → -0.10
+
+     submit_solution:
+         Correct (perfect)               → +1.0
+         Partial (precision*recall > 0)  → +0.3 * F1
+         Wrong (zero overlap)            → -0.50
+
+     recall_log / get_action_history:
+         Success                         → +0.10  (evidence gathering)
+
+     inspect_region:
+         Success                         → +0.08
+         Error                           → -0.10
+
+     get_board_view / get_status / get_progress_stats:
+         Success                         → +0.05  (observation, low cost)
+
+     move_viewport:
+         Success                         → +0.10  (exploration in fog scenarios)
+         Error                           → -0.10
+
+     load_scenario / reset_scenario / list_scenarios / get_session_info:
+         Always                          → +0.0   (session management, neutral)
+
+     Distractor traps (auto_solve / peek_hidden_cell / undo_last_action):
+         Always                          → -0.25  (models must learn to avoid)
+ """
+
+ import json
+
+ from openenv.core.env_server.mcp_types import CallToolObservation
+ from openenv.core.env_server.types import Observation
+
+ from .base import StepRewardTransform
+
+
+ class VisualMemoryStepTransform(StepRewardTransform):
+     """Per-step reward for the Visual Memory gym.
+
+     Each tool call gets a reward based on its outcome. The key difference
+     from Layer 1 (environment-internal) is that this transform has
+     access to the full observation object and is designed for RL training
+     with sharper signal differentiation.
+     """
+
+     def _compute_reward(self, observation: Observation) -> float:
+         if not isinstance(observation, CallToolObservation):
+             return 0.0
+
+         if observation.error is not None:
+             return -0.5
+
+         tool_name = getattr(observation, "tool_name", "") or ""
+         result = self._extract_result(observation.result)
+
+         if tool_name == "reveal_cell":
+             return self._score_reveal(result)
+
+         if tool_name == "flag_cell":
+             return self._score_flag(result)
+
+         if tool_name == "unflag_cell":
+             return 0.05 if not self._is_error(result) else -0.10
+
+         if tool_name == "submit_solution":
+             return self._score_submission(result)
+
+         if tool_name in ("recall_log", "get_action_history"):
+             return 0.10 if not self._is_error(result) else 0.0
+
+         if tool_name == "inspect_region":
+             return 0.08 if not self._is_error(result) else -0.10
+
+         if tool_name in ("get_board_view", "get_status", "get_progress_stats"):
+             return 0.05 if not self._is_error(result) else 0.0
+
+         if tool_name == "move_viewport":
+             return 0.10 if not self._is_error(result) else -0.10
+
+         if tool_name in ("load_scenario", "reset_scenario", "list_scenarios", "get_session_info"):
+             return 0.0
+
+         if tool_name in ("auto_solve", "peek_hidden_cell", "undo_last_action"):
+             return -0.25
+
+         return 0.0
+
+     def _score_reveal(self, result: dict) -> float:
+         if not isinstance(result, dict):
+             return -0.10
+         if self._is_error(result):
+             return -0.10
+         if result.get("hazard_hit"):
+             return -0.40
+         return 0.15
+
+     def _score_flag(self, result: dict) -> float:
+         if not isinstance(result, dict):
+             return -0.10
+         if self._is_error(result):
+             return -0.10
+         if result.get("flagged"):
+             return 0.20
+         return 0.0
+
+     def _score_submission(self, result: dict) -> float:
+         if not isinstance(result, dict):
+             return -0.50
+         if self._is_error(result):
+             return -0.50
+         if result.get("correct") is True:
+             return 1.0
+
+         precision = result.get("precision", 0.0)
+         recall = result.get("recall", 0.0)
+         if precision + recall > 0:
+             f1 = 2 * precision * recall / (precision + recall)
+             return 0.3 * f1
+
+         keys_collected = result.get("keys_collected", 0)
+         keys_required = result.get("keys_required", 1)
+         if keys_required > 0 and keys_collected > 0:
+             return 0.3 * (keys_collected / keys_required)
+
+         return -0.50
+
+     @staticmethod
+     def _is_error(result) -> bool:
+         if isinstance(result, dict):
+             return "error" in result
+         return False
+
+     @staticmethod
+     def _extract_result(result):
+         if hasattr(result, "data"):
+             result = result.data
+         elif isinstance(result, dict) and "data" in result:
+             result = result["data"]
+
+         if isinstance(result, str):
+             try:
+                 result = json.loads(result)
+             except (json.JSONDecodeError, TypeError):
+                 pass
+
+         return result
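Most of the per-step scoring table above is a flat tool-name lookup. A simplified, hypothetical sketch of just that part (reveal_cell, flag_cell, and submit_solution are outcome-dependent and handled separately in the real transform; the `flat_step_reward` helper and its single -0.10 error penalty are illustrative assumptions):

```python
FLAT_REWARDS = {
    "unflag_cell": 0.05,
    "recall_log": 0.10,
    "get_action_history": 0.10,
    "inspect_region": 0.08,
    "get_board_view": 0.05,
    "get_status": 0.05,
    "get_progress_stats": 0.05,
    "move_viewport": 0.10,
    # Distractor traps always cost the agent, regardless of outcome.
    "auto_solve": -0.25,
    "peek_hidden_cell": -0.25,
    "undo_last_action": -0.25,
}

def flat_step_reward(tool_name, result):
    """Look up the fixed per-step reward; dict results carrying an
    "error" key are penalized (collapsed to one -0.10 value here)."""
    if isinstance(result, dict) and "error" in result:
        return -0.10
    return FLAT_REWARDS.get(tool_name, 0.0)
```

A table like this keeps the reward shaping auditable at a glance, which matters when tuning per-step signals for RL training.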
run_eval.py ADDED
@@ -0,0 +1,820 @@
+ #!/usr/bin/env python3
+ """
+ Evaluation Runner — run an LLM agent against Visual Memory gym scenarios.
+
+ Single-gym version of the repo-level run_eval.py, tailored for the
+ visual_memory environment. No --gym flag needed.
+
+ Usage:
+     # Single model
+     python run_eval.py --model gpt-5.4 --save --trajectory
+
+     # Multiple models in parallel
+     python run_eval.py --model gpt-5.4,claude-sonnet-4-6,claude-opus-4-6 --parallel 3 --save --trajectory
+
+     # Specific scenario
+     python run_eval.py --model gpt-5.4 --scenario directional_trap_8x8
+
+     # OpenEnv per-step reward mode
+     python run_eval.py --model gpt-5.4 --reward-mode openenv --save --trajectory
+
+ Prerequisites:
+     1. pip install -e .
+     2. docker build -t openenv-visual-memory -f server/Dockerfile .
+     3. docker run -d --name visual-memory -p 8000:8000 openenv-visual-memory
+ """
+
+ import argparse
+ import json
+ import logging
+ import os
+ import sys
+ import time
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from datetime import datetime, timezone, timedelta
+ from typing import Any, Dict, List
+
+ IST = timezone(timedelta(hours=5, minutes=30))
+
+ from dotenv import load_dotenv
+
+ load_dotenv(os.path.join(os.path.dirname(__file__), ".env"))
+
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+ from openenv import AutoEnv
+
+ from agent.runner import AgentRunner
+ from rewards.base import RewardBreakdown
+ from rewards.checks import VisualMemoryChecker
+ from rewards.transforms import VisualMemoryStepTransform
+ from scenarios.definitions import VISUAL_MEMORY_SCENARIOS
+
+ logger = logging.getLogger(__name__)
+
+ GYM_NAME = "visual_memory"
+ OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs")
+
+
+ def _resolve_base_url() -> str:
+     import importlib.resources
+     import yaml
+
+     try:
+         ref = importlib.resources.files(GYM_NAME).joinpath("openenv.yaml")
+         with importlib.resources.as_file(ref) as f:
+             manifest = yaml.safe_load(f.read_text())
+         port = manifest.get("port", 8000)
+         return f"http://localhost:{port}"
+     except Exception:
+         logger.warning("Could not read openenv.yaml, defaulting to port 8000")
+         return "http://localhost:8000"
+
+
+ def _fetch_gym_metadata(base_url: str) -> dict | None:
+     import httpx
+
+     try:
+         resp = httpx.get(f"{base_url}/metadata", timeout=5.0)
+         resp.raise_for_status()
+         data = resp.json()
+         data.pop("readme_content", None)
+         return data
+     except Exception as e:
+         logger.debug(f"Failed to fetch /metadata from {base_url}: {e}")
+         return None
+
+
+ def divider(text: str = ""):
+     print(f"\n{'=' * 70}")
+     if text:
+         print(f" {text}")
+     print(f"{'=' * 70}")
+
+
+ def print_breakdown(breakdown: RewardBreakdown):
+     print(breakdown.summary())
+     print()
+     print(f" Details: {breakdown.details}")
+
+
+ def save_results_to_markdown(
+     results: List[Dict[str, Any]],
+     model: str,
+     output_path: str,
+     total_elapsed: float,
+     temperature: float,
+     run_id: str = "",
+     reward_mode: str = "custom",
+     gym_version: str = "unknown",
+ ):
+     os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+     timestamp = datetime.now(IST).strftime("%Y-%m-%d %H:%M:%S")
+     is_new_file = not os.path.exists(output_path)
+
+     with open(output_path, "a") as f:
+         if is_new_file:
+             f.write(f"# Visual Memory Gym — Evaluation Results\n\n")
+             f.write(f"**Run ID**: `{run_id}` \n")
+             f.write(f"**Gym Version**: `{gym_version}`\n\n")
+             f.write(f"Evaluation results for the **visual_memory** gym across different LLM models.\n\n")
+             if reward_mode == "openenv":
+                 f.write(f"**Reward Mode**: `openenv` — per-step rewards from `rewards/transforms.py` + ground truth\n\n")
+                 f.write(f"Each model is evaluated on the same set of scenarios. ")
+                 f.write(f"Rewards are computed using OpenEnv transforms:\n")
+                 f.write(f"- **Quality** (0.25) — fraction of productive steps\n")
+                 f.write(f"- **Ground Truth** (0.60) — episode outcome checks\n")
+                 f.write(f"- **Efficiency** (0.15) — step budget usage\n")
+                 f.write(f"- **Hallucination Penalty** — tools say success but ground truth disagrees\n\n")
+             else:
+                 f.write(f"**Reward Mode**: `custom` — episode-level rewards from `rewards/base.py`\n\n")
+                 f.write(f"Each model is evaluated on the same set of scenarios. ")
+                 f.write(f"Rewards are computed by `rewards/base.py` using:\n")
+                 f.write(f"- **Structural** (0.25) — right tools called, no errors\n")
+                 f.write(f"- **Ground Truth** (0.60) — episode outcome checks\n")
+                 f.write(f"- **Efficiency** (0.15) — solved in reasonable steps\n")
+                 f.write(f"- **Hallucination Penalty** — tools say success but ground truth disagrees\n\n")
+             f.write(f"Trajectories: `outputs/trajectories/{run_id}/`\n\n")
+             f.write(f"---\n\n")
+
+         safe_model = model.replace("/", "_").replace(":", "_")
+         f.write(f"## Model: `{model}`\n\n")
+         f.write(f"- **Date**: {timestamp}\n")
+         f.write(f"- **Temperature**: {temperature}\n")
+         f.write(f"- **Reward Mode**: {reward_mode}\n")
+         f.write(f"- **Total Time**: {total_elapsed:.1f}s\n")
+         f.write(f"- **Trajectory**: `outputs/trajectories/{run_id}/{safe_model}.json`\n\n")
+
+         if reward_mode == "openenv":
+             f.write(f"| Scenario | Quality | Ground Truth | Penalty | **Total** | Steps | Time |\n")
+             f.write(f"|---|:---:|:---:|:---:|:---:|:---:|:---:|\n")
+         else:
+             f.write(f"| Scenario | Structural | Ground Truth | Efficiency | Penalty | **Total** | Steps | Time |\n")
+             f.write(f"|---|:---:|:---:|:---:|:---:|:---:|:---:|:---:|\n")
+
+         total_reward = 0.0
+         for r in results:
+             bd = r.get("breakdown")
+             if bd:
+                 if reward_mode == "openenv":
+                     f.write(
+                         f"| {r['scenario']} "
+                         f"| {bd.structural:.2f} "
+                         f"| {bd.ground_truth:.2f} "
+                         f"| {bd.penalty:.2f} "
+                         f"| **{bd.total:.2f}** "
+                         f"| {r['steps']} "
+                         f"| {r['elapsed']:.1f}s |\n"
+                     )
+                 else:
+                     f.write(
+                         f"| {r['scenario']} "
+                         f"| {bd.structural:.2f} "
+                         f"| {bd.ground_truth:.2f} "
+                         f"| {bd.efficiency:.2f} "
+                         f"| {bd.penalty:.2f} "
+                         f"| **{bd.total:.2f}** "
+                         f"| {r['steps']} "
+                         f"| {r['elapsed']:.1f}s |\n"
+                     )
+                 total_reward += bd.total
+             else:
+                 cols = "| — | — | — " if reward_mode == "openenv" else "| — | — | — | — "
+                 f.write(
+                     f"| {r['scenario']} "
+                     f"{cols}"
+                     f"| **ERROR** "
+                     f"| {r['steps']} "
+                     f"| {r['elapsed']:.1f}s |\n"
+                 )
+
+         avg = total_reward / len(results) if results else 0.0
+         f.write(f"\n**Average Reward: {avg:.2f}**\n\n")
+         f.write(f"---\n\n")
+
+     logger.info(f"Results saved to {output_path}")
+
+
+ def save_trajectory(
+     results: List[Dict[str, Any]],
+     scenarios: list,
+     model: str,
+     temperature: float,
+     total_elapsed: float,
+     run_id: str = "",
+     reward_mode: str = "custom",
+     gym_version: str = "unknown",
+ ):
+     run_ts = datetime.now(IST).isoformat()
+
+     safe_model = model.replace("/", "_").replace(":", "_")
+     filename = f"{safe_model}.json"
+
+     traj_dir = os.path.join(OUTPUT_DIR, "trajectories", run_id)
+     os.makedirs(traj_dir, exist_ok=True)
+     filepath = os.path.join(traj_dir, filename)
+
+     trajectory = {
+         "run_id": run_id or "untagged",
+         "model": model,
+         "gym": GYM_NAME,
+         "gym_version": gym_version,
+         "timestamp": run_ts,
+         "temperature": temperature,
+         "reward_mode": reward_mode,
+         "total_elapsed_s": round(total_elapsed, 2),
+         "total_scenarios": len(results),
+         "scenarios": [],
+     }
+
+     for r, scenario in zip(results, scenarios):
+         scenario_entry = {
+             "scenario_id": scenario.id,
+             "prompt": scenario.prompt,
+             "expected_tools": scenario.expected_tools,
+             "max_steps": scenario.max_steps,
+             "elapsed_s": round(r["elapsed"], 2),
+         }
+
+         episode = r.get("episode")
+         if episode:
+             steps = []
+             for i, step in enumerate(episode.steps, 1):
+                 result_data = step.result
+                 if isinstance(result_data, str):
+                     try:
+                         result_data = json.loads(result_data)
+                     except (json.JSONDecodeError, TypeError):
+                         pass
+
+                 steps.append({
+                     "step": i,
+                     "timestamp": step.timestamp,
+                     "tool_name": step.tool_name,
+                     "arguments": step.arguments,
+                     "success": step.success,
+                     "result": result_data,
+                     "error": step.error,
+                     "elapsed_s": round(step.elapsed, 3),
+                 })
+             scenario_entry["steps"] = steps
+             scenario_entry["total_steps"] = len(steps)
+         else:
+             scenario_entry["steps"] = []
+             scenario_entry["total_steps"] = 0
+             scenario_entry["error"] = r.get("error", "Unknown error")
+
+         outcome_results = r.get("outcome_results", [])
+         checks = []
+         for check_def, passed in zip(scenario.outcome_checks, outcome_results):
+             checks.append({
+                 "check": check_def,
+                 "passed": passed,
+             })
+         scenario_entry["outcome_checks"] = checks
+
+         bd = r.get("breakdown")
+         if bd:
+             scenario_entry["reward"] = {
+                 "structural": round(bd.structural, 4),
+                 "ground_truth": round(bd.ground_truth, 4),
+                 "efficiency": round(bd.efficiency, 4),
+                 "penalty": round(bd.penalty, 4),
+                 "total": round(bd.total, 4),
+             }
+         else:
+             scenario_entry["reward"] = None
+
+         trajectory["scenarios"].append(scenario_entry)
+
+     totals = [s["reward"]["total"] for s in trajectory["scenarios"] if s.get("reward")]
+     trajectory["avg_reward"] = round(sum(totals) / len(totals), 4) if totals else 0.0
+
+     with open(filepath, "w") as f:
+         json.dump(trajectory, f, indent=2, default=str)
+
+     print(f"\n Trajectory saved: {filepath}")
+     logger.info(f"Trajectory saved to {filepath}")
+     return filepath
+
+
+ # ── Model Workers ──
+
+ def _run_single_model(
+     model: str,
+     base_url: str,
+     scenarios: list,
+     temperature: float,
+     max_tokens: int,
+     reward_mode: str,
+     run_id: str,
+     save: bool,
+     trajectory: bool,
+     verbose: bool,
+     gym_version: str = "unknown",
+ ) -> Dict[str, Any]:
+     model_start = time.time()
+     model_results = []
+
+     def _connect():
+         client = AutoEnv.from_env(GYM_NAME, base_url=base_url)
+         client.__enter__()
+         xform = VisualMemoryStepTransform() if reward_mode == "openenv" else None
+         rnr = AgentRunner(
+             model=model,
+             env_client=client,
+             temperature=temperature,
+             max_tokens=max_tokens,
+             reward_mode=reward_mode,
+             transform=xform,
+         )
+         return client, rnr
+
+     env_client, runner = _connect()
+     checker = VisualMemoryChecker()
+
+     WS_RETRY_ERRORS = ("ConnectionClosed", "ConnectionClosedOK", "ConnectionClosedError", "sent 1000")
+     MAX_WS_RETRIES = 3
+
+     try:
+         for i, scenario in enumerate(scenarios, 1):
+             print(f"\n [{model}] Scenario {i}/{len(scenarios)}: {scenario.id}")
+
+             start = time.time()
+             last_error = None
+             for attempt in range(MAX_WS_RETRIES + 1):
+                 try:
+                     if attempt > 0:
+                         logger.info(f"[{model}] Reconnecting (attempt {attempt + 1}) for {scenario.id}")
+                         print(f" [{model}] Reconnecting WebSocket (attempt {attempt + 1})...")
+                         try:
+                             env_client.__exit__(None, None, None)
+                         except Exception:
+                             pass
+                         time.sleep(2 * attempt)
+                         env_client, runner = _connect()
+
+                     episode, breakdown = runner.run_scenario(scenario, checker)
+                     elapsed = time.time() - start
+
+                     if hasattr(checker, "set_episode"):
+                         checker.set_episode(episode)
+
+                     outcome_results = checker.check_all(scenario.outcome_checks)
+
+                     model_results.append({
+                         "scenario": scenario.id,
+                         "total_reward": breakdown.total,
+                         "breakdown": breakdown,
+                         "steps": len(episode.steps),
+                         "elapsed": elapsed,
+                         "episode": episode,
+                         "outcome_results": outcome_results,
+                     })
+
+                     print(f" [{model}] {scenario.id}: {breakdown.total:.2f} ({len(episode.steps)} steps, {elapsed:.1f}s)")
+                     last_error = None
+                     break
+
+                 except Exception as e:
+                     last_error = e
+                     is_ws_error = any(tok in type(e).__name__ or tok in str(e) for tok in WS_RETRY_ERRORS)
+                     if is_ws_error and attempt < MAX_WS_RETRIES:
+                         logger.warning(f"[{model}] WebSocket error on {scenario.id}: {e}")
+                         continue
+                     raise
+
+             if last_error is not None:
+                 elapsed = time.time() - start
+                 logger.exception(f"[{model}] Scenario {scenario.id} failed")
+                 model_results.append({
+                     "scenario": scenario.id,
+                     "total_reward": 0.0,
+                     "breakdown": None,
+                     "steps": 0,
+                     "elapsed": elapsed,
+                     "error": str(last_error),
+                 })
+                 print(f" [{model}] {scenario.id}: ERROR - {last_error}")
+
+     finally:
+         try:
+             env_client.__exit__(None, None, None)
+         except Exception:
+             pass
+
+     model_elapsed = time.time() - model_start
+
+     if save:
+         output_path = os.path.join(OUTPUT_DIR, "results", f"{run_id}.md")
+         save_results_to_markdown(
+             results=model_results,
+             model=model,
+             output_path=output_path,
+             total_elapsed=model_elapsed,
+             temperature=temperature,
+             run_id=run_id,
+             reward_mode=reward_mode,
+             gym_version=gym_version,
+         )
+
+     if trajectory:
+         save_trajectory(
+             results=model_results,
+             scenarios=scenarios,
+             model=model,
+             temperature=temperature,
+             total_elapsed=model_elapsed,
+             run_id=run_id,
+             reward_mode=reward_mode,
+             gym_version=gym_version,
+         )
+
+     return {
+         "model": model,
+         "results": model_results,
+         "elapsed": model_elapsed,
+     }
+
+
+ def _run_single_model_detailed(
+     model: str,
+     base_url: str,
+     scenarios: list,
+     temperature: float,
+     max_tokens: int,
+     reward_mode: str,
+     run_id: str,
+     save: bool,
+     trajectory: bool,
+     gym_version: str = "unknown",
+ ) -> Dict[str, Any]:
+     model_start = time.time()
+     results = []
+
+     env_client = AutoEnv.from_env(GYM_NAME, base_url=base_url)
+     env_client.__enter__()
+
+     checker = VisualMemoryChecker()
+
+     transform = VisualMemoryStepTransform() if reward_mode == "openenv" else None
+
+     runner = AgentRunner(
+         model=model,
+         env_client=env_client,
+         temperature=temperature,
+         max_tokens=max_tokens,
+         reward_mode=reward_mode,
+         transform=transform,
+     )
+
+     try:
+         for i, scenario in enumerate(scenarios, 1):
+             divider(f"Scenario {i}/{len(scenarios)}: {scenario.id}")
+             print(f" Prompt: {scenario.prompt[:120]}...")
+             print(f" Expected tools: {scenario.expected_tools}")
+             print(f" Max steps: {scenario.max_steps}")
+             print()
+
+             start = time.time()
+             try:
+                 episode, breakdown = runner.run_scenario(scenario, checker)
+                 elapsed = time.time() - start
+
+                 print()
+                 print(" -- Agent Actions --")
+                 for step in episode.steps:
+                     status = "OK" if step.success else "FAIL"
+                     args_str = _short_json(step.arguments)
+                     print(f" [{status}] {step.tool_name}({args_str})")
+                 print(f" Steps taken: {len(episode.steps)}")
+
+                 if hasattr(checker, "set_episode"):
+                     checker.set_episode(episode)
+
+                 print()
+                 print(" -- Ground Truth Verification --")
+                 outcome_results = checker.check_all(scenario.outcome_checks)
+                 for check, score in zip(scenario.outcome_checks, outcome_results):
+                     status = "PASS" if score else "FAIL"
+                     label = _check_label(check)
+                     print(f" [{status}] {check['type']}: {label}")
+
+                 print()
+                 print(" -- Reward Breakdown --")
+                 print_breakdown(breakdown)
+                 print(f"\n Completed in {elapsed:.1f}s")
+
+                 results.append({
+                     "scenario": scenario.id,
+                     "total_reward": breakdown.total,
+                     "breakdown": breakdown,
+                     "steps": len(episode.steps),
+                     "elapsed": elapsed,
+                     "episode": episode,
+                     "outcome_results": outcome_results,
+                 })
+
+             except Exception as e:
+                 elapsed = time.time() - start
+                 print(f"\n ERROR: {e}")
+                 logger.exception(f"Scenario {scenario.id} failed")
+                 results.append({
+                     "scenario": scenario.id,
+                     "total_reward": 0.0,
+                     "breakdown": None,
+                     "steps": 0,
+                     "elapsed": elapsed,
+                     "error": str(e),
+                 })
+
+     finally:
+         env_client.__exit__(None, None, None)
+         logger.info("AutoEnv client disconnected.")
+
+     model_elapsed = time.time() - model_start
+
+     if save:
+         output_path = os.path.join(OUTPUT_DIR, "results", f"{run_id}.md")
+         save_results_to_markdown(
+             results=results,
+             model=model,
+             output_path=output_path,
+             total_elapsed=model_elapsed,
+             temperature=temperature,
+             run_id=run_id,
+             reward_mode=reward_mode,
+             gym_version=gym_version,
+         )
+         print(f"\n Results saved: {output_path}")
+
+     if trajectory:
+         save_trajectory(
+             results=results,
+             scenarios=scenarios,
+             model=model,
+             temperature=temperature,
+             total_elapsed=model_elapsed,
+             run_id=run_id,
+             reward_mode=reward_mode,
+             gym_version=gym_version,
+         )
+
+     return {
+         "model": model,
+         "results": results,
+         "elapsed": model_elapsed,
+     }
+
+
+ def _check_label(check: dict) -> str:
+     for key in ("min_score", "min_pct", "max_hits"):
+         if key in check and key != "type":
+             return str(check[key])
+     return check.get("type", "?")
+
+
+ def _short_json(obj, max_len=80):
+     s = json.dumps(obj, default=str)
+     return s if len(s) <= max_len else s[:max_len] + "..."
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Evaluate an LLM agent against Visual Memory gym scenarios.",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   python run_eval.py --model gpt-5.4 --save --trajectory
+   python run_eval.py --model gpt-5.4,claude-sonnet-4-6 --parallel 2 --reward-mode openenv
+   python run_eval.py --model gpt-5.4 --scenario directional_trap_8x8
+ """,
+     )
+     parser.add_argument(
+         "--model",
+         default=os.getenv("LLM_MODEL", "gpt-4o"),
+         help="LiteLLM model string, or comma-separated for parallel mode "
+         "(e.g., 'gpt-5.4' or 'gpt-5.4,claude-sonnet-4-6')",
+     )
+     parser.add_argument(
+         "--scenario",
+         default=None,
+         help="Run a specific scenario by ID (default: run all 10)",
+     )
+     parser.add_argument(
+         "--temperature",
+         type=float,
+         default=float(os.getenv("LLM_TEMPERATURE", "0.0")),
+         help="LLM sampling temperature (default: 0.0)",
+     )
+     parser.add_argument(
+         "--max-tokens",
+         type=int,
+         default=int(os.getenv("LLM_MAX_TOKENS", "1024")),
+         help="Max tokens per LLM response (default: 1024)",
+     )
+     parser.add_argument(
+         "--save",
+         action="store_true",
+         help="Save results to outputs/results/<run_id>.md",
+     )
+     parser.add_argument(
+         "--trajectory",
+         action="store_true",
+         help="Save detailed trajectory JSON to outputs/trajectories/<run_id>/",
+     )
+     parser.add_argument(
+         "--run-id",
+         default=None,
+         help="Run identifier (default: auto-generated as run_YYYYMMDD_HHMM)",
+     )
+     parser.add_argument(
+         "--reward-mode",
+         default="custom",
+         choices=["custom", "openenv"],
+         help="Reward mode: 'custom' (episode-level) or 'openenv' (per-step). Default: custom",
+     )
+     parser.add_argument(
+         "--parallel",
+         type=int,
+         default=1,
+         help="Number of models to evaluate in parallel (default: 1 = sequential)",
+     )
+     parser.add_argument(
+         "--verbose", "-v",
+         action="store_true",
+         help="Enable debug logging",
+     )
+
+     args = parser.parse_args()
+
+     models = [m.strip() for m in args.model.split(",") if m.strip()]
+
+     if args.run_id:
+         run_id = args.run_id
+     else:
+         run_id = f"run_{datetime.now(IST).strftime('%Y%m%d_%H%M')}"
+
+     log_level = logging.DEBUG if args.verbose else logging.INFO
+     logging.basicConfig(
+         level=log_level,
+         format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+         datefmt="%H:%M:%S",
+     )
+
+     base_url = _resolve_base_url()
+
+     scenarios = VISUAL_MEMORY_SCENARIOS
+     if args.scenario:
+         scenarios = [s for s in scenarios if s.id == args.scenario]
+         if not scenarios:
+             available = [s.id for s in VISUAL_MEMORY_SCENARIOS]
+             print(f"Error: Scenario '{args.scenario}' not found. Available: {available}")
+             sys.exit(1)
+
+     divider("AutoEnv Discovery")
+     print(f" Discovering gym '{GYM_NAME}' via AutoEnv...")
+     env_info = AutoEnv.get_env_info(GYM_NAME)
+     print(f" Found: {env_info['name']} (package: {env_info['package']}, v{env_info['version']})")
+     print(f" Base URL: {base_url} (auto-derived from openenv.yaml)")
+
+     gym_metadata = _fetch_gym_metadata(base_url)
+     if gym_metadata:
+         print(f"\n -- Environment Metadata (GET {base_url}/metadata) --")
+         print(f" Name: {gym_metadata.get('name', 'N/A')}")
+         print(f" Version: {gym_metadata.get('version', 'N/A')}")
+         print(f" Description: {gym_metadata.get('description', 'N/A')}")
+     else:
+         print(f"\n Warning: Could not fetch /metadata from {base_url} (server may not be running)")
+
+     is_parallel = args.parallel > 1 and len(models) > 1
+     mode_str = f"Parallel ({args.parallel} workers)" if is_parallel else "Sequential"
+     gym_version = gym_metadata.get("version", "unknown") if gym_metadata else "unknown"
+
+     divider("LLM Evaluation Run")
+     print(f" Gym: {GYM_NAME} (v{gym_version})")
+     print(f" Models: {', '.join(models)}")
+     print(f" Run ID: {run_id}")
+     print(f" Mode: {mode_str}")
+     print(f" Base URL: {base_url}")
+     print(f" Scenarios: {len(scenarios)} of {len(VISUAL_MEMORY_SCENARIOS)}")
+     print(f" Temperature: {args.temperature}")
+     print(f" Reward Mode: {args.reward_mode}")
+     print(f" Output Dir: {OUTPUT_DIR}")
+
+     total_start = time.time()
+     all_model_results = []
+
+     if is_parallel:
+         divider(f"Parallel Evaluation ({len(models)} models, {args.parallel} workers)")
+
+         max_workers = min(args.parallel, len(models))
+         with ThreadPoolExecutor(max_workers=max_workers) as executor:
+             futures = {}
+             for idx, model in enumerate(models):
+                 if idx > 0:
+                     time.sleep(3)
+                 future = executor.submit(
+                     _run_single_model,
+                     model=model,
+                     base_url=base_url,
+                     scenarios=scenarios,
+                     temperature=args.temperature,
+                     max_tokens=args.max_tokens,
+                     reward_mode=args.reward_mode,
+                     run_id=run_id,
+                     save=args.save,
+                     trajectory=args.trajectory,
+                     verbose=args.verbose,
+                     gym_version=gym_version,
+                 )
+                 futures[future] = model
+
+             for future in as_completed(futures):
+                 model = futures[future]
+                 try:
+                     result = future.result()
+                     all_model_results.append(result)
+                     print(f"\n {model} completed in {result['elapsed']:.1f}s")
+                 except Exception as e:
+                     print(f"\n {model} FAILED: {e}")
+                     logger.exception(f"Model {model} failed")
+                     all_model_results.append({
+                         "model": model,
+                         "results": [],
+                         "elapsed": 0.0,
+                         "error": str(e),
+                     })
+     else:
+         for model in models:
+             if len(models) > 1:
+                 divider(f"Model: {model}")
+
+             if len(models) == 1:
+                 result = _run_single_model_detailed(
+                     model=model,
+                     base_url=base_url,
+                     scenarios=scenarios,
+                     temperature=args.temperature,
+                     max_tokens=args.max_tokens,
+                     reward_mode=args.reward_mode,
+                     run_id=run_id,
+                     save=args.save,
+                     trajectory=args.trajectory,
+                     gym_version=gym_version,
+                 )
+             else:
+                 result = _run_single_model(
+                     model=model,
+                     base_url=base_url,
+                     scenarios=scenarios,
+                     temperature=args.temperature,
+                     max_tokens=args.max_tokens,
+                     reward_mode=args.reward_mode,
+                     run_id=run_id,
+                     save=args.save,
+                     trajectory=args.trajectory,
+                     verbose=args.verbose,
+                     gym_version=gym_version,
+                 )
+             all_model_results.append(result)
+
+     total_elapsed = time.time() - total_start
+     divider("Evaluation Summary")
+
+     for mr in all_model_results:
+         model = mr["model"]
+         results = mr.get("results", [])
+         model_elapsed = mr.get("elapsed", 0.0)
+
+         if not results:
+             print(f"\n Model: {model} -- FAILED ({mr.get('error', 'unknown')})")
+             continue
+
+         total_reward = sum(r["total_reward"] for r in results)
+         avg_reward = total_reward / len(results) if results else 0.0
+
+         print(f"\n Model: {model}")
+         print(f" Time: {model_elapsed:.1f}s")
+         print(f" {'Scenario':<35} {'Reward':>8} {'Steps':>6} {'Time':>6}")
+         print(f" {'-' * 35} {'-' * 8} {'-' * 6} {'-' * 6}")
+
+         for r in results:
+             reward_str = f"{r['total_reward']:.2f}" if r.get("breakdown") else "ERROR"
+             print(f" {r['scenario']:<35} {reward_str:>8} {r['steps']:>6} {r['elapsed']:>5.1f}s")
+
+         print(f" {'-' * 35} {'-' * 8} {'-' * 6} {'-' * 6}")
+         print(f" {'AVERAGE':<35} {avg_reward:>8.2f}")
+
+     if len(models) > 1:
+         print(f"\n Total time (all models): {total_elapsed:.1f}s")
+         if is_parallel:
+             seq_time = sum(mr.get("elapsed", 0.0) for mr in all_model_results)
+             speedup = seq_time / total_elapsed if total_elapsed > 0 else 1.0
+             print(f" Sequential equivalent: {seq_time:.1f}s")
+             print(f" Speedup: {speedup:.1f}x")
+
+
+ if __name__ == "__main__":
+     main()
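The WebSocket reconnect loop inside `_run_single_model` above is the part most worth lifting out for reuse. A minimal standalone sketch of the same pattern (hypothetical function names; the backoff is shortened here for illustration, while the runner itself sleeps `2 * attempt` seconds):

```python
import time

WS_RETRY_ERRORS = ("ConnectionClosed", "sent 1000")
MAX_WS_RETRIES = 3

def run_with_reconnect(run_once, reconnect):
    """Retry a callable across transient WebSocket drops.

    Mirrors the runner's pattern: only connection-closed errors are
    retried (after a backoff and a fresh connection); any other
    exception propagates immediately.
    """
    for attempt in range(MAX_WS_RETRIES + 1):
        try:
            if attempt > 0:
                time.sleep(0.01 * attempt)  # runner uses 2 * attempt
                reconnect()                 # tear down and rebuild the client
            return run_once()
        except Exception as e:
            is_ws = any(tok in type(e).__name__ or tok in str(e)
                        for tok in WS_RETRY_ERRORS)
            if is_ws and attempt < MAX_WS_RETRIES:
                continue
            raise
```

Matching on exception class names and message substrings, rather than importing the websocket library's exception types, keeps the eval runner decoupled from whichever transport the OpenEnv client happens to use.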
scenarios/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .definitions import VISUAL_MEMORY_SCENARIOS
+
+ __all__ = ["VISUAL_MEMORY_SCENARIOS"]
scenarios/ambiguous_cluster_10x10.json CHANGED
@@ -12,6 +12,7 @@
   "flags_count": 22,
   "difficulty": "hard",
   "description": "10x10 grid, 18 hazards, 6 decoys. Signals give ambiguous ranges (e.g. 1-3). Decoys visually resemble keys. Only 2 hazard reveals allowed before game over. 35 steps forces efficient exploration. Models must cross-reference multiple range signals to narrow down hazard locations — single-cell reasoning fails because ranges overlap.",
+  "how_to_play": "Signals show ranges like '1-3' meaning 1 to 3 adjacent hazards — not exact counts. Reveal multiple cells and cross-reference overlapping ranges to narrow down hazard positions. Flag all 18 hazards and submit. Ignore decoys (diamond symbols) — they are harmless distractions.",
   "tags": ["partial_observability", "visual_clutter", "unsafe_guess_penalty"],
   "win_condition": {
     "type": "flag_all_hazards"
scenarios/cascading_deduction_11x11.json CHANGED
@@ -12,6 +12,7 @@
   "flags_count": 28,
   "difficulty": "hard",
   "description": "11x11 grid, 25 hazards, 5 decoys. Partial signals reveal only a subset of hazard directions plus a total hint. With 121 cells and 25 hazards, over 20% of the board is dangerous. The agent must chain partial deductions: signal A reveals 'N,E' out of 3 total → signal B at the inferred position reveals 'S,W' out of 2 → confirming hazard at intersection. Single-step reasoning sees incomplete clues. Models must track partial constraint sets across many reveals.",
+  "how_to_play": "Signals show partial directions like 'N,E' with a hint '3 total' — meaning 3 hazards nearby but only 2 directions shown. Chain deductions across signals: if cell A points East and cell B points West, a hazard sits between them. Flag all 25 hazards and submit.",
   "tags": ["partial_observability", "multi_stage_solution", "unsafe_guess_penalty"],
   "win_condition": {
     "type": "flag_all_hazards"
scenarios/decoy_minefield_8x10.json CHANGED
@@ -13,6 +13,7 @@
   "flags_count": 18,
   "difficulty": "hard",
   "description": "8x10 grid, 15 hazards, 4 real keys, 8 decoys that look like keys in the SVG render. Directional signals point toward hazards but decoys are placed to create false safe corridors. The agent must collect all 4 real keys while avoiding 15 hazards and ignoring 8 decoys. Models that treat decoys as keys will waste steps or walk into traps trying to reach them.",
+  "how_to_play": "Find and reveal all 4 real keys (gold cells) to win. Beware: 8 decoys (diamond symbols) look similar but are worthless. Use directional signals to avoid the 15 hazards — one wrong step could be fatal (2 hits max). Plan your path carefully through safe cells to reach each key.",
   "tags": ["visual_clutter", "partial_observability", "multi_stage_solution"],
   "win_condition": {
     "type": "collect_keys"
scenarios/definitions.py ADDED
@@ -0,0 +1,242 @@
+ """
+ Scenario definitions for the Visual Memory gym.
+
+ 10 hard scenarios across 4 task families — all designed to break SOTA
+ models. Each requires a combination of spatial reasoning, memory,
+ uncertainty handling, and long-horizon planning.
+
+ Task families:
+ - hidden_grid (5): deduce hazard locations from signal clues
+ - pattern_memory (2): recall briefly-shown cell contents
+ - fog_of_war (2): plan under limited viewport radius
+ - distractor_search (1): identify targets among visually similar decoys
+
+ Scenario prompts are concise — no hints, no strategy suggestions.
+ The agent must figure out the approach from the board state alone.
+ """
+
+ import sys
+ import os
+
+ # Support running from within the visual-memory package or standalone
+ try:
+     from rewards.base import Scenario
+ except ImportError:
+     sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+     from rewards.base import Scenario
+
+
+ VISUAL_MEMORY_SCENARIOS = [
+     # ══════════════════════════════════════════════════════════════════
+     # 1. Ambiguous Cluster — range signals + decoys on 10x10
+     # ══════════════════════════════════════════════════════════════════
+     Scenario(
+         id="ambiguous_cluster_10x10",
+         prompt=(
+             "Load scenario 'ambiguous_cluster_10x10'. Locate and flag all "
+             "18 hazards on a 10x10 board with range signals and decoys. "
+             "Submit your solution when ready."
+         ),
+         expected_tools=[
+             "load_scenario", "reveal_cell", "get_board_view",
+             "flag_cell", "recall_log", "submit_solution",
+         ],
+         max_steps=35,
+         outcome_checks=[
+             {"type": "solution_correct"},
+             {"type": "safety_score", "min_score": 0.8},
+             {"type": "no_distractor_use"},
+         ],
+     ),
+
+     # ══════════════════════════════════════════════════════════════════
+     # 2. Directional Trap — one-life directional signals on 8x8
+     # ══════════════════════════════════════════════════════════════════
+     Scenario(
+         id="directional_trap_8x8",
+         prompt=(
+             "Load scenario 'directional_trap_8x8'. Flag all 14 hazards. "
+             "Directional signals indicate which neighbors are hazardous. "
+             "One hazard reveal is fatal. Submit when done."
+         ),
+         expected_tools=[
+             "load_scenario", "reveal_cell", "flag_cell",
+             "recall_log", "submit_solution",
+         ],
+         max_steps=28,
+         outcome_checks=[
+             {"type": "solution_correct"},
+             {"type": "irreversible_mistakes", "max_hits": 0},
+             {"type": "no_distractor_use"},
+         ],
+     ),
+
+     # ══════════════════════════════════════════════════════════════════
+     # 3. Partial Intel — partial directional signals on 9x9
+     # ══════════════════════════════════════════════════════════════════
+     Scenario(
+         id="partial_intel_9x9",
+         prompt=(
+             "Load scenario 'partial_intel_9x9'. Flag all 16 hazards on "
+             "a 9x9 board. Signals reveal only some hazard directions. "
+             "Submit your flagged positions."
+         ),
+         expected_tools=[
+             "load_scenario", "reveal_cell", "flag_cell",
+             "inspect_region", "recall_log", "submit_solution",
+         ],
+         max_steps=30,
+         outcome_checks=[
+             {"type": "solution_correct"},
+             {"type": "safety_score", "min_score": 0.7},
+             {"type": "evidence_before_commit"},
+         ],
+     ),
+
+     # ══════════════════════════════════════════════════════════════════
+     # 4. Flash Fade Minefield — pattern memory on 7x7
+     # ══════════════════════════════════════════════════════════════════
+     Scenario(
+         id="flash_fade_minefield_7x7",
+         prompt=(
+             "Load scenario 'flash_fade_minefield_7x7'. Some cells flash "
+             "content briefly then fade. Flag all 10 hazards. Submit."
+         ),
+         expected_tools=[
+             "load_scenario", "get_board_view", "reveal_cell",
+             "flag_cell", "recall_log", "submit_solution",
+         ],
+         max_steps=25,
+         outcome_checks=[
+             {"type": "solution_correct"},
+             {"type": "safety_score", "min_score": 0.7},
+             {"type": "no_distractor_use"},
+         ],
+     ),
+
+     # ══════════════════════════════════════════════════════════════════
+     # 5. Delayed Recall Keys — pattern memory + key collection on 8x8
+     # ══════════════════════════════════════════════════════════════════
+     Scenario(
+         id="delayed_recall_keys_8x8",
+         prompt=(
+             "Load scenario 'delayed_recall_keys_8x8'. Cells flash then "
+             "fade. Collect all 5 keys to win. Avoid hazards."
+         ),
+         expected_tools=[
+             "load_scenario", "get_board_view", "reveal_cell",
+             "recall_log", "submit_solution",
+         ],
+         max_steps=30,
+         outcome_checks=[
+             {"type": "solution_correct"},
+             {"type": "keys_collected_pct", "min_pct": 1.0},
+             {"type": "safety_score", "min_score": 0.7},
+         ],
+     ),
+
+     # ══════════════════════════════════════════════════════════════════
+     # 6. Decoy Minefield — distractor search on 8x10
+     # ══════════════════════════════════════════════════════════════════
+     Scenario(
+         id="decoy_minefield_8x10",
+         prompt=(
+             "Load scenario 'decoy_minefield_8x10'. Collect all 4 keys. "
+             "Decoys resemble keys but are worthless. Avoid 15 hazards."
+         ),
+         expected_tools=[
+             "load_scenario", "reveal_cell", "get_board_view",
+             "get_status", "submit_solution",
+         ],
+         max_steps=32,
+         outcome_checks=[
+             {"type": "solution_correct"},
+             {"type": "keys_collected_pct", "min_pct": 1.0},
+             {"type": "no_distractor_use"},
+         ],
+     ),
+
+     # ══════════════════════════════════════════════════════════════════
+     # 7. Fog Labyrinth — fog of war traversal on 10x10
+     # ══════════════════════════════════════════════════════════════════
+     Scenario(
+         id="fog_labyrinth_10x10",
+         prompt=(
+             "Load scenario 'fog_labyrinth_10x10'. Navigate fog to find "
+             "and flag all 20 hazards. Viewport radius 2. Submit when ready."
+         ),
+         expected_tools=[
+             "load_scenario", "move_viewport", "reveal_cell",
+             "flag_cell", "recall_log", "submit_solution",
+         ],
+         max_steps=40,
+         outcome_checks=[
+             {"type": "solution_correct"},
+             {"type": "safety_score", "min_score": 0.6},
+             {"type": "evidence_before_commit"},
+         ],
+     ),
+
+     # ══════════════════════════════════════════════════════════════════
+     # 8. Fog Key Hunt — tiny viewport key collection on 8x8
+     # ══════════════════════════════════════════════════════════════════
+     Scenario(
+         id="fog_key_hunt_8x8",
+         prompt=(
+             "Load scenario 'fog_key_hunt_8x8'. Viewport radius 1. "
+             "Collect all 6 keys. One hazard hit is fatal. Ignore decoys."
+         ),
+         expected_tools=[
+             "load_scenario", "move_viewport", "reveal_cell",
+             "get_status", "recall_log", "submit_solution",
+         ],
+         max_steps=35,
+         outcome_checks=[
+             {"type": "solution_correct"},
+             {"type": "keys_collected_pct", "min_pct": 1.0},
+             {"type": "irreversible_mistakes", "max_hits": 0},
+         ],
+     ),
+
+     # ══════════════════════════════════════════════════════════════════
+     # 9. Cascading Deduction — large partial-signal board 11x11
+     # ══════════════════════════════════════════════════════════════════
+     Scenario(
+         id="cascading_deduction_11x11",
+         prompt=(
+             "Load scenario 'cascading_deduction_11x11'. Flag all 25 "
+             "hazards on an 11x11 board. Partial signals only. Submit."
+         ),
+         expected_tools=[
+             "load_scenario", "reveal_cell", "inspect_region",
+             "flag_cell", "recall_log", "submit_solution",
+         ],
+         max_steps=38,
+         outcome_checks=[
+             {"type": "solution_correct"},
+             {"type": "hazards_flagged_pct", "min_pct": 0.8},
+             {"type": "no_distractor_use"},
+         ],
+     ),
+
+     # ══════════════════════════════════════════════════════════════════
+     # 10. Safe Zone Identification — inverted win condition on 9x9
+     # ══════════════════════════════════════════════════════════════════
+     Scenario(
+         id="safe_zone_identification_9x9",
+         prompt=(
+             "Load scenario 'safe_zone_identification_9x9'. Identify ALL "
+             "safe cells (non-hazard). One hit is fatal. Submit safe list."
+         ),
+         expected_tools=[
+             "load_scenario", "reveal_cell", "get_board_view",
+             "recall_log", "submit_solution",
+         ],
+         max_steps=30,
+         outcome_checks=[
+             {"type": "solution_correct"},
+             {"type": "safety_score", "min_score": 0.8},
+             {"type": "evidence_before_commit"},
+         ],
+     ),
+ ]
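A quick consistency pass over the registry above can catch copy-paste drift (duplicate ids, zero step budgets). A minimal sketch: the tuples mirror the ids and `max_steps` values from the list, using plain tuples as a stand-in for the real `Scenario` dataclass:

```python
# Stand-in (id, max_steps) pairs mirroring VISUAL_MEMORY_SCENARIOS above.
scenarios = [
    ("ambiguous_cluster_10x10", 35),
    ("directional_trap_8x8", 28),
    ("partial_intel_9x9", 30),
    ("flash_fade_minefield_7x7", 25),
    ("delayed_recall_keys_8x8", 30),
    ("decoy_minefield_8x10", 32),
    ("fog_labyrinth_10x10", 40),
    ("fog_key_hunt_8x8", 35),
    ("cascading_deduction_11x11", 38),
    ("safe_zone_identification_9x9", 30),
]

ids = [sid for sid, _ in scenarios]
assert len(ids) == len(set(ids)) == 10          # ids are unique
assert all(steps > 0 for _, steps in scenarios)  # every budget is positive
```

The same checks could run against the real list at import time, since the module is loaded by the server anyway.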
scenarios/delayed_recall_keys_8x8.json CHANGED
@@ -12,6 +12,7 @@
  "flags_count": 15,
  "difficulty": "hard",
  "description": "8x8 grid, 12 hazards, 5 keys. 10 cells flash for 4 steps showing a mix of keys, hazards, and signals — then disappear. The agent must collect all 5 keys to win. After the flash fades, the agent must recall which flashed cells contained keys (safe to reveal) vs hazards (fatal to reveal). Revealing a remembered-hazard cell costs a life. This tests long-horizon memory: the recall happens 10-20 steps after the flash.",
+ "how_to_play": "10 cells flash for 4 steps showing keys, hazards, and signals — memorize them! After they fade, you must find and reveal all 5 keys (gold cells) to win. Avoid revealing cells you remember as hazards. Use signal numbers to navigate safely to key locations.",
  "tags": ["delayed_recall", "partial_observability", "multi_stage_solution"],
  "flash_cells": [[0,0],[0,7],[1,3],[2,5],[3,1],[3,6],[5,2],[5,5],[7,0],[7,7]],
  "flash_until_step": 4,
scenarios/directional_trap_8x8.json CHANGED
@@ -12,6 +12,7 @@
  "flags_count": 16,
  "difficulty": "hard",
  "description": "8x8 grid, 14 hazards, directional signals ('N','SE', etc). Only 1 hazard reveal allowed — a single mistake is fatal. Decoys placed adjacent to hazard clusters create false patterns. The agent must triangulate hazard positions from multiple directional clues. Greedy flagging based on one signal will flag decoys incorrectly.",
+ "how_to_play": "Reveal cells to uncover directional signals like 'N', 'SE', 'W' — these arrows point toward adjacent hazards. ONE hazard hit kills you instantly, so never click blindly. Cross-reference multiple directional clues to triangulate all 14 hazard positions, flag them, and submit.",
  "tags": ["partial_observability", "unsafe_guess_penalty", "visual_clutter"],
  "win_condition": {
    "type": "flag_all_hazards"
scenarios/flash_fade_minefield_7x7.json CHANGED
@@ -11,6 +11,7 @@
  "flags_count": 13,
  "difficulty": "hard",
  "description": "7x7 grid, 10 hazards. At the start, 8 cells flash their content (mix of hazards and signals) for 3 steps, then fade to hidden. The agent must memorize which flashed cells were hazards vs signals, then use that recalled info combined with new reveals to flag all hazards. Models that ignore or misremember the flash phase will miss critical hazard locations.",
+ "how_to_play": "Some cells will briefly flash their contents (hazards and signals) for the first 3 steps, then fade back to hidden. Memorize which flashed cells were hazards! After the flash fades, reveal more cells to find signal clues (numbers showing adjacent hazard count), then flag all 10 hazard locations and submit your solution.",
  "tags": ["delayed_recall", "partial_observability", "unsafe_guess_penalty"],
  "flash_cells": [[0,2],[1,4],[2,1],[2,5],[3,3],[4,0],[4,6],[6,3]],
  "flash_until_step": 3,
scenarios/fog_key_hunt_8x8.json CHANGED
@@ -13,6 +13,7 @@
  "flags_count": 16,
  "difficulty": "hard",
  "description": "8x8 grid, 14 hazards, 6 keys, 4 decoys, fog viewport radius 1 (3x3 window). Agent starts center (4,4). Must find and collect all 6 keys with only a tiny viewport. One hazard hit is fatal. Decoys look like keys but waste steps. The agent must systematically explore, remember where keys and hazards were seen, avoid decoys, and plan a safe path. Impulsive exploration kills.",
+ "how_to_play": "Your viewport is tiny (3x3 window). Move it around the board to explore, reveal cells to find the 6 keys (gold), and collect them all to win. One hazard hit is fatal! Ignore decoys (diamonds). Memorize safe paths as you explore — you can't see where you've been.",
  "tags": ["fog_of_war", "visual_clutter", "unsafe_guess_penalty", "delayed_recall"],
  "start_position": [4, 4],
  "viewport_radius": 1,
scenarios/fog_labyrinth_10x10.json CHANGED
@@ -11,6 +11,7 @@
  "flags_count": 23,
  "difficulty": "hard",
  "description": "10x10 grid, 20 hazards, fog-of-war with viewport radius 2. Agent starts at corner (0,0) and can only see a 5x5 window. Must move viewport across the board, memorize revealed information from previous positions, and flag all 20 hazards. Revisiting areas wastes steps. Models must build and maintain an internal spatial map across 40 steps — most will forget early reveals by the time they need to submit.",
+ "how_to_play": "You can only see a 5x5 window around your viewport. Use 'Move Viewport' to pan across the 10x10 board. Reveal cells in each area to find signal clues, memorize what you've seen, then flag all 20 hazards and submit. Don't waste steps revisiting areas — you only have 40 moves.",
  "tags": ["fog_of_war", "delayed_recall", "multi_stage_solution"],
  "start_position": [0, 0],
  "viewport_radius": 2,
scenarios/partial_intel_9x9.json CHANGED
@@ -11,6 +11,7 @@
  "flags_count": 19,
  "difficulty": "hard",
  "description": "9x9 grid, 16 hazards. Partial signal mode: each signal cell only reveals SOME of the directions where hazards lie, plus a hint of the total count. The agent must combine partial clues from neighboring signal cells to reconstruct the full picture. No single signal cell gives complete info — models that act on incomplete data will flag wrong cells.",
+ "how_to_play": "Each signal cell only reveals SOME directions to nearby hazards, plus a hint like '2 total'. Combine partial clues from neighboring signals to figure out the full picture. Flag all 16 hazards and submit. You can survive 2 hazard hits before game over.",
  "tags": ["partial_observability", "multi_stage_solution", "unsafe_guess_penalty"],
  "win_condition": {
    "type": "flag_all_hazards"
scenarios/safe_zone_identification_9x9.json CHANGED
@@ -12,6 +12,7 @@
  "flags_count": 25,
  "difficulty": "hard",
  "description": "9x9 grid, 22 hazards, 7 decoys. Win condition: identify ALL safe cells (non-hazard). With 81 cells and 22 hazards, there are 59 safe cells to find — but range signals give fuzzy counts (e.g. '1-3 hazards nearby'), and 7 decoys look suspicious but are actually safe. One hazard hit is fatal. The agent must prove cells are safe by elimination rather than flagging hazards. This inverts the usual strategy and punishes models that default to hazard-flagging heuristics.",
+ "how_to_play": "Unlike other scenarios, your goal is to identify ALL safe (non-hazard) cells — not flag hazards. Reveal cells to find range signals, deduce which cells definitely have no hazards, and submit those as safe positions. One hazard hit is fatal. There are 59 safe cells to find among 81 total.",
  "tags": ["partial_observability", "visual_clutter", "unsafe_guess_penalty", "multi_stage_solution"],
  "win_condition": {
    "type": "identify_safe_cells"
server/Dockerfile ADDED
@@ -0,0 +1,50 @@
+ # Visual Memory Gym — Docker image for OpenEnv + Hugging Face Spaces
+ #
+ # Single-service Python container on port 8000.
+ # No database, no external APIs, no additional services.
+ #
+ # Build:
+ #   cd visual-memory && docker build -f server/Dockerfile -t openenv-visual-memory .
+ #
+ # Run:
+ #   docker run -d --name visual-memory -p 8000:8000 openenv-visual-memory
+
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+ FROM ${BASE_IMAGE} AS builder
+
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends git curl && \
+     rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+ COPY . /app/env
+ WORKDIR /app/env
+
+ RUN if ! command -v uv >/dev/null 2>&1; then \
+         curl -LsSf https://astral.sh/uv/install.sh | sh && \
+         mv /root/.local/bin/uv /usr/local/bin/uv; \
+     fi
+
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     if [ -f uv.lock ]; then uv sync --frozen --no-install-project --no-editable; \
+     else uv sync --no-install-project --no-editable; fi
+
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     if [ -f uv.lock ]; then uv sync --frozen --no-editable; \
+     else uv sync --no-editable; fi
+
+ FROM ${BASE_IMAGE}
+ WORKDIR /app
+ COPY --from=builder /app/env/.venv /app/.venv
+ COPY --from=builder /app/env /app/env
+
+ ENV PATH="/app/.venv/bin:$PATH"
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
+ ENV ENABLE_WEB_INTERFACE=true
+
+ EXPOSE 8000
+
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+     CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"
+
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
server/app.py CHANGED
@@ -14,6 +14,7 @@ import sys
  from pathlib import Path
 
  from dotenv import load_dotenv
+ from fastapi.middleware.cors import CORSMiddleware
  from openenv.core.env_server.http_server import create_app
 
  load_dotenv(os.path.join(os.path.dirname(__file__), "..", ".env"))
@@ -37,6 +38,13 @@ app = create_app(
      max_concurrent_envs=MAX_CONCURRENT_ENVS,
  )
 
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
 
  def main(host: str = "0.0.0.0", port: int = 8000):
      import uvicorn
server/memory_environment.py CHANGED
@@ -70,6 +70,7 @@ def _list_available_scenarios() -> list[dict]:
              "difficulty": data.get("difficulty", "hard"),
              "board_size": f"{data.get('board_width', '?')}x{data.get('board_height', '?')}",
              "description": data.get("description", ""),
+             "how_to_play": data.get("how_to_play", ""),
              "tags": data.get("tags", []),
          })
      except Exception:
@@ -146,6 +147,8 @@ class MemoryEnvironment(MCPEnvironment):
          "scenario_type": self._engine.scenario_type.value,
          "win_condition": self._engine.win_condition.value,
          "max_steps": self._engine.max_steps,
+         "description": data.get("description", ""),
+         "how_to_play": data.get("how_to_play", ""),
          "board_view": view,
      }
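Both hunks read the new field with `dict.get` and an empty-string default, so scenario files written before this commit still load cleanly. A small sketch of that fallback behavior, using illustrative stand-in dicts rather than real scenario files:

```python
old_scenario = {"description": "8x8 grid, 14 hazards"}   # pre-commit file, no field
new_scenario = {**old_scenario,
                "how_to_play": "Triangulate hazards from directional clues."}

def scenario_info(data):
    # Mirrors the diff's pattern: a missing field degrades to "".
    return {
        "description": data.get("description", ""),
        "how_to_play": data.get("how_to_play", ""),
    }

assert scenario_info(old_scenario)["how_to_play"] == ""   # graceful fallback
assert "Triangulate" in scenario_info(new_scenario)["how_to_play"]
```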
 
uv.lock CHANGED
The diff for this file is too large to render. See raw diff