Spaces:
Sleeping
Sleeping
Commit ·
ea847ad
1
Parent(s): 97ac6b2
build configs added and added smoke tests
Browse files- .github/workflows/validate.yml +66 -0
- README.md +11 -0
- grid_env/Server/Dockerfile +1 -1
- pyproject.toml +30 -0
- grid_env/Server/requirement.text → requirements.txt +2 -0
- rl_env.egg-info/PKG-INFO +15 -0
- rl_env.egg-info/SOURCES.txt +23 -0
- rl_env.egg-info/dependency_links.txt +1 -0
- rl_env.egg-info/requires.txt +9 -0
- rl_env.egg-info/top_level.txt +1 -0
- tests/conftest.py +27 -0
- tests/test_baseline_stub.py +116 -0
- tests/test_env_smoke.py +122 -0
- tests/test_graders.py +199 -0
- tests/test_openenv_spec.py +107 -0
- tests/test_tasks.py +79 -0
.github/workflows/validate.yml
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Phase 1 — Automated Validation
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: [main]
|
| 6 |
+
pull_request:
|
| 7 |
+
branches: [main]
|
| 8 |
+
|
| 9 |
+
jobs:
|
| 10 |
+
# ── Job 1: Unit & integration tests ────────────────────────────────────────
|
| 11 |
+
test:
|
| 12 |
+
name: pytest
|
| 13 |
+
runs-on: ubuntu-latest
|
| 14 |
+
|
| 15 |
+
steps:
|
| 16 |
+
- name: Checkout repository
|
| 17 |
+
uses: actions/checkout@v4
|
| 18 |
+
|
| 19 |
+
- name: Set up Python 3.11
|
| 20 |
+
uses: actions/setup-python@v5
|
| 21 |
+
with:
|
| 22 |
+
python-version: "3.11"
|
| 23 |
+
cache: "pip"
|
| 24 |
+
|
| 25 |
+
- name: Install package and test dependencies
|
| 26 |
+
run: pip install -e ".[test]"
|
| 27 |
+
|
| 28 |
+
- name: Run test suite
|
| 29 |
+
run: pytest tests/ -v --tb=short
|
| 30 |
+
|
| 31 |
+
# ── Job 2: Dockerfile build gate ───────────────────────────────────────────
|
| 32 |
+
docker-build:
|
| 33 |
+
name: docker build
|
| 34 |
+
runs-on: ubuntu-latest
|
| 35 |
+
|
| 36 |
+
steps:
|
| 37 |
+
- name: Checkout repository
|
| 38 |
+
uses: actions/checkout@v4
|
| 39 |
+
|
| 40 |
+
- name: Build Docker image (no push)
|
| 41 |
+
run: docker build -t rl-env-server:ci -f grid_env/Server/Dockerfile .
|
| 42 |
+
|
| 43 |
+
# ── Job 3: OpenEnv spec validation (best-effort) ───────────────────────────
|
| 44 |
+
openenv-validate:
|
| 45 |
+
name: openenv spec check
|
| 46 |
+
runs-on: ubuntu-latest
|
| 47 |
+
|
| 48 |
+
steps:
|
| 49 |
+
- name: Checkout repository
|
| 50 |
+
uses: actions/checkout@v4
|
| 51 |
+
|
| 52 |
+
- name: Set up Python 3.11
|
| 53 |
+
uses: actions/setup-python@v5
|
| 54 |
+
with:
|
| 55 |
+
python-version: "3.11"
|
| 56 |
+
|
| 57 |
+
- name: Install package
|
| 58 |
+
run: pip install -e .
|
| 59 |
+
|
| 60 |
+
- name: Try openenv validate (non-blocking if openenv not available)
|
| 61 |
+
run: |
|
| 62 |
+
if pip install openenv 2>/dev/null; then
|
| 63 |
+
openenv validate grid_env/openv.yaml
|
| 64 |
+
else
|
| 65 |
+
echo "openenv package not available on PyPI — skipping CLI validate (YAML tests cover compliance)"
|
| 66 |
+
fi
|
README.md
CHANGED
|
@@ -1,5 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# RL-Env
|
| 2 |
|
|
|
|
| 3 |
This repo contains a real-world OpenEnv-style warehouse fulfillment environment. The simulated task is a pharmacy micro-fulfillment workflow: scan the right bins, pick the right items, pack them correctly, and manage battery under time pressure.
|
| 4 |
|
| 5 |
## Requirements Coverage
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: RL-Env Warehouse Fulfillment
|
| 3 |
+
emoji: 🏭
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 8000
|
| 8 |
+
pinned: false
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
# RL-Env
|
| 12 |
|
| 13 |
+
|
| 14 |
This repo contains a real-world OpenEnv-style warehouse fulfillment environment. The simulated task is a pharmacy micro-fulfillment workflow: scan the right bins, pick the right items, pack them correctly, and manage battery under time pressure.
|
| 15 |
|
| 16 |
## Requirements Coverage
|
grid_env/Server/Dockerfile
CHANGED
|
@@ -5,7 +5,7 @@ WORKDIR /app
|
|
| 5 |
ENV PYTHONDONTWRITEBYTECODE=1
|
| 6 |
ENV PYTHONUNBUFFERED=1
|
| 7 |
|
| 8 |
-
COPY
|
| 9 |
RUN pip install --no-cache-dir -r /app/requirements.txt
|
| 10 |
|
| 11 |
COPY . /app
|
|
|
|
| 5 |
ENV PYTHONDONTWRITEBYTECODE=1
|
| 6 |
ENV PYTHONUNBUFFERED=1
|
| 7 |
|
| 8 |
+
COPY requirements.txt /app/requirements.txt
|
| 9 |
RUN pip install --no-cache-dir -r /app/requirements.txt
|
| 10 |
|
| 11 |
COPY . /app
|
pyproject.toml
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=68", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "rl-env"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "MiniGrid-style warehouse fulfillment RL environment"
|
| 9 |
+
requires-python = ">=3.10"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"fastapi>=0.100",
|
| 12 |
+
"pydantic>=2.0",
|
| 13 |
+
"openai>=1.0",
|
| 14 |
+
"uvicorn>=0.20",
|
| 15 |
+
"pyyaml>=6.0",
|
| 16 |
+
]
|
| 17 |
+
|
| 18 |
+
[project.optional-dependencies]
|
| 19 |
+
test = [
|
| 20 |
+
"pytest>=8.0",
|
| 21 |
+
"pytest-timeout>=2.0",
|
| 22 |
+
]
|
| 23 |
+
|
| 24 |
+
[tool.setuptools.packages.find]
|
| 25 |
+
where = ["."]
|
| 26 |
+
include = ["grid_env*"]
|
| 27 |
+
|
| 28 |
+
[tool.pytest.ini_options]
|
| 29 |
+
testpaths = ["tests"]
|
| 30 |
+
timeout = 60
|
grid_env/Server/requirement.text → requirements.txt
RENAMED
|
@@ -2,3 +2,5 @@ fastapi==0.116.1
|
|
| 2 |
pydantic==2.11.7
|
| 3 |
openai==1.108.1
|
| 4 |
uvicorn==0.35.0
|
|
|
|
|
|
|
|
|
| 2 |
pydantic==2.11.7
|
| 3 |
openai==1.108.1
|
| 4 |
uvicorn==0.35.0
|
| 5 |
+
pytest>=8.0
|
| 6 |
+
pyyaml>=6.0
|
rl_env.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: rl-env
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: MiniGrid-style warehouse fulfillment RL environment
|
| 5 |
+
Requires-Python: >=3.10
|
| 6 |
+
License-File: LICENSE
|
| 7 |
+
Requires-Dist: fastapi>=0.100
|
| 8 |
+
Requires-Dist: pydantic>=2.0
|
| 9 |
+
Requires-Dist: openai>=1.0
|
| 10 |
+
Requires-Dist: uvicorn>=0.20
|
| 11 |
+
Requires-Dist: pyyaml>=6.0
|
| 12 |
+
Provides-Extra: test
|
| 13 |
+
Requires-Dist: pytest>=8.0; extra == "test"
|
| 14 |
+
Requires-Dist: pytest-timeout>=2.0; extra == "test"
|
| 15 |
+
Dynamic: license-file
|
rl_env.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
LICENSE
|
| 2 |
+
README.md
|
| 3 |
+
pyproject.toml
|
| 4 |
+
grid_env/__init__.py
|
| 5 |
+
grid_env/baseline.py
|
| 6 |
+
grid_env/client.py
|
| 7 |
+
grid_env/env.py
|
| 8 |
+
grid_env/graders.py
|
| 9 |
+
grid_env/models.py
|
| 10 |
+
grid_env/tasks.py
|
| 11 |
+
grid_env/Server/__init__.py
|
| 12 |
+
grid_env/Server/app.py
|
| 13 |
+
grid_env/Server/warehouse_env.py
|
| 14 |
+
rl_env.egg-info/PKG-INFO
|
| 15 |
+
rl_env.egg-info/SOURCES.txt
|
| 16 |
+
rl_env.egg-info/dependency_links.txt
|
| 17 |
+
rl_env.egg-info/requires.txt
|
| 18 |
+
rl_env.egg-info/top_level.txt
|
| 19 |
+
tests/test_baseline_stub.py
|
| 20 |
+
tests/test_env_smoke.py
|
| 21 |
+
tests/test_graders.py
|
| 22 |
+
tests/test_openenv_spec.py
|
| 23 |
+
tests/test_tasks.py
|
rl_env.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
rl_env.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.100
|
| 2 |
+
pydantic>=2.0
|
| 3 |
+
openai>=1.0
|
| 4 |
+
uvicorn>=0.20
|
| 5 |
+
pyyaml>=6.0
|
| 6 |
+
|
| 7 |
+
[test]
|
| 8 |
+
pytest>=8.0
|
| 9 |
+
pytest-timeout>=2.0
|
rl_env.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
grid_env
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Shared fixtures for the warehouse fulfillment test suite.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
from grid_env.env import WarehouseFulfillmentEnv
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@pytest.fixture()
|
| 10 |
+
def env_easy():
|
| 11 |
+
env = WarehouseFulfillmentEnv(task_id="easy_single_pick", seed=7)
|
| 12 |
+
env.reset()
|
| 13 |
+
return env
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@pytest.fixture()
|
| 17 |
+
def env_medium():
|
| 18 |
+
env = WarehouseFulfillmentEnv(task_id="medium_multi_item", seed=7)
|
| 19 |
+
env.reset()
|
| 20 |
+
return env
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@pytest.fixture()
|
| 24 |
+
def env_hard():
|
| 25 |
+
env = WarehouseFulfillmentEnv(task_id="hard_restock_priority", seed=7)
|
| 26 |
+
env.reset()
|
| 27 |
+
return env
|
tests/test_baseline_stub.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Baseline stub tests — runs the baseline runner without an OpenAI API key
|
| 3 |
+
by monkey-patching the client with a deterministic stub.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import types
|
| 7 |
+
from unittest.mock import MagicMock, patch
|
| 8 |
+
|
| 9 |
+
import pytest
|
| 10 |
+
|
| 11 |
+
from grid_env import WarehouseFulfillmentEnv
|
| 12 |
+
from grid_env.baseline import run_task, format_report
|
| 13 |
+
from grid_env.graders import grade_episode
|
| 14 |
+
from grid_env.models import BaselineCommand
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
TASK_IDS = ["easy_single_pick", "medium_multi_item", "hard_restock_priority"]
|
| 18 |
+
|
| 19 |
+
# Cycle of deterministic actions that exercise most code paths without getting stuck.
|
| 20 |
+
_ACTION_CYCLE = [
|
| 21 |
+
"turn_right",
|
| 22 |
+
"move_forward",
|
| 23 |
+
"turn_left",
|
| 24 |
+
"scan_bin",
|
| 25 |
+
"pick_item",
|
| 26 |
+
"move_forward",
|
| 27 |
+
"turn_right",
|
| 28 |
+
"move_forward",
|
| 29 |
+
"pack_item",
|
| 30 |
+
"wait",
|
| 31 |
+
]
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _make_stub_client(action_cycle=None):
|
| 35 |
+
"""Build a mock OpenAI client whose responses.create() returns stub actions."""
|
| 36 |
+
actions = action_cycle or _ACTION_CYCLE
|
| 37 |
+
counter = {"i": 0}
|
| 38 |
+
|
| 39 |
+
def fake_create(**kwargs):
|
| 40 |
+
action = actions[counter["i"] % len(actions)]
|
| 41 |
+
counter["i"] += 1
|
| 42 |
+
cmd = BaselineCommand(command=action, rationale="stub")
|
| 43 |
+
import json
|
| 44 |
+
response = MagicMock()
|
| 45 |
+
response.output_text = json.dumps(cmd.model_dump())
|
| 46 |
+
return response
|
| 47 |
+
|
| 48 |
+
client = MagicMock()
|
| 49 |
+
client.responses.create.side_effect = fake_create
|
| 50 |
+
return client
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@pytest.fixture()
|
| 54 |
+
def stub_client():
|
| 55 |
+
return _make_stub_client()
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@pytest.mark.parametrize("task_id", TASK_IDS)
|
| 59 |
+
def test_run_task_returns_required_keys(task_id):
|
| 60 |
+
"""run_task returns a dict with score, reward, steps, success, task_id."""
|
| 61 |
+
with patch("grid_env.baseline.OpenAI", return_value=_make_stub_client()), \
|
| 62 |
+
patch("grid_env.baseline.os.environ.get", side_effect=lambda k, d=None: "stub-key" if k == "OPENAI_API_KEY" else d):
|
| 63 |
+
result = run_task(task_id, model="stub", seed=7)
|
| 64 |
+
assert set(result.keys()) >= {"task_id", "score", "reward", "steps", "success"}
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@pytest.mark.parametrize("task_id", TASK_IDS)
|
| 68 |
+
def test_run_task_score_in_range(task_id):
|
| 69 |
+
"""Score from run_task is always ∈ [0, 1]."""
|
| 70 |
+
with patch("grid_env.baseline.OpenAI", return_value=_make_stub_client()), \
|
| 71 |
+
patch("grid_env.baseline.os.environ.get", side_effect=lambda k, d=None: "stub-key" if k == "OPENAI_API_KEY" else d):
|
| 72 |
+
result = run_task(task_id, model="stub", seed=7)
|
| 73 |
+
assert 0.0 <= result["score"] <= 1.0, f"Score {result['score']} out of range"
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
@pytest.mark.parametrize("task_id", TASK_IDS)
|
| 77 |
+
def test_run_task_steps_within_max(task_id):
|
| 78 |
+
"""Steps taken must not exceed the task's max_steps."""
|
| 79 |
+
from grid_env.tasks import get_task
|
| 80 |
+
max_steps = get_task(task_id).max_steps
|
| 81 |
+
with patch("grid_env.baseline.OpenAI", return_value=_make_stub_client()), \
|
| 82 |
+
patch("grid_env.baseline.os.environ.get", side_effect=lambda k, d=None: "stub-key" if k == "OPENAI_API_KEY" else d):
|
| 83 |
+
result = run_task(task_id, model="stub", seed=7)
|
| 84 |
+
assert result["steps"] <= max_steps
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def test_run_task_task_id_in_result():
|
| 88 |
+
with patch("grid_env.baseline.OpenAI", return_value=_make_stub_client()), \
|
| 89 |
+
patch("grid_env.baseline.os.environ.get", side_effect=lambda k, d=None: "stub-key" if k == "OPENAI_API_KEY" else d):
|
| 90 |
+
result = run_task("easy_single_pick", model="stub", seed=7)
|
| 91 |
+
assert result["task_id"] == "easy_single_pick"
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def test_format_report_contains_model_and_scores():
|
| 95 |
+
results = [
|
| 96 |
+
{"task_id": "easy_single_pick", "score": 0.75, "reward": 1.2, "steps": 15, "success": 1.0},
|
| 97 |
+
{"task_id": "medium_multi_item", "score": 0.5, "reward": 0.8, "steps": 30, "success": 0.0},
|
| 98 |
+
]
|
| 99 |
+
report = format_report(results, model="test-model")
|
| 100 |
+
assert "test-model" in report
|
| 101 |
+
assert "easy_single_pick" in report
|
| 102 |
+
assert "mean_score" in report
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def test_deterministic_seed_reproducibility():
|
| 106 |
+
"""Same seed must produce the same final score on two separate runs."""
|
| 107 |
+
client1 = _make_stub_client()
|
| 108 |
+
client2 = _make_stub_client()
|
| 109 |
+
with patch("grid_env.baseline.OpenAI", return_value=client1), \
|
| 110 |
+
patch("grid_env.baseline.os.environ.get", side_effect=lambda k, d=None: "stub-key" if k == "OPENAI_API_KEY" else d):
|
| 111 |
+
r1 = run_task("easy_single_pick", model="stub", seed=7)
|
| 112 |
+
with patch("grid_env.baseline.OpenAI", return_value=client2), \
|
| 113 |
+
patch("grid_env.baseline.os.environ.get", side_effect=lambda k, d=None: "stub-key" if k == "OPENAI_API_KEY" else d):
|
| 114 |
+
r2 = run_task("easy_single_pick", model="stub", seed=7)
|
| 115 |
+
assert r1["score"] == r2["score"]
|
| 116 |
+
assert r1["steps"] == r2["steps"]
|
tests/test_env_smoke.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Smoke tests: environment instantiation, reset, step, and episode termination
|
| 3 |
+
for all three task IDs.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import pytest
|
| 7 |
+
from grid_env.env import WarehouseFulfillmentEnv, available_tasks
|
| 8 |
+
from grid_env.graders import grade_episode
|
| 9 |
+
from grid_env.models import WarehouseObservation, WarehouseReward
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
TASK_IDS = ["easy_single_pick", "medium_multi_item", "hard_restock_priority"]
|
| 13 |
+
ALL_ACTIONS = [
|
| 14 |
+
"turn_left",
|
| 15 |
+
"turn_right",
|
| 16 |
+
"move_forward",
|
| 17 |
+
"scan_bin",
|
| 18 |
+
"pick_item",
|
| 19 |
+
"pack_item",
|
| 20 |
+
"recharge",
|
| 21 |
+
"wait",
|
| 22 |
+
]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@pytest.mark.parametrize("task_id", TASK_IDS)
|
| 26 |
+
def test_env_instantiation(task_id):
|
| 27 |
+
"""Environment can be created for each task without errors."""
|
| 28 |
+
env = WarehouseFulfillmentEnv(task_id=task_id, seed=7)
|
| 29 |
+
assert env is not None
|
| 30 |
+
assert env.task.task_id == task_id
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@pytest.mark.parametrize("task_id", TASK_IDS)
|
| 34 |
+
def test_reset_returns_observation(task_id):
|
| 35 |
+
"""reset() returns a valid WarehouseObservation for each task."""
|
| 36 |
+
env = WarehouseFulfillmentEnv(task_id=task_id, seed=7)
|
| 37 |
+
obs = env.reset()
|
| 38 |
+
assert isinstance(obs, WarehouseObservation)
|
| 39 |
+
assert obs.task_id == task_id
|
| 40 |
+
assert obs.battery_level > 0
|
| 41 |
+
assert isinstance(obs.pending_order, list)
|
| 42 |
+
assert len(obs.pending_order) > 0
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@pytest.mark.parametrize("action", ALL_ACTIONS)
|
| 46 |
+
def test_step_all_actions_no_crash(action):
|
| 47 |
+
"""Every action string can be stepped without raising an exception."""
|
| 48 |
+
env = WarehouseFulfillmentEnv(task_id="easy_single_pick", seed=7)
|
| 49 |
+
env.reset()
|
| 50 |
+
obs, reward, done, info = env.step(action)
|
| 51 |
+
assert isinstance(obs, WarehouseObservation)
|
| 52 |
+
assert isinstance(reward, WarehouseReward)
|
| 53 |
+
assert isinstance(done, bool)
|
| 54 |
+
assert isinstance(info, dict)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
@pytest.mark.parametrize("task_id", TASK_IDS)
|
| 58 |
+
def test_step_returns_correct_types(task_id):
|
| 59 |
+
"""step() returns the four-tuple with correct types."""
|
| 60 |
+
env = WarehouseFulfillmentEnv(task_id=task_id, seed=7)
|
| 61 |
+
env.reset()
|
| 62 |
+
obs, reward, done, info = env.step("wait")
|
| 63 |
+
assert isinstance(obs, WarehouseObservation)
|
| 64 |
+
assert isinstance(reward, WarehouseReward)
|
| 65 |
+
assert isinstance(done, bool)
|
| 66 |
+
assert "completion_ratio" in info
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@pytest.mark.parametrize("task_id", TASK_IDS)
|
| 70 |
+
def test_episode_terminates_at_max_steps(task_id):
|
| 71 |
+
"""Running max_steps wait actions always terminates the episode."""
|
| 72 |
+
env = WarehouseFulfillmentEnv(task_id=task_id, seed=7)
|
| 73 |
+
env.reset()
|
| 74 |
+
max_steps = env.task.max_steps
|
| 75 |
+
done = False
|
| 76 |
+
for _ in range(max_steps + 5):
|
| 77 |
+
_, _, done, _ = env.step("wait")
|
| 78 |
+
if done:
|
| 79 |
+
break
|
| 80 |
+
assert done, f"Episode did not terminate after {max_steps} steps for {task_id}"
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
@pytest.mark.parametrize("task_id", TASK_IDS)
|
| 84 |
+
def test_score_in_range_after_episode(task_id):
|
| 85 |
+
"""grade_episode() always returns a float in [0.0, 1.0]."""
|
| 86 |
+
env = WarehouseFulfillmentEnv(task_id=task_id, seed=7)
|
| 87 |
+
env.reset()
|
| 88 |
+
done = False
|
| 89 |
+
while not done:
|
| 90 |
+
_, _, done, _ = env.step("wait")
|
| 91 |
+
state = env.state()
|
| 92 |
+
score = grade_episode(state)
|
| 93 |
+
assert isinstance(score, float)
|
| 94 |
+
assert 0.0 <= score <= 1.0, f"Score {score} out of range for {task_id}"
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def test_step_after_done_is_safe():
|
| 98 |
+
"""Stepping after episode is done returns done=True without raising."""
|
| 99 |
+
env = WarehouseFulfillmentEnv(task_id="easy_single_pick", seed=7)
|
| 100 |
+
env.reset()
|
| 101 |
+
for _ in range(env.task.max_steps):
|
| 102 |
+
env.step("wait")
|
| 103 |
+
_, _, done, _ = env.step("wait")
|
| 104 |
+
assert done
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def test_available_tasks_returns_all_three():
|
| 108 |
+
"""available_tasks() returns exactly the three expected task IDs."""
|
| 109 |
+
tasks = available_tasks()
|
| 110 |
+
ids = {t["task_id"] for t in tasks}
|
| 111 |
+
assert ids == set(TASK_IDS)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def test_state_method_returns_consistent_data():
|
| 115 |
+
"""state() reflects the same step count as internal counter."""
|
| 116 |
+
env = WarehouseFulfillmentEnv(task_id="easy_single_pick", seed=7)
|
| 117 |
+
env.reset()
|
| 118 |
+
env.step("turn_left")
|
| 119 |
+
env.step("move_forward")
|
| 120 |
+
state = env.state()
|
| 121 |
+
assert state.step_count == 2
|
| 122 |
+
assert state.task_id == "easy_single_pick"
|
tests/test_graders.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit tests for the deterministic graders.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
from grid_env.graders import grade_easy, grade_episode, grade_hard, grade_medium, _clamp
|
| 7 |
+
from grid_env.models import WarehouseMetrics, WarehouseState, BinState, OrderLine
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _make_state(
|
| 11 |
+
task_id: str = "easy_single_pick",
|
| 12 |
+
completion_ratio: float = 0.0,
|
| 13 |
+
step_count: int = 5,
|
| 14 |
+
max_steps: int = 40,
|
| 15 |
+
correct_scans: int = 0,
|
| 16 |
+
wrong_scans: int = 0,
|
| 17 |
+
correct_picks: int = 0,
|
| 18 |
+
wrong_picks: int = 0,
|
| 19 |
+
correct_packs: int = 0,
|
| 20 |
+
invalid_actions: int = 0,
|
| 21 |
+
recharges: int = 0,
|
| 22 |
+
battery_depletion_events: int = 0,
|
| 23 |
+
) -> WarehouseState:
|
| 24 |
+
metrics = WarehouseMetrics(
|
| 25 |
+
correct_scans=correct_scans,
|
| 26 |
+
wrong_scans=wrong_scans,
|
| 27 |
+
correct_picks=correct_picks,
|
| 28 |
+
wrong_picks=wrong_picks,
|
| 29 |
+
correct_packs=correct_packs,
|
| 30 |
+
invalid_actions=invalid_actions,
|
| 31 |
+
recharges=recharges,
|
| 32 |
+
battery_depletion_events=battery_depletion_events,
|
| 33 |
+
)
|
| 34 |
+
return WarehouseState(
|
| 35 |
+
episode_id="test-ep",
|
| 36 |
+
task_id=task_id,
|
| 37 |
+
difficulty="easy",
|
| 38 |
+
step_count=step_count,
|
| 39 |
+
done=True,
|
| 40 |
+
success=completion_ratio >= 1.0,
|
| 41 |
+
max_steps=max_steps,
|
| 42 |
+
grid_size=(7, 7),
|
| 43 |
+
agent_position=(1, 1),
|
| 44 |
+
heading="E",
|
| 45 |
+
carrying=None,
|
| 46 |
+
battery_level=30,
|
| 47 |
+
battery_capacity=36,
|
| 48 |
+
dock_position=(1, 1),
|
| 49 |
+
pack_station_position=(5, 5),
|
| 50 |
+
charger_position=(1, 5),
|
| 51 |
+
bins=[BinState(bin_id="A1", position=(2, 1), sku="thermometer", quantity=2)],
|
| 52 |
+
order=[OrderLine(sku="thermometer", quantity=1)],
|
| 53 |
+
packed_order=[OrderLine(sku="thermometer", quantity=int(completion_ratio))],
|
| 54 |
+
scanned_bins=[],
|
| 55 |
+
metrics=metrics,
|
| 56 |
+
action_history=[],
|
| 57 |
+
total_reward=0.0,
|
| 58 |
+
completion_ratio=completion_ratio,
|
| 59 |
+
task_description="Test",
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# ── _clamp ────────────────────────────────────────────────────────────────────
|
| 64 |
+
|
| 65 |
+
def test_clamp_above_one():
|
| 66 |
+
assert _clamp(1.5) == 1.0
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def test_clamp_below_zero():
|
| 70 |
+
assert _clamp(-0.3) == 0.0
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def test_clamp_within_range():
|
| 74 |
+
assert _clamp(0.75) == pytest.approx(0.75, abs=1e-4)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# ── grade_easy ────────────────────────────────────────────────────────────────
|
| 78 |
+
|
| 79 |
+
def test_grade_easy_zero_completion():
|
| 80 |
+
# completion_ratio=0, but efficiency_bonus still applies (step_count/max_steps = 5/40, so efficiency = 1 - 5/40 = 87.5%)
|
| 81 |
+
# score = 0 + 0.15 * 0.875 + 0.10 * 0 = 0.13125 (no zero_penalty because invalid_actions=0)
|
| 82 |
+
state = _make_state("easy_single_pick", completion_ratio=0.0)
|
| 83 |
+
score = grade_easy(state)
|
| 84 |
+
assert 0.0 <= score < 0.30, f"Expected low score at zero completion, got {score}"
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def test_grade_easy_full_completion_no_penalty():
|
| 88 |
+
state = _make_state(
|
| 89 |
+
"easy_single_pick",
|
| 90 |
+
completion_ratio=1.0,
|
| 91 |
+
step_count=10,
|
| 92 |
+
max_steps=40,
|
| 93 |
+
invalid_actions=0,
|
| 94 |
+
wrong_picks=0,
|
| 95 |
+
)
|
| 96 |
+
score = grade_easy(state)
|
| 97 |
+
assert score >= 0.75
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def test_grade_easy_wrong_picks_reduce_score():
|
| 101 |
+
base = _make_state("easy_single_pick", completion_ratio=1.0, wrong_picks=0)
|
| 102 |
+
penalised = _make_state("easy_single_pick", completion_ratio=1.0, wrong_picks=3)
|
| 103 |
+
assert grade_easy(penalised) < grade_easy(base)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def test_grade_easy_clamped():
|
| 107 |
+
state = _make_state("easy_single_pick", completion_ratio=1.0, wrong_picks=0, invalid_actions=0)
|
| 108 |
+
assert 0.0 <= grade_easy(state) <= 1.0
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# ── grade_medium ──────────────────────────────────────────────────────────────
|
| 112 |
+
|
| 113 |
+
def test_grade_medium_zero_completion():
|
| 114 |
+
# completion_ratio=0; efficiency_bonus and scan_ratio are non-zero when steps are low
|
| 115 |
+
state = _make_state("medium_multi_item", completion_ratio=0.0)
|
| 116 |
+
score = grade_medium(state)
|
| 117 |
+
assert 0.0 <= score < 0.30, f"Expected low score at zero completion, got {score}"
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def test_grade_medium_full_completion_with_scans():
|
| 121 |
+
state = _make_state(
|
| 122 |
+
"medium_multi_item",
|
| 123 |
+
completion_ratio=1.0,
|
| 124 |
+
correct_scans=2,
|
| 125 |
+
wrong_picks=0,
|
| 126 |
+
invalid_actions=0,
|
| 127 |
+
step_count=20,
|
| 128 |
+
max_steps=60,
|
| 129 |
+
)
|
| 130 |
+
score = grade_medium(state)
|
| 131 |
+
assert score >= 0.75
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def test_grade_medium_scans_improve_score():
|
| 135 |
+
no_scans = _make_state("medium_multi_item", completion_ratio=0.5, correct_scans=0)
|
| 136 |
+
with_scans = _make_state("medium_multi_item", completion_ratio=0.5, correct_scans=2)
|
| 137 |
+
assert grade_medium(with_scans) > grade_medium(no_scans)
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def test_grade_medium_clamped():
|
| 141 |
+
state = _make_state("medium_multi_item", completion_ratio=1.0, correct_scans=2)
|
| 142 |
+
assert 0.0 <= grade_medium(state) <= 1.0
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
# ── grade_hard ────────────────────────────────────────────────────────────────
|
| 146 |
+
|
| 147 |
+
def test_grade_hard_zero_completion():
|
| 148 |
+
# completion_ratio=0; efficiency_bonus and other bonuses contribute non-zero score
|
| 149 |
+
state = _make_state("hard_restock_priority", completion_ratio=0.0)
|
| 150 |
+
score = grade_hard(state)
|
| 151 |
+
assert 0.0 <= score < 0.30, f"Expected low score at zero completion, got {score}"
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def test_grade_hard_full_completion_with_recharge():
|
| 155 |
+
state = _make_state(
|
| 156 |
+
"hard_restock_priority",
|
| 157 |
+
completion_ratio=1.0,
|
| 158 |
+
correct_scans=3,
|
| 159 |
+
recharges=1,
|
| 160 |
+
battery_depletion_events=0,
|
| 161 |
+
wrong_picks=0,
|
| 162 |
+
invalid_actions=0,
|
| 163 |
+
step_count=30,
|
| 164 |
+
max_steps=85,
|
| 165 |
+
)
|
| 166 |
+
score = grade_hard(state)
|
| 167 |
+
assert score >= 0.75
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def test_grade_hard_battery_depletion_penalises():
|
| 171 |
+
safe = _make_state("hard_restock_priority", completion_ratio=1.0, battery_depletion_events=0)
|
| 172 |
+
depleted = _make_state("hard_restock_priority", completion_ratio=1.0, battery_depletion_events=1)
|
| 173 |
+
assert grade_hard(safe) > grade_hard(depleted)
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def test_grade_hard_clamped():
|
| 177 |
+
state = _make_state("hard_restock_priority", completion_ratio=1.0)
|
| 178 |
+
assert 0.0 <= grade_hard(state) <= 1.0
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# ── grade_episode dispatcher ──────────────────────────────────────────────────
|
| 182 |
+
|
| 183 |
+
@pytest.mark.parametrize(
|
| 184 |
+
"task_id,grader",
|
| 185 |
+
[
|
| 186 |
+
("easy_single_pick", grade_easy),
|
| 187 |
+
("medium_multi_item", grade_medium),
|
| 188 |
+
("hard_restock_priority", grade_hard),
|
| 189 |
+
],
|
| 190 |
+
)
|
| 191 |
+
def test_grade_episode_dispatches_correctly(task_id, grader):
|
| 192 |
+
state = _make_state(task_id, completion_ratio=0.5)
|
| 193 |
+
assert grade_episode(state) == grader(state)
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def test_grade_episode_unknown_task_raises():
|
| 197 |
+
state = _make_state("nonexistent_task", completion_ratio=0.5)
|
| 198 |
+
with pytest.raises(KeyError):
|
| 199 |
+
grade_episode(state)
|
tests/test_openenv_spec.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OpenEnv spec compliance tests — validates openv.yaml structure without
|
| 3 |
+
requiring the openenv package to be installed.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import importlib
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
import pytest
|
| 10 |
+
import yaml
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
SPEC_PATH = Path(__file__).parent.parent / "grid_env" / "openv.yaml"
|
| 14 |
+
|
| 15 |
+
REQUIRED_TOP_LEVEL = {"spec_version", "name", "entrypoint", "models", "methods", "tasks"}
|
| 16 |
+
REQUIRED_MODELS = {"action", "observation", "state"}
|
| 17 |
+
REQUIRED_TASK_FIELDS = {"id", "grader"}
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@pytest.fixture(scope="module")
def spec():
    """Load and parse the spec file at SPEC_PATH once per test module.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file contents.

    Raises:
        AssertionError: if SPEC_PATH does not exist.
    """
    assert SPEC_PATH.exists(), f"openv.yaml not found at {SPEC_PATH}"
    # Decode explicitly as UTF-8 so the parse result does not depend on the
    # platform's default locale encoding.
    with SPEC_PATH.open(encoding="utf-8") as f:
        return yaml.safe_load(f)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def test_yaml_parses_successfully(spec):
    """The loaded spec must be a non-None value of type dict."""
    loaded = spec
    assert loaded is not None
    assert isinstance(loaded, dict)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def test_required_top_level_fields_present(spec):
    """Every name in REQUIRED_TOP_LEVEL must be a key of the spec."""
    absent = REQUIRED_TOP_LEVEL.difference(spec.keys())
    assert not absent, f"Missing top-level fields: {absent}"
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def test_spec_version_is_string(spec):
    """spec_version must be a str that is not blank after stripping."""
    version = spec["spec_version"]
    assert isinstance(version, str)
    stripped = spec["spec_version"].strip()
    assert stripped != ""
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def test_name_is_non_empty_string(spec):
    """name must be a str that is not blank after stripping."""
    env_name = spec["name"]
    assert isinstance(env_name, str)
    stripped = spec["name"].strip()
    assert stripped != ""
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def test_entrypoint_format(spec):
    """Entrypoint should be 'module:ClassName' style.

    Besides containing a ':', the text before the first ':' and the text
    after it must both be non-blank, so values such as 'pkg.mod:' or ':Env'
    are rejected as well.
    """
    entrypoint = spec["entrypoint"]
    assert isinstance(entrypoint, str)
    assert ":" in entrypoint, "entrypoint must be 'module:ClassName'"
    # A bare separator check would accept an empty module or class part.
    module_path, _, class_name = entrypoint.partition(":")
    assert module_path.strip() and class_name.strip(), \
        "entrypoint must be 'module:ClassName'"
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def test_entrypoint_module_is_importable(spec):
    """The module half of the entrypoint must import without ImportError."""
    # Only the part before the first ':' is used; the rest is ignored here.
    module_name, _unused = spec["entrypoint"].split(":", 1)
    try:
        importlib.import_module(module_name)
    except ImportError as exc:
        pytest.fail(f"Entrypoint module '{module_name}' is not importable: {exc}")
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def test_models_has_required_keys(spec):
    """The spec's "models" entry must contain every key in REQUIRED_MODELS."""
    declared_models = spec.get("models", {})
    absent = REQUIRED_MODELS.difference(declared_models.keys())
    assert not absent, f"Missing model keys: {absent}"
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def test_methods_contains_reset_step_state(spec):
    """The spec's "methods" entry must include reset, step and state."""
    methods = set(spec.get("methods", []))
    mandatory = {"reset", "step", "state"}
    assert mandatory <= methods, \
        f"methods must include reset, step, state. Got: {methods}"
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def test_tasks_list_has_at_least_three_entries(spec):
    """The spec's "tasks" entry must hold three or more items."""
    task_count = len(spec.get("tasks", []))
    assert task_count >= 3, f"Expected ≥3 tasks, got {task_count}"
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def test_each_task_has_required_fields(spec):
    """Each entry under "tasks" must define every key in REQUIRED_TASK_FIELDS."""
    for entry in spec.get("tasks", []):
        absent = REQUIRED_TASK_FIELDS.difference(entry.keys())
        assert not absent, f"Task {entry} missing fields: {absent}"
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def test_task_ids_are_unique(spec):
    """No two entries under "tasks" may share the same "id" value."""
    all_ids = [entry["id"] for entry in spec.get("tasks", [])]
    distinct_ids = set(all_ids)
    # Collapsing to a set drops repeats, so equal sizes mean no duplicates.
    assert len(all_ids) == len(distinct_ids), "Duplicate task IDs found in spec"
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def test_grader_references_are_importable(spec):
    """Each grader in the spec should resolve to a callable.

    A grader reference is split at its first ':' into a module path and an
    attribute name; the module must import and the attribute must be callable.
    """
    for entry in spec.get("tasks", []):
        reference = entry.get("grader", "")
        assert ":" in reference, f"Grader '{reference}' is not 'module:fn' format"
        module_name, attr_name = reference.split(":", 1)
        try:
            module = importlib.import_module(module_name)
        except ImportError as exc:
            pytest.fail(f"Cannot import grader module '{module_name}': {exc}")
        # A missing attribute becomes None, which then fails the callable check.
        grader = getattr(module, attr_name, None)
        assert callable(grader), f"'{attr_name}' in '{module_name}' is not callable"
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def test_baseline_section_present(spec):
    """The spec must have a "baseline" entry containing "runner" and "seed"."""
    assert "baseline" in spec
    baseline_section = spec["baseline"]
    assert all(key in baseline_section for key in ("runner", "seed"))
|
tests/test_tasks.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for task definitions: presence of all 3 tasks, structural validity,
|
| 3 |
+
and grader callability.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import pytest
|
| 7 |
+
from grid_env.tasks import TASKS, get_task, GRID_SIZE
|
| 8 |
+
from grid_env.graders import grade_episode
|
| 9 |
+
from grid_env.env import WarehouseFulfillmentEnv
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Task ids these tests expect to find registered in TASKS.
EXPECTED_TASK_IDS = {
    "easy_single_pick",
    "medium_multi_item",
    "hard_restock_priority",
}
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def test_exactly_three_tasks_registered():
    """TASKS must contain exactly three entries."""
    registered_count = len(TASKS)
    assert registered_count == 3
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def test_all_expected_task_ids_present():
    """The keys of TASKS must match EXPECTED_TASK_IDS exactly."""
    registered_ids = set(TASKS.keys())
    assert registered_ids == EXPECTED_TASK_IDS
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# EXPECTED_TASK_IDS is a set of strings, whose iteration order can change from
# one interpreter run to the next; sorting keeps the parametrized test ids in
# a stable order across runs and across parallel workers.
@pytest.mark.parametrize("task_id", sorted(EXPECTED_TASK_IDS))
def test_task_has_required_fields(task_id):
    """Check the basic attributes of the task returned by get_task(task_id).

    The task must echo back its id, carry a difficulty of "easy", "medium" or
    "hard", have positive max_steps and battery_capacity, and have non-empty
    bins and order.
    """
    task = get_task(task_id)
    assert task.task_id == task_id
    assert task.difficulty in {"easy", "medium", "hard"}
    assert task.max_steps > 0
    assert task.battery_capacity > 0
    assert len(task.bins) > 0
    assert len(task.order) > 0
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# Sorted because EXPECTED_TASK_IDS is a set of strings: its iteration order is
# not stable between interpreter runs, which would reorder the generated tests.
@pytest.mark.parametrize("task_id", sorted(EXPECTED_TASK_IDS))
def test_task_required_scans_non_empty(task_id):
    """The task's required_scans collection must contain at least one entry."""
    task = get_task(task_id)
    assert len(task.required_scans) > 0, f"{task_id} has no required_scans"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# Sorted because EXPECTED_TASK_IDS is a set of strings: its iteration order is
# not stable between interpreter runs, which would reorder the generated tests.
@pytest.mark.parametrize("task_id", sorted(EXPECTED_TASK_IDS))
def test_task_order_skus_exist_in_bins(task_id):
    """Every SKU in the order exists in at least one bin."""
    task = get_task(task_id)
    # Collect the SKUs once so each order line is a constant-time lookup.
    bin_skus = {b.sku for b in task.bins}
    for line in task.order:
        assert line.sku in bin_skus, f"{line.sku} ordered but not stocked in {task_id}"
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# Sorted because EXPECTED_TASK_IDS is a set of strings: its iteration order is
# not stable between interpreter runs, which would reorder the generated tests.
@pytest.mark.parametrize("task_id", sorted(EXPECTED_TASK_IDS))
def test_required_scans_are_valid_bin_ids(task_id):
    """required_scans reference bin IDs that actually exist."""
    task = get_task(task_id)
    # Collect the bin ids once so each required scan is a constant-time lookup.
    bin_ids = {b.bin_id for b in task.bins}
    for scan_id in task.required_scans:
        assert scan_id in bin_ids, f"required scan {scan_id} not a valid bin in {task_id}"
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# Sorted because EXPECTED_TASK_IDS is a set of strings: its iteration order is
# not stable between interpreter runs, which would reorder the generated tests.
@pytest.mark.parametrize("task_id", sorted(EXPECTED_TASK_IDS))
def test_grader_callable_returns_float_in_range(task_id):
    """Run an episode of "wait" actions and verify the grader returns [0,1].

    The environment is created with a fixed seed (7), reset, and stepped with
    "wait" until it reports done; the final state is then passed to
    grade_episode, whose result must be a float between 0.0 and 1.0 inclusive.
    """
    env = WarehouseFulfillmentEnv(task_id=task_id, seed=7)
    env.reset()
    done = False
    # Only the third element of the step result (the done flag) is used here.
    while not done:
        _, _, done, _ = env.step("wait")
    state = env.state()
    score = grade_episode(state)
    assert isinstance(score, float)
    assert 0.0 <= score <= 1.0
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def test_get_task_raises_on_unknown_id():
    """get_task("does_not_exist") must raise KeyError matching "Unknown task_id"."""
    expected_error = pytest.raises(KeyError, match="Unknown task_id")
    with expected_error:
        get_task("does_not_exist")
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def test_grid_size_is_positive_tuple():
    """GRID_SIZE must have exactly two entries, both greater than zero."""
    assert len(GRID_SIZE) == 2
    first_dim, second_dim = GRID_SIZE[0], GRID_SIZE[1]
    assert first_dim > 0 and second_dim > 0
|