sohambose98 committed on
Commit
ea847ad
·
1 Parent(s): 97ac6b2

Add build configs and smoke tests

Browse files
.github/workflows/validate.yml ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Phase 1 — Automated Validation
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ # ── Job 1: Unit & integration tests ────────────────────────────────────────
11
+ test:
12
+ name: pytest
13
+ runs-on: ubuntu-latest
14
+
15
+ steps:
16
+ - name: Checkout repository
17
+ uses: actions/checkout@v4
18
+
19
+ - name: Set up Python 3.11
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: "3.11"
23
+ cache: "pip"
24
+
25
+ - name: Install package and test dependencies
26
+ run: pip install -e ".[test]"
27
+
28
+ - name: Run test suite
29
+ run: pytest tests/ -v --tb=short
30
+
31
+ # ── Job 2: Dockerfile build gate ───────────────────────────────────────────
32
+ docker-build:
33
+ name: docker build
34
+ runs-on: ubuntu-latest
35
+
36
+ steps:
37
+ - name: Checkout repository
38
+ uses: actions/checkout@v4
39
+
40
+ - name: Build Docker image (no push)
41
+ run: docker build -t rl-env-server:ci -f grid_env/Server/Dockerfile .
42
+
43
+ # ── Job 3: OpenEnv spec validation (best-effort) ───────────────────────────
44
+ openenv-validate:
45
+ name: openenv spec check
46
+ runs-on: ubuntu-latest
47
+
48
+ steps:
49
+ - name: Checkout repository
50
+ uses: actions/checkout@v4
51
+
52
+ - name: Set up Python 3.11
53
+ uses: actions/setup-python@v5
54
+ with:
55
+ python-version: "3.11"
56
+
57
+ - name: Install package
58
+ run: pip install -e .
59
+
60
+ - name: Try openenv validate (non-blocking if openenv not available)
61
+ run: |
62
+ if pip install openenv 2>/dev/null; then
63
+ openenv validate grid_env/openv.yaml
64
+ else
65
+ echo "openenv package not available on PyPI — skipping CLI validate (YAML tests cover compliance)"
66
+ fi
README.md CHANGED
@@ -1,5 +1,16 @@
 
 
 
 
 
 
 
 
 
 
1
  # RL-Env
2
 
 
3
  This repo contains a real-world OpenEnv-style warehouse fulfillment environment. The simulated task is a pharmacy micro-fulfillment workflow: scan the right bins, pick the right items, pack them correctly, and manage battery under time pressure.
4
 
5
  ## Requirements Coverage
 
1
+ ---
2
+ title: RL-Env Warehouse Fulfillment
3
+ emoji: 🏭
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ app_port: 8000
8
+ pinned: false
9
+ ---
10
+
11
  # RL-Env
12
 
13
+
14
  This repo contains a real-world OpenEnv-style warehouse fulfillment environment. The simulated task is a pharmacy micro-fulfillment workflow: scan the right bins, pick the right items, pack them correctly, and manage battery under time pressure.
15
 
16
  ## Requirements Coverage
grid_env/Server/Dockerfile CHANGED
@@ -5,7 +5,7 @@ WORKDIR /app
5
  ENV PYTHONDONTWRITEBYTECODE=1
6
  ENV PYTHONUNBUFFERED=1
7
 
8
- COPY grid_env/Server/requirements.txt /app/requirements.txt
9
  RUN pip install --no-cache-dir -r /app/requirements.txt
10
 
11
  COPY . /app
 
5
  ENV PYTHONDONTWRITEBYTECODE=1
6
  ENV PYTHONUNBUFFERED=1
7
 
8
+ COPY requirements.txt /app/requirements.txt
9
  RUN pip install --no-cache-dir -r /app/requirements.txt
10
 
11
  COPY . /app
pyproject.toml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "rl-env"
7
+ version = "0.1.0"
8
+ description = "MiniGrid-style warehouse fulfillment RL environment"
9
+ requires-python = ">=3.10"
10
+ dependencies = [
11
+ "fastapi>=0.100",
12
+ "pydantic>=2.0",
13
+ "openai>=1.0",
14
+ "uvicorn>=0.20",
15
+ "pyyaml>=6.0",
16
+ ]
17
+
18
+ [project.optional-dependencies]
19
+ test = [
20
+ "pytest>=8.0",
21
+ "pytest-timeout>=2.0",
22
+ ]
23
+
24
+ [tool.setuptools.packages.find]
25
+ where = ["."]
26
+ include = ["grid_env*"]
27
+
28
+ [tool.pytest.ini_options]
29
+ testpaths = ["tests"]
30
+ timeout = 60
grid_env/Server/requirement.text → requirements.txt RENAMED
@@ -2,3 +2,5 @@ fastapi==0.116.1
2
  pydantic==2.11.7
3
  openai==1.108.1
4
  uvicorn==0.35.0
 
 
 
2
  pydantic==2.11.7
3
  openai==1.108.1
4
  uvicorn==0.35.0
5
+ pytest>=8.0
6
+ pyyaml>=6.0
rl_env.egg-info/PKG-INFO ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: rl-env
3
+ Version: 0.1.0
4
+ Summary: MiniGrid-style warehouse fulfillment RL environment
5
+ Requires-Python: >=3.10
6
+ License-File: LICENSE
7
+ Requires-Dist: fastapi>=0.100
8
+ Requires-Dist: pydantic>=2.0
9
+ Requires-Dist: openai>=1.0
10
+ Requires-Dist: uvicorn>=0.20
11
+ Requires-Dist: pyyaml>=6.0
12
+ Provides-Extra: test
13
+ Requires-Dist: pytest>=8.0; extra == "test"
14
+ Requires-Dist: pytest-timeout>=2.0; extra == "test"
15
+ Dynamic: license-file
rl_env.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ grid_env/__init__.py
5
+ grid_env/baseline.py
6
+ grid_env/client.py
7
+ grid_env/env.py
8
+ grid_env/graders.py
9
+ grid_env/models.py
10
+ grid_env/tasks.py
11
+ grid_env/Server/__init__.py
12
+ grid_env/Server/app.py
13
+ grid_env/Server/warehouse_env.py
14
+ rl_env.egg-info/PKG-INFO
15
+ rl_env.egg-info/SOURCES.txt
16
+ rl_env.egg-info/dependency_links.txt
17
+ rl_env.egg-info/requires.txt
18
+ rl_env.egg-info/top_level.txt
19
+ tests/test_baseline_stub.py
20
+ tests/test_env_smoke.py
21
+ tests/test_graders.py
22
+ tests/test_openenv_spec.py
23
+ tests/test_tasks.py
rl_env.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
rl_env.egg-info/requires.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi>=0.100
2
+ pydantic>=2.0
3
+ openai>=1.0
4
+ uvicorn>=0.20
5
+ pyyaml>=6.0
6
+
7
+ [test]
8
+ pytest>=8.0
9
+ pytest-timeout>=2.0
rl_env.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ grid_env
tests/conftest.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Shared fixtures for the warehouse fulfillment test suite.
3
+ """
4
+
5
+ import pytest
6
+ from grid_env.env import WarehouseFulfillmentEnv
7
+
8
+
9
@pytest.fixture()
def env_easy():
    """Provide an already-reset environment for the ``easy_single_pick`` task (seed 7)."""
    warehouse = WarehouseFulfillmentEnv(task_id="easy_single_pick", seed=7)
    # The value returned by reset() is discarded; tests receive the env itself.
    warehouse.reset()
    return warehouse
14
+
15
+
16
@pytest.fixture()
def env_medium():
    """Provide an already-reset environment for the ``medium_multi_item`` task (seed 7)."""
    warehouse = WarehouseFulfillmentEnv(task_id="medium_multi_item", seed=7)
    # The value returned by reset() is discarded; tests receive the env itself.
    warehouse.reset()
    return warehouse
21
+
22
+
23
@pytest.fixture()
def env_hard():
    """Provide an already-reset environment for the ``hard_restock_priority`` task (seed 7)."""
    warehouse = WarehouseFulfillmentEnv(task_id="hard_restock_priority", seed=7)
    # The value returned by reset() is discarded; tests receive the env itself.
    warehouse.reset()
    return warehouse
tests/test_baseline_stub.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Baseline stub tests — runs the baseline runner without an OpenAI API key
3
+ by monkey-patching the client with a deterministic stub.
4
+ """
5
+
6
+ import types
7
+ from unittest.mock import MagicMock, patch
8
+
9
+ import pytest
10
+
11
+ from grid_env import WarehouseFulfillmentEnv
12
+ from grid_env.baseline import run_task, format_report
13
+ from grid_env.graders import grade_episode
14
+ from grid_env.models import BaselineCommand
15
+
16
+
17
+ TASK_IDS = ["easy_single_pick", "medium_multi_item", "hard_restock_priority"]
18
+
19
+ # Cycle of deterministic actions that exercise most code paths without getting stuck.
20
+ _ACTION_CYCLE = [
21
+ "turn_right",
22
+ "move_forward",
23
+ "turn_left",
24
+ "scan_bin",
25
+ "pick_item",
26
+ "move_forward",
27
+ "turn_right",
28
+ "move_forward",
29
+ "pack_item",
30
+ "wait",
31
+ ]
32
+
33
+
34
def _make_stub_client(action_cycle=None):
    """Build a mock OpenAI client whose ``responses.create()`` returns stub actions.

    Args:
        action_cycle: Optional sequence of command strings replayed in order,
            wrapping around at the end. A falsy value (``None`` or an empty
            sequence) selects the module-level ``_ACTION_CYCLE``.

    Returns:
        A ``MagicMock`` whose ``responses.create`` returns, on each call, a
        mock response with ``output_text`` set to the JSON dump of a
        ``BaselineCommand`` for the next action in the cycle.
    """
    import itertools
    import json

    # itertools.cycle replaces the hand-rolled modulo counter kept in a dict.
    upcoming = itertools.cycle(action_cycle or _ACTION_CYCLE)

    def fake_create(**kwargs):
        # Keyword arguments of the real API call are accepted and ignored.
        cmd = BaselineCommand(command=next(upcoming), rationale="stub")
        response = MagicMock()
        response.output_text = json.dumps(cmd.model_dump())
        return response

    client = MagicMock()
    client.responses.create.side_effect = fake_create
    return client
51
+
52
+
53
@pytest.fixture()
def stub_client():
    """Provide a stub OpenAI client that replays the default action cycle."""
    client = _make_stub_client()
    return client
56
+
57
+
58
@pytest.mark.parametrize("task_id", TASK_IDS)
def test_run_task_returns_required_keys(task_id):
    """run_task returns a dict with score, reward, steps, success, task_id."""
    # patch.dict injects only the API key and restores os.environ on exit.
    # Patching os.environ.get itself replaced the method process-wide and made
    # every other environment variable look unset while the test ran.
    with patch("grid_env.baseline.OpenAI", return_value=_make_stub_client()), \
            patch.dict("os.environ", {"OPENAI_API_KEY": "stub-key"}):
        result = run_task(task_id, model="stub", seed=7)
    assert set(result.keys()) >= {"task_id", "score", "reward", "steps", "success"}
65
+
66
+
67
@pytest.mark.parametrize("task_id", TASK_IDS)
def test_run_task_score_in_range(task_id):
    """Score from run_task is always in [0, 1]."""
    # patch.dict injects only the API key and restores os.environ on exit.
    # Patching os.environ.get itself replaced the method process-wide and made
    # every other environment variable look unset while the test ran.
    with patch("grid_env.baseline.OpenAI", return_value=_make_stub_client()), \
            patch.dict("os.environ", {"OPENAI_API_KEY": "stub-key"}):
        result = run_task(task_id, model="stub", seed=7)
    assert 0.0 <= result["score"] <= 1.0, f"Score {result['score']} out of range"
74
+
75
+
76
@pytest.mark.parametrize("task_id", TASK_IDS)
def test_run_task_steps_within_max(task_id):
    """Steps taken must not exceed the task's max_steps."""
    from grid_env.tasks import get_task

    max_steps = get_task(task_id).max_steps
    # patch.dict injects only the API key and restores os.environ on exit.
    # Patching os.environ.get itself replaced the method process-wide and made
    # every other environment variable look unset while the test ran.
    with patch("grid_env.baseline.OpenAI", return_value=_make_stub_client()), \
            patch.dict("os.environ", {"OPENAI_API_KEY": "stub-key"}):
        result = run_task(task_id, model="stub", seed=7)
    assert result["steps"] <= max_steps
85
+
86
+
87
def test_run_task_task_id_in_result():
    """run_task echoes the requested task id back in its result dict."""
    # patch.dict injects only the API key and restores os.environ on exit.
    # Patching os.environ.get itself replaced the method process-wide and made
    # every other environment variable look unset while the test ran.
    with patch("grid_env.baseline.OpenAI", return_value=_make_stub_client()), \
            patch.dict("os.environ", {"OPENAI_API_KEY": "stub-key"}):
        result = run_task("easy_single_pick", model="stub", seed=7)
    assert result["task_id"] == "easy_single_pick"
92
+
93
+
94
def test_format_report_contains_model_and_scores():
    """format_report mentions the model name, a task id and the mean_score label."""
    easy_row = {"task_id": "easy_single_pick", "score": 0.75, "reward": 1.2, "steps": 15, "success": 1.0}
    medium_row = {"task_id": "medium_multi_item", "score": 0.5, "reward": 0.8, "steps": 30, "success": 0.0}
    rendered = format_report([easy_row, medium_row], model="test-model")
    assert "test-model" in rendered
    assert "easy_single_pick" in rendered
    assert "mean_score" in rendered
103
+
104
+
105
def test_deterministic_seed_reproducibility():
    """Same seed must produce the same final score on two separate runs."""
    client1 = _make_stub_client()
    client2 = _make_stub_client()
    # patch.dict injects only the API key and restores os.environ on exit.
    # Patching os.environ.get itself replaced the method process-wide and made
    # every other environment variable look unset while the test ran.
    with patch("grid_env.baseline.OpenAI", return_value=client1), \
            patch.dict("os.environ", {"OPENAI_API_KEY": "stub-key"}):
        r1 = run_task("easy_single_pick", model="stub", seed=7)
    with patch("grid_env.baseline.OpenAI", return_value=client2), \
            patch.dict("os.environ", {"OPENAI_API_KEY": "stub-key"}):
        r2 = run_task("easy_single_pick", model="stub", seed=7)
    assert r1["score"] == r2["score"]
    assert r1["steps"] == r2["steps"]
tests/test_env_smoke.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Smoke tests: environment instantiation, reset, step, and episode termination
3
+ for all three task IDs.
4
+ """
5
+
6
+ import pytest
7
+ from grid_env.env import WarehouseFulfillmentEnv, available_tasks
8
+ from grid_env.graders import grade_episode
9
+ from grid_env.models import WarehouseObservation, WarehouseReward
10
+
11
+
12
+ TASK_IDS = ["easy_single_pick", "medium_multi_item", "hard_restock_priority"]
13
+ ALL_ACTIONS = [
14
+ "turn_left",
15
+ "turn_right",
16
+ "move_forward",
17
+ "scan_bin",
18
+ "pick_item",
19
+ "pack_item",
20
+ "recharge",
21
+ "wait",
22
+ ]
23
+
24
+
25
@pytest.mark.parametrize("task_id", TASK_IDS)
def test_env_instantiation(task_id):
    """Constructing the environment works for every task and binds that task."""
    created = WarehouseFulfillmentEnv(task_id=task_id, seed=7)
    assert created is not None
    bound_task_id = created.task.task_id
    assert bound_task_id == task_id
31
+
32
+
33
@pytest.mark.parametrize("task_id", TASK_IDS)
def test_reset_returns_observation(task_id):
    """reset() hands back a WarehouseObservation with battery and a pending order."""
    first_obs = WarehouseFulfillmentEnv(task_id=task_id, seed=7).reset()
    assert isinstance(first_obs, WarehouseObservation)
    assert first_obs.task_id == task_id
    assert first_obs.battery_level > 0
    pending = first_obs.pending_order
    assert isinstance(pending, list)
    assert len(pending) > 0
43
+
44
+
45
@pytest.mark.parametrize("action", ALL_ACTIONS)
def test_step_all_actions_no_crash(action):
    """Each action string can be stepped once on the easy task without raising."""
    warehouse = WarehouseFulfillmentEnv(task_id="easy_single_pick", seed=7)
    warehouse.reset()
    step_result = warehouse.step(action)
    obs, reward, done, info = step_result
    assert isinstance(obs, WarehouseObservation)
    assert isinstance(reward, WarehouseReward)
    assert isinstance(done, bool)
    assert isinstance(info, dict)
55
+
56
+
57
@pytest.mark.parametrize("task_id", TASK_IDS)
def test_step_returns_correct_types(task_id):
    """step() yields (observation, reward, done, info) with the expected types."""
    warehouse = WarehouseFulfillmentEnv(task_id=task_id, seed=7)
    warehouse.reset()
    step_result = warehouse.step("wait")
    obs, reward, done, info = step_result
    assert isinstance(obs, WarehouseObservation)
    assert isinstance(reward, WarehouseReward)
    assert isinstance(done, bool)
    # info must expose the completion ratio key.
    assert "completion_ratio" in info
67
+
68
+
69
@pytest.mark.parametrize("task_id", TASK_IDS)
def test_episode_terminates_at_max_steps(task_id):
    """Repeated wait actions end the episode within max_steps plus a margin of 5."""
    warehouse = WarehouseFulfillmentEnv(task_id=task_id, seed=7)
    warehouse.reset()
    max_steps = warehouse.task.max_steps
    remaining = max_steps + 5
    done = False
    # Stop as soon as the env reports done or the step budget runs out.
    while remaining > 0 and not done:
        _, _, done, _ = warehouse.step("wait")
        remaining -= 1
    assert done, f"Episode did not terminate after {max_steps} steps for {task_id}"
81
+
82
+
83
@pytest.mark.parametrize("task_id", TASK_IDS)
def test_score_in_range_after_episode(task_id):
    """grade_episode() always returns a float in [0.0, 1.0]."""
    env = WarehouseFulfillmentEnv(task_id=task_id, seed=7)
    env.reset()
    # Bounded loop: an unbounded `while not done` would hang the suite if
    # termination ever regressed. Fail fast with a clear message instead.
    step_budget = env.task.max_steps + 5
    done = False
    for _ in range(step_budget):
        _, _, done, _ = env.step("wait")
        if done:
            break
    assert done, f"Episode did not terminate within {step_budget} steps for {task_id}"
    state = env.state()
    score = grade_episode(state)
    assert isinstance(score, float)
    assert 0.0 <= score <= 1.0, f"Score {score} out of range for {task_id}"
95
+
96
+
97
def test_step_after_done_is_safe():
    """One more step after max_steps wait actions still reports done and does not raise."""
    warehouse = WarehouseFulfillmentEnv(task_id="easy_single_pick", seed=7)
    warehouse.reset()
    steps_to_exhaust = warehouse.task.max_steps
    for _ in range(steps_to_exhaust):
        warehouse.step("wait")
    # The extra step happens after the step budget has been used up.
    extra_result = warehouse.step("wait")
    done = extra_result[2]
    assert done
105
+
106
+
107
def test_available_tasks_returns_all_three():
    """available_tasks() lists exactly the task ids in TASK_IDS."""
    listed_ids = set()
    for entry in available_tasks():
        listed_ids.add(entry["task_id"])
    assert listed_ids == set(TASK_IDS)
112
+
113
+
114
def test_state_method_returns_consistent_data():
    """state() reports two steps and the right task id after two actions."""
    warehouse = WarehouseFulfillmentEnv(task_id="easy_single_pick", seed=7)
    warehouse.reset()
    for command in ("turn_left", "move_forward"):
        warehouse.step(command)
    snapshot = warehouse.state()
    assert snapshot.step_count == 2
    assert snapshot.task_id == "easy_single_pick"
tests/test_graders.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unit tests for the deterministic graders.
3
+ """
4
+
5
+ import pytest
6
+ from grid_env.graders import grade_easy, grade_episode, grade_hard, grade_medium, _clamp
7
+ from grid_env.models import WarehouseMetrics, WarehouseState, BinState, OrderLine
8
+
9
+
10
def _make_state(
    task_id: str = "easy_single_pick",
    completion_ratio: float = 0.0,
    step_count: int = 5,
    max_steps: int = 40,
    correct_scans: int = 0,
    wrong_scans: int = 0,
    correct_picks: int = 0,
    wrong_picks: int = 0,
    correct_packs: int = 0,
    invalid_actions: int = 0,
    recharges: int = 0,
    battery_depletion_events: int = 0,
) -> WarehouseState:
    """Assemble a finished-episode ``WarehouseState`` for grader tests.

    The metric counters, ``task_id``, ``step_count``, ``max_steps`` and
    ``completion_ratio`` come from the arguments; everything else is fixed.
    The state is always marked ``done``, ``success`` is set when
    ``completion_ratio >= 1.0``, and ``difficulty`` is always ``"easy"``
    whatever the ``task_id``.
    """
    counters = WarehouseMetrics(
        correct_scans=correct_scans,
        wrong_scans=wrong_scans,
        correct_picks=correct_picks,
        wrong_picks=wrong_picks,
        correct_packs=correct_packs,
        invalid_actions=invalid_actions,
        recharges=recharges,
        battery_depletion_events=battery_depletion_events,
    )
    stocked_bin = BinState(bin_id="A1", position=(2, 1), sku="thermometer", quantity=2)
    ordered_line = OrderLine(sku="thermometer", quantity=1)
    # The packed quantity is the completion ratio truncated to an integer.
    packed_line = OrderLine(sku="thermometer", quantity=int(completion_ratio))
    is_success = completion_ratio >= 1.0
    return WarehouseState(
        episode_id="test-ep",
        task_id=task_id,
        difficulty="easy",
        step_count=step_count,
        done=True,
        success=is_success,
        max_steps=max_steps,
        grid_size=(7, 7),
        agent_position=(1, 1),
        heading="E",
        carrying=None,
        battery_level=30,
        battery_capacity=36,
        dock_position=(1, 1),
        pack_station_position=(5, 5),
        charger_position=(1, 5),
        bins=[stocked_bin],
        order=[ordered_line],
        packed_order=[packed_line],
        scanned_bins=[],
        metrics=counters,
        action_history=[],
        total_reward=0.0,
        completion_ratio=completion_ratio,
        task_description="Test",
    )
61
+
62
+
63
+ # ── _clamp ────────────────────────────────────────────────────────────────────
64
+
65
def test_clamp_above_one():
    """_clamp maps 1.5 to 1.0."""
    clamped = _clamp(1.5)
    assert clamped == 1.0
67
+
68
+
69
def test_clamp_below_zero():
    """_clamp maps -0.3 to 0.0."""
    clamped = _clamp(-0.3)
    assert clamped == 0.0
71
+
72
+
73
def test_clamp_within_range():
    """_clamp leaves 0.75 unchanged, to within 1e-4."""
    clamped = _clamp(0.75)
    expected = pytest.approx(0.75, abs=1e-4)
    assert clamped == expected
75
+
76
+
77
+ # ── grade_easy ────────────────────────────────────────────────────────────────
78
+
79
def test_grade_easy_zero_completion():
    """Zero completion on the easy task gives a low, non-negative score."""
    # NOTE(review): per the original author's note, an efficiency bonus still
    # applies here (only 5 of 40 steps used), so the score is not required to
    # be exactly 0 — confirm the weights against grade_easy.
    zero_state = _make_state("easy_single_pick", completion_ratio=0.0)
    score = grade_easy(zero_state)
    assert 0.0 <= score < 0.30, f"Expected low score at zero completion, got {score}"
85
+
86
+
87
def test_grade_easy_full_completion_no_penalty():
    """A complete, penalty-free easy episode scores at least 0.75."""
    scenario = dict(
        completion_ratio=1.0,
        step_count=10,
        max_steps=40,
        invalid_actions=0,
        wrong_picks=0,
    )
    finished = _make_state("easy_single_pick", **scenario)
    assert grade_easy(finished) >= 0.75
98
+
99
+
100
def test_grade_easy_wrong_picks_reduce_score():
    """Three wrong picks score strictly lower than none, all else equal."""
    clean = _make_state("easy_single_pick", completion_ratio=1.0, wrong_picks=0)
    sloppy = _make_state("easy_single_pick", completion_ratio=1.0, wrong_picks=3)
    sloppy_score = grade_easy(sloppy)
    clean_score = grade_easy(clean)
    assert sloppy_score < clean_score
104
+
105
+
106
def test_grade_easy_clamped():
    """grade_easy stays within [0, 1] for a complete, penalty-free episode."""
    finished = _make_state("easy_single_pick", completion_ratio=1.0, wrong_picks=0, invalid_actions=0)
    score = grade_easy(finished)
    assert 0.0 <= score <= 1.0
109
+
110
+
111
+ # ── grade_medium ──────────────────────────────────────────────────────────────
112
+
113
def test_grade_medium_zero_completion():
    """Zero completion on the medium task gives a low, non-negative score."""
    # NOTE(review): the original author's note says bonus terms can be non-zero
    # when few steps are used, hence "low" rather than exactly 0 — confirm
    # against grade_medium.
    zero_state = _make_state("medium_multi_item", completion_ratio=0.0)
    score = grade_medium(zero_state)
    assert 0.0 <= score < 0.30, f"Expected low score at zero completion, got {score}"
118
+
119
+
120
def test_grade_medium_full_completion_with_scans():
    """A complete medium episode with two correct scans scores at least 0.75."""
    scenario = dict(
        completion_ratio=1.0,
        correct_scans=2,
        wrong_picks=0,
        invalid_actions=0,
        step_count=20,
        max_steps=60,
    )
    finished = _make_state("medium_multi_item", **scenario)
    assert grade_medium(finished) >= 0.75
132
+
133
+
134
def test_grade_medium_scans_improve_score():
    """Two correct scans score strictly higher than none at half completion."""
    unscanned = _make_state("medium_multi_item", completion_ratio=0.5, correct_scans=0)
    scanned = _make_state("medium_multi_item", completion_ratio=0.5, correct_scans=2)
    scanned_score = grade_medium(scanned)
    unscanned_score = grade_medium(unscanned)
    assert scanned_score > unscanned_score
138
+
139
+
140
def test_grade_medium_clamped():
    """grade_medium stays within [0, 1] for a complete episode with scans."""
    finished = _make_state("medium_multi_item", completion_ratio=1.0, correct_scans=2)
    score = grade_medium(finished)
    assert 0.0 <= score <= 1.0
143
+
144
+
145
+ # ── grade_hard ────────────────────────────────────────────────────────────────
146
+
147
def test_grade_hard_zero_completion():
    """Zero completion on the hard task gives a low, non-negative score."""
    # NOTE(review): the original author's note says efficiency and other
    # bonuses contribute a non-zero score here — confirm against grade_hard.
    zero_state = _make_state("hard_restock_priority", completion_ratio=0.0)
    score = grade_hard(zero_state)
    assert 0.0 <= score < 0.30, f"Expected low score at zero completion, got {score}"
152
+
153
+
154
def test_grade_hard_full_completion_with_recharge():
    """A complete hard episode with scans, one recharge and no depletion scores at least 0.75."""
    scenario = dict(
        completion_ratio=1.0,
        correct_scans=3,
        recharges=1,
        battery_depletion_events=0,
        wrong_picks=0,
        invalid_actions=0,
        step_count=30,
        max_steps=85,
    )
    finished = _make_state("hard_restock_priority", **scenario)
    assert grade_hard(finished) >= 0.75
168
+
169
+
170
def test_grade_hard_battery_depletion_penalises():
    """One battery depletion event scores strictly lower than none."""
    healthy = _make_state("hard_restock_priority", completion_ratio=1.0, battery_depletion_events=0)
    drained = _make_state("hard_restock_priority", completion_ratio=1.0, battery_depletion_events=1)
    healthy_score = grade_hard(healthy)
    drained_score = grade_hard(drained)
    assert healthy_score > drained_score
174
+
175
+
176
def test_grade_hard_clamped():
    """grade_hard stays within [0, 1] for a complete episode."""
    finished = _make_state("hard_restock_priority", completion_ratio=1.0)
    score = grade_hard(finished)
    assert 0.0 <= score <= 1.0
179
+
180
+
181
+ # ── grade_episode dispatcher ──────────────────────────────────────────────────
182
+
183
@pytest.mark.parametrize(
    "task_id,grader",
    [
        ("easy_single_pick", grade_easy),
        ("medium_multi_item", grade_medium),
        ("hard_restock_priority", grade_hard),
    ],
)
def test_grade_episode_dispatches_correctly(task_id, grader):
    """grade_episode returns the same value as the grader paired with the task id."""
    half_done = _make_state(task_id, completion_ratio=0.5)
    dispatched = grade_episode(half_done)
    direct = grader(half_done)
    assert dispatched == direct
194
+
195
+
196
def test_grade_episode_unknown_task_raises():
    """grade_episode raises KeyError for a task id it does not know."""
    unknown = _make_state("nonexistent_task", completion_ratio=0.5)
    with pytest.raises(KeyError):
        grade_episode(unknown)
tests/test_openenv_spec.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OpenEnv spec compliance tests — validates openv.yaml structure without
3
+ requiring the openenv package to be installed.
4
+ """
5
+
6
+ import importlib
7
+ from pathlib import Path
8
+
9
+ import pytest
10
+ import yaml
11
+
12
+
13
+ SPEC_PATH = Path(__file__).parent.parent / "grid_env" / "openv.yaml"
14
+
15
+ REQUIRED_TOP_LEVEL = {"spec_version", "name", "entrypoint", "models", "methods", "tasks"}
16
+ REQUIRED_MODELS = {"action", "observation", "state"}
17
+ REQUIRED_TASK_FIELDS = {"id", "grader"}
18
+
19
+
20
@pytest.fixture(scope="module")
def spec():
    """Load and parse ``openv.yaml`` once per test module.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file at ``SPEC_PATH``.

    Fails with an assertion error when the file does not exist.
    """
    assert SPEC_PATH.exists(), f"openv.yaml not found at {SPEC_PATH}"
    # Explicit UTF-8: the default text encoding is platform dependent, so the
    # spec could otherwise decode differently (or fail) outside UTF-8 locales.
    with SPEC_PATH.open(encoding="utf-8") as f:
        return yaml.safe_load(f)
25
+
26
+
27
def test_yaml_parses_successfully(spec):
    """The parsed spec is a non-None dict."""
    parsed = spec
    assert parsed is not None
    assert isinstance(parsed, dict)
30
+
31
+
32
def test_required_top_level_fields_present(spec):
    """Every key in REQUIRED_TOP_LEVEL appears at the top of the spec."""
    present = set(spec.keys())
    missing = REQUIRED_TOP_LEVEL - present
    assert not missing, f"Missing top-level fields: {missing}"
35
+
36
+
37
def test_spec_version_is_string(spec):
    """spec_version is a string with non-whitespace content."""
    version = spec["spec_version"]
    assert isinstance(version, str)
    assert version.strip() != ""
40
+
41
+
42
def test_name_is_non_empty_string(spec):
    """name is a string with non-whitespace content."""
    declared_name = spec["name"]
    assert isinstance(declared_name, str)
    assert declared_name.strip() != ""
45
+
46
+
47
def test_entrypoint_format(spec):
    """The entrypoint is a string containing a ':' separator."""
    target = spec["entrypoint"]
    assert isinstance(target, str)
    has_separator = ":" in target
    assert has_separator, "entrypoint must be 'module:ClassName'"
52
+
53
+
54
def test_entrypoint_module_is_importable(spec):
    """The module half of the entrypoint can be imported."""
    # Two-way unpacking is kept on purpose: an entrypoint without ':' raises here.
    module_path, _attr_name = spec["entrypoint"].split(":", 1)
    try:
        importlib.import_module(module_path)
    except ImportError as exc:
        pytest.fail(f"Entrypoint module '{module_path}' is not importable: {exc}")
60
+
61
+
62
def test_models_has_required_keys(spec):
    """The models section declares every key in REQUIRED_MODELS."""
    declared = set(spec.get("models", {}).keys())
    missing = REQUIRED_MODELS - declared
    assert not missing, f"Missing model keys: {missing}"
66
+
67
+
68
def test_methods_contains_reset_step_state(spec):
    """The methods list includes reset, step and state."""
    methods = set(spec.get("methods", []))
    required = {"reset", "step", "state"}
    assert required <= methods, \
        f"methods must include reset, step, state. Got: {methods}"
72
+
73
+
74
def test_tasks_list_has_at_least_three_entries(spec):
    """The spec declares three or more tasks."""
    tasks = spec.get("tasks", [])
    task_count = len(tasks)
    assert task_count >= 3, f"Expected ≥3 tasks, got {len(tasks)}"
77
+
78
+
79
def test_each_task_has_required_fields(spec):
    """Every task entry carries all keys in REQUIRED_TASK_FIELDS."""
    declared_tasks = spec.get("tasks", [])
    for task in declared_tasks:
        present = set(task.keys())
        missing = REQUIRED_TASK_FIELDS - present
        assert not missing, f"Task {task} missing fields: {missing}"
83
+
84
+
85
def test_task_ids_are_unique(spec):
    """No task id occurs more than once in the spec."""
    ids = []
    for entry in spec.get("tasks", []):
        ids.append(entry["id"])
    distinct = set(ids)
    assert len(ids) == len(distinct), "Duplicate task IDs found in spec"
88
+
89
+
90
def test_grader_references_are_importable(spec):
    """Each task's grader reference is 'module:fn' and resolves to a callable."""
    declared_tasks = spec.get("tasks", [])
    for task in declared_tasks:
        grader_ref = task.get("grader", "")
        assert ":" in grader_ref, f"Grader '{grader_ref}' is not 'module:fn' format"
        mod_path, fn_name = grader_ref.split(":", 1)
        try:
            mod = importlib.import_module(mod_path)
        except ImportError as exc:
            pytest.fail(f"Cannot import grader module '{mod_path}': {exc}")
        # A missing attribute resolves to None, which the callable check rejects.
        resolved = getattr(mod, fn_name, None)
        assert callable(resolved), f"'{fn_name}' in '{mod_path}' is not callable"
102
+
103
+
104
def test_baseline_section_present(spec):
    """The spec has a baseline section with both runner and seed keys."""
    assert "baseline" in spec
    section = spec["baseline"]
    has_runner = "runner" in section
    assert has_runner and "seed" in section
tests/test_tasks.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests for task definitions: presence of all 3 tasks, structural validity,
3
+ and grader callability.
4
+ """
5
+
6
+ import pytest
7
+ from grid_env.tasks import TASKS, get_task, GRID_SIZE
8
+ from grid_env.graders import grade_episode
9
+ from grid_env.env import WarehouseFulfillmentEnv
10
+
11
+
12
+ EXPECTED_TASK_IDS = {"easy_single_pick", "medium_multi_item", "hard_restock_priority"}
13
+
14
+
15
def test_exactly_three_tasks_registered():
    """The TASKS registry holds exactly three entries."""
    registered = len(TASKS)
    assert registered == 3
17
+
18
+
19
def test_all_expected_task_ids_present():
    """The registry keys are exactly EXPECTED_TASK_IDS."""
    registered_ids = set(TASKS.keys())
    assert registered_ids == EXPECTED_TASK_IDS
21
+
22
+
23
# sorted(): a set's iteration order varies with string hash randomization, so
# list(set) gives an unstable parameter order between runs and workers.
@pytest.mark.parametrize("task_id", sorted(EXPECTED_TASK_IDS))
def test_task_has_required_fields(task_id):
    """Each task has a matching id, a known difficulty, positive limits, bins and an order."""
    task = get_task(task_id)
    assert task.task_id == task_id
    assert task.difficulty in {"easy", "medium", "hard"}
    assert task.max_steps > 0
    assert task.battery_capacity > 0
    assert len(task.bins) > 0
    assert len(task.order) > 0
32
+
33
+
34
# sorted(): a set's iteration order varies with string hash randomization, so
# list(set) gives an unstable parameter order between runs and workers.
@pytest.mark.parametrize("task_id", sorted(EXPECTED_TASK_IDS))
def test_task_required_scans_non_empty(task_id):
    """Each task declares at least one required scan."""
    task = get_task(task_id)
    assert len(task.required_scans) > 0, f"{task_id} has no required_scans"
38
+
39
+
40
# sorted(): a set's iteration order varies with string hash randomization, so
# list(set) gives an unstable parameter order between runs and workers.
@pytest.mark.parametrize("task_id", sorted(EXPECTED_TASK_IDS))
def test_task_order_skus_exist_in_bins(task_id):
    """Every SKU in the order exists in at least one bin."""
    task = get_task(task_id)
    bin_skus = {b.sku for b in task.bins}
    for line in task.order:
        assert line.sku in bin_skus, f"{line.sku} ordered but not stocked in {task_id}"
47
+
48
+
49
# sorted(): a set's iteration order varies with string hash randomization, so
# list(set) gives an unstable parameter order between runs and workers.
@pytest.mark.parametrize("task_id", sorted(EXPECTED_TASK_IDS))
def test_required_scans_are_valid_bin_ids(task_id):
    """required_scans reference bin IDs that actually exist."""
    task = get_task(task_id)
    bin_ids = {b.bin_id for b in task.bins}
    for scan_id in task.required_scans:
        assert scan_id in bin_ids, f"required scan {scan_id} not a valid bin in {task_id}"
56
+
57
+
58
# sorted(): a set's iteration order varies with string hash randomization, so
# list(set) gives an unstable parameter order between runs and workers.
@pytest.mark.parametrize("task_id", sorted(EXPECTED_TASK_IDS))
def test_grader_callable_returns_float_in_range(task_id):
    """Run a wait-only episode to termination and verify the grader returns [0,1]."""
    env = WarehouseFulfillmentEnv(task_id=task_id, seed=7)
    env.reset()
    # Bounded loop: an unbounded `while not done` would hang the suite if
    # termination ever regressed. Fail fast with a clear message instead.
    step_budget = get_task(task_id).max_steps + 5
    done = False
    for _ in range(step_budget):
        _, _, done, _ = env.step("wait")
        if done:
            break
    assert done, f"Episode did not terminate within {step_budget} steps for {task_id}"
    state = env.state()
    score = grade_episode(state)
    assert isinstance(score, float)
    assert 0.0 <= score <= 1.0
70
+
71
+
72
def test_get_task_raises_on_unknown_id():
    """get_task raises KeyError with an 'Unknown task_id' message for a bad id."""
    unknown_id = "does_not_exist"
    with pytest.raises(KeyError, match="Unknown task_id"):
        get_task(unknown_id)
75
+
76
+
77
def test_grid_size_is_positive_tuple():
    """GRID_SIZE has two entries and both are positive."""
    assert len(GRID_SIZE) == 2
    first_dim = GRID_SIZE[0]
    assert first_dim > 0 and GRID_SIZE[1] > 0