Spaces:

sohambose98
/

mini-rl-env

Sleeping

App Files Files Community

sohambose98 committed on Mar 29

Commit

ea847ad

1 Parent(s): 97ac6b2

build configs added and added smoke tests

Browse files

Files changed (16) hide show

.github/workflows/validate.yml +66 -0
README.md +11 -0
grid_env/Server/Dockerfile +1 -1
pyproject.toml +30 -0
grid_env/Server/requirement.text → requirements.txt +2 -0
rl_env.egg-info/PKG-INFO +15 -0
rl_env.egg-info/SOURCES.txt +23 -0
rl_env.egg-info/dependency_links.txt +1 -0
rl_env.egg-info/requires.txt +9 -0
rl_env.egg-info/top_level.txt +1 -0
tests/conftest.py +27 -0
tests/test_baseline_stub.py +116 -0
tests/test_env_smoke.py +122 -0
tests/test_graders.py +199 -0
tests/test_openenv_spec.py +107 -0
tests/test_tasks.py +79 -0

.github/workflows/validate.yml ADDED Viewed

	@@ -0,0 +1,66 @@

+name: Phase 1 — Automated Validation
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+jobs:
+  # ── Job 1: Unit & integration tests ────────────────────────────────────────
+  test:
+    name: pytest
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: "pip"
+      - name: Install package and test dependencies
+        run: pip install -e ".[test]"
+      - name: Run test suite
+        run: pytest tests/ -v --tb=short
+  # ── Job 2: Dockerfile build gate ───────────────────────────────────────────
+  docker-build:
+    name: docker build
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Build Docker image (no push)
+        run: docker build -t rl-env-server:ci -f grid_env/Server/Dockerfile .
+  # ── Job 3: OpenEnv spec validation (best-effort) ───────────────────────────
+  openenv-validate:
+    name: openenv spec check
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install package
+        run: pip install -e .
+      - name: Try openenv validate (non-blocking if openenv not available)
+        run: |
+          if pip install openenv 2>/dev/null; then
+            openenv validate grid_env/openv.yaml
+          else
+            echo "openenv package not available on PyPI — skipping CLI validate (YAML tests cover compliance)"
+          fi

README.md CHANGED Viewed

@@ -1,5 +1,16 @@
 # RL-Env
 This repo contains a real-world OpenEnv-style warehouse fulfillment environment. The simulated task is a pharmacy micro-fulfillment workflow: scan the right bins, pick the right items, pack them correctly, and manage battery under time pressure.
 ## Requirements Coverage

+---
+title: RL-Env Warehouse Fulfillment
+emoji: 🏭
+colorFrom: blue
+colorTo: green
+sdk: docker
+app_port: 8000
+pinned: false
+---
 # RL-Env
 This repo contains a real-world OpenEnv-style warehouse fulfillment environment. The simulated task is a pharmacy micro-fulfillment workflow: scan the right bins, pick the right items, pack them correctly, and manage battery under time pressure.
 ## Requirements Coverage

grid_env/Server/Dockerfile CHANGED Viewed

@@ -5,7 +5,7 @@ WORKDIR /app
 ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
-COPY grid_env/Server/requirements.txt /app/requirements.txt
 RUN pip install --no-cache-dir -r /app/requirements.txt
 COPY . /app

 ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
+COPY requirements.txt /app/requirements.txt
 RUN pip install --no-cache-dir -r /app/requirements.txt
 COPY . /app

pyproject.toml ADDED Viewed

	@@ -0,0 +1,30 @@

+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "rl-env"
+version = "0.1.0"
+description = "MiniGrid-style warehouse fulfillment RL environment"
+requires-python = ">=3.10"
+dependencies = [
+    "fastapi>=0.100",
+    "pydantic>=2.0",
+    "openai>=1.0",
+    "uvicorn>=0.20",
+    "pyyaml>=6.0",
+]
+[project.optional-dependencies]
+test = [
+    "pytest>=8.0",
+    "pytest-timeout>=2.0",
+]
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["grid_env*"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+timeout = 60

grid_env/Server/requirement.text → requirements.txt RENAMED Viewed

@@ -2,3 +2,5 @@ fastapi==0.116.1
 pydantic==2.11.7
 openai==1.108.1
 uvicorn==0.35.0

 pydantic==2.11.7
 openai==1.108.1
 uvicorn==0.35.0
+pytest>=8.0
+pyyaml>=6.0

rl_env.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,15 @@

+Metadata-Version: 2.4
+Name: rl-env
+Version: 0.1.0
+Summary: MiniGrid-style warehouse fulfillment RL environment
+Requires-Python: >=3.10
+License-File: LICENSE
+Requires-Dist: fastapi>=0.100
+Requires-Dist: pydantic>=2.0
+Requires-Dist: openai>=1.0
+Requires-Dist: uvicorn>=0.20
+Requires-Dist: pyyaml>=6.0
+Provides-Extra: test
+Requires-Dist: pytest>=8.0; extra == "test"
+Requires-Dist: pytest-timeout>=2.0; extra == "test"
+Dynamic: license-file

rl_env.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+LICENSE
+README.md
+pyproject.toml
+grid_env/__init__.py
+grid_env/baseline.py
+grid_env/client.py
+grid_env/env.py
+grid_env/graders.py
+grid_env/models.py
+grid_env/tasks.py
+grid_env/Server/__init__.py
+grid_env/Server/app.py
+grid_env/Server/warehouse_env.py
+rl_env.egg-info/PKG-INFO
+rl_env.egg-info/SOURCES.txt
+rl_env.egg-info/dependency_links.txt
+rl_env.egg-info/requires.txt
+rl_env.egg-info/top_level.txt
+tests/test_baseline_stub.py
+tests/test_env_smoke.py
+tests/test_graders.py
+tests/test_openenv_spec.py
+tests/test_tasks.py

rl_env.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

rl_env.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+fastapi>=0.100
+pydantic>=2.0
+openai>=1.0
+uvicorn>=0.20
+pyyaml>=6.0
+[test]
+pytest>=8.0
+pytest-timeout>=2.0

rl_env.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ grid_env

tests/conftest.py ADDED Viewed

	@@ -0,0 +1,27 @@

+"""
+Shared fixtures for the warehouse fulfillment test suite.
+"""
+import pytest
+from grid_env.env import WarehouseFulfillmentEnv
+@pytest.fixture()
+def env_easy():
+    env = WarehouseFulfillmentEnv(task_id="easy_single_pick", seed=7)
+    env.reset()
+    return env
+@pytest.fixture()
+def env_medium():
+    env = WarehouseFulfillmentEnv(task_id="medium_multi_item", seed=7)
+    env.reset()
+    return env
+@pytest.fixture()
+def env_hard():
+    env = WarehouseFulfillmentEnv(task_id="hard_restock_priority", seed=7)
+    env.reset()
+    return env

tests/test_baseline_stub.py ADDED Viewed

	@@ -0,0 +1,116 @@

+"""
+Baseline stub tests — run the baseline runner without an OpenAI API key
+by monkey-patching the client with a deterministic stub.
+"""
+import types
+from unittest.mock import MagicMock, patch
+import pytest
+from grid_env import WarehouseFulfillmentEnv
+from grid_env.baseline import run_task, format_report
+from grid_env.graders import grade_episode
+from grid_env.models import BaselineCommand
+TASK_IDS = ["easy_single_pick", "medium_multi_item", "hard_restock_priority"]
+# Cycle of deterministic actions that exercise most code paths without getting stuck.
+_ACTION_CYCLE = [
+    "turn_right",
+    "move_forward",
+    "turn_left",
+    "scan_bin",
+    "pick_item",
+    "move_forward",
+    "turn_right",
+    "move_forward",
+    "pack_item",
+    "wait",
+]
+def _make_stub_client(action_cycle=None):
+    """Build a mock OpenAI client whose responses.create() returns stub actions."""
+    actions = action_cycle or _ACTION_CYCLE
+    counter = {"i": 0}
+    def fake_create(**kwargs):
+        action = actions[counter["i"] % len(actions)]
+        counter["i"] += 1
+        cmd = BaselineCommand(command=action, rationale="stub")
+        import json
+        response = MagicMock()
+        response.output_text = json.dumps(cmd.model_dump())
+        return response
+    client = MagicMock()
+    client.responses.create.side_effect = fake_create
+    return client
+@pytest.fixture()
+def stub_client():
+    return _make_stub_client()
+@pytest.mark.parametrize("task_id", TASK_IDS)
+def test_run_task_returns_required_keys(task_id):
+    """run_task returns a dict with score, reward, steps, success, task_id."""
+    with patch("grid_env.baseline.OpenAI", return_value=_make_stub_client()), \
+         patch("grid_env.baseline.os.environ.get", side_effect=lambda k, d=None: "stub-key" if k == "OPENAI_API_KEY" else d):
+        result = run_task(task_id, model="stub", seed=7)
+    assert set(result.keys()) >= {"task_id", "score", "reward", "steps", "success"}
+@pytest.mark.parametrize("task_id", TASK_IDS)
+def test_run_task_score_in_range(task_id):
+    """Score from run_task is always ∈ [0, 1]."""
+    with patch("grid_env.baseline.OpenAI", return_value=_make_stub_client()), \
+         patch("grid_env.baseline.os.environ.get", side_effect=lambda k, d=None: "stub-key" if k == "OPENAI_API_KEY" else d):
+        result = run_task(task_id, model="stub", seed=7)
+    assert 0.0 <= result["score"] <= 1.0, f"Score {result['score']} out of range"
+@pytest.mark.parametrize("task_id", TASK_IDS)
+def test_run_task_steps_within_max(task_id):
+    """Steps taken must not exceed the task's max_steps."""
+    from grid_env.tasks import get_task
+    max_steps = get_task(task_id).max_steps
+    with patch("grid_env.baseline.OpenAI", return_value=_make_stub_client()), \
+         patch("grid_env.baseline.os.environ.get", side_effect=lambda k, d=None: "stub-key" if k == "OPENAI_API_KEY" else d):
+        result = run_task(task_id, model="stub", seed=7)
+    assert result["steps"] <= max_steps
+def test_run_task_task_id_in_result():
+    with patch("grid_env.baseline.OpenAI", return_value=_make_stub_client()), \
+         patch("grid_env.baseline.os.environ.get", side_effect=lambda k, d=None: "stub-key" if k == "OPENAI_API_KEY" else d):
+        result = run_task("easy_single_pick", model="stub", seed=7)
+    assert result["task_id"] == "easy_single_pick"
+def test_format_report_contains_model_and_scores():
+    results = [
+        {"task_id": "easy_single_pick", "score": 0.75, "reward": 1.2, "steps": 15, "success": 1.0},
+        {"task_id": "medium_multi_item", "score": 0.5, "reward": 0.8, "steps": 30, "success": 0.0},
+    ]
+    report = format_report(results, model="test-model")
+    assert "test-model" in report
+    assert "easy_single_pick" in report
+    assert "mean_score" in report
+def test_deterministic_seed_reproducibility():
+    """Same seed must produce the same final score on two separate runs."""
+    client1 = _make_stub_client()
+    client2 = _make_stub_client()
+    with patch("grid_env.baseline.OpenAI", return_value=client1), \
+         patch("grid_env.baseline.os.environ.get", side_effect=lambda k, d=None: "stub-key" if k == "OPENAI_API_KEY" else d):
+        r1 = run_task("easy_single_pick", model="stub", seed=7)
+    with patch("grid_env.baseline.OpenAI", return_value=client2), \
+         patch("grid_env.baseline.os.environ.get", side_effect=lambda k, d=None: "stub-key" if k == "OPENAI_API_KEY" else d):
+        r2 = run_task("easy_single_pick", model="stub", seed=7)
+    assert r1["score"] == r2["score"]
+    assert r1["steps"] == r2["steps"]

tests/test_env_smoke.py ADDED Viewed

	@@ -0,0 +1,122 @@

+"""
+Smoke tests: environment instantiation, reset, step, and episode termination
+for all three task IDs.
+"""
+import pytest
+from grid_env.env import WarehouseFulfillmentEnv, available_tasks
+from grid_env.graders import grade_episode
+from grid_env.models import WarehouseObservation, WarehouseReward
+TASK_IDS = ["easy_single_pick", "medium_multi_item", "hard_restock_priority"]
+ALL_ACTIONS = [
+    "turn_left",
+    "turn_right",
+    "move_forward",
+    "scan_bin",
+    "pick_item",
+    "pack_item",
+    "recharge",
+    "wait",
+]
+@pytest.mark.parametrize("task_id", TASK_IDS)
+def test_env_instantiation(task_id):
+    """Environment can be created for each task without errors."""
+    env = WarehouseFulfillmentEnv(task_id=task_id, seed=7)
+    assert env is not None
+    assert env.task.task_id == task_id
+@pytest.mark.parametrize("task_id", TASK_IDS)
+def test_reset_returns_observation(task_id):
+    """reset() returns a valid WarehouseObservation for each task."""
+    env = WarehouseFulfillmentEnv(task_id=task_id, seed=7)
+    obs = env.reset()
+    assert isinstance(obs, WarehouseObservation)
+    assert obs.task_id == task_id
+    assert obs.battery_level > 0
+    assert isinstance(obs.pending_order, list)
+    assert len(obs.pending_order) > 0
+@pytest.mark.parametrize("action", ALL_ACTIONS)
+def test_step_all_actions_no_crash(action):
+    """Every action string can be stepped without raising an exception."""
+    env = WarehouseFulfillmentEnv(task_id="easy_single_pick", seed=7)
+    env.reset()
+    obs, reward, done, info = env.step(action)
+    assert isinstance(obs, WarehouseObservation)
+    assert isinstance(reward, WarehouseReward)
+    assert isinstance(done, bool)
+    assert isinstance(info, dict)
+@pytest.mark.parametrize("task_id", TASK_IDS)
+def test_step_returns_correct_types(task_id):
+    """step() returns the four-tuple with correct types."""
+    env = WarehouseFulfillmentEnv(task_id=task_id, seed=7)
+    env.reset()
+    obs, reward, done, info = env.step("wait")
+    assert isinstance(obs, WarehouseObservation)
+    assert isinstance(reward, WarehouseReward)
+    assert isinstance(done, bool)
+    assert "completion_ratio" in info
+@pytest.mark.parametrize("task_id", TASK_IDS)
+def test_episode_terminates_at_max_steps(task_id):
+    """Running max_steps wait actions always terminates the episode."""
+    env = WarehouseFulfillmentEnv(task_id=task_id, seed=7)
+    env.reset()
+    max_steps = env.task.max_steps
+    done = False
+    for _ in range(max_steps + 5):
+        _, _, done, _ = env.step("wait")
+        if done:
+            break
+    assert done, f"Episode did not terminate after {max_steps} steps for {task_id}"
+@pytest.mark.parametrize("task_id", TASK_IDS)
+def test_score_in_range_after_episode(task_id):
+    """grade_episode() always returns a float in [0.0, 1.0]."""
+    env = WarehouseFulfillmentEnv(task_id=task_id, seed=7)
+    env.reset()
+    done = False
+    while not done:
+        _, _, done, _ = env.step("wait")
+    state = env.state()
+    score = grade_episode(state)
+    assert isinstance(score, float)
+    assert 0.0 <= score <= 1.0, f"Score {score} out of range for {task_id}"
+def test_step_after_done_is_safe():
+    """Stepping after episode is done returns done=True without raising."""
+    env = WarehouseFulfillmentEnv(task_id="easy_single_pick", seed=7)
+    env.reset()
+    for _ in range(env.task.max_steps):
+        env.step("wait")
+    _, _, done, _ = env.step("wait")
+    assert done
+def test_available_tasks_returns_all_three():
+    """available_tasks() returns exactly the three expected task IDs."""
+    tasks = available_tasks()
+    ids = {t["task_id"] for t in tasks}
+    assert ids == set(TASK_IDS)
+def test_state_method_returns_consistent_data():
+    """state() reflects the same step count as internal counter."""
+    env = WarehouseFulfillmentEnv(task_id="easy_single_pick", seed=7)
+    env.reset()
+    env.step("turn_left")
+    env.step("move_forward")
+    state = env.state()
+    assert state.step_count == 2
+    assert state.task_id == "easy_single_pick"

tests/test_graders.py ADDED Viewed

	@@ -0,0 +1,199 @@

+"""
+Unit tests for the deterministic graders.
+"""
+import pytest
+from grid_env.graders import grade_easy, grade_episode, grade_hard, grade_medium, _clamp
+from grid_env.models import WarehouseMetrics, WarehouseState, BinState, OrderLine
+def _make_state(
+    task_id: str = "easy_single_pick",
+    completion_ratio: float = 0.0,
+    step_count: int = 5,
+    max_steps: int = 40,
+    correct_scans: int = 0,
+    wrong_scans: int = 0,
+    correct_picks: int = 0,
+    wrong_picks: int = 0,
+    correct_packs: int = 0,
+    invalid_actions: int = 0,
+    recharges: int = 0,
+    battery_depletion_events: int = 0,
+) -> WarehouseState:
+    metrics = WarehouseMetrics(
+        correct_scans=correct_scans,
+        wrong_scans=wrong_scans,
+        correct_picks=correct_picks,
+        wrong_picks=wrong_picks,
+        correct_packs=correct_packs,
+        invalid_actions=invalid_actions,
+        recharges=recharges,
+        battery_depletion_events=battery_depletion_events,
+    )
+    return WarehouseState(
+        episode_id="test-ep",
+        task_id=task_id,
+        difficulty="easy",
+        step_count=step_count,
+        done=True,
+        success=completion_ratio >= 1.0,
+        max_steps=max_steps,
+        grid_size=(7, 7),
+        agent_position=(1, 1),
+        heading="E",
+        carrying=None,
+        battery_level=30,
+        battery_capacity=36,
+        dock_position=(1, 1),
+        pack_station_position=(5, 5),
+        charger_position=(1, 5),
+        bins=[BinState(bin_id="A1", position=(2, 1), sku="thermometer", quantity=2)],
+        order=[OrderLine(sku="thermometer", quantity=1)],
+        packed_order=[OrderLine(sku="thermometer", quantity=int(completion_ratio))],
+        scanned_bins=[],
+        metrics=metrics,
+        action_history=[],
+        total_reward=0.0,
+        completion_ratio=completion_ratio,
+        task_description="Test",
+    )
+# ── _clamp ────────────────────────────────────────────────────────────────────
+def test_clamp_above_one():
+    assert _clamp(1.5) == 1.0
+def test_clamp_below_zero():
+    assert _clamp(-0.3) == 0.0
+def test_clamp_within_range():
+    assert _clamp(0.75) == pytest.approx(0.75, abs=1e-4)
+# ── grade_easy ────────────────────────────────────────────────────────────────
+def test_grade_easy_zero_completion():
+    # completion_ratio=0, but efficiency_bonus still applies (1 - 5/40 = 0.875, i.e. 87.5% efficiency)
+    # score = 0 + 0.15 * 0.875 + 0.10 * 0 = 0.13125 (no zero_penalty because invalid_actions=0)
+    state = _make_state("easy_single_pick", completion_ratio=0.0)
+    score = grade_easy(state)
+    assert 0.0 <= score < 0.30, f"Expected low score at zero completion, got {score}"
+def test_grade_easy_full_completion_no_penalty():
+    state = _make_state(
+        "easy_single_pick",
+        completion_ratio=1.0,
+        step_count=10,
+        max_steps=40,
+        invalid_actions=0,
+        wrong_picks=0,
+    )
+    score = grade_easy(state)
+    assert score >= 0.75
+def test_grade_easy_wrong_picks_reduce_score():
+    base = _make_state("easy_single_pick", completion_ratio=1.0, wrong_picks=0)
+    penalised = _make_state("easy_single_pick", completion_ratio=1.0, wrong_picks=3)
+    assert grade_easy(penalised) < grade_easy(base)
+def test_grade_easy_clamped():
+    state = _make_state("easy_single_pick", completion_ratio=1.0, wrong_picks=0, invalid_actions=0)
+    assert 0.0 <= grade_easy(state) <= 1.0
+# ── grade_medium ──────────────────────────────────────────────────────────────
+def test_grade_medium_zero_completion():
+    # completion_ratio=0; efficiency_bonus and scan_ratio are non-zero when steps are low
+    state = _make_state("medium_multi_item", completion_ratio=0.0)
+    score = grade_medium(state)
+    assert 0.0 <= score < 0.30, f"Expected low score at zero completion, got {score}"
+def test_grade_medium_full_completion_with_scans():
+    state = _make_state(
+        "medium_multi_item",
+        completion_ratio=1.0,
+        correct_scans=2,
+        wrong_picks=0,
+        invalid_actions=0,
+        step_count=20,
+        max_steps=60,
+    )
+    score = grade_medium(state)
+    assert score >= 0.75
+def test_grade_medium_scans_improve_score():
+    no_scans = _make_state("medium_multi_item", completion_ratio=0.5, correct_scans=0)
+    with_scans = _make_state("medium_multi_item", completion_ratio=0.5, correct_scans=2)
+    assert grade_medium(with_scans) > grade_medium(no_scans)
+def test_grade_medium_clamped():
+    state = _make_state("medium_multi_item", completion_ratio=1.0, correct_scans=2)
+    assert 0.0 <= grade_medium(state) <= 1.0
+# ── grade_hard ────────────────────────────────────────────────────────────────
+def test_grade_hard_zero_completion():
+    # completion_ratio=0; efficiency_bonus and other bonuses contribute non-zero score
+    state = _make_state("hard_restock_priority", completion_ratio=0.0)
+    score = grade_hard(state)
+    assert 0.0 <= score < 0.30, f"Expected low score at zero completion, got {score}"
+def test_grade_hard_full_completion_with_recharge():
+    state = _make_state(
+        "hard_restock_priority",
+        completion_ratio=1.0,
+        correct_scans=3,
+        recharges=1,
+        battery_depletion_events=0,
+        wrong_picks=0,
+        invalid_actions=0,
+        step_count=30,
+        max_steps=85,
+    )
+    score = grade_hard(state)
+    assert score >= 0.75
+def test_grade_hard_battery_depletion_penalises():
+    safe = _make_state("hard_restock_priority", completion_ratio=1.0, battery_depletion_events=0)
+    depleted = _make_state("hard_restock_priority", completion_ratio=1.0, battery_depletion_events=1)
+    assert grade_hard(safe) > grade_hard(depleted)
+def test_grade_hard_clamped():
+    state = _make_state("hard_restock_priority", completion_ratio=1.0)
+    assert 0.0 <= grade_hard(state) <= 1.0
+# ── grade_episode dispatcher ──────────────────────────────────────────────────
+@pytest.mark.parametrize(
+    "task_id,grader",
+    [
+        ("easy_single_pick", grade_easy),
+        ("medium_multi_item", grade_medium),
+        ("hard_restock_priority", grade_hard),
+    ],
+)
+def test_grade_episode_dispatches_correctly(task_id, grader):
+    state = _make_state(task_id, completion_ratio=0.5)
+    assert grade_episode(state) == grader(state)
+def test_grade_episode_unknown_task_raises():
+    state = _make_state("nonexistent_task", completion_ratio=0.5)
+    with pytest.raises(KeyError):
+        grade_episode(state)

tests/test_openenv_spec.py ADDED Viewed

	@@ -0,0 +1,107 @@

+"""
+OpenEnv spec compliance tests — validates openv.yaml structure without
+requiring the openenv package to be installed.
+"""
+import importlib
+from pathlib import Path
+import pytest
+import yaml
+SPEC_PATH = Path(__file__).parent.parent / "grid_env" / "openv.yaml"
+REQUIRED_TOP_LEVEL = {"spec_version", "name", "entrypoint", "models", "methods", "tasks"}
+REQUIRED_MODELS = {"action", "observation", "state"}
+REQUIRED_TASK_FIELDS = {"id", "grader"}
+@pytest.fixture(scope="module")
+def spec():
+    assert SPEC_PATH.exists(), f"openv.yaml not found at {SPEC_PATH}"
+    with SPEC_PATH.open() as f:
+        return yaml.safe_load(f)
+def test_yaml_parses_successfully(spec):
+    assert spec is not None
+    assert isinstance(spec, dict)
+def test_required_top_level_fields_present(spec):
+    missing = REQUIRED_TOP_LEVEL - set(spec.keys())
+    assert not missing, f"Missing top-level fields: {missing}"
+def test_spec_version_is_string(spec):
+    assert isinstance(spec["spec_version"], str)
+    assert spec["spec_version"].strip() != ""
+def test_name_is_non_empty_string(spec):
+    assert isinstance(spec["name"], str)
+    assert spec["name"].strip() != ""
+def test_entrypoint_format(spec):
+    """Entrypoint should be 'module:ClassName' style."""
+    entrypoint = spec["entrypoint"]
+    assert isinstance(entrypoint, str)
+    assert ":" in entrypoint, "entrypoint must be 'module:ClassName'"
+def test_entrypoint_module_is_importable(spec):
+    module_path, _ = spec["entrypoint"].split(":", 1)
+    try:
+        importlib.import_module(module_path)
+    except ImportError as exc:
+        pytest.fail(f"Entrypoint module '{module_path}' is not importable: {exc}")
+def test_models_has_required_keys(spec):
+    models = spec.get("models", {})
+    missing = REQUIRED_MODELS - set(models.keys())
+    assert not missing, f"Missing model keys: {missing}"
+def test_methods_contains_reset_step_state(spec):
+    methods = set(spec.get("methods", []))
+    assert {"reset", "step", "state"}.issubset(methods), \
+        f"methods must include reset, step, state. Got: {methods}"
+def test_tasks_list_has_at_least_three_entries(spec):
+    tasks = spec.get("tasks", [])
+    assert len(tasks) >= 3, f"Expected ≥3 tasks, got {len(tasks)}"
+def test_each_task_has_required_fields(spec):
+    for task in spec.get("tasks", []):
+        missing = REQUIRED_TASK_FIELDS - set(task.keys())
+        assert not missing, f"Task {task} missing fields: {missing}"
+def test_task_ids_are_unique(spec):
+    ids = [t["id"] for t in spec.get("tasks", [])]
+    assert len(ids) == len(set(ids)), "Duplicate task IDs found in spec"
+def test_grader_references_are_importable(spec):
+    """Each grader in the spec should resolve to a callable."""
+    for task in spec.get("tasks", []):
+        grader_ref = task.get("grader", "")
+        assert ":" in grader_ref, f"Grader '{grader_ref}' is not 'module:fn' format"
+        mod_path, fn_name = grader_ref.split(":", 1)
+        try:
+            mod = importlib.import_module(mod_path)
+        except ImportError as exc:
+            pytest.fail(f"Cannot import grader module '{mod_path}': {exc}")
+        fn = getattr(mod, fn_name, None)
+        assert callable(fn), f"'{fn_name}' in '{mod_path}' is not callable"
+def test_baseline_section_present(spec):
+    assert "baseline" in spec
+    baseline = spec["baseline"]
+    assert "runner" in baseline and "seed" in baseline

tests/test_tasks.py ADDED Viewed

	@@ -0,0 +1,79 @@

+"""
+Tests for task definitions: presence of all 3 tasks, structural validity,
+and grader callability.
+"""
+import pytest
+from grid_env.tasks import TASKS, get_task, GRID_SIZE
+from grid_env.graders import grade_episode
+from grid_env.env import WarehouseFulfillmentEnv
+EXPECTED_TASK_IDS = {"easy_single_pick", "medium_multi_item", "hard_restock_priority"}
+def test_exactly_three_tasks_registered():
+    assert len(TASKS) == 3
+def test_all_expected_task_ids_present():
+    assert set(TASKS.keys()) == EXPECTED_TASK_IDS
+@pytest.mark.parametrize("task_id", list(EXPECTED_TASK_IDS))
+def test_task_has_required_fields(task_id):
+    task = get_task(task_id)
+    assert task.task_id == task_id
+    assert task.difficulty in {"easy", "medium", "hard"}
+    assert task.max_steps > 0
+    assert task.battery_capacity > 0
+    assert len(task.bins) > 0
+    assert len(task.order) > 0
+@pytest.mark.parametrize("task_id", list(EXPECTED_TASK_IDS))
+def test_task_required_scans_non_empty(task_id):
+    task = get_task(task_id)
+    assert len(task.required_scans) > 0, f"{task_id} has no required_scans"
+@pytest.mark.parametrize("task_id", list(EXPECTED_TASK_IDS))
+def test_task_order_skus_exist_in_bins(task_id):
+    """Every SKU in the order exists in at least one bin."""
+    task = get_task(task_id)
+    bin_skus = {b.sku for b in task.bins}
+    for line in task.order:
+        assert line.sku in bin_skus, f"{line.sku} ordered but not stocked in {task_id}"
+@pytest.mark.parametrize("task_id", list(EXPECTED_TASK_IDS))
+def test_required_scans_are_valid_bin_ids(task_id):
+    """required_scans reference bin IDs that actually exist."""
+    task = get_task(task_id)
+    bin_ids = {b.bin_id for b in task.bins}
+    for scan_id in task.required_scans:
+        assert scan_id in bin_ids, f"required scan {scan_id} not a valid bin in {task_id}"
+@pytest.mark.parametrize("task_id", list(EXPECTED_TASK_IDS))
+def test_grader_callable_returns_float_in_range(task_id):
+    """Run a wait-only episode until done and verify the grader returns a float in [0, 1]."""
+    env = WarehouseFulfillmentEnv(task_id=task_id, seed=7)
+    env.reset()
+    done = False
+    while not done:
+        _, _, done, _ = env.step("wait")
+    state = env.state()
+    score = grade_episode(state)
+    assert isinstance(score, float)
+    assert 0.0 <= score <= 1.0
+def test_get_task_raises_on_unknown_id():
+    with pytest.raises(KeyError, match="Unknown task_id"):
+        get_task("does_not_exist")
+def test_grid_size_is_positive_tuple():
+    assert len(GRID_SIZE) == 2
+    assert GRID_SIZE[0] > 0 and GRID_SIZE[1] > 0