Spaces:
Runtime error
Runtime error
Initial submission
Browse files- Dockerfile +16 -0
- README.md +143 -32
- baseline/__init__.py +1 -0
- baseline/policy.py +100 -0
- config/openenv.yaml +48 -156
- docker/Dockerfile +16 -0
- grader/__init__.py +1 -0
- grader/grader.py +57 -220
- inference.py +82 -0
- inference/inference.py +9 -171
- inference/policy.py +100 -0
- openenv.yaml +48 -156
- pyproject.toml +23 -35
- requirements.txt +5 -2
- scenario/__init__.py +1 -0
- scenario/scenario.py +363 -189
- server.py +5 -0
- server/__init__.py +1 -0
- server/app.py +152 -31
- server/static/app.js +222 -0
- server/static/index.html +149 -0
- server/static/styles.css +499 -0
- tests/test_env.py +72 -0
- uv.toml +3 -0
Dockerfile
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 6 |
+
PYTHONUNBUFFERED=1 \
|
| 7 |
+
PORT=7860
|
| 8 |
+
|
| 9 |
+
COPY requirements.txt ./
|
| 10 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 11 |
+
|
| 12 |
+
COPY . .
|
| 13 |
+
|
| 14 |
+
EXPOSE 7860
|
| 15 |
+
|
| 16 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,32 +1,143 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: RecallTrace OpenEnv
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: RecallTrace OpenEnv
|
| 3 |
+
emoji: 🚨
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# 🚀 RecallTrace OpenEnv
|
| 11 |
+
|
| 12 |
+
RecallTrace is a **real-world AI environment** designed for **product recall tracing and precision containment**.
|
| 13 |
+
|
| 14 |
+
It simulates how companies handle:
|
| 15 |
+
- contaminated product recalls
|
| 16 |
+
- supply chain tracing
|
| 17 |
+
- selective quarantine decisions
|
| 18 |
+
|
| 19 |
+
This environment evaluates **agent reasoning + decision-making**, not just correctness.
|
| 20 |
+
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
# 🧠 What This Environment Does
|
| 24 |
+
|
| 25 |
+
Given a recall notice (e.g., *"LotA is contaminated"*), the agent must:
|
| 26 |
+
|
| 27 |
+
1. Trace where the product went
|
| 28 |
+
2. Identify affected nodes (warehouses, stores)
|
| 29 |
+
3. Handle relabeling / transformations
|
| 30 |
+
4. Quarantine **only unsafe inventory**
|
| 31 |
+
5. Avoid blocking safe stock
|
| 32 |
+
6. Notify affected entities
|
| 33 |
+
7. Finalize with correct containment
|
| 34 |
+
|
| 35 |
+
---
|
| 36 |
+
|
| 37 |
+
# 🎯 Why This Is Important
|
| 38 |
+
|
| 39 |
+
This is a **real industry problem** seen in:
|
| 40 |
+
- food recalls
|
| 41 |
+
- pharma defects
|
| 42 |
+
- logistics failures
|
| 43 |
+
|
| 44 |
+
Challenges include:
|
| 45 |
+
- Graph traversal
|
| 46 |
+
- Partial observability
|
| 47 |
+
- Lot transformations
|
| 48 |
+
- Mixed inventory reasoning
|
| 49 |
+
- Precision decision-making
|
| 50 |
+
|
| 51 |
+
---
|
| 52 |
+
|
| 53 |
+
# 🧩 Tasks (Scenarios)
|
| 54 |
+
|
| 55 |
+
## 🔹 Easy — Direct Recall
|
| 56 |
+
- Single contaminated lot
|
| 57 |
+
- Straight supply chain
|
| 58 |
+
- Goal: trace and quarantine correctly
|
| 59 |
+
|
| 60 |
+
---
|
| 61 |
+
|
| 62 |
+
## 🔹 Medium — Relabeled Inventory
|
| 63 |
+
- Lot gets renamed (LotA → LotA1)
|
| 64 |
+
- Goal: track transformations and quarantine
|
| 65 |
+
|
| 66 |
+
---
|
| 67 |
+
|
| 68 |
+
## 🔹 Hard — Mixed Inventory
|
| 69 |
+
- Contaminated + safe stock mixed
|
| 70 |
+
- Goal: isolate unsafe quantity **without over-blocking**
|
| 71 |
+
|
| 72 |
+
---
|
| 73 |
+
|
| 74 |
+
# ⚙️ Action Space
|
| 75 |
+
|
| 76 |
+
| Action | Description |
|
| 77 |
+
|------|------------|
|
| 78 |
+
| inspect_node | View inventory at a node |
|
| 79 |
+
| trace_lot | Follow product lineage |
|
| 80 |
+
| quarantine | Block unsafe stock |
|
| 81 |
+
| notify | Inform affected nodes |
|
| 82 |
+
| finalize | End task |
|
| 83 |
+
|
| 84 |
+
---
|
| 85 |
+
|
| 86 |
+
# 📦 Observation Structure
|
| 87 |
+
|
| 88 |
+
Each step returns:
|
| 89 |
+
|
| 90 |
+
- recall_notice
|
| 91 |
+
- inventory
|
| 92 |
+
- action history
|
| 93 |
+
- trace results
|
| 94 |
+
- inspection data
|
| 95 |
+
|
| 96 |
+
---
|
| 97 |
+
|
| 98 |
+
# 🏆 Reward & Grading
|
| 99 |
+
|
| 100 |
+
### Reward System
|
| 101 |
+
- + Correct tracing
|
| 102 |
+
- + Correct quarantine
|
| 103 |
+
- + Correct notification
|
| 104 |
+
- − Wrong node
|
| 105 |
+
- − Over-quarantine
|
| 106 |
+
- − Missed unsafe stock
|
| 107 |
+
|
| 108 |
+
---
|
| 109 |
+
|
| 110 |
+
### Final Score
|
| 111 |
+
Range: **0.0 → 1.0**
|
| 112 |
+
|
| 113 |
+
Based on:
|
| 114 |
+
- accuracy
|
| 115 |
+
- precision
|
| 116 |
+
- efficiency
|
| 117 |
+
|
| 118 |
+
---
|
| 119 |
+
|
| 120 |
+
# 🧱 Project Structure
|
| 121 |
+
|
| 122 |
+
```bash
|
| 123 |
+
recalltrace-openenv/
|
| 124 |
+
│
|
| 125 |
+
├── env/ # Environment logic
|
| 126 |
+
│ ├── env.py
|
| 127 |
+
│ └── __init__.py
|
| 128 |
+
│
|
| 129 |
+
├── scenario/ # Scenario generation
|
| 130 |
+
│ └── scenario.py
|
| 131 |
+
│
|
| 132 |
+
├── grader/ # Evaluation + reward
|
| 133 |
+
│ └── grader.py
|
| 134 |
+
│
|
| 135 |
+
├── inference/ # Agent simulation
|
| 136 |
+
│ └── inference.py
|
| 137 |
+
│
|
| 138 |
+
├── config/
|
| 139 |
+
│ └── openenv.yaml
|
| 140 |
+
│
|
| 141 |
+
├── Dockerfile
|
| 142 |
+
├── requirements.txt
|
| 143 |
+
└── README.md
```
|
baseline/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Baseline agent helpers for RecallTrace."""
|
baseline/policy.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Heuristic baseline policy for RecallTrace."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import re
|
| 7 |
+
from typing import Any, Dict, Optional
|
| 8 |
+
|
| 9 |
+
from openai import OpenAI
|
| 10 |
+
|
| 11 |
+
from env.models import RecallAction, RecallObservation
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
LOT_PATTERN = re.compile(r"\bLot[A-Za-z0-9_]+\b")
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _extract_root_lot(observation: RecallObservation) -> str:
    """Return the first lot identifier found in the recall notice.

    The notice text is scanned with the module-level ``LOT_PATTERN``. When the
    notice names no lot, the literal ``"LotA"`` is returned as a fallback.
    """
    found = LOT_PATTERN.search(observation.recall_notice)
    if found is None:
        return "LotA"
    return found.group(0)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def choose_heuristic_action(observation: RecallObservation) -> RecallAction:
    """Pick the baseline's next action from the observable state only.

    The decision ladder is evaluated top to bottom and the first rule that
    applies wins:

    1. No trace recorded for the root lot -> ``trace_lot``.
    2. An affected node has not been inspected -> ``inspect_node``.
    3. An inspected lot still has unsafe stock available -> ``quarantine``.
    4. Any affected node is not yet notified -> ``notify`` (node_id ``"all"``).
    5. Otherwise -> ``finalize``.
    """
    root_lot = _extract_root_lot(observation)
    lineage = observation.trace_results.get(root_lot)
    if lineage is None:
        return RecallAction(type="trace_lot", lot_id=root_lot, rationale="Map the recall lineage first.")

    affected_nodes = lineage.get("affected_nodes", [])

    # Inspect affected nodes in the order the trace reported them.
    uninspected = [node for node in affected_nodes if node not in observation.inspected_nodes]
    if uninspected:
        return RecallAction(
            type="inspect_node",
            node_id=uninspected[0],
            rationale="Collect local evidence before quarantining.",
        )

    for node_id, findings in observation.inspection_results.items():
        for lot_id, finding in findings.items():
            already_isolated = observation.quarantined_inventory.get(node_id, {}).get(lot_id, 0)
            on_hand = observation.inventory.get(node_id, {}).get(lot_id, 0)
            outstanding = finding.unsafe_quantity - already_isolated
            if outstanding > 0 and on_hand > 0:
                # Never request more than is physically available at the node.
                amount = min(outstanding, on_hand)
                return RecallAction(
                    type="quarantine",
                    node_id=node_id,
                    lot_id=lot_id,
                    quantity=amount,
                    rationale="Isolate the exact unsafe quantity discovered during inspection.",
                )

    if any(node not in observation.notified_nodes for node in affected_nodes):
        return RecallAction(
            type="notify",
            node_id="all",
            rationale="Alert every impacted stakeholder before closing the incident.",
        )

    return RecallAction(type="finalize", rationale="Containment actions are complete.")
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def choose_llm_action(
    client: Optional[OpenAI],
    model_name: str,
    observation: RecallObservation,
    history: list[dict[str, Any]],
) -> Optional[RecallAction]:
    """Ask an LLM for the next action, returning None on failure.

    Args:
        client: Chat-completions client, or None to skip the LLM entirely.
        model_name: Model identifier passed to the completion request.
        observation: Current environment observation; its fields are
            serialised into the JSON prompt.
        history: Prior step records; only the last six entries are sent.

    Returns:
        The action parsed from the model reply, or None when there is no
        client, the reply contains no JSON object, or the request or the
        validation raises.
    """
    if client is None:
        return None

    prompt = {
        "task_id": observation.task_id,
        "phase": observation.phase,
        "notice": observation.recall_notice,
        "inventory": observation.inventory,
        "inspection_results": {
            node_id: {lot_id: evidence.model_dump() for lot_id, evidence in findings.items()}
            for node_id, findings in observation.inspection_results.items()
        },
        "trace_results": observation.trace_results,
        "notified_nodes": observation.notified_nodes,
        "quarantined_inventory": observation.quarantined_inventory,
        "steps_taken": observation.steps_taken,
        "remaining_step_budget": observation.remaining_step_budget,
        "history": history[-6:],
        "instruction": "Return only compact JSON with keys type,node_id,lot_id,quantity,rationale. Use one valid action.",
    }

    try:
        completion = client.chat.completions.create(
            model=model_name,
            temperature=0,
            max_tokens=180,
            messages=[
                {"role": "system", "content": "You are operating a deterministic product recall environment. Respond with only valid JSON for the next action."},
                {"role": "user", "content": json.dumps(prompt, sort_keys=True)},
            ],
        )
        text = (completion.choices[0].message.content or "").strip()
        # Models often wrap the payload in a Markdown code fence or add a short
        # preamble despite the instruction. Keep only the outermost JSON object
        # so such replies are still usable; a bare JSON object is unaffected.
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end < start:
            return None
        return RecallAction.model_validate_json(text[start : end + 1])
    except Exception:
        # Deliberate best-effort boundary: any transport, API or validation
        # error means "no LLM action", so the caller can fall back.
        return None
|
config/openenv.yaml
CHANGED
|
@@ -1,156 +1,48 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
- name: finalize
|
| 50 |
-
params: {}
|
| 51 |
-
description: >
|
| 52 |
-
Submit the containment plan. Triggers final scoring. Episode ends.
|
| 53 |
-
reward_hint: returns final_score in [0.0, 1.0]
|
| 54 |
-
|
| 55 |
-
# ── Observation Space ─────────────────────────────────────────────────────────
|
| 56 |
-
observation:
|
| 57 |
-
recall_notice:
|
| 58 |
-
type: str
|
| 59 |
-
description: Human-readable contamination alert issued at episode start
|
| 60 |
-
|
| 61 |
-
inventory:
|
| 62 |
-
type: dict
|
| 63 |
-
description: >
|
| 64 |
-
Full inventory snapshot across all nodes.
|
| 65 |
-
{ node_id: { lot_id: quantity } }
|
| 66 |
-
|
| 67 |
-
discovered_shipments:
|
| 68 |
-
type: dict
|
| 69 |
-
description: >
|
| 70 |
-
Outbound shipment edges revealed so far (only for inspected nodes).
|
| 71 |
-
{ node_id: [downstream_node_id, ...] }
|
| 72 |
-
|
| 73 |
-
history:
|
| 74 |
-
type: list[str]
|
| 75 |
-
description: Ordered log of all actions taken this episode
|
| 76 |
-
|
| 77 |
-
inspected_nodes:
|
| 78 |
-
type: list[str]
|
| 79 |
-
description: Sorted list of nodes that have been inspected
|
| 80 |
-
|
| 81 |
-
notified_nodes:
|
| 82 |
-
type: list[str]
|
| 83 |
-
description: Sorted list of nodes that have been sent recall alerts
|
| 84 |
-
|
| 85 |
-
quarantined_inventory:
|
| 86 |
-
type: dict
|
| 87 |
-
description: >
|
| 88 |
-
Inventory currently in quarantine (non-empty nodes only).
|
| 89 |
-
{ node_id: { lot_id: quantity } }
|
| 90 |
-
|
| 91 |
-
# ── Tasks ─────────────────────────────────────────────────────────────────────
|
| 92 |
-
tasks:
|
| 93 |
-
- id: easy
|
| 94 |
-
name: "Task 1 — Direct Recall"
|
| 95 |
-
assign: "Shreya B J"
|
| 96 |
-
description: >
|
| 97 |
-
Single contaminated lot (LotA) distributed across a linear
|
| 98 |
-
warehouse → store1 → store2 chain. No relabeling.
|
| 99 |
-
nodes: [warehouse, store1, store2]
|
| 100 |
-
contaminated_lots: [LotA]
|
| 101 |
-
|
| 102 |
-
- id: medium
|
| 103 |
-
name: "Task 2 — Relabeled Inventory"
|
| 104 |
-
assign: "Shreya B J"
|
| 105 |
-
description: >
|
| 106 |
-
LotA is contaminated; it was repacked and relabeled as LotA1
|
| 107 |
-
at the distribution centre. Agent must trace the transformation.
|
| 108 |
-
nodes: [warehouse, dist_centre, store_north, store_south]
|
| 109 |
-
contaminated_lots: [LotA, LotA1]
|
| 110 |
-
|
| 111 |
-
- id: hard
|
| 112 |
-
name: "Task 3 — Mixed Shipments"
|
| 113 |
-
assign: "Shreya B J"
|
| 114 |
-
description: >
|
| 115 |
-
Two contaminated lots (LotX, LotY) co-shipped with safe stock
|
| 116 |
-
(LotB, LotC) across a hub-and-spoke network. Precise quarantine required.
|
| 117 |
-
nodes: [plant_a, plant_b, hub, retail_east, retail_west, retail_central]
|
| 118 |
-
contaminated_lots: [LotX, LotY]
|
| 119 |
-
|
| 120 |
-
# ── Scoring ───────────────────────────────────────────────────────────────────
|
| 121 |
-
scoring:
|
| 122 |
-
range: [0.0, 1.0]
|
| 123 |
-
formula: "(quarantine_score + notification_score) / 2 − unnecessary_penalty"
|
| 124 |
-
components:
|
| 125 |
-
quarantine_score:
|
| 126 |
-
weight: 0.5
|
| 127 |
-
description: >
|
| 128 |
-
1 − ((missing_qty + over_qty) / total_affected_qty).
|
| 129 |
-
Full marks for exact quarantine of all affected lots.
|
| 130 |
-
notification_score:
|
| 131 |
-
weight: 0.5
|
| 132 |
-
description: >
|
| 133 |
-
fraction of affected nodes that were notified.
|
| 134 |
-
unnecessary_penalty:
|
| 135 |
-
max: 0.15
|
| 136 |
-
description: >
|
| 137 |
-
−0.05 per unnecessary quarantine (safe stock), capped at 0.15.
|
| 138 |
-
|
| 139 |
-
# ── OpenEnv Compliance ────────────────────────────────────────────────────────
|
| 140 |
-
compliance:
|
| 141 |
-
implements_reset: true
|
| 142 |
-
implements_step: true
|
| 143 |
-
implements_state: true
|
| 144 |
-
deterministic: true
|
| 145 |
-
typed_models: true
|
| 146 |
-
offline: true
|
| 147 |
-
reproducible: true
|
| 148 |
-
|
| 149 |
-
# ── Project Team ──────────────────────────────────────────────────────────────
|
| 150 |
-
team:
|
| 151 |
-
- name: "Shamanth MS"
|
| 152 |
-
tasks: [env_core, action_handler, ground_truth_system, connect_components, submission]
|
| 153 |
-
- name: "P G Ayush Rai"
|
| 154 |
-
tasks: [openenv_spec, docker_setup, openenv_validation, deploy_hf_spaces]
|
| 155 |
-
- name: "Shreya B J"
|
| 156 |
-
tasks: [scenario_expansion, grader_system, reward_function]
|
|
|
|
| 1 |
+
name: RecallTraceEnv
|
| 2 |
+
version: 1.0.0
|
| 3 |
+
description: Deterministic OpenEnv environment for supply-chain product recall tracing and precision containment.
|
| 4 |
+
entrypoint:
|
| 5 |
+
module: env.env
|
| 6 |
+
class: RecallTraceEnv
|
| 7 |
+
server:
|
| 8 |
+
module: server
|
| 9 |
+
app: app
|
| 10 |
+
models:
|
| 11 |
+
action: env.models.RecallAction
|
| 12 |
+
observation: env.models.RecallObservation
|
| 13 |
+
reward: env.models.RewardSignal
|
| 14 |
+
tasks:
|
| 15 |
+
- id: phase1_direct_recall
|
| 16 |
+
difficulty: easy
|
| 17 |
+
objective: Identify every location holding the recalled lot and quarantine all contaminated stock.
|
| 18 |
+
- id: phase2_relabel_recall
|
| 19 |
+
difficulty: medium
|
| 20 |
+
objective: Follow relabeled lots back to the source batch and quarantine every derived label precisely.
|
| 21 |
+
- id: phase3_mixed_shipments
|
| 22 |
+
difficulty: hard
|
| 23 |
+
objective: Contain only the unsafe quantity after contaminated stock was mixed with safe inventory during cross-docking.
|
| 24 |
+
interfaces:
|
| 25 |
+
methods:
|
| 26 |
+
- reset
|
| 27 |
+
- step
|
| 28 |
+
- state
|
| 29 |
+
actions:
|
| 30 |
+
- inspect_node
|
| 31 |
+
- trace_lot
|
| 32 |
+
- quarantine
|
| 33 |
+
- notify
|
| 34 |
+
- finalize
|
| 35 |
+
observation_fields:
|
| 36 |
+
- task_id
|
| 37 |
+
- phase
|
| 38 |
+
- recall_notice
|
| 39 |
+
- inventory
|
| 40 |
+
- discovered_shipments
|
| 41 |
+
- inspected_nodes
|
| 42 |
+
- inspection_results
|
| 43 |
+
- trace_results
|
| 44 |
+
- notified_nodes
|
| 45 |
+
- quarantined_inventory
|
| 46 |
+
- history
|
| 47 |
+
- steps_taken
|
| 48 |
+
- remaining_step_budget
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docker/Dockerfile
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 6 |
+
PYTHONUNBUFFERED=1 \
|
| 7 |
+
PORT=7860
|
| 8 |
+
|
| 9 |
+
COPY requirements.txt ./
|
| 10 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 11 |
+
|
| 12 |
+
COPY . .
|
| 13 |
+
|
| 14 |
+
EXPOSE 7860
|
| 15 |
+
|
| 16 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
grader/__init__.py
CHANGED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Grader package for RecallTrace."""
|
grader/grader.py
CHANGED
|
@@ -1,220 +1,57 @@
|
|
| 1 |
-
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
if total > 0:
|
| 59 |
-
if node_id not in affected_nodes:
|
| 60 |
-
affected_nodes.append(node_id)
|
| 61 |
-
correct_quantities.setdefault(node_id, {})[lot_id] = total
|
| 62 |
-
|
| 63 |
-
total_affected_qty = sum(
|
| 64 |
-
q for node_q in correct_quantities.values() for q in node_q.values()
|
| 65 |
-
)
|
| 66 |
-
|
| 67 |
-
return {
|
| 68 |
-
"contaminated_lots": sorted(contaminated_lots),
|
| 69 |
-
"affected_lots": sorted(contaminated_lots), # alias used by Sham's env
|
| 70 |
-
"affected_nodes": sorted(affected_nodes),
|
| 71 |
-
"correct_quantities": correct_quantities,
|
| 72 |
-
"total_affected_quantity": total_affected_qty,
|
| 73 |
-
}
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 77 |
-
# Batch Grader (Shreya's style — for standalone agent_output dicts)
|
| 78 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 79 |
-
|
| 80 |
-
def grade(agent_output: Dict[str, Any], scenario: Dict[str, Any]) -> float:
    """
    Score an agent_output dict against ground truth.
    agent_output = {"quarantine": [{"node": ..., "lot": ..., "qty": ...}, ...]}
    Returns score in [0.0, 1.0].

    Every (node, lot) pair in the ground truth is one target worth 1.0:
    0.5 for naming the pair and a further 0.5 for the exact quantity. Each
    target is credited at most once (best matching action wins), so repeating
    an action cannot compensate for targets that were never quarantined.
    """
    ground_truth = compute_ground_truth(scenario)
    correct_quantities = ground_truth["correct_quantities"]
    # Count (node, lot) pairs, not nodes: a node may hold several affected lots.
    total_targets = sum(len(lots) for lots in correct_quantities.values())
    if total_targets == 0:
        return 1.0

    best_credit: Dict[Tuple[str, str], float] = {}
    for action in agent_output.get("quarantine", []):
        # Both the short and the long key spellings are accepted.
        node = action.get("node") or action.get("node_id")
        lot = action.get("lot") or action.get("lot_id")
        qty = action.get("qty") or action.get("quantity", 0)

        expected = correct_quantities.get(node, {})
        if lot not in expected:
            continue

        credit = 1.0 if qty == expected[lot] else 0.5
        target = (node, lot)
        best_credit[target] = max(best_credit.get(target, 0.0), credit)

    return round(min(sum(best_credit.values()) / total_targets, 1.0), 4)
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
def compute_reward(agent_output: Dict[str, Any], scenario: Dict[str, Any]) -> float:
    """
    Compute a shaped, un-normalised reward for a batch agent_output.

    Per quarantine action: +20 for the right lot at the right node with the
    exact quantity, +7 when the quantity is off (+10 match, -3 penalty), and
    -5 for any other action. A further +20 is added when grade() reports a
    perfect score. The result can exceed 1.0.
    """
    truth = compute_ground_truth(scenario)
    targets = truth["correct_quantities"]
    unsafe_lots = set(truth["contaminated_lots"])

    total = 0.0
    for entry in agent_output.get("quarantine", []):
        node = entry.get("node") or entry.get("node_id")
        lot = entry.get("lot") or entry.get("lot_id")
        qty = entry.get("qty") or entry.get("quantity", 0)

        if node in targets and lot in targets[node]:
            # correct lot at correct node, then exact-quantity bonus or penalty
            total += 10
            total += 10 if qty == targets[node][lot] else -3
        elif lot not in unsafe_lots:
            total -= 5  # unnecessary quarantine
        else:
            total -= 5  # wrong node

    # Completion bonus
    perfect = grade(agent_output, scenario) >= 1.0
    if perfect:
        total += 20

    return total
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 140 |
-
# Live Grader (integrated with Sham's env — called from env._handle_finalize)
|
| 141 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 142 |
-
|
| 143 |
-
class LiveGrader:
    """
    Final-score calculator driven by the environment's live node state.

    The ground truth is derived once from the scenario at construction time
    and reused by every call to score().
    """

    def __init__(self, scenario: Dict[str, Any]):
        self.ground_truth = compute_ground_truth(scenario)

    def score(
        self,
        nodes: Dict[str, Dict],
        notified_nodes: set,
    ) -> Tuple[float, Dict[str, Any]]:
        """
        Compare quarantined stock and notifications with the ground truth.

        nodes maps node_id to a dict whose "quarantined_inventory" entry is
        {lot_id: quantity}; notified_nodes is the set of alerted node ids.
        Returns (score_0_to_1, breakdown_dict).
        """
        truth = self.ground_truth
        targets = truth["correct_quantities"]
        affected = set(truth["affected_nodes"])
        unsafe_lots = set(truth["contaminated_lots"])

        # ── Quarantine match ────────────────────────────────────────────
        shortfall: Dict[str, Dict[str, int]] = {}
        excess: Dict[str, Dict[str, int]] = {}
        safe_stock_hits: int = 0

        for node_id, node_data in nodes.items():
            held = node_data.get("quarantined_inventory", {})
            wanted = targets.get(node_id, {})

            for lot_id in set(wanted) | set(held):
                want = wanted.get(lot_id, 0)
                have = held.get(lot_id, 0)

                if have > 0 and lot_id not in unsafe_lots:
                    # quarantined safe stock
                    safe_stock_hits += 1
                    continue
                if have < want:
                    shortfall.setdefault(node_id, {})[lot_id] = want - have
                elif have > want:
                    excess.setdefault(node_id, {})[lot_id] = have - want

        affected_total = truth["total_affected_quantity"]
        shortfall_total = sum(qty for lots in shortfall.values() for qty in lots.values())
        excess_total = sum(qty for lots in excess.values() for qty in lots.values())

        if affected_total:
            error_ratio = (shortfall_total + excess_total) / affected_total
            quarantine_score = max(0.0, 1.0 - error_ratio)
        else:
            quarantine_score = 1.0

        # ── Notification score ──────────────────────────────────────────
        notified_ok = len(notified_nodes & affected)
        if affected:
            notification_score = notified_ok / len(affected)
        else:
            notification_score = 1.0

        # ── Penalty for unnecessary quarantines ─────────────────────────
        penalty = min(safe_stock_hits * 0.05, 0.15)

        # ── Final score ─────────────────────────────────────────────────
        blended = (quarantine_score + notification_score) / 2.0 - penalty
        final_score = round(max(0.0, min(1.0, blended)), 4)

        breakdown = {
            "final_score": final_score,
            "quarantine_score": round(quarantine_score, 4),
            "notification_score": round(notification_score, 4),
            "unnecessary_penalty": round(-penalty, 4),
            "missing_quantities": shortfall,
            "over_quarantined": excess,
            "unnecessary_quarantines": safe_stock_hits,
            "correctly_notified": notified_ok,
            "all_affected_notified": notification_score == 1.0,
            "all_stock_quarantined": shortfall_total == 0 and excess_total == 0,
        }
        return final_score, breakdown
|
|
|
|
| 1 |
+
"""Deterministic graders for RecallTrace tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Iterable, List
|
| 6 |
+
|
| 7 |
+
from env.env import RecallTraceEnv
|
| 8 |
+
from env.models import RecallAction, TaskGrade
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def evaluate_action_plan(task_id: str, actions: Iterable[RecallAction | dict]) -> TaskGrade:
    """Run an action plan against a task and return a deterministic score.

    A fresh ``RecallTraceEnv`` is created and reset for ``task_id``. Actions
    are stepped in order until the environment reports ``done``. If the plan
    is exhausted while the environment is still open, a ``finalize`` action
    is submitted so the episode is always scored.

    Args:
        task_id: Identifier of the task to instantiate.
        actions: Plan to execute; each item is passed to ``env.step`` as is.

    Returns:
        A ``TaskGrade`` whose ``score`` is read from the final step's info
        under ``"score"`` (0.0 when absent) and whose ``success`` flag is set
        when that score is at least 0.9.

    Raises:
        RuntimeError: If the forced ``finalize`` step does not end the episode.
    """
    env = RecallTraceEnv(task_id=task_id)
    env.reset()

    rewards: List[float] = []
    final_info = {"message": "Episode never finalized."}

    for action in actions:
        _, reward, done, info = env.step(action)
        rewards.append(reward)
        final_info = info
        if done:
            break

    if not env.done:
        _, reward, done, info = env.step(RecallAction(type="finalize"))
        rewards.append(reward)
        final_info = info
        # Explicit check instead of ``assert``: assertions are stripped under
        # ``python -O`` and this invariant guards the score that is reported.
        if not done:
            raise RuntimeError("finalize action did not terminate the episode")

    score = float(final_info.get("score", 0.0))
    state = env.state()
    return TaskGrade(
        task_id=task_id,
        score=score,
        success=score >= 0.9,
        steps_taken=state.steps_taken,
        max_steps=state.task.max_steps,
        reward_total=round(sum(rewards), 4),
        final_info=final_info,
    )
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def grade_finalize_info(task_id: str, steps_taken: int, final_info: dict) -> TaskGrade:
    """Wrap the info payload of an already-finalized episode in a TaskGrade.

    The score is read from ``final_info["score"]`` (0.0 when absent) and is
    also used as ``reward_total``. ``success`` is set when the score is at
    least 0.9.
    """
    # A throwaway, freshly reset environment is built only to look up the
    # task's step budget.
    reference_env = RecallTraceEnv(task_id=task_id)
    reference_env.reset()

    final_score = float(final_info.get("score", 0.0))
    step_budget = reference_env.task.max_steps

    return TaskGrade(
        task_id=task_id,
        score=final_score,
        success=final_score >= 0.9,
        steps_taken=steps_taken,
        max_steps=step_budget,
        reward_total=final_score,
        final_info=final_info,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
inference.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Submission-grade baseline inference runner for RecallTrace."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
from typing import Any, List
|
| 8 |
+
|
| 9 |
+
from openai import OpenAI
|
| 10 |
+
|
| 11 |
+
from env.env import RecallTraceEnv
|
| 12 |
+
from env.models import RecallAction
|
| 13 |
+
from grader.grader import grade_finalize_info
|
| 14 |
+
from baseline.policy import choose_heuristic_action, choose_llm_action
|
| 15 |
+
|
| 16 |
+
# Runtime configuration, read once at import time from the process environment.
# API_KEY prefers OPENAI_API_KEY and falls back to HF_TOKEN; it is an empty
# string when neither variable is set.
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
API_KEY = os.environ.get("OPENAI_API_KEY") or os.environ.get("HF_TOKEN", "")
# Benchmark label printed in the [START] line and in the final JSON summary.
BENCHMARK = "RecallTrace"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` marker line for one episode.

    Args:
        task: Identifier of the task being run.
        env: Name of the benchmark/environment.
        model: Name of the model (or baseline label) driving the episode.
    """
    banner = f"[START] task={task} env={env} model={model}"
    print(banner, flush=True)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def log_step(step: int, action: RecallAction, reward: float, done: bool, error: str | None) -> None:
    """Print the ``[STEP]`` line describing a single environment step.

    Args:
        step: 1-based step counter.
        action: Action that was executed; serialized via ``model_dump``
            with ``None`` fields omitted and keys sorted.
        reward: Reward returned for the step, printed with four decimals.
        done: Whether the episode ended on this step (printed lowercase).
        error: Error message for the step, or ``None`` (printed as ``null``).
    """
    action_json = json.dumps(action.model_dump(exclude_none=True), sort_keys=True)
    if error is None:
        error_field = "null"
    else:
        error_field = error
    done_field = str(done).lower()
    print(
        f"[STEP] step={step} action={action_json} reward={reward:.4f} done={done_field} error={error_field}",
        flush=True,
    )
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the ``[END]`` summary line for one episode.

    Args:
        success: Whether the episode was graded as successful (printed lowercase).
        steps: Number of steps taken.
        score: Final score, printed with four decimals.
        rewards: Per-step rewards; each is rounded to four decimals and the
            list is emitted as JSON.
    """
    rounded_rewards = json.dumps([round(value, 4) for value in rewards])
    success_field = str(success).lower()
    print(
        f"[END] success={success_field} steps={steps} score={score:.4f} rewards={rounded_rewards}",
        flush=True,
    )
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def run_task(task_id: str, client: OpenAI | None) -> float:
    """Run one episode of ``task_id`` and return its graded score.

    On every step the LLM policy is consulted first; when it yields nothing
    the heuristic policy picks the action instead. The loop stops when the
    environment reports ``done`` or the task's ``max_steps`` budget is used up.

    Args:
        task_id: Identifier of the task to instantiate the environment with.
        client: OpenAI client used by the LLM policy, or ``None`` to run the
            heuristic baseline only.

    Returns:
        The ``score`` attribute of the grade computed from the last step's info.
    """
    env = RecallTraceEnv(task_id=task_id)
    observation = env.reset()

    history: List[dict[str, Any]] = []
    rewards: List[float] = []
    steps_taken = 0
    # Default used for grading if the loop body never runs.
    final_info: dict[str, Any] = {"score": 0.0}

    model_label = MODEL_NAME if client else "heuristic-baseline"
    log_start(task=task_id, env=BENCHMARK, model=model_label)

    for step in range(1, env.task.max_steps + 1):
        action = choose_llm_action(client, MODEL_NAME, observation, history)
        if not action:
            # No usable LLM action: fall back to the deterministic heuristic.
            action = choose_heuristic_action(observation)

        observation, reward, done, info = env.step(action)
        rewards.append(reward)
        steps_taken = step
        final_info = info
        log_step(step=step, action=action, reward=reward, done=done, error=info.get("error"))

        # Compact record of the step, passed back to the LLM policy as context.
        entry = {
            "step": step,
            "action": action.model_dump(exclude_none=True),
            "reward": reward,
            "done": done,
            "message": info.get("message"),
        }
        history.append(entry)
        if done:
            break

    grade = grade_finalize_info(task_id, steps_taken, final_info)
    log_end(success=grade.success, steps=steps_taken, score=grade.score, rewards=rewards)
    return grade.score
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def main() -> None:
    """Run every available task and print a JSON summary of the scores.

    An OpenAI client is created only when ``API_KEY`` is non-empty; otherwise
    ``None`` is passed to ``run_task``. The summary line contains the
    benchmark name, the average score rounded to four decimals, and the
    per-task scores.
    """
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY) if API_KEY else None
    task_scores = [run_task(task.task_id, client) for task in RecallTraceEnv.available_tasks()]
    # An empty task catalog would otherwise raise ZeroDivisionError; report 0.0 instead.
    average_score = sum(task_scores) / len(task_scores) if task_scores else 0.0
    summary = {
        "benchmark": BENCHMARK,
        "average_score": round(average_score, 4),
        "task_scores": task_scores,
    }
    print(json.dumps(summary), flush=True)


if __name__ == "__main__":
    main()
|
inference/inference.py
CHANGED
|
@@ -1,171 +1,9 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
Runs a rule-based baseline agent across all three difficulty levels.
|
| 11 |
-
"""
|
| 12 |
-
|
| 13 |
-
from __future__ import annotations
|
| 14 |
-
|
| 15 |
-
import sys
|
| 16 |
-
from pathlib import Path
|
| 17 |
-
|
| 18 |
-
ROOT_DIR = Path(__file__).resolve().parents[1]
|
| 19 |
-
if str(ROOT_DIR) not in sys.path:
|
| 20 |
-
sys.path.insert(0, str(ROOT_DIR))
|
| 21 |
-
|
| 22 |
-
from recall_env.env import RecallTraceEnv
|
| 23 |
-
from scenario.scenario import get_scenario, list_levels
|
| 24 |
-
from grader.grader import grade, compute_reward
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 28 |
-
# Baseline rule-based agent (uses the scenario's ground truth directly)
|
| 29 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 30 |
-
|
| 31 |
-
def build_action_sequence(env: RecallTraceEnv) -> list:
    """Build a scripted action sequence from the scenario's known structure.

    The sequence reads the environment's scenario template directly, so it is
    only suitable for smoke testing; a real agent would not have access to
    this ground truth.

    The produced order is: inspect every node, trace every contaminated lot
    (sorted by id), quarantine the full quantity of each contaminated lot at
    each node holding it, notify every node holding a contaminated lot, then
    finalize.

    Args:
        env: Environment whose ``_scenario_template`` mapping is read.

    Returns:
        A list of action dictionaries, each with a ``"type"`` key.
    """
    scenario = env._scenario_template
    node_map = scenario["nodes"]
    lot_catalog = scenario.get("lot_catalog", {})
    transformations = scenario.get("transformations", {})
    contaminated_lot = scenario.get("contaminated_lot", "")

    # Contaminated lots = everything flagged in the catalog, the recalled lot
    # itself, and its relabeled variant if one is recorded.
    contaminated_lots = {
        lot for lot, meta in lot_catalog.items() if meta.get("contaminated", False)
    }
    # Only add the recalled lot when the scenario actually names one; an empty
    # id would otherwise generate a bogus trace_lot action for "".
    if contaminated_lot:
        contaminated_lots.add(contaminated_lot)
        if contaminated_lot in transformations:
            contaminated_lots.add(transformations[contaminated_lot])

    # 1. Inspect all nodes.
    actions = [{"type": "inspect_node", "node_id": node_id} for node_id in node_map]

    # 2. Trace all contaminated lots in a stable order.
    actions.extend({"type": "trace_lot", "lot_id": lot_id} for lot_id in sorted(contaminated_lots))

    # 3. Quarantine all contaminated inventory (exact quantities).
    for node_id, node_data in node_map.items():
        for lot_id, qty in node_data["inventory"].items():
            if lot_id in contaminated_lots and qty > 0:
                actions.append(
                    {
                        "type": "quarantine",
                        "node_id": node_id,
                        "lot_id": lot_id,
                        "quantity": qty,
                    }
                )

    # 4. Notify all affected nodes. Node ids are dict keys and therefore
    #    unique, so no de-duplication is needed.
    for node_id, node_data in node_map.items():
        if any(lot_id in contaminated_lots for lot_id in node_data["inventory"]):
            actions.append({"type": "notify", "node_id": node_id})

    # 5. Finalize.
    actions.append({"type": "finalize"})
    return actions
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 87 |
-
# Episode runner
|
| 88 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 89 |
-
|
| 90 |
-
def run_episode(level: str) -> float:
    """Run the scripted baseline on one difficulty level and return its score.

    A first environment is created only to print the start banner and to
    derive the action sequence; the actions are then replayed on a second,
    freshly reset environment. After the episode the quarantined inventory of
    the live state is re-graded with the batch grader and printed as a
    cross-check.

    Args:
        level: Difficulty level passed to ``RecallTraceEnv``.

    Returns:
        The ``score`` entry of the final step's info, or ``0.0`` if the
        episode never reported ``done``.
    """
    DIVIDER = "=" * 62

    print(f"\n{DIVIDER}")
    print(f"[START] level={level.upper()}")

    planning_env = RecallTraceEnv(level=level)
    first_obs = planning_env.reset()

    print(f"[START] scenario_id : {planning_env._scenario_template['scenario_id']}")
    print(f"[START] recall_notice: {first_obs['recall_notice']}")
    print(f"[START] nodes : {first_obs['inspected_nodes']} (none inspected yet)")
    print(DIVIDER)

    planned_actions = build_action_sequence(planning_env)

    # Replay on a clean environment (planning only read the scenario template).
    live_env = RecallTraceEnv(level=level)
    live_env.reset()

    final_info = {}
    for action in planned_actions:
        _obs, reward, done, info = live_env.step(action)
        if info.get("error"):
            err = f" ⚠ {info['error']}"
        else:
            err = ""
        print(f"[STEP] action={action} reward={reward} done={done}{err}")
        if not done:
            continue
        final_info = info
        break

    score = final_info.get("score", 0.0)
    breakdown = final_info.get("breakdown", {})

    print(f"\n{DIVIDER}")
    print(f"[END] final_score : {score}")
    print(f"[END] quarantine_score : {breakdown.get('quarantine_score', '-')}")
    print(f"[END] notification_score : {breakdown.get('notification_score', '-')}")
    print(f"[END] all_stock_quarantined : {breakdown.get('all_stock_quarantined', '-')}")
    print(f"[END] all_affected_notified : {breakdown.get('all_affected_notified', '-')}")
    print(DIVIDER)

    # Independent cross-check with the batch grader: collect the quarantined
    # inventory from the live (post-episode) environment state.
    scenario = live_env._scenario_template
    quarantine_entries = []
    for node_id, node_data in live_env.state_data["nodes"].items():
        for lot_id, qty in node_data.get("quarantined_inventory", {}).items():
            if qty > 0:
                quarantine_entries.append({"node": node_id, "lot": lot_id, "qty": qty})
    agent_output = {"quarantine": quarantine_entries}

    batch_score = grade(agent_output, scenario)
    batch_reward = compute_reward(agent_output, scenario)
    print(f"[XCHECK] batch_grade={batch_score} batch_reward={batch_reward}")

    return score
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 150 |
-
# Main
|
| 151 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 152 |
-
|
| 153 |
-
def main():
    """Run the smoke test on every level and print a pass/fail summary.

    A level is reported as passing when its score is at least 0.8.
    """
    print("\nRecallTrace OpenEnv — Local Smoke Test (Phase 3 / Task 10)")
    print("Team: Shamanth MS | P G Ayush Rai | Shreya B J")

    results = {level: run_episode(level) for level in list_levels()}

    heavy_rule = "=" * 62
    print("\n" + heavy_rule)
    print("FINAL SUMMARY")
    print("-" * 62)
    for level, score in results.items():
        if score >= 0.8:
            status = "✓ PASS"
        else:
            status = "✗ FAIL"
        print(f" {level:8s} → score={score:.4f} {status}")
    print(heavy_rule)


if __name__ == "__main__":
    main()
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
import runpy
|
| 3 |
+
import sys
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
if __name__ == "__main__":
    # Thin launcher: put the repository root on sys.path and execute the
    # top-level inference.py as if it had been run directly.
    project_root = Path(__file__).resolve().parents[1]
    entry_script = project_root / "inference.py"
    sys.path.insert(0, str(project_root))
    runpy.run_path(str(entry_script), run_name="__main__")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
inference/policy.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Heuristic baseline policy for RecallTrace."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import re
|
| 7 |
+
from typing import Any, Dict, Optional
|
| 8 |
+
|
| 9 |
+
from openai import OpenAI
|
| 10 |
+
|
| 11 |
+
from env.models import RecallAction, RecallObservation
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# A lot identifier is the literal prefix "Lot" followed by one or more
# letters, digits or underscores, delimited by word boundaries.
LOT_PATTERN = re.compile(r"\bLot[A-Za-z0-9_]+\b")


def _extract_root_lot(observation: RecallObservation) -> str:
    """Return the first lot identifier found in the recall notice.

    Args:
        observation: Current observation; only ``recall_notice`` is read.

    Returns:
        The first substring matching ``LOT_PATTERN``, or ``"LotA"`` when the
        notice contains no lot identifier.
    """
    found = LOT_PATTERN.search(observation.recall_notice)
    if found is None:
        return "LotA"
    return found.group(0)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def choose_heuristic_action(observation: RecallObservation) -> RecallAction:
    """Pick the next deterministic action from the observable state only.

    The policy proceeds in a fixed order and returns as soon as one stage
    has work left:

    1. Trace the lot named in the recall notice if no trace result exists.
    2. Inspect the first traced node that has not been inspected yet.
    3. Quarantine the outstanding unsafe quantity of the first inspected
       lot that still has stock available, capped at the available stock.
    4. Send a notification (addressed to ``"all"``) if any traced node is
       missing from ``notified_nodes``.
    5. Otherwise finalize.

    Args:
        observation: Current environment observation.

    Returns:
        The action for the first stage that still has work to do.
    """
    root_lot = _extract_root_lot(observation)
    lineage = observation.trace_results.get(root_lot)
    if lineage is None:
        return RecallAction(type="trace_lot", lot_id=root_lot, rationale="Map the recall lineage first.")

    affected_nodes = lineage.get("affected_nodes", [])

    uninspected = [node for node in affected_nodes if node not in observation.inspected_nodes]
    if uninspected:
        return RecallAction(
            type="inspect_node",
            node_id=uninspected[0],
            rationale="Collect local evidence before quarantining.",
        )

    for node_id, findings in observation.inspection_results.items():
        for lot_id, finding in findings.items():
            already_isolated = observation.quarantined_inventory.get(node_id, {}).get(lot_id, 0)
            on_hand = observation.inventory.get(node_id, {}).get(lot_id, 0)
            shortfall = finding.unsafe_quantity - already_isolated
            if not (shortfall > 0 and on_hand > 0):
                continue
            # Never request more than is physically available at the node.
            return RecallAction(
                type="quarantine",
                node_id=node_id,
                lot_id=lot_id,
                quantity=min(shortfall, on_hand),
                rationale="Isolate the exact unsafe quantity discovered during inspection.",
            )

    if any(node not in observation.notified_nodes for node in affected_nodes):
        return RecallAction(
            type="notify",
            node_id="all",
            rationale="Alert every impacted stakeholder before closing the incident.",
        )

    return RecallAction(type="finalize", rationale="Containment actions are complete.")
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    if not text.startswith("```"):
        return text
    # Discard the opening fence line, which may carry a language tag ("```json").
    _, _, remainder = text.partition("\n")
    remainder = remainder.rstrip()
    if remainder.endswith("```"):
        remainder = remainder[:-3]
    return remainder.strip()


def choose_llm_action(
    client: Optional[OpenAI],
    model_name: str,
    observation: RecallObservation,
    history: list[dict[str, Any]],
) -> Optional[RecallAction]:
    """Ask an LLM for the next action, returning ``None`` on failure.

    The observation and the last six history entries are serialized to JSON
    and sent as the user message. The reply is validated as a
    ``RecallAction``; a reply wrapped in a Markdown code fence is unwrapped
    first so that it is not rejected as invalid JSON.

    Args:
        client: OpenAI client, or ``None`` to skip the LLM entirely.
        model_name: Model identifier passed to the chat completion call.
        observation: Current environment observation.
        history: Records of previous steps; only the last six are sent.

    Returns:
        The validated action, or ``None`` when no client is configured, the
        reply is empty, or anything in the request/validation path raises.
    """
    if client is None:
        return None

    prompt = {
        "task_id": observation.task_id,
        "phase": observation.phase,
        "notice": observation.recall_notice,
        "inventory": observation.inventory,
        "inspection_results": {
            node_id: {lot_id: evidence.model_dump() for lot_id, evidence in findings.items()}
            for node_id, findings in observation.inspection_results.items()
        },
        "trace_results": observation.trace_results,
        "notified_nodes": observation.notified_nodes,
        "quarantined_inventory": observation.quarantined_inventory,
        "steps_taken": observation.steps_taken,
        "remaining_step_budget": observation.remaining_step_budget,
        "history": history[-6:],
        "instruction": "Return only compact JSON with keys type,node_id,lot_id,quantity,rationale. Use one valid action.",
    }

    try:
        completion = client.chat.completions.create(
            model=model_name,
            temperature=0,
            max_tokens=180,
            messages=[
                {"role": "system", "content": "You are operating a deterministic product recall environment. Respond with only valid JSON for the next action."},
                {"role": "user", "content": json.dumps(prompt, sort_keys=True)},
            ],
        )
        text = (completion.choices[0].message.content or "").strip()
        # Chat models frequently wrap JSON in ``` fences despite instructions.
        text = _strip_code_fence(text)
        if not text:
            return None
        return RecallAction.model_validate_json(text)
    except Exception:
        # Deliberate best-effort boundary: any API, serialization or
        # validation failure means "no LLM action", and the caller falls back.
        return None
|
openenv.yaml
CHANGED
|
@@ -1,156 +1,48 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
- name: finalize
|
| 50 |
-
params: {}
|
| 51 |
-
description: >
|
| 52 |
-
Submit the containment plan. Triggers final scoring. Episode ends.
|
| 53 |
-
reward_hint: returns final_score in [0.0, 1.0]
|
| 54 |
-
|
| 55 |
-
# ── Observation Space ─────────────────────────────────────────────────────────
|
| 56 |
-
observation:
|
| 57 |
-
recall_notice:
|
| 58 |
-
type: str
|
| 59 |
-
description: Human-readable contamination alert issued at episode start
|
| 60 |
-
|
| 61 |
-
inventory:
|
| 62 |
-
type: dict
|
| 63 |
-
description: >
|
| 64 |
-
Full inventory snapshot across all nodes.
|
| 65 |
-
{ node_id: { lot_id: quantity } }
|
| 66 |
-
|
| 67 |
-
discovered_shipments:
|
| 68 |
-
type: dict
|
| 69 |
-
description: >
|
| 70 |
-
Outbound shipment edges revealed so far (only for inspected nodes).
|
| 71 |
-
{ node_id: [downstream_node_id, ...] }
|
| 72 |
-
|
| 73 |
-
history:
|
| 74 |
-
type: list[str]
|
| 75 |
-
description: Ordered log of all actions taken this episode
|
| 76 |
-
|
| 77 |
-
inspected_nodes:
|
| 78 |
-
type: list[str]
|
| 79 |
-
description: Sorted list of nodes that have been inspected
|
| 80 |
-
|
| 81 |
-
notified_nodes:
|
| 82 |
-
type: list[str]
|
| 83 |
-
description: Sorted list of nodes that have been sent recall alerts
|
| 84 |
-
|
| 85 |
-
quarantined_inventory:
|
| 86 |
-
type: dict
|
| 87 |
-
description: >
|
| 88 |
-
Inventory currently in quarantine (non-empty nodes only).
|
| 89 |
-
{ node_id: { lot_id: quantity } }
|
| 90 |
-
|
| 91 |
-
# ── Tasks ─────────────────────────────────────────────────────────────────────
|
| 92 |
-
tasks:
|
| 93 |
-
- id: easy
|
| 94 |
-
name: "Task 1 — Direct Recall"
|
| 95 |
-
assign: "Shreya B J"
|
| 96 |
-
description: >
|
| 97 |
-
Single contaminated lot (LotA) distributed across a linear
|
| 98 |
-
warehouse → store1 → store2 chain. No relabeling.
|
| 99 |
-
nodes: [warehouse, store1, store2]
|
| 100 |
-
contaminated_lots: [LotA]
|
| 101 |
-
|
| 102 |
-
- id: medium
|
| 103 |
-
name: "Task 2 — Relabeled Inventory"
|
| 104 |
-
assign: "Shreya B J"
|
| 105 |
-
description: >
|
| 106 |
-
LotA is contaminated; it was repacked and relabeled as LotA1
|
| 107 |
-
at the distribution centre. Agent must trace the transformation.
|
| 108 |
-
nodes: [warehouse, dist_centre, store_north, store_south]
|
| 109 |
-
contaminated_lots: [LotA, LotA1]
|
| 110 |
-
|
| 111 |
-
- id: hard
|
| 112 |
-
name: "Task 3 — Mixed Shipments"
|
| 113 |
-
assign: "Shreya B J"
|
| 114 |
-
description: >
|
| 115 |
-
Two contaminated lots (LotX, LotY) co-shipped with safe stock
|
| 116 |
-
(LotB, LotC) across a hub-and-spoke network. Precise quarantine required.
|
| 117 |
-
nodes: [plant_a, plant_b, hub, retail_east, retail_west, retail_central]
|
| 118 |
-
contaminated_lots: [LotX, LotY]
|
| 119 |
-
|
| 120 |
-
# ── Scoring ───────────────────────────────────────────────────────────────────
|
| 121 |
-
scoring:
|
| 122 |
-
range: [0.0, 1.0]
|
| 123 |
-
formula: "(quarantine_score + notification_score) / 2 − unnecessary_penalty"
|
| 124 |
-
components:
|
| 125 |
-
quarantine_score:
|
| 126 |
-
weight: 0.5
|
| 127 |
-
description: >
|
| 128 |
-
1 − ((missing_qty + over_qty) / total_affected_qty).
|
| 129 |
-
Full marks for exact quarantine of all affected lots.
|
| 130 |
-
notification_score:
|
| 131 |
-
weight: 0.5
|
| 132 |
-
description: >
|
| 133 |
-
fraction of affected nodes that were notified.
|
| 134 |
-
unnecessary_penalty:
|
| 135 |
-
max: 0.15
|
| 136 |
-
description: >
|
| 137 |
-
−0.05 per unnecessary quarantine (safe stock), capped at 0.15.
|
| 138 |
-
|
| 139 |
-
# ── OpenEnv Compliance ────────────────────────────────────────────────────────
|
| 140 |
-
compliance:
|
| 141 |
-
implements_reset: true
|
| 142 |
-
implements_step: true
|
| 143 |
-
implements_state: true
|
| 144 |
-
deterministic: true
|
| 145 |
-
typed_models: true
|
| 146 |
-
offline: true
|
| 147 |
-
reproducible: true
|
| 148 |
-
|
| 149 |
-
# ── Project Team ──────────────────────────────────────────────────────────────
|
| 150 |
-
team:
|
| 151 |
-
- name: "Shamanth MS"
|
| 152 |
-
tasks: [env_core, action_handler, ground_truth_system, connect_components, submission]
|
| 153 |
-
- name: "P G Ayush Rai"
|
| 154 |
-
tasks: [openenv_spec, docker_setup, openenv_validation, deploy_hf_spaces]
|
| 155 |
-
- name: "Shreya B J"
|
| 156 |
-
tasks: [scenario_expansion, grader_system, reward_function]
|
|
|
|
| 1 |
+
name: RecallTraceEnv
|
| 2 |
+
version: 1.0.0
|
| 3 |
+
description: Deterministic OpenEnv environment for supply-chain product recall tracing and precision containment.
|
| 4 |
+
entrypoint:
|
| 5 |
+
module: env.env
|
| 6 |
+
class: RecallTraceEnv
|
| 7 |
+
server:
|
| 8 |
+
module: server
|
| 9 |
+
app: app
|
| 10 |
+
models:
|
| 11 |
+
action: env.models.RecallAction
|
| 12 |
+
observation: env.models.RecallObservation
|
| 13 |
+
reward: env.models.RewardSignal
|
| 14 |
+
tasks:
|
| 15 |
+
- id: phase1_direct_recall
|
| 16 |
+
difficulty: easy
|
| 17 |
+
objective: Identify every location holding the recalled lot and quarantine all contaminated stock.
|
| 18 |
+
- id: phase2_relabel_recall
|
| 19 |
+
difficulty: medium
|
| 20 |
+
objective: Follow relabeled lots back to the source batch and quarantine every derived label precisely.
|
| 21 |
+
- id: phase3_mixed_shipments
|
| 22 |
+
difficulty: hard
|
| 23 |
+
objective: Contain only the unsafe quantity after contaminated stock was mixed with safe inventory during cross-docking.
|
| 24 |
+
interfaces:
|
| 25 |
+
methods:
|
| 26 |
+
- reset
|
| 27 |
+
- step
|
| 28 |
+
- state
|
| 29 |
+
actions:
|
| 30 |
+
- inspect_node
|
| 31 |
+
- trace_lot
|
| 32 |
+
- quarantine
|
| 33 |
+
- notify
|
| 34 |
+
- finalize
|
| 35 |
+
observation_fields:
|
| 36 |
+
- task_id
|
| 37 |
+
- phase
|
| 38 |
+
- recall_notice
|
| 39 |
+
- inventory
|
| 40 |
+
- discovered_shipments
|
| 41 |
+
- inspected_nodes
|
| 42 |
+
- inspection_results
|
| 43 |
+
- trace_results
|
| 44 |
+
- notified_nodes
|
| 45 |
+
- quarantined_inventory
|
| 46 |
+
- history
|
| 47 |
+
- steps_taken
|
| 48 |
+
- remaining_step_budget
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pyproject.toml
CHANGED
|
@@ -1,35 +1,23 @@
|
|
| 1 |
-
[build-system]
|
| 2 |
-
requires = ["setuptools>=68
|
| 3 |
-
build-backend = "setuptools.
|
| 4 |
-
|
| 5 |
-
[project]
|
| 6 |
-
name = "recalltrace-openenv"
|
| 7 |
-
version = "1.0.0"
|
| 8 |
-
description = "
|
| 9 |
-
readme = "README.md"
|
| 10 |
-
requires-python = ">=3.
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
[
|
| 24 |
-
dev = ["pytest", "gymnasium"]
|
| 25 |
-
|
| 26 |
-
[project.scripts]
|
| 27 |
-
server = "server.app:main"
|
| 28 |
-
|
| 29 |
-
[tool.setuptools.packages.find]
|
| 30 |
-
where = ["."]
|
| 31 |
-
include = ["env*", "scenario*", "grader*", "inference*", "server*"]
|
| 32 |
-
|
| 33 |
-
[tool.openenv]
|
| 34 |
-
entry_point = "env.env:RecallTraceEnv"
|
| 35 |
-
tasks = ["easy", "medium", "hard"]
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=68", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "recalltrace-openenv"
|
| 7 |
+
version = "1.0.0"
|
| 8 |
+
description = "Deterministic OpenEnv environment for supply-chain recall tracing and precision containment"
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.12"
|
| 11 |
+
dependencies = [
|
| 12 |
+
"fastapi>=0.115.0,<1.0.0",
|
| 13 |
+
"openai>=2.7.2,<3.0.0",
|
| 14 |
+
"openenv-core>=0.2.0",
|
| 15 |
+
"pydantic>=2.7.0,<3.0.0",
|
| 16 |
+
"uvicorn>=0.30.0,<1.0.0",
|
| 17 |
+
]
|
| 18 |
+
|
| 19 |
+
[project.scripts]
|
| 20 |
+
server = "server.app:main"
|
| 21 |
+
|
| 22 |
+
[tool.setuptools]
|
| 23 |
+
packages = ["env", "grader", "scenario", "baseline", "server"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -1,2 +1,5 @@
|
|
| 1 |
-
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.115.0,<1.0.0
|
| 2 |
+
openai>=2.7.2,<3.0.0
|
| 3 |
+
pydantic>=2.7.0,<3.0.0
|
| 4 |
+
uvicorn>=0.30.0,<1.0.0
|
| 5 |
+
openenv-core>=0.2.0,<1.0.0
|
scenario/__init__.py
CHANGED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Scenario package for RecallTrace."""
|
scenario/scenario.py
CHANGED
|
@@ -1,189 +1,363 @@
|
|
| 1 |
-
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
"
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
"
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
"
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
"
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
"
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
"
|
| 97 |
-
"
|
| 98 |
-
"
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
}
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
"
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
"
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
"
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
}
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic scenario catalog for RecallTrace."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from copy import deepcopy
|
| 6 |
+
from typing import Any, Dict, List
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Phase 1 ("easy"): LotA keeps its original label at every node, and each
# node's inspection finding marks its whole LotA quantity as unsafe.
PHASE1_SCENARIO: Dict[str, Any] = {
    "task_id": "phase1_direct_recall",
    "phase": 1,
    "difficulty": "easy",
    "name": "Direct Recall Containment",
    "objective": "Identify every location holding the recalled lot and quarantine all contaminated stock.",
    "max_steps": 10,
    "recall_notice": "Immediate recall: contaminated LotA detected in the cold-chain network.",
    "contaminated_lot": "LotA",
    # Adjacency list keyed by node id.
    # NOTE(review): presumably edges point from shipper to receiver - confirm against the env.
    "shipment_graph": {
        "warehouse": ["store1", "store2"],
        "store1": ["store2"],
        "store2": [],
    },
    # Per-lot metadata; "root_lot" names the batch a lot descends from.
    "lot_catalog": {
        "LotA": {
            "contaminated": True,
            "product": "ready_meal",
            "root_lot": "LotA",
            "notes": "Original contaminated production batch.",
        },
        "LotB": {
            "contaminated": False,
            "product": "ready_meal",
            "root_lot": "LotB",
            "notes": "Safe control batch.",
        },
    },
    # Per-node state: lot quantities on hand, an initially empty quarantine
    # ledger, and one inspection finding per lot held.
    "nodes": {
        "warehouse": {
            "inventory": {"LotA": 100},
            "quarantined_inventory": {},
            "inspection_findings": {
                "LotA": {
                    "status": "confirmed_contaminated",
                    "unsafe_quantity": 100,
                    "evidence": "QA retained sample matched the recall notice for LotA.",
                }
            },
        },
        "store1": {
            "inventory": {"LotA": 50},
            "quarantined_inventory": {},
            "inspection_findings": {
                "LotA": {
                    "status": "confirmed_contaminated",
                    "unsafe_quantity": 50,
                    "evidence": "Receiving records show unopened cases from LotA.",
                }
            },
        },
        "store2": {
            # Holds both the recalled lot and the safe control lot.
            "inventory": {"LotA": 20, "LotB": 30},
            "quarantined_inventory": {},
            "inspection_findings": {
                "LotA": {
                    "status": "confirmed_contaminated",
                    "unsafe_quantity": 20,
                    "evidence": "Backroom scan confirms LotA units remain unsold.",
                },
                "LotB": {
                    "status": "safe",
                    "unsafe_quantity": 0,
                    "evidence": "LotB is outside the recall scope.",
                },
            },
        },
    },
}
|
| 78 |
+
|
| 79 |
+
# Phase 2 ("medium"): the recalled batch also circulates under the derived
# labels LotA_R1 and LotA_R2, which the lot catalog links back to LotA via
# "relabeled_from" / "root_lot".
PHASE2_SCENARIO: Dict[str, Any] = {
    "task_id": "phase2_relabel_recall",
    "phase": 2,
    "difficulty": "medium",
    "name": "Relabeled Inventory Investigation",
    "objective": "Follow relabeled lots back to the source batch and quarantine every derived label precisely.",
    "max_steps": 14,
    "recall_notice": "Urgent recall: source LotA was relabeled during repacking and must be traced across derived labels.",
    "contaminated_lot": "LotA",
    # Adjacency list keyed by node id.
    # NOTE(review): presumably edges point from shipper to receiver - confirm against the env.
    "shipment_graph": {
        "warehouse": ["repack", "store1"],
        "repack": ["store2", "store3"],
        "store1": [],
        "store2": [],
        "store3": [],
    },
    "lot_catalog": {
        "LotA": {
            "contaminated": True,
            "product": "ready_meal",
            "root_lot": "LotA",
            "notes": "Original contaminated batch.",
        },
        # Relabel chain: LotA -> LotA_R1 -> LotA_R2, all rooted at LotA.
        "LotA_R1": {
            "contaminated": True,
            "product": "ready_meal",
            "root_lot": "LotA",
            "relabeled_from": "LotA",
            "notes": "Repacked under an internal secondary label.",
        },
        "LotA_R2": {
            "contaminated": True,
            "product": "ready_meal",
            "root_lot": "LotA",
            "relabeled_from": "LotA_R1",
            "notes": "Retail-ready relabel shipped after repacking.",
        },
        "LotB": {
            "contaminated": False,
            "product": "ready_meal",
            "root_lot": "LotB",
            "notes": "Safe control batch.",
        },
    },
    # Per-node state: lot quantities on hand, an initially empty quarantine
    # ledger, and one inspection finding per lot held.
    "nodes": {
        "warehouse": {
            "inventory": {"LotA": 40, "LotB": 30},
            "quarantined_inventory": {},
            "inspection_findings": {
                "LotA": {
                    "status": "confirmed_contaminated",
                    "unsafe_quantity": 40,
                    "evidence": "Source pallet labels match the recalled production run.",
                },
                "LotB": {
                    "status": "safe",
                    "unsafe_quantity": 0,
                    "evidence": "LotB remains outside the repacking stream.",
                },
            },
        },
        "repack": {
            "inventory": {"LotA_R1": 45},
            "quarantined_inventory": {},
            "inspection_findings": {
                "LotA_R1": {
                    "status": "confirmed_contaminated",
                    "unsafe_quantity": 45,
                    "evidence": "Repacking worksheet maps LotA directly to LotA_R1.",
                }
            },
        },
        "store1": {
            "inventory": {"LotA": 15, "LotB": 20},
            "quarantined_inventory": {},
            "inspection_findings": {
                "LotA": {
                    "status": "confirmed_contaminated",
                    "unsafe_quantity": 15,
                    "evidence": "Store retains cases with original LotA stickers.",
                },
                "LotB": {
                    "status": "safe",
                    "unsafe_quantity": 0,
                    "evidence": "LotB SKUs are unaffected.",
                },
            },
        },
        "store2": {
            "inventory": {"LotA_R1": 25},
            "quarantined_inventory": {},
            "inspection_findings": {
                "LotA_R1": {
                    "status": "confirmed_contaminated",
                    "unsafe_quantity": 25,
                    "evidence": "Receiving scan ties LotA_R1 to the repack facility transfer.",
                }
            },
        },
        "store3": {
            "inventory": {"LotA_R2": 20, "LotB": 10},
            "quarantined_inventory": {},
            "inspection_findings": {
                "LotA_R2": {
                    "status": "confirmed_contaminated",
                    "unsafe_quantity": 20,
                    "evidence": "Shelf tags reference the LotA_R2 relabel lineage.",
                },
                "LotB": {
                    "status": "safe",
                    "unsafe_quantity": 0,
                    "evidence": "LotB is a later safe shipment.",
                },
            },
        },
    },
}
|
| 196 |
+
|
| 197 |
+
# Phase 3 ("hard"): LotBlend mixes LotA with LotB, so its findings have
# status "mixed" and split each node's quantity into unsafe and safe parts.
PHASE3_SCENARIO: Dict[str, Any] = {
    "task_id": "phase3_mixed_shipments",
    "phase": 3,
    "difficulty": "hard",
    "name": "Mixed Inventory Precision Containment",
    "objective": "Contain only the unsafe quantity after contaminated stock was mixed with safe inventory during cross-docking.",
    "max_steps": 16,
    "recall_notice": "Critical recall: contaminated LotA was mixed with safe stock during cross-docking. Quarantine only the unsafe quantity.",
    "contaminated_lot": "LotA",
    # Adjacency list keyed by node id.
    # NOTE(review): presumably edges point from shipper to receiver - confirm against the env.
    "shipment_graph": {
        "warehouse": ["crossdock", "store1"],
        "crossdock": ["store2", "store3"],
        "store1": [],
        "store2": [],
        "store3": [],
    },
    "lot_catalog": {
        "LotA": {
            "contaminated": True,
            "product": "ready_meal",
            "root_lot": "LotA",
            "notes": "Contaminated upstream batch.",
        },
        # Flagged contaminated as a whole; the per-node findings below carry
        # the actual unsafe/safe split.
        "LotBlend": {
            "contaminated": True,
            "product": "ready_meal",
            "root_lot": "LotA",
            "mixed_from": ["LotA", "LotB"],
            "notes": "Cross-docked mixed lot containing both safe and unsafe units.",
        },
        "LotB": {
            "contaminated": False,
            "product": "ready_meal",
            "root_lot": "LotB",
            "notes": "Safe batch mixed into downstream palletization.",
        },
    },
    # Per-node state: lot quantities on hand, an initially empty quarantine
    # ledger, and one inspection finding per lot held.
    "nodes": {
        "warehouse": {
            "inventory": {"LotA": 30, "LotB": 25},
            "quarantined_inventory": {},
            "inspection_findings": {
                "LotA": {
                    "status": "confirmed_contaminated",
                    "unsafe_quantity": 30,
                    "evidence": "Source batch LotA remains fully unsafe at origin.",
                },
                "LotB": {
                    "status": "safe",
                    "unsafe_quantity": 0,
                    "evidence": "LotB remains unaffected at origin.",
                },
            },
        },
        "crossdock": {
            "inventory": {"LotBlend": 35, "LotB": 10},
            "quarantined_inventory": {},
            "inspection_findings": {
                # 12 unsafe + 23 safe = the 35 LotBlend units in inventory.
                "LotBlend": {
                    "status": "mixed",
                    "unsafe_quantity": 12,
                    "safe_quantity": 23,
                    "evidence": "Cross-dock exception log shows 12 unsafe units merged into LotBlend.",
                },
                "LotB": {
                    "status": "safe",
                    "unsafe_quantity": 0,
                    "evidence": "Standalone LotB pallet is outside the recall.",
                },
            },
        },
        "store1": {
            "inventory": {"LotA": 10, "LotB": 20},
            "quarantined_inventory": {},
            "inspection_findings": {
                "LotA": {
                    "status": "confirmed_contaminated",
                    "unsafe_quantity": 10,
                    "evidence": "Original LotA cases shipped directly before blending.",
                },
                "LotB": {
                    "status": "safe",
                    "unsafe_quantity": 0,
                    "evidence": "Store LotB stock is unaffected.",
                },
            },
        },
        "store2": {
            "inventory": {"LotBlend": 15},
            "quarantined_inventory": {},
            "inspection_findings": {
                # 8 unsafe + 7 safe = the 15 LotBlend units in inventory.
                "LotBlend": {
                    "status": "mixed",
                    "unsafe_quantity": 8,
                    "safe_quantity": 7,
                    "evidence": "Receiving variance report allocates 8 unsafe units to store2.",
                }
            },
        },
        "store3": {
            "inventory": {"LotBlend": 20, "LotB": 5},
            "quarantined_inventory": {},
            "inspection_findings": {
                # 4 unsafe + 16 safe = the 20 LotBlend units in inventory.
                "LotBlend": {
                    "status": "mixed",
                    "unsafe_quantity": 4,
                    "safe_quantity": 16,
                    "evidence": "Inventory reconciliation isolates 4 unsafe units in store3's mixed lot.",
                },
                "LotB": {
                    "status": "safe",
                    "unsafe_quantity": 0,
                    "evidence": "Separate LotB shelf stock is unaffected.",
                },
            },
        },
    },
}
|
| 315 |
+
|
| 316 |
+
# Catalog of every scenario dict, keyed by its own "task_id".
SCENARIOS: Dict[str, Dict[str, Any]] = {
    entry["task_id"]: entry
    for entry in (PHASE1_SCENARIO, PHASE2_SCENARIO, PHASE3_SCENARIO)
}
|
| 321 |
+
|
| 322 |
+
# Maps phase numbers 1..3 to the task_id of the matching scenario.
PHASE_LOOKUP: Dict[int, str] = {
    phase_number: entry["task_id"]
    for phase_number, entry in enumerate(
        (PHASE1_SCENARIO, PHASE2_SCENARIO, PHASE3_SCENARIO), start=1
    )
}
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def build_scenario(task_id: str | None = None, phase: int | None = None) -> Dict[str, Any]:
    """Return a fresh copy of the deterministic scenario for the requested task or phase.

    Args:
        task_id: Key into ``SCENARIOS``. When given, ``phase`` is not consulted.
        phase: Phase number resolved through ``PHASE_LOOKUP``; only used when
            ``task_id`` is ``None``. Phase 1 is used when both are ``None``.

    Returns:
        A deep copy of the catalog entry, so mutating the result never touches
        the module-level scenario definitions.

    Raises:
        ValueError: If ``phase`` or ``task_id`` does not name a known scenario.
    """
    if task_id is None:
        if phase is None:
            phase = 1
        # Report an unknown phase the same way as an unknown task_id instead of
        # leaking a bare KeyError whose message is only the offending key.
        if phase not in PHASE_LOOKUP:
            raise ValueError(f"Unknown phase {phase!r}. Expected one of {sorted(PHASE_LOOKUP)}.")
        task_id = PHASE_LOOKUP[phase]
    if task_id not in SCENARIOS:
        raise ValueError(f"Unknown task_id '{task_id}'. Expected one of {sorted(SCENARIOS)}.")
    return deepcopy(SCENARIOS[task_id])
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
def build_phase1_scenario() -> Dict[str, Any]:
    """Return a fresh copy of the phase 1 scenario."""
    phase1_task_id = PHASE1_SCENARIO["task_id"]
    return build_scenario(task_id=phase1_task_id)
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
def build_phase2_scenario() -> Dict[str, Any]:
    """Return a fresh copy of the phase 2 scenario."""
    phase2_task_id = PHASE2_SCENARIO["task_id"]
    return build_scenario(task_id=phase2_task_id)
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
def build_phase3_scenario() -> Dict[str, Any]:
    """Return a fresh copy of the phase 3 scenario."""
    phase3_task_id = PHASE3_SCENARIO["task_id"]
    return build_scenario(task_id=phase3_task_id)
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
def list_task_specs() -> List[Dict[str, Any]]:
    """Summarise every catalogued scenario as a small metadata dict.

    Each entry carries only the task_id, name, difficulty, objective and
    max_steps of a scenario, in catalog order.
    """
    summary_fields = ("task_id", "name", "difficulty", "objective", "max_steps")
    specs: List[Dict[str, Any]] = []
    for entry in SCENARIOS.values():
        specs.append({field: entry[field] for field in summary_fields})
    return specs
|
server.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# `app` is imported but not used in this file.
# NOTE(review): presumably re-exported so ASGI loaders can target this module - confirm.
from server.app import app, main


# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
server/__init__.py
CHANGED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Server package for RecallTrace."""
|
server/app.py
CHANGED
|
@@ -1,31 +1,152 @@
|
|
| 1 |
-
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
import
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI server for serving RecallTrace in Docker or Hugging Face Spaces."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
import uvicorn
|
| 9 |
+
from fastapi import FastAPI, HTTPException
|
| 10 |
+
from fastapi.responses import FileResponse
|
| 11 |
+
from fastapi.staticfiles import StaticFiles
|
| 12 |
+
from pydantic import BaseModel
|
| 13 |
+
|
| 14 |
+
from baseline.policy import choose_heuristic_action
|
| 15 |
+
from env.env import RecallTraceEnv
|
| 16 |
+
from env.models import RecallAction
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Directory containing this module; the dashboard assets live in its "static" subfolder.
BASE_DIR = Path(__file__).resolve().parent
STATIC_DIR = BASE_DIR / "static"

app = FastAPI(title="RecallTrace OpenEnv", version="1.0.0")
# Expose the files in STATIC_DIR under the /static URL prefix.
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")

# Module-level environment used by the /reset, /step and /state handlers.
# NOTE(review): every client shares this single instance - confirm that
# single-session use is intended.
ACTIVE_ENV = RecallTraceEnv()
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Request body for POST /reset. Both fields are optional and are passed
# straight to ACTIVE_ENV.reset() by the handler.
class ResetRequest(BaseModel):
    # Scenario identifier to load.
    task_id: Optional[str] = None
    # Phase number to load.
    phase: Optional[int] = None
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# Request body for POST /api/run_episode. Both fields are optional and are
# passed straight to _run_episode() by the handler.
class RunEpisodeRequest(BaseModel):
    # Scenario identifier to run.
    task_id: Optional[str] = None
    # Phase number to run.
    phase: Optional[int] = None
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# Serve the dashboard page stored as index.html in STATIC_DIR.
@app.get("/")
def root() -> FileResponse:
    index_page = STATIC_DIR / "index.html"
    return FileResponse(index_page)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# Fixed health payload; the handler performs no checks of its own.
@app.get("/health")
def health() -> dict:
    payload = {"status": "healthy"}
    return payload
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# List every task the environment class advertises, serialised to plain dicts.
@app.get("/tasks")
def tasks() -> dict:
    serialised = []
    for task in RecallTraceEnv.available_tasks():
        serialised.append(task.model_dump())
    return {"tasks": serialised}
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# Alias of /tasks under the /api prefix; delegates to tasks().
@app.get("/api/tasks")
def api_tasks() -> dict:
    task_listing = tasks()
    return task_listing
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# Reset the shared environment from query parameters and return the first
# observation. Any failure is reported to the client as HTTP 400.
@app.get("/reset")
def reset_get(task_id: Optional[str] = None, phase: Optional[int] = None) -> dict:
    try:
        first_observation = ACTIVE_ENV.reset(task_id=task_id, phase=phase)
        return first_observation.model_dump()
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# Reset the shared environment from a JSON body and return the first
# observation. Any failure is reported to the client as HTTP 400.
@app.post("/reset")
def reset_post(request: ResetRequest) -> dict:
    try:
        first_observation = ACTIVE_ENV.reset(task_id=request.task_id, phase=request.phase)
        return first_observation.model_dump()
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# Apply one action to the shared environment and return the resulting
# observation, reward, done flag and info. Any failure becomes HTTP 400.
@app.post("/step")
def step(action: RecallAction) -> dict:
    try:
        outcome = ACTIVE_ENV.step(action)
        observation, reward, done, info = outcome
        response_body = {
            "observation": observation.model_dump(),
            "reward": reward,
            "done": done,
            "info": info,
        }
        return response_body
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
# Return the shared environment's current state as a plain dict.
@app.get("/state")
def state() -> dict:
    snapshot = ACTIVE_ENV.state()
    return snapshot.model_dump()
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def _run_episode(task_id: str | None = None, phase: int | None = None) -> dict:
    """Play one episode with the heuristic policy on a private environment.

    A new ``RecallTraceEnv`` is created for the run, so the shared
    ``ACTIVE_ENV`` is not touched. The loop stops when the environment reports
    ``done`` or after ``env.task.max_steps`` actions, whichever comes first.

    Returns:
        A dict with the task metadata, the final score, a success flag
        (score >= 0.9), the number of steps taken, the last ``info`` dict, the
        final observation, and a per-step log.
    """
    env = RecallTraceEnv(task_id=task_id, phase=phase)
    observation = env.reset(task_id=task_id, phase=phase)
    transcript = []
    # Fallback used when the loop body never runs.
    last_info = {"score": 0.0}

    for step_index in range(env.task.max_steps):
        action = choose_heuristic_action(observation)
        observation, reward, done, info = env.step(action)
        log_entry = {
            "step": step_index + 1,
            "action": action.model_dump(exclude_none=True),
            "reward": reward,
            "done": done,
            "message": info.get("message"),
        }
        transcript.append(log_entry)
        last_info = info
        if done:
            break

    task_payload = env.task.model_dump()
    score = float(last_info.get("score", 0.0))
    return {
        "task": task_payload,
        "score": score,
        "success": score >= 0.9,
        "steps_taken": env.state().steps_taken,
        "final_info": last_info,
        "final_observation": observation.model_dump(),
        "logs": transcript,
    }
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
# Run one heuristic episode for the requested task or phase.
# Any failure is reported to the client as HTTP 400.
@app.post("/api/run_episode")
def run_episode(request: RunEpisodeRequest) -> dict:
    try:
        episode_report = _run_episode(task_id=request.task_id, phase=request.phase)
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    return episode_report
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
# Run the heuristic policy on every available task and report each episode
# together with the mean score rounded to 4 decimals.
# Any failure is reported to the client as HTTP 400.
@app.get("/api/run_all")
def run_all() -> dict:
    try:
        episodes = [_run_episode(task_id=task.task_id) for task in RecallTraceEnv.available_tasks()]
        # With no tasks registered the mean is undefined; report 0.0 instead of
        # letting ZeroDivisionError surface as a "division by zero" 400.
        if episodes:
            average_score = round(sum(item["score"] for item in episodes) / len(episodes), 4)
        else:
            average_score = 0.0
        return {
            "average_score": average_score,
            "episodes": episodes,
        }
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def main() -> None:
    """Start the uvicorn server on all interfaces.

    The port is read from the ``PORT`` environment variable and falls back to
    7860 when the variable is unset or empty.

    Raises:
        ValueError: If ``PORT`` is set to something that is not an integer.
    """
    import os  # local import: only this entry point needs it

    port = int(os.environ.get("PORT") or "7860")
    uvicorn.run(app, host="0.0.0.0", port=port)


if __name__ == "__main__":
    main()
|
server/static/app.js
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Cached references to the page elements this script reads and updates.
const taskSelect = document.getElementById("task-select");
const taskSummary = document.getElementById("task-summary");
const currentScore = document.getElementById("current-score");
const currentSteps = document.getElementById("current-steps");
const currentStatus = document.getElementById("current-status");
const allScore = document.getElementById("all-score");
const allResults = document.getElementById("all-results");
const episodeLog = document.getElementById("episode-log");
const rewardChart = document.getElementById("reward-chart");
const finalSummary = document.getElementById("final-summary");

// Task metadata loaded from /api/tasks; assigned in fetchTasks().
let taskCatalog = [];
|
| 13 |
+
|
| 14 |
+
// Render the name, difficulty, objective and step limit of `task` into the
// task summary panel.
// NOTE(review): fields are interpolated into innerHTML unescaped - fine only
// while task metadata comes from a trusted source; confirm.
function renderTaskSummary(task) {
  taskSummary.innerHTML = `
    <h3>${task.name}</h3>
    <p><strong>Difficulty:</strong> ${task.difficulty}</p>
    <p>${task.objective}</p>
    <p><strong>Max steps:</strong> ${task.max_steps}</p>
  `;
}
|
| 22 |
+
|
| 23 |
+
// Draw the per-step rewards in `logs` as an inline SVG line chart inside the
// reward chart panel. Each log entry supplies `reward` and `step`.
function buildLineChart(logs) {
  if (!logs.length) {
    rewardChart.innerHTML = "No rewards available.";
    return;
  }

  const width = 380;
  const height = 220;
  const padding = 28;
  const values = logs.map((entry) => entry.reward);
  // The vertical scale always includes 0 and 1, even if no reward reaches them.
  const maxReward = Math.max(...values, 1);
  const minReward = Math.min(...values, 0);
  // Lower bound on the scale span, so `range` is never below 0.25.
  const range = Math.max(maxReward - minReward, 0.25);

  const toX = (index) => {
    // A single point is centred; this also avoids dividing by zero below.
    if (logs.length === 1) {
      return width / 2;
    }
    return padding + (index * (width - padding * 2)) / (logs.length - 1);
  };

  // SVG y grows downward, so larger rewards map to smaller y values.
  const toY = (value) => {
    return height - padding - ((value - minReward) / range) * (height - padding * 2);
  };

  const linePoints = logs
    .map((entry, index) => `${toX(index)},${toY(entry.reward)}`)
    .join(" ");

  // Five evenly spaced grid lines across the plot area.
  const horizontalGuides = [0, 0.25, 0.5, 0.75, 1]
    .map((ratio) => {
      const y = padding + ratio * (height - padding * 2);
      return `<line class="chart-grid" x1="${padding}" y1="${y}" x2="${width - padding}" y2="${y}"></line>`;
    })
    .join("");

  // Step labels ("S1", "S2", ...) along the bottom edge.
  const labels = logs
    .map((entry, index) => {
      const x = toX(index);
      return `<text class="chart-label" x="${x}" y="${height - 8}" text-anchor="middle">S${entry.step}</text>`;
    })
    .join("");

  // One marker per step with its reward printed above it.
  const points = logs
    .map((entry, index) => {
      const x = toX(index);
      const y = toY(entry.reward);
      return `
        <circle class="chart-point" cx="${x}" cy="${y}" r="5"></circle>
        <text class="chart-label" x="${x}" y="${y - 10}" text-anchor="middle">${entry.reward.toFixed(2)}</text>
      `;
    })
    .join("");

  rewardChart.innerHTML = `
    <svg viewBox="0 0 ${width} ${height}" aria-label="Reward line chart">
      ${horizontalGuides}
      <line class="chart-axis" x1="${padding}" y1="${height - padding}" x2="${width - padding}" y2="${height - padding}"></line>
      <line class="chart-axis" x1="${padding}" y1="${padding}" x2="${padding}" y2="${height - padding}"></line>
      <polyline class="chart-line" points="${linePoints}"></polyline>
      ${points}
      ${labels}
    </svg>
  `;
}
|
| 88 |
+
|
| 89 |
+
// Render one finished episode: headline metrics, the reward chart, the
// grader summary cards and the step-by-step action log.
// `data` is the JSON payload returned by POST /api/run_episode.
function renderEpisode(data) {
  currentScore.textContent = data.score.toFixed(4);
  currentSteps.textContent = String(data.steps_taken);
  currentStatus.textContent = data.success ? "Contained" : "Needs work";

  buildLineChart(data.logs);

  // Grader sub-scores may be absent from final_info, hence the `?? 0` fallbacks.
  finalSummary.innerHTML = `
    <div class="summary-grid">
      <div class="summary-pill">
        <span>Final score</span>
        <strong>${data.score.toFixed(4)}</strong>
      </div>
      <div class="summary-pill">
        <span>Status</span>
        <strong>${data.success ? "Success" : "Needs improvement"}</strong>
      </div>
      <div class="summary-pill">
        <span>Steps used</span>
        <strong>${data.steps_taken}</strong>
      </div>
      <div class="summary-pill">
        <span>Quarantine quality</span>
        <strong>${(data.final_info.quarantine_score ?? 0).toFixed(4)}</strong>
      </div>
    </div>
    <div class="summary-card">
      <strong>Containment outcome</strong>
      <div>All affected nodes notified: ${data.final_info.all_affected_nodes_notified ? "Yes" : "No"}</div>
      <div>All affected stock quarantined: ${data.final_info.all_affected_stock_quarantined ? "Yes" : "No"}</div>
    </div>
    <div class="summary-card">
      <strong>Grader focus</strong>
      <div>Notification score: ${(data.final_info.notification_score ?? 0).toFixed(4)}</div>
      <div>Investigation score: ${(data.final_info.investigation_score ?? 0).toFixed(4)}</div>
      <div>Efficiency score: ${(data.final_info.efficiency_score ?? 0).toFixed(4)}</div>
    </div>
  `;

  const logMarkup = data.logs.map((entry) => {
    const actionType = entry.action.type || "action";
    const detailBits = [];
    if (entry.action.node_id) detailBits.push(`Node: ${entry.action.node_id}`);
    if (entry.action.lot_id) detailBits.push(`Lot: ${entry.action.lot_id}`);
    if (entry.action.quantity) detailBits.push(`Qty: ${entry.action.quantity}`);

    // A string pattern only replaces the first match; the global regex turns
    // every underscore of a multi-word action type into a space.
    const actionLabel = actionType.replace(/_/g, " ");

    return `
      <div class="log-step">
        <div class="log-title">
          <strong>Step ${entry.step}</strong>
          <span class="action-chip">${actionLabel}</span>
        </div>
        <div class="action-meta">
          <div>${detailBits.length ? detailBits.join(" | ") : "No extra parameters"}</div>
          <div>Reward: ${entry.reward.toFixed(4)}</div>
          <div>Message: ${entry.message || "-"}</div>
        </div>
      </div>
    `;
  }).join("");

  episodeLog.innerHTML = `
    <div class="log-step">
      <strong>Task:</strong> ${data.task.name}
    </div>
    ${logMarkup}
  `;
}
|
| 157 |
+
|
| 158 |
+
// Render the /api/run_all payload: the average score plus one summary card
// per episode (task name, difficulty, score, steps and status).
function renderRunAll(data) {
  allScore.textContent = data.average_score.toFixed(4);
  allResults.innerHTML = data.episodes.map((episode) => `
    <div class="log-step">
      <strong>${episode.task.name}</strong>
      <div>Difficulty: ${episode.task.difficulty}</div>
      <div>Score: ${episode.score.toFixed(4)}</div>
      <div>Steps: ${episode.steps_taken}</div>
      <div>Status: ${episode.success ? "Success" : "Needs work"}</div>
    </div>
  `).join("");
}
|
| 170 |
+
|
| 171 |
+
// Load the task catalog from /api/tasks, fill the task dropdown and show the
// summary of the first task. Reports a failed request or an empty catalog in
// the summary panel instead of throwing.
async function fetchTasks() {
  const response = await fetch("/api/tasks");
  if (!response.ok) {
    taskSummary.textContent = `Unable to load tasks (HTTP ${response.status}).`;
    return;
  }

  const data = await response.json();
  taskCatalog = data.tasks ?? [];

  taskSelect.innerHTML = taskCatalog.map((task) => `
    <option value="${task.task_id}">${task.difficulty.toUpperCase()} - ${task.name}</option>
  `).join("");

  // taskCatalog[0] is undefined for an empty catalog, which would make
  // renderTaskSummary throw.
  if (!taskCatalog.length) {
    taskSummary.textContent = "No tasks available.";
    return;
  }

  renderTaskSummary(taskCatalog[0]);
}
|
| 182 |
+
|
| 183 |
+
// Reset the server-side environment to the selected task and clear the
// episode panels. A rejected reset is shown as a failure rather than as a
// successful "Reset".
async function resetTask() {
  const taskId = taskSelect.value;
  const response = await fetch(`/reset?task_id=${encodeURIComponent(taskId)}`);
  const data = await response.json();

  if (!response.ok) {
    // Show the server's error payload instead of reporting a successful reset.
    currentStatus.textContent = "Reset failed";
    episodeLog.textContent = JSON.stringify(data, null, 2);
    return;
  }

  currentScore.textContent = "-";
  currentSteps.textContent = String(data.steps_taken || 0);
  currentStatus.textContent = "Reset";
  rewardChart.innerHTML = "Task reset. Run a task to render the reward trajectory.";
  finalSummary.innerHTML = "Readable scoring highlights will appear here.";
  episodeLog.textContent = JSON.stringify(data, null, 2);
}
|
| 194 |
+
|
| 195 |
+
// Run the heuristic policy on the selected task and render the result.
// A failed request is reported in the status and log panels.
async function runEpisode() {
  const response = await fetch("/api/run_episode", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ task_id: taskSelect.value }),
  });
  const data = await response.json();

  if (!response.ok) {
    // An error payload has no score or logs, so renderEpisode would throw on it.
    currentStatus.textContent = "Run failed";
    episodeLog.textContent = JSON.stringify(data, null, 2);
    return;
  }

  renderEpisode(data);
}
|
| 204 |
+
|
| 205 |
+
// Run the heuristic policy on every task and render the aggregate results.
// A failed request is reported in the results panel.
async function runAllTasks() {
  const response = await fetch("/api/run_all");
  const data = await response.json();

  if (!response.ok) {
    // An error payload has no average_score or episodes, so renderRunAll would throw on it.
    allResults.textContent = JSON.stringify(data, null, 2);
    return;
  }

  renderRunAll(data);
}
|
| 210 |
+
|
| 211 |
+
// Keep the summary panel in sync with the dropdown selection.
taskSelect.addEventListener("change", () => {
  const task = taskCatalog.find((item) => item.task_id === taskSelect.value);
  if (task) {
    renderTaskSummary(task);
  }
});

// Wire the three action buttons to their handlers.
document.getElementById("reset-button").addEventListener("click", resetTask);
document.getElementById("run-button").addEventListener("click", runEpisode);
document.getElementById("run-all-button").addEventListener("click", runAllTasks);

// Load the task catalog as soon as the script runs.
fetchTasks();
|
server/static/index.html
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>RecallTrace OpenEnv</title>
|
| 7 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 8 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 9 |
+
<link href="https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;700&family=IBM+Plex+Mono:wght@400;500&display=swap" rel="stylesheet">
|
| 10 |
+
<link rel="stylesheet" href="/static/styles.css?v=4">
|
| 11 |
+
</head>
|
| 12 |
+
<body>
|
| 13 |
+
<div class="page-shell">
|
| 14 |
+
<header class="hero">
|
| 15 |
+
<div class="hero-copy">
|
| 16 |
+
<span class="eyebrow">Safety-Critical OpenEnv Benchmark</span>
|
| 17 |
+
<h1>RecallTrace OpenEnv</h1>
|
| 18 |
+
<p class="hero-text">
|
| 19 |
+
A real-world supply-chain recall benchmark where agents must trace contaminated lots,
|
| 20 |
+
follow relabeled inventory lineage, inspect evidence, and quarantine only the unsafe stock.
|
| 21 |
+
</p>
|
| 22 |
+
<div class="badge-row">
|
| 23 |
+
<span class="badge">OpenEnv compliant</span>
|
| 24 |
+
<span class="badge">Deterministic grading</span>
|
| 25 |
+
<span class="badge">3 escalating tasks</span>
|
| 26 |
+
<span class="badge">Precision containment</span>
|
| 27 |
+
</div>
|
| 28 |
+
</div>
|
| 29 |
+
<div class="hero-panel">
|
| 30 |
+
<div class="metric-card">
|
| 31 |
+
<span class="metric-label">Average baseline</span>
|
| 32 |
+
<strong id="metric-average">0.9677</strong>
|
| 33 |
+
</div>
|
| 34 |
+
<div class="metric-card">
|
| 35 |
+
<span class="metric-label">Hard task focus</span>
|
| 36 |
+
<strong>Mixed safe/unsafe inventory</strong>
|
| 37 |
+
</div>
|
| 38 |
+
<div class="metric-card">
|
| 39 |
+
<span class="metric-label">Judging edge</span>
|
| 40 |
+
<strong>Operational realism over toy mechanics</strong>
|
| 41 |
+
</div>
|
| 42 |
+
</div>
|
| 43 |
+
</header>
|
| 44 |
+
|
| 45 |
+
<main class="dashboard-grid">
|
| 46 |
+
<section class="panel panel-accent">
|
| 47 |
+
<div class="panel-header">
|
| 48 |
+
<h2>Task Runner</h2>
|
| 49 |
+
<p>Choose a task and run the deterministic baseline to inspect the full trajectory.</p>
|
| 50 |
+
</div>
|
| 51 |
+
<div class="controls">
|
| 52 |
+
<label class="field">
|
| 53 |
+
<span>Task level</span>
|
| 54 |
+
<select id="task-select"></select>
|
| 55 |
+
</label>
|
| 56 |
+
<div class="button-row">
|
| 57 |
+
<button id="reset-button" class="button button-secondary">Reset Task</button>
|
| 58 |
+
<button id="run-button" class="button button-primary">Run Episode</button>
|
| 59 |
+
<button id="run-all-button" class="button button-ghost">Run All Tasks</button>
|
| 60 |
+
</div>
|
| 61 |
+
</div>
|
| 62 |
+
<div id="task-summary" class="task-summary"></div>
|
| 63 |
+
</section>
|
| 64 |
+
|
| 65 |
+
<section class="panel">
|
| 66 |
+
<div class="panel-header">
|
| 67 |
+
<h2>Scoreboard</h2>
|
| 68 |
+
<p>Live summary of the current task and the multi-task baseline run.</p>
|
| 69 |
+
</div>
|
| 70 |
+
<div class="score-grid">
|
| 71 |
+
<div class="score-card">
|
| 72 |
+
<span>Current score</span>
|
| 73 |
+
<strong id="current-score">-</strong>
|
| 74 |
+
</div>
|
| 75 |
+
<div class="score-card">
|
| 76 |
+
<span>Steps taken</span>
|
| 77 |
+
<strong id="current-steps">-</strong>
|
| 78 |
+
</div>
|
| 79 |
+
<div class="score-card">
|
| 80 |
+
<span>Status</span>
|
| 81 |
+
<strong id="current-status">Ready</strong>
|
| 82 |
+
</div>
|
| 83 |
+
<div class="score-card">
|
| 84 |
+
<span>Average over all tasks</span>
|
| 85 |
+
<strong id="all-score">-</strong>
|
| 86 |
+
</div>
|
| 87 |
+
</div>
|
| 88 |
+
<div id="all-results" class="all-results empty-state">Run all tasks to compare easy, medium, and hard performance.</div>
|
| 89 |
+
</section>
|
| 90 |
+
|
| 91 |
+
<section class="panel panel-wide">
|
| 92 |
+
<div class="panel-header">
|
| 93 |
+
<h2>Episode Output</h2>
|
| 94 |
+
<p>Visual baseline trajectory, readable action summaries, and final grading highlights.</p>
|
| 95 |
+
</div>
|
| 96 |
+
<div class="episode-layout">
|
| 97 |
+
<div class="episode-visuals">
|
| 98 |
+
<div class="mini-panel">
|
| 99 |
+
<h3>Reward Curve</h3>
|
| 100 |
+
<div id="reward-chart" class="reward-chart empty-state">Run a task to render the reward trajectory.</div>
|
| 101 |
+
</div>
|
| 102 |
+
<div class="mini-panel">
|
| 103 |
+
<h3>Final Outcome</h3>
|
| 104 |
+
<div id="final-summary" class="final-summary empty-state">Readable scoring highlights will appear here.</div>
|
| 105 |
+
</div>
|
| 106 |
+
</div>
|
| 107 |
+
<div id="episode-log" class="episode-log empty-state">Run a task to populate the episode trajectory.</div>
|
| 108 |
+
</div>
|
| 109 |
+
</section>
|
| 110 |
+
|
| 111 |
+
<section class="panel">
|
| 112 |
+
<div class="panel-header">
|
| 113 |
+
<h2>Judge Lens</h2>
|
| 114 |
+
</div>
|
| 115 |
+
<div class="highlight-stack">
|
| 116 |
+
<div class="highlight-card">
|
| 117 |
+
<span class="highlight-title">Real-world utility</span>
|
| 118 |
+
<p>Models a safety-critical recall workflow that QA, operations, and supply-chain teams actually perform.</p>
|
| 119 |
+
</div>
|
| 120 |
+
<div class="highlight-card">
|
| 121 |
+
<span class="highlight-title">Frontier challenge</span>
|
| 122 |
+
<p>The hard task forces precision containment of mixed safe and unsafe stock under partial observability.</p>
|
| 123 |
+
</div>
|
| 124 |
+
<div class="highlight-card">
|
| 125 |
+
<span class="highlight-title">Benchmark quality</span>
|
| 126 |
+
<p>Deterministic graders evaluate precision, coverage, investigation depth, and efficiency with reproducible scores.</p>
|
| 127 |
+
</div>
|
| 128 |
+
</div>
|
| 129 |
+
</section>
|
| 130 |
+
|
| 131 |
+
<section class="panel">
|
| 132 |
+
<div class="panel-header">
|
| 133 |
+
<h2>Project Hub</h2>
|
| 134 |
+
</div>
|
| 135 |
+
<div class="link-list">
|
| 136 |
+
<a href="/health" target="_blank" rel="noreferrer">Health endpoint</a>
|
| 137 |
+
<a href="/reset" target="_blank" rel="noreferrer">Reset endpoint</a>
|
| 138 |
+
<a href="/tasks" target="_blank" rel="noreferrer">Task catalog JSON</a>
|
| 139 |
+
<a href="https://github.com/MS-Shamanth/recalltrace-openenv/tree/sham" target="_blank" rel="noreferrer">GitHub source</a>
|
| 140 |
+
<a href="https://huggingface.co/spaces/ms-shamanth/recalltrace-openenv/tree/main" target="_blank" rel="noreferrer">Space files</a>
|
| 141 |
+
<a href="https://www.docker.com/" target="_blank" rel="noreferrer">Docker runtime</a>
|
| 142 |
+
<a href="https://github.com/openenvai/openenv" target="_blank" rel="noreferrer">OpenEnv ecosystem</a>
|
| 143 |
+
</div>
|
| 144 |
+
</section>
|
| 145 |
+
</main>
|
| 146 |
+
</div>
|
| 147 |
+
<script src="/static/app.js?v=4"></script>
|
| 148 |
+
</body>
|
| 149 |
+
</html>
|
server/static/styles.css
ADDED
|
@@ -0,0 +1,499 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
:root {
|
| 2 |
+
--bg: #09111f;
|
| 3 |
+
--panel: rgba(16, 25, 40, 0.92);
|
| 4 |
+
--panel-strong: rgba(12, 20, 34, 0.98);
|
| 5 |
+
--text: #eef3ff;
|
| 6 |
+
--muted: #a8b4ca;
|
| 7 |
+
--border: rgba(255, 255, 255, 0.08);
|
| 8 |
+
--warning: #ff6f3c;
|
| 9 |
+
--warning-soft: rgba(255, 111, 60, 0.14);
|
| 10 |
+
--success: #38d39f;
|
| 11 |
+
--shadow: 0 24px 60px rgba(0, 0, 0, 0.4);
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
* {
|
| 15 |
+
box-sizing: border-box;
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
body {
|
| 19 |
+
margin: 0;
|
| 20 |
+
min-height: 100vh;
|
| 21 |
+
background:
|
| 22 |
+
radial-gradient(circle at top left, rgba(255, 111, 60, 0.18), transparent 30%),
|
| 23 |
+
radial-gradient(circle at top right, rgba(56, 211, 159, 0.14), transparent 26%),
|
| 24 |
+
linear-gradient(180deg, #08101d 0%, #050a14 100%);
|
| 25 |
+
color: var(--text);
|
| 26 |
+
font-family: "Space Grotesk", sans-serif;
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
.page-shell {
|
| 30 |
+
width: min(1280px, calc(100% - 32px));
|
| 31 |
+
margin: 32px auto 48px;
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
.hero,
|
| 35 |
+
.panel {
|
| 36 |
+
border: 1px solid var(--border);
|
| 37 |
+
background: var(--panel);
|
| 38 |
+
box-shadow: var(--shadow);
|
| 39 |
+
backdrop-filter: blur(16px);
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
.hero {
|
| 43 |
+
display: grid;
|
| 44 |
+
grid-template-columns: 1.6fr 1fr;
|
| 45 |
+
gap: 24px;
|
| 46 |
+
padding: 28px;
|
| 47 |
+
border-radius: 28px;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
.eyebrow {
|
| 51 |
+
display: inline-block;
|
| 52 |
+
margin-bottom: 12px;
|
| 53 |
+
color: var(--warning);
|
| 54 |
+
font-size: 0.9rem;
|
| 55 |
+
letter-spacing: 0.12em;
|
| 56 |
+
text-transform: uppercase;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
h1, h2, h3 {
|
| 60 |
+
margin: 0;
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
h1 {
|
| 64 |
+
font-size: clamp(2.4rem, 6vw, 4.8rem);
|
| 65 |
+
line-height: 0.95;
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
.hero-text,
|
| 69 |
+
.panel-header p,
|
| 70 |
+
.task-summary p,
|
| 71 |
+
.link-list,
|
| 72 |
+
.all-results,
|
| 73 |
+
.episode-log {
|
| 74 |
+
color: var(--muted);
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
.hero-text {
|
| 78 |
+
max-width: 60ch;
|
| 79 |
+
font-size: 1.08rem;
|
| 80 |
+
line-height: 1.6;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
.badge-row {
|
| 84 |
+
display: flex;
|
| 85 |
+
flex-wrap: wrap;
|
| 86 |
+
gap: 10px;
|
| 87 |
+
margin-top: 18px;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
.badge {
|
| 91 |
+
padding: 8px 12px;
|
| 92 |
+
border-radius: 999px;
|
| 93 |
+
background: rgba(255, 255, 255, 0.06);
|
| 94 |
+
border: 1px solid var(--border);
|
| 95 |
+
font-size: 0.92rem;
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
.hero-panel {
|
| 99 |
+
display: grid;
|
| 100 |
+
gap: 14px;
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
.metric-card,
|
| 104 |
+
.score-card {
|
| 105 |
+
padding: 18px;
|
| 106 |
+
border-radius: 20px;
|
| 107 |
+
background: var(--panel-strong);
|
| 108 |
+
border: 1px solid var(--border);
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
.metric-card strong,
|
| 112 |
+
.score-card strong {
|
| 113 |
+
display: block;
|
| 114 |
+
margin-top: 8px;
|
| 115 |
+
font-size: 1.25rem;
|
| 116 |
+
line-height: 1.3;
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
.metric-label,
|
| 120 |
+
.score-card span,
|
| 121 |
+
.field span {
|
| 122 |
+
color: var(--muted);
|
| 123 |
+
font-size: 0.95rem;
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
.dashboard-grid {
|
| 127 |
+
display: grid;
|
| 128 |
+
grid-template-columns: 1.1fr 0.9fr;
|
| 129 |
+
gap: 20px;
|
| 130 |
+
margin-top: 20px;
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
.panel {
|
| 134 |
+
padding: 24px;
|
| 135 |
+
border-radius: 24px;
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
.panel-accent {
|
| 139 |
+
background:
|
| 140 |
+
linear-gradient(180deg, rgba(255, 111, 60, 0.12), transparent 55%),
|
| 141 |
+
var(--panel);
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
.panel-wide {
|
| 145 |
+
grid-column: 1 / -1;
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
.panel-header {
|
| 149 |
+
margin-bottom: 18px;
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
.panel-header p {
|
| 153 |
+
margin-top: 8px;
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
.controls {
|
| 157 |
+
display: grid;
|
| 158 |
+
gap: 18px;
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
.field {
|
| 162 |
+
display: grid;
|
| 163 |
+
gap: 8px;
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
select,
|
| 167 |
+
button {
|
| 168 |
+
font: inherit;
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
select {
|
| 172 |
+
padding: 14px 16px;
|
| 173 |
+
border-radius: 16px;
|
| 174 |
+
border: 1px solid var(--border);
|
| 175 |
+
background: rgba(7, 13, 24, 0.96);
|
| 176 |
+
color: var(--text);
|
| 177 |
+
font-weight: 600;
|
| 178 |
+
box-shadow: inset 0 0 0 1px rgba(255, 255, 255, 0.03);
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
select:focus {
|
| 182 |
+
outline: 2px solid rgba(255, 111, 60, 0.45);
|
| 183 |
+
outline-offset: 2px;
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
select option {
|
| 187 |
+
background: #0d1525;
|
| 188 |
+
color: var(--text);
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
.button-row {
|
| 192 |
+
display: flex;
|
| 193 |
+
flex-wrap: wrap;
|
| 194 |
+
gap: 12px;
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
.button {
|
| 198 |
+
border: none;
|
| 199 |
+
border-radius: 16px;
|
| 200 |
+
padding: 14px 18px;
|
| 201 |
+
cursor: pointer;
|
| 202 |
+
transition: transform 0.2s ease, opacity 0.2s ease, box-shadow 0.2s ease;
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
.button:hover {
|
| 206 |
+
transform: translateY(-1px);
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
.button-primary {
|
| 210 |
+
background: linear-gradient(135deg, #ff934f 0%, #ff6f3c 100%);
|
| 211 |
+
color: #fff;
|
| 212 |
+
box-shadow: 0 14px 32px rgba(255, 111, 60, 0.24);
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
.button-secondary {
|
| 216 |
+
background: rgba(255, 255, 255, 0.07);
|
| 217 |
+
color: var(--text);
|
| 218 |
+
border: 1px solid var(--border);
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
.button-ghost {
|
| 222 |
+
background: rgba(56, 211, 159, 0.12);
|
| 223 |
+
color: #dffff4;
|
| 224 |
+
border: 1px solid rgba(56, 211, 159, 0.24);
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
.task-summary {
|
| 228 |
+
margin-top: 18px;
|
| 229 |
+
padding: 18px;
|
| 230 |
+
border-radius: 18px;
|
| 231 |
+
background: rgba(255, 255, 255, 0.04);
|
| 232 |
+
border: 1px solid var(--border);
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
.task-summary h3 {
|
| 236 |
+
margin: 0 0 8px;
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
.score-grid {
|
| 240 |
+
display: grid;
|
| 241 |
+
grid-template-columns: repeat(2, minmax(0, 1fr));
|
| 242 |
+
gap: 12px;
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
.empty-state {
|
| 246 |
+
padding: 18px;
|
| 247 |
+
border: 1px dashed rgba(255, 255, 255, 0.16);
|
| 248 |
+
border-radius: 18px;
|
| 249 |
+
background: rgba(255, 255, 255, 0.03);
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
.episode-layout {
|
| 253 |
+
display: grid;
|
| 254 |
+
grid-template-columns: 460px minmax(0, 1fr);
|
| 255 |
+
gap: 22px;
|
| 256 |
+
align-items: start;
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
.episode-visuals {
|
| 260 |
+
display: grid;
|
| 261 |
+
gap: 18px;
|
| 262 |
+
position: sticky;
|
| 263 |
+
top: 16px;
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
.mini-panel {
|
| 267 |
+
padding: 18px;
|
| 268 |
+
border-radius: 20px;
|
| 269 |
+
background: var(--panel-strong);
|
| 270 |
+
border: 1px solid var(--border);
|
| 271 |
+
}
|
| 272 |
+
|
| 273 |
+
.episode-log,
|
| 274 |
+
.all-results {
|
| 275 |
+
font-family: "IBM Plex Mono", monospace;
|
| 276 |
+
font-size: 0.93rem;
|
| 277 |
+
line-height: 1.6;
|
| 278 |
+
white-space: pre-wrap;
|
| 279 |
+
}
|
| 280 |
+
|
| 281 |
+
.episode-log {
|
| 282 |
+
max-height: 760px;
|
| 283 |
+
min-height: 760px;
|
| 284 |
+
overflow-y: auto;
|
| 285 |
+
overflow-x: hidden;
|
| 286 |
+
padding: 22px;
|
| 287 |
+
border-radius: 20px;
|
| 288 |
+
background: var(--panel-strong);
|
| 289 |
+
border: 1px solid var(--border);
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
.all-results {
|
| 293 |
+
max-height: 240px;
|
| 294 |
+
overflow-y: auto;
|
| 295 |
+
padding-right: 10px;
|
| 296 |
+
}
|
| 297 |
+
|
| 298 |
+
.reward-chart {
|
| 299 |
+
min-height: 240px;
|
| 300 |
+
padding: 12px 8px 8px;
|
| 301 |
+
border-radius: 18px;
|
| 302 |
+
background: rgba(255, 255, 255, 0.03);
|
| 303 |
+
border: 1px solid var(--border);
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
.reward-chart svg {
|
| 307 |
+
display: block;
|
| 308 |
+
width: 100%;
|
| 309 |
+
height: 240px;
|
| 310 |
+
}
|
| 311 |
+
|
| 312 |
+
.chart-axis {
|
| 313 |
+
stroke: rgba(255, 255, 255, 0.15);
|
| 314 |
+
stroke-width: 1;
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
.chart-grid {
|
| 318 |
+
stroke: rgba(255, 255, 255, 0.08);
|
| 319 |
+
stroke-width: 1;
|
| 320 |
+
stroke-dasharray: 4 4;
|
| 321 |
+
}
|
| 322 |
+
|
| 323 |
+
.chart-line {
|
| 324 |
+
fill: none;
|
| 325 |
+
stroke: #38d39f;
|
| 326 |
+
stroke-width: 3;
|
| 327 |
+
stroke-linecap: round;
|
| 328 |
+
stroke-linejoin: round;
|
| 329 |
+
}
|
| 330 |
+
|
| 331 |
+
.chart-point {
|
| 332 |
+
fill: #ff6f3c;
|
| 333 |
+
stroke: #fff;
|
| 334 |
+
stroke-width: 2;
|
| 335 |
+
}
|
| 336 |
+
|
| 337 |
+
.chart-label {
|
| 338 |
+
fill: #a8b4ca;
|
| 339 |
+
font-size: 11px;
|
| 340 |
+
font-family: "IBM Plex Mono", monospace;
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
+
.final-summary {
|
| 344 |
+
display: grid;
|
| 345 |
+
gap: 12px;
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
.summary-card {
|
| 349 |
+
padding: 14px;
|
| 350 |
+
border-radius: 16px;
|
| 351 |
+
background: rgba(255, 255, 255, 0.04);
|
| 352 |
+
border: 1px solid var(--border);
|
| 353 |
+
}
|
| 354 |
+
|
| 355 |
+
.summary-card strong {
|
| 356 |
+
display: block;
|
| 357 |
+
margin-bottom: 6px;
|
| 358 |
+
font-size: 0.96rem;
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
.summary-grid {
|
| 362 |
+
display: grid;
|
| 363 |
+
grid-template-columns: repeat(2, minmax(0, 1fr));
|
| 364 |
+
gap: 10px;
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
.summary-pill {
|
| 368 |
+
padding: 12px;
|
| 369 |
+
border-radius: 14px;
|
| 370 |
+
background: rgba(255, 255, 255, 0.05);
|
| 371 |
+
border: 1px solid var(--border);
|
| 372 |
+
}
|
| 373 |
+
|
| 374 |
+
.summary-pill span {
|
| 375 |
+
display: block;
|
| 376 |
+
color: var(--muted);
|
| 377 |
+
font-size: 0.82rem;
|
| 378 |
+
margin-bottom: 6px;
|
| 379 |
+
}
|
| 380 |
+
|
| 381 |
+
.summary-pill strong {
|
| 382 |
+
font-size: 1rem;
|
| 383 |
+
}
|
| 384 |
+
|
| 385 |
+
.episode-log::-webkit-scrollbar,
|
| 386 |
+
.all-results::-webkit-scrollbar {
|
| 387 |
+
width: 10px;
|
| 388 |
+
}
|
| 389 |
+
|
| 390 |
+
.episode-log::-webkit-scrollbar-thumb,
|
| 391 |
+
.all-results::-webkit-scrollbar-thumb {
|
| 392 |
+
background: rgba(255, 255, 255, 0.14);
|
| 393 |
+
border-radius: 999px;
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
.log-step {
|
| 397 |
+
padding: 18px 0;
|
| 398 |
+
border-bottom: 1px solid rgba(255, 255, 255, 0.06);
|
| 399 |
+
}
|
| 400 |
+
|
| 401 |
+
.log-step:first-child {
|
| 402 |
+
padding-top: 0;
|
| 403 |
+
}
|
| 404 |
+
|
| 405 |
+
.log-step:last-child {
|
| 406 |
+
border-bottom: none;
|
| 407 |
+
padding-bottom: 0;
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
.log-step strong {
|
| 411 |
+
color: var(--text);
|
| 412 |
+
}
|
| 413 |
+
|
| 414 |
+
.log-title {
|
| 415 |
+
display: flex;
|
| 416 |
+
justify-content: space-between;
|
| 417 |
+
gap: 12px;
|
| 418 |
+
align-items: center;
|
| 419 |
+
margin-bottom: 10px;
|
| 420 |
+
}
|
| 421 |
+
|
| 422 |
+
.action-chip {
|
| 423 |
+
padding: 4px 10px;
|
| 424 |
+
border-radius: 999px;
|
| 425 |
+
background: var(--warning-soft);
|
| 426 |
+
color: #ffd6c5;
|
| 427 |
+
border: 1px solid rgba(255, 111, 60, 0.22);
|
| 428 |
+
font-size: 0.76rem;
|
| 429 |
+
text-transform: uppercase;
|
| 430 |
+
letter-spacing: 0.08em;
|
| 431 |
+
white-space: nowrap;
|
| 432 |
+
}
|
| 433 |
+
|
| 434 |
+
.action-meta {
|
| 435 |
+
display: grid;
|
| 436 |
+
gap: 8px;
|
| 437 |
+
color: var(--muted);
|
| 438 |
+
}
|
| 439 |
+
|
| 440 |
+
.highlight-stack {
|
| 441 |
+
display: grid;
|
| 442 |
+
gap: 12px;
|
| 443 |
+
}
|
| 444 |
+
|
| 445 |
+
.highlight-card {
|
| 446 |
+
padding: 16px;
|
| 447 |
+
border-radius: 18px;
|
| 448 |
+
background: rgba(255, 255, 255, 0.04);
|
| 449 |
+
border: 1px solid var(--border);
|
| 450 |
+
}
|
| 451 |
+
|
| 452 |
+
.highlight-card p {
|
| 453 |
+
margin: 8px 0 0;
|
| 454 |
+
color: var(--muted);
|
| 455 |
+
line-height: 1.6;
|
| 456 |
+
}
|
| 457 |
+
|
| 458 |
+
.highlight-title {
|
| 459 |
+
color: var(--text);
|
| 460 |
+
font-weight: 700;
|
| 461 |
+
}
|
| 462 |
+
|
| 463 |
+
.link-list {
|
| 464 |
+
display: grid;
|
| 465 |
+
gap: 12px;
|
| 466 |
+
}
|
| 467 |
+
|
| 468 |
+
.link-list a {
|
| 469 |
+
color: #ffd7c7;
|
| 470 |
+
text-decoration: none;
|
| 471 |
+
}
|
| 472 |
+
|
| 473 |
+
.link-list a:hover {
|
| 474 |
+
text-decoration: underline;
|
| 475 |
+
}
|
| 476 |
+
|
| 477 |
+
@media (max-width: 1100px) {
|
| 478 |
+
.episode-layout {
|
| 479 |
+
grid-template-columns: 1fr;
|
| 480 |
+
}
|
| 481 |
+
|
| 482 |
+
.episode-visuals {
|
| 483 |
+
position: static;
|
| 484 |
+
}
|
| 485 |
+
}
|
| 486 |
+
|
| 487 |
+
@media (max-width: 960px) {
|
| 488 |
+
.hero,
|
| 489 |
+
.dashboard-grid,
|
| 490 |
+
.summary-grid,
|
| 491 |
+
.score-grid {
|
| 492 |
+
grid-template-columns: 1fr;
|
| 493 |
+
}
|
| 494 |
+
|
| 495 |
+
.episode-log {
|
| 496 |
+
min-height: 520px;
|
| 497 |
+
max-height: 520px;
|
| 498 |
+
}
|
| 499 |
+
}
|
tests/test_env.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Unit tests for RecallTrace."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import unittest
|
| 6 |
+
|
| 7 |
+
from env.env import RecallTraceEnv
|
| 8 |
+
from grader.grader import evaluate_action_plan
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class RecallTraceEnvTests(unittest.TestCase):
    """Scenario checks covering the three RecallTrace task phases."""

    @staticmethod
    def _inspect(node_id: str) -> dict:
        # Build an ``inspect_node`` action for *node_id*.
        return {"type": "inspect_node", "node_id": node_id}

    @staticmethod
    def _quarantine(node_id: str, lot_id: str, quantity: int) -> dict:
        # Build a ``quarantine`` action; key order mirrors the hand-written plans.
        return {
            "type": "quarantine",
            "node_id": node_id,
            "lot_id": lot_id,
            "quantity": quantity,
        }

    def test_phase1_plan_scores_high(self) -> None:
        # Trace, inspect every node, quarantine every LotA holding, notify, finalize.
        plan = [{"type": "trace_lot", "lot_id": "LotA"}]
        plan += [self._inspect(node) for node in ("warehouse", "store1", "store2")]
        plan += [
            self._quarantine(node, "LotA", units)
            for node, units in (("warehouse", 100), ("store1", 50), ("store2", 20))
        ]
        plan += [{"type": "notify", "node_id": "all"}, {"type": "finalize"}]

        result = evaluate_action_plan("phase1_direct_recall", plan)

        self.assertGreaterEqual(result.score, 0.95)
        self.assertTrue(result.success)

    def test_phase2_trace_reveals_relabels(self) -> None:
        # A single trace of LotA should surface both relabeled descendants.
        environment = RecallTraceEnv(task_id="phase2_relabel_recall")
        environment.reset()

        trace_action = {"type": "trace_lot", "lot_id": "LotA"}
        obs, step_reward, finished, details = environment.step(trace_action)

        self.assertFalse(finished)
        self.assertGreater(step_reward, 0)
        self.assertEqual(details["matched_lots"], ["LotA", "LotA_R1", "LotA_R2"])
        self.assertIn("store3", obs.trace_results["LotA"]["affected_nodes"])

    def test_phase3_mixed_inventory_requires_exact_quarantine(self) -> None:
        # Quarantining 15 units of LotBlend at the crossdock is penalised;
        # the step info reports 12 as the contaminated target quantity.
        environment = RecallTraceEnv(task_id="phase3_mixed_shipments")
        environment.reset()
        environment.step({"type": "trace_lot", "lot_id": "LotA"})
        environment.step(self._inspect("crossdock"))

        step_result = environment.step(self._quarantine("crossdock", "LotBlend", 15))
        step_reward, details = step_result[1], step_result[3]

        self.assertLess(step_reward, 0)
        self.assertEqual(details["target_contaminated_quantity"], 12)

    def test_phase3_full_plan_scores_high(self) -> None:
        # Full plan with the exact quarantine quantity at each node.
        inspected_nodes = ("warehouse", "crossdock", "store1", "store2", "store3")
        quarantines = (
            ("warehouse", "LotA", 30),
            ("crossdock", "LotBlend", 12),
            ("store1", "LotA", 10),
            ("store2", "LotBlend", 8),
            ("store3", "LotBlend", 4),
        )

        plan = [{"type": "trace_lot", "lot_id": "LotA"}]
        plan += [self._inspect(node) for node in inspected_nodes]
        plan += [self._quarantine(node, lot, units) for node, lot, units in quarantines]
        plan += [{"type": "notify", "node_id": "all"}, {"type": "finalize"}]

        result = evaluate_action_plan("phase3_mixed_shipments", plan)

        self.assertGreaterEqual(result.score, 0.95)
        self.assertTrue(result.final_info["all_affected_stock_quarantined"])
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
if __name__ == "__main__":
|
| 72 |
+
unittest.main()
|
uv.toml
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
no-cache = true
|
| 2 |
+
python-preference = "only-system"
|
| 3 |
+
python-downloads = "never"
|