Spaces:

yashu2000
/

TemporalBenchEnv

Sleeping

App Files Files Community

yashu2000 committed on Apr 12

Commit

d954568

verified ·

1 Parent(s): 5e7b72a

Upload folder using huggingface_hub

Browse files

Files changed (42) hide show

.gitattributes +3 -0
Dockerfile +85 -0
README.md +196 -5
__init__.py +22 -0
client.py +86 -0
data/__init__.py +6 -0
data/banks/MIMIC_questions.json +0 -0
data/banks/PSML_questions.json +3 -0
data/banks/causal_chambers_questions.json +3 -0
data/banks/freshretailnet_questions.json +3 -0
data/banks/manifest.json +12 -0
data/loaders.py +114 -0
data/question.py +25 -0
env/__init__.py +6 -0
env/config.py +28 -0
env/episode_sampler.py +140 -0
env/grading.py +39 -0
env/models.py +64 -0
env/reward.py +44 -0
env/temporal_bench_env.py +233 -0
models.py +26 -0
openenv.yaml +7 -0
openenv_TemporalBenchEnv.egg-info/PKG-INFO +11 -0
openenv_TemporalBenchEnv.egg-info/SOURCES.txt +26 -0
openenv_TemporalBenchEnv.egg-info/dependency_links.txt +1 -0
openenv_TemporalBenchEnv.egg-info/entry_points.txt +2 -0
openenv_TemporalBenchEnv.egg-info/requires.txt +7 -0
openenv_TemporalBenchEnv.egg-info/top_level.txt +5 -0
pyproject.toml +41 -0
server/__init__.py +11 -0
server/app.py +79 -0
server/requirements.txt +6 -0
tests/conftest.py +11 -0
tests/fixtures/banks/MIMIC_questions.json +4 -0
tests/fixtures/banks/PSML_questions.json +14 -0
tests/fixtures/banks/causal_chambers_questions.json +4 -0
tests/fixtures/banks/freshretailnet_questions.json +4 -0
tests/test_env.py +79 -0
tests/test_loaders.py +26 -0
tests/test_reward.py +26 -0
tests/test_sampler.py +34 -0
uv.lock +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/banks/PSML_questions.json filter=lfs diff=lfs merge=lfs -text
+data/banks/causal_chambers_questions.json filter=lfs diff=lfs merge=lfs -text
+data/banks/freshretailnet_questions.json filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,85 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# Multi-stage build using openenv-base
+# This Dockerfile is flexible and works for both:
+# - In-repo environments (with local OpenEnv sources)
+# - Standalone environments (with openenv from PyPI/Git)
+# The build script (openenv build) handles context detection and sets appropriate build args.
+ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+FROM ${BASE_IMAGE} AS builder
+WORKDIR /app
+# Ensure git is available (required for installing dependencies from VCS)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends git && \
+    rm -rf /var/lib/apt/lists/*
+# Build argument to control whether we're building standalone or in-repo
+ARG BUILD_MODE=in-repo
+ARG ENV_NAME=TemporalBenchEnv
+# Copy environment code (always at root of build context)
+COPY . /app/env
+# For in-repo builds, openenv is already vendored in the build context
+# For standalone builds, openenv will be installed via pyproject.toml
+WORKDIR /app/env
+# Ensure uv is available (for local builds where base image lacks it)
+RUN if ! command -v uv >/dev/null 2>&1; then \
+        curl -LsSf https://astral.sh/uv/install.sh | sh && \
+        mv /root/.local/bin/uv /usr/local/bin/uv && \
+        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
+    fi
+# Install dependencies using uv sync
+# If uv.lock exists, use it; otherwise resolve on the fly
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ -f uv.lock ]; then \
+        uv sync --frozen --no-install-project --no-editable; \
+    else \
+        uv sync --no-install-project --no-editable; \
+    fi
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ -f uv.lock ]; then \
+        uv sync --frozen --no-editable; \
+    else \
+        uv sync --no-editable; \
+    fi
+# Final runtime stage
+FROM ${BASE_IMAGE}
+WORKDIR /app
+# Copy the virtual environment from builder
+COPY --from=builder /app/env/.venv /app/.venv
+# Copy the environment code
+COPY --from=builder /app/env /app/env
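+# Question banks ship with the environment code (data/banks/, copied above); the COPY below is a disabled alternative source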
+# COPY TS-benchmark/artifacts/temporal_bench_openenv_banks /app/env/data/banks
+ENV TEMPORALBENCH_QUESTION_BANK_DIR=/app/env/data/banks
+# Set PATH to use the virtual environment
+ENV PATH="/app/.venv/bin:$PATH"
+# Set PYTHONPATH so imports work correctly
+ENV PYTHONPATH="/app/env:$PYTHONPATH"
+# Health check
+HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:8000/health || exit 1
+# Run the FastAPI server
+# The module path is constructed to work with the /app/env structure
+ENV ENABLE_WEB_INTERFACE=true
+CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]

README.md CHANGED Viewed

@@ -1,10 +1,201 @@
 ---
-title: TemporalBenchEnv
-emoji: 📊
-colorFrom: red
-colorTo: yellow
 sdk: docker
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: TemporalBenchEnv MCQ Server
+emoji: 🥁
+colorFrom: yellow
+colorTo: indigo
 sdk: docker
 pinned: false
+app_port: 8000
+base_path: /web
+tags:
+  - openenv
 ---
+# TemporalBenchEnv
+OpenEnv environment for **multi-step multiple-choice** time-series reasoning. Each episode samples nine questions from pre-built JSON banks (per-dataset files or merged JSONL in `TSQuestion` shape). Rewards combine per-step correctness and an episode bonus (see `env/reward.py`).
+## Question bank layout
+Point the server at a directory containing `PSML_questions.json`, `freshretailnet_questions.json`, `MIMIC_questions.json`, and `causal_chambers_questions.json` (each file is a JSON array of `TSQuestion` records), or set **`TEMPORALBENCH_QUESTION_BANK_DIR`** to that path. If unset, the server uses `tests/fixtures/banks` when present (for local smoke runs).
+Each record must include at least: `question_id`, `dataset`, `task_type` (`T1U` | `T3` | `T2_MCQ`), `prompt`, `options` (length ≥ 2), `answer`, plus optional `family`, `capability_tags`, `difficulty`, `metadata`.
+## Quick Start
+Use the typed client (`TemporalBenchEnvClient`; alias `TemporalbenchenvEnv`):
+```python
+from client import TemporalBenchAction, TemporalBenchEnvClient
+# Create the client before the try block so that `finally` never sees an unbound `env`
+env = TemporalBenchEnvClient.from_docker_image("temporalbenchenv-env:latest")
+try:
+    out = env.reset()
+    while not out.done:
+        q = out.observation
+        # Agent picks q.options[i] or equivalent label string
+        out = env.step(TemporalBenchAction(answer=q.options[0]))
+finally:
+    env.close()
+```
+`TemporalBenchEnvClient.from_docker_image()` handles:
+- Starting the Docker container
+- Waiting for the server to be ready
+- Connecting to the environment
+- Container cleanup when you call `close()`
+## Building the Docker Image
+Before using the environment, you need to build the Docker image (Docker requires image names to be lowercase):
+```bash
+# From project root
+docker build -t temporalbenchenv-env:latest -f server/Dockerfile .
+```
+## Deploying to Hugging Face Spaces
+You can easily deploy your OpenEnv environment to Hugging Face Spaces using the `openenv push` command:
+```bash
+# From the environment directory (where openenv.yaml is located)
+openenv push
+# Or specify options
+openenv push --namespace my-org --private
+```
+The `openenv push` command will:
+1. Validate that the directory is an OpenEnv environment (checks for `openenv.yaml`)
+2. Prepare a custom build for Hugging Face Docker space (enables web interface)
+3. Upload to Hugging Face (ensuring you're logged in)
+### Prerequisites
+- Authenticate with Hugging Face: The command will prompt for login if not already authenticated
+### Options
+- `--directory`, `-d`: Directory containing the OpenEnv environment (defaults to current directory)
+- `--repo-id`, `-r`: Repository ID in format 'username/repo-name' (defaults to 'username/env-name' from openenv.yaml)
+- `--base-image`, `-b`: Base Docker image to use (overrides Dockerfile FROM)
+- `--private`: Deploy the space as private (default: public)
+### Examples
+```bash
+# Push to your personal namespace (defaults to username/env-name from openenv.yaml)
+openenv push
+# Push to a specific repository
+openenv push --repo-id my-org/my-env
+# Push with a custom base image
+openenv push --base-image ghcr.io/meta-pytorch/openenv-base:latest
+# Push as a private space
+openenv push --private
+# Combine options
+openenv push --repo-id my-org/my-env --base-image custom-base:latest --private
+```
+After deployment, your space will be available at:
+`https://huggingface.co/spaces/<repo-id>`
+The deployed space includes:
+- **Web Interface** at `/web` - Interactive UI for exploring the environment
+- **API Documentation** at `/docs` - Full OpenAPI/Swagger interface
+- **Health Check** at `/health` - Container health monitoring
+- **WebSocket** at `/ws` - Persistent session endpoint for low-latency interactions
+## Environment Details
+### Action (`TemporalBenchAction`)
+- `answer` (str) — MCQ label (must match ground truth after optional normalization)
+- `confidence`, `reasoning` — optional
+### Observation (`TemporalBenchObservation`)
+- `question`, `options`, `task_type`, `dataset`, `history`, `accuracy_so_far`
+- `step_idx`, `steps_remaining`, `max_steps`, `done`, `reward`, `metadata`
+### Reward
+- Per step: `alpha * correctness` (correctness 0 or 1).
+- On the final step, adds episode bonus: `lambda_ep * (total_correct / N) * coverage_multiplier` (1.0 if every dataset in the episode has at least one correct answer, else 0.8).
+## Advanced Usage
+### Connecting to an Existing Server
+If you already have a TemporalBenchEnv server running, connect with:
+```python
+from client import TemporalBenchAction, TemporalBenchEnvClient
+with TemporalBenchEnvClient(base_url="http://localhost:8000") as env:
+    r = env.reset()
+    r = env.step(TemporalBenchAction(answer=r.observation.options[0]))
+```
+Note: `close()` does not stop a remote server you attached to with `base_url=...`.
+### Using the Context Manager
+The client supports context manager usage for automatic connection management:
+```python
+from client import TemporalBenchAction, TemporalBenchEnvClient
+with TemporalBenchEnvClient(base_url="http://localhost:8000") as env:
+    result = env.reset()
+    while not result.done:
+        ans = result.observation.options[0]
+        result = env.step(TemporalBenchAction(answer=ans))
+```
+The client uses WebSocket connections for:
+- **Lower latency**: No HTTP connection overhead per request
+- **Persistent session**: Server maintains your environment state
+- **Efficient for episodes**: Better for many sequential steps
+### Concurrent WebSocket Sessions
+The server uses **factory mode** (`create_app(_env_factory, ...)`) so each WebSocket session gets a fresh `TemporalBenchEnvironment`. Tune `max_concurrent_envs` in `server/app.py` as needed.
+## Development & Testing
+### Direct environment testing
+```bash
+uv sync --extra dev
+uv run pytest tests/
+```
+### Running Locally
+Run the server locally for development:
+```bash
+uvicorn server.app:app --reload
+```
+## Project Structure
+```
+TemporalBenchEnv/
+├── .dockerignore         # Docker build exclusions
+├── __init__.py            # Module exports
+├── README.md              # This file
+├── openenv.yaml           # OpenEnv manifest
+├── pyproject.toml         # Project metadata and dependencies
+├── uv.lock                # Locked dependencies (generated)
+├── client.py              # TemporalBenchEnvClient (alias TemporalbenchenvEnv)
+├── models.py              # Action / observation / state re-exports
+├── env/                   # Environment, sampler, grading, rewards
+├── data/                  # TSQuestion schema + JSON/JSONL loaders
+└── server/
+    ├── __init__.py        # Server module exports
+    ├── app.py             # FastAPI application (HTTP + WebSocket endpoints)
+    └── Dockerfile         # Container image definition
+```

__init__.py ADDED Viewed

	@@ -0,0 +1,22 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""TemporalBenchEnv — MCQ time-series reasoning for OpenEnv."""
+from client import TemporalBenchEnvClient, TemporalbenchenvEnv
+from models import (
+    TemporalBenchAction,
+    TemporalBenchObservation,
+    TemporalBenchState,
+)
+__all__ = [
+    "TemporalBenchAction",
+    "TemporalBenchObservation",
+    "TemporalBenchState",
+    "TemporalBenchEnvClient",
+    "TemporalbenchenvEnv",
+]

client.py ADDED Viewed

	@@ -0,0 +1,86 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Typed OpenEnv client for TemporalBenchEnv."""
+from typing import Any, Dict
+from openenv.core.client_types import StepResult
+from openenv.core.env_client import EnvClient
+try:
+    from env.models import (
+        TemporalBenchAction,
+        TemporalBenchObservation,
+        TemporalBenchState,
+    )
+except ImportError:
+    from TemporalBenchEnv.env.models import (
+        TemporalBenchAction,
+        TemporalBenchObservation,
+        TemporalBenchState,
+    )
+class TemporalBenchEnvClient(
+    EnvClient[TemporalBenchAction, TemporalBenchObservation, TemporalBenchState]
+):
+    """WebSocket client for TemporalBench MCQ episodes."""
+    def _step_payload(self, action: TemporalBenchAction) -> Dict[str, Any]:
+        """Serialize an action; optional fields are sent only when not ``None``."""
+        body: Dict[str, Any] = {"answer": action.answer}
+        for optional in ("confidence", "reasoning"):
+            value = getattr(action, optional)
+            if value is not None:
+                body[optional] = value
+        return body
+    def _parse_result(self, payload: Dict[str, Any]) -> StepResult[TemporalBenchObservation]:
+        """Turn a server payload into a ``StepResult``.
+        Observation fields come from ``payload["observation"]`` when that is a
+        dict, otherwise from the payload itself. ``done`` and ``reward`` prefer
+        the top-level payload and fall back to the observation dict.
+        """
+        nested = payload.get("observation")
+        if isinstance(nested, dict):
+            fields = nested
+        elif isinstance(payload, dict):
+            fields = payload
+        else:
+            fields = {}
+        done = payload.get("done", fields.get("done", False))
+        reward = payload.get("reward", fields.get("reward"))
+        obs_kwargs: Dict[str, Any] = {
+            "step_idx": int(fields.get("step_idx", 0)),
+            "steps_remaining": int(fields.get("steps_remaining", 0)),
+            "max_steps": int(fields.get("max_steps", 9)),
+            "question": str(fields.get("question", "")),
+            "options": list(fields.get("options", [])),
+            "task_type": str(fields.get("task_type", "")),
+            "dataset": str(fields.get("dataset", "")),
+            "history": list(fields.get("history", [])),
+            "accuracy_so_far": float(fields.get("accuracy_so_far", 0.0)),
+            "done": done,
+            "reward": reward,
+            "metadata": fields.get("metadata", {}),
+        }
+        return StepResult(
+            observation=TemporalBenchObservation(**obs_kwargs),
+            reward=reward,
+            done=done,
+        )
+    def _parse_state(self, payload: Dict[str, Any]) -> TemporalBenchState:
+        """Turn a server payload into a ``TemporalBenchState``.
+        Fields come from ``payload["state"]`` when that is a dict, otherwise
+        from the payload itself; missing keys fall back to the defaults below.
+        """
+        nested = payload.get("state")
+        if isinstance(nested, dict):
+            fields = nested
+        elif isinstance(payload, dict):
+            fields = payload
+        else:
+            fields = {}
+        state_kwargs: Dict[str, Any] = {
+            "episode_id": fields.get("episode_id"),
+            "step_count": int(fields.get("step_count", 0)),
+            "total_correct": int(fields.get("total_correct", 0)),
+            "total_questions": int(fields.get("total_questions", 9)),
+            "current_accuracy": float(fields.get("current_accuracy", 0.0)),
+            "primary_domain": str(fields.get("primary_domain", "PSML")),
+            "per_task_type_accuracy": dict(fields.get("per_task_type_accuracy", {})),
+            "total_reward": float(fields.get("total_reward", 0.0)),
+        }
+        return TemporalBenchState(**state_kwargs)
+TemporalbenchenvEnv = TemporalBenchEnvClient

data/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""Question bank types and loaders for TemporalBenchEnv."""
+from .question import TSQuestion
+from .loaders import load_question_banks
+__all__ = ["TSQuestion", "load_question_banks"]

data/banks/MIMIC_questions.json ADDED Viewed

The diff for this file is too large to render. See raw diff

data/banks/PSML_questions.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9fbe5a4b7ef698a2cf38cc1bad5e0a49e8ea2e8d5e694fed063992fb9b85202c
+size 31377743

data/banks/causal_chambers_questions.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8fecebc4f5dc8db88b83825bd35d0a9b003f16c8df43521d616e928104481163
+size 30021480

data/banks/freshretailnet_questions.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:475410d2787c20bf3d908e4f4b67febd78ff9da3f4e2bf993ef8a7c9af31431a
+size 31221095

data/banks/manifest.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "source_file": "/Users/yashaswisharma/Desktop/Academia Career/usc-gradschool/Research/AgentX/TS-Agents-Melady/TS-benchmark/task_merged_dev_with_labels_tiers.jsonl",
+  "input_rows": 764,
+  "total_ts_questions": 2775,
+  "per_dataset": {
+    "PSML": 750,
+    "freshretailnet": 616,
+    "MIMIC": 709,
+    "causal_chambers": 700
+  },
+  "openenv_loader": "Set TEMPORALBENCH_QUESTION_BANK_DIR to this directory."
+}

data/loaders.py ADDED Viewed

	@@ -0,0 +1,114 @@

+"""Load question banks from JSON or JSONL files."""
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Any
+from .question import TSQuestion
+# Canonical domain keys used by EpisodeSampler (must match bank files or dataset field)
+DEFAULT_DOMAIN_ORDER = ("PSML", "freshretailnet", "MIMIC", "causal_chambers")
+def _parse_records(raw: Any) -> list[dict[str, Any]]:
+    """Extract the dict records from a decoded JSON document.
+    Accepts a top-level list or an object holding a ``"questions"`` list;
+    entries that are not dicts are dropped. Any other shape raises ``ValueError``.
+    """
+    candidates = raw
+    if isinstance(raw, dict) and "questions" in raw:
+        candidates = raw["questions"]
+    if not isinstance(candidates, list):
+        raise ValueError("JSON root must be a list of objects or {\"questions\": [...]}")
+    return [item for item in candidates if isinstance(item, dict)]
+def _record_to_question(obj: dict[str, Any]) -> TSQuestion:
+    """Validate one raw record dict into a ``TSQuestion`` via ``model_validate``."""
+    return TSQuestion.model_validate(obj)
+def load_json_file(path: Path) -> list[TSQuestion]:
+    """Read one UTF-8 ``.json`` bank (array or ``{"questions": [...]}``) into questions."""
+    document = json.loads(path.read_text(encoding="utf-8"))
+    questions: list[TSQuestion] = []
+    for record in _parse_records(document):
+        questions.append(_record_to_question(record))
+    return questions
+def load_jsonl_file(path: Path) -> list[TSQuestion]:
+    """Load newline-delimited JSON; each non-blank line must be a full TSQuestion object.
+    Args:
+        path: UTF-8 encoded ``.jsonl`` file.
+    Returns:
+        Questions in file order; blank lines are skipped.
+    Raises:
+        ValueError: If a line is not valid JSON or does not decode to an object.
+            The message is prefixed with ``<path>:<line_no>``.
+    """
+    out: list[TSQuestion] = []
+    # Iterate the file object instead of ``str.splitlines()``: splitlines also
+    # breaks on U+2028 / U+2029 / U+0085, which may appear unescaped inside a
+    # JSON string and would cut a valid record in half.
+    with path.open(encoding="utf-8") as fh:
+        for line_no, line in enumerate(fh, start=1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                obj = json.loads(line)
+            except json.JSONDecodeError as e:
+                raise ValueError(f"{path}:{line_no}: invalid JSON: {e}") from e
+            if not isinstance(obj, dict):
+                raise ValueError(f"{path}:{line_no}: expected object per line")
+            out.append(_record_to_question(obj))
+    return out
+def load_question_banks(
+    bank_dir: Path | str | None,
+    *,
+    domain_order: tuple[str, ...] = DEFAULT_DOMAIN_ORDER,
+    explicit_files: list[Path | str] | None = None,
+) -> dict[str, list[TSQuestion]]:
+    """
+    Build per-dataset question pools keyed by dataset name.
+    Every domain in ``domain_order`` gets an entry (possibly empty). When
+    ``explicit_files`` is non-empty, only those files are read and ``bank_dir``
+    is ignored; records are grouped by their ``dataset`` field. Otherwise, if
+    ``bank_dir`` is given, ``<domain>_questions.json`` (exact or lower-cased
+    name) is loaded into each domain's pool, and every other ``*.json`` /
+    ``*.jsonl`` file in the directory (manifests excluded) is grouped by
+    ``dataset``. Raises ``NotADirectoryError`` if ``bank_dir`` is not a directory.
+    """
+    pools: dict[str, list[TSQuestion]] = {name: [] for name in domain_order}
+    def _read(fp: Path) -> list[TSQuestion]:
+        # .jsonl is line-delimited; everything else is parsed as one JSON document
+        if fp.suffix.lower() == ".jsonl":
+            return load_jsonl_file(fp)
+        return load_json_file(fp)
+    def _group(items: list[TSQuestion]) -> None:
+        # File each record under its own ``dataset`` key, creating pools on demand
+        for item in items:
+            pools.setdefault(item.dataset, []).append(item)
+    if explicit_files:
+        for entry in explicit_files:
+            _group(_read(Path(entry)))
+        return pools
+    if bank_dir is None:
+        return pools
+    root = Path(bank_dir)
+    if not root.is_dir():
+        raise NotADirectoryError(f"question_bank_path must be a directory: {root}")
+    # Names that the generic directory scan below must not load again
+    reserved = {"manifest.json", "build_manifest.json"}
+    # Per-dataset convention: PSML_questions.json etc. (first existing spelling wins)
+    for domain in domain_order:
+        per_domain_names = (f"{domain}_questions.json", f"{domain.lower()}_questions.json")
+        reserved.update(per_domain_names)
+        found = next((root / n for n in per_domain_names if (root / n).is_file()), None)
+        if found is not None:
+            pools[domain].extend(load_json_file(found))
+    # Any extra json/jsonl whose rows each carry a ``dataset`` field
+    for path in sorted(root.glob("*.json")) + sorted(root.glob("*.jsonl")):
+        if path.name not in reserved:
+            _group(_read(path))
+    return pools

data/question.py ADDED Viewed

	@@ -0,0 +1,25 @@

+"""Canonical question record for TemporalBench MCQ episodes."""
+from __future__ import annotations
+from typing import Any
+from pydantic import BaseModel, Field
+class TSQuestion(BaseModel):
+    """One MCQ item from a pre-built question bank.
+    Fields declared with ``...`` are required; the rest have defaults.
+    """
+    question_id: str = Field(..., description="Unique identifier")
+    dataset: str = Field(
+        ...,
+        description="Source dataset, e.g. PSML, freshretailnet, MIMIC, causal_chambers",
+    )
+    task_type: str = Field(..., description="T1U | T3 | T2_MCQ")
+    # Optional; only described for T3 items
+    family: str | None = Field(default=None, description="T3 family, e.g. S1:A")
+    prompt: str = Field(..., description="Full question text with context")
+    # At least two choices are required (min_length=2)
+    options: list[str] = Field(..., min_length=2, description="Answer choices")
+    answer: str = Field(..., description="Ground-truth label (matches one option after normalize)")
+    # Optional descriptive fields; each defaults to an empty container or None
+    capability_tags: list[str] = Field(default_factory=list)
+    difficulty: str | None = Field(default=None)
+    metadata: dict[str, Any] = Field(default_factory=dict)

env/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""TemporalBenchEnv core logic."""
+from .config import EnvConfig
+from .temporal_bench_env import TemporalBenchEnvironment
+__all__ = ["EnvConfig", "TemporalBenchEnvironment"]

env/config.py ADDED Viewed

	@@ -0,0 +1,28 @@

+"""TemporalBenchEnv configuration."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+DEFAULT_DOMAINS: tuple[str, ...] = ("PSML", "freshretailnet", "MIMIC", "causal_chambers")
+@dataclass
+class EnvConfig:
+    """Episode structure, rewards, and data paths."""
+    # Total questions per episode; the sampler rejects an episode of any other size
+    num_questions: int = 9
+    # Number of questions drawn from ``primary_domain``
+    primary_count: int = 6
+    # NOTE(review): not read by the sampler code shown in this commit, which takes
+    # one question from each non-primary domain -- confirm intended use
+    secondary_count: int = 3
+    # Must be one of ``all_domains``
+    primary_domain: str = "PSML"
+    # Intersected with the task types unlocked by ``curriculum_stage``
+    allowed_task_types: tuple[str, ...] = ("T1U", "T3", "T2_MCQ")
+    # Presumably the per-step reward scale and the episode-bonus weight -- confirm at call site
+    alpha: float = 1.0
+    lambda_ep: float = 0.5
+    # Described in env/grading.py as reserved for future multi-part T1U grading
+    partial_credit_t1u: bool = False
+    # When True, answers are lower-cased before comparison
+    case_insensitive_match: bool = False
+    # <=1 unlocks T1U only, 2 adds T3, anything higher also adds T2_MCQ
+    curriculum_stage: int = 3
+    # Presumably the question-bank directory and the RNG seed -- consumers not shown here
+    question_bank_path: str | None = None
+    seed: int | None = None
+    forecasting_enabled: bool = False
+    all_domains: tuple[str, ...] = field(default_factory=lambda: DEFAULT_DOMAINS)

env/episode_sampler.py ADDED Viewed

	@@ -0,0 +1,140 @@

+"""Sample 9-question episodes from per-dataset pools."""
+from __future__ import annotations
+from collections import defaultdict
+from typing import TYPE_CHECKING
+import numpy as np
+from .config import DEFAULT_DOMAINS, EnvConfig
+if TYPE_CHECKING:
+    from data.question import TSQuestion
+def _curriculum_task_types(config: EnvConfig) -> set[str]:
+    """Task types unlocked at the configured stage, limited to ``allowed_task_types``."""
+    level = int(config.curriculum_stage)
+    unlocked = {"T1U"}
+    if level >= 2:
+        unlocked.add("T3")
+    if level >= 3:
+        unlocked.add("T2_MCQ")
+    return unlocked & set(config.allowed_task_types)
+def _filter_pool(questions: list[TSQuestion], config: EnvConfig) -> list[TSQuestion]:
+    """Keep only the questions whose ``task_type`` is permitted for ``config``."""
+    permitted = _curriculum_task_types(config)
+    kept: list[TSQuestion] = []
+    for question in questions:
+        if question.task_type in permitted:
+            kept.append(question)
+    return kept
+class EpisodeSampler:
+    """Builds stratified multi-domain MCQ episodes.
+    Pools are filtered by task type once at construction; every random choice
+    goes through the injected ``rng``.
+    """
+    def __init__(
+        self,
+        pools: dict[str, list[TSQuestion]],
+        config: EnvConfig,
+        rng: np.random.Generator,
+    ):
+        """Store config/rng and keep, per domain, the questions that pass the task-type filter."""
+        self._config = config
+        self._rng = rng
+        self._pools: dict[str, list[TSQuestion]] = {}
+        for domain, qs in pools.items():
+            filtered = _filter_pool(list(qs), config)
+            # Domains left empty by the filter are dropped; sample_episode reports them
+            if filtered:
+                self._pools[domain] = filtered
+    def sample_episode(self) -> list[TSQuestion]:
+        """Return shuffled list of ``num_questions`` questions (default 9).
+        Draws ``primary_count`` questions from the primary domain and one from
+        each other domain, then shuffles the combined list.
+        Raises:
+            ValueError: If the primary domain is not in ``all_domains``, there
+                are not exactly three other domains, any domain has no
+                questions after filtering, or the total differs from
+                ``num_questions``.
+        """
+        cfg = self._config
+        n = cfg.num_questions
+        primary_n = cfg.primary_count
+        domains = list(cfg.all_domains)
+        primary = cfg.primary_domain
+        if primary not in domains:
+            raise ValueError(f"primary_domain {primary!r} not in all_domains {domains}")
+        others = [d for d in domains if d != primary]
+        # NOTE(review): the count of non-primary domains is hard-coded to 3 here;
+        # ``cfg.secondary_count`` is not consulted
+        if len(others) != 3:
+            raise ValueError("Expected exactly 3 non-primary domains")
+        for d in domains:
+            if d not in self._pools or not self._pools[d]:
+                raise ValueError(f"No questions available for dataset {d!r} after filtering")
+        primary_pool = list(self._pools[primary])
+        chosen: list[TSQuestion] = []
+        chosen.extend(self._sample_primary_stratified(primary_pool, primary_n))
+        # One uniformly random question from each non-primary domain
+        for d in others:
+            idx = int(self._rng.integers(0, len(self._pools[d])))
+            chosen.append(self._pools[d][idx])
+        if len(chosen) != n:
+            raise ValueError(f"Expected {n} questions, got {len(chosen)}")
+        order = self._rng.permutation(n)
+        return [chosen[i] for i in order]
+    def _sample_primary_stratified(self, pool: list[TSQuestion], k: int) -> list[TSQuestion]:
+        """Prefer spreading T3 across families; fill with uniform random.
+        Picks ``k`` questions from ``pool`` by cycling over ``(task_type, family)``
+        buckets and taking one random item per bucket per pass. Raises
+        ``ValueError`` when the pool holds fewer than ``k`` questions.
+        """
+        rng = self._rng
+        if len(pool) < k:
+            raise ValueError(f"Primary pool too small: need {k}, have {len(pool)}")
+        # Partition by (task_type, family or "")
+        buckets: dict[tuple[str, str], list[TSQuestion]] = defaultdict(list)
+        for q in pool:
+            fam = q.family or ""
+            buckets[(q.task_type, fam)].append(q)
+        # Round-robin order across T3 families first, then other keys
+        t3_families = sorted({fam for (tt, fam) in buckets if tt == "T3"})
+        rr_keys: list[tuple[str, str]] = [("T3", f) for f in t3_families]
+        other_keys = [key for key in buckets if key not in rr_keys]
+        rng.shuffle(other_keys)
+        key_order = rr_keys + other_keys
+        picks: list[TSQuestion] = []
+        while len(picks) < k and key_order:
+            progressed = False
+            # Iterate over a copy because exhausted keys are removed from key_order
+            for key in list(key_order):
+                if len(picks) >= k:
+                    break
+                b = buckets.get(key, [])
+                if not b:
+                    key_order.remove(key)
+                    continue
+                # Pop so the same question cannot be picked twice
+                idx = int(rng.integers(0, len(b)))
+                picks.append(b.pop(idx))
+                progressed = True
+            if not progressed:
+                break
+        # NOTE(review): len(pool) >= k is enforced above and every pool element
+        # sits in some bucket, so the loop above appears to always reach k picks;
+        # the two fill steps below look like defensive fallbacks.
+        # Uniform fill from remaining pool elements
+        remainder = [q for lst in buckets.values() for q in lst]
+        rng.shuffle(remainder)
+        for q in remainder:
+            if len(picks) >= k:
+                break
+            picks.append(q)
+        if len(picks) < k:
+            chosen_ids = {p.question_id for p in picks}
+            pool_rest = [q for q in pool if q.question_id not in chosen_ids]
+            need = k - len(picks)
+            if len(pool_rest) < need:
+                raise ValueError("Primary pool could not supply enough unique questions")
+            extra_idx = rng.choice(len(pool_rest), size=need, replace=False)
+            for i in extra_idx:
+                picks.append(pool_rest[int(i)])
+        return picks[:k]
+def default_domains_for_config(config: EnvConfig) -> tuple[str, ...]:
+    """Return ``config.all_domains`` as a tuple, or ``DEFAULT_DOMAINS`` when it is empty."""
+    if config.all_domains:
+        return tuple(config.all_domains)
+    return DEFAULT_DOMAINS

env/grading.py ADDED Viewed

	@@ -0,0 +1,39 @@

+"""MCQ answer grading."""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from data.question import TSQuestion
+from .config import EnvConfig
+def _normalize(s: str, *, lower: bool) -> str:
+    """Strip surrounding whitespace from ``s`` and lower-case it when ``lower`` is set."""
+    s = str(s).strip()
+    return s.lower() if lower else s
+def grade_answer(
+    submitted: str,
+    question: TSQuestion,
+    config: EnvConfig,
+) -> tuple[bool, float]:
+    """
+    Grade one MCQ submission against ``question.answer``.
+    Both strings are stripped, and lower-cased when
+    ``config.case_insensitive_match`` is set, then compared for equality.
+    Returns (is_fully_correct, partial_score in [0,1]): ``(True, 1.0)`` on a
+    match and ``(False, 0.0)`` otherwise.
+    ``partial_credit_t1u`` is reserved for future multi-part T1U grading and
+    is not consulted here.
+    """
+    lower = config.case_insensitive_match
+    truth = _normalize(question.answer, lower=lower)
+    pred = _normalize(submitted, lower=lower)
+    # Matching "via an option" needs no separate scan of ``question.options``:
+    # a submission equal to an option that equals the truth is, by
+    # transitivity, equal to the truth itself.
+    if pred == truth:
+        return True, 1.0
+    return False, 0.0

env/models.py ADDED Viewed

	@@ -0,0 +1,64 @@

+"""OpenEnv Pydantic models for TemporalBenchEnv."""
+from __future__ import annotations
+from typing import Any
+from pydantic import BaseModel, ConfigDict, Field
+try:
+    from openenv.core.env_server.types import Action as _ActionBase
+    from openenv.core.env_server.types import Observation as _ObservationBase
+    from openenv.core.env_server.types import State as _StateBase
+except ImportError:
+    _ActionBase = BaseModel
+    _ObservationBase = BaseModel
+    _StateBase = BaseModel
+class TemporalBenchAction(_ActionBase):
+    """Agent submits an MCQ answer (optional confidence / reasoning)."""
+    # Only when the openenv import failed and the base is plain pydantic:
+    # forbid unknown fields and declare ``metadata`` locally
+    # (presumably the openenv base already provides these -- confirm).
+    if _ActionBase is BaseModel:
+        model_config = ConfigDict(extra="forbid")
+        metadata: dict[str, Any] = Field(default_factory=dict)
+    answer: str = Field(..., description="MCQ answer label matching an option")
+    # Optional; when given it must lie in [0.0, 1.0]
+    confidence: float | None = Field(default=None, ge=0.0, le=1.0)
+    reasoning: str | None = Field(default=None, description="Optional chain-of-thought")
+class TemporalBenchObservation(_ObservationBase):
+    """Current question and progress."""
+    # Only when the openenv import failed and the base is plain pydantic:
+    # declare ``done`` / ``reward`` / ``metadata`` locally
+    # (presumably supplied by the openenv base otherwise -- confirm).
+    if _ObservationBase is BaseModel:
+        model_config = ConfigDict(extra="forbid")
+        done: bool = Field(default=False)
+        reward: float | None = Field(default=None)
+        metadata: dict[str, Any] = Field(default_factory=dict)
+    # Progress counters (non-negative; ``max_steps`` is at least 1)
+    step_idx: int = Field(..., ge=0)
+    steps_remaining: int = Field(..., ge=0)
+    max_steps: int = Field(default=9, ge=1)
+    # The question being asked
+    question: str = Field(..., description="Current MCQ prompt")
+    options: list[str] = Field(..., description="Answer choices")
+    task_type: str = Field(..., description="T1U | T3 | T2_MCQ")
+    dataset: str = Field(..., description="Source dataset")
+    # Entry schema is not defined in this module
+    history: list[dict[str, Any]] = Field(default_factory=list)
+    # Constrained to [0.0, 1.0]
+    accuracy_so_far: float = Field(default=0.0, ge=0.0, le=1.0)
+class TemporalBenchState(_StateBase):
+    """Serializable environment state."""
+    # Standalone fallback: without openenv the base is plain BaseModel, so
+    # ``episode_id`` / ``step_count`` are declared here. Unlike the action and
+    # observation fallbacks, extra fields are allowed on the state.
+    if _StateBase is BaseModel:
+        model_config = ConfigDict(extra="allow")
+        episode_id: str | None = Field(default=None)
+        step_count: int = Field(default=0, ge=0)
+    # Count of answers graded fully correct in the current episode.
+    total_correct: int = Field(default=0, ge=0)
+    # Episode length; the environment fills this from ``config.num_questions``.
+    total_questions: int = Field(default=9, ge=0)
+    # total_correct / answered questions (0.0 before any answer).
+    current_accuracy: float = Field(default=0.0, ge=0.0, le=1.0)
+    # Copied from ``config.primary_domain`` by the environment.
+    primary_domain: str = Field(default="PSML")
+    # Maps task type -> fraction correct among answered questions of that type.
+    per_task_type_accuracy: dict[str, float] = Field(default_factory=dict)
+    # Sum of all step rewards so far, including the episode bonus once paid.
+    total_reward: float = Field(default=0.0)

env/reward.py ADDED Viewed

	@@ -0,0 +1,44 @@

+"""Reward helpers for TemporalBenchEnv."""
+from __future__ import annotations
+from typing import Any
+import numpy as np
+def compute_mcq_reward(correctness_score: float, alpha: float = 1.0) -> float:
+    """Per-step reward: alpha * correctness (0..1).
+    Args:
+        correctness_score: Grade for the submitted answer, expected in [0, 1];
+            the value is not clamped here.
+        alpha: Scale factor applied to the grade.
+    Returns:
+        ``alpha * correctness_score`` as a plain ``float``.
+    """
+    return float(alpha) * float(correctness_score)
+def compute_forecasting_reward(
+    predicted: np.ndarray,
+    ground_truth: np.ndarray,
+    metric: str = "mse",
+    **kwargs: Any,
+) -> float:
+    """Placeholder for a forecasting reward; not implemented yet.
+    All arguments are accepted and discarded.
+    Raises:
+        NotImplementedError: Always.
+    """
+    del predicted, ground_truth, metric, kwargs
+    raise NotImplementedError("Forecasting reward not yet implemented")
+def compute_episode_bonus(
+    total_correct: int,
+    num_questions: int,
+    domain_coverage: dict[str, int],
+    *,
+    all_domains: tuple[str, ...],
+    lambda_ep: float = 0.5,
+) -> float:
+    """
+    Episode bonus: lambda_ep * episode_accuracy * domain_coverage_bonus.
+    ``domain_coverage`` maps dataset name -> count of *correct* answers in the
+    episode for that dataset. Bonus uses 1.0 if every domain in ``all_domains``
+    has at least one correct answer, else 0.8.
+    Args:
+        total_correct: Number of correctly answered questions in the episode.
+        num_questions: Episode length; accuracy is 0.0 when this is not positive.
+        domain_coverage: Correct-answer counts per dataset; missing datasets
+            count as 0.
+        all_domains: Datasets that must each have a correct answer for the
+            full 1.0 coverage factor.
+        lambda_ep: Overall weight of the bonus.
+    Returns:
+        The bonus as a ``float``.
+    """
+    # Guard against division by zero for an empty episode.
+    accuracy = total_correct / num_questions if num_questions > 0 else 0.0
+    counts = [int(domain_coverage.get(d, 0)) for d in all_domains]
+    # all() over an empty ``all_domains`` is True, so the factor is 1.0 then.
+    all_covered = all(c > 0 for c in counts)
+    coverage_bonus = 1.0 if all_covered else 0.8
+    return float(lambda_ep) * float(accuracy) * coverage_bonus

env/temporal_bench_env.py ADDED Viewed

	@@ -0,0 +1,233 @@

+"""Core OpenEnv environment for TemporalBench MCQ episodes."""
+from __future__ import annotations
+import uuid
+from collections import defaultdict
+from dataclasses import replace
+from typing import Any, Optional
+import numpy as np
+from data.loaders import load_question_banks
+from data.question import TSQuestion
+from .config import EnvConfig
+from .episode_sampler import EpisodeSampler
+from .grading import grade_answer
+from .models import TemporalBenchAction, TemporalBenchObservation, TemporalBenchState
+from .reward import compute_episode_bonus, compute_mcq_reward
+try:
+    from openenv.core.env_server.interfaces import Environment
+except ImportError:
+    # openenv is not importable: define a minimal generic stand-in exposing the
+    # reset / step / state members that this module implements, so the
+    # environment class below can still be defined and used (e.g. in tests).
+    from abc import ABC, abstractmethod
+    from typing import Generic, TypeVar
+    ActT = TypeVar("ActT")
+    ObsT = TypeVar("ObsT")
+    StateT = TypeVar("StateT")
+    class Environment(ABC, Generic[ActT, ObsT, StateT]):
+        """Abstract stand-in used only when openenv cannot be imported."""
+        @abstractmethod
+        def reset(self, seed=None, episode_id=None, **kwargs): ...
+        @abstractmethod
+        def step(self, action, timeout_s=None, **kwargs): ...
+        @property
+        @abstractmethod
+        def state(self): ...
+class TemporalBenchEnvironment(
+    Environment[TemporalBenchAction, TemporalBenchObservation, TemporalBenchState]
+):
+    """Multi-step MCQ environment over a pre-built TemporalBench question bank."""
+    SUPPORTS_CONCURRENT_SESSIONS: bool = True
+    def __init__(self, config: Optional[EnvConfig] = None, **kwargs: Any):
+        super().__init__(**kwargs)
+        self._config = config or EnvConfig()
+        seed = self._config.seed
+        self._rng = np.random.default_rng(seed)
+        self._banks = load_question_banks(self._config.question_bank_path)
+        self._sampler = EpisodeSampler(self._banks, self._config, self._rng)
+        self._episode_id: Optional[str] = None
+        self._questions: list[TSQuestion] = []
+        self._answered: int = 0
+        self._history: list[dict[str, Any]] = []
+        self._done: bool = False
+        self._total_correct: int = 0
+        self._total_reward: float = 0.0
+        self._domain_correct: dict[str, int] = defaultdict(int)
+        self._task_correct: dict[str, int] = defaultdict(int)
+        self._task_total: dict[str, int] = defaultdict(int)
+        self._last_metadata: dict[str, Any] = {}
+    def _accuracy_so_far(self) -> float:
+        if self._answered == 0:
+            return 0.0
+        return self._total_correct / self._answered
+    def _per_task_accuracy(self) -> dict[str, float]:
+        out: dict[str, float] = {}
+        for k, tot in self._task_total.items():
+            out[k] = (self._task_correct[k] / tot) if tot else 0.0
+        return out
+    def _build_observation(
+        self,
+        *,
+        reward: float | None,
+        done: bool,
+    ) -> TemporalBenchObservation:
+        n = self._config.num_questions
+        if done or self._answered >= n:
+            return TemporalBenchObservation(
+                step_idx=self._answered,
+                steps_remaining=0,
+                max_steps=n,
+                question="",
+                options=[],
+                task_type="",
+                dataset="",
+                history=list(self._history),
+                accuracy_so_far=self._accuracy_so_far(),
+                done=True,
+                reward=reward,
+                metadata=dict(self._last_metadata),
+            )
+        q = self._questions[self._answered]
+        steps_remaining = n - self._answered
+        return TemporalBenchObservation(
+            step_idx=self._answered,
+            steps_remaining=steps_remaining,
+            max_steps=n,
+            question=q.prompt,
+            options=list(q.options),
+            task_type=q.task_type,
+            dataset=q.dataset,
+            history=list(self._history),
+            accuracy_so_far=self._accuracy_so_far(),
+            done=False,
+            reward=reward,
+            metadata=dict(self._last_metadata),
+        )
+    def reset(
+        self,
+        seed: Optional[int] = None,
+        episode_id: Optional[str] = None,
+        **kwargs: Any,
+    ) -> TemporalBenchObservation:
+        curriculum_kw = kwargs.pop("curriculum_stage", None)
+        if seed is not None:
+            self._rng = np.random.default_rng(seed)
+        cfg = self._config
+        if curriculum_kw is not None:
+            cfg = replace(self._config, curriculum_stage=int(curriculum_kw))
+        self._sampler = EpisodeSampler(self._banks, cfg, self._rng)
+        self._episode_id = episode_id or str(uuid.uuid4())
+        self._questions = self._sampler.sample_episode()
+        self._answered = 0
+        self._history = []
+        self._done = False
+        self._total_correct = 0
+        self._total_reward = 0.0
+        self._domain_correct = defaultdict(int)
+        self._task_correct = defaultdict(int)
+        self._task_total = defaultdict(int)
+        self._last_metadata = {}
+        return self._build_observation(reward=0.0, done=False)
+    def step(
+        self,
+        action: TemporalBenchAction,
+        timeout_s: Optional[float] = None,
+        **kwargs: Any,
+    ) -> TemporalBenchObservation:
+        del timeout_s, kwargs
+        if self._done:
+            self._last_metadata = {"info": "Episode already done."}
+            return self._build_observation(reward=0.0, done=True)
+        self._last_metadata = {}
+        n = self._config.num_questions
+        if self._answered >= n:
+            self._done = True
+            self._last_metadata = {"info": "Episode already complete."}
+            return self._build_observation(reward=0.0, done=True)
+        q = self._questions[self._answered]
+        if not str(action.answer).strip():
+            self._last_metadata = {"error": "answer must be a non-empty string."}
+            return self._build_observation(reward=0.0, done=False)
+        fully_correct, score = grade_answer(action.answer, q, self._config)
+        r_step = compute_mcq_reward(score, alpha=self._config.alpha)
+        self._history.append(
+            {
+                "question_id": q.question_id,
+                "dataset": q.dataset,
+                "task_type": q.task_type,
+                "submitted": action.answer,
+                "correct": fully_correct,
+                "reward": r_step,
+            }
+        )
+        self._task_total[q.task_type] += 1
+        if fully_correct:
+            self._total_correct += 1
+            self._domain_correct[q.dataset] += 1
+            self._task_correct[q.task_type] += 1
+        self._answered += 1
+        total_reward_this_step = r_step
+        if self._answered >= n:
+            bonus = compute_episode_bonus(
+                self._total_correct,
+                n,
+                dict(self._domain_correct),
+                all_domains=tuple(self._config.all_domains),
+                lambda_ep=self._config.lambda_ep,
+            )
+            total_reward_this_step = r_step + bonus
+            self._done = True
+            self._last_metadata = {
+                "episode_bonus": bonus,
+                "domain_correct_counts": dict(self._domain_correct),
+            }
+        self._total_reward += total_reward_this_step
+        return self._build_observation(
+            reward=total_reward_this_step,
+            done=self._done,
+        )
+    @property
+    def state(self) -> TemporalBenchState:
+        return TemporalBenchState(
+            episode_id=self._episode_id,
+            step_count=self._answered,
+            total_correct=self._total_correct,
+            total_questions=self._config.num_questions,
+            current_accuracy=self._accuracy_so_far(),
+            primary_domain=self._config.primary_domain,
+            per_task_type_accuracy=self._per_task_accuracy(),
+            total_reward=self._total_reward,
+        )

models.py ADDED Viewed

	@@ -0,0 +1,26 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""OpenEnv models at package root for schema validation and server imports."""
+try:
+    from env.models import (
+        TemporalBenchAction,
+        TemporalBenchObservation,
+        TemporalBenchState,
+    )
+except ImportError:
+    from .env.models import (
+        TemporalBenchAction,
+        TemporalBenchObservation,
+        TemporalBenchState,
+    )
+__all__ = [
+    "TemporalBenchAction",
+    "TemporalBenchObservation",
+    "TemporalBenchState",
+]

openenv.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+spec_version: 1
+name: TemporalBenchEnv
+type: space
+runtime: fastapi
+app: server.app:app
+port: 8000

openenv_TemporalBenchEnv.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,11 @@

+Metadata-Version: 2.4
+Name: openenv-TemporalBenchEnv
+Version: 0.1.0
+Summary: TemporalBench MCQ environment for OpenEnv
+Requires-Python: >=3.10
+Requires-Dist: openenv-core[core]>=0.2.0
+Requires-Dist: numpy>=1.24.0
+Requires-Dist: pydantic>=2.0.0
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.0.0; extra == "dev"

openenv_TemporalBenchEnv.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,26 @@

+README.md
+client.py
+models.py
+pyproject.toml
+data/__init__.py
+data/loaders.py
+data/question.py
+env/__init__.py
+env/config.py
+env/episode_sampler.py
+env/grading.py
+env/models.py
+env/reward.py
+env/temporal_bench_env.py
+openenv_TemporalBenchEnv.egg-info/PKG-INFO
+openenv_TemporalBenchEnv.egg-info/SOURCES.txt
+openenv_TemporalBenchEnv.egg-info/dependency_links.txt
+openenv_TemporalBenchEnv.egg-info/entry_points.txt
+openenv_TemporalBenchEnv.egg-info/requires.txt
+openenv_TemporalBenchEnv.egg-info/top_level.txt
+server/__init__.py
+server/app.py
+tests/test_env.py
+tests/test_loaders.py
+tests/test_reward.py
+tests/test_sampler.py

openenv_TemporalBenchEnv.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

openenv_TemporalBenchEnv.egg-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [console_scripts]
2	+ server = server.app:main

openenv_TemporalBenchEnv.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+openenv-core[core]>=0.2.0
+numpy>=1.24.0
+pydantic>=2.0.0
+[dev]
+pytest>=8.0.0
+pytest-cov>=4.0.0

openenv_TemporalBenchEnv.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+client
+data
+env
+models
+server

pyproject.toml ADDED Viewed

	@@ -0,0 +1,41 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "openenv-TemporalBenchEnv"
+version = "0.1.0"
+description = "TemporalBench MCQ environment for OpenEnv"
+requires-python = ">=3.10"
+dependencies = [
+    "openenv-core[core]>=0.2.0",
+    "numpy>=1.24.0",
+    "pydantic>=2.0.0",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-cov>=4.0.0",
+]
+[project.scripts]
+server = "server.app:main"
+[tool.setuptools]
+include-package-data = true
+py-modules = ["client", "models"]
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["env*", "data*", "server*"]
+[tool.pytest.ini_options]
+pythonpath = ["."]
+testpaths = ["tests"]

server/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""TemporalBenchEnv environment server components."""
+from env.temporal_bench_env import TemporalBenchEnvironment
+__all__ = ["TemporalBenchEnvironment"]

server/app.py ADDED Viewed

	@@ -0,0 +1,79 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""FastAPI application for TemporalBenchEnv."""
+import os
+from pathlib import Path
+try:
+    from env.config import EnvConfig
+    from env.temporal_bench_env import TemporalBenchEnvironment
+    from models import TemporalBenchAction, TemporalBenchObservation
+except ImportError:
+    from ..env.config import EnvConfig
+    from ..env.temporal_bench_env import TemporalBenchEnvironment
+    from ..models import TemporalBenchAction, TemporalBenchObservation
+try:
+    from openenv.core.env_server import create_app
+except ImportError:
+    create_app = None  # type: ignore
+def _env_factory():
+    """Create a fresh environment instance per WebSocket session."""
+    # The TEMPORALBENCH_QUESTION_BANK_DIR environment variable wins when set.
+    bank_dir = os.environ.get("TEMPORALBENCH_QUESTION_BANK_DIR")
+    if not bank_dir:
+        # NOTE(review): the fallback is the small test-fixture bank directory,
+        # not the data/banks directory shipped in this commit -- confirm this
+        # is the intended default outside of tests.
+        default = Path(__file__).resolve().parents[1] / "tests" / "fixtures" / "banks"
+        if default.is_dir():
+            bank_dir = str(default)
+    # With no directory resolved, EnvConfig's own default bank path is used.
+    cfg = EnvConfig(question_bank_path=bank_dir) if bank_dir else EnvConfig()
+    return TemporalBenchEnvironment(config=cfg)
+# Build the ASGI app. With openenv available, create_app receives the
+# environment factory and the action/observation models; otherwise a bare
+# FastAPI app exposing only GET /health is served.
+if create_app is not None:
+    app = create_app(
+        _env_factory,
+        TemporalBenchAction,
+        TemporalBenchObservation,
+        env_name="temporal-bench-env",
+        max_concurrent_envs=64,
+    )
+else:
+    from fastapi import FastAPI
+    app = FastAPI(title="temporal-bench-env")
+    app.get("/health")(lambda: {"status": "ok"})
+def main(host: str | None = None, port: int | None = None) -> None:
+    """
+    Entry point for `uv run server` and OpenEnv multi-mode validation.
+    OpenEnv's validator does a naive substring check for ``main()`` in this
+    file, so the ``if __name__ == "__main__"`` block must call ``main()`` with
+    no arguments; CLI flags are parsed here via ``parse_known_args``.
+    """
+    # Local imports: only needed when the server is actually launched.
+    import argparse
+    import uvicorn
+    # Explicit arguments win; CLI flags (defaults 0.0.0.0:8000) only fill in
+    # whichever of host/port was not passed by the caller.
+    if host is None or port is None:
+        parser = argparse.ArgumentParser()
+        parser.add_argument("--host", type=str, default="0.0.0.0")
+        parser.add_argument("--port", type=int, default=8000)
+        # parse_known_args returns unrecognized flags instead of erroring out.
+        ns, _ = parser.parse_known_args()
+        if host is None:
+            host = ns.host
+        if port is None:
+            port = ns.port
+    uvicorn.run(app, host=host, port=port)
+if __name__ == "__main__":
+    main()

server/requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+openenv[core]>=0.2.0
+fastapi>=0.115.0
+uvicorn>=0.24.0

tests/conftest.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from pathlib import Path
+import pytest
+# Directory holding the small JSON question banks used by the test suite.
+FIXTURE_BANK_DIR = Path(__file__).resolve().parent / "fixtures" / "banks"
+@pytest.fixture
+def fixture_bank_dir() -> Path:
+    """Return the fixture bank directory, asserting first that it exists."""
+    assert FIXTURE_BANK_DIR.is_dir()
+    return FIXTURE_BANK_DIR

tests/fixtures/banks/MIMIC_questions.json ADDED Viewed

	@@ -0,0 +1,4 @@

+[
+  {"question_id": "mim-1", "dataset": "MIMIC", "task_type": "T1U", "family": null, "prompt": "M1", "options": ["0", "1"], "answer": "0", "capability_tags": [], "difficulty": null, "metadata": {}},
+  {"question_id": "mim-2", "dataset": "MIMIC", "task_type": "T2_MCQ", "family": null, "prompt": "M2", "options": ["0", "1"], "answer": "1", "capability_tags": [], "difficulty": null, "metadata": {}}
+]

tests/fixtures/banks/PSML_questions.json ADDED Viewed

	@@ -0,0 +1,14 @@

+[
+  {"question_id": "psml-t3-s1a-1", "dataset": "PSML", "task_type": "T3", "family": "S1:A", "prompt": "Q1", "options": ["a", "b"], "answer": "a", "capability_tags": ["C1"], "difficulty": "easy", "metadata": {}},
+  {"question_id": "psml-t3-s2b-1", "dataset": "PSML", "task_type": "T3", "family": "S2:B", "prompt": "Q2", "options": ["a", "b"], "answer": "b", "capability_tags": ["C2"], "difficulty": "easy", "metadata": {}},
+  {"question_id": "psml-t3-s3c-1", "dataset": "PSML", "task_type": "T3", "family": "S3:C", "prompt": "Q3", "options": ["x", "y"], "answer": "x", "capability_tags": [], "difficulty": null, "metadata": {}},
+  {"question_id": "psml-t1u-1", "dataset": "PSML", "task_type": "T1U", "family": null, "prompt": "Q4 trend?", "options": ["upward", "downward"], "answer": "upward", "capability_tags": [], "difficulty": null, "metadata": {}},
+  {"question_id": "psml-t2-1", "dataset": "PSML", "task_type": "T2_MCQ", "family": null, "prompt": "Q5", "options": ["m1", "m2"], "answer": "m1", "capability_tags": [], "difficulty": null, "metadata": {}},
+  {"question_id": "psml-t3-s4d-1", "dataset": "PSML", "task_type": "T3", "family": "S4:D", "prompt": "Q6", "options": ["p", "q"], "answer": "p", "capability_tags": [], "difficulty": null, "metadata": {}},
+  {"question_id": "psml-t1u-2", "dataset": "PSML", "task_type": "T1U", "family": null, "prompt": "Q7", "options": ["yes", "no"], "answer": "yes", "capability_tags": [], "difficulty": null, "metadata": {}},
+  {"question_id": "psml-t1u-3", "dataset": "PSML", "task_type": "T1U", "family": null, "prompt": "Q7b", "options": ["yes", "no"], "answer": "no", "capability_tags": [], "difficulty": null, "metadata": {}},
+  {"question_id": "psml-t1u-4", "dataset": "PSML", "task_type": "T1U", "family": null, "prompt": "Q7c", "options": ["u", "d"], "answer": "u", "capability_tags": [], "difficulty": null, "metadata": {}},
+  {"question_id": "psml-t1u-5", "dataset": "PSML", "task_type": "T1U", "family": null, "prompt": "Q7d", "options": ["u", "d"], "answer": "d", "capability_tags": [], "difficulty": null, "metadata": {}},
+  {"question_id": "psml-t1u-6", "dataset": "PSML", "task_type": "T1U", "family": null, "prompt": "Q7e", "options": ["e", "f"], "answer": "e", "capability_tags": [], "difficulty": null, "metadata": {}},
+  {"question_id": "psml-t2-2", "dataset": "PSML", "task_type": "T2_MCQ", "family": null, "prompt": "Q8", "options": ["h", "l"], "answer": "h", "capability_tags": [], "difficulty": null, "metadata": {}}
+]

tests/fixtures/banks/causal_chambers_questions.json ADDED Viewed

	@@ -0,0 +1,4 @@

+[
+  {"question_id": "cc-1", "dataset": "causal_chambers", "task_type": "T3", "family": "S5:E", "prompt": "CC1", "options": ["low", "high"], "answer": "low", "capability_tags": [], "difficulty": null, "metadata": {}},
+  {"question_id": "cc-2", "dataset": "causal_chambers", "task_type": "T1U", "family": null, "prompt": "CC2", "options": ["low", "high"], "answer": "high", "capability_tags": [], "difficulty": null, "metadata": {}}
+]

tests/fixtures/banks/freshretailnet_questions.json ADDED Viewed

	@@ -0,0 +1,4 @@

+[
+  {"question_id": "frn-1", "dataset": "freshretailnet", "task_type": "T1U", "family": null, "prompt": "FRN1", "options": ["A", "B"], "answer": "A", "capability_tags": [], "difficulty": null, "metadata": {}},
+  {"question_id": "frn-2", "dataset": "freshretailnet", "task_type": "T3", "family": "S1:A", "prompt": "FRN2", "options": ["A", "B"], "answer": "B", "capability_tags": [], "difficulty": null, "metadata": {}}
+]

tests/test_env.py ADDED Viewed

	@@ -0,0 +1,79 @@

+from pathlib import Path
+import pytest
+from env.config import EnvConfig
+from env.temporal_bench_env import TemporalBenchEnvironment
+from models import TemporalBenchAction
+def _bank_dir() -> Path:
+    """Return the path of the fixture question-bank directory next to this file."""
+    return Path(__file__).resolve().parent / "fixtures" / "banks"
+def test_full_episode_all_correct():
+    """Answer every question correctly and check progress, termination and totals."""
+    cfg = EnvConfig(question_bank_path=str(_bank_dir()), seed=123, lambda_ep=0.5, alpha=1.0)
+    env = TemporalBenchEnvironment(config=cfg)
+    obs0 = env.reset(seed=123)
+    assert not obs0.done
+    assert obs0.step_idx == 0
+    assert len(obs0.options) >= 2
+    total_r = 0.0
+    obs = obs0
+    for i in range(cfg.num_questions):
+        # Peek at the private question list to obtain the ground-truth answer.
+        cur = env._questions[env._answered]  # noqa: SLF001
+        obs = env.step(TemporalBenchAction(answer=cur.answer))
+        total_r += obs.reward or 0.0
+        if i < cfg.num_questions - 1:
+            assert not obs.done
+            assert obs.step_idx == i + 1
+        else:
+            assert obs.done
+            assert obs.step_idx == cfg.num_questions
+    st = env.state
+    assert st.total_correct == cfg.num_questions
+    assert st.step_count == cfg.num_questions
+    # The state total must equal the sum of per-step rewards (bonus included).
+    assert st.total_reward == pytest.approx(total_r)
+def test_episode_with_one_wrong_answer():
+    """Miss only the first question and check the episode bonus range and total."""
+    cfg = EnvConfig(question_bank_path=str(_bank_dir()), seed=0, lambda_ep=0.5, alpha=1.0)
+    env = TemporalBenchEnvironment(config=cfg)
+    env.reset(seed=0)
+    for i in range(cfg.num_questions):
+        cur = env._questions[env._answered]  # noqa: SLF001
+        wrong = next(o for o in cur.options if o != cur.answer)
+        act = TemporalBenchAction(answer=wrong if i == 0 else cur.answer)
+        obs = env.step(act)
+        if i == cfg.num_questions - 1:
+            assert obs.done
+            meta = obs.metadata or {}
+            assert "episode_bonus" in meta
+            bonus = float(meta["episode_bonus"])
+            # The coverage factor is 0.8 or 1.0 depending on which dataset the
+            # missed question came from, so only the range is asserted.
+            # NOTE(review): the 8 / 9 constants assume num_questions == 9 --
+            # confirm against EnvConfig's default.
+            lo = 0.5 * (8 / 9) * 0.8
+            hi = 0.5 * (8 / 9) * 1.0
+            assert lo - 1e-9 <= bonus <= hi + 1e-9
+    assert env.state.total_correct == 8
+def test_double_step_after_done():
+    """Stepping after the episode has finished still returns a done observation."""
+    cfg = EnvConfig(question_bank_path=str(_bank_dir()), seed=1)
+    env = TemporalBenchEnvironment(config=cfg)
+    env.reset(seed=1)
+    for _ in range(cfg.num_questions):
+        cur = env._questions[env._answered]  # noqa: SLF001
+        env.step(TemporalBenchAction(answer=cur.answer))
+    # One extra step beyond the end of the episode.
+    obs = env.step(TemporalBenchAction(answer="x"))
+    assert obs.done
+def test_empty_answer_no_advance():
+    """A whitespace-only answer reports an error and does not consume a question."""
+    cfg = EnvConfig(question_bank_path=str(_bank_dir()), seed=2)
+    env = TemporalBenchEnvironment(config=cfg)
+    env.reset(seed=2)
+    before = env._answered  # noqa: SLF001
+    obs = env.step(TemporalBenchAction(answer="  "))
+    assert env._answered == before  # noqa: SLF001
+    assert "error" in (obs.metadata or {})

tests/test_loaders.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from pathlib import Path
+import pytest
+from data.loaders import load_json_file, load_question_banks
+from data.question import TSQuestion
+def test_load_json_file_roundtrip(fixture_bank_dir: Path):
+    """Loading the PSML fixture file yields TSQuestion objects tagged with that dataset."""
+    path = fixture_bank_dir / "PSML_questions.json"
+    qs = load_json_file(path)
+    assert len(qs) >= 6
+    assert all(isinstance(q, TSQuestion) for q in qs)
+    assert qs[0].dataset == "PSML"
+def test_load_question_banks_directory(fixture_bank_dir: Path):
+    """Loading the fixture directory yields a non-empty pool for each of the four datasets."""
+    pools = load_question_banks(str(fixture_bank_dir))
+    for d in ("PSML", "freshretailnet", "MIMIC", "causal_chambers"):
+        assert d in pools
+        assert len(pools[d]) >= 1
+def test_load_question_banks_rejects_non_dir():
+    """A path that is not a directory must raise NotADirectoryError."""
+    with pytest.raises(NotADirectoryError):
+        load_question_banks("/nonexistent/path/that/is/not/a/dir")

tests/test_reward.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import pytest
+from env.reward import compute_episode_bonus, compute_forecasting_reward, compute_mcq_reward
+def test_compute_mcq_reward():
+    """The step reward is the score scaled by alpha (alpha defaults to 1.0)."""
+    assert compute_mcq_reward(1.0, alpha=2.0) == 2.0
+    assert compute_mcq_reward(0.0) == 0.0
+def test_compute_episode_bonus_coverage():
+    """The coverage factor is 0.8 with an uncovered domain and 1.0 when all are covered."""
+    domains = ("PSML", "freshretailnet", "MIMIC", "causal_chambers")
+    # freshretailnet has no correct answer, so the 0.8 factor applies.
+    cov = {"PSML": 1, "freshretailnet": 0, "MIMIC": 1, "causal_chambers": 1}
+    b = compute_episode_bonus(3, 9, cov, all_domains=domains, lambda_ep=0.5)
+    # Exact float equality holds because the expected value multiplies the same
+    # factors in the same order as compute_episode_bonus.
+    assert b == 0.5 * (3 / 9) * 0.8
+    cov2 = {"PSML": 1, "freshretailnet": 1, "MIMIC": 1, "causal_chambers": 1}
+    b2 = compute_episode_bonus(4, 9, cov2, all_domains=domains, lambda_ep=0.5)
+    assert b2 == 0.5 * (4 / 9) * 1.0
+def test_forecasting_reward_stub():
+    """The forecasting reward is a stub and must raise NotImplementedError."""
+    import numpy as np
+    with pytest.raises(NotImplementedError):
+        compute_forecasting_reward(np.array([1.0]), np.array([1.0]))

tests/test_sampler.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import numpy as np
+from data.loaders import load_question_banks
+from env.config import EnvConfig
+from env.episode_sampler import EpisodeSampler
+def test_sample_episode_shape_and_domains(fixture_bank_dir):
+    """A sampled episode has num_questions items and draws from every configured domain."""
+    banks = load_question_banks(str(fixture_bank_dir))
+    cfg = EnvConfig(
+        question_bank_path=str(fixture_bank_dir),
+        primary_domain="PSML",
+        seed=42,
+    )
+    rng = np.random.default_rng(42)
+    sampler = EpisodeSampler(banks, cfg, rng)
+    ep = sampler.sample_episode()
+    assert len(ep) == cfg.num_questions
+    datasets = {q.dataset for q in ep}
+    assert datasets == set(cfg.all_domains)
+def test_curriculum_stage_filters_task_types(fixture_bank_dir):
+    """With curriculum_stage=1, every sampled question must be of task type T1U."""
+    banks = load_question_banks(str(fixture_bank_dir))
+    cfg = EnvConfig(
+        question_bank_path=str(fixture_bank_dir),
+        curriculum_stage=1,
+        seed=0,
+    )
+    rng = np.random.default_rng(0)
+    sampler = EpisodeSampler(banks, cfg, rng)
+    # Sample several episodes so the filter is checked across different draws.
+    for _ in range(5):
+        ep = sampler.sample_episode()
+        assert all(q.task_type == "T1U" for q in ep)

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff