Spaces:

kgdrathan
/

openenv-curator

Sleeping

App Files Files Community

kgdrathan committed on Apr 7

Commit

d33e9ca

0 Parent(s):

initial commit

Browse files

Files changed (21) hide show

.gitignore +216 -0
CLAUDE.md +63 -0
README.md +72 -0
__init__.py +10 -0
client.py +94 -0
data/ground_truth.json +178 -0
data/items.json +0 -0
data/tasks.json +93 -0
inference.py +283 -0
models.py +135 -0
openenv.yaml +6 -0
pre-validation.sh +185 -0
pyproject.toml +34 -0
scripts/fetch_data.py +501 -0
server/Dockerfile +80 -0
server/__init__.py +5 -0
server/app.py +44 -0
server/curator_environment.py +332 -0
server/grader.py +217 -0
server/requirements.txt +6 -0
uv.lock +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,216 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#   Usually these files are written by a python script from a template
+#   before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+# Pipfile.lock
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+# uv.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+# poetry.lock
+# poetry.toml
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+# pdm.lock
+# pdm.toml
+.pdm-python
+.pdm-build/
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+# pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# Redis
+*.rdb
+*.aof
+*.pid
+# RabbitMQ
+mnesia/
+rabbitmq/
+rabbitmq-data/
+# ActiveMQ
+activemq-data/
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#   JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#   be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#   and can be added to the global gitignore or merged into this file.  For a more nuclear
+#   option (not recommended) you can uncomment the following to ignore the entire idea folder.
+# .idea/
+# Abstra
+#   Abstra is an AI-powered process automation framework.
+#   Ignore directories containing user credentials, local state, and settings.
+#   Learn more at https://abstra.io/docs
+.abstra/
+# Visual Studio Code
+#   Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+#   that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#   and can be added to the global gitignore or merged into this file. However, if you prefer,
+#   you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+# Ruff stuff:
+.ruff_cache/
+# PyPI configuration file
+.pypirc
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+# Streamlit
+.streamlit/secrets.toml

CLAUDE.md ADDED Viewed

	@@ -0,0 +1,63 @@

+# CLAUDE.md
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+## Project Overview
+OpenEnv Curator is a personalized content curation RL environment where agents filter, categorize, rank, and recommend content items from multiple sources (Hacker News, arXiv, DEV.to, Reddit) based on user preference profiles. Built on the OpenEnv framework, deployed as a FastAPI server on Hugging Face Spaces via Docker.
+## Common Commands
+```bash
+# Install dependencies (uses uv package manager)
+uv sync
+# Run server locally
+uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
+# Or via entry point:
+uv run server
+# Fetch/regenerate data files (one-time)
+python scripts/fetch_data.py
+# Run inference agent
+CURATOR_TASK=easy HF_TOKEN=<token> python inference.py
+# Docker build & run
+docker build -f server/Dockerfile -t curator:latest .
+docker run -p 8000:8000 curator:latest
+# Pre-submission validation
+bash pre-validation.sh https://your-space.hf.space
+# Lint
+uv run ruff check .
+```
+## Architecture
+The system follows a client-server pattern using the OpenEnv framework:
+- **Client** (`client.py`, `models.py`): `CuratorEnv` is an async HTTP client (extends OpenEnv `EnvClient`). `CuratorAction` and `CuratorObservation` are Pydantic models extending OpenEnv base types. The package exports these three from `__init__.py`.
+- **Server** (`server/app.py`): Uses `openenv.core.env_server.http_server.create_app()` to generate FastAPI endpoints (`/reset`, `/step`, `/state`, `/schema`, `/ws`). Max 4 concurrent environments.
+- **Environment** (`server/curator_environment.py`): Implements the OpenEnv `Environment` interface. Manages episode state (item pool, relevance scores, categories, rankings). Four action handlers: `_handle_filter` (remove low-relevance items), `_handle_categorize` (tag as urgent/read_later/share/skip), `_handle_rank` (order by relevance), `_handle_recommend` (final selection, ends episode).
+- **Grader** (`server/grader.py`): Deterministic IR metrics. `grade_episode()` computes composite score: 0.35×NDCG@k + 0.25×Precision@k + 0.20×Recall@k + 0.10×CategoryAccuracy + 0.10×SourceDiversity.
+- **Data** (`data/`): Static JSON files — `items.json` (real content from 4 sources), `tasks.json` (easy/medium/hard with embedded user profiles), `ground_truth.json` (relevance scores per task). Generated by `scripts/fetch_data.py`.
+- **Inference** (`inference.py`): Example LLM agent using OpenAI SDK. Parses JSON actions from model output, logs in structured `[START]/[STEP]/[END]` format.
+## Task Difficulty Levels
+| Task   | Pool Size | Sources        | Max Steps | Recommend K |
+|--------|-----------|----------------|-----------|-------------|
+| easy   | 20        | HN only        | 10        | 5           |
+| medium | 50        | HN+arXiv+DEV   | 20        | 10          |
+| hard   | 100       | All 4 sources  | 30        | 15          |
+## Package Layout
+The package is named `curator` with subpackage `curator.server`. The `package-dir` mapping in `pyproject.toml` maps the repo root to `curator` and `server/` to `curator.server`. This means imports use `from curator import ...` or `from curator.server import ...`, but on-disk the files are at the repo root.

README.md ADDED Viewed

	@@ -0,0 +1,72 @@

+---
+title: Curator Environment
+emoji: 📰
+colorFrom: blue
+colorTo: green
+sdk: docker
+pinned: false
+app_port: 8000
+base_path: /web
+tags:
+  - OpenEnv
+  - RL
+---
+# Curator — Personalized Content Curation Environment
+An OpenEnv environment where an agent must sift through a pool of real content items (from Hacker News, arXiv, DEV.to, Reddit) and curate a personalized reading list based on a user's preference profile.
+## Goal
+Every knowledge worker drowns in information — hundreds of articles, papers, and posts across dozens of sources daily. Given a user profile and a content pool, the agent must intelligently **filter**, **categorize**, **rank**, and **recommend** the most relevant items. Results are scored using standard Information Retrieval metrics (NDCG, precision, recall).
+## Action Space
+| Action | Fields | Description |
+|--------|--------|-------------|
+| `filter` | `item_ids: List[str]` | Remove irrelevant items from the pool |
+| `categorize` | `categories: Dict[str, "urgent"\|"read_later"\|"share"\|"skip"]` | Tag items by priority |
+| `rank` | `rankings: List[str]` | Order items by relevance (best first) |
+| `recommend` | `item_ids: List[str]` | Final recommendation (ends episode) |
+## Observation Space
+Each observation includes:
+- **items** — current pool of content items (`id`, `source`, `title`, `summary`, `tags`, `score`, `reading_time_mins`, `content_type`)
+- **user_profile** — interests (topic weights 0-1), preferred sources, skill level, time budget, read history
+- **feedback** — per-step scores (relevance, coverage) from the last action
+- **task_info** — difficulty, max steps, progress counters
+## Tasks
+| Task | Pool Size | Sources | Max Steps | Recommend K | Description |
+|------|-----------|---------|-----------|-------------|-------------|
+| **easy** | 20 | Hacker News | 10 | 5 | Clear AI/ML interests, single source |
+| **medium** | 50 | HN + arXiv + DEV.to | 20 | 10 | Broad interests, 3 sources, some already-read items |
+| **hard** | 100 | All 4 sources | 30 | 15 | Minimal preferences, must infer interests from feedback |
+Each task includes an embedded user profile that defines what "relevant" means for scoring.
+## Scoring
+**Per-step rewards** (0-1):
+- **filter**: higher reward for removing low-relevance items
+- **categorize**: accuracy against relevance-derived ground truth
+- **rank**: NDCG@k against ground truth relevance
+- **recommend**: composite final episode score
+**Final episode score** (deterministic, 0-1):
+```
+score = 0.35 * NDCG@k + 0.25 * Precision@k + 0.20 * Recall@k + 0.10 * Category accuracy + 0.10 * Source diversity
+```
+## Data
+All content is real data fetched from free public APIs (no auth needed), cached as static JSON — no API calls at runtime:
+- **Hacker News** — top stories via Firebase API
+- **arXiv** — recent AI/ML/NLP papers
+- **DEV.to** — programming articles and tutorials
+- **Reddit** — posts from r/programming, r/machinelearning, r/webdev

__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+"""Curator Environment — Personalized Content Curation."""
+from .client import CuratorEnv
+from .models import CuratorAction, CuratorObservation
+__all__ = [
+    "CuratorAction",
+    "CuratorObservation",
+    "CuratorEnv",
+]

client.py ADDED Viewed

	@@ -0,0 +1,94 @@

+"""Curator Environment Client."""
+from typing import Dict
+from openenv.core import EnvClient
+from openenv.core.client_types import StepResult
+from openenv.core.env_server.types import State
+from .models import (
+    ActionFeedback,
+    ContentItem,
+    CuratorAction,
+    CuratorObservation,
+    TaskInfo,
+    UserProfile,
+)
+class CuratorEnv(EnvClient[CuratorAction, CuratorObservation, State]):
+    """
+    Typed client for talking to a Curator environment server.
+    Example:
+        >>> async with CuratorEnv(base_url="http://localhost:8000") as client:
+        ...     result = await client.reset(task_id="easy")
+        ...     print(len(result.observation.items))
+        ...
+        ...     result = await client.step(CuratorAction(
+        ...         action_type="rank",
+        ...         rankings=["hn_123", "hn_456"]
+        ...     ))
+    Example with Docker:
+        >>> client = await CuratorEnv.from_docker_image("curator:latest")
+    """
+    def _step_payload(self, action: CuratorAction) -> Dict:
+        """Serialize an action into a JSON-ready dict, leaving out unset fields."""
+        body = {"action_type": action.action_type}
+        # item_ids and metadata are dropped when empty; the other optional
+        # fields are dropped only when they are None.
+        if action.item_ids:
+            body["item_ids"] = action.item_ids
+        for field_name in ("categories", "rankings", "reasoning"):
+            field_value = getattr(action, field_name)
+            if field_value is not None:
+                body[field_name] = field_value
+        if action.metadata:
+            body["metadata"] = action.metadata
+        return body
+    def _parse_result(self, payload: Dict) -> StepResult[CuratorObservation]:
+        """Build a StepResult[CuratorObservation] from a server response dict."""
+        raw_obs = payload.get("observation", {})
+        reward = payload.get("reward")
+        done = payload.get("done", False)
+        # Nested sections stay None when the server omits them or sends them empty.
+        raw_profile = raw_obs.get("user_profile")
+        raw_feedback = raw_obs.get("feedback")
+        raw_task = raw_obs.get("task_info")
+        observation = CuratorObservation(
+            items=[ContentItem(**raw_item) for raw_item in raw_obs.get("items", [])],
+            user_profile=UserProfile(**raw_profile) if raw_profile else None,
+            feedback=ActionFeedback(**raw_feedback) if raw_feedback else None,
+            task_info=TaskInfo(**raw_task) if raw_task else None,
+            done=done,
+            reward=reward,
+            metadata=raw_obs.get("metadata", {}),
+        )
+        return StepResult(observation=observation, reward=reward, done=done)
+    def _parse_state(self, payload: Dict) -> State:
+        """Build a State from a server response; step_count defaults to 0."""
+        episode_id = payload.get("episode_id")
+        step_count = payload.get("step_count", 0)
+        return State(episode_id=episode_id, step_count=step_count)

data/ground_truth.json ADDED Viewed

	@@ -0,0 +1,178 @@

+{
+  "easy": {
+    "hn_47672818": 0.49,
+    "hn_47673005": 0.3843,
+    "hn_47673541": 0.4808,
+    "hn_47673360": 0.3535,
+    "hn_47672295": 0.4338,
+    "hn_47672318": 0.5117,
+    "hn_47672884": 0.3678,
+    "hn_47614528": 0.4893,
+    "hn_47672778": 0.355,
+    "hn_47641472": 0.3595,
+    "hn_47627217": 0.3513,
+    "hn_47666024": 0.49,
+    "hn_47673072": 0.3738,
+    "hn_47659135": 0.49,
+    "hn_47660925": 0.49,
+    "hn_47626242": 0.3888,
+    "hn_47669337": 0.4488,
+    "hn_47663147": 0.49,
+    "hn_47662234": 0.49,
+    "hn_47667321": 0.4825
+  },
+  "medium": {
+    "hn_47672818": 0.075,
+    "hn_47673005": 0.3693,
+    "hn_47673541": 0.0,
+    "hn_47673360": 0.3385,
+    "hn_47672295": 0.0188,
+    "hn_47672318": 0.4465,
+    "hn_47672884": 0.3528,
+    "hn_47614528": 0.4743,
+    "hn_47672778": 0.3921,
+    "hn_47641472": 0.3445,
+    "hn_47627217": 0.3675,
+    "hn_47666024": 0.5375,
+    "hn_47673072": 0.3588,
+    "hn_47659135": 0.475,
+    "hn_47660925": 0.475,
+    "hn_47626242": 0.3738,
+    "hn_47669337": 0.4338,
+    "hn_47663147": 0.475,
+    "hn_47662234": 0.5271,
+    "hn_47667321": 0.4675,
+    "hn_47638498": 0.3393,
+    "hn_47641528": 0.4393,
+    "hn_47660954": 0.475,
+    "hn_47662945": 0.475,
+    "hn_47662596": 0.5687,
+    "hn_47660286": 0.475,
+    "hn_47660853": 0.475,
+    "hn_47636579": 0.458,
+    "hn_47637010": 0.475,
+    "hn_47672268": 0.4225,
+    "hn_47667672": 0.5005,
+    "hn_47662116": 0.4547,
+    "hn_47667717": 0.475,
+    "hn_47627998": 0.4495,
+    "hn_47637828": 0.4655,
+    "hn_47669749": 0.3858,
+    "hn_47673576": 0.3273,
+    "hn_47664186": 0.5427,
+    "hn_47642125": 0.404,
+    "hn_47661065": 0.5238,
+    "hn_47671527": 0.3723,
+    "hn_47664912": 0.475,
+    "hn_47665685": 0.3678,
+    "hn_47647397": 0.4039,
+    "hn_47665245": 0.373,
+    "hn_47668727": 0.5464,
+    "hn_47627361": 0.3445,
+    "hn_47673208": 0.328,
+    "hn_47665207": 0.3633,
+    "hn_47673182": 0.328
+  },
+  "hard": {
+    "hn_47672818": 0.1,
+    "hn_47673005": 0.3943,
+    "hn_47673541": 0.362,
+    "hn_47673360": 0.0,
+    "hn_47672295": 0.4438,
+    "hn_47672318": 0.3777,
+    "hn_47672884": 0.0,
+    "hn_47614528": 0.4993,
+    "hn_47672778": 0.365,
+    "hn_47641472": 0.0,
+    "hn_47627217": 0.3613,
+    "hn_47666024": 0.7222,
+    "hn_47673072": 0.3837,
+    "hn_47659135": 0.7778,
+    "hn_47660925": 0.5,
+    "hn_47626242": 0.3987,
+    "hn_47669337": 0.4588,
+    "hn_47663147": 0.5,
+    "hn_47662234": 0.5,
+    "hn_47667321": 0.4925,
+    "hn_47638498": 0.3643,
+    "hn_47641528": 0.624,
+    "hn_47660954": 0.5,
+    "hn_47662945": 0.5,
+    "hn_47662596": 0.5,
+    "hn_47660286": 0.5,
+    "hn_47660853": 0.5,
+    "hn_47636579": 0.6427,
+    "hn_47637010": 0.5,
+    "hn_47672268": 0.3538,
+    "hn_47667672": 0.4318,
+    "hn_47662116": 0.4797,
+    "hn_47667717": 0.5,
+    "hn_47627998": 0.4745,
+    "hn_47637828": 0.6502,
+    "hn_47669749": 0.4108,
+    "hn_47673576": 0.3523,
+    "hn_47664186": 0.5,
+    "hn_47642125": 0.5887,
+    "hn_47661065": 0.455,
+    "hn_47671527": 0.3973,
+    "hn_47664912": 0.5,
+    "hn_47665685": 0.3927,
+    "hn_47647397": 0.356,
+    "hn_47665245": 0.398,
+    "hn_47668727": 0.4985,
+    "hn_47627361": 0.3695,
+    "hn_47673208": 0.353,
+    "hn_47665207": 0.3883,
+    "hn_47673182": 0.353,
+    "hn_47636937": 0.4078,
+    "hn_47658146": 0.5,
+    "hn_47664205": 0.7222,
+    "hn_47655408": 0.5,
+    "hn_47637116": 0.4715,
+    "hn_47636456": 0.497,
+    "hn_47651703": 0.7222,
+    "hn_47652007": 0.5,
+    "arxiv_2604_04932v1": 0.285,
+    "arxiv_2604_04930v1": 0.5072,
+    "arxiv_2604_04924v1": 0.285,
+    "arxiv_2604_04923v1": 0.5072,
+    "arxiv_2604_04921v1": 0.5072,
+    "arxiv_2604_04920v1": 0.285,
+    "arxiv_2604_04917v1": 0.5072,
+    "arxiv_2604_04916v1": 0.5072,
+    "arxiv_2604_04914v1": 0.5072,
+    "arxiv_2604_04908v1": 0.5072,
+    "arxiv_2604_04906v1": 0.285,
+    "arxiv_2604_04902v1": 0.5072,
+    "arxiv_2604_04901v1": 0.5072,
+    "arxiv_2604_04898v1": 0.5072,
+    "arxiv_2604_04895v1": 0.5072,
+    "arxiv_2604_04894v1": 0.285,
+    "arxiv_2604_04892v1": 0.5072,
+    "arxiv_2604_04891v1": 0.285,
+    "arxiv_2604_04878v1": 0.285,
+    "arxiv_2604_04876v1": 0.5072,
+    "arxiv_2604_04875v1": 0.5072,
+    "arxiv_2604_04872v1": 0.285,
+    "arxiv_2604_04869v1": 0.5072,
+    "arxiv_2604_04868v1": 0.5072,
+    "arxiv_2604_04858v1": 0.785,
+    "arxiv_2604_04855v1": 0.5072,
+    "arxiv_2604_04853v1": 0.285,
+    "arxiv_2604_04852v1": 0.285,
+    "arxiv_2604_04847v1": 0.5072,
+    "arxiv_2604_04843v1": 0.5072,
+    "arxiv_2604_04842v1": 0.5072,
+    "arxiv_2604_04839v1": 0.285,
+    "arxiv_2604_04829v1": 0.5072,
+    "arxiv_2604_04828v1": 0.285,
+    "arxiv_2604_04825v1": 0.285,
+    "arxiv_2604_04820v1": 0.285,
+    "arxiv_2604_04815v1": 0.285,
+    "arxiv_2604_04808v1": 0.5072,
+    "arxiv_2604_04804v1": 0.285,
+    "arxiv_2604_04802v1": 0.5072,
+    "arxiv_2604_04800v1": 0.285,
+    "arxiv_2604_04797v1": 0.5072
+  }
+}

data/items.json ADDED Viewed

The diff for this file is too large to render. See raw diff

data/tasks.json ADDED Viewed

	@@ -0,0 +1,93 @@

+[
+  {
+    "task_id": "easy",
+    "difficulty": "easy",
+    "item_count": 20,
+    "max_steps": 10,
+    "sources": [
+      "hackernews"
+    ],
+    "recommend_k": 5,
+    "description": "Curate 5 top articles from 20 Hacker News stories for an AI/ML enthusiast.",
+    "profile": {
+      "interests": {
+        "ai": 0.95,
+        "nlp": 0.85,
+        "python": 0.8,
+        "data": 0.7
+      },
+      "preferred_sources": [
+        "hackernews",
+        "arxiv"
+      ],
+      "time_budget_mins": 120,
+      "read_history": [],
+      "skill_level": "intermediate"
+    }
+  },
+  {
+    "task_id": "medium",
+    "difficulty": "medium",
+    "item_count": 50,
+    "max_steps": 20,
+    "sources": [
+      "hackernews",
+      "devto",
+      "arxiv"
+    ],
+    "recommend_k": 10,
+    "description": "Curate 10 items from 50 across HN, DEV.to, and arXiv for a senior engineer with broad interests.",
+    "profile": {
+      "interests": {
+        "ai": 0.9,
+        "web": 0.7,
+        "systems": 0.6,
+        "security": 0.5,
+        "python": 0.75,
+        "cloud": 0.4,
+        "open-source": 0.65,
+        "startup": 0.3
+      },
+      "preferred_sources": [
+        "hackernews",
+        "devto"
+      ],
+      "time_budget_mins": 60,
+      "read_history": [
+        "hn_47672818",
+        "hn_47673541",
+        "hn_47672295"
+      ],
+      "skill_level": "expert"
+    }
+  },
+  {
+    "task_id": "hard",
+    "difficulty": "hard",
+    "item_count": 100,
+    "max_steps": 30,
+    "sources": [
+      "hackernews",
+      "devto",
+      "arxiv",
+      "reddit"
+    ],
+    "recommend_k": 15,
+    "description": "Curate 15 items from 100 across all sources for a beginner with minimal stated preferences. Must infer interests from feedback.",
+    "profile": {
+      "interests": {
+        "rust": 0.5,
+        "systems": 0.4
+      },
+      "preferred_sources": [],
+      "time_budget_mins": 30,
+      "read_history": [
+        "hn_47672818",
+        "hn_47673360",
+        "hn_47672884",
+        "hn_47641472"
+      ],
+      "skill_level": "beginner"
+    }
+  }
+]

inference.py ADDED Viewed

	@@ -0,0 +1,283 @@

+"""
+Inference Script for Curator Environment
+============================================
+Environment Variables:
+    API_BASE_URL   The API endpoint for the LLM.
+    MODEL_NAME     The model identifier to use for inference.
+    HF_TOKEN       Your Hugging Face / API key.
+    IMAGE_NAME     Docker image name for the environment.
+    CURATOR_TASK  Task difficulty: "easy", "medium", or "hard" (default: "easy").
+STDOUT FORMAT:
+    [START] task=<task_name> env=curator model=<model_name>
+    [STEP]  step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
+    [END]   success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...,rn>
+"""
+import asyncio
+import json
+import os
+import textwrap
+from typing import Any, Dict, List, Optional
+from openai import OpenAI
+from client import CuratorEnv
+from models import CuratorAction
+# Runtime configuration, read once from the process environment at import time.
+# IMAGE_NAME has no fallback, so it is None when the variable is unset.
+IMAGE_NAME = os.getenv("IMAGE_NAME")
+# HF_TOKEN takes precedence; API_KEY is only consulted when HF_TOKEN is unset or empty.
+API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
+API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
+MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
+TASK_NAME = os.getenv("CURATOR_TASK", "easy")
+# Value printed as env=<...> in the [START] log line.
+BENCHMARK = "curator"
+# Sampling settings passed to the chat-completions request.
+TEMPERATURE = 0.3
+MAX_TOKENS = 2000
+# Minimum final score for the [END] line to report success=true.
+SUCCESS_SCORE_THRESHOLD = 0.3
+# System message sent with every request; it lists the four JSON action shapes.
+SYSTEM_PROMPT = textwrap.dedent("""
+    You are a content curation agent. You help users find the most relevant
+    articles from a pool of content items based on their interest profile.
+    Available actions (respond with valid JSON):
+    1. Filter (remove irrelevant items):
+       {"action_type": "filter", "item_ids": ["id1", "id2", ...]}
+    2. Categorize items:
+       {"action_type": "categorize", "categories": {"id1": "urgent", "id2": "skip", ...}}
+       Categories: "urgent", "read_later", "share", "skip"
+    3. Rank items by relevance:
+       {"action_type": "rank", "rankings": ["best_id", "second_id", ...]}
+    4. Final recommendation (ends episode):
+       {"action_type": "recommend", "item_ids": ["id1", "id2", ...]}
+    Strategy: First filter out clearly irrelevant items, then rank the remainder,
+    then recommend the top items.
+    IMPORTANT: Respond with ONLY a JSON object, no markdown or explanation.
+""").strip()
+def log_start(task: str, env: str, model: str) -> None:
+    """Print the [START] record naming the task, environment and model."""
+    header = " ".join(("[START]", f"task={task}", f"env={env}", f"model={model}"))
+    print(header, flush=True)
+def log_step(
+    step: int, action: str, reward: float, done: bool, error: Optional[str]
+) -> None:
+    """Print one [STEP] record; a missing or empty error is shown as "null"."""
+    fields = (
+        f"step={step}",
+        f"action={action}",
+        f"reward={reward:.2f}",
+        f"done={str(done).lower()}",
+        f"error={error or 'null'}",
+    )
+    print("[STEP] " + " ".join(fields), flush=True)
+def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
+    """Print the closing [END] record with comma-joined two-decimal rewards."""
+    formatted_rewards = [format(value, ".2f") for value in rewards]
+    success_flag = str(success).lower()
+    summary = (
+        f"[END] success={success_flag} steps={steps} "
+        f"score={score:.3f} rewards={','.join(formatted_rewards)}"
+    )
+    print(summary, flush=True)
+def format_items_for_prompt(items: List[Dict], max_items: int = 30) -> str:
+    """Render up to max_items items as one bullet line each for the LLM prompt.
+    When the pool is longer than max_items, a trailing line reports how many
+    items were left out.
+    """
+    def describe(entry: Dict) -> str:
+        tag_text = ", ".join(entry.get("tags", []))
+        return (
+            f"- [{entry['id']}] ({entry['source']}) {entry['title']} "
+            f"[tags: {tag_text}] [score: {entry.get('score', 0)}]"
+        )
+    rendered = [describe(entry) for entry in items[:max_items]]
+    hidden = len(items) - max_items
+    if hidden > 0:
+        rendered.append(f"  ... and {hidden} more items")
+    return "\n".join(rendered)
+def format_profile_for_prompt(profile: Dict) -> str:
+    """Summarize a user profile as five labelled lines for the LLM prompt.
+    Interests are listed from highest to lowest weight, and at most the first
+    five read-history ids are spelled out.
+    """
+    weighted = sorted(
+        profile.get("interests", {}).items(), key=lambda pair: -pair[1]
+    )
+    interest_text = ", ".join(f"{topic}={weight:.1f}" for topic, weight in weighted)
+    source_text = ", ".join(profile.get("preferred_sources", [])) or "no preference"
+    read_ids = profile.get("read_history", [])
+    shown_ids = ", ".join(read_ids[:5])
+    more_marker = "..." if len(read_ids) > 5 else ""
+    summary_lines = [
+        f"Interests: {interest_text}",
+        f"Preferred sources: {source_text}",
+        f"Skill level: {profile.get('skill_level', 'intermediate')}",
+        f"Time budget: {profile.get('time_budget_mins', 60)} mins",
+        f"Already read: {len(read_ids)} items ({shown_ids}{more_marker})",
+    ]
+    return "\n".join(summary_lines)
+def build_user_prompt(obs: Any, step: int, last_feedback: Optional[str]) -> str:
+    """Assemble the per-step user message: progress, profile, pool and hints."""
+    # Items and profile may arrive as pydantic models or as plain dicts.
+    pool = []
+    for entry in obs.items:
+        pool.append(entry.model_dump() if hasattr(entry, "model_dump") else entry)
+    profile = obs.user_profile
+    if hasattr(profile, "model_dump"):
+        profile = profile.model_dump()
+    info = obs.task_info
+    prompt = f"""Step {step}/{info.max_steps}. You must recommend {info.recommend_k} items.
+Pool: {info.pool_size} items. Filtered so far: {info.items_filtered}. Categorized: {info.items_categorized}.
+User Profile:
+{format_profile_for_prompt(profile)}
+Items in pool:
+{format_items_for_prompt(pool)}
+"""
+    if last_feedback:
+        prompt += f"\nLast action feedback: {last_feedback}\n"
+    # Nudge the model towards 'recommend' as the step budget runs out.
+    steps_to_limit = info.max_steps - step
+    if steps_to_limit <= 1:
+        prompt += (
+            "\nWARNING: This is your last step. You MUST use 'recommend' action now.\n"
+        )
+    elif steps_to_limit <= 2:
+        prompt += "\nOnly 2 steps left. Consider recommending soon.\n"
+    return prompt
+def parse_action_from_response(text: str) -> Optional[Dict]:
+    """Extract the JSON object describing an action from raw LLM output.
+    Candidates are tried in order: the contents of each ``` fenced block (an
+    optional leading "json" tag is stripped), the whole text, and finally the
+    span from the first "{" to the last "}". The first candidate that decodes
+    to a JSON object wins.
+    Args:
+        text: Raw model reply, possibly wrapped in markdown or prose.
+    Returns:
+        The decoded dict, or None when no candidate decodes to a JSON object.
+        Valid JSON of any other type (list, string, number, ...) is rejected,
+        so callers can safely do key lookups and ``**`` unpacking on the result.
+    """
+    text = text.strip()
+    candidates: List[str] = []
+    if "```" in text:
+        # Odd-numbered pieces of the split are the insides of fenced blocks.
+        for part in text.split("```")[1::2]:
+            part = part.strip()
+            if part.startswith("json"):
+                part = part[4:].strip()
+            candidates.append(part)
+    candidates.append(text)
+    # Last resort: the widest brace-delimited span embedded in surrounding prose.
+    start = text.find("{")
+    end = text.rfind("}") + 1
+    if start >= 0 and end > start:
+        candidates.append(text[start:end])
+    for candidate in candidates:
+        try:
+            parsed = json.loads(candidate)
+        except json.JSONDecodeError:
+            continue
+        # A bare string, number or list is valid JSON but not a usable action.
+        if isinstance(parsed, dict):
+            return parsed
+    return None
+def get_model_action(
+    client: OpenAI, obs: Any, step: int, last_feedback: Optional[str]
+) -> Dict:
+    """Ask the LLM for the next action, falling back to a default recommend.
+    The fallback recommends the first recommend_k pool items (5 when task_info
+    is missing). It is used when the request raises or when the reply does not
+    parse into something containing "action_type".
+    """
+    user_prompt = build_user_prompt(obs, step, last_feedback)
+    conversation = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": user_prompt},
+    ]
+    try:
+        completion = client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=conversation,
+            temperature=TEMPERATURE,
+            max_tokens=MAX_TOKENS,
+            stream=False,
+        )
+        reply = (completion.choices[0].message.content or "").strip()
+        parsed = parse_action_from_response(reply)
+        if parsed and "action_type" in parsed:
+            return parsed
+    except Exception as exc:
+        print(f"[DEBUG] Model request failed: {exc}", flush=True)
+    # Fallback: recommend first N items from pool
+    pool_ids = []
+    for entry in obs.items:
+        pool_ids.append(entry.id if hasattr(entry, "id") else entry["id"])
+    if obs.task_info:
+        top_k = obs.task_info.recommend_k
+    else:
+        top_k = 5
+    return {"action_type": "recommend", "item_ids": pool_ids[:top_k]}
+async def main() -> None:
+    """Run one episode of TASK_NAME with the LLM agent and log it to stdout.
+    Emits a [START] line, one [STEP] line per environment step and a final
+    [END] line. The [END] line is printed from the finally block, so it appears
+    even when the episode raises part-way through.
+    """
+    llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
+    # NOTE(review): IMAGE_NAME is None when the env var is unset, and this call
+    # sits outside the try block — confirm from_docker_image handles that.
+    env = await CuratorEnv.from_docker_image(IMAGE_NAME)
+    rewards: List[float] = []
+    steps_taken = 0
+    score = 0.0
+    success = False
+    last_feedback: Optional[str] = None
+    log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
+    try:
+        result = await env.reset(task_id=TASK_NAME)
+        obs = result.observation
+        task_info = obs.task_info
+        # Fall back to 10 steps when the observation carries no task_info.
+        max_steps = task_info.max_steps if task_info else 10
+        for step in range(1, max_steps + 1):
+            if result.done:
+                break
+            action_dict = get_model_action(llm_client, obs, step, last_feedback)
+            # NOTE(review): nothing catches a failure here; presumably a malformed
+            # model action raises and ends the run — confirm that is intended.
+            action = CuratorAction(**action_dict)
+            result = await env.step(action)
+            obs = result.observation
+            # A missing (None) reward is recorded as 0.0.
+            reward = result.reward or 0.0
+            done = result.done
+            # NOTE(review): error is never set to anything else in this function,
+            # so every [STEP] line reports error=null.
+            error = None
+            rewards.append(reward)
+            steps_taken = step
+            # Summarize action for logging
+            # NOTE(review): only item_ids is counted; rank/categorize actions
+            # presumably carry their ids in rankings/categories — verify.
+            action_summary = f"{action.action_type}({len(action.item_ids)}items)"
+            log_step(
+                step=step, action=action_summary, reward=reward, done=done, error=error
+            )
+            # Capture feedback for next prompt
+            if obs.feedback:
+                last_feedback = obs.feedback.explanation
+            else:
+                last_feedback = None
+            if done:
+                break
+        # Final score is the last reward (the recommend action's reward when the
+        # episode ended with one), clamped to [0, 1].
+        score = rewards[-1] if rewards else 0.0
+        score = min(max(score, 0.0), 1.0)
+        success = score >= SUCCESS_SCORE_THRESHOLD
+    finally:
+        # Best-effort cleanup: a failing close() is reported but must not
+        # prevent the [END] line from being printed.
+        try:
+            await env.close()
+        except Exception as e:
+            print(f"[DEBUG] env.close() error: {e}", flush=True)
+        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
+if __name__ == "__main__":
+    asyncio.run(main())

models.py ADDED Viewed

	@@ -0,0 +1,135 @@

+"""
+Data models for the Curator Environment.
+Curator is a personalized content curation environment where an agent
+must filter, categorize, rank, and recommend content items from a mixed
+pool of real articles across multiple sources.
+"""
+from typing import Dict, List, Literal, Optional
+from openenv.core.env_server.types import Action, Observation
+from pydantic import BaseModel, Field
+# =============================================================================
+# Helper Models
+# =============================================================================
class ContentItem(BaseModel):
    """One entry in the content pool, regardless of which source it came from.

    ``id``, ``source`` and ``title`` must be supplied; every other field
    falls back to the default declared on its ``Field``.
    """

    # --- required identity ---
    id: str = Field(..., description="Unique item identifier")
    source: str = Field(
        ...,
        description="Content source: hackernews, arxiv, devto, reddit",
    )
    title: str = Field(..., description="Item title")

    # --- optional descriptive metadata ---
    summary: str = Field("", description="Brief summary or description")
    tags: List[str] = Field(default_factory=list, description="Topic tags")
    url: str = Field("", description="Original URL")
    author: str = Field("", description="Author name")

    # --- signals used when scoring/ranking ---
    score: int = Field(0, description="Community score/upvotes")
    reading_time_mins: int = Field(5, description="Estimated reading time")
    content_type: str = Field(
        "article",
        description="Type: article, paper, discussion, job, tutorial, event",
    )
class UserProfile(BaseModel):
    """Preferences of the user the curated feed is built for.

    Only ``interests`` is mandatory. The remaining fields default to an
    empty source list, a 60-minute budget, no read history and an
    ``"intermediate"`` skill level.
    """

    # Maps a topic tag to how much the user cares about it.
    interests: Dict[str, float] = Field(
        ...,
        description="Topic interest weights (0.0-1.0)",
    )
    preferred_sources: List[str] = Field(
        default_factory=list,
        description="Preferred content sources",
    )
    time_budget_mins: int = Field(
        60,
        description="Available reading time in minutes",
    )
    read_history: List[str] = Field(
        default_factory=list,
        description="IDs of already-read items",
    )
    skill_level: str = Field(
        "intermediate",
        description="User expertise: beginner, intermediate, expert",
    )
class ActionFeedback(BaseModel):
    """Scores and a textual explanation returned after an agent action.

    Every field is optional: the three numeric fields default to ``0.0``
    and ``explanation`` defaults to an empty string.
    """

    relevance_score: float = Field(
        0.0,
        description="How relevant the action's items were (0-1)",
    )
    coverage_score: float = Field(
        0.0,
        description="Source/topic diversity score (0-1)",
    )
    redundancy_penalty: float = Field(
        0.0,
        description="Penalty for recommending already-seen items (0-1)",
    )
    explanation: str = Field("", description="Explanation of the feedback")
class TaskInfo(BaseModel):
    """Static configuration plus running counters for the active task.

    The first four fields are required; the four counters default to ``0``.
    """

    # --- task configuration (required) ---
    task_id: str = Field(..., description="Task identifier: easy, medium, hard")
    difficulty: str = Field(..., description="Difficulty level")
    max_steps: int = Field(..., description="Maximum steps allowed")
    recommend_k: int = Field(..., description="Number of items to recommend")

    # --- progress counters ---
    pool_size: int = Field(0, description="Current items in pool")
    items_filtered: int = Field(0, description="Items filtered so far")
    items_categorized: int = Field(0, description="Items categorized so far")
    step_number: int = Field(0, description="Current step number")
+# =============================================================================
+# Action & Observation Models
+# =============================================================================
class CuratorAction(Action):
    """A single move submitted by the agent to the Curator environment.

    ``action_type`` selects one of four operations (filter, categorize,
    rank, recommend). ``item_ids`` names the items involved, while
    ``categories`` and ``rankings`` carry the extra payload described for
    the categorize and rank operations respectively.
    """

    action_type: Literal["filter", "categorize", "rank", "recommend"] = Field(
        ...,
        description="Type of action to perform",
    )
    item_ids: List[str] = Field(
        default_factory=list,
        description="Item IDs being acted on",
    )
    # Each value must be one of the four category labels listed here.
    categories: Optional[
        Dict[str, Literal["urgent", "read_later", "share", "skip"]]
    ] = Field(
        None,
        description="Category assignments: {item_id: category} (for categorize action)",
    )
    rankings: Optional[List[str]] = Field(
        None,
        description="Ordered list of item IDs by priority (for rank action)",
    )
    reasoning: Optional[str] = Field(
        None,
        description="Agent's reasoning for this action",
    )
class CuratorObservation(Observation):
    """What the agent sees after a reset or a step.

    All fields are optional: ``items`` defaults to an empty list and the
    three nested models default to ``None``.
    """

    items: List[ContentItem] = Field(
        default_factory=list,
        description="Current pool of content items",
    )
    user_profile: Optional[UserProfile] = Field(
        None,
        description="User preference profile",
    )
    feedback: Optional[ActionFeedback] = Field(
        None,
        description="Feedback from the last action",
    )
    task_info: Optional[TaskInfo] = Field(
        None,
        description="Current task configuration and progress",
    )

openenv.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+spec_version: 1
+name: openenv-curator
+type: space
+runtime: fastapi
+app: server.app:app
+port: 8000

pre-validation.sh ADDED Viewed

	@@ -0,0 +1,185 @@

+#!/usr/bin/env bash
+#
+# validate-submission.sh — OpenEnv Submission Validator
+#
+# Checks that your HF Space is live, Docker image builds, and openenv validate passes.
+#
+# Prerequisites:
+#   - Docker:       https://docs.docker.com/get-docker/
+#   - openenv-core: pip install openenv-core
+#   - curl (usually pre-installed)
+#
+# Run:
+#   curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
+#
+#   Or download and run locally:
+#     chmod +x validate-submission.sh
+#     ./validate-submission.sh <ping_url> [repo_dir]
+#
+# Arguments:
+#   ping_url   Your HuggingFace Space URL (e.g. https://your-space.hf.space)
+#   repo_dir   Path to your repo (default: current directory)
+#
+# Examples:
+#   ./validate-submission.sh https://my-team.hf.space
+#   ./validate-submission.sh https://my-team.hf.space ./my-repo
+#
# -u: referencing an unset variable is an error.
# -o pipefail: a pipeline fails if any stage fails.
# -e is not enabled, so each step below checks its own command status.
set -uo pipefail
# Upper bound, in seconds, passed to run_with_timeout for `docker build`.
DOCKER_BUILD_TIMEOUT=600
# Emit ANSI color codes only when stdout is a terminal; otherwise the
# color variables are empty strings so redirected output stays plain.
if [ -t 1 ]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[1;33m'
  BOLD='\033[1m'
  NC='\033[0m'
else
  RED='' GREEN='' YELLOW='' BOLD='' NC=''
fi
# run_with_timeout SECS CMD [ARGS...]
# Run CMD with a time limit of SECS seconds and return CMD's exit status.
# Uses `timeout` or `gtimeout` when one is on PATH; otherwise runs CMD in
# the background next to a watcher subshell that kills it after SECS.
run_with_timeout() {
  local limit="$1"
  shift
  local tool
  for tool in timeout gtimeout; do
    if command -v "$tool" &>/dev/null; then
      "$tool" "$limit" "$@"
      # Bare `return` propagates the status of the command just run.
      return
    fi
  done
  # Fallback: no timeout utility available.
  "$@" &
  local child=$!
  ( sleep "$limit" && kill "$child" 2>/dev/null ) &
  local guard=$!
  wait "$child" 2>/dev/null
  local status=$?
  # The command finished (or was killed); the watcher is no longer needed.
  kill "$guard" 2>/dev/null
  wait "$guard" 2>/dev/null
  return "$status"
}
# portable_mktemp [PREFIX]
# Create a temp file named PREFIX-XXXXXX (PREFIX defaults to "validate")
# under $TMPDIR, or /tmp when TMPDIR is unset, and print its path.
# Falls back to a plain `mktemp` if the templated call fails.
portable_mktemp() {
  local stem="${1:-validate}"
  local base_dir="${TMPDIR:-/tmp}"
  mktemp "${base_dir}/${stem}-XXXXXX" 2>/dev/null || mktemp
}
# Paths registered here are deleted when the script exits.
CLEANUP_FILES=()
cleanup() {
  # The ${arr[@]+...} form expands to nothing when the array is empty, so
  # `set -u` does not trip on an empty array expansion.
  rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"
}
trap cleanup EXIT
# Positional arguments: $1 = Space URL (required), $2 = repo directory
# (defaults to the current directory).
PING_URL="${1:-}"
REPO_DIR="${2:-.}"
# Without a URL there is nothing to validate: print usage and exit.
if [ -z "$PING_URL" ]; then
  printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
  printf "\n"
  printf "  ping_url   Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
  printf "  repo_dir   Path to your repo (default: current directory)\n"
  exit 1
fi
# Resolve REPO_DIR to an absolute path; this also verifies that it exists.
if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
  printf "Error: directory '%s' not found\n" "${2:-.}"
  exit 1
fi
# Drop one trailing slash so "$PING_URL/reset" never contains "//".
PING_URL="${PING_URL%/}"
export PING_URL
# Number of checks passed so far (incremented by pass()).
PASS=0
# log MESSAGE...: print MESSAGE prefixed with a UTC HH:MM:SS timestamp.
# %b makes printf interpret backslash escapes (the color codes) in MESSAGE.
log() {
  local stamp
  stamp="$(date -u +%H:%M:%S)"
  printf "[%s] %b\n" "$stamp" "$*"
}
# pass MESSAGE: log a PASSED line and bump the PASS counter.
pass() {
  log "${GREEN}PASSED${NC} -- $1"
  PASS=$((PASS + 1))
}
# fail MESSAGE: log a FAILED line (does not exit by itself).
fail() {
  log "${RED}FAILED${NC} -- $1"
}
# hint MESSAGE: print an indented remediation hint.
hint() {
  printf "  ${YELLOW}Hint:${NC} %b\n" "$1"
}
# stop_at STEP_NAME: report where validation stopped and exit with status 1.
stop_at() {
  printf "\n"
  printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
  exit 1
}
printf "\n"
printf "${BOLD}========================================${NC}\n"
printf "${BOLD}  OpenEnv Submission Validator${NC}\n"
printf "${BOLD}========================================${NC}\n"
log "Repo:     $REPO_DIR"
log "Ping URL: $PING_URL"
printf "\n"
# ---------------------------------------------------------------------------
# Step 1: POST an empty JSON body to <PING_URL>/reset and require HTTP 200.
# ---------------------------------------------------------------------------
log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
CURL_OUTPUT=$(portable_mktemp "validate-curl")
CLEANUP_FILES+=("$CURL_OUTPUT")
# On a connection failure curl itself prints "000" for %{http_code} and exits
# non-zero. Overwrite the value on failure instead of appending to it, so the
# "000" comparison below matches. stderr is discarded rather than redirected
# onto the same file that -o writes the response body to.
HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
  -H "Content-Type: application/json" -d '{}' \
  --max-time 30 "$PING_URL/reset" 2>/dev/null) || HTTP_CODE="000"
if [ "$HTTP_CODE" = "200" ]; then
  pass "HF Space is live and responds to /reset"
elif [ "$HTTP_CODE" = "000" ]; then
  fail "HF Space not reachable (connection failed or timed out)"
  hint "Check your network connection and that the Space is running."
  # hint() passes this text as a printf argument, not as the format string,
  # so a single % is printed verbatim and the command can be copy-pasted.
  hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
  stop_at "Step 1"
else
  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
  hint "Make sure your Space is running and the URL is correct."
  hint "Try opening $PING_URL in your browser first."
  stop_at "Step 1"
fi
# ---------------------------------------------------------------------------
# Step 2: the Docker image must build within DOCKER_BUILD_TIMEOUT seconds.
# ---------------------------------------------------------------------------
log "${BOLD}Step 2/3: Running docker build${NC} ..."
if ! command -v docker &>/dev/null; then
  fail "docker command not found"
  hint "Install Docker: https://docs.docker.com/get-docker/"
  stop_at "Step 2"
fi
# Choose the build context: the repo root when it holds a Dockerfile,
# otherwise the server/ directory.
# NOTE(review): with the server/ fallback the build context is server/ itself,
# so a Dockerfile that copies files from the repo root (e.g. pyproject.toml)
# would not find them -- confirm whether `-f server/Dockerfile "$REPO_DIR"`
# is what is intended here.
if [ -f "$REPO_DIR/Dockerfile" ]; then
  DOCKER_CONTEXT="$REPO_DIR"
elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
  DOCKER_CONTEXT="$REPO_DIR/server"
else
  fail "No Dockerfile found in repo root or server/ directory"
  stop_at "Step 2"
fi
log "  Found Dockerfile in $DOCKER_CONTEXT"
# BUILD_OK flips to true only when the build command exits with status 0;
# stdout and stderr are captured together for the failure report.
BUILD_OK=false
BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
if [ "$BUILD_OK" = true ]; then
  pass "Docker build succeeded"
else
  fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
  # Show only the last 20 lines of the captured build log.
  printf "%s\n" "$BUILD_OUTPUT" | tail -20
  stop_at "Step 2"
fi
# ---------------------------------------------------------------------------
# Step 3: `openenv validate`, run from the repo directory, must succeed.
# ---------------------------------------------------------------------------
log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
if ! command -v openenv &>/dev/null; then
  fail "openenv command not found"
  hint "Install it: pip install openenv-core"
  stop_at "Step 3"
fi
# The `cd` happens inside the command substitution's subshell, so the
# script's own working directory is unchanged.
VALIDATE_OK=false
VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
if [ "$VALIDATE_OK" = true ]; then
  pass "openenv validate passed"
  # Echo the validator's output only when it printed something.
  [ -n "$VALIDATE_OUTPUT" ] && log "  $VALIDATE_OUTPUT"
else
  fail "openenv validate failed"
  printf "%s\n" "$VALIDATE_OUTPUT"
  stop_at "Step 3"
fi
# Reaching this point means no step called stop_at, i.e. all checks passed.
printf "\n"
printf "${BOLD}========================================${NC}\n"
printf "${GREEN}${BOLD}  All 3/3 checks passed!${NC}\n"
printf "${GREEN}${BOLD}  Your submission is ready to submit.${NC}\n"
printf "${BOLD}========================================${NC}\n"
printf "\n"
exit 0

pyproject.toml ADDED Viewed

	@@ -0,0 +1,34 @@

+[build-system]
+requires = ["setuptools>=45", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "openenv-curator"
+version = "0.1.0"
+description = "Curator: Personalized content curation environment for OpenEnv"
+requires-python = ">=3.10"
+dependencies = [
+    "openenv-core[core]>=0.2.2",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-cov>=4.0.0",
+]
+inference = [
+    "openai>=1.0",
+]
+[project.scripts]
+server = "curator.server.app:main"
+[tool.setuptools]
+include-package-data = true
+packages = ["curator", "curator.server"]
+package-dir = { "curator" = ".", "curator.server" = "server" }
+[dependency-groups]
+dev = [
+    "ruff>=0.15.9",
+]

scripts/fetch_data.py ADDED Viewed

	@@ -0,0 +1,501 @@

+#!/usr/bin/env python3
+"""
+Fetch real content items from public APIs and save as static JSON.
+Sources (all free, no auth):
+- Hacker News (Firebase API)
+- arXiv (public API)
+- DEV.to (public API)
+- Reddit (public JSON)
+Run once: python scripts/fetch_data.py
+Output: data/items.json
+"""
+import json
+import math
+import time
+import xml.etree.ElementTree as ET
+from pathlib import Path
+from urllib.request import Request, urlopen
+DATA_DIR = Path(__file__).parent.parent / "data"
+# Tag extraction keywords
# Tag extraction keywords: maps a topic tag to the phrases that imply it.
TAG_KEYWORDS = {
    "ai": [
        "ai",
        "artificial intelligence",
        "machine learning",
        "ml",
        "deep learning",
        "neural",
    ],
    "nlp": [
        "nlp",
        "natural language",
        "language model",
        "llm",
        "gpt",
        "transformer",
        "bert",
    ],
    "web": [
        "web",
        "javascript",
        "react",
        "frontend",
        "css",
        "html",
        "browser",
        "nextjs",
        "vue",
    ],
    "systems": [
        "systems",
        "linux",
        "kernel",
        "os",
        "distributed",
        "infrastructure",
        "devops",
    ],
    "rust": ["rust", "cargo", "rustc", "borrow checker"],
    "python": ["python", "pip", "django", "flask", "fastapi", "pytorch"],
    "go": ["golang", " go ", "goroutine"],
    "security": [
        "security",
        "vulnerability",
        "exploit",
        "crypto",
        "encryption",
        "privacy",
    ],
    "database": ["database", "sql", "postgres", "mongodb", "redis", "sqlite"],
    "cloud": ["cloud", "aws", "gcp", "azure", "kubernetes", "docker", "k8s"],
    "mobile": ["mobile", "ios", "android", "swift", "kotlin", "flutter"],
    "data": [
        "data",
        "analytics",
        "visualization",
        "pandas",
        "spark",
        "etl",
        "pipeline",
    ],
    "career": ["career", "hiring", "interview", "salary", "remote", "job"],
    "startup": ["startup", "funding", "venture", "entrepreneur", "saas", "product"],
    "open-source": [
        "open source",
        "open-source",
        "oss",
        "github",
        "foss",
        "mit license",
    ],
    "robotics": ["robot", "robotics", "autonomous", "drone", "perception", "slam"],
    "cv": ["computer vision", "image", "object detection", "segmentation", "diffusion"],
}


def extract_tags(title: str, summary: str = "") -> list[str]:
    """Extract topic tags from title and summary text.

    A tag is assigned when at least one of its ``TAG_KEYWORDS`` phrases
    occurs in the lower-cased text as a whole token, i.e. not embedded in a
    longer alphanumeric run. This prevents short keywords from firing inside
    unrelated words ("ai" in "email", "rust" in "trust", "pip" in
    "pipeline"). Keywords of three or more characters also match with a
    trailing plural "s" ("llms", "robots"). Whitespace padding in a keyword
    (such as " go ") is ignored, since the boundary check already covers it
    and also handles text that starts or ends with the keyword.

    Args:
        title: Item title.
        summary: Optional summary or body text.

    Returns:
        Matching tags in ``TAG_KEYWORDS`` order, or ``["general"]`` when no
        keyword matches.
    """
    import re  # function-scope import: only this helper needs it

    text = f"{title} {summary}".lower()

    def mentions(keyword: str) -> bool:
        core = keyword.strip()
        # Two-letter keywords get no plural form: "os" + "s" would collide
        # with the separate "oss" keyword.
        plural = "s?" if len(core) >= 3 else ""
        pattern = rf"(?<![a-z0-9]){re.escape(core)}{plural}(?![a-z0-9])"
        return re.search(pattern, text) is not None

    tags = [
        tag
        for tag, keywords in TAG_KEYWORDS.items()
        if any(mentions(kw) for kw in keywords)
    ]
    return tags or ["general"]
+def fetch_json(url: str, headers: dict | None = None) -> dict | list:
+    """Fetch JSON from a URL."""
+    req = Request(url, headers=headers or {"User-Agent": "Curator/1.0"})
+    with urlopen(req, timeout=30) as resp:
+        return json.loads(resp.read().decode())
def fetch_text(url: str) -> str:
    """Download ``url`` and return the body decoded with the default codec.

    The request is sent with a ``User-Agent`` of ``Curator/1.0`` and a
    30-second timeout.
    """
    request = Request(url, headers={"User-Agent": "Curator/1.0"})
    with urlopen(request, timeout=30) as response:
        raw = response.read()
    return raw.decode()
def fetch_hackernews(count: int = 60) -> list[dict]:
    """Fetch top stories from Hacker News.

    Takes the first ``count`` IDs from the top-stories listing and loads
    each one. Entries that are empty or whose ``type`` is not ``"story"``
    are skipped, as are IDs whose request raises (the error is printed).

    Returns:
        Item dicts with source ``"hackernews"``; may hold fewer than
        ``count`` entries.
    """
    print(f"  Fetching {count} Hacker News stories...")
    top_ids = fetch_json("https://hacker-news.firebaseio.com/v0/topstories.json")
    collected = []
    for sid in top_ids[:count]:
        try:
            story = fetch_json(f"https://hacker-news.firebaseio.com/v0/item/{sid}.json")
            if not (story and story.get("type") == "story"):
                continue
            headline = story.get("title", "")
            # Text-only posts have no "url"; link to the HN discussion page.
            link = story.get("url", f"https://news.ycombinator.com/item?id={sid}")
            record = {
                "id": f"hn_{sid}",
                "source": "hackernews",
                "title": headline,
                # HN has no summaries, so the title doubles as the summary.
                "summary": headline,
                "tags": extract_tags(headline),
                "url": link,
                "author": story.get("by", ""),
                "score": story.get("score", 0),
                "reading_time_mins": 5,
                "content_type": "article",
            }
            collected.append(record)
        except Exception as e:
            # Best effort: one bad story must not abort the whole fetch.
            print(f"    Skipping HN story {sid}: {e}")
        time.sleep(0.05)  # Be polite
    print(f"    Got {len(collected)} HN items")
    return collected
def fetch_arxiv(count: int = 50) -> list[dict]:
    """Fetch recent AI/ML papers from arXiv.

    Queries the cs.AI, cs.LG and cs.CL categories, newest submissions
    first, and converts each Atom ``entry`` into an item dict. Entries that
    raise while being parsed are skipped with a printed message.

    Returns:
        Item dicts with source ``"arxiv"`` and content type ``"paper"``.
    """
    print(f"  Fetching {count} arXiv papers...")
    categories = "cat:cs.AI+OR+cat:cs.LG+OR+cat:cs.CL"
    url = f"https://export.arxiv.org/api/query?search_query={categories}&sortBy=submittedDate&sortOrder=descending&max_results={count}"
    feed = ET.fromstring(fetch_text(url))
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    papers = []
    for entry in feed.findall("atom:entry", ns):
        try:
            # The <id> element holds the abstract URL; its tail is the paper ID.
            link = entry.find("atom:id", ns).text
            arxiv_id = link.split("/abs/")[-1]
            title = entry.find("atom:title", ns).text.strip().replace("\n", " ")
            abstract = entry.find("atom:summary", ns).text.strip().replace("\n", " ")
            summary = abstract[:300]
            authors = []
            for author_node in entry.findall("atom:author", ns):
                authors.append(author_node.find("atom:name", ns).text)
            # Estimate reading time from summary length (10 minutes minimum).
            reading_time = max(10, len(summary.split()) // 20)
            safe_id = arxiv_id.replace("/", "_").replace(".", "_")
            first_author = authors[0] if authors else ""
            papers.append(
                {
                    "id": f"arxiv_{safe_id}",
                    "source": "arxiv",
                    "title": title,
                    "summary": summary,
                    "tags": extract_tags(title, summary),
                    "url": link,
                    "author": first_author,
                    "score": 0,
                    "reading_time_mins": reading_time,
                    "content_type": "paper",
                }
            )
        except Exception as e:
            print(f"    Skipping arXiv entry: {e}")
    print(f"    Got {len(papers)} arXiv items")
    return papers
def fetch_devto(count: int = 50) -> list[dict]:
    """Fetch articles from DEV.to.

    Requests an equal share of ``count`` from each of several tags,
    de-duplicates by article ID and trims the result to ``count``. If a
    tag's request or processing raises, the rest of that tag is skipped
    and a message is printed.

    Returns:
        Item dicts with source ``"devto"``.
    """
    print(f"  Fetching {count} DEV.to articles...")
    collected = []
    # Fetch from multiple tags to get variety
    tags_to_fetch = ["programming", "ai", "webdev", "python", "tutorial"]
    per_tag = math.ceil(count / len(tags_to_fetch))
    seen_ids = set()
    for tag in tags_to_fetch:
        try:
            articles = fetch_json(
                f"https://dev.to/api/articles?per_page={per_tag}&tag={tag}&top=7"
            )
            for article in articles:
                aid = article["id"]
                if aid in seen_ids:
                    continue
                seen_ids.add(aid)
                title = article.get("title", "")
                desc = article.get("description", "")
                tag_list = article.get("tag_list", [])
                summary = desc[:300] if desc else title
                # Prefer the article's own tags (at most five); fall back to
                # keyword extraction only when it has none.
                if tag_list:
                    item_tags = [t.lower() for t in tag_list[:5]]
                else:
                    item_tags = extract_tags(title, desc)
                if "tutorial" in (tag_list or []):
                    kind = "tutorial"
                else:
                    kind = "article"
                collected.append(
                    {
                        "id": f"devto_{aid}",
                        "source": "devto",
                        "title": title,
                        "summary": summary,
                        "tags": item_tags,
                        "url": article.get("url", ""),
                        "author": article.get("user", {}).get("username", ""),
                        "score": article.get("positive_reactions_count", 0),
                        "reading_time_mins": article.get("reading_time_minutes", 5),
                        "content_type": kind,
                    }
                )
            time.sleep(0.2)
        except Exception as e:
            print(f"    Skipping DEV.to tag {tag}: {e}")
    collected = collected[:count]
    print(f"    Got {len(collected)} DEV.to items")
    return collected
def fetch_reddit(count: int = 40) -> list[dict]:
    """Fetch posts from Reddit programming subreddits.

    Requests an equal share of ``count`` hot posts from each subreddit,
    skips stickied posts and duplicates, and trims the result to ``count``.
    If a subreddit's request or processing raises, the rest of that
    subreddit is skipped and a message is printed.

    The reading-time estimate is computed from the full post body (one
    minute per 200 words, minimum 2). Only the stored summary is truncated
    to 300 characters; estimating from the truncated text would yield the
    minimum for every post. Posts without a body get a fixed 3 minutes.

    Returns:
        Item dicts with source ``"reddit"`` and content type
        ``"discussion"``.
    """
    print(f"  Fetching {count} Reddit posts...")
    items = []
    subreddits = ["programming", "machinelearning", "webdev"]
    per_sub = math.ceil(count / len(subreddits))
    seen_ids = set()
    for sub in subreddits:
        try:
            listing = fetch_json(
                f"https://www.reddit.com/r/{sub}/hot.json?limit={per_sub}",
                headers={"User-Agent": "Curator/1.0 (content-curation-research)"},
            )
            for child in listing.get("data", {}).get("children", []):
                post = child["data"]
                rid = post["id"]
                if rid in seen_ids or post.get("stickied"):
                    continue
                seen_ids.add(rid)
                title = post.get("title", "")
                # `or ""` also covers an explicit null selftext.
                body = post.get("selftext") or ""
                excerpt = body[:300]
                if body:
                    reading_time = max(2, len(body.split()) // 200)
                else:
                    reading_time = 3
                items.append(
                    {
                        "id": f"reddit_{rid}",
                        "source": "reddit",
                        "title": title,
                        "summary": excerpt if excerpt else title,
                        "tags": extract_tags(title, excerpt),
                        "url": f"https://reddit.com{post.get('permalink', '')}",
                        "author": post.get("author", ""),
                        "score": post.get("score", 0),
                        "reading_time_mins": reading_time,
                        "content_type": "discussion",
                    }
                )
            time.sleep(0.5)
        except Exception as e:
            # Best effort: one failing subreddit must not abort the fetch.
            print(f"    Skipping Reddit r/{sub}: {e}")
    items = items[:count]
    print(f"    Got {len(items)} Reddit items")
    return items
def compute_relevance(
    item: dict,
    profile: dict,
    *,
    budget_slots: int = 5,
    score_cap: float = 200.0,
) -> float:
    """Compute relevance score (0-1) of an item for a user profile.

    Scoring:
    - 0.50 weight: tag match (sum of matched interest weights / total
      interest weight)
    - 0.20 weight: source preference (1.0 if preferred or no preference is
      stated, 0.3 otherwise)
    - 0.15 weight: community signal (``score / score_cap`` capped at 1.0;
      0.2 when the score is missing, null or not positive)
    - 0.10 weight: reading time fit (within the per-item budget = 1.0,
      over = 0.3)
    - 0.05 weight: content type match (paper for expert, tutorial/article
      for beginner, flat 0.8 for intermediate, 0.5 otherwise)
    - Penalty: -0.4 for already-read items

    Args:
        item: Item dict with ``id``, ``tags``, ``source`` and
            ``reading_time_mins``; ``score`` and ``content_type`` are
            optional.
        profile: Profile dict with ``interests``; the other keys are
            optional and fall back to the defaults used below.
        budget_slots: Number of items the time budget is divided across to
            obtain the per-item reading allowance.
        score_cap: Community score that maps to a full 1.0 signal.

    Returns:
        The relevance clamped to [0, 1] and rounded to 4 decimals, or a
        flat 0.05 when the profile has no interests.
    """
    interests = profile["interests"]
    item_tags = set(item["tags"])
    if not interests:
        return 0.05
    # Tag match: how much of the user's interest space does this item cover?
    total_interest_weight = sum(interests.values())
    matched_weight = sum(interests.get(tag, 0.0) for tag in item_tags)
    tag_score = (
        matched_weight / total_interest_weight if total_interest_weight > 0 else 0.0
    )
    # Source preference: an empty preference list accepts every source.
    preferred = profile.get("preferred_sources", [])
    source_score = 1.0 if (not preferred or item["source"] in preferred) else 0.3
    # Community signal: 0..score_cap maps linearly to 0..1. `or 0` treats a
    # null score like a missing one instead of raising on the comparison.
    raw_score = item.get("score") or 0
    community_score = min(1.0, raw_score / score_cap) if raw_score > 0 else 0.2
    # Reading time fit
    budget = profile.get("time_budget_mins", 60)
    per_item_budget = budget / budget_slots
    time_score = 1.0 if item["reading_time_mins"] <= per_item_budget else 0.3
    # Content type match
    skill = profile.get("skill_level", "intermediate")
    ctype = item.get("content_type", "article")
    if skill == "expert" and ctype == "paper":
        type_score = 1.0
    elif skill == "beginner" and ctype in ("tutorial", "article"):
        type_score = 1.0
    elif skill == "intermediate":
        type_score = 0.8
    else:
        type_score = 0.5
    # Weighted combination
    relevance = (
        0.50 * tag_score
        + 0.20 * source_score
        + 0.15 * community_score
        + 0.10 * time_score
        + 0.05 * type_score
    )
    # Already-read penalty
    if item["id"] in profile.get("read_history", []):
        relevance -= 0.4
    return round(max(0.0, min(1.0, relevance)), 4)
def create_tasks() -> list[dict]:
    """Build the easy, medium and hard task definitions.

    Each task dict carries its pool size, step limit, allowed sources,
    number of items to recommend, a description and an embedded user
    profile. Every profile starts with its own empty ``read_history`` list.

    Returns:
        The three task dicts in easy, medium, hard order.
    """

    def make_profile(interests, preferred_sources, time_budget_mins, skill_level):
        # A fresh list per call so no two tasks share a read_history object.
        return {
            "interests": interests,
            "preferred_sources": preferred_sources,
            "time_budget_mins": time_budget_mins,
            "read_history": [],
            "skill_level": skill_level,
        }

    easy = {
        "task_id": "easy",
        "difficulty": "easy",
        "item_count": 20,
        "max_steps": 10,
        "sources": ["hackernews"],
        "recommend_k": 5,
        "description": "Curate 5 top articles from 20 Hacker News stories for an AI/ML enthusiast.",
        "profile": make_profile(
            {"ai": 0.95, "nlp": 0.85, "python": 0.8, "data": 0.7},
            ["hackernews", "arxiv"],
            120,
            "intermediate",
        ),
    }
    medium = {
        "task_id": "medium",
        "difficulty": "medium",
        "item_count": 50,
        "max_steps": 20,
        "sources": ["hackernews", "devto", "arxiv"],
        "recommend_k": 10,
        "description": "Curate 10 items from 50 across HN, DEV.to, and arXiv for a senior engineer with broad interests.",
        "profile": make_profile(
            {
                "ai": 0.9,
                "web": 0.7,
                "systems": 0.6,
                "security": 0.5,
                "python": 0.75,
                "cloud": 0.4,
                "open-source": 0.65,
                "startup": 0.3,
            },
            ["hackernews", "devto"],
            60,
            "expert",
        ),
    }
    hard = {
        "task_id": "hard",
        "difficulty": "hard",
        "item_count": 100,
        "max_steps": 30,
        "sources": ["hackernews", "devto", "arxiv", "reddit"],
        "recommend_k": 15,
        "description": "Curate 15 items from 100 across all sources for a beginner with minimal stated preferences. Must infer interests from feedback.",
        "profile": make_profile(
            {"rust": 0.5, "systems": 0.4},
            [],
            30,
            "beginner",
        ),
    }
    return [easy, medium, hard]
def main():
    """Fetch all sources and write items, tasks and ground truth as JSON.

    Steps: fetch items from the four sources, save ``items.json``, build
    the task definitions, assign a read history to the medium and hard
    profiles, score every task item with ``compute_relevance``, then save
    ``tasks.json`` and ``ground_truth.json`` and print a per-task summary.
    """

    def save(path, payload, label):
        with open(path, "w") as f:
            json.dump(payload, f, indent=2)
        print(f"Saved {label} to {path}")

    DATA_DIR.mkdir(exist_ok=True)
    print("Fetching real content data from public APIs...\n")
    # Fetch from all sources; this order is also the order of items.json.
    all_items = [
        *fetch_hackernews(60),
        *fetch_arxiv(50),
        *fetch_devto(50),
        *fetch_reddit(40),
    ]
    print(f"\nTotal items fetched: {len(all_items)}")
    save(DATA_DIR / "items.json", all_items, "items")

    # Create tasks (profiles are embedded in each task)
    tasks = create_tasks()
    # Per task: (minimum pool size that must be exceeded, pool indices to
    # mark as already read). The easy task keeps an empty read history.
    history_plan = {
        "medium": (5, range(0, 6, 2)),
        "hard": (10, range(0, 10, 3)),
    }
    ground_truth = {}
    for task in tasks:
        profile = task["profile"]
        allowed = task["sources"]
        # NOTE(review): the slice keeps the first item_count matches in fetch
        # order, so later sources may be absent from a pool -- confirm that
        # this is intended.
        pool = [it for it in all_items if it["source"] in allowed][
            : task["item_count"]
        ]
        plan = history_plan.get(task["task_id"])
        if plan is not None and len(pool) > plan[0]:
            profile["read_history"] = [pool[i]["id"] for i in plan[1]]
        # Scored after read_history is set so the penalty is applied.
        ground_truth[task["task_id"]] = {
            it["id"]: round(compute_relevance(it, profile), 4) for it in pool
        }

    # Save tasks (with updated read_history in profiles)
    save(DATA_DIR / "tasks.json", tasks, "tasks")
    save(DATA_DIR / "ground_truth.json", ground_truth, "ground truth")

    # Summary
    print("\n--- Summary ---")
    for task in tasks:
        tid = task["task_id"]
        scores = ground_truth[tid]
        avg_rel = sum(scores.values()) / len(scores) if scores else 0
        high_rel = len([v for v in scores.values() if v >= 0.5])
        print(
            f"  {tid}: {len(scores)} items, avg relevance={avg_rel:.3f}, high-relevance={high_rel}"
        )
+if __name__ == "__main__":
+    main()

server/Dockerfile ADDED Viewed

	@@ -0,0 +1,80 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# Multi-stage build using openenv-base
+# This Dockerfile is flexible and works for both:
+# - In-repo environments (with local OpenEnv sources)
+# - Standalone environments (with openenv from PyPI/Git)
+# The build script (openenv build) handles context detection and sets appropriate build args.
+ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+FROM ${BASE_IMAGE} AS builder
+WORKDIR /app
+# Ensure git is available (required for installing dependencies from VCS)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends git && \
+    rm -rf /var/lib/apt/lists/*
+# Build argument to control whether we're building standalone or in-repo
+ARG BUILD_MODE=in-repo
+ARG ENV_NAME=curator
+# Copy environment code (always at root of build context)
+COPY . /app/env
+# For in-repo builds, openenv is already vendored in the build context
+# For standalone builds, openenv will be installed via pyproject.toml
+WORKDIR /app/env
+# Ensure uv is available (for local builds where base image lacks it)
+RUN if ! command -v uv >/dev/null 2>&1; then \
+        curl -LsSf https://astral.sh/uv/install.sh | sh && \
+        mv /root/.local/bin/uv /usr/local/bin/uv && \
+        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
+    fi
+# Install dependencies using uv sync
+# If uv.lock exists, use it; otherwise resolve on the fly
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ -f uv.lock ]; then \
+        uv sync --frozen --no-install-project --no-editable; \
+    else \
+        uv sync --no-install-project --no-editable; \
+    fi
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ -f uv.lock ]; then \
+        uv sync --frozen --no-editable; \
+    else \
+        uv sync --no-editable; \
+    fi
+# Final runtime stage
+FROM ${BASE_IMAGE}
+WORKDIR /app
+# Copy the virtual environment from builder
+COPY --from=builder /app/env/.venv /app/.venv
+# Copy the environment code
+COPY --from=builder /app/env /app/env
+# Set PATH to use the virtual environment
+ENV PATH="/app/.venv/bin:$PATH"
+# Set PYTHONPATH so imports work correctly
+ENV PYTHONPATH="/app/env:$PYTHONPATH"
+# Health check
+HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:8000/health || exit 1
+# Run the FastAPI server
+# The module path is constructed to work with the /app/env structure
+CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]

server/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+"""Curator environment server components."""
+from .curator_environment import CuratorEnvironment
+__all__ = ["CuratorEnvironment"]

server/app.py ADDED Viewed

	@@ -0,0 +1,44 @@

+"""
+FastAPI application for the Curator Environment.
+Endpoints:
+    - POST /reset: Reset the environment
+    - POST /step: Execute an action
+    - GET /state: Get current environment state
+    - GET /schema: Get action/observation schemas
+    - WS /ws: WebSocket endpoint for persistent sessions
+Usage:
+    uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
+"""
+try:
+    from openenv.core.env_server.http_server import create_app
+except Exception as e:  # pragma: no cover
+    raise ImportError("openenv is required. Install with: uv sync") from e
+try:
+    from ..models import CuratorAction, CuratorObservation
+    from .curator_environment import CuratorEnvironment
+except (ImportError, ModuleNotFoundError):
+    from models import CuratorAction, CuratorObservation
+    from server.curator_environment import CuratorEnvironment
+# Module-level application object; this is the `server.app:app` target named in
+# the uvicorn usage line of the module docstring.
+app = create_app(
+    CuratorEnvironment,
+    CuratorAction,
+    CuratorObservation,
+    env_name="curator",
+    max_concurrent_envs=4,
+)
+def main(host: str = "0.0.0.0", port: int = 8000):
+    import uvicorn
+    uvicorn.run(app, host=host, port=port)
+if __name__ == "__main__":
+    main()

server/curator_environment.py ADDED Viewed

	@@ -0,0 +1,332 @@

+"""
+Curator Environment Implementation.
+A personalized content curation environment where an agent must filter,
+categorize, rank, and recommend content items from a mixed pool of real
+articles across multiple sources.
+"""
+import copy
+import json
+import random
+from pathlib import Path
+from typing import Dict, List, Optional
+from uuid import uuid4
+from openenv.core.env_server.interfaces import Environment
+from openenv.core.env_server.types import State
+try:
+    from ..models import (
+        ActionFeedback,
+        ContentItem,
+        CuratorAction,
+        CuratorObservation,
+        TaskInfo,
+        UserProfile,
+    )
+except ImportError:
+    from models import (
+        ActionFeedback,
+        ContentItem,
+        CuratorAction,
+        CuratorObservation,
+        TaskInfo,
+        UserProfile,
+    )
+try:
+    from . import grader
+except ImportError:
+    from server import grader
+DATA_DIR = Path(__file__).parent.parent / "data"
+class CuratorEnvironment(Environment):
+    """
+    Personalized content curation environment.
+    The agent receives a pool of real content items and a user profile,
+    then must filter, categorize, rank, and recommend the most relevant
+    items. Scored using standard IR metrics (NDCG, precision, recall).
+    Tasks:
+        - easy: 20 items from 1 source, clear preferences
+        - medium: 50 items from 3 sources, nuanced preferences
+        - hard: 100 items from 4 sources, minimal initial preferences
+    """
+    SUPPORTS_CONCURRENT_SESSIONS: bool = True
+    def __init__(self):
+        self._state = State(episode_id=str(uuid4()), step_count=0)
+        # Load static data
+        self._all_items = self._load_json("items.json")
+        self._all_tasks = {t["task_id"]: t for t in self._load_json("tasks.json")}
+        self._ground_truth = self._load_json("ground_truth.json")
+        # Episode state
+        self._task_config: Optional[dict] = None
+        self._profile: Optional[dict] = None
+        self._relevance: Dict[str, float] = {}
+        self._current_pool: List[dict] = []
+        self._items_by_id: Dict[str, dict] = {}
+        self._filtered_ids: List[str] = []
+        self._categories: Dict[str, str] = {}
+        self._last_ranking: List[str] = []
+        self._recommended_ids: List[str] = []
+        self._items_filtered_count = 0
+        self._items_categorized_count = 0
+    @staticmethod
+    def _load_json(filename: str) -> dict | list:
+        path = DATA_DIR / filename
+        with open(path) as f:
+            return json.load(f)
+    def reset(self, **kwargs) -> CuratorObservation:  # type: ignore[override]
+        """Reset the environment with a task configuration.
+        Args:
+            **kwargs: Must include 'task_id' ("easy", "medium", or "hard").
+                      Optional 'seed' for reproducibility.
+        """
+        task_id = kwargs.get("task_id", "easy")
+        seed = kwargs.get("seed", None)
+        if task_id not in self._all_tasks:
+            task_id = "easy"
+        self._task_config = self._all_tasks[task_id]
+        self._profile = copy.deepcopy(self._task_config["profile"])
+        self._relevance = self._ground_truth.get(task_id, {})
+        # Select items for this task
+        sources = self._task_config["sources"]
+        item_count = self._task_config["item_count"]
+        if sources == "all":
+            pool = list(self._all_items)
+        else:
+            pool = [it for it in self._all_items if it["source"] in sources]
+        # Shuffle with seed for reproducibility
+        if seed is not None:
+            random.seed(seed)
+        random.shuffle(pool)
+        self._current_pool = pool[:item_count]
+        self._items_by_id = {it["id"]: it for it in self._current_pool}
+        # Reset episode state
+        self._state = State(episode_id=str(uuid4()), step_count=0)
+        self._filtered_ids = []
+        self._categories = {}
+        self._last_ranking = []
+        self._recommended_ids = []
+        self._items_filtered_count = 0
+        self._items_categorized_count = 0
+        return self._make_observation(reward=0.0, done=False)
+    def step(self, action: CuratorAction) -> CuratorObservation:  # type: ignore[override]
+        """Execute one step in the environment.
+        Args:
+            action: CuratorAction with action_type and relevant fields.
+        """
+        self._state.step_count += 1
+        max_steps = self._task_config["max_steps"]
+        action_type = action.action_type
+        reward = 0.0
+        feedback = ActionFeedback()
+        done = False
+        if action_type == "filter":
+            reward, feedback = self._handle_filter(action)
+        elif action_type == "categorize":
+            reward, feedback = self._handle_categorize(action)
+        elif action_type == "rank":
+            reward, feedback = self._handle_rank(action)
+        elif action_type == "recommend":
+            reward, feedback = self._handle_recommend(action)
+            done = True
+        # Auto-end if max steps reached
+        if self._state.step_count >= max_steps and not done:
+            done = True
+            # If no recommendation was made, auto-recommend from last ranking or pool
+            if not self._recommended_ids:
+                k = self._task_config["recommend_k"]
+                if self._last_ranking:
+                    self._recommended_ids = self._last_ranking[:k]
+                else:
+                    pool_ids = [it["id"] for it in self._current_pool]
+                    self._recommended_ids = pool_ids[:k]
+                # Compute final episode score
+                reward = self._compute_final_score()
+                feedback = ActionFeedback(
+                    relevance_score=reward,
+                    explanation="Episode ended (max steps). Auto-recommended from best available ranking.",
+                )
+        return self._make_observation(reward=reward, done=done, feedback=feedback)
+    @property
+    def state(self) -> State:
+        return self._state
+    # =========================================================================
+    # Action Handlers
+    # =========================================================================
+    def _handle_filter(self, action: CuratorAction) -> tuple[float, ActionFeedback]:
+        """Remove items from the pool. Reward for removing low-relevance items."""
+        valid_ids = [iid for iid in action.item_ids if iid in self._items_by_id]
+        if not valid_ids:
+            return 0.0, ActionFeedback(explanation="No valid items to filter.")
+        # Remove from pool
+        for iid in valid_ids:
+            self._items_by_id.pop(iid, None)
+            self._filtered_ids.append(iid)
+        self._current_pool = [
+            it for it in self._current_pool if it["id"] in self._items_by_id
+        ]
+        self._items_filtered_count += len(valid_ids)
+        # Score: reward for removing low-relevance items
+        quality = grader.filter_quality(valid_ids, self._relevance)
+        return quality, ActionFeedback(
+            relevance_score=quality,
+            explanation=f"Filtered {len(valid_ids)} items. Quality={quality:.3f}",
+        )
+    def _handle_categorize(
+        self, action: CuratorAction
+    ) -> tuple[float, ActionFeedback]:
+        """Categorize items. Reward for matching relevance-derived categories."""
+        if not action.categories:
+            return 0.0, ActionFeedback(explanation="No categories provided.")
+        valid_cats = {
+            iid: cat
+            for iid, cat in action.categories.items()
+            if iid in self._items_by_id
+        }
+        if not valid_cats:
+            return 0.0, ActionFeedback(explanation="No valid items to categorize.")
+        self._categories.update(valid_cats)
+        self._items_categorized_count += len(valid_cats)
+        quality = grader.categorize_quality(valid_cats, self._relevance)
+        return quality, ActionFeedback(
+            relevance_score=quality,
+            explanation=f"Categorized {len(valid_cats)} items. Accuracy={quality:.3f}",
+        )
+    def _handle_rank(self, action: CuratorAction) -> tuple[float, ActionFeedback]:
+        """Rank items by priority. Reward based on NDCG."""
+        rankings = action.rankings or action.item_ids
+        if not rankings:
+            return 0.0, ActionFeedback(explanation="No ranking provided.")
+        valid_ranking = [iid for iid in rankings if iid in self._items_by_id]
+        self._last_ranking = valid_ranking
+        k = self._task_config["recommend_k"]
+        quality = grader.ndcg_at_k(valid_ranking, self._relevance, k)
+        # Also compute coverage
+        coverage = grader.source_diversity(valid_ranking[:k], self._items_by_id)
+        return quality, ActionFeedback(
+            relevance_score=quality,
+            coverage_score=coverage,
+            explanation=f"Ranked {len(valid_ranking)} items. NDCG@{k}={quality:.3f}",
+        )
+    def _handle_recommend(
+        self, action: CuratorAction
+    ) -> tuple[float, ActionFeedback]:
+        """Final recommendation. Triggers episode end with composite score."""
+        rec_ids = action.item_ids
+        k = self._task_config["recommend_k"]
+        if not rec_ids:
+            # Fall back to last ranking
+            if self._last_ranking:
+                rec_ids = self._last_ranking[:k]
+            else:
+                return 0.0, ActionFeedback(
+                    explanation="No items recommended and no prior ranking."
+                )
+        self._recommended_ids = rec_ids[:k]
+        score = self._compute_final_score()
+        return score, ActionFeedback(
+            relevance_score=score,
+            coverage_score=grader.source_diversity(
+                self._recommended_ids, self._items_by_id
+            ),
+            explanation=f"Final recommendation of {len(self._recommended_ids)} items. Score={score:.3f}",
+        )
+    # =========================================================================
+    # Scoring
+    # =========================================================================
+    def _compute_final_score(self) -> float:
+        """Compute composite episode score."""
+        return grader.grade_episode(
+            recommended_ids=self._recommended_ids,
+            ranked_ids=self._last_ranking if self._last_ranking else None,
+            categories=self._categories if self._categories else None,
+            relevance_scores=self._relevance,
+            items_by_id=self._items_by_id,
+            recommend_k=self._task_config["recommend_k"],
+        )
+    # =========================================================================
+    # Observation Builder
+    # =========================================================================
+    def _make_observation(
+        self,
+        reward: float,
+        done: bool,
+        feedback: Optional[ActionFeedback] = None,
+    ) -> CuratorObservation:
+        items = [ContentItem(**it) for it in self._current_pool]
+        profile = UserProfile(**self._profile) if self._profile else None
+        task_info = None
+        if self._task_config:
+            task_info = TaskInfo(
+                task_id=self._task_config["task_id"],
+                difficulty=self._task_config["difficulty"],
+                max_steps=self._task_config["max_steps"],
+                recommend_k=self._task_config["recommend_k"],
+                pool_size=len(self._current_pool),
+                items_filtered=self._items_filtered_count,
+                items_categorized=self._items_categorized_count,
+                step_number=self._state.step_count,
+            )
+        return CuratorObservation(
+            items=items,
+            user_profile=profile,
+            feedback=feedback,
+            task_info=task_info,
+            done=done,
+            reward=reward,
+            metadata={
+                "episode_id": self._state.episode_id,
+                "step": self._state.step_count,
+            },
+        )

server/grader.py ADDED Viewed

	@@ -0,0 +1,217 @@

+"""
+Grading module for Curator environment.
+Implements standard Information Retrieval metrics for deterministic,
+reproducible scoring of agent performance (0.0-1.0).
+"""
+import math
+from typing import Dict, List, Optional
+def dcg_at_k(relevances: List[float], k: int) -> float:
+    """Compute Discounted Cumulative Gain at k."""
+    dcg = 0.0
+    for i, rel in enumerate(relevances[:k]):
+        dcg += rel / math.log2(i + 2)  # i+2 because log2(1) = 0
+    return dcg
+def ndcg_at_k(
+    ranked_ids: List[str],
+    relevance_scores: Dict[str, float],
+    k: int,
+) -> float:
+    """Compute Normalized Discounted Cumulative Gain at k.
+    Args:
+        ranked_ids: Agent's ranked list of item IDs (best first).
+        relevance_scores: Ground truth {item_id: relevance} scores.
+        k: Evaluate top-k items.
+    Returns:
+        NDCG score in [0, 1].
+    """
+    if not ranked_ids or not relevance_scores or k <= 0:
+        return 0.0
+    # Actual DCG from agent ranking
+    actual_rels = [relevance_scores.get(iid, 0.0) for iid in ranked_ids[:k]]
+    actual_dcg = dcg_at_k(actual_rels, k)
+    # Ideal DCG (sorted by relevance, descending)
+    ideal_rels = sorted(relevance_scores.values(), reverse=True)[:k]
+    ideal_dcg = dcg_at_k(ideal_rels, k)
+    if ideal_dcg == 0:
+        return 0.0
+    return actual_dcg / ideal_dcg
+def precision_at_k(
+    selected_ids: List[str],
+    relevance_scores: Dict[str, float],
+    k: int,
+    threshold: float = 0.5,
+) -> float:
+    """Compute Precision at k.
+    Args:
+        selected_ids: Agent's selected item IDs.
+        relevance_scores: Ground truth {item_id: relevance} scores.
+        k: Evaluate top-k items.
+        threshold: Minimum relevance to count as "relevant".
+    Returns:
+        Precision score in [0, 1].
+    """
+    if not selected_ids or k <= 0:
+        return 0.0
+    top_k = selected_ids[:k]
+    relevant_count = sum(
+        1 for iid in top_k if relevance_scores.get(iid, 0.0) >= threshold
+    )
+    return relevant_count / min(k, len(top_k))
+def recall_at_k(
+    selected_ids: List[str],
+    relevance_scores: Dict[str, float],
+    k: int,
+    threshold: float = 0.5,
+) -> float:
+    """Compute Recall at k.
+    Args:
+        selected_ids: Agent's selected item IDs.
+        relevance_scores: Ground truth {item_id: relevance} scores.
+        k: Evaluate top-k items.
+        threshold: Minimum relevance to count as "relevant".
+    Returns:
+        Recall score in [0, 1].
+    """
+    total_relevant = sum(1 for v in relevance_scores.values() if v >= threshold)
+    if total_relevant == 0:
+        return 1.0  # No relevant items to find
+    top_k = selected_ids[:k]
+    found_relevant = sum(
+        1 for iid in top_k if relevance_scores.get(iid, 0.0) >= threshold
+    )
+    return found_relevant / total_relevant
+def source_diversity(selected_ids: List[str], items_by_id: Dict[str, dict]) -> float:
+    """Compute source diversity of selected items.
+    Returns:
+        Diversity score in [0, 1] based on unique source coverage.
+    """
+    if not selected_ids:
+        return 0.0
+    all_sources = set(it.get("source", "") for it in items_by_id.values())
+    selected_sources = set(
+        items_by_id[iid].get("source", "") for iid in selected_ids if iid in items_by_id
+    )
+    if not all_sources:
+        return 0.0
+    return len(selected_sources) / len(all_sources)
+def filter_quality(
+    removed_ids: List[str],
+    relevance_scores: Dict[str, float],
+) -> float:
+    """Score a filter action: reward for removing low-relevance items.
+    Returns:
+        Score in [0, 1]. Higher is better (removed less relevant items).
+    """
+    if not removed_ids:
+        return 0.0
+    avg_relevance_of_removed = sum(
+        relevance_scores.get(iid, 0.5) for iid in removed_ids
+    ) / len(removed_ids)
+    # Good filtering removes low-relevance items
+    return max(0.0, min(1.0, 1.0 - avg_relevance_of_removed))
+def categorize_quality(
+    agent_categories: Dict[str, str],
+    relevance_scores: Dict[str, float],
+    threshold_urgent: float = 0.7,
+    threshold_read: float = 0.4,
+) -> float:
+    """Score categorization accuracy against relevance-derived ground truth.
+    Ground truth categories derived from relevance:
+        >= threshold_urgent → "urgent"
+        >= threshold_read   → "read_later"
+        < threshold_read    → "skip"
+        (any relevance can be "share" — not penalized)
+    Returns:
+        Accuracy score in [0, 1].
+    """
+    if not agent_categories:
+        return 0.0
+    correct = 0
+    total = len(agent_categories)
+    for item_id, agent_cat in agent_categories.items():
+        rel = relevance_scores.get(item_id, 0.0)
+        # Derive expected category
+        if rel >= threshold_urgent:
+            expected = {"urgent", "share"}
+        elif rel >= threshold_read:
+            expected = {"read_later", "share"}
+        else:
+            expected = {"skip"}
+        if agent_cat in expected:
+            correct += 1
+    return correct / total
+def grade_episode(
+    recommended_ids: List[str],
+    ranked_ids: Optional[List[str]],
+    categories: Optional[Dict[str, str]],
+    relevance_scores: Dict[str, float],
+    items_by_id: Dict[str, dict],
+    recommend_k: int,
+) -> float:
+    """Compute final episode score (0-1).
+    Composite:
+        0.35 * NDCG@k
+        0.25 * Precision@k
+        0.20 * Recall@k
+        0.10 * Category accuracy
+        0.10 * Source diversity
+    """
+    # Use recommended_ids as ranking if no explicit ranking
+    ranking = ranked_ids if ranked_ids else recommended_ids
+    ndcg = ndcg_at_k(ranking, relevance_scores, recommend_k)
+    precision = precision_at_k(recommended_ids, relevance_scores, recommend_k)
+    recall = recall_at_k(recommended_ids, relevance_scores, recommend_k)
+    cat_acc = categorize_quality(categories, relevance_scores) if categories else 0.0
+    diversity = source_diversity(recommended_ids, items_by_id)
+    score = (
+        0.35 * ndcg
+        + 0.25 * precision
+        + 0.20 * recall
+        + 0.10 * cat_acc
+        + 0.10 * diversity
+    )
+    return max(0.0, min(1.0, score))

server/requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+openenv[core]>=0.2.0
+fastapi>=0.115.0
+uvicorn>=0.24.0

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff