kgdrathan commited on
Commit
d33e9ca
·
0 Parent(s):

initial commit

Browse files
.gitignore ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ # Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ # poetry.lock
109
+ # poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ # pdm.lock
116
+ # pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ # pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # Redis
135
+ *.rdb
136
+ *.aof
137
+ *.pid
138
+
139
+ # RabbitMQ
140
+ mnesia/
141
+ rabbitmq/
142
+ rabbitmq-data/
143
+
144
+ # ActiveMQ
145
+ activemq-data/
146
+
147
+ # SageMath parsed files
148
+ *.sage.py
149
+
150
+ # Environments
151
+ .env
152
+ .envrc
153
+ .venv
154
+ env/
155
+ venv/
156
+ ENV/
157
+ env.bak/
158
+ venv.bak/
159
+
160
+ # Spyder project settings
161
+ .spyderproject
162
+ .spyproject
163
+
164
+ # Rope project settings
165
+ .ropeproject
166
+
167
+ # mkdocs documentation
168
+ /site
169
+
170
+ # mypy
171
+ .mypy_cache/
172
+ .dmypy.json
173
+ dmypy.json
174
+
175
+ # Pyre type checker
176
+ .pyre/
177
+
178
+ # pytype static type analyzer
179
+ .pytype/
180
+
181
+ # Cython debug symbols
182
+ cython_debug/
183
+
184
+ # PyCharm
185
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
186
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
187
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
188
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
189
+ # .idea/
190
+
191
+ # Abstra
192
+ # Abstra is an AI-powered process automation framework.
193
+ # Ignore directories containing user credentials, local state, and settings.
194
+ # Learn more at https://abstra.io/docs
195
+ .abstra/
196
+
197
+ # Visual Studio Code
198
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
199
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
200
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
201
+ # you could uncomment the following to ignore the entire vscode folder
202
+ # .vscode/
203
+
204
+ # Ruff stuff:
205
+ .ruff_cache/
206
+
207
+ # PyPI configuration file
208
+ .pypirc
209
+
210
+ # Marimo
211
+ marimo/_static/
212
+ marimo/_lsp/
213
+ __marimo__/
214
+
215
+ # Streamlit
216
+ .streamlit/secrets.toml
CLAUDE.md ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ OpenEnv Curator is a personalized content curation RL environment where agents filter, categorize, rank, and recommend content items from multiple sources (Hacker News, arXiv, DEV.to, Reddit) based on user preference profiles. Built on the OpenEnv framework, deployed as a FastAPI server on Hugging Face Spaces via Docker.
8
+
9
+ ## Common Commands
10
+
11
+ ```bash
12
+ # Install dependencies (uses uv package manager)
13
+ uv sync
14
+
15
+ # Run server locally
16
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
17
+ # Or via entry point:
18
+ uv run server
19
+
20
+ # Fetch/regenerate data files (one-time)
21
+ python scripts/fetch_data.py
22
+
23
+ # Run inference agent
24
+ CURATOR_TASK=easy HF_TOKEN=<token> python inference.py
25
+
26
+ # Docker build & run
27
+ docker build -f server/Dockerfile -t curator:latest .
28
+ docker run -p 8000:8000 curator:latest
29
+
30
+ # Pre-submission validation
31
+ bash pre-validation.sh https://your-space.hf.space
32
+
33
+ # Lint
34
+ uv run ruff check .
35
+ ```
36
+
37
+ ## Architecture
38
+
39
+ The system follows a client-server pattern using the OpenEnv framework:
40
+
41
+ - **Client** (`client.py`, `models.py`): `CuratorEnv` is an async HTTP client (extends OpenEnv `EnvClient`). `CuratorAction` and `CuratorObservation` are Pydantic models extending OpenEnv base types. The package exports these three from `__init__.py`.
42
+
43
+ - **Server** (`server/app.py`): Uses `openenv.core.env_server.http_server.create_app()` to generate FastAPI endpoints (`/reset`, `/step`, `/state`, `/schema`, `/ws`). Max 4 concurrent environments.
44
+
45
+ - **Environment** (`server/curator_environment.py`): Implements the OpenEnv `Environment` interface. Manages episode state (item pool, relevance scores, categories, rankings). Four action handlers: `_handle_filter` (remove low-relevance items), `_handle_categorize` (tag as urgent/read_later/share/skip), `_handle_rank` (order by relevance), `_handle_recommend` (final selection, ends episode).
46
+
47
+ - **Grader** (`server/grader.py`): Deterministic IR metrics. `grade_episode()` computes composite score: 0.35×NDCG@k + 0.25×Precision@k + 0.20×Recall@k + 0.10×CategoryAccuracy + 0.10×SourceDiversity.
48
+
49
+ - **Data** (`data/`): Static JSON files — `items.json` (real content from 4 sources), `tasks.json` (easy/medium/hard with embedded user profiles), `ground_truth.json` (relevance scores per task). Generated by `scripts/fetch_data.py`.
50
+
51
+ - **Inference** (`inference.py`): Example LLM agent using OpenAI SDK. Parses JSON actions from model output, logs in structured `[START]/[STEP]/[END]` format.
52
+
53
+ ## Task Difficulty Levels
54
+
55
+ | Task | Pool Size | Sources | Max Steps | Recommend K |
56
+ |--------|-----------|----------------|-----------|-------------|
57
+ | easy | 20 | HN only | 10 | 5 |
58
+ | medium | 50 | HN+arXiv+DEV | 20 | 10 |
59
+ | hard | 100 | All 4 sources | 30 | 15 |
60
+
61
+ ## Package Layout
62
+
63
+ The package is named `curator` with subpackage `curator.server`. The `package-dir` mapping in `pyproject.toml` maps the repo root to `curator` and `server/` to `curator.server`. This means imports use `from curator import ...` or `from curator.server import ...`, but on-disk the files are at the repo root.
README.md ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Curator Environment
3
+ emoji: 📰
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ pinned: false
8
+ app_port: 8000
9
+ base_path: /web
10
+ tags:
11
+ - OpenEnv
12
+ - RL
13
+ ---
14
+
15
+ # Curator — Personalized Content Curation Environment
16
+
17
+ An OpenEnv environment where an agent must curate a pool of real content items (from Hacker News, arXiv, DEV.to, Reddit) and curate a personalized reading list based on a user's preference profile.
18
+
19
+ ## Goal
20
+
21
+ Every knowledge worker drowns in information — hundreds of articles, papers, and posts across dozens of sources daily. Given a user profile and a content pool, the agent must intelligently **filter**, **categorize**, **rank**, and **recommend** the most relevant items. Scored using standard Information Retrieval metrics (NDCG, precision, recall).
22
+
23
+ ## Action Space
24
+
25
+ | Action | Fields | Description |
26
+ |--------|--------|-------------|
27
+ | `filter` | `item_ids: List[str]` | Remove irrelevant items from the pool |
28
+ | `categorize` | `categories: Dict[str, "urgent"\|"read_later"\|"share"\|"skip"]` | Tag items by priority |
29
+ | `rank` | `rankings: List[str]` | Order items by relevance (best first) |
30
+ | `recommend` | `item_ids: List[str]` | Final recommendation (ends episode) |
31
+
32
+ ## Observation Space
33
+
34
+ Each observation includes:
35
+
36
+ - **items** — current pool of content items (`id`, `source`, `title`, `summary`, `tags`, `score`, `reading_time_mins`, `content_type`)
37
+ - **user_profile** — interests (topic weights 0-1), preferred sources, skill level, time budget, read history
38
+ - **feedback** — per-step scores (relevance, coverage) from the last action
39
+ - **task_info** — difficulty, max steps, progress counters
40
+
41
+ ## Tasks
42
+
43
+ | Task | Pool Size | Sources | Max Steps | Recommend K | Description |
44
+ |------|-----------|---------|-----------|-------------|-------------|
45
+ | **easy** | 20 | Hacker News | 10 | 5 | Clear AI/ML interests, single source |
46
+ | **medium** | 50 | HN + arXiv + DEV.to | 20 | 10 | Broad interests, 3 sources, some already-read items |
47
+ | **hard** | 100 | All 4 sources | 30 | 15 | Minimal preferences, must infer interests from feedback |
48
+
49
+ Each task includes an embedded user profile that defines what "relevant" means for scoring.
50
+
51
+ ## Scoring
52
+
53
+ **Per-step rewards** (0-1):
54
+ - **filter**: higher reward for removing low-relevance items
55
+ - **categorize**: accuracy against relevance-derived ground truth
56
+ - **rank**: NDCG@k against ground truth relevance
57
+ - **recommend**: composite final episode score
58
+
59
+ **Final episode score** (deterministic, 0-1):
60
+
61
+ ```
62
+ score = 0.35 * NDCG@k + 0.25 * Precision@k + 0.20 * Recall@k + 0.10 * Category accuracy + 0.10 * Source diversity
63
+ ```
64
+
65
+ ## Data
66
+
67
+ All content is real data fetched from free public APIs (no auth needed), cached as static JSON — no API calls at runtime:
68
+
69
+ - **Hacker News** — top stories via Firebase API
70
+ - **arXiv** — recent AI/ML/NLP papers
71
+ - **DEV.to** — programming articles and tutorials
72
+ - **Reddit** — posts from r/programming, r/machinelearning, r/webdev
__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """Curator Environment — Personalized Content Curation."""
2
+
3
+ from .client import CuratorEnv
4
+ from .models import CuratorAction, CuratorObservation
5
+
6
+ __all__ = [
7
+ "CuratorAction",
8
+ "CuratorObservation",
9
+ "CuratorEnv",
10
+ ]
client.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Curator Environment Client."""
2
+
3
+ from typing import Dict
4
+
5
+ from openenv.core import EnvClient
6
+ from openenv.core.client_types import StepResult
7
+ from openenv.core.env_server.types import State
8
+
9
+ from .models import (
10
+ ActionFeedback,
11
+ ContentItem,
12
+ CuratorAction,
13
+ CuratorObservation,
14
+ TaskInfo,
15
+ UserProfile,
16
+ )
17
+
18
+
19
+ class CuratorEnv(EnvClient[CuratorAction, CuratorObservation, State]):
20
+ """
21
+ Client for the Curator Environment.
22
+
23
+ Example:
24
+ >>> async with CuratorEnv(base_url="http://localhost:8000") as client:
25
+ ... result = await client.reset(task_id="easy")
26
+ ... print(len(result.observation.items))
27
+ ...
28
+ ... result = await client.step(CuratorAction(
29
+ ... action_type="rank",
30
+ ... rankings=["hn_123", "hn_456"]
31
+ ... ))
32
+
33
+ Example with Docker:
34
+ >>> client = await CuratorEnv.from_docker_image("curator:latest")
35
+ """
36
+
37
+ def _step_payload(self, action: CuratorAction) -> Dict:
38
+ """Convert CuratorAction to JSON payload."""
39
+ payload = {"action_type": action.action_type}
40
+
41
+ if action.item_ids:
42
+ payload["item_ids"] = action.item_ids
43
+ if action.categories is not None:
44
+ payload["categories"] = action.categories
45
+ if action.rankings is not None:
46
+ payload["rankings"] = action.rankings
47
+ if action.reasoning is not None:
48
+ payload["reasoning"] = action.reasoning
49
+ if action.metadata:
50
+ payload["metadata"] = action.metadata
51
+
52
+ return payload
53
+
54
+ def _parse_result(self, payload: Dict) -> StepResult[CuratorObservation]:
55
+ """Parse server response into StepResult[CuratorObservation]."""
56
+ obs_data = payload.get("observation", {})
57
+
58
+ # Parse nested models
59
+ items = [ContentItem(**it) for it in obs_data.get("items", [])]
60
+
61
+ user_profile = None
62
+ if obs_data.get("user_profile"):
63
+ user_profile = UserProfile(**obs_data["user_profile"])
64
+
65
+ feedback = None
66
+ if obs_data.get("feedback"):
67
+ feedback = ActionFeedback(**obs_data["feedback"])
68
+
69
+ task_info = None
70
+ if obs_data.get("task_info"):
71
+ task_info = TaskInfo(**obs_data["task_info"])
72
+
73
+ observation = CuratorObservation(
74
+ items=items,
75
+ user_profile=user_profile,
76
+ feedback=feedback,
77
+ task_info=task_info,
78
+ done=payload.get("done", False),
79
+ reward=payload.get("reward"),
80
+ metadata=obs_data.get("metadata", {}),
81
+ )
82
+
83
+ return StepResult(
84
+ observation=observation,
85
+ reward=payload.get("reward"),
86
+ done=payload.get("done", False),
87
+ )
88
+
89
+ def _parse_state(self, payload: Dict) -> State:
90
+ """Parse server response into State."""
91
+ return State(
92
+ episode_id=payload.get("episode_id"),
93
+ step_count=payload.get("step_count", 0),
94
+ )
data/ground_truth.json ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "easy": {
3
+ "hn_47672818": 0.49,
4
+ "hn_47673005": 0.3843,
5
+ "hn_47673541": 0.4808,
6
+ "hn_47673360": 0.3535,
7
+ "hn_47672295": 0.4338,
8
+ "hn_47672318": 0.5117,
9
+ "hn_47672884": 0.3678,
10
+ "hn_47614528": 0.4893,
11
+ "hn_47672778": 0.355,
12
+ "hn_47641472": 0.3595,
13
+ "hn_47627217": 0.3513,
14
+ "hn_47666024": 0.49,
15
+ "hn_47673072": 0.3738,
16
+ "hn_47659135": 0.49,
17
+ "hn_47660925": 0.49,
18
+ "hn_47626242": 0.3888,
19
+ "hn_47669337": 0.4488,
20
+ "hn_47663147": 0.49,
21
+ "hn_47662234": 0.49,
22
+ "hn_47667321": 0.4825
23
+ },
24
+ "medium": {
25
+ "hn_47672818": 0.075,
26
+ "hn_47673005": 0.3693,
27
+ "hn_47673541": 0.0,
28
+ "hn_47673360": 0.3385,
29
+ "hn_47672295": 0.0188,
30
+ "hn_47672318": 0.4465,
31
+ "hn_47672884": 0.3528,
32
+ "hn_47614528": 0.4743,
33
+ "hn_47672778": 0.3921,
34
+ "hn_47641472": 0.3445,
35
+ "hn_47627217": 0.3675,
36
+ "hn_47666024": 0.5375,
37
+ "hn_47673072": 0.3588,
38
+ "hn_47659135": 0.475,
39
+ "hn_47660925": 0.475,
40
+ "hn_47626242": 0.3738,
41
+ "hn_47669337": 0.4338,
42
+ "hn_47663147": 0.475,
43
+ "hn_47662234": 0.5271,
44
+ "hn_47667321": 0.4675,
45
+ "hn_47638498": 0.3393,
46
+ "hn_47641528": 0.4393,
47
+ "hn_47660954": 0.475,
48
+ "hn_47662945": 0.475,
49
+ "hn_47662596": 0.5687,
50
+ "hn_47660286": 0.475,
51
+ "hn_47660853": 0.475,
52
+ "hn_47636579": 0.458,
53
+ "hn_47637010": 0.475,
54
+ "hn_47672268": 0.4225,
55
+ "hn_47667672": 0.5005,
56
+ "hn_47662116": 0.4547,
57
+ "hn_47667717": 0.475,
58
+ "hn_47627998": 0.4495,
59
+ "hn_47637828": 0.4655,
60
+ "hn_47669749": 0.3858,
61
+ "hn_47673576": 0.3273,
62
+ "hn_47664186": 0.5427,
63
+ "hn_47642125": 0.404,
64
+ "hn_47661065": 0.5238,
65
+ "hn_47671527": 0.3723,
66
+ "hn_47664912": 0.475,
67
+ "hn_47665685": 0.3678,
68
+ "hn_47647397": 0.4039,
69
+ "hn_47665245": 0.373,
70
+ "hn_47668727": 0.5464,
71
+ "hn_47627361": 0.3445,
72
+ "hn_47673208": 0.328,
73
+ "hn_47665207": 0.3633,
74
+ "hn_47673182": 0.328
75
+ },
76
+ "hard": {
77
+ "hn_47672818": 0.1,
78
+ "hn_47673005": 0.3943,
79
+ "hn_47673541": 0.362,
80
+ "hn_47673360": 0.0,
81
+ "hn_47672295": 0.4438,
82
+ "hn_47672318": 0.3777,
83
+ "hn_47672884": 0.0,
84
+ "hn_47614528": 0.4993,
85
+ "hn_47672778": 0.365,
86
+ "hn_47641472": 0.0,
87
+ "hn_47627217": 0.3613,
88
+ "hn_47666024": 0.7222,
89
+ "hn_47673072": 0.3837,
90
+ "hn_47659135": 0.7778,
91
+ "hn_47660925": 0.5,
92
+ "hn_47626242": 0.3987,
93
+ "hn_47669337": 0.4588,
94
+ "hn_47663147": 0.5,
95
+ "hn_47662234": 0.5,
96
+ "hn_47667321": 0.4925,
97
+ "hn_47638498": 0.3643,
98
+ "hn_47641528": 0.624,
99
+ "hn_47660954": 0.5,
100
+ "hn_47662945": 0.5,
101
+ "hn_47662596": 0.5,
102
+ "hn_47660286": 0.5,
103
+ "hn_47660853": 0.5,
104
+ "hn_47636579": 0.6427,
105
+ "hn_47637010": 0.5,
106
+ "hn_47672268": 0.3538,
107
+ "hn_47667672": 0.4318,
108
+ "hn_47662116": 0.4797,
109
+ "hn_47667717": 0.5,
110
+ "hn_47627998": 0.4745,
111
+ "hn_47637828": 0.6502,
112
+ "hn_47669749": 0.4108,
113
+ "hn_47673576": 0.3523,
114
+ "hn_47664186": 0.5,
115
+ "hn_47642125": 0.5887,
116
+ "hn_47661065": 0.455,
117
+ "hn_47671527": 0.3973,
118
+ "hn_47664912": 0.5,
119
+ "hn_47665685": 0.3927,
120
+ "hn_47647397": 0.356,
121
+ "hn_47665245": 0.398,
122
+ "hn_47668727": 0.4985,
123
+ "hn_47627361": 0.3695,
124
+ "hn_47673208": 0.353,
125
+ "hn_47665207": 0.3883,
126
+ "hn_47673182": 0.353,
127
+ "hn_47636937": 0.4078,
128
+ "hn_47658146": 0.5,
129
+ "hn_47664205": 0.7222,
130
+ "hn_47655408": 0.5,
131
+ "hn_47637116": 0.4715,
132
+ "hn_47636456": 0.497,
133
+ "hn_47651703": 0.7222,
134
+ "hn_47652007": 0.5,
135
+ "arxiv_2604_04932v1": 0.285,
136
+ "arxiv_2604_04930v1": 0.5072,
137
+ "arxiv_2604_04924v1": 0.285,
138
+ "arxiv_2604_04923v1": 0.5072,
139
+ "arxiv_2604_04921v1": 0.5072,
140
+ "arxiv_2604_04920v1": 0.285,
141
+ "arxiv_2604_04917v1": 0.5072,
142
+ "arxiv_2604_04916v1": 0.5072,
143
+ "arxiv_2604_04914v1": 0.5072,
144
+ "arxiv_2604_04908v1": 0.5072,
145
+ "arxiv_2604_04906v1": 0.285,
146
+ "arxiv_2604_04902v1": 0.5072,
147
+ "arxiv_2604_04901v1": 0.5072,
148
+ "arxiv_2604_04898v1": 0.5072,
149
+ "arxiv_2604_04895v1": 0.5072,
150
+ "arxiv_2604_04894v1": 0.285,
151
+ "arxiv_2604_04892v1": 0.5072,
152
+ "arxiv_2604_04891v1": 0.285,
153
+ "arxiv_2604_04878v1": 0.285,
154
+ "arxiv_2604_04876v1": 0.5072,
155
+ "arxiv_2604_04875v1": 0.5072,
156
+ "arxiv_2604_04872v1": 0.285,
157
+ "arxiv_2604_04869v1": 0.5072,
158
+ "arxiv_2604_04868v1": 0.5072,
159
+ "arxiv_2604_04858v1": 0.785,
160
+ "arxiv_2604_04855v1": 0.5072,
161
+ "arxiv_2604_04853v1": 0.285,
162
+ "arxiv_2604_04852v1": 0.285,
163
+ "arxiv_2604_04847v1": 0.5072,
164
+ "arxiv_2604_04843v1": 0.5072,
165
+ "arxiv_2604_04842v1": 0.5072,
166
+ "arxiv_2604_04839v1": 0.285,
167
+ "arxiv_2604_04829v1": 0.5072,
168
+ "arxiv_2604_04828v1": 0.285,
169
+ "arxiv_2604_04825v1": 0.285,
170
+ "arxiv_2604_04820v1": 0.285,
171
+ "arxiv_2604_04815v1": 0.285,
172
+ "arxiv_2604_04808v1": 0.5072,
173
+ "arxiv_2604_04804v1": 0.285,
174
+ "arxiv_2604_04802v1": 0.5072,
175
+ "arxiv_2604_04800v1": 0.285,
176
+ "arxiv_2604_04797v1": 0.5072
177
+ }
178
+ }
data/items.json ADDED
The diff for this file is too large to render. See raw diff
 
data/tasks.json ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "task_id": "easy",
4
+ "difficulty": "easy",
5
+ "item_count": 20,
6
+ "max_steps": 10,
7
+ "sources": [
8
+ "hackernews"
9
+ ],
10
+ "recommend_k": 5,
11
+ "description": "Curate 5 top articles from 20 Hacker News stories for an AI/ML enthusiast.",
12
+ "profile": {
13
+ "interests": {
14
+ "ai": 0.95,
15
+ "nlp": 0.85,
16
+ "python": 0.8,
17
+ "data": 0.7
18
+ },
19
+ "preferred_sources": [
20
+ "hackernews",
21
+ "arxiv"
22
+ ],
23
+ "time_budget_mins": 120,
24
+ "read_history": [],
25
+ "skill_level": "intermediate"
26
+ }
27
+ },
28
+ {
29
+ "task_id": "medium",
30
+ "difficulty": "medium",
31
+ "item_count": 50,
32
+ "max_steps": 20,
33
+ "sources": [
34
+ "hackernews",
35
+ "devto",
36
+ "arxiv"
37
+ ],
38
+ "recommend_k": 10,
39
+ "description": "Curate 10 items from 50 across HN, DEV.to, and arXiv for a senior engineer with broad interests.",
40
+ "profile": {
41
+ "interests": {
42
+ "ai": 0.9,
43
+ "web": 0.7,
44
+ "systems": 0.6,
45
+ "security": 0.5,
46
+ "python": 0.75,
47
+ "cloud": 0.4,
48
+ "open-source": 0.65,
49
+ "startup": 0.3
50
+ },
51
+ "preferred_sources": [
52
+ "hackernews",
53
+ "devto"
54
+ ],
55
+ "time_budget_mins": 60,
56
+ "read_history": [
57
+ "hn_47672818",
58
+ "hn_47673541",
59
+ "hn_47672295"
60
+ ],
61
+ "skill_level": "expert"
62
+ }
63
+ },
64
+ {
65
+ "task_id": "hard",
66
+ "difficulty": "hard",
67
+ "item_count": 100,
68
+ "max_steps": 30,
69
+ "sources": [
70
+ "hackernews",
71
+ "devto",
72
+ "arxiv",
73
+ "reddit"
74
+ ],
75
+ "recommend_k": 15,
76
+ "description": "Curate 15 items from 100 across all sources for a beginner with minimal stated preferences. Must infer interests from feedback.",
77
+ "profile": {
78
+ "interests": {
79
+ "rust": 0.5,
80
+ "systems": 0.4
81
+ },
82
+ "preferred_sources": [],
83
+ "time_budget_mins": 30,
84
+ "read_history": [
85
+ "hn_47672818",
86
+ "hn_47673360",
87
+ "hn_47672884",
88
+ "hn_47641472"
89
+ ],
90
+ "skill_level": "beginner"
91
+ }
92
+ }
93
+ ]
inference.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Inference Script for Curator Environment
3
+ ============================================
4
+
5
+ Environment Variables:
6
+ API_BASE_URL The API endpoint for the LLM.
7
+ MODEL_NAME The model identifier to use for inference.
8
+ HF_TOKEN Your Hugging Face / API key.
9
+ IMAGE_NAME Docker image name for the environment.
10
+ CURATOR_TASK Task difficulty: "easy", "medium", or "hard" (default: "easy").
11
+
12
+ STDOUT FORMAT:
13
+ [START] task=<task_name> env=curator model=<model_name>
14
+ [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
15
+ [END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...,rn>
16
+ """
17
+
18
+ import asyncio
19
+ import json
20
+ import os
21
+ import textwrap
22
+ from typing import Any, Dict, List, Optional
23
+
24
+ from openai import OpenAI
25
+
26
+ from client import CuratorEnv
27
+ from models import CuratorAction
28
+
29
+ IMAGE_NAME = os.getenv("IMAGE_NAME")
30
+ API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
31
+
32
+ API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
33
+ MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
34
+ TASK_NAME = os.getenv("CURATOR_TASK", "easy")
35
+ BENCHMARK = "curator"
36
+ TEMPERATURE = 0.3
37
+ MAX_TOKENS = 2000
38
+ SUCCESS_SCORE_THRESHOLD = 0.3
39
+
40
+ SYSTEM_PROMPT = textwrap.dedent("""
41
+ You are a content curation agent. You help users find the most relevant
42
+ articles from a pool of content items based on their interest profile.
43
+
44
+ Available actions (respond with valid JSON):
45
+
46
+ 1. Filter (remove irrelevant items):
47
+ {"action_type": "filter", "item_ids": ["id1", "id2", ...]}
48
+
49
+ 2. Categorize items:
50
+ {"action_type": "categorize", "categories": {"id1": "urgent", "id2": "skip", ...}}
51
+ Categories: "urgent", "read_later", "share", "skip"
52
+
53
+ 3. Rank items by relevance:
54
+ {"action_type": "rank", "rankings": ["best_id", "second_id", ...]}
55
+
56
+ 4. Final recommendation (ends episode):
57
+ {"action_type": "recommend", "item_ids": ["id1", "id2", ...]}
58
+
59
+ Strategy: First filter out clearly irrelevant items, then rank the remainder,
60
+ then recommend the top items.
61
+
62
+ IMPORTANT: Respond with ONLY a JSON object, no markdown or explanation.
63
+ """).strip()
64
+
65
+
66
+ def log_start(task: str, env: str, model: str) -> None:
67
+ print(f"[START] task={task} env={env} model={model}", flush=True)
68
+
69
+
70
+ def log_step(
71
+ step: int, action: str, reward: float, done: bool, error: Optional[str]
72
+ ) -> None:
73
+ error_val = error if error else "null"
74
+ done_val = str(done).lower()
75
+ print(
76
+ f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
77
+ flush=True,
78
+ )
79
+
80
+
81
+ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
82
+ rewards_str = ",".join(f"{r:.2f}" for r in rewards)
83
+ print(
84
+ f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}",
85
+ flush=True,
86
+ )
87
+
88
+
89
+ def format_items_for_prompt(items: List[Dict], max_items: int = 30) -> str:
90
+ """Format content items into a compact string for the LLM prompt."""
91
+ lines = []
92
+ for item in items[:max_items]:
93
+ tags = ", ".join(item.get("tags", []))
94
+ lines.append(
95
+ f"- [{item['id']}] ({item['source']}) {item['title']} [tags: {tags}] [score: {item.get('score', 0)}]"
96
+ )
97
+ if len(items) > max_items:
98
+ lines.append(f" ... and {len(items) - max_items} more items")
99
+ return "\n".join(lines)
100
+
101
+
102
+ def format_profile_for_prompt(profile: Dict) -> str:
103
+ """Format user profile for the LLM prompt."""
104
+ interests = ", ".join(
105
+ f"{k}={v:.1f}"
106
+ for k, v in sorted(profile.get("interests", {}).items(), key=lambda x: -x[1])
107
+ )
108
+ sources = ", ".join(profile.get("preferred_sources", [])) or "no preference"
109
+ history = profile.get("read_history", [])
110
+ return (
111
+ f"Interests: {interests}\n"
112
+ f"Preferred sources: {sources}\n"
113
+ f"Skill level: {profile.get('skill_level', 'intermediate')}\n"
114
+ f"Time budget: {profile.get('time_budget_mins', 60)} mins\n"
115
+ f"Already read: {len(history)} items ({', '.join(history[:5])}{'...' if len(history) > 5 else ''})"
116
+ )
117
+
118
+
119
+ def build_user_prompt(obs: Any, step: int, last_feedback: Optional[str]) -> str:
120
+ """Build the user prompt from current observation."""
121
+ items = [
122
+ item.model_dump() if hasattr(item, "model_dump") else item for item in obs.items
123
+ ]
124
+ profile = (
125
+ obs.user_profile.model_dump()
126
+ if hasattr(obs.user_profile, "model_dump")
127
+ else obs.user_profile
128
+ )
129
+
130
+ ti = obs.task_info
131
+ prompt = f"""Step {step}/{ti.max_steps}. You must recommend {ti.recommend_k} items.
132
+ Pool: {ti.pool_size} items. Filtered so far: {ti.items_filtered}. Categorized: {ti.items_categorized}.
133
+
134
+ User Profile:
135
+ {format_profile_for_prompt(profile)}
136
+
137
+ Items in pool:
138
+ {format_items_for_prompt(items)}
139
+ """
140
+ if last_feedback:
141
+ prompt += f"\nLast action feedback: {last_feedback}\n"
142
+
143
+ if step >= ti.max_steps - 1:
144
+ prompt += (
145
+ "\nWARNING: This is your last step. You MUST use 'recommend' action now.\n"
146
+ )
147
+ elif step >= ti.max_steps - 2:
148
+ prompt += "\nOnly 2 steps left. Consider recommending soon.\n"
149
+
150
+ return prompt
151
+
152
+
153
+ def parse_action_from_response(text: str) -> Optional[Dict]:
154
+ """Parse a JSON action from LLM response text."""
155
+ text = text.strip()
156
+
157
+ # Try to extract JSON from markdown code blocks
158
+ if "```" in text:
159
+ parts = text.split("```")
160
+ for part in parts[1::2]:
161
+ part = part.strip()
162
+ if part.startswith("json"):
163
+ part = part[4:].strip()
164
+ try:
165
+ return json.loads(part)
166
+ except json.JSONDecodeError:
167
+ continue
168
+
169
+ # Try direct JSON parse
170
+ try:
171
+ return json.loads(text)
172
+ except json.JSONDecodeError:
173
+ pass
174
+
175
+ # Try to find JSON object in text
176
+ start = text.find("{")
177
+ end = text.rfind("}") + 1
178
+ if start >= 0 and end > start:
179
+ try:
180
+ return json.loads(text[start:end])
181
+ except json.JSONDecodeError:
182
+ pass
183
+
184
+ return None
185
+
186
+
187
+ def get_model_action(
188
+ client: OpenAI, obs: Any, step: int, last_feedback: Optional[str]
189
+ ) -> Dict:
190
+ """Get action from LLM."""
191
+ user_prompt = build_user_prompt(obs, step, last_feedback)
192
+
193
+ try:
194
+ completion = client.chat.completions.create(
195
+ model=MODEL_NAME,
196
+ messages=[
197
+ {"role": "system", "content": SYSTEM_PROMPT},
198
+ {"role": "user", "content": user_prompt},
199
+ ],
200
+ temperature=TEMPERATURE,
201
+ max_tokens=MAX_TOKENS,
202
+ stream=False,
203
+ )
204
+ text = (completion.choices[0].message.content or "").strip()
205
+ action = parse_action_from_response(text)
206
+ if action and "action_type" in action:
207
+ return action
208
+ except Exception as exc:
209
+ print(f"[DEBUG] Model request failed: {exc}", flush=True)
210
+
211
+ # Fallback: recommend first N items from pool
212
+ item_ids = [item.id if hasattr(item, "id") else item["id"] for item in obs.items]
213
+ k = obs.task_info.recommend_k if obs.task_info else 5
214
+ return {"action_type": "recommend", "item_ids": item_ids[:k]}
215
+
216
+
217
+ async def main() -> None:
218
+ llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
219
+
220
+ env = await CuratorEnv.from_docker_image(IMAGE_NAME)
221
+
222
+ rewards: List[float] = []
223
+ steps_taken = 0
224
+ score = 0.0
225
+ success = False
226
+ last_feedback: Optional[str] = None
227
+
228
+ log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
229
+
230
+ try:
231
+ result = await env.reset(task_id=TASK_NAME)
232
+ obs = result.observation
233
+
234
+ task_info = obs.task_info
235
+ max_steps = task_info.max_steps if task_info else 10
236
+
237
+ for step in range(1, max_steps + 1):
238
+ if result.done:
239
+ break
240
+
241
+ action_dict = get_model_action(llm_client, obs, step, last_feedback)
242
+ action = CuratorAction(**action_dict)
243
+
244
+ result = await env.step(action)
245
+ obs = result.observation
246
+
247
+ reward = result.reward or 0.0
248
+ done = result.done
249
+ error = None
250
+
251
+ rewards.append(reward)
252
+ steps_taken = step
253
+
254
+ # Summarize action for logging
255
+ action_summary = f"{action.action_type}({len(action.item_ids)}items)"
256
+ log_step(
257
+ step=step, action=action_summary, reward=reward, done=done, error=error
258
+ )
259
+
260
+ # Capture feedback for next prompt
261
+ if obs.feedback:
262
+ last_feedback = obs.feedback.explanation
263
+ else:
264
+ last_feedback = None
265
+
266
+ if done:
267
+ break
268
+
269
+ # Final score is the last reward (from recommend action)
270
+ score = rewards[-1] if rewards else 0.0
271
+ score = min(max(score, 0.0), 1.0)
272
+ success = score >= SUCCESS_SCORE_THRESHOLD
273
+
274
+ finally:
275
+ try:
276
+ await env.close()
277
+ except Exception as e:
278
+ print(f"[DEBUG] env.close() error: {e}", flush=True)
279
+ log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
280
+
281
+
282
+ if __name__ == "__main__":
283
+ asyncio.run(main())
models.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data models for the Curator Environment.
3
+
4
+ Curator is a personalized content curation environment where an agent
5
+ must filter, categorize, rank, and recommend content items from a mixed
6
+ pool of real articles across multiple sources.
7
+ """
8
+
9
+ from typing import Dict, List, Literal, Optional
10
+
11
+ from openenv.core.env_server.types import Action, Observation
12
+ from pydantic import BaseModel, Field
13
+
14
+ # =============================================================================
15
+ # Helper Models
16
+ # =============================================================================
17
+
18
+
19
+ class ContentItem(BaseModel):
20
+ """A single content item from any source."""
21
+
22
+ id: str = Field(..., description="Unique item identifier")
23
+ source: str = Field(
24
+ ..., description="Content source: hackernews, arxiv, devto, reddit"
25
+ )
26
+ title: str = Field(..., description="Item title")
27
+ summary: str = Field(default="", description="Brief summary or description")
28
+ tags: List[str] = Field(default_factory=list, description="Topic tags")
29
+ url: str = Field(default="", description="Original URL")
30
+ author: str = Field(default="", description="Author name")
31
+ score: int = Field(default=0, description="Community score/upvotes")
32
+ reading_time_mins: int = Field(default=5, description="Estimated reading time")
33
+ content_type: str = Field(
34
+ default="article",
35
+ description="Type: article, paper, discussion, job, tutorial, event",
36
+ )
37
+
38
+
39
+ class UserProfile(BaseModel):
40
+ """A user's preference profile for content curation."""
41
+
42
+ interests: Dict[str, float] = Field(
43
+ ..., description="Topic interest weights (0.0-1.0)"
44
+ )
45
+ preferred_sources: List[str] = Field(
46
+ default_factory=list, description="Preferred content sources"
47
+ )
48
+ time_budget_mins: int = Field(
49
+ default=60, description="Available reading time in minutes"
50
+ )
51
+ read_history: List[str] = Field(
52
+ default_factory=list, description="IDs of already-read items"
53
+ )
54
+ skill_level: str = Field(
55
+ default="intermediate",
56
+ description="User expertise: beginner, intermediate, expert",
57
+ )
58
+
59
+
60
+ class ActionFeedback(BaseModel):
61
+ """Feedback from the environment after an action."""
62
+
63
+ relevance_score: float = Field(
64
+ default=0.0, description="How relevant the action's items were (0-1)"
65
+ )
66
+ coverage_score: float = Field(
67
+ default=0.0, description="Source/topic diversity score (0-1)"
68
+ )
69
+ redundancy_penalty: float = Field(
70
+ default=0.0, description="Penalty for recommending already-seen items (0-1)"
71
+ )
72
+ explanation: str = Field(default="", description="Explanation of the feedback")
73
+
74
+
75
+ class TaskInfo(BaseModel):
76
+ """Information about the current task configuration."""
77
+
78
+ task_id: str = Field(..., description="Task identifier: easy, medium, hard")
79
+ difficulty: str = Field(..., description="Difficulty level")
80
+ max_steps: int = Field(..., description="Maximum steps allowed")
81
+ recommend_k: int = Field(..., description="Number of items to recommend")
82
+ pool_size: int = Field(default=0, description="Current items in pool")
83
+ items_filtered: int = Field(default=0, description="Items filtered so far")
84
+ items_categorized: int = Field(default=0, description="Items categorized so far")
85
+ step_number: int = Field(default=0, description="Current step number")
86
+
87
+
88
+ # =============================================================================
89
+ # Action & Observation Models
90
+ # =============================================================================
91
+
92
+
93
+ class CuratorAction(Action):
94
+ """Action for the Curator environment.
95
+
96
+ The agent can filter, categorize, rank, or recommend items.
97
+ """
98
+
99
+ action_type: Literal["filter", "categorize", "rank", "recommend"] = Field(
100
+ ..., description="Type of action to perform"
101
+ )
102
+ item_ids: List[str] = Field(
103
+ default_factory=list,
104
+ description="Item IDs being acted on",
105
+ )
106
+ categories: Optional[
107
+ Dict[str, Literal["urgent", "read_later", "share", "skip"]]
108
+ ] = Field(
109
+ default=None,
110
+ description="Category assignments: {item_id: category} (for categorize action)",
111
+ )
112
+ rankings: Optional[List[str]] = Field(
113
+ default=None,
114
+ description="Ordered list of item IDs by priority (for rank action)",
115
+ )
116
+ reasoning: Optional[str] = Field(
117
+ default=None, description="Agent's reasoning for this action"
118
+ )
119
+
120
+
121
+ class CuratorObservation(Observation):
122
+ """Observation from the Curator environment."""
123
+
124
+ items: List[ContentItem] = Field(
125
+ default_factory=list, description="Current pool of content items"
126
+ )
127
+ user_profile: Optional[UserProfile] = Field(
128
+ default=None, description="User preference profile"
129
+ )
130
+ feedback: Optional[ActionFeedback] = Field(
131
+ default=None, description="Feedback from the last action"
132
+ )
133
+ task_info: Optional[TaskInfo] = Field(
134
+ default=None, description="Current task configuration and progress"
135
+ )
openenv.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ spec_version: 1
2
+ name: openenv-curator
3
+ type: space
4
+ runtime: fastapi
5
+ app: server.app:app
6
+ port: 8000
pre-validation.sh ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # validate-submission.sh — OpenEnv Submission Validator
4
+ #
5
+ # Checks that your HF Space is live, Docker image builds, and openenv validate passes.
6
+ #
7
+ # Prerequisites:
8
+ # - Docker: https://docs.docker.com/get-docker/
9
+ # - openenv-core: pip install openenv-core
10
+ # - curl (usually pre-installed)
11
+ #
12
+ # Run:
13
+ # curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
14
+ #
15
+ # Or download and run locally:
16
+ # chmod +x validate-submission.sh
17
+ # ./validate-submission.sh <ping_url> [repo_dir]
18
+ #
19
+ # Arguments:
20
+ # ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)
21
+ # repo_dir Path to your repo (default: current directory)
22
+ #
23
+ # Examples:
24
+ # ./validate-submission.sh https://my-team.hf.space
25
+ # ./validate-submission.sh https://my-team.hf.space ./my-repo
26
+ #
27
+
28
+ set -uo pipefail
29
+
30
+ DOCKER_BUILD_TIMEOUT=600
31
+ if [ -t 1 ]; then
32
+ RED='\033[0;31m'
33
+ GREEN='\033[0;32m'
34
+ YELLOW='\033[1;33m'
35
+ BOLD='\033[1m'
36
+ NC='\033[0m'
37
+ else
38
+ RED='' GREEN='' YELLOW='' BOLD='' NC=''
39
+ fi
40
+
41
+ run_with_timeout() {
42
+ local secs="$1"; shift
43
+ if command -v timeout &>/dev/null; then
44
+ timeout "$secs" "$@"
45
+ elif command -v gtimeout &>/dev/null; then
46
+ gtimeout "$secs" "$@"
47
+ else
48
+ "$@" &
49
+ local pid=$!
50
+ ( sleep "$secs" && kill "$pid" 2>/dev/null ) &
51
+ local watcher=$!
52
+ wait "$pid" 2>/dev/null
53
+ local rc=$?
54
+ kill "$watcher" 2>/dev/null
55
+ wait "$watcher" 2>/dev/null
56
+ return $rc
57
+ fi
58
+ }
59
+
60
+ portable_mktemp() {
61
+ local prefix="${1:-validate}"
62
+ mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
63
+ }
64
+
65
+ CLEANUP_FILES=()
66
+ cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
67
+ trap cleanup EXIT
68
+
69
+ PING_URL="${1:-}"
70
+ REPO_DIR="${2:-.}"
71
+
72
+ if [ -z "$PING_URL" ]; then
73
+ printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
74
+ printf "\n"
75
+ printf " ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
76
+ printf " repo_dir Path to your repo (default: current directory)\n"
77
+ exit 1
78
+ fi
79
+
80
+ if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
81
+ printf "Error: directory '%s' not found\n" "${2:-.}"
82
+ exit 1
83
+ fi
84
+ PING_URL="${PING_URL%/}"
85
+ export PING_URL
86
+ PASS=0
87
+
88
+ log() { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
89
+ pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
90
+ fail() { log "${RED}FAILED${NC} -- $1"; }
91
+ hint() { printf " ${YELLOW}Hint:${NC} %b\n" "$1"; }
92
+ stop_at() {
93
+ printf "\n"
94
+ printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
95
+ exit 1
96
+ }
97
+
98
+ printf "\n"
99
+ printf "${BOLD}========================================${NC}\n"
100
+ printf "${BOLD} OpenEnv Submission Validator${NC}\n"
101
+ printf "${BOLD}========================================${NC}\n"
102
+ log "Repo: $REPO_DIR"
103
+ log "Ping URL: $PING_URL"
104
+ printf "\n"
105
+
106
+ log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
107
+
108
+ CURL_OUTPUT=$(portable_mktemp "validate-curl")
109
+ CLEANUP_FILES+=("$CURL_OUTPUT")
110
+ HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
111
+ -H "Content-Type: application/json" -d '{}' \
112
+ "$PING_URL/reset" --max-time 30 2>"$CURL_OUTPUT" || printf "000")
113
+
114
+ if [ "$HTTP_CODE" = "200" ]; then
115
+ pass "HF Space is live and responds to /reset"
116
+ elif [ "$HTTP_CODE" = "000" ]; then
117
+ fail "HF Space not reachable (connection failed or timed out)"
118
+ hint "Check your network connection and that the Space is running."
119
+ hint "Try: curl -s -o /dev/null -w '%%{http_code}' -X POST $PING_URL/reset"
120
+ stop_at "Step 1"
121
+ else
122
+ fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
123
+ hint "Make sure your Space is running and the URL is correct."
124
+ hint "Try opening $PING_URL in your browser first."
125
+ stop_at "Step 1"
126
+ fi
127
+
128
+ log "${BOLD}Step 2/3: Running docker build${NC} ..."
129
+
130
+ if ! command -v docker &>/dev/null; then
131
+ fail "docker command not found"
132
+ hint "Install Docker: https://docs.docker.com/get-docker/"
133
+ stop_at "Step 2"
134
+ fi
135
+
136
+ if [ -f "$REPO_DIR/Dockerfile" ]; then
137
+ DOCKER_CONTEXT="$REPO_DIR"
138
+ elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
139
+ DOCKER_CONTEXT="$REPO_DIR/server"
140
+ else
141
+ fail "No Dockerfile found in repo root or server/ directory"
142
+ stop_at "Step 2"
143
+ fi
144
+
145
+ log " Found Dockerfile in $DOCKER_CONTEXT"
146
+
147
+ BUILD_OK=false
148
+ BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
149
+
150
+ if [ "$BUILD_OK" = true ]; then
151
+ pass "Docker build succeeded"
152
+ else
153
+ fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
154
+ printf "%s\n" "$BUILD_OUTPUT" | tail -20
155
+ stop_at "Step 2"
156
+ fi
157
+
158
+ log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
159
+
160
+ if ! command -v openenv &>/dev/null; then
161
+ fail "openenv command not found"
162
+ hint "Install it: pip install openenv-core"
163
+ stop_at "Step 3"
164
+ fi
165
+
166
+ VALIDATE_OK=false
167
+ VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
168
+
169
+ if [ "$VALIDATE_OK" = true ]; then
170
+ pass "openenv validate passed"
171
+ [ -n "$VALIDATE_OUTPUT" ] && log " $VALIDATE_OUTPUT"
172
+ else
173
+ fail "openenv validate failed"
174
+ printf "%s\n" "$VALIDATE_OUTPUT"
175
+ stop_at "Step 3"
176
+ fi
177
+
178
+ printf "\n"
179
+ printf "${BOLD}========================================${NC}\n"
180
+ printf "${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
181
+ printf "${GREEN}${BOLD} Your submission is ready to submit.${NC}\n"
182
+ printf "${BOLD}========================================${NC}\n"
183
+ printf "\n"
184
+
185
+ exit 0
pyproject.toml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=45", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "openenv-curator"
7
+ version = "0.1.0"
8
+ description = "Curator: Personalized content curation environment for OpenEnv"
9
+ requires-python = ">=3.10"
10
+ dependencies = [
11
+ "openenv-core[core]>=0.2.2",
12
+ ]
13
+
14
+ [project.optional-dependencies]
15
+ dev = [
16
+ "pytest>=8.0.0",
17
+ "pytest-cov>=4.0.0",
18
+ ]
19
+ inference = [
20
+ "openai>=1.0",
21
+ ]
22
+
23
+ [project.scripts]
24
+ server = "curator.server.app:main"
25
+
26
+ [tool.setuptools]
27
+ include-package-data = true
28
+ packages = ["curator", "curator.server"]
29
+ package-dir = { "curator" = ".", "curator.server" = "server" }
30
+
31
+ [dependency-groups]
32
+ dev = [
33
+ "ruff>=0.15.9",
34
+ ]
scripts/fetch_data.py ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Fetch real content items from public APIs and save as static JSON.
4
+
5
+ Sources (all free, no auth):
6
+ - Hacker News (Firebase API)
7
+ - arXiv (public API)
8
+ - DEV.to (public API)
9
+ - Reddit (public JSON)
10
+
11
+ Run once: python scripts/fetch_data.py
12
+ Output: data/items.json
13
+ """
14
+
15
+ import json
16
+ import math
17
+ import time
18
+ import xml.etree.ElementTree as ET
19
+ from pathlib import Path
20
+ from urllib.request import Request, urlopen
21
+
22
+ DATA_DIR = Path(__file__).parent.parent / "data"
23
+
24
+ # Tag extraction keywords
25
+ TAG_KEYWORDS = {
26
+ "ai": [
27
+ "ai",
28
+ "artificial intelligence",
29
+ "machine learning",
30
+ "ml",
31
+ "deep learning",
32
+ "neural",
33
+ ],
34
+ "nlp": [
35
+ "nlp",
36
+ "natural language",
37
+ "language model",
38
+ "llm",
39
+ "gpt",
40
+ "transformer",
41
+ "bert",
42
+ ],
43
+ "web": [
44
+ "web",
45
+ "javascript",
46
+ "react",
47
+ "frontend",
48
+ "css",
49
+ "html",
50
+ "browser",
51
+ "nextjs",
52
+ "vue",
53
+ ],
54
+ "systems": [
55
+ "systems",
56
+ "linux",
57
+ "kernel",
58
+ "os",
59
+ "distributed",
60
+ "infrastructure",
61
+ "devops",
62
+ ],
63
+ "rust": ["rust", "cargo", "rustc", "borrow checker"],
64
+ "python": ["python", "pip", "django", "flask", "fastapi", "pytorch"],
65
+ "go": ["golang", " go ", "goroutine"],
66
+ "security": [
67
+ "security",
68
+ "vulnerability",
69
+ "exploit",
70
+ "crypto",
71
+ "encryption",
72
+ "privacy",
73
+ ],
74
+ "database": ["database", "sql", "postgres", "mongodb", "redis", "sqlite"],
75
+ "cloud": ["cloud", "aws", "gcp", "azure", "kubernetes", "docker", "k8s"],
76
+ "mobile": ["mobile", "ios", "android", "swift", "kotlin", "flutter"],
77
+ "data": [
78
+ "data",
79
+ "analytics",
80
+ "visualization",
81
+ "pandas",
82
+ "spark",
83
+ "etl",
84
+ "pipeline",
85
+ ],
86
+ "career": ["career", "hiring", "interview", "salary", "remote", "job"],
87
+ "startup": ["startup", "funding", "venture", "entrepreneur", "saas", "product"],
88
+ "open-source": [
89
+ "open source",
90
+ "open-source",
91
+ "oss",
92
+ "github",
93
+ "foss",
94
+ "mit license",
95
+ ],
96
+ "robotics": ["robot", "robotics", "autonomous", "drone", "perception", "slam"],
97
+ "cv": ["computer vision", "image", "object detection", "segmentation", "diffusion"],
98
+ }
99
+
100
+
101
+ def extract_tags(title: str, summary: str = "") -> list[str]:
102
+ """Extract topic tags from title and summary text."""
103
+ text = f"{title} {summary}".lower()
104
+ tags = []
105
+ for tag, keywords in TAG_KEYWORDS.items():
106
+ if any(kw in text for kw in keywords):
107
+ tags.append(tag)
108
+ return tags if tags else ["general"]
109
+
110
+
111
+ def fetch_json(url: str, headers: dict | None = None) -> dict | list:
112
+ """Fetch JSON from a URL."""
113
+ req = Request(url, headers=headers or {"User-Agent": "Curator/1.0"})
114
+ with urlopen(req, timeout=30) as resp:
115
+ return json.loads(resp.read().decode())
116
+
117
+
118
+ def fetch_text(url: str) -> str:
119
+ """Fetch raw text from a URL."""
120
+ req = Request(url, headers={"User-Agent": "Curator/1.0"})
121
+ with urlopen(req, timeout=30) as resp:
122
+ return resp.read().decode()
123
+
124
+
125
+ def fetch_hackernews(count: int = 60) -> list[dict]:
126
+ """Fetch top stories from Hacker News."""
127
+ print(f" Fetching {count} Hacker News stories...")
128
+ story_ids = fetch_json("https://hacker-news.firebaseio.com/v0/topstories.json")
129
+ items = []
130
+ for sid in story_ids[:count]:
131
+ try:
132
+ story = fetch_json(f"https://hacker-news.firebaseio.com/v0/item/{sid}.json")
133
+ if not story or story.get("type") != "story":
134
+ continue
135
+ title = story.get("title", "")
136
+ url = story.get("url", f"https://news.ycombinator.com/item?id={sid}")
137
+ items.append(
138
+ {
139
+ "id": f"hn_{sid}",
140
+ "source": "hackernews",
141
+ "title": title,
142
+ "summary": title, # HN doesn't have summaries; title is the content
143
+ "tags": extract_tags(title),
144
+ "url": url,
145
+ "author": story.get("by", ""),
146
+ "score": story.get("score", 0),
147
+ "reading_time_mins": 5,
148
+ "content_type": "article",
149
+ }
150
+ )
151
+ except Exception as e:
152
+ print(f" Skipping HN story {sid}: {e}")
153
+ time.sleep(0.05) # Be polite
154
+ print(f" Got {len(items)} HN items")
155
+ return items
156
+
157
+
158
+ def fetch_arxiv(count: int = 50) -> list[dict]:
159
+ """Fetch recent AI/ML papers from arXiv."""
160
+ print(f" Fetching {count} arXiv papers...")
161
+ categories = "cat:cs.AI+OR+cat:cs.LG+OR+cat:cs.CL"
162
+ url = f"https://export.arxiv.org/api/query?search_query={categories}&sortBy=submittedDate&sortOrder=descending&max_results={count}"
163
+ xml_text = fetch_text(url)
164
+ root = ET.fromstring(xml_text)
165
+ ns = {"atom": "http://www.w3.org/2005/Atom"}
166
+
167
+ items = []
168
+ for entry in root.findall("atom:entry", ns):
169
+ try:
170
+ arxiv_id = entry.find("atom:id", ns).text.split("/abs/")[-1]
171
+ title = entry.find("atom:title", ns).text.strip().replace("\n", " ")
172
+ summary = (
173
+ entry.find("atom:summary", ns).text.strip().replace("\n", " ")[:300]
174
+ )
175
+ authors = [
176
+ a.find("atom:name", ns).text for a in entry.findall("atom:author", ns)
177
+ ]
178
+ link = entry.find("atom:id", ns).text
179
+
180
+ # Estimate reading time from summary length
181
+ word_count = len(summary.split())
182
+ reading_time = max(10, word_count // 20)
183
+
184
+ items.append(
185
+ {
186
+ "id": f"arxiv_{arxiv_id.replace('/', '_').replace('.', '_')}",
187
+ "source": "arxiv",
188
+ "title": title,
189
+ "summary": summary,
190
+ "tags": extract_tags(title, summary),
191
+ "url": link,
192
+ "author": authors[0] if authors else "",
193
+ "score": 0,
194
+ "reading_time_mins": reading_time,
195
+ "content_type": "paper",
196
+ }
197
+ )
198
+ except Exception as e:
199
+ print(f" Skipping arXiv entry: {e}")
200
+
201
+ print(f" Got {len(items)} arXiv items")
202
+ return items
203
+
204
+
205
+ def fetch_devto(count: int = 50) -> list[dict]:
206
+ """Fetch articles from DEV.to."""
207
+ print(f" Fetching {count} DEV.to articles...")
208
+ items = []
209
+ # Fetch from multiple tags to get variety
210
+ tags_to_fetch = ["programming", "ai", "webdev", "python", "tutorial"]
211
+ per_tag = math.ceil(count / len(tags_to_fetch))
212
+
213
+ seen_ids = set()
214
+ for tag in tags_to_fetch:
215
+ try:
216
+ articles = fetch_json(
217
+ f"https://dev.to/api/articles?per_page={per_tag}&tag={tag}&top=7"
218
+ )
219
+ for article in articles:
220
+ aid = article["id"]
221
+ if aid in seen_ids:
222
+ continue
223
+ seen_ids.add(aid)
224
+ title = article.get("title", "")
225
+ desc = article.get("description", "")
226
+ tag_list = article.get("tag_list", [])
227
+ items.append(
228
+ {
229
+ "id": f"devto_{aid}",
230
+ "source": "devto",
231
+ "title": title,
232
+ "summary": desc[:300] if desc else title,
233
+ "tags": extract_tags(title, desc)
234
+ if not tag_list
235
+ else [t.lower() for t in tag_list[:5]],
236
+ "url": article.get("url", ""),
237
+ "author": article.get("user", {}).get("username", ""),
238
+ "score": article.get("positive_reactions_count", 0),
239
+ "reading_time_mins": article.get("reading_time_minutes", 5),
240
+ "content_type": "tutorial"
241
+ if "tutorial" in (tag_list or [])
242
+ else "article",
243
+ }
244
+ )
245
+ time.sleep(0.2)
246
+ except Exception as e:
247
+ print(f" Skipping DEV.to tag {tag}: {e}")
248
+
249
+ items = items[:count]
250
+ print(f" Got {len(items)} DEV.to items")
251
+ return items
252
+
253
+
254
+ def fetch_reddit(count: int = 40) -> list[dict]:
255
+ """Fetch posts from Reddit programming subreddits."""
256
+ print(f" Fetching {count} Reddit posts...")
257
+ items = []
258
+ subreddits = ["programming", "machinelearning", "webdev"]
259
+ per_sub = math.ceil(count / len(subreddits))
260
+
261
+ seen_ids = set()
262
+ for sub in subreddits:
263
+ try:
264
+ data = fetch_json(
265
+ f"https://www.reddit.com/r/{sub}/hot.json?limit={per_sub}",
266
+ headers={"User-Agent": "Curator/1.0 (content-curation-research)"},
267
+ )
268
+ for post in data.get("data", {}).get("children", []):
269
+ pd = post["data"]
270
+ rid = pd["id"]
271
+ if rid in seen_ids or pd.get("stickied"):
272
+ continue
273
+ seen_ids.add(rid)
274
+ title = pd.get("title", "")
275
+ selftext = pd.get("selftext", "")[:300]
276
+ items.append(
277
+ {
278
+ "id": f"reddit_{rid}",
279
+ "source": "reddit",
280
+ "title": title,
281
+ "summary": selftext if selftext else title,
282
+ "tags": extract_tags(title, selftext),
283
+ "url": f"https://reddit.com{pd.get('permalink', '')}",
284
+ "author": pd.get("author", ""),
285
+ "score": pd.get("score", 0),
286
+ "reading_time_mins": max(2, len(selftext.split()) // 200)
287
+ if selftext
288
+ else 3,
289
+ "content_type": "discussion",
290
+ }
291
+ )
292
+ time.sleep(0.5)
293
+ except Exception as e:
294
+ print(f" Skipping Reddit r/{sub}: {e}")
295
+
296
+ items = items[:count]
297
+ print(f" Got {len(items)} Reddit items")
298
+ return items
299
+
300
+
301
+ def compute_relevance(item: dict, profile: dict) -> float:
302
+ """Compute relevance score (0-1) of an item for a user profile.
303
+
304
+ Scoring:
305
+ - 0.50 weight: tag match (sum of matched interest weights / total interest weight)
306
+ - 0.20 weight: source preference (1.0 if preferred, 0.3 otherwise)
307
+ - 0.15 weight: community signal (normalized score/upvotes)
308
+ - 0.10 weight: reading time fit (within budget = 1.0, over = 0.3)
309
+ - 0.05 weight: content type match (paper for expert, tutorial for beginner)
310
+ - Penalty: -0.4 for already-read items
311
+ """
312
+ interests = profile["interests"]
313
+ item_tags = set(item["tags"])
314
+
315
+ if not interests:
316
+ return 0.05
317
+
318
+ # Tag match: how much of the user's interest space does this item cover?
319
+ total_interest_weight = sum(interests.values())
320
+ matched_weight = sum(interests.get(tag, 0.0) for tag in item_tags)
321
+ tag_score = matched_weight / total_interest_weight if total_interest_weight > 0 else 0.0
322
+
323
+ # Source preference
324
+ preferred = profile.get("preferred_sources", [])
325
+ source_score = 1.0 if (not preferred or item["source"] in preferred) else 0.3
326
+
327
+ # Community signal (normalize score: 0-100+ -> 0-1)
328
+ raw_score = item.get("score", 0)
329
+ community_score = min(1.0, raw_score / 200) if raw_score > 0 else 0.2
330
+
331
+ # Reading time fit
332
+ budget = profile.get("time_budget_mins", 60)
333
+ per_item_budget = budget / 5
334
+ time_score = 1.0 if item["reading_time_mins"] <= per_item_budget else 0.3
335
+
336
+ # Content type match
337
+ skill = profile.get("skill_level", "intermediate")
338
+ ctype = item.get("content_type", "article")
339
+ if skill == "expert" and ctype == "paper":
340
+ type_score = 1.0
341
+ elif skill == "beginner" and ctype in ("tutorial", "article"):
342
+ type_score = 1.0
343
+ elif skill == "intermediate":
344
+ type_score = 0.8
345
+ else:
346
+ type_score = 0.5
347
+
348
+ # Weighted combination
349
+ relevance = (
350
+ 0.50 * tag_score
351
+ + 0.20 * source_score
352
+ + 0.15 * community_score
353
+ + 0.10 * time_score
354
+ + 0.05 * type_score
355
+ )
356
+
357
+ # Already-read penalty
358
+ if item["id"] in profile.get("read_history", []):
359
+ relevance -= 0.4
360
+
361
+ return round(max(0.0, min(1.0, relevance)), 4)
362
+
363
+
364
+ def create_tasks() -> list[dict]:
365
+ """Create task definitions with embedded user profiles for 3 difficulty levels."""
366
+ return [
367
+ {
368
+ "task_id": "easy",
369
+ "difficulty": "easy",
370
+ "item_count": 20,
371
+ "max_steps": 10,
372
+ "sources": ["hackernews"],
373
+ "recommend_k": 5,
374
+ "description": "Curate 5 top articles from 20 Hacker News stories for an AI/ML enthusiast.",
375
+ "profile": {
376
+ "interests": {
377
+ "ai": 0.95,
378
+ "nlp": 0.85,
379
+ "python": 0.8,
380
+ "data": 0.7,
381
+ },
382
+ "preferred_sources": ["hackernews", "arxiv"],
383
+ "time_budget_mins": 120,
384
+ "read_history": [],
385
+ "skill_level": "intermediate",
386
+ },
387
+ },
388
+ {
389
+ "task_id": "medium",
390
+ "difficulty": "medium",
391
+ "item_count": 50,
392
+ "max_steps": 20,
393
+ "sources": ["hackernews", "devto", "arxiv"],
394
+ "recommend_k": 10,
395
+ "description": "Curate 10 items from 50 across HN, DEV.to, and arXiv for a senior engineer with broad interests.",
396
+ "profile": {
397
+ "interests": {
398
+ "ai": 0.9,
399
+ "web": 0.7,
400
+ "systems": 0.6,
401
+ "security": 0.5,
402
+ "python": 0.75,
403
+ "cloud": 0.4,
404
+ "open-source": 0.65,
405
+ "startup": 0.3,
406
+ },
407
+ "preferred_sources": ["hackernews", "devto"],
408
+ "time_budget_mins": 60,
409
+ "read_history": [],
410
+ "skill_level": "expert",
411
+ },
412
+ },
413
+ {
414
+ "task_id": "hard",
415
+ "difficulty": "hard",
416
+ "item_count": 100,
417
+ "max_steps": 30,
418
+ "sources": ["hackernews", "devto", "arxiv", "reddit"],
419
+ "recommend_k": 15,
420
+ "description": "Curate 15 items from 100 across all sources for a beginner with minimal stated preferences. Must infer interests from feedback.",
421
+ "profile": {
422
+ "interests": {
423
+ "rust": 0.5,
424
+ "systems": 0.4,
425
+ },
426
+ "preferred_sources": [],
427
+ "time_budget_mins": 30,
428
+ "read_history": [],
429
+ "skill_level": "beginner",
430
+ },
431
+ },
432
+ ]
433
+
434
+
435
+ def main():
436
+ DATA_DIR.mkdir(exist_ok=True)
437
+ print("Fetching real content data from public APIs...\n")
438
+
439
+ # Fetch from all sources
440
+ all_items = []
441
+ all_items.extend(fetch_hackernews(60))
442
+ all_items.extend(fetch_arxiv(50))
443
+ all_items.extend(fetch_devto(50))
444
+ all_items.extend(fetch_reddit(40))
445
+
446
+ print(f"\nTotal items fetched: {len(all_items)}")
447
+
448
+ # Save items
449
+ items_path = DATA_DIR / "items.json"
450
+ with open(items_path, "w") as f:
451
+ json.dump(all_items, f, indent=2)
452
+ print(f"Saved items to {items_path}")
453
+
454
+ # Create tasks (profiles are embedded in each task)
455
+ tasks = create_tasks()
456
+
457
+ # Compute ground truth relevance and set read_history
458
+ ground_truth = {}
459
+ for task in tasks:
460
+ profile = task["profile"]
461
+ sources = task["sources"]
462
+ task_items = [it for it in all_items if it["source"] in sources][
463
+ : task["item_count"]
464
+ ]
465
+
466
+ # Set some items as already read for medium/hard tasks
467
+ if task["task_id"] == "medium" and len(task_items) > 5:
468
+ profile["read_history"] = [task_items[i]["id"] for i in range(0, 6, 2)]
469
+ elif task["task_id"] == "hard" and len(task_items) > 10:
470
+ profile["read_history"] = [task_items[i]["id"] for i in range(0, 10, 3)]
471
+
472
+ relevance = {}
473
+ for item in task_items:
474
+ relevance[item["id"]] = round(compute_relevance(item, profile), 4)
475
+ ground_truth[task["task_id"]] = relevance
476
+
477
+ # Save tasks (with updated read_history in profiles)
478
+ tasks_path = DATA_DIR / "tasks.json"
479
+ with open(tasks_path, "w") as f:
480
+ json.dump(tasks, f, indent=2)
481
+ print(f"Saved tasks to {tasks_path}")
482
+
483
+ gt_path = DATA_DIR / "ground_truth.json"
484
+ with open(gt_path, "w") as f:
485
+ json.dump(ground_truth, f, indent=2)
486
+ print(f"Saved ground truth to {gt_path}")
487
+
488
+ # Summary
489
+ print("\n--- Summary ---")
490
+ for task in tasks:
491
+ tid = task["task_id"]
492
+ gt = ground_truth[tid]
493
+ avg_rel = sum(gt.values()) / len(gt) if gt else 0
494
+ high_rel = sum(1 for v in gt.values() if v >= 0.5)
495
+ print(
496
+ f" {tid}: {len(gt)} items, avg relevance={avg_rel:.3f}, high-relevance={high_rel}"
497
+ )
498
+
499
+
500
+ if __name__ == "__main__":
501
+ main()
server/Dockerfile ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Multi-stage build using openenv-base
8
+ # This Dockerfile is flexible and works for both:
9
+ # - In-repo environments (with local OpenEnv sources)
10
+ # - Standalone environments (with openenv from PyPI/Git)
11
+ # The build script (openenv build) handles context detection and sets appropriate build args.
12
+
13
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
+ FROM ${BASE_IMAGE} AS builder
15
+
16
+ WORKDIR /app
17
+
18
+ # Ensure git is available (required for installing dependencies from VCS)
19
+ RUN apt-get update && \
20
+ apt-get install -y --no-install-recommends git && \
21
+ rm -rf /var/lib/apt/lists/*
22
+
23
+ # Build argument to control whether we're building standalone or in-repo
24
+ ARG BUILD_MODE=in-repo
25
+ ARG ENV_NAME=curator
26
+
27
+ # Copy environment code (always at root of build context)
28
+ COPY . /app/env
29
+
30
+ # For in-repo builds, openenv is already vendored in the build context
31
+ # For standalone builds, openenv will be installed via pyproject.toml
32
+ WORKDIR /app/env
33
+
34
+ # Ensure uv is available (for local builds where base image lacks it)
35
+ RUN if ! command -v uv >/dev/null 2>&1; then \
36
+ curl -LsSf https://astral.sh/uv/install.sh | sh && \
37
+ mv /root/.local/bin/uv /usr/local/bin/uv && \
38
+ mv /root/.local/bin/uvx /usr/local/bin/uvx; \
39
+ fi
40
+
41
+ # Install dependencies using uv sync
42
+ # If uv.lock exists, use it; otherwise resolve on the fly
43
+ RUN --mount=type=cache,target=/root/.cache/uv \
44
+ if [ -f uv.lock ]; then \
45
+ uv sync --frozen --no-install-project --no-editable; \
46
+ else \
47
+ uv sync --no-install-project --no-editable; \
48
+ fi
49
+
50
+ RUN --mount=type=cache,target=/root/.cache/uv \
51
+ if [ -f uv.lock ]; then \
52
+ uv sync --frozen --no-editable; \
53
+ else \
54
+ uv sync --no-editable; \
55
+ fi
56
+
57
+ # Final runtime stage
58
+ FROM ${BASE_IMAGE}
59
+
60
+ WORKDIR /app
61
+
62
+ # Copy the virtual environment from builder
63
+ COPY --from=builder /app/env/.venv /app/.venv
64
+
65
+ # Copy the environment code
66
+ COPY --from=builder /app/env /app/env
67
+
68
+ # Set PATH to use the virtual environment
69
+ ENV PATH="/app/.venv/bin:$PATH"
70
+
71
+ # Set PYTHONPATH so imports work correctly
72
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
73
+
74
+ # Health check
75
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
76
+ CMD curl -f http://localhost:8000/health || exit 1
77
+
78
+ # Run the FastAPI server
79
+ # The module path is constructed to work with the /app/env structure
80
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
server/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Curator environment server components."""
2
+
3
+ from .curator_environment import CuratorEnvironment
4
+
5
+ __all__ = ["CuratorEnvironment"]
server/app.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI application for the Curator Environment.
3
+
4
+ Endpoints:
5
+ - POST /reset: Reset the environment
6
+ - POST /step: Execute an action
7
+ - GET /state: Get current environment state
8
+ - GET /schema: Get action/observation schemas
9
+ - WS /ws: WebSocket endpoint for persistent sessions
10
+
11
+ Usage:
12
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
13
+ """
14
+
15
+ try:
16
+ from openenv.core.env_server.http_server import create_app
17
+ except Exception as e: # pragma: no cover
18
+ raise ImportError("openenv is required. Install with: uv sync") from e
19
+
20
+ try:
21
+ from ..models import CuratorAction, CuratorObservation
22
+ from .curator_environment import CuratorEnvironment
23
+ except (ImportError, ModuleNotFoundError):
24
+ from models import CuratorAction, CuratorObservation
25
+ from server.curator_environment import CuratorEnvironment
26
+
27
+
28
+ app = create_app(
29
+ CuratorEnvironment,
30
+ CuratorAction,
31
+ CuratorObservation,
32
+ env_name="curator",
33
+ max_concurrent_envs=4,
34
+ )
35
+
36
+
37
+ def main(host: str = "0.0.0.0", port: int = 8000):
38
+ import uvicorn
39
+
40
+ uvicorn.run(app, host=host, port=port)
41
+
42
+
43
+ if __name__ == "__main__":
44
+ main()
server/curator_environment.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Curator Environment Implementation.
3
+
4
+ A personalized content curation environment where an agent must filter,
5
+ categorize, rank, and recommend content items from a mixed pool of real
6
+ articles across multiple sources.
7
+ """
8
+
9
+ import copy
10
+ import json
11
+ import random
12
+ from pathlib import Path
13
+ from typing import Dict, List, Optional
14
+ from uuid import uuid4
15
+
16
+ from openenv.core.env_server.interfaces import Environment
17
+ from openenv.core.env_server.types import State
18
+
19
+ try:
20
+ from ..models import (
21
+ ActionFeedback,
22
+ ContentItem,
23
+ CuratorAction,
24
+ CuratorObservation,
25
+ TaskInfo,
26
+ UserProfile,
27
+ )
28
+ except ImportError:
29
+ from models import (
30
+ ActionFeedback,
31
+ ContentItem,
32
+ CuratorAction,
33
+ CuratorObservation,
34
+ TaskInfo,
35
+ UserProfile,
36
+ )
37
+
38
+ try:
39
+ from . import grader
40
+ except ImportError:
41
+ from server import grader
42
+
43
+ DATA_DIR = Path(__file__).parent.parent / "data"
44
+
45
+
46
+ class CuratorEnvironment(Environment):
47
+ """
48
+ Personalized content curation environment.
49
+
50
+ The agent receives a pool of real content items and a user profile,
51
+ then must filter, categorize, rank, and recommend the most relevant
52
+ items. Scored using standard IR metrics (NDCG, precision, recall).
53
+
54
+ Tasks:
55
+ - easy: 20 items from 1 source, clear preferences
56
+ - medium: 50 items from 3 sources, nuanced preferences
57
+ - hard: 100 items from 4 sources, minimal initial preferences
58
+ """
59
+
60
+ SUPPORTS_CONCURRENT_SESSIONS: bool = True
61
+
62
+ def __init__(self):
63
+ self._state = State(episode_id=str(uuid4()), step_count=0)
64
+
65
+ # Load static data
66
+ self._all_items = self._load_json("items.json")
67
+ self._all_tasks = {t["task_id"]: t for t in self._load_json("tasks.json")}
68
+ self._ground_truth = self._load_json("ground_truth.json")
69
+
70
+ # Episode state
71
+ self._task_config: Optional[dict] = None
72
+ self._profile: Optional[dict] = None
73
+ self._relevance: Dict[str, float] = {}
74
+ self._current_pool: List[dict] = []
75
+ self._items_by_id: Dict[str, dict] = {}
76
+ self._filtered_ids: List[str] = []
77
+ self._categories: Dict[str, str] = {}
78
+ self._last_ranking: List[str] = []
79
+ self._recommended_ids: List[str] = []
80
+ self._items_filtered_count = 0
81
+ self._items_categorized_count = 0
82
+
83
+ @staticmethod
84
+ def _load_json(filename: str) -> dict | list:
85
+ path = DATA_DIR / filename
86
+ with open(path) as f:
87
+ return json.load(f)
88
+
89
+ def reset(self, **kwargs) -> CuratorObservation: # type: ignore[override]
90
+ """Reset the environment with a task configuration.
91
+
92
+ Args:
93
+ **kwargs: Must include 'task_id' ("easy", "medium", or "hard").
94
+ Optional 'seed' for reproducibility.
95
+ """
96
+ task_id = kwargs.get("task_id", "easy")
97
+ seed = kwargs.get("seed", None)
98
+
99
+ if task_id not in self._all_tasks:
100
+ task_id = "easy"
101
+
102
+ self._task_config = self._all_tasks[task_id]
103
+ self._profile = copy.deepcopy(self._task_config["profile"])
104
+ self._relevance = self._ground_truth.get(task_id, {})
105
+
106
+ # Select items for this task
107
+ sources = self._task_config["sources"]
108
+ item_count = self._task_config["item_count"]
109
+ if sources == "all":
110
+ pool = list(self._all_items)
111
+ else:
112
+ pool = [it for it in self._all_items if it["source"] in sources]
113
+
114
+ # Shuffle with seed for reproducibility
115
+ if seed is not None:
116
+ random.seed(seed)
117
+ random.shuffle(pool)
118
+ self._current_pool = pool[:item_count]
119
+ self._items_by_id = {it["id"]: it for it in self._current_pool}
120
+
121
+ # Reset episode state
122
+ self._state = State(episode_id=str(uuid4()), step_count=0)
123
+ self._filtered_ids = []
124
+ self._categories = {}
125
+ self._last_ranking = []
126
+ self._recommended_ids = []
127
+ self._items_filtered_count = 0
128
+ self._items_categorized_count = 0
129
+
130
+ return self._make_observation(reward=0.0, done=False)
131
+
132
+ def step(self, action: CuratorAction) -> CuratorObservation: # type: ignore[override]
133
+ """Execute one step in the environment.
134
+
135
+ Args:
136
+ action: CuratorAction with action_type and relevant fields.
137
+ """
138
+ self._state.step_count += 1
139
+ max_steps = self._task_config["max_steps"]
140
+
141
+ action_type = action.action_type
142
+ reward = 0.0
143
+ feedback = ActionFeedback()
144
+ done = False
145
+
146
+ if action_type == "filter":
147
+ reward, feedback = self._handle_filter(action)
148
+ elif action_type == "categorize":
149
+ reward, feedback = self._handle_categorize(action)
150
+ elif action_type == "rank":
151
+ reward, feedback = self._handle_rank(action)
152
+ elif action_type == "recommend":
153
+ reward, feedback = self._handle_recommend(action)
154
+ done = True
155
+
156
+ # Auto-end if max steps reached
157
+ if self._state.step_count >= max_steps and not done:
158
+ done = True
159
+ # If no recommendation was made, auto-recommend from last ranking or pool
160
+ if not self._recommended_ids:
161
+ k = self._task_config["recommend_k"]
162
+ if self._last_ranking:
163
+ self._recommended_ids = self._last_ranking[:k]
164
+ else:
165
+ pool_ids = [it["id"] for it in self._current_pool]
166
+ self._recommended_ids = pool_ids[:k]
167
+ # Compute final episode score
168
+ reward = self._compute_final_score()
169
+ feedback = ActionFeedback(
170
+ relevance_score=reward,
171
+ explanation="Episode ended (max steps). Auto-recommended from best available ranking.",
172
+ )
173
+
174
+ return self._make_observation(reward=reward, done=done, feedback=feedback)
175
+
176
+ @property
177
+ def state(self) -> State:
178
+ return self._state
179
+
180
+ # =========================================================================
181
+ # Action Handlers
182
+ # =========================================================================
183
+
184
+ def _handle_filter(self, action: CuratorAction) -> tuple[float, ActionFeedback]:
185
+ """Remove items from the pool. Reward for removing low-relevance items."""
186
+ valid_ids = [iid for iid in action.item_ids if iid in self._items_by_id]
187
+ if not valid_ids:
188
+ return 0.0, ActionFeedback(explanation="No valid items to filter.")
189
+
190
+ # Remove from pool
191
+ for iid in valid_ids:
192
+ self._items_by_id.pop(iid, None)
193
+ self._filtered_ids.append(iid)
194
+ self._current_pool = [
195
+ it for it in self._current_pool if it["id"] in self._items_by_id
196
+ ]
197
+ self._items_filtered_count += len(valid_ids)
198
+
199
+ # Score: reward for removing low-relevance items
200
+ quality = grader.filter_quality(valid_ids, self._relevance)
201
+
202
+ return quality, ActionFeedback(
203
+ relevance_score=quality,
204
+ explanation=f"Filtered {len(valid_ids)} items. Quality={quality:.3f}",
205
+ )
206
+
207
+ def _handle_categorize(
208
+ self, action: CuratorAction
209
+ ) -> tuple[float, ActionFeedback]:
210
+ """Categorize items. Reward for matching relevance-derived categories."""
211
+ if not action.categories:
212
+ return 0.0, ActionFeedback(explanation="No categories provided.")
213
+
214
+ valid_cats = {
215
+ iid: cat
216
+ for iid, cat in action.categories.items()
217
+ if iid in self._items_by_id
218
+ }
219
+ if not valid_cats:
220
+ return 0.0, ActionFeedback(explanation="No valid items to categorize.")
221
+
222
+ self._categories.update(valid_cats)
223
+ self._items_categorized_count += len(valid_cats)
224
+
225
+ quality = grader.categorize_quality(valid_cats, self._relevance)
226
+
227
+ return quality, ActionFeedback(
228
+ relevance_score=quality,
229
+ explanation=f"Categorized {len(valid_cats)} items. Accuracy={quality:.3f}",
230
+ )
231
+
232
+ def _handle_rank(self, action: CuratorAction) -> tuple[float, ActionFeedback]:
233
+ """Rank items by priority. Reward based on NDCG."""
234
+ rankings = action.rankings or action.item_ids
235
+ if not rankings:
236
+ return 0.0, ActionFeedback(explanation="No ranking provided.")
237
+
238
+ valid_ranking = [iid for iid in rankings if iid in self._items_by_id]
239
+ self._last_ranking = valid_ranking
240
+
241
+ k = self._task_config["recommend_k"]
242
+ quality = grader.ndcg_at_k(valid_ranking, self._relevance, k)
243
+
244
+ # Also compute coverage
245
+ coverage = grader.source_diversity(valid_ranking[:k], self._items_by_id)
246
+
247
+ return quality, ActionFeedback(
248
+ relevance_score=quality,
249
+ coverage_score=coverage,
250
+ explanation=f"Ranked {len(valid_ranking)} items. NDCG@{k}={quality:.3f}",
251
+ )
252
+
253
+ def _handle_recommend(
254
+ self, action: CuratorAction
255
+ ) -> tuple[float, ActionFeedback]:
256
+ """Final recommendation. Triggers episode end with composite score."""
257
+ rec_ids = action.item_ids
258
+ k = self._task_config["recommend_k"]
259
+
260
+ if not rec_ids:
261
+ # Fall back to last ranking
262
+ if self._last_ranking:
263
+ rec_ids = self._last_ranking[:k]
264
+ else:
265
+ return 0.0, ActionFeedback(
266
+ explanation="No items recommended and no prior ranking."
267
+ )
268
+
269
+ self._recommended_ids = rec_ids[:k]
270
+ score = self._compute_final_score()
271
+
272
+ return score, ActionFeedback(
273
+ relevance_score=score,
274
+ coverage_score=grader.source_diversity(
275
+ self._recommended_ids, self._items_by_id
276
+ ),
277
+ explanation=f"Final recommendation of {len(self._recommended_ids)} items. Score={score:.3f}",
278
+ )
279
+
280
+ # =========================================================================
281
+ # Scoring
282
+ # =========================================================================
283
+
284
+ def _compute_final_score(self) -> float:
285
+ """Compute composite episode score."""
286
+ return grader.grade_episode(
287
+ recommended_ids=self._recommended_ids,
288
+ ranked_ids=self._last_ranking if self._last_ranking else None,
289
+ categories=self._categories if self._categories else None,
290
+ relevance_scores=self._relevance,
291
+ items_by_id=self._items_by_id,
292
+ recommend_k=self._task_config["recommend_k"],
293
+ )
294
+
295
+ # =========================================================================
296
+ # Observation Builder
297
+ # =========================================================================
298
+
299
+ def _make_observation(
300
+ self,
301
+ reward: float,
302
+ done: bool,
303
+ feedback: Optional[ActionFeedback] = None,
304
+ ) -> CuratorObservation:
305
+ items = [ContentItem(**it) for it in self._current_pool]
306
+ profile = UserProfile(**self._profile) if self._profile else None
307
+
308
+ task_info = None
309
+ if self._task_config:
310
+ task_info = TaskInfo(
311
+ task_id=self._task_config["task_id"],
312
+ difficulty=self._task_config["difficulty"],
313
+ max_steps=self._task_config["max_steps"],
314
+ recommend_k=self._task_config["recommend_k"],
315
+ pool_size=len(self._current_pool),
316
+ items_filtered=self._items_filtered_count,
317
+ items_categorized=self._items_categorized_count,
318
+ step_number=self._state.step_count,
319
+ )
320
+
321
+ return CuratorObservation(
322
+ items=items,
323
+ user_profile=profile,
324
+ feedback=feedback,
325
+ task_info=task_info,
326
+ done=done,
327
+ reward=reward,
328
+ metadata={
329
+ "episode_id": self._state.episode_id,
330
+ "step": self._state.step_count,
331
+ },
332
+ )
server/grader.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Grading module for Curator environment.
3
+
4
+ Implements standard Information Retrieval metrics for deterministic,
5
+ reproducible scoring of agent performance (0.0-1.0).
6
+ """
7
+
8
+ import math
9
+ from typing import Dict, List, Optional
10
+
11
+
12
+ def dcg_at_k(relevances: List[float], k: int) -> float:
13
+ """Compute Discounted Cumulative Gain at k."""
14
+ dcg = 0.0
15
+ for i, rel in enumerate(relevances[:k]):
16
+ dcg += rel / math.log2(i + 2) # i+2 because log2(1) = 0
17
+ return dcg
18
+
19
+
20
+ def ndcg_at_k(
21
+ ranked_ids: List[str],
22
+ relevance_scores: Dict[str, float],
23
+ k: int,
24
+ ) -> float:
25
+ """Compute Normalized Discounted Cumulative Gain at k.
26
+
27
+ Args:
28
+ ranked_ids: Agent's ranked list of item IDs (best first).
29
+ relevance_scores: Ground truth {item_id: relevance} scores.
30
+ k: Evaluate top-k items.
31
+
32
+ Returns:
33
+ NDCG score in [0, 1].
34
+ """
35
+ if not ranked_ids or not relevance_scores or k <= 0:
36
+ return 0.0
37
+
38
+ # Actual DCG from agent ranking
39
+ actual_rels = [relevance_scores.get(iid, 0.0) for iid in ranked_ids[:k]]
40
+ actual_dcg = dcg_at_k(actual_rels, k)
41
+
42
+ # Ideal DCG (sorted by relevance, descending)
43
+ ideal_rels = sorted(relevance_scores.values(), reverse=True)[:k]
44
+ ideal_dcg = dcg_at_k(ideal_rels, k)
45
+
46
+ if ideal_dcg == 0:
47
+ return 0.0
48
+ return actual_dcg / ideal_dcg
49
+
50
+
51
+ def precision_at_k(
52
+ selected_ids: List[str],
53
+ relevance_scores: Dict[str, float],
54
+ k: int,
55
+ threshold: float = 0.5,
56
+ ) -> float:
57
+ """Compute Precision at k.
58
+
59
+ Args:
60
+ selected_ids: Agent's selected item IDs.
61
+ relevance_scores: Ground truth {item_id: relevance} scores.
62
+ k: Evaluate top-k items.
63
+ threshold: Minimum relevance to count as "relevant".
64
+
65
+ Returns:
66
+ Precision score in [0, 1].
67
+ """
68
+ if not selected_ids or k <= 0:
69
+ return 0.0
70
+
71
+ top_k = selected_ids[:k]
72
+ relevant_count = sum(
73
+ 1 for iid in top_k if relevance_scores.get(iid, 0.0) >= threshold
74
+ )
75
+ return relevant_count / min(k, len(top_k))
76
+
77
+
78
+ def recall_at_k(
79
+ selected_ids: List[str],
80
+ relevance_scores: Dict[str, float],
81
+ k: int,
82
+ threshold: float = 0.5,
83
+ ) -> float:
84
+ """Compute Recall at k.
85
+
86
+ Args:
87
+ selected_ids: Agent's selected item IDs.
88
+ relevance_scores: Ground truth {item_id: relevance} scores.
89
+ k: Evaluate top-k items.
90
+ threshold: Minimum relevance to count as "relevant".
91
+
92
+ Returns:
93
+ Recall score in [0, 1].
94
+ """
95
+ total_relevant = sum(1 for v in relevance_scores.values() if v >= threshold)
96
+ if total_relevant == 0:
97
+ return 1.0 # No relevant items to find
98
+
99
+ top_k = selected_ids[:k]
100
+ found_relevant = sum(
101
+ 1 for iid in top_k if relevance_scores.get(iid, 0.0) >= threshold
102
+ )
103
+ return found_relevant / total_relevant
104
+
105
+
106
+ def source_diversity(selected_ids: List[str], items_by_id: Dict[str, dict]) -> float:
107
+ """Compute source diversity of selected items.
108
+
109
+ Returns:
110
+ Diversity score in [0, 1] based on unique source coverage.
111
+ """
112
+ if not selected_ids:
113
+ return 0.0
114
+
115
+ all_sources = set(it.get("source", "") for it in items_by_id.values())
116
+ selected_sources = set(
117
+ items_by_id[iid].get("source", "") for iid in selected_ids if iid in items_by_id
118
+ )
119
+ if not all_sources:
120
+ return 0.0
121
+ return len(selected_sources) / len(all_sources)
122
+
123
+
124
+ def filter_quality(
125
+ removed_ids: List[str],
126
+ relevance_scores: Dict[str, float],
127
+ ) -> float:
128
+ """Score a filter action: reward for removing low-relevance items.
129
+
130
+ Returns:
131
+ Score in [0, 1]. Higher is better (removed less relevant items).
132
+ """
133
+ if not removed_ids:
134
+ return 0.0
135
+
136
+ avg_relevance_of_removed = sum(
137
+ relevance_scores.get(iid, 0.5) for iid in removed_ids
138
+ ) / len(removed_ids)
139
+
140
+ # Good filtering removes low-relevance items
141
+ return max(0.0, min(1.0, 1.0 - avg_relevance_of_removed))
142
+
143
+
144
+ def categorize_quality(
145
+ agent_categories: Dict[str, str],
146
+ relevance_scores: Dict[str, float],
147
+ threshold_urgent: float = 0.7,
148
+ threshold_read: float = 0.4,
149
+ ) -> float:
150
+ """Score categorization accuracy against relevance-derived ground truth.
151
+
152
+ Ground truth categories derived from relevance:
153
+ >= threshold_urgent → "urgent"
154
+ >= threshold_read → "read_later"
155
+ < threshold_read → "skip"
156
+ (any relevance can be "share" — not penalized)
157
+
158
+ Returns:
159
+ Accuracy score in [0, 1].
160
+ """
161
+ if not agent_categories:
162
+ return 0.0
163
+
164
+ correct = 0
165
+ total = len(agent_categories)
166
+
167
+ for item_id, agent_cat in agent_categories.items():
168
+ rel = relevance_scores.get(item_id, 0.0)
169
+
170
+ # Derive expected category
171
+ if rel >= threshold_urgent:
172
+ expected = {"urgent", "share"}
173
+ elif rel >= threshold_read:
174
+ expected = {"read_later", "share"}
175
+ else:
176
+ expected = {"skip"}
177
+
178
+ if agent_cat in expected:
179
+ correct += 1
180
+
181
+ return correct / total
182
+
183
+
184
+ def grade_episode(
185
+ recommended_ids: List[str],
186
+ ranked_ids: Optional[List[str]],
187
+ categories: Optional[Dict[str, str]],
188
+ relevance_scores: Dict[str, float],
189
+ items_by_id: Dict[str, dict],
190
+ recommend_k: int,
191
+ ) -> float:
192
+ """Compute final episode score (0-1).
193
+
194
+ Composite:
195
+ 0.35 * NDCG@k
196
+ 0.25 * Precision@k
197
+ 0.20 * Recall@k
198
+ 0.10 * Category accuracy
199
+ 0.10 * Source diversity
200
+ """
201
+ # Use recommended_ids as ranking if no explicit ranking
202
+ ranking = ranked_ids if ranked_ids else recommended_ids
203
+
204
+ ndcg = ndcg_at_k(ranking, relevance_scores, recommend_k)
205
+ precision = precision_at_k(recommended_ids, relevance_scores, recommend_k)
206
+ recall = recall_at_k(recommended_ids, relevance_scores, recommend_k)
207
+ cat_acc = categorize_quality(categories, relevance_scores) if categories else 0.0
208
+ diversity = source_diversity(recommended_ids, items_by_id)
209
+
210
+ score = (
211
+ 0.35 * ndcg
212
+ + 0.25 * precision
213
+ + 0.20 * recall
214
+ + 0.10 * cat_acc
215
+ + 0.10 * diversity
216
+ )
217
+ return max(0.0, min(1.0, score))
server/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ openenv[core]>=0.2.0
2
+ fastapi>=0.115.0
3
+ uvicorn>=0.24.0
4
+
5
+
6
+
uv.lock ADDED
The diff for this file is too large to render. See raw diff