AdithyaSK HF Staff committed on
Commit
ead51a4
·
1 Parent(s): 3594a5d

added pdf download button - Adithya S K

Browse files
app/src/components/HeroArticle.astro CHANGED
@@ -147,6 +147,20 @@ const pdfFilename = `${slugify(pdfBase)}.pdf`;
147
  <p><a href={`https://doi.org/${doi}`} target="_blank" rel="noopener noreferrer">{doi}</a></p>
148
  </div>
149
  )}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  {repo && (
151
  <div class="meta-container-cell meta-container-cell--repo">
152
  <h3>Code</h3>
@@ -191,14 +205,15 @@ const pdfFilename = `${slugify(pdfBase)}.pdf`;
191
  class="button"
192
  href={`/${pdfFilename}`}
193
  download={pdfFilename}
 
194
  aria-label={`Download PDF ${pdfFilename}`}
195
  >
196
  Download PDF
197
  </a>
198
  </p>
199
  <div class="pdf-locked" style="display: none;">
200
- <a
201
- class="button button-locked"
202
  href="https://huggingface.co/subscribe/pro"
203
  target="_blank"
204
  rel="noopener noreferrer"
@@ -212,11 +227,76 @@ const pdfFilename = `${slugify(pdfBase)}.pdf`;
212
  </div>
213
  </div>
214
  )}
 
 
215
  </div>
216
  </header>
217
 
218
  {showPdf && (
219
  <script is:inline>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  // PDF access control for Pro users only
221
  const LOCAL_IS_PRO = true;
222
  const FALLBACK_TIMEOUT_MS = 3000;
@@ -364,14 +444,17 @@ const pdfFilename = `${slugify(pdfBase)}.pdf`;
364
  }
365
  .meta-container {
366
  max-width: 980px;
367
- display: flex;
368
- flex-direction: row;
369
- justify-content: space-between;
370
  margin: 0 auto;
371
  padding: 0 var(--content-padding-x);
372
- gap: 8px;
373
- flex-wrap: wrap;
374
- row-gap: 12px;
 
 
 
375
  }
376
  .meta-container a:not(.button):not(.repo-button) {
377
  color: var(--primary-color);
@@ -452,10 +535,25 @@ const pdfFilename = `${slugify(pdfBase)}.pdf`;
452
  background: #1a1a1a;
453
  }
454
 
455
- /* Cell order: Authors, Affiliation, DOI, Published, then Code on the far right */
456
- .meta-container-cell--published { order: 1; }
457
- .meta-container-cell--pdf { order: 2; }
458
- .meta-container-cell--repo { order: 3; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
  .authors {
460
  margin: 0;
461
  list-style-type: none;
@@ -477,12 +575,6 @@ const pdfFilename = `${slugify(pdfBase)}.pdf`;
477
 
478
  @media (max-width: 768px) {
479
  .meta-container-cell:nth-child(even) { text-align: right; }
480
- .meta-container-cell:last-child:nth-child(odd) {
481
- flex-grow: 0;
482
- flex-basis: auto;
483
- margin-left: auto;
484
- text-align: right;
485
- }
486
  }
487
 
488
  @media print {
@@ -514,6 +606,65 @@ const pdfFilename = `${slugify(pdfBase)}.pdf`;
514
  }
515
  .pdf-pro-only { margin: 0; line-height: 0; }
516
  .pdf-pro-only .button { margin: 0; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517
  .pro-badge-wrapper {
518
  display: inline-flex;
519
  align-items: center;
 
147
  <p><a href={`https://doi.org/${doi}`} target="_blank" rel="noopener noreferrer">{doi}</a></p>
148
  </div>
149
  )}
150
+ {showPdf && (
151
+ <div class="pdf-modal" role="dialog" aria-modal="true" aria-labelledby="pdf-modal-title" hidden>
152
+ <div class="pdf-modal__inner">
153
+ <p class="pdf-modal__title" id="pdf-modal-title">Heads up before you download</p>
154
+ <p class="pdf-modal__body">Animations and wide comparison tables clip in the PDF. The web version is the canonical reference and stays interactive.</p>
155
+ <div class="pdf-modal__actions">
156
+ <button type="button" class="pdf-modal__btn" data-pdf-modal-action="cancel">Cancel</button>
157
+ <button type="button" class="pdf-modal__btn pdf-modal__btn--primary" data-pdf-modal-action="confirm">Download anyway</button>
158
+ </div>
159
+ </div>
160
+ </div>
161
+ )}
162
+ {(repo || showPdf) && (
163
+ <div class="meta-actions">
164
  {repo && (
165
  <div class="meta-container-cell meta-container-cell--repo">
166
  <h3>Code</h3>
 
205
  class="button"
206
  href={`/${pdfFilename}`}
207
  download={pdfFilename}
208
+ data-pdf-warn="true"
209
  aria-label={`Download PDF ${pdfFilename}`}
210
  >
211
  Download PDF
212
  </a>
213
  </p>
214
  <div class="pdf-locked" style="display: none;">
215
+ <a
216
+ class="button button-locked"
217
  href="https://huggingface.co/subscribe/pro"
218
  target="_blank"
219
  rel="noopener noreferrer"
 
227
  </div>
228
  </div>
229
  )}
230
+ </div>
231
+ )}
232
  </div>
233
  </header>
234
 
235
  {showPdf && (
236
  <script is:inline>
237
+ // Pop up a warning modal when the user clicks Download PDF, then proceed
238
+ // with the download only if they confirm.
239
+ (() => {
240
+ const init = () => {
241
+ const modal = document.querySelector(".pdf-modal");
242
+ if (!modal) return;
243
+ let pendingHref = null;
244
+ let pendingDownload = null;
245
+
246
+ const openModal = (href, downloadName) => {
247
+ pendingHref = href;
248
+ pendingDownload = downloadName;
249
+ modal.classList.add("is-open");
250
+ modal.removeAttribute("hidden");
251
+ const confirmBtn = modal.querySelector('[data-pdf-modal-action="confirm"]');
252
+ if (confirmBtn) confirmBtn.focus();
253
+ };
254
+ const closeModal = () => {
255
+ modal.classList.remove("is-open");
256
+ modal.setAttribute("hidden", "");
257
+ pendingHref = null;
258
+ pendingDownload = null;
259
+ };
260
+ const triggerDownload = () => {
261
+ if (!pendingHref) { closeModal(); return; }
262
+ const a = document.createElement("a");
263
+ a.href = pendingHref;
264
+ if (pendingDownload) a.download = pendingDownload;
265
+ document.body.appendChild(a);
266
+ a.click();
267
+ a.remove();
268
+ closeModal();
269
+ };
270
+
271
+ document.addEventListener("click", (e) => {
272
+ const link = e.target.closest('a[data-pdf-warn="true"]');
273
+ if (link) {
274
+ e.preventDefault();
275
+ openModal(link.getAttribute("href"), link.getAttribute("download"));
276
+ return;
277
+ }
278
+ const action = e.target.closest("[data-pdf-modal-action]");
279
+ if (action) {
280
+ const kind = action.getAttribute("data-pdf-modal-action");
281
+ if (kind === "confirm") triggerDownload();
282
+ else closeModal();
283
+ return;
284
+ }
285
+ if (modal.classList.contains("is-open") && !e.target.closest(".pdf-modal__inner")) {
286
+ closeModal();
287
+ }
288
+ });
289
+ document.addEventListener("keydown", (e) => {
290
+ if (e.key === "Escape" && modal.classList.contains("is-open")) closeModal();
291
+ });
292
+ };
293
+ if (document.readyState === "loading") {
294
+ document.addEventListener("DOMContentLoaded", init, { once: true });
295
+ } else {
296
+ init();
297
+ }
298
+ })();
299
+
300
  // PDF access control for Pro users only
301
  const LOCAL_IS_PRO = true;
302
  const FALLBACK_TIMEOUT_MS = 3000;
 
444
  }
445
  .meta-container {
446
  max-width: 980px;
447
+ display: grid;
448
+ grid-template-columns: minmax(0, 2fr) minmax(0, 1fr) minmax(0, 1fr) auto;
449
+ align-items: start;
450
  margin: 0 auto;
451
  padding: 0 var(--content-padding-x);
452
+ gap: 16px 32px;
453
+ }
454
+ @media (max-width: 768px) {
455
+ .meta-container {
456
+ grid-template-columns: 1fr 1fr;
457
+ }
458
  }
459
  .meta-container a:not(.button):not(.repo-button) {
460
  color: var(--primary-color);
 
535
  background: #1a1a1a;
536
  }
537
 
538
+ /* Code + PDF live in the rightmost grid column as a pair, stacked vertically
539
+ so they share the same column width and stay visually grouped. */
540
+ .meta-actions {
541
+ display: flex;
542
+ flex-direction: column;
543
+ gap: 12px;
544
+ align-items: flex-start;
545
+ }
546
+ .meta-actions .meta-container-cell--repo,
547
+ .meta-actions .meta-container-cell--pdf {
548
+ margin: 0;
549
+ }
550
+ @media (max-width: 768px) {
551
+ .meta-actions {
552
+ grid-column: 1 / -1;
553
+ flex-direction: row;
554
+ gap: 24px;
555
+ }
556
+ }
557
  .authors {
558
  margin: 0;
559
  list-style-type: none;
 
575
 
576
  @media (max-width: 768px) {
577
  .meta-container-cell:nth-child(even) { text-align: right; }
 
 
 
 
 
 
578
  }
579
 
580
  @media print {
 
606
  }
607
  .pdf-pro-only { margin: 0; line-height: 0; }
608
  .pdf-pro-only .button { margin: 0; }
609
+
610
+ /* Modal that pops up when the user clicks Download PDF, telling them about
611
+ the animation/table clipping caveat. They confirm to proceed with download. */
612
+ .pdf-modal {
613
+ position: fixed;
614
+ inset: 0;
615
+ background: rgba(0, 0, 0, 0.55);
616
+ display: none;
617
+ align-items: center;
618
+ justify-content: center;
619
+ z-index: 1000;
620
+ padding: 16px;
621
+ }
622
+ .pdf-modal.is-open { display: flex; }
623
+ .pdf-modal__inner {
624
+ background: var(--surface-bg);
625
+ color: var(--text-color);
626
+ border: 1px solid var(--border-color);
627
+ border-radius: 12px;
628
+ box-shadow: 0 20px 50px rgba(0, 0, 0, 0.35);
629
+ max-width: 420px;
630
+ width: 100%;
631
+ padding: 22px 22px 18px 22px;
632
+ }
633
+ .pdf-modal__title {
634
+ margin: 0 0 8px;
635
+ font-size: 15px;
636
+ font-weight: 700;
637
+ }
638
+ .pdf-modal__body {
639
+ margin: 0 0 18px;
640
+ font-size: 13px;
641
+ line-height: 1.5;
642
+ color: var(--muted-color);
643
+ }
644
+ .pdf-modal__actions {
645
+ display: flex;
646
+ gap: 10px;
647
+ justify-content: flex-end;
648
+ }
649
+ .pdf-modal__btn {
650
+ padding: 7px 14px;
651
+ border-radius: 7px;
652
+ font-size: 12.5px;
653
+ font-weight: 600;
654
+ cursor: pointer;
655
+ border: 1px solid var(--border-color);
656
+ background: var(--surface-bg);
657
+ color: var(--text-color);
658
+ }
659
+ .pdf-modal__btn--primary {
660
+ background: #000;
661
+ color: #fff;
662
+ border-color: var(--primary-color);
663
+ }
664
+ .pdf-modal__btn--primary:hover { background: #1a1a1a; }
665
+ @media print {
666
+ .pdf-modal { display: none !important; }
667
+ }
668
  .pro-badge-wrapper {
669
  display: inline-flex;
670
  align-items: center;
app/src/content/article.mdx CHANGED
@@ -35,7 +35,7 @@ repo: "https://github.com/adithya-s-k/RL_Envs_101"
35
  seoThumbImage: "https://raw.githubusercontent.com/adithya-s-k/RL_Envs_101/refs/heads/main/assets/blog_thumbnail.png"
36
  template: "article"
37
  tableOfContentsAutoCollapse: true
38
- showPdf: false
39
  ---
40
 
41
  import Introduction from "./chapters/introduction.mdx";
 
35
  seoThumbImage: "https://raw.githubusercontent.com/adithya-s-k/RL_Envs_101/refs/heads/main/assets/blog_thumbnail.png"
36
  template: "article"
37
  tableOfContentsAutoCollapse: true
38
+ showPdf: true
39
  ---
40
 
41
  import Introduction from "./chapters/introduction.mdx";
app/src/content/chapters/dimensions.mdx CHANGED
@@ -31,7 +31,7 @@ The most fundamental architectural split: does your environment run as a **separ
31
 
32
  The two patterns differ on three things that show up in practice:
33
 
34
- - **Where it runs.** An HTTP framework lives on its own machine, often a cheap CPU box or a Hugging Face Space. The in-process kind shares the training GPU node.
35
  - **What you install on the trainer side.** HTTP only needs an SDK or `requests`. In-process pulls the full framework package into the training venv.
36
  - **How it scales.** HTTP scales by adding server replicas behind a load balancer. In-process scales by adding more identical training workers, each with its own copy of the environment.
37
 
@@ -133,7 +133,7 @@ The practical read: tool-based control is a portable convention across all six f
133
 
134
  *Where do the prompts come from, and what comes with them?*
135
 
136
- Every rollout starts with a task. The model takes that task as input, acts on the environment across the span of the episode, and the environment scores the result against whatever the task said success looked like. The task is what tells the model *what to do this episode*, the prompt, the input data it operates on, and (for scoring) the expected answer or test that decides whether it succeeded. This is the most varied dimension after reward, the six frameworks land on six different answers for where that task comes from. Some bundle a dataset (Verifiers ships HF `Dataset` integration, GEM has a registry of 24+ built-in environments). Some put the task store on the server (ORS exposes `list_tasks(split)`). Some preprocess JSONL through a CLI (NeMo Gym's `ng_prepare_data`). And two leave it to you (OpenEnv, SkyRL Gym). The cards below trace each path from source to environment.
137
 
138
  <HtmlEmbed src="d3-task-flow.html" frameless />
139
 
@@ -151,7 +151,7 @@ These bundles usually live behind the dataset row, in S3, an HF dataset repo, or
151
 
152
  Frameworks split on how strict the task spec is, and that strictness is what lets a task hop between training runs without rewiring.
153
 
154
- - **Coupled.** [Verifiers](https://github.com/PrimeIntellect-ai/verifiers/blob/main/docs/environments.md) expects an HF `Dataset` with a `prompt` column and optional `answer` or `info` columns; GEM ships built-in environments with their own loaders; ORS and NeMo Gym pin the schema on the server side. The [Environments Hub](https://www.primeintellect.ai/blog/environments) and [OpenReward](https://openreward.ai) go further and standardise the whole package, the layout, the scoring contract, even the wheel-based packaging, so any task that fits the spec runs in any environment that follows it.
155
  - **BYO.** OpenEnv and SkyRL Gym leave the dataset up to you. Prompts come in from any source, the environment doesn't look at the schema, but every new source costs a little integration.
156
 
157
  > **Note: who owns the data transformation.** Coupling means the environment dictates the spec and you transform your raw data to fit. Concretely:
@@ -230,7 +230,7 @@ Once you leave your laptop, the question is who lives where. HTTP frameworks let
230
 
231
  *How do environments scale from development to production, and what are the concurrency limits?*
232
 
233
- RL training generates multiple rollouts per prompt, ideally in parallel, which means interacting with many environments simultaneously. In GRPO specifically, that's `num_generations` (typically 4-16) environments per prompt across the batch: with 64 prompts and `num_generations=8`, you have 512 concurrent environment instances per step. This section covers how the two deployment models handle that.
234
 
235
  #### Two scaling models
236
 
@@ -247,7 +247,7 @@ Beyond orchestration, two things stay constant:
247
 
248
  #### Benchmark results: how containerized environment services scale
249
 
250
- The [openenv-scaling benchmark](https://github.com/burtenshaw/openenv-scaling) tested an environment deployed as a FastAPI server in a Docker container, across five infrastructure configurations. OpenEnv, ORS, and NeMo Gym all follow the same shape, a FastAPI app holding per-session state, packaged in the same image used for HF Spaces, so these numbers are broadly representative of any environment deployed as a containerized service. The benchmark itself runs OpenEnv's WebSocket mode; the per-protocol differences (WS / SSE / REST) matter less than the container-and-load-balancer story.
251
 
252
  Maximum concurrent environments at ≥95% success rate (`wait=1.0s`):
253
 
@@ -279,7 +279,7 @@ The multi-node p99 reflects connection queuing at 16,384 concurrent sessions acr
279
 
280
  1. **Docker adds no meaningful overhead**: Local Docker and uvicorn reach the same 2,048 max batch.
281
  2. **Load balancing configuration matters**: Before fixing Envoy, multi-node achieved only 128 max batch. After: 16,384 (128x improvement).
282
- 3. **HF Spaces caps at ~128 concurrent sessions**: sufficient for development and demos.
283
  4. **The server is rarely the bottleneck**: even a laptop handles 2,048 sessions. The execution backend (sandbox creation, tool execution) dominates per-step latency regardless of framework.
284
  5. **Horizontal scaling is a load-balancer config problem, not a protocol problem**: the 128 → 16,384 jump came from fixing Envoy's settings, not from changing the wire format. Sticky sessions (which WebSocket forces) make this harder to load-balance; for designs targeting thousands of envs, a stateless-per-request shape with a session ID has fewer footguns.
285
 
 
31
 
32
  The two patterns differ on three things that show up in practice:
33
 
34
+ - **Where it runs.** An HTTP framework lives on its own machine, often a cheap CPU box or a [Hugging Face Space](https://huggingface.co/spaces). The in-process kind shares the training GPU node.
35
  - **What you install on the trainer side.** HTTP only needs an SDK or `requests`. In-process pulls the full framework package into the training venv.
36
  - **How it scales.** HTTP scales by adding server replicas behind a load balancer. In-process scales by adding more identical training workers, each with its own copy of the environment.
37
 
 
133
 
134
  *Where do the prompts come from, and what comes with them?*
135
 
136
+ Every rollout starts with a task. The model takes that task as input, acts on the environment across the span of the episode, and the environment scores the result against whatever the task said success looked like. The task is what tells the model *what to do this episode*: the prompt, the input data it operates on, and (for scoring) the expected answer or test that decides whether it succeeded. This is the most varied dimension after reward: the six frameworks land on six different answers for where that task comes from. Some bundle a dataset (Verifiers ships [HF `Dataset`](https://huggingface.co/docs/datasets) integration, GEM has a registry of 24+ built-in environments). Some put the task store on the server (ORS exposes `list_tasks(split)`). Some preprocess JSONL through a CLI (NeMo Gym's `ng_prepare_data`). And two leave it to you (OpenEnv, SkyRL Gym). The cards below trace each path from source to environment.
137
 
138
  <HtmlEmbed src="d3-task-flow.html" frameless />
139
 
 
151
 
152
  Frameworks split on how strict the task spec is, and that strictness is what lets a task hop between training runs without rewiring.
153
 
154
+ - **Coupled.** [Verifiers](https://github.com/PrimeIntellect-ai/verifiers/blob/main/docs/environments.md) expects an [HF `Dataset`](https://huggingface.co/docs/datasets) with a `prompt` column and optional `answer` or `info` columns; GEM ships built-in environments with their own loaders; ORS and NeMo Gym pin the schema on the server side. The [Environments Hub](https://www.primeintellect.ai/blog/environments) and [OpenReward](https://openreward.ai) go further and standardise the whole package (the layout, the scoring contract, even the wheel-based packaging), so any task that fits the spec runs in any environment that follows it.
155
  - **BYO.** OpenEnv and SkyRL Gym leave the dataset up to you. Prompts come in from any source, the environment doesn't look at the schema, but every new source costs a little integration.
156
 
157
  > **Note: who owns the data transformation.** Coupling means the environment dictates the spec and you transform your raw data to fit. Concretely:
 
230
 
231
  *How do environments scale from development to production, and what are the concurrency limits?*
232
 
233
+ RL training generates multiple rollouts per prompt, ideally in parallel, which means interacting with many environments simultaneously. In [GRPO](https://huggingface.co/docs/trl/main/en/grpo_trainer) specifically, that's `num_generations` (typically 4-16) environments per prompt across the batch: with 64 prompts and `num_generations=8`, you have 512 concurrent environment instances per step. This section covers how the two deployment models handle that.
234
 
235
  #### Two scaling models
236
 
 
247
 
248
  #### Benchmark results: how containerized environment services scale
249
 
250
+ The [openenv-scaling benchmark](https://github.com/burtenshaw/openenv-scaling) tested an environment deployed as a FastAPI server in a Docker container, across five infrastructure configurations. OpenEnv, ORS, and NeMo Gym all follow the same shape (a FastAPI app holding per-session state, packaged in the same image used for [HF Spaces](https://huggingface.co/spaces)), so these numbers are broadly representative of any environment deployed as a containerized service. The benchmark itself runs OpenEnv's WebSocket mode; the per-protocol differences (WS / SSE / REST) matter less than the container-and-load-balancer story.
251
 
252
  Maximum concurrent environments at ≥95% success rate (`wait=1.0s`):
253
 
 
279
 
280
  1. **Docker adds no meaningful overhead**: Local Docker and uvicorn reach the same 2,048 max batch.
281
  2. **Load balancing configuration matters**: Before fixing Envoy, multi-node achieved only 128 max batch. After: 16,384 (128x improvement).
282
+ 3. **[HF Spaces](https://huggingface.co/spaces) caps at ~128 concurrent sessions**: sufficient for development and demos, and convenient since it's also the largest community catalog of pre-built environments to start from.
283
  4. **The server is rarely the bottleneck**: even a laptop handles 2,048 sessions. The execution backend (sandbox creation, tool execution) dominates per-step latency regardless of framework.
284
  5. **Horizontal scaling is a load-balancer config problem, not a protocol problem**: the 128 → 16,384 jump came from fixing Envoy's settings, not from changing the wire format. Sticky sessions (which WebSocket forces) make this harder to load-balance; for designs targeting thousands of envs, a stateless-per-request shape with a session ID has fewer footguns.
285
 
app/src/content/chapters/framework-inventory.mdx CHANGED
@@ -18,7 +18,7 @@ These are notable RL environment frameworks we evaluated but did not implement.
18
 
19
  | Framework | Creator | Why excluded |
20
  | --- | --- | --- |
21
- | [**Atropos**](https://github.com/NousResearch/atropos) | Nous Research | Different paradigm, environments own inference and POST scored batches to a central API. Not compatible with TRL's turn-by-turn tool calling. |
22
  | [**Harbor**](https://github.com/laude-institute/harbor) | Laude Institute | Eval and RL rollout-generation framework, the official harness for Terminal-Bench 2.0. Runs autonomous agent harnesses (Claude Code, Codex CLI, OpenHands) in parallel containers via Daytona / Modal, the agent drives the loop end-to-end inside the sandbox and emits trajectories. |
23
  | [**RLVE**](https://github.com/Zhiyuan-Zeng/RLVE) | Zhiyuan Zeng | Pure verifier library (445 tasks), `generate() → verify()` with no transport, no tools, no state. Not an environment framework, just problem oracles. |
24
  | [**Reasoning Gym**](https://github.com/open-thought/reasoning-gym) | Open Thought | Procedural task generators + verifiers, same tier as RLVE. Stateless, no multi-turn, no tools. |
 
18
 
19
  | Framework | Creator | Why excluded |
20
  | --- | --- | --- |
21
+ | [**Atropos**](https://github.com/NousResearch/atropos) | Nous Research | Different paradigm: environments own inference and POST scored batches to a central API. Not compatible with [TRL](https://huggingface.co/docs/trl)'s turn-by-turn tool calling. |
22
  | [**Harbor**](https://github.com/laude-institute/harbor) | Laude Institute | Eval and RL rollout-generation framework, the official harness for Terminal-Bench 2.0. Runs autonomous agent harnesses (Claude Code, Codex CLI, OpenHands) in parallel containers via Daytona / Modal, the agent drives the loop end-to-end inside the sandbox and emits trajectories. |
23
  | [**RLVE**](https://github.com/Zhiyuan-Zeng/RLVE) | Zhiyuan Zeng | Pure verifier library (445 tasks), `generate() → verify()` with no transport, no tools, no state. Not an environment framework, just problem oracles. |
24
  | [**Reasoning Gym**](https://github.com/open-thought/reasoning-gym) | Open Thought | Procedural task generators + verifiers, same tier as RLVE. Stateless, no multi-turn, no tools. |
app/src/content/chapters/introduction.mdx CHANGED
@@ -13,7 +13,7 @@ The Qwen team is explicit about why this matters. In the [Qwen3.5 release notes]
13
 
14
  ![Qwen3.5 RL environment scaling](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3.5/Figures/qwen3.5_397b_a17b_scaling.png)
15
 
16
- The bottleneck is no longer "can we set up an environment", it's "how do we run 100,000 of them, keep them honest, and feed them into a training loop". Frameworks are emerging to standardise that, and environment hubs are showing up alongside them where pre-built environments can be plugged into a run. The anatomy of an RL environment, what it's actually made of, has stopped being obvious and started being important.
17
 
18
  We built the same environments across multiple [RL environment frameworks](#framework-inventory). Each has a different design for what an environment should look like, what it's composed of, and how it fits into the rest of training. We wanted to understand what components make up an RL environment in the LLM era, how they're built, how different frameworks tackle the same problems, how rewards are wired into the loop, how easy it is to scale, and how the environment fits into the overall RL training run.
19
 
 
13
 
14
  ![Qwen3.5 RL environment scaling](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3.5/Figures/qwen3.5_397b_a17b_scaling.png)
15
 
16
+ The bottleneck is no longer "can we set up an environment"; it's "how do we run 100,000 of them, keep them honest, and feed them into a training loop". Frameworks are emerging to standardise that, and environment hubs are showing up alongside them where pre-built environments can be plugged into a run. The largest catalog today sits on [Hugging Face Spaces](https://huggingface.co/spaces), with 4k+ MCP-compatible environments shipped by the community; [PrimeIntellect's Environments Hub](https://www.primeintellect.ai/blog/environments) and [openreward.ai](https://openreward.ai) add several thousand more. The anatomy of an RL environment, what it's actually made of, has stopped being obvious and started being important.
17
 
18
  We built the same environments across multiple [RL environment frameworks](#framework-inventory). Each has a different design for what an environment should look like, what it's composed of, and how it fits into the rest of training. We wanted to understand what components make up an RL environment in the LLM era, how they're built, how different frameworks tackle the same problems, how rewards are wired into the loop, how easy it is to scale, and how the environment fits into the overall RL training run.
19
 
app/src/content/chapters/why-comparison.mdx CHANGED
@@ -3,8 +3,8 @@
3
  There is no standard protocol for how LLMs interact with RL environments yet. Each framework picks its own answer for the same handful of questions, and the answers shape how you write code, how you deploy, and what you have to debug when training breaks. The four that mattered most while we were building the same env six ways:
4
 
5
  - **What is an "environment"?** Some frameworks treat it as just a reward function, others include tools, state management, and the full multi-turn loop, others again bundle a whole training pipeline.
6
- - **Where does it run?** Some run as HTTP servers (Docker, HF Spaces) so the env scales independently from training, others run in-process inside the training venv so there's no network hop but no isolation either.
7
- - **How much trainer comes with it?** A few frameworks ship their own trainer (Prime RL, NeMo RL, SkyRL); others require adapters to plug into external training loops like TRL.
8
  - **When does the reward fire?** Per-tool-call, per-step rubric, post-episode verify, or an external scoring function; each makes different assumptions about how dense the signal is and who owns the scoring code.
9
 
10
  The rest of this article walks through these and a handful of related questions, framework by framework, with side-by-side code, benchmark numbers, and a decision tree at the end if you just want a recommendation.
 
3
  There is no standard protocol for how LLMs interact with RL environments yet. Each framework picks its own answer for the same handful of questions, and the answers shape how you write code, how you deploy, and what you have to debug when training breaks. The four that mattered most while we were building the same env six ways:
4
 
5
  - **What is an "environment"?** Some frameworks treat it as just a reward function, others include tools, state management, and the full multi-turn loop, others again bundle a whole training pipeline.
6
+ - **Where does it run?** Some run as HTTP servers (Docker, [HF Spaces](https://huggingface.co/spaces)) so the env scales independently from training; others run in-process inside the training venv so there's no network hop but no isolation either.
7
+ - **How much trainer comes with it?** A few frameworks ship their own trainer (Prime RL, NeMo RL, SkyRL); others require adapters to plug into external training loops like [TRL](https://github.com/huggingface/trl).
8
  - **When does the reward fire?** Per-tool-call, per-step rubric, post-episode verify, or an external scoring function; each makes different assumptions about how dense the signal is and who owns the scoring code.
9
 
10
  The rest of this article walks through these and a handful of related questions, framework by framework, with side-by-side code, benchmark numbers, and a decision tree at the end if you just want a recommendation.