sergiopaniego HF Staff commited on
Commit
07fffa0
·
verified ·
1 Parent(s): 9d67534

Upload folder using huggingface_hub

Browse files
Files changed (13) hide show
  1. Dockerfile +81 -0
  2. README.md +442 -4
  3. __init__.py +78 -0
  4. client.py +464 -0
  5. models.py +109 -0
  6. openenv.yaml +6 -0
  7. prompts.py +373 -0
  8. pyproject.toml +43 -0
  9. server/__init__.py +19 -0
  10. server/app.py +87 -0
  11. server/python_executor.py +327 -0
  12. server/repl_environment.py +512 -0
  13. uv.lock +0 -0
Dockerfile ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Multi-stage build using openenv-base
# This Dockerfile is flexible and works for both:
# - In-repo environments (with local src/core)
# - Standalone environments (with openenv from pip)
# The build script (openenv build) handles context detection and sets appropriate build args.

ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

WORKDIR /app

# Build argument to control whether we're building standalone or in-repo
ARG BUILD_MODE=in-repo
ARG ENV_NAME=repl_env

# Copy environment code (always at root of build context)
COPY . /app/env

# For in-repo builds, openenv-core is already in the pyproject.toml dependencies
# For standalone builds, openenv-core will be installed from pip via pyproject.toml
WORKDIR /app/env

# Ensure uv is available (for local builds where base image lacks it)
RUN if ! command -v uv >/dev/null 2>&1; then \
        curl -LsSf https://astral.sh/uv/install.sh | sh && \
        mv /root/.local/bin/uv /usr/local/bin/uv && \
        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
    fi

# Install git for building from git repos (build-time only)
RUN apt-get update && apt-get install -y --no-install-recommends \
        git \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies using uv sync
# If uv.lock exists, use it (--frozen = fail if lock is out of date);
# otherwise resolve on the fly.
# First pass: --no-install-project installs only third-party dependencies,
# so this layer is reusable across builds that only change project code.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-install-project --no-editable; \
    else \
        uv sync --no-install-project --no-editable; \
    fi

# Second pass: same sync without --no-install-project, which installs the
# project itself on top of the already-populated dependency layer.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-editable; \
    else \
        uv sync --no-editable; \
    fi

# Final runtime stage
FROM ${BASE_IMAGE}

WORKDIR /app

# Copy the virtual environment from builder
COPY --from=builder /app/env/.venv /app/.venv

# Copy the environment code
COPY --from=builder /app/env /app/env

# Set PATH to use the virtual environment
ENV PATH="/app/.venv/bin:$PATH"

# Set PYTHONPATH so imports work correctly
ENV PYTHONPATH="/app/env:$PYTHONPATH"

# Health check using Python (more portable than curl/wget)
# NOTE(review): assumes the server exposes /health on port 8000 — confirm in server/app.py
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1

# Run the FastAPI server
# The module path is constructed to work with the /app/env structure
ENV ENABLE_WEB_INTERFACE=true
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
README.md CHANGED
@@ -1,10 +1,448 @@
1
  ---
2
- title: Repl
3
- emoji: 👀
4
- colorFrom: gray
5
  colorTo: indigo
6
  sdk: docker
7
  pinned: false
 
 
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: REPL Environment Server
3
+ emoji: 🎮
4
+ colorFrom: yellow
5
  colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
+ app_port: 8000
9
+ base_path: /web
10
+ tags:
11
+ - openenv
12
  ---
13
 
14
+ # REPL Environment for OpenEnv
15
+
16
+ A Python REPL environment for training language models on code execution tasks, based on the [Recursive Language Models (RLM)](https://arxiv.org/abs/2512.24601) paradigm.
17
+
18
+ ## Overview
19
+
20
+ The RLM paradigm allows language models to:
21
+ - Execute Python code in a sandboxed REPL environment
22
+ - Make recursive calls to themselves or other LMs via `llm_query()` / `llm_query_batched()`
23
+ - Handle near-infinite context by programmatically decomposing and exploring data
24
+ - Terminate with explicit `FINAL(answer)` or `answer = {"content": ..., "ready": True}` signals
25
+
26
+ ## Features
27
+
28
+ - **Unified API**: Same `REPLEnv` class works for both local and remote execution
29
+ - **Sandboxed Python Execution**: Safe code execution with restricted builtins
30
+ - **Context Loading**: Load large contexts that agents can explore programmatically
31
+ - **Multiple Finalization Patterns**:
32
+ - Direct call: `FINAL(answer)` - helper function injected into namespace
33
+ - Print pattern: `print('FINAL(answer)')` or `print('FINAL_VAR(var_name)')`
34
+ - Prime Intellect style: `answer = {"content": "...", "ready": True}`
35
+ - **Iteration Limits**: Configurable maximum steps per episode
36
+ - **Reward Signals**: Customizable reward functions for RL training
37
+ - **Optional LLM Oracle**: Can enable `llm_query()` and `llm_query_batched()` for recursive calls
38
+
39
+ ## Quick Start
40
+
41
+ ### Local Mode (No Server Required)
42
+
43
+ ```python
44
+ from repl_env import REPLEnv
45
+
46
+ # Create environment - runs locally by default
47
+ with REPLEnv() as env:
48
+ result = env.reset(
49
+ context="This is a large document with lots of text...",
50
+ task_prompt="Find the word count"
51
+ )
52
+
53
+ # Execute code iteratively
54
+ result = env.execute("words = context.split()")
55
+ result = env.execute("count = len(words)")
56
+ result = env.execute("print(f'FINAL({count})')")
57
+
58
+ print(f"Done: {result.done}")
59
+ print(f"Final Answer: {env.state().final_answer}")
60
+ ```
61
+
62
+ ### Remote Server Mode
63
+
64
+ ```python
65
+ from repl_env import REPLEnv
66
+
67
+ # Connect to a running server - same API!
68
+ with REPLEnv(base_url="https://my-server.hf.space") as env:
69
+ result = env.reset(context="...", task_prompt="...")
70
+ result = env.execute("count = len(context)")
71
+ result = env.execute("print(f'FINAL({count})')")
72
+ ```
73
+
74
+ ### Local Mode with LLM Support
75
+
76
+ ```python
77
+ from repl_env import REPLEnv
78
+
79
+ def my_llm_query(prompt: str) -> str:
80
+ return your_llm.generate(prompt)
81
+
82
+ def my_llm_query_batched(prompts: list[str]) -> list[str]:
83
+ return [my_llm_query(p) for p in prompts]
84
+
85
+ # Pass LLM functions for recursive calls
86
+ with REPLEnv(llm_query_fn=my_llm_query, llm_batch_fn=my_llm_query_batched) as env:
87
+ result = env.reset(context=large_document, task_prompt="Summarize this")
88
+
89
+ # Now the executed code can use llm_query() and llm_query_batched()!
90
+ result = env.execute("summary = llm_query('Summarize: ' + context[:1000])")
91
+ ```
92
+
93
+ ### From Docker or HuggingFace Hub
94
+
95
+ ```python
96
+ from repl_env import REPLEnv
97
+
98
+ # Start from Docker image
99
+ env = REPLEnv.from_docker_image("repl-env:latest")
100
+
101
+ # Or from HuggingFace Hub
102
+ env = REPLEnv.from_hub("openenv/repl-env")
103
+ ```
104
+
105
+ ## API Reference
106
+
107
+ ### REPLEnv
108
+
109
+ ```python
110
+ class REPLEnv:
111
+ def __init__(
112
+ self,
113
+ base_url: str | None = None, # Server URL (None = local mode)
114
+ *,
115
+ # Local-only options
116
+ llm_query_fn: Callable | None = None, # Function for llm_query()
117
+ llm_batch_fn: Callable | None = None, # Function for llm_query_batched()
118
+ max_output_length: int = 8192, # Max stdout/stderr chars
119
+ context_preview_length: int = 500, # Chars in context preview
120
+ reward_on_success: float = 1.0, # Reward on FINAL()
121
+ reward_on_iteration: float = 0.0, # Reward per step
122
+ reward_on_failure: float = -0.1, # Reward on max iterations
123
+ reward_on_error: float = -0.05, # Reward on execution error
124
+ # Remote-only options
125
+ connect_timeout_s: float = 10.0,
126
+ message_timeout_s: float = 60.0,
127
+ ): ...
128
+
129
+ def reset(
130
+ self,
131
+ *,
132
+ context: str = "", # Text to analyze (as `context` variable)
133
+ task_prompt: str = "", # Task description
134
+ max_iterations: int = 30, # Max code execution steps
135
+ seed: int | None = None, # Random seed
136
+ episode_id: str | None = None, # Custom episode ID
137
+ hf_token: str | None = None, # HF token for llm_query (remote mode)
138
+ llm_model: str | None = None, # Model for llm_query (remote mode)
139
+ ) -> StepResult[REPLObservation]: ...
140
+
141
+ def execute(self, code: str) -> StepResult[REPLObservation]: ...
142
+ def step(self, action: REPLAction) -> StepResult[REPLObservation]: ...
143
+ def submit_final_answer(self, answer: str) -> StepResult[REPLObservation]: ...
144
+ def state(self) -> REPLState: ...
145
+ def close(self) -> None: ...
146
+ ```
147
+
148
+ ### Action Space
149
+
150
+ ```python
151
+ class REPLAction:
152
+ code: str = "" # Python code to execute
153
+ is_final: bool = False # Whether this signals the final answer
154
+ final_answer: str | None = None # The final answer (if is_final=True)
155
+ ```
156
+
157
+ ### Observation Space
158
+
159
+ ```python
160
+ class REPLObservation:
161
+ result: CodeBlockResult # Execution result (stdout, stderr, etc.)
162
+ context_preview: str | None # First 500 chars of context
163
+ context_length: int # Total context length
164
+ available_variables: list # Variables in namespace
165
+ iteration: int # Current iteration
166
+ max_iterations: int # Max iterations
167
+ done: bool # Episode complete?
168
+ reward: float # Step reward
169
+ metadata: dict # Additional info (final_answer, etc.)
170
+ ```
171
+
172
+ ## Finalization Patterns
173
+
174
+ ### Pattern 1: Direct FINAL() call (recommended)
175
+ ```python
176
+ result = env.execute("answer = 42")
177
+ result = env.execute("FINAL(answer)")
178
+ # -> done=True, final_answer="42"
179
+ ```
180
+
181
+ ### Pattern 2: FINAL() via print
182
+ ```python
183
+ result = env.execute("answer = 42")
184
+ result = env.execute("print(f'FINAL({answer})')")
185
+ # -> done=True, final_answer="42"
186
+ ```
187
+
188
+ ### Pattern 3: FINAL_VAR() for variable reference
189
+ ```python
190
+ result = env.execute("my_result = 'The answer is 42'")
191
+ # Direct call (recommended) - pass variable name as string
192
+ # FINAL_VAR looks up the variable and returns FINAL(value)
193
+ result = env.execute('FINAL_VAR("my_result")')
194
+ # -> done=True, final_answer="The answer is 42"
195
+
196
+ # Also works via print (for regex detection)
197
+ result = env.execute("print('FINAL_VAR(my_result)')")
198
+ # -> done=True, final_answer="The answer is 42"
199
+ ```
200
+
201
+ ### Pattern 4: Prime Intellect style answer dict
202
+ ```python
203
+ result = env.execute("answer['content'] = '42'")
204
+ result = env.execute("answer['ready'] = True")
205
+ # -> done=True, final_answer="42"
206
+ ```
207
+
208
+ ## Prompts Module
209
+
210
+ The `prompts` module provides RLM-style prompts and parsing utilities:
211
+
212
+ ```python
213
+ from repl_env.prompts import (
214
+ # System prompts (from official RLM repo)
215
+ RLM_SYSTEM_PROMPT, # Base prompt with llm_query_batched
216
+ RLM_SYSTEM_PROMPT_QWEN, # For Qwen models (adds cost warning)
217
+
218
+ # Prompt building
219
+ QueryMetadata, # Context metadata dataclass
220
+ build_rlm_system_prompt, # Build system messages with metadata
221
+ build_user_prompt, # Build user prompt for each iteration
222
+ build_initial_prompt, # Convenience wrapper for iteration 0
223
+
224
+ # Parsing utilities
225
+ extract_code_blocks, # Extract code from ```repl``` or ```python``` blocks
226
+ format_observation, # Format execution result for LLM
227
+ )
228
+
229
+ # Example: Build messages using official RLM style
230
+ query_metadata = QueryMetadata(
231
+ context_lengths=[len(context)],
232
+ context_total_length=len(context),
233
+ context_type="str",
234
+ )
235
+ messages = build_rlm_system_prompt(RLM_SYSTEM_PROMPT_QWEN, query_metadata)
236
+ messages.append(build_user_prompt(root_prompt="Count words in the context", iteration=0))
237
+
238
+ # Extract code from LLM response (supports ```repl``` and ```python```)
239
+ response = "Here's my solution:\n```repl\ncount = len(context.split())\nFINAL(count)\n```"
240
+ code_blocks = extract_code_blocks(response) # ["count = len(context.split())\nFINAL(count)"]
241
+ ```
242
+
243
+ ## Examples
244
+
245
+ See the `examples/` directory for complete working examples:
246
+
247
+ - **`examples/repl_with_llm.py`** - Full RLM loop with local Qwen model
248
+ - **`examples/repl_oolong_simple.py`** - RLM on Oolong benchmark with HuggingFace Inference API
249
+
250
+ Run examples:
251
+ ```bash
252
+ # Full RLM example with local model (requires GPU)
253
+ python examples/repl_with_llm.py
254
+
255
+ # Oolong benchmark with HF Inference API (requires HF_TOKEN)
256
+ python examples/repl_oolong_simple.py
257
+ ```
258
+
259
+ ## Model Usage
260
+
261
+ ### Inference Loop
262
+
263
+ A typical model inference loop where the LLM generates code and the environment executes it:
264
+
265
+ ```python
266
+ from repl_env import REPLEnv
267
+ from repl_env.prompts import RLM_SYSTEM_PROMPT, build_initial_prompt, extract_code_blocks, format_observation
268
+
269
+ # Works with both local and remote!
270
+ with REPLEnv(base_url="http://localhost:8000") as env: # or REPLEnv() for local
271
+ result = env.reset(
272
+ context="The quick brown fox jumps over the lazy dog. " * 1000,
273
+ task_prompt="Count how many times 'fox' appears"
274
+ )
275
+
276
+ messages = [
277
+ {"role": "system", "content": RLM_SYSTEM_PROMPT},
278
+ {"role": "user", "content": build_initial_prompt(
279
+ task_prompt="Count how many times 'fox' appears",
280
+ context_length=result.observation.context_length,
281
+ context_preview=result.observation.context_preview,
282
+ variables=result.observation.available_variables,
283
+ )},
284
+ ]
285
+
286
+ while not result.done:
287
+ # Get code from LLM
288
+ response = your_llm.chat(messages)
289
+ code_blocks = extract_code_blocks(response)
290
+
291
+ for code in code_blocks:
292
+ result = env.execute(code)
293
+ if result.done:
294
+ break
295
+
296
+ # Update conversation
297
+ messages.append({"role": "assistant", "content": response})
298
+ messages.append({"role": "user", "content": format_observation(result.observation)})
299
+
300
+ print(f"Final answer: {env.state().final_answer}")
301
+ ```
302
+
303
+ ### Recursive LLM Calls (RLM Paradigm)
304
+
305
+ The key insight of RLM is that models can make recursive calls to themselves or other LLMs from within the code:
306
+
307
+ ```python
308
+ from repl_env import REPLEnv
309
+
310
+ def llm_query(prompt: str) -> str:
311
+ """Single LLM call - model can call this from executed code"""
312
+ return your_llm.generate(prompt)
313
+
314
+ def llm_query_batched(prompts: list[str]) -> list[str]:
315
+ """Batch LLM calls for efficiency (parallel in production)"""
316
+ return [your_llm.generate(p) for p in prompts]
317
+
318
+ # Create environment with LLM oracle (local mode)
319
+ with REPLEnv(llm_query_fn=llm_query, llm_batch_fn=llm_query_batched) as env:
320
+ result = env.reset(
321
+ context=massive_document, # Could be 100K+ chars
322
+ task_prompt="Summarize each section and find key themes"
323
+ )
324
+
325
+ # The model can now generate code like this:
326
+ code = """
327
+ # Split document into sections
328
+ sections = context.split('\\n\\n')
329
+
330
+ # Use LLM to summarize each section (recursive call!)
331
+ summaries = llm_query_batched([f"Summarize: {s[:1000]}" for s in sections[:10]])
332
+
333
+ # Combine summaries
334
+ combined = '\\n'.join(summaries)
335
+
336
+ # Final synthesis using another LLM call
337
+ answer['content'] = llm_query(f"Find key themes in: {combined}")
338
+ answer['ready'] = True
339
+ """
340
+
341
+ result = env.execute(code)
342
+ print(f"Done: {result.done}, Answer: {env.state().final_answer}")
343
+ ```
344
+
345
+ ### RL Training Integration
346
+
347
+ For RL training, integrate with frameworks like TRL, prime-rl, or verifiers:
348
+
349
+ ```python
350
+ from repl_env import REPLEnv
351
+
352
+ def collect_trajectory(env, policy, context, task):
353
+ """Collect a single trajectory for RL training"""
354
+ result = env.reset(context=context, task_prompt=task)
355
+
356
+ trajectory = []
357
+ total_reward = 0
358
+
359
+ while not result.done:
360
+ # Policy generates code
361
+ code = policy.generate(result.observation)
362
+
363
+ # Step environment
364
+ next_result = env.execute(code)
365
+
366
+ # Store transition
367
+ trajectory.append({
368
+ "observation": result.observation,
369
+ "action": code,
370
+ "reward": next_result.reward,
371
+ "next_observation": next_result.observation,
372
+ "done": next_result.done,
373
+ })
374
+
375
+ total_reward += next_result.reward
376
+ result = next_result
377
+
378
+ return trajectory, total_reward
379
+
380
+ # Training loop
381
+ with REPLEnv(
382
+ reward_on_success=1.0,
383
+ reward_on_iteration=0.0,
384
+ reward_on_error=-0.05,
385
+ reward_on_failure=-0.1,
386
+ ) as env:
387
+ for epoch in range(num_epochs):
388
+ for context, task, ground_truth in dataset:
389
+ trajectory, reward = collect_trajectory(env, policy, context, task)
390
+
391
+ # Verify answer correctness (optional external reward)
392
+ if trajectory:
393
+ final_answer = env.state().final_answer
394
+ if final_answer == ground_truth:
395
+ reward += verification_bonus
396
+
397
+ # Update policy (use your RL framework - PPO, GRPO, DPO, etc.)
398
+ policy.update(trajectory, reward)
399
+ ```
400
+
401
+ ### Reward Configuration
402
+
403
+ Configure rewards for different outcomes:
404
+
405
+ ```python
406
+ env = REPLEnv(
407
+ reward_on_success=1.0, # When FINAL() is called
408
+ reward_on_iteration=0.0, # Per step (can be negative to encourage efficiency)
409
+ reward_on_error=-0.05, # When code execution fails
410
+ reward_on_failure=-0.1, # When max iterations reached without answer
411
+ )
412
+ ```
413
+
414
+ ## Environment Configuration
415
+
416
+ | Environment Variable | Description | Default |
417
+ |---------------------|-------------|---------|
418
+ | `REPL_CONTEXT` | Initial context to load | "" |
419
+ | `REPL_TASK_PROMPT` | Task description | "" |
420
+ | `REPL_MAX_ITERATIONS` | Max steps per episode | 30 |
421
+ | `HF_TOKEN` | HuggingFace token for llm_query (server fallback) | None |
422
+ | `LLM_MODEL` | Model for llm_query/llm_query_batched | Qwen/Qwen3-Coder-480B-A35B-Instruct |
423
+
424
+ ## Running the Server
425
+
426
+ ### Using UV
427
+ ```bash
428
+ cd envs/repl_env
429
+ uv run --project . server
430
+ ```
431
+
432
+ ### Using Docker
433
+ ```bash
434
+ docker build -t repl-env:latest .
435
+ docker run -p 8000:8000 repl-env:latest
436
+ ```
437
+
438
+ ### Testing
439
+ ```bash
440
+ pytest tests/
441
+ ```
442
+
443
+ ## References
444
+
445
+ - [RLM Paper (arXiv:2512.24601)](https://arxiv.org/abs/2512.24601)
446
+ - [RLM Implementation](https://github.com/alexzhang13/rlm)
447
+ - [Alex Zhang's RLM Blog](https://alexzhang13.github.io/blog/2025/rlm/)
448
+ - [Prime Intellect RLM Blog](https://www.primeintellect.ai/blog/rlm)
__init__.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ REPL Environment for OpenEnv.
9
+
10
+ A Python REPL environment for training language models on code execution tasks,
11
+ based on the Recursive Language Models (RLM) paradigm.
12
+
13
+ This environment allows language models to:
14
+ - Execute Python code in a sandboxed REPL
15
+ - Work with large contexts loaded as variables
16
+ - Finalize answers via FINAL(), FINAL_VAR(), or answer dict pattern
17
+ - Optionally make recursive LLM calls via llm_query() / llm_query_batched()
18
+
19
+ Example:
20
+ >>> from repl_env import REPLEnv, REPLAction
21
+ >>>
22
+ >>> # Start from Docker
23
+ >>> env = REPLEnv.from_docker_image("repl-env:latest")
24
+ >>>
25
+ >>> # Reset with context
26
+ >>> result = env.reset(context="Hello World", task_prompt="Count characters")
27
+ >>>
28
+ >>> # Execute code
29
+ >>> result = env.execute("count = len(context)")
30
+ >>> result = env.execute("print(f'FINAL({count})')")
31
+ >>>
32
+ >>> # Check result
33
+ >>> print(f"Done: {result.done}, Answer: {result.observation.metadata['final_answer']}")
34
+ >>>
35
+ >>> env.close()
36
+
37
+ References:
38
+ - RLM Paper: https://arxiv.org/abs/2512.24601
39
+ - Prime Intellect Blog: https://www.primeintellect.ai/blog/rlm
40
+ - Alex Zhang Blog: https://alexzhang13.github.io/blog/2025/rlm/
41
+ """
42
+
43
+ from .models import REPLAction, REPLObservation, REPLState, CodeBlockResult
44
+ from .client import REPLEnv
45
+ from .prompts import (
46
+ # System prompts
47
+ RLM_SYSTEM_PROMPT,
48
+ RLM_SYSTEM_PROMPT_QWEN,
49
+ # Prompt building
50
+ QueryMetadata,
51
+ build_rlm_system_prompt,
52
+ build_user_prompt,
53
+ build_initial_prompt,
54
+ # Parsing utilities
55
+ extract_code_blocks,
56
+ format_observation,
57
+ )
58
+
59
+ __all__ = [
60
+ # Models
61
+ "REPLAction",
62
+ "REPLObservation",
63
+ "REPLState",
64
+ "CodeBlockResult",
65
+ # Client
66
+ "REPLEnv",
67
+ # System prompts
68
+ "RLM_SYSTEM_PROMPT",
69
+ "RLM_SYSTEM_PROMPT_QWEN",
70
+ # Prompt building
71
+ "QueryMetadata",
72
+ "build_rlm_system_prompt",
73
+ "build_user_prompt",
74
+ "build_initial_prompt",
75
+ # Parsing utilities
76
+ "extract_code_blocks",
77
+ "format_observation",
78
+ ]
client.py ADDED
@@ -0,0 +1,464 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ REPL Environment Client.
9
+
10
+ This module provides a unified client for the REPL Environment that works
11
+ with both remote servers (via WebSocket) and local execution (no server needed).
12
+
13
+ Examples:
14
+ # Connect to remote server with your HF token for sub-LLM calls
15
+ env = REPLEnv(base_url="https://my-server.hf.space")
16
+ result = env.reset(
17
+ context="...",
18
+ task_prompt="...",
19
+ hf_token=os.environ["HF_TOKEN"], # Server uses this for llm_query
20
+ )
21
+
22
+ # Run locally (no server)
23
+ env = REPLEnv()
24
+
25
+ # Local with LLM support
26
+ env = REPLEnv(llm_query_fn=my_llm, llm_batch_fn=my_batch)
27
+
28
+ # All use the same interface
29
+ result = env.execute("x = len(context)")
30
+ env.close()
31
+ """
32
+
33
+ from __future__ import annotations
34
+
35
+ from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING
36
+
37
+ # Support both in-repo and standalone imports
38
+ try:
39
+ from openenv.core.client_types import StepResult
40
+ from openenv.core.env_client import EnvClient
41
+ from .models import REPLAction, REPLObservation, REPLState, CodeBlockResult
42
+ except ImportError:
43
+ from openenv.core.client_types import StepResult
44
+ from openenv.core.env_client import EnvClient
45
+ from models import REPLAction, REPLObservation, REPLState, CodeBlockResult
46
+
47
+ if TYPE_CHECKING:
48
+ from .server.repl_environment import REPLEnvironment
49
+
50
+
51
+ class REPLEnv:
52
+ """
53
+ Unified client for the REPL Environment.
54
+
55
+ Works with both remote servers and local execution, providing the same
56
+ interface regardless of where the code runs.
57
+
58
+ Examples:
59
+ >>> # Connect to a running server
60
+ >>> with REPLEnv(base_url="http://localhost:8000") as env:
61
+ ... result = env.reset(context="Hello World", task_prompt="Count chars")
62
+ ... result = env.execute("count = len(context)")
63
+ ... result = env.execute("print(f'FINAL({count})')")
64
+ ... print(result.done) # True
65
+
66
+ >>> # Run locally without a server
67
+ >>> with REPLEnv() as env:
68
+ ... result = env.reset(context="Hello World", task_prompt="Count chars")
69
+ ... result = env.execute("count = len(context)")
70
+ ... print(result.observation.result.success) # True
71
+
72
+ >>> # Local with LLM support for recursive calls
73
+ >>> def my_llm(prompt: str) -> str:
74
+ ... return "LLM response"
75
+ >>> with REPLEnv(llm_query_fn=my_llm) as env:
76
+ ... result = env.reset(context="...")
77
+ ... result = env.execute("response = llm_query('Summarize: ' + context)")
78
+
79
+ >>> # From Docker image
80
+ >>> env = REPLEnv.from_docker_image("repl-env:latest")
81
+
82
+ >>> # From HuggingFace Hub
83
+ >>> env = REPLEnv.from_hub("openenv/repl-env")
84
+ """
85
+
86
    def __init__(
        self,
        base_url: Optional[str] = None,
        *,
        # Local-only options (ignored when base_url is set)
        llm_query_fn: Optional[Callable[[str], str]] = None,
        llm_batch_fn: Optional[Callable[[List[str]], List[str]]] = None,
        max_output_length: int = 8192,
        context_preview_length: int = 500,
        reward_on_success: float = 1.0,
        reward_on_iteration: float = 0.0,
        reward_on_failure: float = -0.1,
        reward_on_error: float = -0.05,
        # Connection options (ignored when running locally)
        connect_timeout_s: float = 10.0,
        message_timeout_s: float = 60.0,
    ):
        """
        Initialize REPL environment.

        No backend is created here: construction only records configuration.
        The local environment or remote client is built lazily on first use
        by ``_ensure_initialized()``.

        Args:
            base_url: Server URL. If None, runs locally without a server.
            llm_query_fn: Function for llm_query() calls (local mode only).
            llm_batch_fn: Function for llm_query_batched() calls (local mode only).
            max_output_length: Max stdout/stderr chars per execution (local only).
            context_preview_length: Chars to show in context preview (local only).
            reward_on_success: Reward when final answer submitted (local only).
            reward_on_iteration: Reward per iteration step (local only).
            reward_on_failure: Reward when max iterations reached (local only).
            reward_on_error: Reward when code execution fails (local only).
            connect_timeout_s: WebSocket connection timeout (remote only).
            message_timeout_s: Message response timeout (remote only).
        """
        self._base_url = base_url
        # Exactly one of these two becomes non-None after lazy initialization.
        self._local_env: Optional[REPLEnvironment] = None
        self._remote_client: Optional[_RemoteREPLClient] = None

        # Store local-mode options
        self._llm_query_fn = llm_query_fn
        self._llm_batch_fn = llm_batch_fn
        self._max_output_length = max_output_length
        self._context_preview_length = context_preview_length
        self._reward_on_success = reward_on_success
        self._reward_on_iteration = reward_on_iteration
        self._reward_on_failure = reward_on_failure
        self._reward_on_error = reward_on_error

        # Store remote-mode options
        self._connect_timeout_s = connect_timeout_s
        self._message_timeout_s = message_timeout_s

        # Provider for container/runtime lifecycle (set by factory methods)
        self._provider = None
139
+
140
+ def _ensure_initialized(self) -> None:
141
+ """Initialize the appropriate backend (local or remote)."""
142
+ if self._local_env is not None or self._remote_client is not None:
143
+ return
144
+
145
+ if self._base_url is None:
146
+ # Local mode: create REPLEnvironment directly
147
+ from .server.repl_environment import REPLEnvironment
148
+
149
+ self._local_env = REPLEnvironment(
150
+ max_output_length=self._max_output_length,
151
+ context_preview_length=self._context_preview_length,
152
+ reward_on_success=self._reward_on_success,
153
+ reward_on_iteration=self._reward_on_iteration,
154
+ reward_on_failure=self._reward_on_failure,
155
+ reward_on_error=self._reward_on_error,
156
+ llm_query_fn=self._llm_query_fn,
157
+ llm_batch_fn=self._llm_batch_fn,
158
+ )
159
+ else:
160
+ # Remote mode: create WebSocket client
161
+ self._remote_client = _RemoteREPLClient(
162
+ base_url=self._base_url,
163
+ connect_timeout_s=self._connect_timeout_s,
164
+ message_timeout_s=self._message_timeout_s,
165
+ provider=self._provider,
166
+ )
167
+ self._remote_client.connect()
168
+
169
+ def reset(
170
+ self,
171
+ *,
172
+ context: str = "",
173
+ task_prompt: str = "",
174
+ max_iterations: int = 30,
175
+ seed: Optional[int] = None,
176
+ episode_id: Optional[str] = None,
177
+ hf_token: Optional[str] = None,
178
+ llm_model: Optional[str] = None,
179
+ ) -> StepResult[REPLObservation]:
180
+ """
181
+ Reset the environment for a new episode.
182
+
183
+ Args:
184
+ context: Text content to analyze (accessible as `context` variable).
185
+ task_prompt: Description of the task to solve.
186
+ max_iterations: Maximum code execution steps before timeout.
187
+ seed: Optional random seed for reproducibility.
188
+ episode_id: Optional custom episode identifier.
189
+ hf_token: Optional HuggingFace token for llm_query/llm_query_batched.
190
+ When provided, the server uses this token for sub-LLM calls
191
+ instead of its own configured token.
192
+ llm_model: Optional model name for LLM functions (default: Qwen3-Coder-480B).
193
+
194
+ Returns:
195
+ StepResult with initial observation.
196
+ """
197
+ self._ensure_initialized()
198
+
199
+ if self._local_env is not None:
200
+ # Local mode
201
+ self._local_env.max_iterations = max_iterations
202
+ obs = self._local_env.reset(
203
+ seed=seed,
204
+ episode_id=episode_id,
205
+ context=context,
206
+ task_prompt=task_prompt,
207
+ hf_token=hf_token,
208
+ llm_model=llm_model,
209
+ )
210
+ return self._wrap_observation(obs)
211
+ else:
212
+ # Remote mode
213
+ assert self._remote_client is not None
214
+ return self._remote_client.reset(
215
+ context=context,
216
+ task_prompt=task_prompt,
217
+ max_iterations=max_iterations,
218
+ seed=seed,
219
+ episode_id=episode_id,
220
+ hf_token=hf_token,
221
+ llm_model=llm_model,
222
+ )
223
+
224
+ def step(self, action: REPLAction) -> StepResult[REPLObservation]:
225
+ """
226
+ Execute a REPL action.
227
+
228
+ Args:
229
+ action: REPLAction containing code to execute.
230
+
231
+ Returns:
232
+ StepResult with execution observation.
233
+ """
234
+ self._ensure_initialized()
235
+
236
+ if self._local_env is not None:
237
+ obs = self._local_env.step(action)
238
+ return self._wrap_observation(obs)
239
+ else:
240
+ assert self._remote_client is not None
241
+ return self._remote_client.step(action)
242
+
243
+ def execute(self, code: str) -> StepResult[REPLObservation]:
244
+ """
245
+ Execute Python code in the REPL.
246
+
247
+ Convenience method that wraps step() with a code-only action.
248
+
249
+ Args:
250
+ code: Python code to execute.
251
+
252
+ Returns:
253
+ StepResult with execution observation.
254
+ """
255
+ return self.step(REPLAction(code=code))
256
+
257
+ def submit_final_answer(self, answer: str) -> StepResult[REPLObservation]:
258
+ """
259
+ Submit a final answer and terminate the episode.
260
+
261
+ Args:
262
+ answer: The final answer string.
263
+
264
+ Returns:
265
+ StepResult with done=True.
266
+ """
267
+ return self.step(REPLAction(code="", is_final=True, final_answer=answer))
268
+
269
+ def get_variable(self, name: str) -> StepResult[REPLObservation]:
270
+ """
271
+ Retrieve and print a variable from the REPL namespace.
272
+
273
+ Args:
274
+ name: Variable name to retrieve.
275
+
276
+ Returns:
277
+ StepResult with variable value in stdout.
278
+ """
279
+ return self.execute(f"print(repr({name}))")
280
+
281
+ def state(self) -> REPLState:
282
+ """
283
+ Get current environment state.
284
+
285
+ Returns:
286
+ REPLState with current environment information.
287
+ """
288
+ self._ensure_initialized()
289
+
290
+ if self._local_env is not None:
291
+ return self._local_env.state
292
+ else:
293
+ assert self._remote_client is not None
294
+ return self._remote_client.state()
295
+
296
+ def list_variables(self) -> List[str]:
297
+ """
298
+ Get list of available variables in the current session.
299
+
300
+ Returns:
301
+ List of variable names.
302
+ """
303
+ return self.state().namespace_keys
304
+
305
+ def close(self) -> None:
306
+ """Clean up resources."""
307
+ if self._local_env is not None:
308
+ self._local_env.close()
309
+ self._local_env = None
310
+
311
+ if self._remote_client is not None:
312
+ self._remote_client.close()
313
+ self._remote_client = None
314
+
315
+ def _wrap_observation(self, obs: REPLObservation) -> StepResult[REPLObservation]:
316
+ """Wrap a local REPLObservation in a StepResult."""
317
+ return StepResult(
318
+ observation=obs,
319
+ reward=obs.reward,
320
+ done=obs.done,
321
+ )
322
+
323
+ # Context manager support
324
+
325
+ def __enter__(self) -> "REPLEnv":
326
+ """Enter context manager."""
327
+ self._ensure_initialized()
328
+ return self
329
+
330
+ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
331
+ """Exit context manager."""
332
+ self.close()
333
+
334
+ # Factory methods
335
+
336
+ @classmethod
337
+ def from_docker_image(
338
+ cls,
339
+ image: str,
340
+ **kwargs: Any,
341
+ ) -> "REPLEnv":
342
+ """
343
+ Create a REPL environment by spinning up a Docker container.
344
+
345
+ Args:
346
+ image: Docker image name to run (e.g., "repl-env:latest").
347
+ **kwargs: Additional arguments passed to container start.
348
+
349
+ Returns:
350
+ Connected REPLEnv instance.
351
+ """
352
+ from openenv.core.containers.runtime import LocalDockerProvider
353
+
354
+ provider = LocalDockerProvider()
355
+ base_url = provider.start_container(image, **kwargs)
356
+ provider.wait_for_ready(base_url)
357
+
358
+ env = cls(base_url=base_url)
359
+ env._provider = provider
360
+ env._ensure_initialized()
361
+ return env
362
+
363
+ @classmethod
364
+ def from_hub(
365
+ cls,
366
+ repo_id: str,
367
+ *,
368
+ use_docker: bool = True,
369
+ **kwargs: Any,
370
+ ) -> "REPLEnv":
371
+ """
372
+ Create a REPL environment from a HuggingFace Space.
373
+
374
+ Args:
375
+ repo_id: HuggingFace space identifier (e.g., "openenv/repl-env").
376
+ use_docker: If True, pull from HF registry. If False, run with UV.
377
+ **kwargs: Additional arguments passed to provider.
378
+
379
+ Returns:
380
+ Connected REPLEnv instance.
381
+ """
382
+ if use_docker:
383
+ from openenv.core.containers.runtime import LocalDockerProvider
384
+
385
+ provider = LocalDockerProvider()
386
+ tag = kwargs.pop("tag", "latest")
387
+ image = f"registry.hf.space/{repo_id.replace('/', '-')}:{tag}"
388
+ base_url = provider.start_container(image, **kwargs)
389
+ provider.wait_for_ready(base_url)
390
+ else:
391
+ from openenv.core.containers.runtime import UVProvider
392
+
393
+ project_path = kwargs.pop(
394
+ "project_path", f"git+https://huggingface.co/spaces/{repo_id}"
395
+ )
396
+ provider = UVProvider(project_path=project_path, **kwargs)
397
+ base_url = provider.start()
398
+ provider.wait_for_ready()
399
+
400
+ env = cls(base_url=base_url)
401
+ env._provider = provider
402
+ env._ensure_initialized()
403
+ return env
404
+
405
+
406
+ class _RemoteREPLClient(EnvClient[REPLAction, REPLObservation, REPLState]):
407
+ """
408
+ Internal WebSocket client for remote REPL connections.
409
+
410
+ This is the original EnvClient-based implementation, now used internally
411
+ by REPLEnv for remote mode.
412
+ """
413
+
414
+ def _step_payload(self, action: REPLAction) -> Dict:
415
+ """Convert REPLAction to JSON payload for step request."""
416
+ return {
417
+ "code": action.code,
418
+ "is_final": action.is_final,
419
+ "final_answer": action.final_answer,
420
+ }
421
+
422
+ def _parse_result(self, payload: Dict) -> StepResult[REPLObservation]:
423
+ """Parse server response into StepResult[REPLObservation]."""
424
+ obs_data = payload.get("observation", {})
425
+ result_data = obs_data.get("result", {})
426
+
427
+ observation = REPLObservation(
428
+ result=CodeBlockResult(
429
+ stdout=result_data.get("stdout", ""),
430
+ stderr=result_data.get("stderr", ""),
431
+ locals_snapshot=result_data.get("locals_snapshot", {}),
432
+ execution_time=result_data.get("execution_time", 0.0),
433
+ success=result_data.get("success", True),
434
+ exception=result_data.get("exception"),
435
+ ),
436
+ context_preview=obs_data.get("context_preview"),
437
+ context_length=obs_data.get("context_length", 0),
438
+ available_variables=obs_data.get("available_variables", []),
439
+ iteration=obs_data.get("iteration", 0),
440
+ max_iterations=obs_data.get("max_iterations", 30),
441
+ done=payload.get("done", False),
442
+ reward=payload.get("reward"),
443
+ metadata=obs_data.get("metadata", {}),
444
+ )
445
+
446
+ return StepResult(
447
+ observation=observation,
448
+ reward=payload.get("reward"),
449
+ done=payload.get("done", False),
450
+ )
451
+
452
+ def _parse_state(self, payload: Dict) -> REPLState:
453
+ """Parse server response into REPLState object."""
454
+ return REPLState(
455
+ episode_id=payload.get("episode_id"),
456
+ step_count=payload.get("step_count", 0),
457
+ context=payload.get("context"),
458
+ task_prompt=payload.get("task_prompt"),
459
+ iteration=payload.get("iteration", 0),
460
+ max_iterations=payload.get("max_iterations", 30),
461
+ namespace_keys=payload.get("namespace_keys", []),
462
+ final_answer=payload.get("final_answer"),
463
+ total_execution_time=payload.get("total_execution_time", 0.0),
464
+ )
models.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Data models for the REPL Environment.
9
+
10
+ The REPL environment provides a Python REPL for training language models
11
+ on code execution tasks, based on the Recursive Language Models (RLM) paradigm.
12
+
13
+ Supports two finalization patterns:
14
+ 1. RLM-style: print('FINAL(answer)') or print('FINAL_VAR(var_name)')
15
+ 2. Prime Intellect style: answer = {"content": "...", "ready": True}
16
+ """
17
+
18
+ from typing import Any, Dict, List, Optional
19
+
20
+ from pydantic import BaseModel, Field
21
+
22
+ # Support both in-repo and standalone imports
23
+ try:
24
+ from openenv.core.env_server.types import Action, Observation, State
25
+ except ImportError:
26
+ from core.env_server.types import Action, Observation, State
27
+
28
+
29
+ class REPLAction(Action):
30
+ """Action containing Python code to execute in the REPL.
31
+
32
+ Supports multiple finalization patterns:
33
+ 1. RLM-style: print('FINAL(answer)') or print('FINAL_VAR(var_name)') in code
34
+ 2. Prime Intellect style: answer = {"content": "...", "ready": True} in namespace
35
+ 3. Explicit: Set is_final=True with final_answer
36
+ """
37
+
38
+ code: str = Field(default="", description="Python code to execute")
39
+ is_final: bool = Field(
40
+ default=False, description="Whether this action signals the final answer"
41
+ )
42
+ final_answer: Optional[str] = Field(
43
+ default=None, description="Final answer if is_final=True"
44
+ )
45
+
46
+
47
+ class CodeBlockResult(BaseModel):
48
+ """Result of executing a single code block."""
49
+
50
+ stdout: str = Field(default="", description="Standard output from execution")
51
+ stderr: str = Field(default="", description="Standard error from execution")
52
+ locals_snapshot: Dict[str, str] = Field(
53
+ default_factory=dict,
54
+ description="String representations of new/modified variables",
55
+ )
56
+ execution_time: float = Field(
57
+ default=0.0, ge=0, description="Execution time in seconds"
58
+ )
59
+ success: bool = Field(default=True, description="Whether execution succeeded")
60
+ exception: Optional[str] = Field(
61
+ default=None, description="Exception message if execution failed"
62
+ )
63
+
64
+
65
+ class REPLObservation(Observation):
66
+ """Observation returned after code execution in the REPL."""
67
+
68
+ result: CodeBlockResult = Field(
69
+ default_factory=CodeBlockResult, description="Result of code execution"
70
+ )
71
+ context_preview: Optional[str] = Field(
72
+ default=None,
73
+ description="Preview of the context (first N chars) if context is loaded",
74
+ )
75
+ context_length: int = Field(
76
+ default=0, ge=0, description="Total length of context in characters"
77
+ )
78
+ available_variables: List[str] = Field(
79
+ default_factory=list,
80
+ description="List of variable names available in the namespace",
81
+ )
82
+ iteration: int = Field(default=0, ge=0, description="Current iteration number")
83
+ max_iterations: int = Field(
84
+ default=30, ge=1, description="Maximum allowed iterations"
85
+ )
86
+
87
+
88
+ class REPLState(State):
89
+ """Extended state for REPL environment."""
90
+
91
+ context: Optional[str] = Field(
92
+ default=None, description="The context/problem to work with"
93
+ )
94
+ task_prompt: Optional[str] = Field(
95
+ default=None, description="The task description to solve"
96
+ )
97
+ iteration: int = Field(default=0, ge=0, description="Current iteration number")
98
+ max_iterations: int = Field(
99
+ default=30, ge=1, description="Max iterations before termination"
100
+ )
101
+ namespace_keys: List[str] = Field(
102
+ default_factory=list, description="Variables currently in namespace"
103
+ )
104
+ final_answer: Optional[str] = Field(
105
+ default=None, description="Final answer if episode is complete"
106
+ )
107
+ total_execution_time: float = Field(
108
+ default=0.0, ge=0, description="Total code execution time in seconds"
109
+ )
openenv.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ spec_version: 1
2
+ name: repl
3
+ type: space
4
+ runtime: fastapi
5
+ app: server.app:app
6
+ port: 8000
prompts.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ RLM System Prompts and Parsing Utilities for the REPL Environment.
9
+
10
+ Based on the official RLM repo: https://github.com/alexzhang13/rlm
11
+
12
+ Two versions available:
13
+ - RLM_SYSTEM_PROMPT: Base prompt from the repo (with llm_query_batched)
14
+ - RLM_SYSTEM_PROMPT_QWEN: For Qwen3-Coder-480B (adds IMPORTANT cost warning)
15
+
16
+ Parsing utilities help extract code blocks and format observations.
17
+ """
18
+
19
+ import re
20
+ import textwrap
21
+ from dataclasses import dataclass
22
+ from typing import List, Optional
23
+
24
+
25
+ # =============================================================================
26
+ # Query Metadata (for context info)
27
+ # =============================================================================
28
+
29
+
30
+ @dataclass
31
+ class QueryMetadata:
32
+ """Metadata about the context for building prompts."""
33
+
34
+ context_lengths: List[int]
35
+ context_total_length: int
36
+ context_type: str = "str" # "str" or "List[str]"
37
+
38
+
39
+ # =============================================================================
40
+ # System Prompt from Official RLM Repo
41
+ # =============================================================================
42
+
43
+ RLM_SYSTEM_PROMPT = textwrap.dedent(
44
+ """You are tasked with answering a query with associated context. You can access, transform, and analyze this context interactively in a REPL environment that can recursively query sub-LLMs, which you are strongly encouraged to use as much as possible. You will be queried iteratively until you provide a final answer.
45
+
46
+ The REPL environment is initialized with:
47
+ 1. A `context` variable that contains extremely important information about your query. You should check the content of the `context` variable to understand what you are working with. Make sure you look through it sufficiently as you answer your query.
48
+ 2. A `llm_query` function that allows you to query an LLM (that can handle around 500K chars) inside your REPL environment.
49
+ 3. A `llm_query_batched` function that allows you to query multiple prompts concurrently: `llm_query_batched(prompts: List[str]) -> List[str]`. This is much faster than sequential `llm_query` calls when you have multiple independent queries. Results are returned in the same order as the input prompts.
50
+ 4. The ability to use `print()` statements to view the output of your REPL code and continue your reasoning.
51
+
52
+ You will only be able to see truncated outputs from the REPL environment, so you should use the query LLM function on variables you want to analyze. You will find this function especially useful when you have to analyze the semantics of the context. Use these variables as buffers to build up your final answer.
53
+ Make sure to explicitly look through the entire context in REPL before answering your query. An example strategy is to first look at the context and figure out a chunking strategy, then break up the context into smart chunks, and query an LLM per chunk with a particular question and save the answers to a buffer, then query an LLM with all the buffers to produce your final answer.
54
+
55
+ You can use the REPL environment to help you understand your context, especially if it is huge. Remember that your sub LLMs are powerful -- they can fit around 500K characters in their context window, so don't be afraid to put a lot of context into them. For example, a viable strategy is to feed 10 documents per sub-LLM query. Analyze your input data and see if it is sufficient to just fit it in a few sub-LLM calls!
56
+
57
+ When you want to execute Python code in the REPL environment, wrap it in triple backticks with 'repl' language identifier. For example, say we want our recursive model to search for the magic number in the context (assuming the context is a string), and the context is very long, so we want to chunk it:
58
+ ```repl
59
+ chunk = context[:10000]
60
+ answer = llm_query(f"What is the magic number in the context? Here is the chunk: {{chunk}}")
61
+ print(answer)
62
+ ```
63
+
64
+ As an example, suppose you're trying to answer a question about a book. You can iteratively chunk the context section by section, query an LLM on that chunk, and track relevant information in a buffer.
65
+ ```repl
66
+ query = "In Harry Potter and the Sorcerer's Stone, did Gryffindor win the House Cup because they led?"
67
+ for i, section in enumerate(context):
68
+ if i == len(context) - 1:
69
+ buffer = llm_query(f"You are on the last section of the book. So far you know that: {{buffers}}. Gather from this last section to answer {{query}}. Here is the section: {{section}}")
70
+ print(f"Based on reading iteratively through the book, the answer is: {{buffer}}")
71
+ else:
72
+ buffer = llm_query(f"You are iteratively looking through a book, and are on section {{i}} of {{len(context)}}. Gather information to help answer {{query}}. Here is the section: {{section}}")
73
+ print(f"After section {{i}} of {{len(context)}}, you have tracked: {{buffer}}")
74
+ ```
75
+
76
+ As another example, when the context isn't that long (e.g. <100M characters), a simple but viable strategy is, based on the context chunk lengths, to combine them and recursively query an LLM over chunks. For example, if the context is a List[str], we ask the same query over each chunk using `llm_query_batched` for concurrent processing:
77
+ ```repl
78
+ query = "A man became famous for his book 'The Great Gatsby'. How many jobs did he have?"
79
+ # Suppose our context is ~1M chars, and we want each sub-LLM query to be ~0.1M chars so we split it into 10 chunks
80
+ chunk_size = len(context) // 10
81
+ chunks = []
82
+ for i in range(10):
83
+ if i < 9:
84
+ chunk_str = "\\n".join(context[i*chunk_size:(i+1)*chunk_size])
85
+ else:
86
+ chunk_str = "\\n".join(context[i*chunk_size:])
87
+ chunks.append(chunk_str)
88
+
89
+ # Use batched query for concurrent processing - much faster than sequential calls!
90
+ prompts = [f"Try to answer the following query: {{query}}. Here are the documents:\\n{{chunk}}. Only answer if you are confident in your answer based on the evidence." for chunk in chunks]
91
+ answers = llm_query_batched(prompts)
92
+ for i, answer in enumerate(answers):
93
+ print(f"I got the answer from chunk {{i}}: {{answer}}")
94
+ final_answer = llm_query(f"Aggregating all the answers per chunk, answer the original query about total number of jobs: {{query}}\\n\\nAnswers:\\n" + "\\n".join(answers))
95
+ ```
96
+
97
+ As a final example, after analyzing the context and realizing it's separated by Markdown headers, we can maintain state through buffers by chunking the context by headers, and iteratively querying an LLM over it:
98
+ ```repl
99
+ # After finding out the context is separated by Markdown headers, we can chunk, summarize, and answer
100
+ import re
101
+ sections = re.split(r'### (.+)', context["content"])
102
+ buffers = []
103
+ for i in range(1, len(sections), 2):
104
+ header = sections[i]
105
+ info = sections[i+1]
106
+ summary = llm_query(f"Summarize this {{header}} section: {{info}}")
107
+ buffers.append(f"{{header}}: {{summary}}")
108
+ final_answer = llm_query(f"Based on these summaries, answer the original query: {{query}}\\n\\nSummaries:\\n" + "\\n".join(buffers))
109
+ ```
110
+ In the next step, we can return FINAL_VAR("final_answer").
111
+
112
+ IMPORTANT: When you are done with the iterative process, you MUST provide a final answer using one of the FINAL functions. Do not use these unless you have completed your task. You have two options:
113
+ 1. Use FINAL(value) to provide the answer directly, e.g., FINAL(42) or FINAL(my_variable)
114
+ 2. Use FINAL_VAR("variable_name") to return a variable by name, e.g., FINAL_VAR("final_answer")
115
+
116
+ Think step by step carefully, plan, and execute this plan immediately in your response -- do not just say "I will do this" or "I will do that". Output to the REPL environment and recursive LLMs as much as possible. Remember to explicitly answer the original query in your final answer.
117
+ """
118
+ )
119
+
120
+
121
+ # =============================================================================
122
+ # System Prompt for Qwen3-Coder-480B (with IMPORTANT cost warning from paper)
123
+ # Adds cost warning after the "sub LLMs are powerful" paragraph
124
+ # =============================================================================
125
+
126
+ RLM_SYSTEM_PROMPT_QWEN = textwrap.dedent(
127
+ """You are tasked with answering a query with associated context. You can access, transform, and analyze this context interactively in a REPL environment that can recursively query sub-LLMs, which you are strongly encouraged to use as much as possible. You will be queried iteratively until you provide a final answer.
128
+
129
+ The REPL environment is initialized with:
130
+ 1. A `context` variable that contains extremely important information about your query. You should check the content of the `context` variable to understand what you are working with. Make sure you look through it sufficiently as you answer your query.
131
+ 2. A `llm_query` function that allows you to query an LLM (that can handle around 500K chars) inside your REPL environment.
132
+ 3. A `llm_query_batched` function that allows you to query multiple prompts concurrently: `llm_query_batched(prompts: List[str]) -> List[str]`. This is much faster than sequential `llm_query` calls when you have multiple independent queries. Results are returned in the same order as the input prompts.
133
+ 4. The ability to use `print()` statements to view the output of your REPL code and continue your reasoning.
134
+
135
+ You will only be able to see truncated outputs from the REPL environment, so you should use the query LLM function on variables you want to analyze. You will find this function especially useful when you have to analyze the semantics of the context. Use these variables as buffers to build up your final answer.
136
+ Make sure to explicitly look through the entire context in REPL before answering your query. An example strategy is to first look at the context and figure out a chunking strategy, then break up the context into smart chunks, and query an LLM per chunk with a particular question and save the answers to a buffer, then query an LLM with all the buffers to produce your final answer.
137
+
138
+ You can use the REPL environment to help you understand your context, especially if it is huge. Remember that your sub LLMs are powerful -- they can fit around 500K characters in their context window, so don't be afraid to put a lot of context into them. For example, a viable strategy is to feed 10 documents per sub-LLM query. Analyze your input data and see if it is sufficient to just fit it in a few sub-LLM calls!
139
+
140
+ IMPORTANT: Be very careful about using 'llm_query' as it incurs high runtime costs. Always batch as much information as reasonably possible into each call (aim for around ~200k characters per call). For example, if you have 1000 lines of information to process, it's much better to split into chunks of 5 and call 'llm_query' on each chunk (200 calls total) rather than making 1000 individual calls. Minimize the number of 'llm_query' calls by batching related information together.
141
+
142
+ When you want to execute Python code in the REPL environment, wrap it in triple backticks with 'repl' language identifier. For example, say we want our recursive model to search for the magic number in the context (assuming the context is a string), and the context is very long, so we want to chunk it:
143
+ ```repl
144
+ chunk = context[:10000]
145
+ answer = llm_query(f"What is the magic number in the context? Here is the chunk: {{chunk}}")
146
+ print(answer)
147
+ ```
148
+
149
+ As an example, suppose you're trying to answer a question about a book. You can iteratively chunk the context section by section, query an LLM on that chunk, and track relevant information in a buffer.
150
+ ```repl
151
+ query = "In Harry Potter and the Sorcerer's Stone, did Gryffindor win the House Cup because they led?"
152
+ for i, section in enumerate(context):
153
+ if i == len(context) - 1:
154
+ buffer = llm_query(f"You are on the last section of the book. So far you know that: {{buffers}}. Gather from this last section to answer {{query}}. Here is the section: {{section}}")
155
+ print(f"Based on reading iteratively through the book, the answer is: {{buffer}}")
156
+ else:
157
+ buffer = llm_query(f"You are iteratively looking through a book, and are on section {{i}} of {{len(context)}}. Gather information to help answer {{query}}. Here is the section: {{section}}")
158
+ print(f"After section {{i}} of {{len(context)}}, you have tracked: {{buffer}}")
159
+ ```
160
+
161
+ As another example, when the context isn't that long (e.g. <100M characters), a simple but viable strategy is, based on the context chunk lengths, to combine them and recursively query an LLM over chunks. For example, if the context is a List[str], we ask the same query over each chunk using `llm_query_batched` for concurrent processing:
162
+ ```repl
163
+ query = "A man became famous for his book 'The Great Gatsby'. How many jobs did he have?"
164
+ # Suppose our context is ~1M chars, and we want each sub-LLM query to be ~0.1M chars so we split it into 10 chunks
165
+ chunk_size = len(context) // 10
166
+ chunks = []
167
+ for i in range(10):
168
+ if i < 9:
169
+ chunk_str = "\\n".join(context[i*chunk_size:(i+1)*chunk_size])
170
+ else:
171
+ chunk_str = "\\n".join(context[i*chunk_size:])
172
+ chunks.append(chunk_str)
173
+
174
+ # Use batched query for concurrent processing - much faster than sequential calls!
175
+ prompts = [f"Try to answer the following query: {{query}}. Here are the documents:\\n{{chunk}}. Only answer if you are confident in your answer based on the evidence." for chunk in chunks]
176
+ answers = llm_query_batched(prompts)
177
+ for i, answer in enumerate(answers):
178
+ print(f"I got the answer from chunk {{i}}: {{answer}}")
179
+ final_answer = llm_query(f"Aggregating all the answers per chunk, answer the original query about total number of jobs: {{query}}\\n\\nAnswers:\\n" + "\\n".join(answers))
180
+ ```
181
+
182
+ As a final example, after analyzing the context and realizing it's separated by Markdown headers, we can maintain state through buffers by chunking the context by headers, and iteratively querying an LLM over it:
183
+ ```repl
184
+ # After finding out the context is separated by Markdown headers, we can chunk, summarize, and answer
185
+ import re
186
+ sections = re.split(r'### (.+)', context["content"])
187
+ buffers = []
188
+ for i in range(1, len(sections), 2):
189
+ header = sections[i]
190
+ info = sections[i+1]
191
+ summary = llm_query(f"Summarize this {{header}} section: {{info}}")
192
+ buffers.append(f"{{header}}: {{summary}}")
193
+ final_answer = llm_query(f"Based on these summaries, answer the original query: {{query}}\\n\\nSummaries:\\n" + "\\n".join(buffers))
194
+ ```
195
+ In the next step, we can return FINAL_VAR("final_answer").
196
+
197
+ IMPORTANT: When you are done with the iterative process, you MUST provide a final answer using one of the FINAL functions. Do not use these unless you have completed your task. You have two options:
198
+ 1. Use FINAL(value) to provide the answer directly, e.g., FINAL(42) or FINAL(my_variable)
199
+ 2. Use FINAL_VAR("variable_name") to return a variable by name, e.g., FINAL_VAR("final_answer")
200
+
201
+ Think step by step carefully, plan, and execute this plan immediately in your response -- do not just say "I will do this" or "I will do that". Output to the REPL environment and recursive LLMs as much as possible. Remember to explicitly answer the original query in your final answer.
202
+ """
203
+ )
204
+
205
+
206
+ # =============================================================================
207
+ # User Prompt Templates (from official RLM repo)
208
+ # =============================================================================
209
+
210
+ USER_PROMPT = """Think step-by-step on what to do using the REPL environment (which contains the context) to answer the prompt.\n\nContinue using the REPL environment, which has the `context` variable, and querying sub-LLMs by writing to ```repl``` tags, and determine your answer. Your next action:"""
211
+
212
+ USER_PROMPT_WITH_ROOT = """Think step-by-step on what to do using the REPL environment (which contains the context) to answer the original prompt: \"{root_prompt}\".\n\nContinue using the REPL environment, which has the `context` variable, and querying sub-LLMs by writing to ```repl``` tags, and determine your answer. Your next action:"""
213
+
214
+
215
+ # =============================================================================
216
+ # Prompt Building Functions (from official RLM repo)
217
+ # =============================================================================
218
+
219
+
220
+ def build_rlm_system_prompt(
221
+ system_prompt: str,
222
+ query_metadata: QueryMetadata,
223
+ ) -> List[dict]:
224
+ """
225
+ Build the initial system prompt for the REPL environment based on extra prompt metadata.
226
+
227
+ Args:
228
+ system_prompt: The system prompt to use
229
+ query_metadata: QueryMetadata object containing context metadata
230
+
231
+ Returns:
232
+ List of message dictionaries [system, assistant(metadata)]
233
+ """
234
+ context_lengths = query_metadata.context_lengths
235
+ context_total_length = query_metadata.context_total_length
236
+ context_type = query_metadata.context_type
237
+
238
+ # If there are more than 100 chunks, truncate to the first 100 chunks.
239
+ if len(context_lengths) > 100:
240
+ others = len(context_lengths) - 100
241
+ context_lengths_str = (
242
+ str(context_lengths[:100]) + "... [" + str(others) + " others]"
243
+ )
244
+ else:
245
+ context_lengths_str = str(context_lengths)
246
+
247
+ metadata_prompt = f"Your context is a {context_type} with {context_total_length} total characters, and is broken up into chunks of char lengths: {context_lengths_str}."
248
+
249
+ return [
250
+ {"role": "system", "content": system_prompt},
251
+ {"role": "assistant", "content": metadata_prompt},
252
+ ]
253
+
254
+
255
def build_user_prompt(
    root_prompt: Optional[str] = None,
    iteration: int = 0,
    context_count: int = 1,
    history_count: int = 0,
) -> dict:
    """
    Build the user message for one REPL turn.

    On the very first turn (iteration 0) a safeguard sentence is prepended
    that discourages answering before inspecting the context; later turns
    instead reference the prior interaction history. Notes about multiple
    contexts and prior conversation histories are appended when present.

    Args:
        root_prompt: The original query/task
        iteration: Current iteration number (0 = first)
        context_count: Number of context variables available
        history_count: Number of prior conversation histories

    Returns:
        User message dict
    """
    # Core instruction: include the root prompt when we have one.
    core = (
        USER_PROMPT_WITH_ROOT.format(root_prompt=root_prompt)
        if root_prompt
        else USER_PROMPT
    )

    if iteration == 0:
        prefix = "You have not interacted with the REPL environment or seen your prompt / context yet. Your next action should be to look through and figure out how to answer the prompt, so don't just provide a final answer yet.\n\n"
    else:
        prefix = "The history before is your previous interactions with the REPL environment. "
    prompt = prefix + core

    # Advertise additional contexts (context_0 ... context_{n-1}) if any.
    if context_count > 1:
        prompt += f"\n\nNote: You have {context_count} contexts available (context_0 through context_{context_count - 1})."

    # Advertise prior conversation histories if any.
    if history_count == 1:
        prompt += "\n\nNote: You have 1 prior conversation history available in the `history` variable."
    elif history_count > 1:
        prompt += f"\n\nNote: You have {history_count} prior conversation histories available (history_0 through history_{history_count - 1})."

    return {"role": "user", "content": prompt}
299
+
300
+
301
+ # =============================================================================
302
+ # Convenience Functions (for backward compatibility)
303
+ # =============================================================================
304
+
305
+
306
def build_initial_prompt(
    task_prompt: str,
    context_length: int,
    context_preview: Optional[str] = None,
    variables: Optional[List[str]] = None,
    **kwargs,
) -> str:
    """Build the initial user prompt (convenience wrapper).

    Delegates to build_user_prompt() at iteration 0 and returns just the
    message content. Kept for backward compatibility; only ``task_prompt``
    influences the result.

    Args:
        task_prompt: The task to accomplish
        context_length: Total length of the context (not used)
        context_preview: Preview of the context (not used)
        variables: List of available variable names (not used)

    Returns:
        Formatted initial prompt string
    """
    message = build_user_prompt(root_prompt=task_prompt, iteration=0)
    return message["content"]
325
+
326
+
327
+ # =============================================================================
328
+ # Parsing Utilities
329
+ # =============================================================================
330
+
331
+
332
def extract_code_blocks(text: str, language: str = "python") -> List[str]:
    """Extract fenced code blocks from an LLM response.

    Supports both ```repl``` (official RLM) and ```<language>``` blocks.
    All ``repl`` blocks are returned first, then blocks fenced with the
    requested language tag (not interleaved in document order).

    Bug fix: the pattern list previously always contained the ``repl``
    pattern alongside the language pattern, so calling with
    ``language="repl"`` returned every block twice. Patterns are now
    de-duplicated.

    Args:
        text: The LLM response text
        language: Language identifier to match (default "python")

    Returns:
        List of stripped, non-empty code strings extracted from the response
    """
    # Match 'repl' (official) and the requested language, without duplicates.
    patterns = [r"```repl\s*(.*?)```"]
    language_pattern = rf"```{language}\s*(.*?)```"
    if language_pattern not in patterns:
        patterns.append(language_pattern)

    blocks: List[str] = []
    for pattern in patterns:
        for match in re.findall(pattern, text, re.DOTALL):
            stripped = match.strip()
            if stripped:  # drop whitespace-only blocks
                blocks.append(stripped)

    return blocks
356
+
357
+
358
def format_observation(obs) -> str:
    """Render a REPLObservation as feedback text for the LLM.

    Args:
        obs: The REPLObservation from env.step()

    Returns:
        Formatted observation string. On failure, appends the error
        (stderr, else exception, else a generic message) plus a reminder
        that 'context' is already defined.
    """
    stdout = obs.result.stdout
    output = stdout.strip() if stdout else "(no output)"

    if not obs.result.success:
        error = obs.result.stderr or obs.result.exception or "Unknown error"
        return (
            f"Code output:\n{output}\n\nERROR: {error}\n"
            "Fix the error. Remember: 'context' is already defined."
        )

    return f"Code output:\n{output}"
pyproject.toml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ [build-system]
8
+ requires = ["setuptools>=45", "wheel"]
9
+ build-backend = "setuptools.build_meta"
10
+
11
+ [project]
12
+ name = "openenv-repl"
13
+ version = "0.1.0"
14
+ description = "Recursive Language Model REPL Environment for OpenEnv"
15
+ requires-python = ">=3.10"
16
+ dependencies = [
17
+ # Core OpenEnv dependencies (required for server functionality)
18
+ "openenv-core @ git+https://github.com/meta-pytorch/OpenEnv.git@main",
19
+ "fastapi>=0.115.0",
20
+ "pydantic>=2.0.0",
21
+ "uvicorn>=0.24.0",
22
+ "requests>=2.31.0",
23
+ # Environment-specific dependencies
24
+ "smolagents>=1.22.0,<2",
25
+ # LLM support via HuggingFace Inference API
26
+ "huggingface_hub>=0.20.0",
27
+ ]
28
+
29
+ [project.optional-dependencies]
30
+ dev = [
31
+ "pytest>=8.0.0",
32
+ "pytest-cov>=4.0.0",
33
+ ]
34
+
35
+ [project.scripts]
36
+ # Server entry point - enables running via: uv run --project . server
37
+ # or: python -m repl_env.server.app
38
+ server = "repl_env.server.app:main"
39
+
40
+ [tool.setuptools]
41
+ # Explicitly list packages - "repl_env" maps to current dir
42
+ packages = ["repl_env", "repl_env.server"]
43
+ package-dir = {"repl_env" = ".", "repl_env.server" = "server"}
server/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ REPL Environment Server Components.
9
+
10
+ This module contains the server-side implementation of the REPL environment.
11
+ """
12
+
13
+ from .repl_environment import REPLEnvironment
14
+ from .python_executor import PythonExecutor
15
+
16
+ __all__ = [
17
+ "REPLEnvironment",
18
+ "PythonExecutor",
19
+ ]
server/app.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ FastAPI application for the REPL Environment.
9
+
10
+ This module creates an HTTP server that exposes the REPLEnvironment
11
+ over HTTP and WebSocket endpoints, compatible with EnvClient.
12
+
13
+ The server includes llm_query and llm_query_batched support via HuggingFace Inference API,
14
+ enabling the Recursive Language Model (RLM) paradigm.
15
+
16
+ LLM Token Configuration:
17
+ 1. Client can pass `hf_token` in reset() - RECOMMENDED
18
+ 2. Server fallback: HF_TOKEN environment variable
19
+
20
+ LLM functions are created dynamically in REPLEnvironment.reset() based on the
21
+ available token (client or server).
22
+
23
+ Usage:
24
+ # Development (with auto-reload):
25
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
26
+
27
+ # Production:
28
+ uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
29
+
30
+ # Or run directly:
31
+ uv run --project . server
32
+
33
+ Environment Variables:
34
+ HF_TOKEN: Fallback HuggingFace API token (client token takes priority)
35
+ LLM_MODEL: Model to use for llm_query/llm_query_batched (default: Qwen/Qwen3-Coder-480B-A35B-Instruct)
36
+ """
37
+ import os
38
+
39
+ # Support both in-repo and standalone imports
40
+ try:
41
+ # In-repo imports (when running from OpenEnv repository)
42
+ from openenv.core.env_server.http_server import create_app
43
+ from ..models import REPLAction, REPLObservation
44
+ from .repl_environment import REPLEnvironment
45
+ except ImportError:
46
+ # Standalone imports (when environment is standalone with openenv from pip)
47
+ from openenv.core.env_server.http_server import create_app
48
+ from models import REPLAction, REPLObservation
49
+ from server.repl_environment import REPLEnvironment
50
+
51
+
52
# ============== LLM CONFIGURATION ==============
# Default model for llm_query/llm_query_batched; overridable via LLM_MODEL.
LLM_MODEL = os.environ.get("LLM_MODEL", "Qwen/Qwen3-Coder-480B-A35B-Instruct")
# Server-side fallback token; a client token passed to reset() takes priority.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
# ===============================================

# Log LLM configuration at startup so operators can see whether recursive
# LLM calls will work out of the box.
# (Fixed: first print was an f-string with no placeholders.)
if HF_TOKEN:
    print("[REPL Server] LLM support ENABLED (server token configured)")
    print(f"[REPL Server] Default model: {LLM_MODEL}")
else:
    print("[REPL Server] No server HF_TOKEN configured")
    print("[REPL Server] LLM functions will be enabled if client passes hf_token in reset()")

# Simple factory - LLM functions are created dynamically in reset() based on token
env_factory = REPLEnvironment

# Create the app with web interface and README integration
app = create_app(env_factory, REPLAction, REPLObservation, env_name="repl_env")
70
+
71
+
72
def main():
    """
    Entry point for direct execution via uv run or python -m.

    This function enables running the server without Docker:
        uv run --project . server
        python -m envs.repl_env.server.app
        openenv serve repl_env
    """
    # Imported lazily so merely importing this module never pulls in uvicorn.
    import uvicorn

    host, port = "0.0.0.0", 8000
    uvicorn.run(app, host=host, port=port)


if __name__ == "__main__":
    main()
server/python_executor.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Sandboxed Python code executor for the REPL environment.
9
+
10
+ Uses smolagents.LocalPythonExecutor as the backend for battle-tested sandboxed
11
+ execution, with RLM-specific features on top:
12
+ - Context loading (set_context)
13
+ - Variable access (get_variable, list_variables)
14
+ - Function injection (inject_function for llm_query, llm_query_batched)
15
+ - Output capped at 8,192 characters per turn (configurable)
16
+ - Persistent namespace across code blocks
17
+ """
18
+
19
+ import json
20
+ import logging
21
+ import time
22
+ import traceback
23
+ from collections.abc import Callable
24
+ from typing import Any, Dict, List, Optional
25
+
26
+ from smolagents import LocalPythonExecutor
27
+
28
+ logger = logging.getLogger(__name__)
29
+ logger.addHandler(logging.NullHandler())
30
+
31
+
32
class PythonExecutor:
    """Sandboxed Python code executor with persistent namespace.

    Wraps smolagents.LocalPythonExecutor with RLM-specific features:
    - Context loading for RLM tasks
    - Variable tracking for observation
    - Function injection for llm_query, llm_query_batched
    - Configurable output length limit (default 8192 chars per Prime Intellect)

    Note:
        Several methods probe the smolagents executor defensively
        (``hasattr(self._executor, 'state')``, ``getattr`` on the result
        object) because the available attributes vary across smolagents
        versions.
    """

    def __init__(
        self,
        max_output_length: int = 8192,
        timeout: float = 30.0,
        allowed_imports: Optional[List[str]] = None,
    ):
        """Initialize the executor.

        Args:
            max_output_length: Maximum characters for stdout/stderr (default 8192)
            timeout: Intended execution timeout in seconds.
                NOTE(review): stored on the instance but not forwarded to
                LocalPythonExecutor below - confirm whether smolagents
                enforces its own limit.
            allowed_imports: List of allowed module names for import
        """
        self.max_output_length = max_output_length
        self.timeout = timeout

        # Default allowed imports for RLM tasks: stdlib-only, no filesystem,
        # network, or process modules.
        default_imports = [
            "re",
            "json",
            "math",
            "random",
            "collections",
            "itertools",
            "functools",
            "operator",
            "string",
            "textwrap",
            "difflib",
            "statistics",
            "decimal",
            "fractions",
            "datetime",
            "copy",
            "pprint",
            "typing",
            "dataclasses",
            "enum",
            "bisect",
            "heapq",
            "array",
            "struct",
            "base64",
            "hashlib",
            "hmac",
            "uuid",
        ]

        self.allowed_imports = allowed_imports or default_imports

        # Initialize the smolagents executor (does the actual sandboxing)
        self._executor = LocalPythonExecutor(
            additional_authorized_imports=self.allowed_imports
        )

        # Track variables we've set (for list_variables)
        self._user_variables: set[str] = set()

        # Track callable functions to register with send_tools
        self._callable_tools: Dict[str, Callable[..., Any]] = {}

        # Register helper utilities
        self._register_helpers()

    def _register_helpers(self) -> None:
        """Register helper functions with the executor.

        Exposes ``format_exc`` (traceback of the current exception) and
        ``safe_json_dumps`` (json.dumps with repr() fallback for
        non-serializable objects) inside the sandbox.
        """
        helpers = {
            "format_exc": traceback.format_exc,
            "safe_json_dumps": lambda obj: json.dumps(obj, default=lambda o: repr(o)),
        }
        # Register helpers as callable tools
        for name, func in helpers.items():
            self.inject_function(name, func)

    def _sync_callable_tools(self) -> None:
        """Sync callable functions with the executor via send_tools.

        Failures are logged and swallowed so a bad tool never breaks the
        session.
        """
        if self._callable_tools:
            try:
                # Type ignore: smolagents accepts callables despite Tool type hint
                self._executor.send_tools(self._callable_tools)  # type: ignore[arg-type]
            except Exception:
                logger.debug("send_tools failed; continuing without extra tools", exc_info=True)

    def set_context(self, context: str, variable_name: str = "context") -> None:
        """Load context into namespace as a variable.

        Args:
            context: The context string to load
            variable_name: Name of the variable (default "context")
        """
        self.set_variable(variable_name, context)

    def set_variable(self, name: str, value: Any) -> None:
        """Set a variable in the namespace.

        Args:
            name: Variable name
            value: Variable value
        """
        # Access the executor's internal state to set variables
        if hasattr(self._executor, 'state'):
            self._executor.state[name] = value
        else:
            # Fallback: store in injected vars for later retrieval
            # NOTE(review): this fallback dict is only read back by
            # get_variable(); it is never pushed into the sandbox - confirm
            # whether any supported smolagents version lacks `.state`.
            self._executor._injected_vars = getattr(self._executor, '_injected_vars', {})
            self._executor._injected_vars[name] = value

        self._user_variables.add(name)

    def get_variable(self, name: str) -> Optional[Any]:
        """Retrieve a variable from namespace.

        Args:
            name: Variable name

        Returns:
            The variable value or None if not found
        """
        # Try to get from executor's state
        if hasattr(self._executor, 'state'):
            return self._executor.state.get(name)

        # Fallback to injected vars
        if hasattr(self._executor, '_injected_vars'):
            return self._executor._injected_vars.get(name)

        return None

    def list_variables(self) -> List[str]:
        """List non-private variables in namespace.

        Returns:
            List of variable names (excluding private and builtins).
            Order is unspecified (built from a set).
        """
        variables = set()

        # Get from executor's state
        if hasattr(self._executor, 'state'):
            for key in self._executor.state:
                if not key.startswith('_'):
                    variables.add(key)

        # Include tracked user variables
        variables.update(self._user_variables)

        return list(variables)

    def execute(self, code: str) -> Dict[str, Any]:
        """Execute Python code and return results.

        Args:
            code: Python code to execute

        Returns:
            Dictionary with stdout, stderr, locals_snapshot, execution_time,
            success, and exception fields
        """
        start_time = time.time()
        success = True
        exception_msg = None
        new_locals: Dict[str, str] = {}

        # Track state before execution so new variables can be diffed after
        pre_state_keys = set()
        if hasattr(self._executor, 'state'):
            pre_state_keys = set(self._executor.state.keys())

        stdout_parts: list[str] = []
        stderr_parts: list[str] = []

        try:
            exec_result = self._executor(code)

            # Extract logs/prints. Each attribute read below is wrapped in
            # its own try/except because the result object's attribute set
            # varies by smolagents version.
            try:
                logs = getattr(exec_result, "logs", None)
                if logs:
                    stdout_parts.append(str(logs))
            except Exception:
                logger.debug("Failed to read exec_result.logs", exc_info=True)

            # Extract the result / output value (JSON if possible, repr otherwise)
            try:
                if hasattr(exec_result, "output"):
                    out_val = exec_result.output
                    if out_val is not None:
                        try:
                            stdout_parts.append(json.dumps(out_val))
                        except Exception:
                            stdout_parts.append(repr(out_val))
            except Exception:
                logger.debug("Failed to read exec_result.output", exc_info=True)

            # Check for errors
            try:
                err = getattr(exec_result, "error", None)
                if err:
                    stderr_parts.append(str(err))
                    success = False
                    exception_msg = str(err)
            except Exception:
                logger.debug("Failed to read exec_result.error", exc_info=True)

            try:
                ex = getattr(exec_result, "exception", None)
                if ex:
                    stderr_parts.append(str(ex))
                    success = False
                    exception_msg = str(ex)
            except Exception:
                logger.debug("Failed to read exec_result.exception", exc_info=True)

            # Determine success from exit_code if available; otherwise fall
            # back to a `success` attribute (which may overwrite an earlier
            # error-derived False).
            try:
                if hasattr(exec_result, "exit_code"):
                    if exec_result.exit_code is not None and exec_result.exit_code != 0:
                        success = False
                elif hasattr(exec_result, "success"):
                    success = bool(exec_result.success)
            except Exception:
                logger.debug("Failed to determine exec_result exit code", exc_info=True)

        except Exception as e:
            # The executor itself raised (e.g. disallowed operation)
            success = False
            exception_msg = f"{type(e).__name__}: {str(e)}\n{traceback.format_exc()}"
            stderr_parts.append(exception_msg)

        execution_time = time.time() - start_time

        # Capture new/modified variables (repr capped at 500 chars each)
        if hasattr(self._executor, 'state'):
            for key in self._executor.state:
                if key not in pre_state_keys and not key.startswith('_'):
                    try:
                        val = self._executor.state[key]
                        val_repr = repr(val)
                        if len(val_repr) > 500:
                            val_repr = val_repr[:500] + "..."
                        new_locals[key] = val_repr
                        self._user_variables.add(key)
                    except Exception:
                        new_locals[key] = "<unrepresentable>"

        # Compose stdout/stderr
        stdout = "\n".join(part for part in stdout_parts if part)
        stderr = "\n".join(part for part in stderr_parts if part)

        # Truncate output to max_output_length; the reported total is the
        # pre-truncation length (len() is evaluated before reassignment).
        if len(stdout) > self.max_output_length:
            stdout = stdout[:self.max_output_length] + f"\n... (truncated, total {len(stdout)} chars)"

        if len(stderr) > self.max_output_length:
            stderr = stderr[:self.max_output_length] + f"\n... (truncated, total {len(stderr)} chars)"

        return {
            "stdout": stdout,
            "stderr": stderr,
            "locals_snapshot": new_locals,
            "execution_time": execution_time,
            "success": success,
            "exception": exception_msg,
        }

    def reset(self) -> None:
        """Reset namespace to initial state.

        Discards the old executor (and every user variable/tool), then
        re-registers the built-in helpers.
        """
        # Create a new executor instance
        self._executor = LocalPythonExecutor(
            additional_authorized_imports=self.allowed_imports
        )
        self._user_variables.clear()
        self._callable_tools.clear()
        self._register_helpers()

    def inject_function(self, name: str, func: Callable[..., Any]) -> None:
        """Inject a callable function into the namespace.

        Used for adding llm_query, llm_query_batched, FINAL, etc.

        Args:
            name: Function name in namespace
            func: The callable to inject
        """
        # Add to callable tools and sync with executor
        self._callable_tools[name] = func
        self._user_variables.add(name)
        self._sync_callable_tools()
server/repl_environment.py ADDED
@@ -0,0 +1,512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ REPL Environment Implementation.
9
+
10
+ A Python REPL environment for training language models on code execution tasks,
11
+ based on the Recursive Language Models (RLM) paradigm.
12
+
13
+ References:
14
+ - RLM Paper: https://arxiv.org/abs/2512.24601
15
+ - Prime Intellect Blog: https://www.primeintellect.ai/blog/rlm
16
+ - Alex Zhang Blog: https://alexzhang13.github.io/blog/2025/rlm/
17
+ """
18
+
19
+ import os
20
+ import re
21
+ from collections.abc import Callable
22
+ from typing import Any, Dict, List, Optional
23
+ from uuid import uuid4
24
+
25
+ # Support both in-repo and standalone imports
26
+ try:
27
+ from openenv.core.env_server.interfaces import Environment
28
+ from openenv.core.env_server.types import EnvironmentMetadata
29
+ except ImportError:
30
+ from openenv.core.env_server.interfaces import Environment
31
+ from openenv.core.env_server.types import EnvironmentMetadata
32
+
33
+ try:
34
+ from ..models import REPLAction, REPLObservation, REPLState, CodeBlockResult
35
+ except ImportError:
36
+ from models import REPLAction, REPLObservation, REPLState, CodeBlockResult
37
+
38
+ try:
39
+ from .python_executor import PythonExecutor
40
+ except ImportError:
41
+ from python_executor import PythonExecutor
42
+
43
+
44
+ class REPLEnvironment(Environment):
45
+ """
46
+ A REPL environment for training language models to use code execution.
47
+
48
+ Based on the Recursive Language Models (RLM) paradigm, this environment allows
49
+ language models to:
50
+ - Execute Python code in a sandboxed REPL
51
+ - Work with large contexts loaded as variables
52
+ - Finalize answers via FINAL(), FINAL_VAR(), or answer dict pattern
53
+ - Optionally make recursive LLM calls via llm_query() / llm_query_batched()
54
+
55
+ Supports two finalization patterns:
56
+ 1. RLM-style: print('FINAL(answer)') or print('FINAL_VAR(var_name)')
57
+ 2. Prime Intellect style: answer = {"content": "...", "ready": True}
58
+
59
+ Example:
60
+ >>> env = REPLEnvironment(context="Hello World", task_prompt="Count chars")
61
+ >>> obs = env.reset()
62
+ >>> print(obs.context_preview) # "Hello World"
63
+ >>>
64
+ >>> obs = env.step(REPLAction(code="result = len(context)"))
65
+ >>> print(obs.result.success) # True
66
+ >>> print(obs.available_variables) # ["context", "result", "answer"]
67
+ >>>
68
+ >>> obs = env.step(REPLAction(code="print(f'FINAL({result})')"))
69
+ >>> print(obs.done) # True
70
+ >>> print(obs.metadata["final_answer"]) # "11"
71
+ """
72
+
73
+ SUPPORTS_CONCURRENT_SESSIONS = True
74
+
75
+ def __init__(
76
+ self,
77
+ context: Optional[str] = None,
78
+ task_prompt: Optional[str] = None,
79
+ max_iterations: int = 30,
80
+ max_output_length: int = 8192,
81
+ context_preview_length: int = 500,
82
+ reward_on_success: float = 1.0,
83
+ reward_on_iteration: float = 0.0,
84
+ reward_on_failure: float = -0.1,
85
+ reward_on_error: float = -0.05,
86
+ llm_query_fn: Optional[Callable[[str], str]] = None,
87
+ llm_batch_fn: Optional[Callable[[List[str]], List[str]]] = None,
88
+ ):
89
+ """Initialize the REPL environment.
90
+
91
+ Args:
92
+ context: Initial context to load (can also be set via REPL_CONTEXT env var)
93
+ task_prompt: Task description (can also be set via REPL_TASK_PROMPT env var)
94
+ max_iterations: Maximum steps per episode (default 30, env var REPL_MAX_ITERATIONS)
95
+ max_output_length: Max chars for stdout/stderr per turn (default 8192)
96
+ context_preview_length: Chars to show in context preview (default 500)
97
+ reward_on_success: Reward when final answer is submitted (default 1.0)
98
+ reward_on_iteration: Reward per iteration step (default 0.0)
99
+ reward_on_failure: Reward when max iterations reached (default -0.1)
100
+ reward_on_error: Reward when code execution fails (default -0.05)
101
+ llm_query_fn: Optional function for llm_query() support
102
+ llm_batch_fn: Optional function for llm_query_batched() support
103
+ """
104
+ self.initial_context = context or os.environ.get("REPL_CONTEXT", "")
105
+ self.initial_task_prompt = task_prompt or os.environ.get("REPL_TASK_PROMPT", "")
106
+ self.max_iterations = int(os.environ.get("REPL_MAX_ITERATIONS", max_iterations))
107
+ self.max_output_length = max_output_length
108
+ self.context_preview_length = context_preview_length
109
+
110
+ # Reward configuration
111
+ self.reward_on_success = reward_on_success
112
+ self.reward_on_iteration = reward_on_iteration
113
+ self.reward_on_failure = reward_on_failure
114
+ self.reward_on_error = reward_on_error
115
+
116
+ # Optional LLM functions for recursive calls
117
+ self.llm_query_fn = llm_query_fn
118
+ self.llm_batch_fn = llm_batch_fn
119
+
120
+ # State (initialized on reset)
121
+ self._state: Optional[REPLState] = None
122
+ self._executor: Optional[PythonExecutor] = None
123
+
124
+ def _create_llm_functions(
125
+ self,
126
+ hf_token: str,
127
+ llm_model: Optional[str] = None,
128
+ ) -> None:
129
+ """Create LLM functions dynamically using client-provided token.
130
+
131
+ This allows clients to use their own HF token instead of the server's.
132
+
133
+ Args:
134
+ hf_token: HuggingFace API token
135
+ llm_model: Model to use (default: Qwen/Qwen3-Coder-480B-A35B-Instruct)
136
+ """
137
+ from concurrent.futures import ThreadPoolExecutor, as_completed
138
+
139
+ try:
140
+ from huggingface_hub import InferenceClient
141
+ except ImportError:
142
+ # huggingface_hub not installed, skip LLM functions
143
+ return
144
+
145
+ model = llm_model or os.environ.get(
146
+ "LLM_MODEL", "Qwen/Qwen3-Coder-480B-A35B-Instruct"
147
+ )
148
+ client = InferenceClient(model=model, token=hf_token)
149
+
150
+ def llm_query(prompt: str) -> str:
151
+ """Query the LLM with a prompt and return the response."""
152
+ try:
153
+ messages = [{"role": "user", "content": prompt}]
154
+ response = client.chat_completion(
155
+ messages=messages,
156
+ max_tokens=2048,
157
+ temperature=0.7,
158
+ )
159
+ return response.choices[0].message.content or ""
160
+ except Exception as e:
161
+ return f"Error calling LLM: {e}"
162
+
163
+ def llm_query_batched(prompts: List[str]) -> List[str]:
164
+ """Query the LLM with multiple prompts in parallel."""
165
+ if not prompts:
166
+ return []
167
+
168
+ max_workers = min(len(prompts), 8)
169
+ results: List[str] = [""] * len(prompts)
170
+
171
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
172
+ future_to_idx = {
173
+ executor.submit(llm_query, prompt): idx
174
+ for idx, prompt in enumerate(prompts)
175
+ }
176
+ for future in as_completed(future_to_idx):
177
+ idx = future_to_idx[future]
178
+ try:
179
+ results[idx] = future.result()
180
+ except Exception as e:
181
+ results[idx] = f"Error: {e}"
182
+
183
+ return results
184
+
185
+ self.llm_query_fn = llm_query
186
+ self.llm_batch_fn = llm_query_batched
187
+
188
    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        context: Optional[str] = None,
        task_prompt: Optional[str] = None,
        hf_token: Optional[str] = None,
        llm_model: Optional[str] = None,
        **kwargs: Any,
    ) -> REPLObservation:
        """Reset the environment with optional new context.

        Args:
            seed: Optional random seed (for reproducibility).
                NOTE(review): currently accepted but never used in this
                method - confirm whether determinism is expected.
            episode_id: Optional episode identifier (if not provided, one is generated)
            context: Context to load (overrides initial_context)
            task_prompt: Task description (overrides initial_task_prompt)
            hf_token: Optional HuggingFace token for llm_query/llm_query_batched.
                If provided, creates LLM functions using this token.
            llm_model: Optional model name for LLM functions (default: from env or Qwen3-Coder)
            **kwargs: Additional reset parameters

        Returns:
            Initial REPLObservation with environment ready message
        """
        # NOTE(review): `or` fallback means an explicitly passed empty string
        # cannot clear the init-time context/task - confirm intended.
        effective_context = context or self.initial_context
        effective_task_prompt = task_prompt or self.initial_task_prompt

        # Create LLM functions if not already provided at init
        # Priority: client hf_token > server HF_TOKEN env var
        if not self.llm_query_fn:
            effective_token = hf_token or os.environ.get("HF_TOKEN")
            if effective_token:
                self._create_llm_functions(effective_token, llm_model)

        # Initialize state (fresh episode counters, no final answer yet)
        self._state = REPLState(
            episode_id=episode_id or str(uuid4()),
            step_count=0,
            context=effective_context,
            task_prompt=effective_task_prompt,
            iteration=0,
            max_iterations=self.max_iterations,
            namespace_keys=[],
            final_answer=None,
            total_execution_time=0.0,
        )

        # Initialize executor (brand-new sandbox; discards prior namespace)
        self._executor = PythonExecutor(max_output_length=self.max_output_length)

        # Initialize answer dict (Prime Intellect style finalization)
        self._executor.set_variable("answer", {"content": "", "ready": False})

        # Load context into namespace if provided
        if effective_context:
            self._executor.set_context(effective_context)

        # Inject LLM functions if provided
        # Names: llm_query (single), llm_query_batched (official RLM), llm_batch (alias)
        if self.llm_query_fn:
            self._executor.inject_function("llm_query", self.llm_query_fn)
        if self.llm_batch_fn:
            self._executor.inject_function("llm_query_batched", self.llm_batch_fn)  # Official name
            self._executor.inject_function("llm_batch", self.llm_batch_fn)  # Alias

        # Inject FINAL helper function so both FINAL(x) and print(f'FINAL({x})') work
        # Returns the FINAL pattern as a string so it appears in stdout for detection
        def final_helper(value):
            """Helper that returns FINAL(value) string for detection."""
            return f"FINAL({value})"

        self._executor.inject_function("FINAL", final_helper)

        # Inject FINAL_VAR helper that looks up variable and returns FINAL(value)
        # This matches official RLM behavior - strips quotes from var_name and looks up in namespace
        executor = self._executor  # Capture for closure

        def final_var_helper(var_name: str):
            """Look up variable by name and return FINAL(value) for detection."""
            # Strip quotes if present (handles both FINAL_VAR("x") and FINAL_VAR(x))
            var_name_clean = str(var_name).strip().strip("\"'")
            # Look up variable in executor namespace
            # NOTE(review): a variable whose value is None falls through to
            # the fallback branch below - confirm acceptable.
            value = executor.get_variable(var_name_clean)
            if value is not None:
                return f"FINAL({value})"
            return f"FINAL_VAR({var_name_clean})"  # Fallback for regex detection

        self._executor.inject_function("FINAL_VAR", final_var_helper)

        # Update namespace keys
        self._state.namespace_keys = self._executor.list_variables()

        # Build initial message shown to the agent as the first "stdout"
        message_parts = ["REPL environment initialized."]
        if effective_context:
            message_parts.append(
                f"Context loaded ({len(effective_context)} chars). "
                "Use 'context' variable to access it."
            )
        if effective_task_prompt:
            message_parts.append(f"Task: {effective_task_prompt}")
        message_parts.append(
            "Use answer['content'] to store your answer, "
            "and set answer['ready'] = True when done."
        )

        return REPLObservation(
            result=CodeBlockResult(
                stdout="\n".join(message_parts),
                stderr="",
                locals_snapshot={},
                execution_time=0.0,
                success=True,
                exception=None,
            ),
            context_preview=(
                effective_context[: self.context_preview_length]
                if effective_context
                else None
            ),
            context_length=len(effective_context) if effective_context else 0,
            available_variables=self._state.namespace_keys,
            iteration=0,
            max_iterations=self.max_iterations,
            done=False,
            reward=0.0,
            metadata={
                "task_prompt": effective_task_prompt,
                "message": "Environment ready.",
            },
        )
320
+
321
+ def step(
322
+ self,
323
+ action: REPLAction,
324
+ timeout_s: Optional[float] = None,
325
+ **kwargs: Any,
326
+ ) -> REPLObservation:
327
+ """Execute code and return observation.
328
+
329
+ Args:
330
+ action: REPLAction containing code to execute
331
+ timeout_s: Optional timeout in seconds (not currently used)
332
+ **kwargs: Additional step parameters
333
+
334
+ Returns:
335
+ REPLObservation with execution results
336
+ """
337
+ if self._state is None or self._executor is None:
338
+ raise RuntimeError("Environment not initialized. Call reset() first.")
339
+
340
+ self._state.step_count += 1
341
+ self._state.iteration += 1
342
+
343
+ # Check if agent explicitly signals final answer
344
+ if action.is_final:
345
+ self._state.final_answer = action.final_answer or ""
346
+ return self._create_final_observation(
347
+ success=True,
348
+ message="Final answer submitted.",
349
+ reward=self.reward_on_success,
350
+ )
351
+
352
+ # Check iteration limit
353
+ if self._state.iteration >= self.max_iterations:
354
+ # Check if there's a partial answer in the answer dict
355
+ answer_var = self._executor.get_variable("answer")
356
+ if isinstance(answer_var, dict) and answer_var.get("content"):
357
+ self._state.final_answer = str(answer_var.get("content", ""))
358
+ return self._create_final_observation(
359
+ success=False,
360
+ message=f"Maximum iterations ({self.max_iterations}) reached.",
361
+ reward=self.reward_on_failure,
362
+ )
363
+
364
+ # Execute code
365
+ result = self._executor.execute(action.code)
366
+ self._state.total_execution_time += result["execution_time"]
367
+ self._state.namespace_keys = self._executor.list_variables()
368
+
369
+ # Calculate reward
370
+ reward = self.reward_on_iteration
371
+ if not result["success"]:
372
+ reward += self.reward_on_error
373
+
374
+ # Check for final answer patterns
375
+ final_answer = self._extract_final_answer(result["stdout"])
376
+ done = final_answer is not None
377
+
378
+ if done:
379
+ self._state.final_answer = final_answer
380
+ reward = self.reward_on_success
381
+
382
+ return REPLObservation(
383
+ result=CodeBlockResult(
384
+ stdout=result["stdout"],
385
+ stderr=result["stderr"],
386
+ locals_snapshot=result["locals_snapshot"],
387
+ execution_time=result["execution_time"],
388
+ success=result["success"],
389
+ exception=result["exception"],
390
+ ),
391
+ context_preview=(
392
+ self._state.context[: self.context_preview_length]
393
+ if self._state.context
394
+ else None
395
+ ),
396
+ context_length=len(self._state.context) if self._state.context else 0,
397
+ available_variables=self._state.namespace_keys,
398
+ iteration=self._state.iteration,
399
+ max_iterations=self.max_iterations,
400
+ done=done,
401
+ reward=reward,
402
+ metadata={
403
+ "task_prompt": self._state.task_prompt,
404
+ "final_answer": final_answer,
405
+ "execution_time": result["execution_time"],
406
+ },
407
+ )
408
+
409
+ def _extract_final_answer(self, stdout: str) -> Optional[str]:
410
+ """Extract final answer from output.
411
+
412
+ Supports multiple patterns:
413
+ 1. RLM-style: FINAL(answer) in stdout
414
+ 2. RLM-style: FINAL_VAR(variable_name) in stdout
415
+ 3. Prime Intellect style: answer = {"content": "...", "ready": True} in namespace
416
+
417
+ Args:
418
+ stdout: Standard output from code execution
419
+
420
+ Returns:
421
+ Final answer string or None if not found
422
+ """
423
+ # Pattern 1: RLM-style FINAL(answer)
424
+ final_match = re.search(r"FINAL\((.*?)\)", stdout, re.DOTALL)
425
+ if final_match:
426
+ return final_match.group(1).strip()
427
+
428
+ # Pattern 2: RLM-style FINAL_VAR(variable_name)
429
+ final_var_match = re.search(r"FINAL_VAR\((\w+)\)", stdout)
430
+ if final_var_match and self._executor:
431
+ var_name = final_var_match.group(1)
432
+ value = self._executor.get_variable(var_name)
433
+ if value is not None:
434
+ return str(value)
435
+
436
+ # Pattern 3: Prime Intellect style answer dict
437
+ if self._executor:
438
+ answer_var = self._executor.get_variable("answer")
439
+ if isinstance(answer_var, dict):
440
+ if answer_var.get("ready", False):
441
+ return str(answer_var.get("content", ""))
442
+
443
+ return None
444
+
445
+ def _create_final_observation(
446
+ self, success: bool, message: str, reward: float
447
+ ) -> REPLObservation:
448
+ """Create observation for episode termination.
449
+
450
+ Args:
451
+ success: Whether the episode ended successfully
452
+ message: Termination message
453
+ reward: Final reward value
454
+
455
+ Returns:
456
+ Final REPLObservation with done=True
457
+ """
458
+ return REPLObservation(
459
+ result=CodeBlockResult(
460
+ stdout=message,
461
+ stderr="",
462
+ locals_snapshot={},
463
+ execution_time=0.0,
464
+ success=success,
465
+ exception=None,
466
+ ),
467
+ context_preview=None,
468
+ context_length=0,
469
+ available_variables=[],
470
+ iteration=self._state.iteration if self._state else 0,
471
+ max_iterations=self.max_iterations,
472
+ done=True,
473
+ reward=reward,
474
+ metadata={
475
+ "final_answer": self._state.final_answer if self._state else None,
476
+ "total_execution_time": (
477
+ self._state.total_execution_time if self._state else 0
478
+ ),
479
+ "total_iterations": self._state.iteration if self._state else 0,
480
+ },
481
+ )
482
+
483
+ @property
484
+ def state(self) -> REPLState:
485
+ """Get the current environment state.
486
+
487
+ Returns:
488
+ Current REPLState
489
+
490
+ Raises:
491
+ RuntimeError: If environment not initialized
492
+ """
493
+ if self._state is None:
494
+ raise RuntimeError("Environment not initialized. Call reset() first.")
495
+ return self._state
496
+
497
+ def close(self) -> None:
498
+ """Cleanup resources."""
499
+ self._executor = None
500
+ self._state = None
501
+
502
+ def get_metadata(self) -> EnvironmentMetadata:
503
+ """Get environment metadata.
504
+
505
+ Returns:
506
+ EnvironmentMetadata with environment info
507
+ """
508
+ return EnvironmentMetadata(
509
+ name="repl_env",
510
+ description="Python REPL environment for RLM-style code execution",
511
+ version="0.1.0",
512
+ )
uv.lock ADDED
The diff for this file is too large to render. See raw diff