Parthiban007 commited on
Commit
9763ffa
·
verified ·
1 Parent(s): 3955490

Upload folder using huggingface_hub

Browse files
Dockerfile ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Multi-stage build for Rust Coder Environment
# Build context: repo root (contains models.py, problems.json, pyproject.toml, uv.lock)
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

# 1. Environment Setup
USER root
WORKDIR /app

# Install build essentials for Rust (linker, etc.)
RUN apt-get update && \
    apt-get install -y --no-install-recommends git curl build-essential ca-certificates && \
    rm -rf /var/lib/apt/lists/*

# 2. Create the non-root user (Hugging Face default)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.cargo/bin:/home/user/.local/bin:$PATH

# 3. Install Rust toolchain as 'user'.
# rustup-init already installs and activates stable via --default-toolchain,
# so a separate `rustup toolchain install stable` would be redundant.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable

# 4. Copy environment code and install Python dependencies
WORKDIR $HOME/app/env
COPY --chown=user . $HOME/app/env

# Install uv (if not present) and then build the virtual environment
RUN if ! command -v uv >/dev/null 2>&1; then \
        curl -LsSf https://astral.sh/uv/install.sh | sh; \
    fi

RUN --mount=type=cache,target=/home/user/.cache/uv,uid=1000,gid=1000 \
    uv sync --no-editable

# -------------------------------------------------------------
# Final Runtime Stage
# -------------------------------------------------------------
FROM ${BASE_IMAGE}

USER root
RUN apt-get update && \
    apt-get install -y --no-install-recommends curl build-essential ca-certificates && \
    rm -rf /var/lib/apt/lists/*

# Create the user again in the final stage
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH="/home/user/app/env/.venv/bin:/home/user/.cargo/bin:$PATH" \
    PYTHONPATH="/home/user/app/env:$PYTHONPATH"

# Copy Cargo/Rustup from builder and then the local code
COPY --from=builder --chown=user /home/user/.cargo /home/user/.cargo
COPY --from=builder --chown=user /home/user/.rustup /home/user/.rustup

WORKDIR $HOME/app/env
COPY --chown=user . $HOME/app/env

# Reuse the virtual environment built in the builder stage instead of
# re-installing uv and re-running `uv sync` here (the builder's cached sync
# was otherwise discarded, doubling build time). The venv lives at the same
# absolute path in both stages, so its entry-point shebangs remain valid.
COPY --from=builder --chown=user /home/user/app/env/.venv /home/user/app/env/.venv

# -------------------------------------------------------------
# Final Config
# -------------------------------------------------------------
EXPOSE 8000
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

ENV ENABLE_WEB_INTERFACE=true
CMD ["/home/user/app/env/.venv/bin/uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000", "--log-level", "info"]
README.md CHANGED
@@ -1,10 +1,218 @@
1
  ---
2
- title: Rust Coder
3
- emoji: 🚀
4
- colorFrom: purple
5
- colorTo: indigo
6
  sdk: docker
 
 
7
  pinned: false
 
 
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Rust Coder OpenEnv
3
+ emoji: 🦀
4
+ colorFrom: red
5
+ colorTo: yellow
6
  sdk: docker
7
+ app_port: 8000
8
+ base_path: /web
9
  pinned: false
10
+ tags:
11
+ - openenv
12
+ - software-engineering
13
+ - rust
14
  ---
15
 
16
+ # Rust Coder: Systems Engineering Environment
17
+
18
+ Rust Coder is a high-fidelity **OpenEnv** environment designed to evaluate and train LLM agents on real-world Rust systems programming tasks. Unlike toy environments, Rust Coder simulates realistic engineering scenarios involving the borrow checker, concurrency, and memory safety.
19
+
20
+ ## Motivation
21
+
22
+ Rust is uniquely challenging for AI agents due to its strict compile-time safety guarantees. This environment provides a 10-task progression that measures an agent's ability to:
23
+
24
+ 1. Fix borrow checker violations
25
+ 2. Correctly annotate lifetimes
26
+ 3. Resolve concurrency deadlocks
27
+ 4. Write unsafe FFI code correctly
28
+ 5. Identify and prevent memory leaks
29
+ 6. Optimize data pipelines for performance
30
+
31
+ ---
32
+
33
+ ## Action Space
34
+
35
+ **Type**: `RustCoderAction`
36
+
37
+ The agent submits a single string containing the complete, fixed Rust source code.
38
+
39
+ | Field | Type | Description |
40
+ |-------|--------|------------------------------------------|
41
+ | `code` | string | Full Rust source code to compile and test |
42
+
43
+ ## Observation Space
44
+
45
+ **Type**: `RustCoderObservation`
46
+
47
+ The environment returns detailed feedback after each submission:
48
+
49
+ | Field | Type | Description |
50
+ |------------------------|-------------|-----------------------------------------------------|
51
+ | `problem_description` | string | Task requirements and context |
52
+ | `starter_code` | string | The intentionally broken code to fix |
53
+ | `compilation_success` | bool | Whether `rustc` compiled the submitted code |
54
+ | `compilation_output` | string | Raw compiler errors and warnings |
55
+ | `test_results` | list[dict] | Per-test pass/fail results with error details |
56
+ | `reward_breakdown` | dict | Weighted score breakdown across 5 dimensions |
57
+
58
+ ---
59
+
60
+ ## Reward Function
61
+
62
+ Total reward is a weighted sum of 5 dimensions, each normalized to [0, 1]:
63
+
64
+ | Dimension | Weight | Metric |
65
+ |-----------------|--------|---------------------------------------------------|
66
+ | Compilation | 40% | Binary success/failure of `rustc` |
67
+ | Correctness | 20% | Fraction of test assertions that pass |
68
+ | Coverage | 20% | Fraction of tests that successfully ran |
69
+ | Elegance | 10% | Code quality heuristics (avoids `.unwrap()`, long lines, `unsafe`) |
70
+ | Efficiency | 10% | Execution time vs. per-problem baseline |
71
+
72
+ Reward provides partial signal at every step — compilation alone earns 0.40, passing all tests earns up to 1.0.
73
+
74
+ ---
75
+
76
+ ## Tasks
77
+
78
+ 10 sequential problems with increasing difficulty:
79
+
80
+ | ID | Title | Difficulty | Skill Evaluated |
81
+ |----|------------------------------------|------------|-------------------------------|
82
+ | 1 | Broken CLI Argument Parser | Easy | Enums & pattern matching |
83
+ | 2 | Conflicting Borrows | Easy→Med | Borrow checker |
84
+ | 3 | Invalid Lifetime Annotations | Medium | Lifetime annotations |
85
+ | 4 | Business Logic Errors | Medium | Math & correctness |
86
+ | 5 | Linked List Management | Medium | Ownership & data structures |
87
+ | 6 | Multi-threaded Deadlocks | Hard | Mutex & concurrency |
88
+ | 7 | Async Borrowing Conflicts | Hard | Async/await lifetimes |
89
+ | 8 | Unsafe FFI Integration | Hard | `unsafe` & C interop |
90
+ | 9 | Inefficient Data Pipeline | Hard | Performance optimization |
91
+ | 10 | Memory Leak Prevention | Hard+ | Weak pointers & ownership |
92
+
93
+ ---
94
+
95
+ ## Environment Variables / Secrets
96
+
97
+ The environment reads the following variables. Set them as **HF Space secrets** (Settings → Variables and Secrets) when deploying to Hugging Face, or in a local `.env` file for development.
98
+
99
+ | Variable | Required | Default | Description |
100
+ |----------------|----------|--------------------------------------|--------------------------------------|
101
+ | `HF_TOKEN` | Yes | — | Hugging Face API token for LLM calls |
102
+ | `API_BASE_URL` | No | `https://router.huggingface.co/v1` | Inference endpoint |
103
+ | `MODEL_NAME` | No | `Qwen/Qwen2.5-72B-Instruct` | Model to use for evaluation |
104
+
105
+ > **Note**: The `.env` file is excluded from Docker images by `.dockerignore`. On HF Spaces, secrets are injected as OS environment variables by the platform — `load_dotenv()` silently does nothing if no file is present, and `os.getenv()` reads from the platform-injected vars. This is the correct behavior.
106
+
107
+ ---
108
+
109
+ ## Setup & Usage
110
+
111
+ ### Local Development
112
+
113
+ ```bash
114
+ # 1. Clone and enter the repo
115
+ git clone https://github.com/your-username/rust_coder
116
+ cd rust_coder
117
+
118
+ # 2. Create .env with your credentials
119
+ cat > .env << EOF
120
+ HF_TOKEN=hf_your_token_here
121
+ API_BASE_URL=https://router.huggingface.co/v1
122
+ MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
123
+ EOF
124
+
125
+ # 3. Build the Docker image (uses root Dockerfile)
126
+ docker build -t rust_coder:latest .
127
+
128
+ # 4. Run the environment server
129
+ docker run -d -p 8000:8000 --env-file .env --name rust_env rust_coder:latest
130
+
131
+ # 5. Verify it's healthy
132
+ curl http://localhost:8000/health
133
+ # → {"status": "healthy"}
134
+
135
+ # 6. Run the inference benchmark
136
+ python inference.py
137
+ ```
138
+
139
+ ### Docker Commands Reference
140
+
141
+ ```bash
142
+ # Build
143
+ docker build -t rust_coder:latest .
144
+
145
+ # Run with .env file
146
+ docker run -d -p 8000:8000 --env-file .env --name rust_env rust_coder:latest
147
+
148
+ # View logs
149
+ docker logs rust_env
150
+
151
+ # Stop
152
+ docker stop rust_env
153
+ ```
154
+
155
+ ### Environment API
156
+
157
+ ```bash
158
+ # Reset (returns first problem)
159
+ curl -X POST http://localhost:8000/reset
160
+
161
+ # Step (submit Rust code)
162
+ curl -X POST http://localhost:8000/step \
163
+ -H "Content-Type: application/json" \
164
+ -d '{"action": {"code": "fn main() { println!(\"hello\"); }"}}'
165
+
166
+ # Health check
167
+ curl http://localhost:8000/health
168
+ ```
169
+
170
+ ### HF Spaces Deployment
171
+
172
+ ```bash
173
+ # Install HF CLI
174
+ pip install huggingface_hub
175
+
176
+ # Login
177
+ huggingface-cli login
178
+
179
+ # Push to Space
180
+ openenv push --repo-id your-username/rust-coder
181
+ ```
182
+
183
+ Then go to your Space settings and add secrets:
184
+ - `HF_TOKEN` → your Hugging Face API token
185
+ - `MODEL_NAME` → e.g. `Qwen/Qwen2.5-72B-Instruct`
186
+
187
+ ---
188
+
189
+ ## Baseline Scores
190
+
191
+ Baseline using **Qwen/Qwen2.5-72B-Instruct** via Hugging Face router:
192
+
193
+ | Metric | Score |
194
+ |----------------|-------|
195
+ | Average reward | 0.59 |
196
+ | Compilation % | ~85% |
197
+ | Correctness % | ~45% |
198
+
199
+ ---
200
+
201
+ ## Project Structure
202
+
203
+ ```
204
+ rust_coder/
205
+ ├── Dockerfile # Root Dockerfile (used by validator + HF Spaces)
206
+ ├── server/Dockerfile # Identical copy (used for -f flag builds)
207
+ ├── openenv.yaml # OpenEnv spec metadata
208
+ ├── pyproject.toml # Python package config
209
+ ├── uv.lock # Locked dependencies
210
+ ├── problems.json # 10 coding problems dataset
211
+ ├── models.py # Pydantic action/observation types
212
+ ├── client.py # WebSocket client for RustCoderEnv
213
+ ├── inference.py # Baseline inference script (entry point)
214
+ ├── __init__.py # Package exports
215
+ └── server/
216
+ ├── app.py # FastAPI + Gradio server
217
+ └── rust_coder_environment.py # Core environment logic
218
+ ```
__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Rust Coder Environment package exports."""

# NOTE(review): client.py imports `models` absolutely (`from models import ...`),
# so this relative import of `.client` only works when the package directory is
# also on sys.path — confirm the intended import style.
from .client import RustCoderEnv
from .models import RustCoderAction, RustCoderObservation

# Public API of the rust_coder package.
__all__ = [
    "RustCoderAction",
    "RustCoderObservation",
    "RustCoderEnv",
]
client.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Rust Coder Environment Client."""
2
+
3
+ from typing import Dict
4
+
5
+ from openenv.core import EnvClient
6
+ from openenv.core.client_types import StepResult
7
+ from openenv.core.env_server.types import State
8
+
9
+ from models import RustCoderAction, RustCoderObservation
10
+
11
+
12
class RustCoderEnv(
    EnvClient[RustCoderAction, RustCoderObservation, State]
):
    """
    Client for the Rust Coder Environment.

    Holds a persistent WebSocket connection to the environment server and
    translates between the typed action/observation models and the JSON
    wire format.

    Example:
        >>> with RustCoderEnv(base_url="http://localhost:8000") as client:
        ...     result = client.reset()
        ...     print(result.observation.problem_description)
        ...     result = client.step(RustCoderAction(code="fn main() {}"))
        ...     print(result.reward)

    Example with Docker:
        >>> client = RustCoderEnv.from_docker_image("rust_coder-env:latest")
        >>> try:
        ...     result = client.reset()
        ...     result = client.step(RustCoderAction(code="fn main() {}"))
        ... finally:
        ...     client.close()
    """

    def _step_payload(self, action: RustCoderAction) -> Dict:
        """Serialize a RustCoderAction into the JSON body of a step message."""
        return {"code": action.code}

    def _parse_result(self, payload: Dict) -> StepResult[RustCoderObservation]:
        """Deserialize a server response into a StepResult[RustCoderObservation]."""
        obs_data = payload.get("observation", {})
        reward = payload.get("reward", 0.0)
        done = payload.get("done", False)

        observation = RustCoderObservation(
            problem_description=obs_data.get("problem_description", ""),
            starter_code=obs_data.get("starter_code", ""),
            compilation_success=obs_data.get("compilation_success", False),
            compilation_output=obs_data.get("compilation_output", ""),
            test_results=obs_data.get("test_results", []),
            reward_breakdown=obs_data.get("reward_breakdown", {}),
            done=done,
            reward=reward,
        )

        return StepResult(observation=observation, reward=reward, done=done)

    def _parse_state(self, payload: Dict) -> State:
        """Deserialize a server response into a State object."""
        episode_id = payload.get("episode_id")
        step_count = payload.get("step_count", 0)
        return State(episode_id=episode_id, step_count=step_count)
inference.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import asyncio
5
+ from typing import List, Optional
6
+ from openai import OpenAI
7
+ from dotenv import load_dotenv
8
+
9
+ load_dotenv()
10
+
11
# --- Competition Configuration ---
# `or` (rather than a default arg to getenv) also replaces empty-string values.
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")  # API_KEY is a fallback name
ENV_URL = os.getenv("ENV_URL") or "http://localhost:8000"

# Episode constants: 10 problems, each worth max reward 1.0
MAX_STEPS = 10
MAX_TOTAL_REWARD = 10.0
SUCCESS_SCORE_THRESHOLD = 0.5  # normalized score needed to count the run as a success

# Import client (ensure rust_coder is in PYTHONPATH)
from client import RustCoderEnv
from models import RustCoderAction
25
+
26
+ # --- Strict Logging Helpers ---
27
def log_start(task: str, env: str, model: str):
    """Emit the machine-parseable [START] marker for an evaluation run."""
    header = f'[START] task="{task}" env="{env}" model="{model}"'
    print(header, flush=True)
29
+
30
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str] = None):
    """Emit one machine-parseable [STEP] log line and return it.

    Args:
        step: 1-based step index within the episode.
        action: The submitted code; backslashes, newlines and double quotes
            are escaped and the preview is truncated to 100 characters.
        reward: Reward returned by the environment for this step.
        done: Whether the episode terminated on this step.
        error: Optional error description appended to the line.

    Returns:
        The formatted log line (also printed with flush=True).
    """
    # Escape characters that would break the action="..." field framing.
    # (The original escaped only newlines, so embedded quotes corrupted the
    # field, and it appended "..." even when nothing was truncated.)
    preview = action.replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')
    if len(preview) > 100:
        preview = preview[:100] + "..."
    log_line = f'[STEP] step={step} action="{preview}" reward={reward:.4f} done={str(done).lower()}'
    if error:
        log_line += f' error="{error}"'
    print(log_line, flush=True)
    return log_line
36
+
37
def log_end(success: bool, steps: int, score: float, rewards: List[float]):
    """Emit the machine-parseable [END] summary line for the run."""
    flag = str(success).lower()
    summary = (
        f'[END] success={flag} steps={steps} '
        f'score={score:.4f} rewards={json.dumps(rewards)}'
    )
    print(summary, flush=True)
39
+
40
+ # --- LLM Solution Logic ---
41
async def get_model_code(prompt: str, client: OpenAI) -> str:
    """Ask the LLM for a complete Rust solution to *prompt*.

    The OpenAI SDK call is synchronous, so it is dispatched to a worker
    thread via asyncio.to_thread — the original awaited nothing and ran the
    blocking request inline, stalling the event loop for the whole call.

    Returns:
        The extracted Rust source, or a Rust line comment describing the
        failure so the environment still receives a string to evaluate.
    """
    try:
        completion = await asyncio.to_thread(
            client.chat.completions.create,
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a senior Rust systems engineer. Return ONLY the complete, fixed Rust code. No explanation."},
                {"role": "user", "content": prompt},
            ],
            temperature=0.1,
        )
        text = (completion.choices[0].message.content or "").strip()

        # Extract code from markdown fences, preferring an explicit ```rust block.
        if "```rust" in text:
            text = text.split("```rust")[1].split("```")[0]
        elif "```" in text:
            text = text.split("```")[1].split("```")[0]
        return text.strip()
    except Exception as e:
        print(f"[DEBUG] LLM Request failed: {e}", flush=True)
        return f"// Error: {e}"
63
+
64
+ # --- Main Evaluation Loop ---
65
async def main():
    """Run one full evaluation episode against the Rust Coder environment.

    Connects to the environment server, iterates through up to MAX_STEPS
    problems, asks the LLM for a fix at each step, and emits the strict
    [START]/[STEP]/[END] log lines expected by the benchmark harness.
    """
    if not HF_TOKEN:
        print("Error: HF_TOKEN/API_KEY not found in environment.")
        return

    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    env = RustCoderEnv(base_url=ENV_URL)

    log_start(task="rust_coder", env="RustCoder-v1", model=MODEL_NAME)

    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False

    try:
        # Start the single episode (10 problems)
        # NOTE(review): env.reset()/step()/close() are awaited here, but the
        # RustCoderEnv docstring shows synchronous usage — confirm EnvClient
        # exposes awaitable methods.
        result = await env.reset()
        obs = result.observation

        for step in range(1, MAX_STEPS + 1):
            if result.done:
                break

            steps_taken = step

            # Format prompt including starter code if available
            prompt = obs.problem_description
            if obs.starter_code:
                prompt += f"\n\nStarter Code:\n```rust\n{obs.starter_code}\n```"

            # 1. Ask model for solution to current task
            code_solution = await get_model_code(prompt, client)

            # 2. Environment step
            result = await env.step(RustCoderAction(code=code_solution))
            obs = result.observation
            reward = result.reward or 0.0
            done = result.done

            rewards.append(reward)
            log_step(step=step, action=code_solution, reward=reward, done=done)

            if done:
                break

        # Normalize score to [0, 1] matching sample format
        score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
        score = min(max(score, 0.0), 1.0)
        success = score >= SUCCESS_SCORE_THRESHOLD

    except Exception as e:
        print(f"[DEBUG] Runtime error: {e}", flush=True)
        log_step(step=steps_taken + 1, action="error", reward=0.0, done=True, error=str(e))

    finally:
        # Always close the connection and emit the [END] summary, even on error.
        try:
            await env.close()
        except Exception as e:
            print(f"[DEBUG] env.close() error: {e}", flush=True)
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

if __name__ == "__main__":
    asyncio.run(main())
models.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Data models for the Rust Coder Environment.

Defines the typed action submitted by the agent (a complete Rust source
file) and the observation returned by the environment (problem statement,
compiler feedback, test results, and the reward breakdown).
"""
# The previous module docstring ("a simple test environment that echoes back
# messages") was a copy-paste leftover from an echo environment and did not
# describe these models.

from openenv.core.env_server.types import Action, Observation
from pydantic import Field


class RustCoderAction(Action):
    """Action for the Rust Coder environment - contains the Rust code to evaluate."""

    code: str = Field(..., description="Rust source code to compile and run")


class RustCoderObservation(Observation):
    """Observation space for the Rust Coder environment."""

    problem_description: str = Field(default="", description="The text description of the current coding task, including requirements.")
    starter_code: str = Field(default="", description="The specific Rust code snippet that needs fixing for this task.")
    compilation_success: bool = Field(default=False, description="Binary flag indicating if the last submission compiled.")
    compilation_output: str = Field(default="", description="Raw stdout/stderr from the rustc compiler.")
    test_results: list[dict] = Field(default_factory=list, description="A list of results from automated test assertions.")
    reward_breakdown: dict = Field(default_factory=dict, description="Detailed components of the 0.0-1.0 reward.")
openenv.yaml ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ spec_version: 1
2
+ name: rust_coder
3
+ description: "High-fidelity RL environment for evaluating LLM agents on Rust systems programming, including borrow checking, safe concurrency, and memory management."
4
+ type: space
5
+ runtime: fastapi
6
+ app: server.app:app
7
+ port: 8000
8
+ dockerfile: Dockerfile
9
+ tags:
10
+ - openenv
11
+ - software-engineering
12
+ - rust
13
+ - coding-benchmark
14
+
15
+ # Task Definition (Easy -> Medium -> Hard)
16
+ tasks:
17
+ - id: 1
18
+ title: "Broken CLI Argument Parser"
19
+ difficulty: "Easy"
20
+ - id: 2
21
+ title: "Conflicting Borrows"
22
+ difficulty: "Easy"
23
+ - id: 3
24
+ title: "Lifetime Annotations"
25
+ difficulty: "Medium"
26
+ - id: 4
27
+ title: "Business Logic"
28
+ difficulty: "Medium"
29
+ - id: 5
30
+ title: "Linked List Management"
31
+ difficulty: "Medium"
32
+ - id: 6
33
+ title: "Multi-threaded Deadlocks"
34
+ difficulty: "Hard"
35
+ - id: 7
36
+ title: "Async Borrowing"
37
+ difficulty: "Hard"
38
+ - id: 8
39
+ title: "Unsafe FFI Integration"
40
+ difficulty: "Hard"
41
+ - id: 9
42
+ title: "Inefficient Data Pipelines"
43
+ difficulty: "Hard"
44
+ - id: 10
45
+ title: "Memory Leak Prevention"
46
+ difficulty: "Hard+"
47
+
48
+ # Definitions for Documentation and Graders
49
+ action_space:
50
+ type: "RustCoderAction"
51
+ description: "A single string containing the fixed Rust code."
52
+
53
+ observation_space:
54
+ type: "RustCoderObservation"
55
+ description: "Observation containing problem description, compilation logs, test results, and reward breakdown."
56
+
openenv_rust_coder.egg-info/PKG-INFO ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: openenv-rust_coder
3
+ Version: 0.1.0
4
+ Summary: Rust Coder environment for OpenEnv
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: openenv-core[core]>=0.2.2
7
+ Requires-Dist: openai>=1.0.0
8
+ Requires-Dist: pydantic>=2.0.0
9
+ Provides-Extra: dev
10
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
11
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
openenv_rust_coder.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ README.md
2
+ pyproject.toml
3
+ ./__init__.py
4
+ ./client.py
5
+ ./inference.py
6
+ ./models.py
7
+ openenv_rust_coder.egg-info/PKG-INFO
8
+ openenv_rust_coder.egg-info/SOURCES.txt
9
+ openenv_rust_coder.egg-info/dependency_links.txt
10
+ openenv_rust_coder.egg-info/entry_points.txt
11
+ openenv_rust_coder.egg-info/requires.txt
12
+ openenv_rust_coder.egg-info/top_level.txt
13
+ server/__init__.py
14
+ server/app.py
15
+ server/rust_coder_environment.py
openenv_rust_coder.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
openenv_rust_coder.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [console_scripts]
2
+ server = rust_coder.server.app:main
openenv_rust_coder.egg-info/requires.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ openenv-core[core]>=0.2.2
2
+ openai>=1.0.0
3
+ pydantic>=2.0.0
4
+
5
+ [dev]
6
+ pytest>=8.0.0
7
+ pytest-cov>=4.0.0
openenv_rust_coder.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ rust_coder
problems.json ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": 1,
4
+ "title": "Broken CLI Argument Parser",
5
+ "difficulty": "Easy",
6
+ "description": "Fix a command-line tool that parses user input to determine file operations (read, write, append). The implementation uses enums and pattern matching but contains: Mismatched types in enum variants, Incomplete match arms, Incorrect handling of optional arguments. The parser must compile and correctly interpret valid command-line inputs like: 'read file.txt' -> FileOp::Read('file.txt'), 'write file.txt content' -> FileOp::Write('file.txt', Some('content')), 'append file.txt' -> FileOp::Append('file.txt')",
7
+ "starter_code": "#[derive(Debug, PartialEq)]\nenum FileOp {\n Read(String),\n Write(String, Option<String>),\n Append(String),\n}\n\nfn parse_command(input: &str) -> Option<FileOp> {\n let parts: Vec<&str> = input.split_whitespace().collect();\n \n match parts.get(0) {\n Some(&\"read\") => {\n let filename = parts.get(1)?;\n FileOp::Read(filename.to_string()) // BUG: Missing Some()\n }\n Some(&\"write\") => {\n let filename = parts.get(1)?;\n let content = parts.get(2).map(|s| s.to_string());\n Some(FileOp::Write(filename.to_string(), content))\n }\n Some(&\"append\") => {\n let filename = parts.get(1)?;\n // BUG: Missing return statement\n }\n _ => None,\n }\n}\n\nfn main() {\n println!(\"CLI Parser Test\");\n}",
8
+ "tests": [
9
+ {
10
+ "name": "parse_read_command",
11
+ "input_code": "parse_command(\"read file.txt\")",
12
+ "expected_output": "Some\\(FileOp::Read\\(\"file\\.txt\"\\)\\)",
13
+ "should_compile": false,
14
+ "test_assertion": "assert_eq!(parse_command(\"read file.txt\"), Some(FileOp::Read(\"file.txt\".to_string())))"
15
+ },
16
+ {
17
+ "name": "parse_write_command",
18
+ "input_code": "parse_command(\"write file.txt hello\")",
19
+ "expected_output": "Some\\(FileOp::Write\\(\"file\\.txt\", Some\\(\"hello\"\\)\\)\\)",
20
+ "should_compile": false,
21
+ "test_assertion": "assert_eq!(parse_command(\"write file.txt hello\"), Some(FileOp::Write(\"file.txt\".to_string(), Some(\"hello\".to_string()))))"
22
+ },
23
+ {
24
+ "name": "parse_append_command",
25
+ "input_code": "parse_command(\"append file.txt\")",
26
+ "expected_output": "Some\\(FileOp::Append\\(\"file\\.txt\"\\)\\)",
27
+ "should_compile": false,
28
+ "test_assertion": "assert_eq!(parse_command(\"append file.txt\"), Some(FileOp::Append(\"file.txt\".to_string())))"
29
+ }
30
+ ],
31
+ "performance_baseline_ms": 10.0
32
+ },
33
+ {
34
+ "id": 2,
35
+ "title": "Conflicting Borrows in Collection Processing",
36
+ "difficulty": "Easy\u2192Medium",
37
+ "description": "Fix a function that processes a vector of strings while conditionally modifying elements and storing references for later use. The implementation mixes mutable and immutable borrows within the same scope, causing borrow checker conflicts. Requirements: Iterate through vector of strings, Store uppercase versions in a results vector, Handle optional transformations without borrowing conflicts, Must compile and execute without panics",
38
+ "starter_code": "fn process_strings(strings: &mut Vec<String>) -> Vec<String> {\n let mut results = Vec::new();\n \n for s in strings {\n // BUG: Cannot borrow as mutable while immutable borrow is active\n let upper = s.to_uppercase();\n s.push_str(\"_processed\"); // Mutable borrow\n results.push(upper);\n }\n \n results\n}\n\nfn main() {\n println!(\"String processing\");\n}",
39
+ "tests": [
40
+ {
41
+ "name": "process_single_string",
42
+ "input_code": "let mut v = vec![\"hello\".to_string()]; process_strings(&mut v);",
43
+ "should_compile": false,
44
+ "test_assertion": "assert_eq!(process_strings(&mut vec![\"hello\".to_string()]), vec![\"HELLO\".to_string()])"
45
+ },
46
+ {
47
+ "name": "process_multiple_strings",
48
+ "input_code": "let mut v = vec![\"a\".to_string(), \"b\".to_string()]; process_strings(&mut v);",
49
+ "should_compile": false,
50
+ "test_assertion": "assert_eq!(process_strings(&mut vec![\"a\".to_string(), \"b\".to_string()]), vec![\"A\".to_string(), \"B\".to_string()])"
51
+ }
52
+ ],
53
+ "performance_baseline_ms": 50.0
54
+ },
55
+ {
56
+ "id": 3,
57
+ "title": "Invalid Lifetime Annotations in Text API",
58
+ "difficulty": "Medium",
59
+ "description": "Fix a text-processing utility that accepts multiple string slices and returns a reference derived from them. The function either fails to compile or produces incorrect lifetime relationships, risking references that outlive their input data. Requirements: Function must accept multiple &str parameters, Return a &str derived from the inputs, Properly annotate lifetimes, Must be safe (no dangling references)",
60
+ "starter_code": "// BUG: Invalid lifetime annotations - which lifetime should the return type use?\nfn longest_text<'a>(s1: &'a str, s2: &'a str) -> &'a str {\n if s1.len() > s2.len() {\n s1\n } else {\n s2\n }\n}\n\n// BUG: This function has a lifetime issue\nfn find_first_word(s: &str) -> &str {\n let bytes = s.as_bytes();\n for (i, &byte) in bytes.iter().enumerate() {\n if byte == b' ' {\n return &s[0..i];\n }\n }\n &s[..]\n}\n\nfn main() {\n println!(\"Lifetime test\");\n}",
61
+ "tests": [
62
+ {
63
+ "name": "longest_text_basic",
64
+ "input_code": "longest_text(\"abc\", \"de\")",
65
+ "expected_output": "\"abc\"",
66
+ "should_compile": true,
67
+ "test_assertion": "assert_eq!(longest_text(\"abc\", \"de\"), \"abc\")"
68
+ },
69
+ {
70
+ "name": "find_first_word_with_space",
71
+ "input_code": "find_first_word(\"hello world\")",
72
+ "expected_output": "\"hello\"",
73
+ "should_compile": true,
74
+ "test_assertion": "assert_eq!(find_first_word(\"hello world\"), \"hello\")"
75
+ },
76
+ {
77
+ "name": "find_first_word_no_space",
78
+ "input_code": "find_first_word(\"hello\")",
79
+ "expected_output": "\"hello\"",
80
+ "should_compile": true,
81
+ "test_assertion": "assert_eq!(find_first_word(\"hello\"), \"hello\")"
82
+ }
83
+ ],
84
+ "performance_baseline_ms": 10.0
85
+ },
86
+ {
87
+ "id": 4,
88
+ "title": "Business Logic Producing Incorrect Results",
89
+ "difficulty": "Medium",
90
+ "description": "Fix a module implementing order validation logic including pricing, discounts, and boundary conditions. The code compiles but produces incorrect outputs for edge cases such as: Zero values, Overlapping discounts, Large numeric inputs, Negative prices. Requirements: Calculate order total correctly, Apply discounts properly (no double-counting), Handle edge cases (zero items, negative values), Be mathematically sound",
91
+ "starter_code": "#[derive(Debug, Clone)]\nstruct Order {\n quantity: i32,\n unit_price: f64,\n discount_percent: f64,\n}\n\nimpl Order {\n fn new(quantity: i32, unit_price: f64) -> Self {\n Order {\n quantity,\n unit_price,\n discount_percent: 0.0,\n }\n }\n\n fn with_discount(mut self, discount: f64) -> Self {\n self.discount_percent = discount;\n self\n }\n\n fn calculate_total(&self) -> f64 {\n let subtotal = self.quantity as f64 * self.unit_price;\n // BUG: Incorrect discount calculation\n let discount = subtotal * (self.discount_percent / 100.0);\n subtotal - discount // Missing rounding/validation\n }\n}\n\nfn main() {\n println!(\"Order test\");\n}",
92
+ "tests": [
93
+ {
94
+ "name": "simple_order",
95
+ "input_code": "Order::new(10, 5.0).calculate_total()",
96
+ "expected_output": "50\\.0",
97
+ "should_compile": true,
98
+ "test_assertion": "assert_eq!(Order::new(10, 5.0).calculate_total(), 50.0)"
99
+ },
100
+ {
101
+ "name": "order_with_discount",
102
+ "input_code": "Order::new(10, 5.0).with_discount(10.0).calculate_total()",
103
+ "expected_output": "45\\.0",
104
+ "should_compile": true,
105
+ "test_assertion": "assert_eq!(Order::new(10, 5.0).with_discount(10.0).calculate_total(), 45.0)"
106
+ },
107
+ {
108
+ "name": "zero_quantity",
109
+ "input_code": "Order::new(0, 5.0).calculate_total()",
110
+ "expected_output": "0\\.0",
111
+ "should_compile": true,
112
+ "test_assertion": "assert_eq!(Order::new(0, 5.0).calculate_total(), 0.0)"
113
+ }
114
+ ],
115
+ "performance_baseline_ms": 10.0
116
+ },
117
+ {
118
+ "id": 5,
119
+ "title": "Corrupted Singly Linked List",
120
+ "difficulty": "Medium\u2192Hard",
121
+ "description": "Fix a custom singly linked list that supports insertion, deletion, and traversal. The implementation incorrectly manages node ownership and pointer transitions, resulting in: Lost nodes, Inconsistent traversal output, Occasional runtime panics. Requirements: Insert elements at head, Delete elements correctly, Traverse without panics, No memory leaks or lost data",
122
+ "starter_code": "use std::ptr;\n\n#[derive(Debug)]\nstruct Node<T> {\n value: T,\n next: Option<Box<Node<T>>>,\n}\n\n#[derive(Debug)]\nstruct LinkedList<T> {\n head: Option<Box<Node<T>>>,\n}\n\nimpl<T> LinkedList<T> {\n fn new() -> Self {\n LinkedList { head: None }\n }\n\n fn insert(&mut self, value: T) {\n let new_node = Box::new(Node {\n value,\n next: None, // BUG: Should move self.head into next\n });\n self.head = Some(new_node);\n }\n\n fn len(&self) -> usize {\n let mut count = 0;\n let mut current = &self.head;\n while let Some(node) = current {\n count += 1;\n current = &node.next; // Correct, but insert is broken\n }\n count\n }\n}\n\nfn main() {\n println!(\"LinkedList test\");\n}",
123
+ "tests": [
124
+ {
125
+ "name": "insert_single_element",
126
+ "input_code": "let mut ll = LinkedList::new(); ll.insert(5); ll.len()",
127
+ "expected_output": "1",
128
+ "should_compile": true,
129
+ "test_assertion": "let mut ll = LinkedList::new(); ll.insert(5); assert_eq!(ll.len(), 1)"
130
+ },
131
+ {
132
+ "name": "insert_multiple_elements",
133
+ "input_code": "let mut ll = LinkedList::new(); ll.insert(1); ll.insert(2); ll.insert(3); ll.len()",
134
+ "expected_output": "3",
135
+ "should_compile": true,
136
+ "test_assertion": "let mut ll = LinkedList::new(); ll.insert(1); ll.insert(2); ll.insert(3); assert_eq!(ll.len(), 3)"
137
+ }
138
+ ],
139
+ "performance_baseline_ms": 20.0
140
+ },
141
+ {
142
+ "id": 6,
143
+ "title": "Deadlock in Multi-threaded Worker System",
144
+ "difficulty": "Hard",
145
+ "description": "Fix a worker system using multiple threads to process jobs from a shared queue protected by synchronization primitives. Under certain workloads, threads block indefinitely due to: Improper lock acquisition order, Shared state handling issues, Missing signal/wake mechanisms. Requirements: Spawn N worker threads, Process jobs from shared queue without deadlock, Handle shutdown gracefully, No panics under load",
146
+ "starter_code": "use std::sync::{Arc, Mutex, mpsc};\nuse std::thread;\n\nfn worker_system(num_workers: usize, jobs: Vec<i32>) -> Vec<i32> {\n let (tx, rx) = mpsc::channel();\n let rx = Arc::new(Mutex::new(rx));\n let results = Arc::new(Mutex::new(Vec::new()));\n \n let mut handles = vec![];\n \n for _ in 0..num_workers {\n let rx = Arc::clone(&rx);\n let results = Arc::clone(&results);\n \n let handle = thread::spawn(move || {\n loop {\n // BUG: Lock acquired but never released before trying to acquire results lock\n let receiver = rx.lock().unwrap();\n match receiver.try_recv() {\n Ok(job) => {\n let result = job * 2;\n // BUG: Tries to lock results while still holding rx lock - DEADLOCK\n results.lock().unwrap().push(result);\n }\n Err(_) => break,\n }\n }\n });\n handles.push(handle);\n }\n \n for job in jobs {\n let _ = tx.send(job); // Ignore send errors\n }\n drop(tx);\n \n for handle in handles {\n let _ = handle.join();\n }\n \n Arc::try_unwrap(results)\n .unwrap()\n .into_inner()\n .unwrap()\n}\n\nfn main() {\n println!(\"Worker system test\");\n}",
147
+ "tests": [
148
+ {
149
+ "name": "single_worker_single_job",
150
+ "input_code": "worker_system(1, vec![5])",
151
+ "expected_output": "vec!\\[10\\]",
152
+ "should_compile": true,
153
+ "test_assertion": "assert_eq!(worker_system(1, vec![5]), vec![10])"
154
+ },
155
+ {
156
+ "name": "multiple_workers",
157
+ "input_code": "worker_system(2, vec![1, 2, 3])",
158
+ "expected_output": "vec!\\[(2|4|6)\\].*vec!\\[(2|4|6)\\].*vec!\\[(2|4|6)\\]",
159
+ "should_compile": true,
160
+ "test_assertion": "let mut result = worker_system(2, vec![1, 2, 3]); result.sort(); assert_eq!(result, vec![2, 4, 6])"
161
+ }
162
+ ],
163
+ "performance_baseline_ms": 500.0
164
+ },
165
+ {
166
+ "id": 7,
167
+ "title": "Async Function with Borrowing Conflicts",
168
+ "difficulty": "Hard",
169
+ "description": "Fix an asynchronous function that processes input data and performs non-blocking operations while returning references tied to the input. The implementation violates borrowing constraints in an async context, leading to: Compilation errors when using references across await points, Invalid reference usage. Requirements: Accept &str input, Perform async operation, Return derived reference, Must be sound and compile",
170
+ "starter_code": "use std::pin::Pin;\nuse std::future::Future;\n\n// BUG: Cannot return reference that outlives await point\nasync fn process_async(input: &str) -> &str {\n // Simulating async work\n // tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;\n \n // BUG: input reference cannot be returned from async context like this\n input\n}\n\n// Better approach: return owned data or 'static reference\nfn process_sync(input: &str) -> String {\n input.to_uppercase()\n}\n\nfn main() {\n println!(\"Async test\");\n}",
171
+ "tests": [
172
+ {
173
+ "name": "process_sync_basic",
174
+ "input_code": "process_sync(\"hello\")",
175
+ "expected_output": "\"HELLO\"",
176
+ "should_compile": true,
177
+ "test_assertion": "assert_eq!(process_sync(\"hello\"), \"HELLO\")"
178
+ },
179
+ {
180
+ "name": "process_sync_uppercase",
181
+ "input_code": "process_sync(\"Hello World\")",
182
+ "expected_output": "\"HELLO WORLD\"",
183
+ "should_compile": true,
184
+ "test_assertion": "assert_eq!(process_sync(\"Hello World\"), \"HELLO WORLD\")"
185
+ }
186
+ ],
187
+ "performance_baseline_ms": 50.0
188
+ },
189
+ {
190
+ "id": 8,
191
+ "title": "Unsafe FFI Integration Causing Crashes",
192
+ "difficulty": "Hard",
193
+ "description": "Fix Rust code that interfaces with an external C library using raw pointers. The implementation incorrectly handles: Pointer ownership, Memory allocation and deallocation, Undefined behavior risks. Requirements: Safely wrap C library calls, Properly manage memory (allocate/deallocate), No undefined behavior, Handle errors gracefully",
194
+ "starter_code": "extern \"C\" {\n fn malloc(size: usize) -> *mut u8;\n fn free(ptr: *mut u8);\n}\n\nfn allocate_and_init(size: usize) -> Vec<u8> {\n unsafe {\n let ptr = malloc(size);\n // BUG: No null check - ptr could be null\n // BUG: Memory not initialized before use\n let slice = std::slice::from_raw_parts_mut(ptr, size);\n \n // Copy to vec and free\n let vec = slice.to_vec();\n free(ptr); // BUG: Freeing memory still referenced in vec\n vec\n }\n}\n\nfn main() {\n println!(\"FFI test\");\n}",
195
+ "tests": [
196
+ {
197
+ "name": "allocate_small_buffer",
198
+ "input_code": "allocate_and_init(10).len()",
199
+ "expected_output": "10",
200
+ "should_compile": false,
201
+ "test_assertion": "assert_eq!(allocate_and_init(10).len(), 10)"
202
+ }
203
+ ],
204
+ "performance_baseline_ms": 100.0
205
+ },
206
+ {
207
+ "id": 9,
208
+ "title": "Inefficient Data Processing Pipeline",
209
+ "difficulty": "Hard",
210
+ "description": "Fix a data pipeline that reads large datasets, applies transformations, and aggregates results. While functionally correct, the implementation has: Excessive memory allocations, Redundant iterations, Inefficient data copying. Requirements: Process data efficiently, Minimize allocations and copies, Use iterators when possible, Produce correct results with better performance",
211
+ "starter_code": "fn process_data(numbers: Vec<i32>) -> i32 {\n // BUG: Multiple unnecessary allocations and iterations\n \n // First pass: filter evens (allocates new vector)\n let evens: Vec<i32> = numbers.iter()\n .filter(|n| n % 2 == 0)\n .copied()\n .collect();\n \n // Second pass: double values (allocates another vector)\n let doubled: Vec<i32> = evens.iter()\n .map(|n| n * 2)\n .collect();\n \n // Third pass: sum (unnecessary iteration)\n let sum: i32 = doubled.iter().sum();\n \n // Fourth pass: filter again (redundant)\n let final_sum: i32 = doubled.iter()\n .filter(|n| n % 4 == 0)\n .sum();\n \n final_sum\n}\n\nfn main() {\n println!(\"Efficiency test\");\n}",
212
+ "tests": [
213
+ {
214
+ "name": "simple_pipeline",
215
+ "input_code": "process_data(vec![1, 2, 3, 4, 5, 6])",
216
+ "expected_output": "16",
217
+ "should_compile": true,
218
+ "test_assertion": "assert_eq!(process_data(vec![1, 2, 3, 4, 5, 6]), 16)"
219
+ },
220
+ {
221
+ "name": "all_odd_numbers",
222
+ "input_code": "process_data(vec![1, 3, 5, 7])",
223
+ "expected_output": "0",
224
+ "should_compile": true,
225
+ "test_assertion": "assert_eq!(process_data(vec![1, 3, 5, 7]), 0)"
226
+ }
227
+ ],
228
+ "performance_baseline_ms": 50.0
229
+ },
230
+ {
231
+ "id": 10,
232
+ "title": "Reference-counted Cache with Memory Leak",
233
+ "difficulty": "Hard+",
234
+ "description": "Fix a caching system using reference-counted pointers to share data across components. The design creates cyclic references between cached objects, preventing memory from being released and causing memory usage to grow over time. Requirements: Implement caching without memory leaks, Break circular reference patterns, Use Rc/Arc correctly with Weak pointers when needed, Memory should be released when cache is cleared",
235
+ "starter_code": "use std::rc::Rc;\nuse std::cell::RefCell;\n\n#[derive(Debug)]\nstruct CacheNode<T> {\n key: String,\n value: T,\n // BUG: This creates a cycle that prevents garbage collection\n related: RefCell<Option<Rc<CacheNode<T>>>>,\n}\n\n#[derive(Debug)]\nstruct Cache<T> {\n items: RefCell<Vec<Rc<CacheNode<T>>>>,\n}\n\nimpl<T: Clone> Cache<T> {\n fn new() -> Self {\n Cache {\n items: RefCell::new(Vec::new()),\n }\n }\n\n fn insert(&self, key: String, value: T) {\n let node = Rc::new(CacheNode {\n key,\n value,\n related: RefCell::new(None),\n });\n \n // BUG: Creating cyclic references\n if let Some(last) = self.items.borrow().last() {\n // Rc to Rc creates a cycle\n if let Ok(mut r) = last.related.try_borrow_mut() {\n *r = Some(Rc::clone(&node)); // Cycle here!\n }\n }\n \n self.items.borrow_mut().push(node);\n }\n}\n\nfn main() {\n println!(\"Cache test\");\n}",
236
+ "tests": [
237
+ {
238
+ "name": "cache_insert_single",
239
+ "input_code": "let c = Cache::new(); c.insert(\"key\".to_string(), \"value\".to_string()); c.items.borrow().len()",
240
+ "expected_output": "1",
241
+ "should_compile": true,
242
+ "test_assertion": "let c = Cache::new(); c.insert(\"key\".to_string(), \"value\".to_string()); assert_eq!(c.items.borrow().len(), 1)"
243
+ },
244
+ {
245
+ "name": "cache_insert_multiple",
246
+ "input_code": "let c = Cache::new(); c.insert(\"k1\".to_string(), \"v1\".to_string()); c.insert(\"k2\".to_string(), \"v2\".to_string()); c.items.borrow().len()",
247
+ "expected_output": "2",
248
+ "should_compile": true,
249
+ "test_assertion": "let c = Cache::new(); c.insert(\"k1\".to_string(), \"v1\".to_string()); c.insert(\"k2\".to_string(), \"v2\".to_string()); assert_eq!(c.items.borrow().len(), 2)"
250
+ }
251
+ ],
252
+ "performance_baseline_ms": 100.0
253
+ }
254
+ ]
pyproject.toml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ [build-system]
8
+ requires = ["setuptools>=45", "wheel"]
9
+ build-backend = "setuptools.build_meta"
10
+
11
+ [project]
12
+ name = "openenv-rust_coder"
13
+ version = "0.1.0"
14
+ description = "Rust Coder environment for OpenEnv"
15
+ requires-python = ">=3.10"
16
+ dependencies = [
17
+ # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
+ # install from github
19
+ # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
+ "openenv-core[core]>=0.2.2",
21
+ "openai>=1.0.0",
22
+ "pydantic>=2.0.0",
23
+ "gradio>=4.0.0",
24
+ # Environment-specific dependencies
25
+ # Add all dependencies needed for your environment here
26
+ # Examples:
27
+ # "numpy>=1.19.0",
28
+ # "torch>=2.0.0",
29
+ # "gymnasium>=0.29.0",
30
+ # "openspiel>=1.0.0",
31
+ # "smolagents>=1.22.0,<2",
32
+ ]
33
+
34
+ [project.optional-dependencies]
35
+ dev = [
36
+ "pytest>=8.0.0",
37
+ "pytest-cov>=4.0.0",
38
+ ]
39
+
40
+ [project.scripts]
41
+ # Server entry point - enables running via: uv run --project . server
42
+ # or: python -m rust_coder.server.app
43
+ server = "rust_coder.server.app:main"
44
+
45
+ [tool.setuptools]
46
+ include-package-data = true
47
+ packages = ["rust_coder", "rust_coder.server"]
48
+ package-dir = { "rust_coder" = ".", "rust_coder.server" = "server" }
scripts/validate-submission.sh ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
#
# validate-submission.sh — OpenEnv Submission Validator (Hardened)
#
# Checks that your HF Space is live, Docker image builds, and OpenEnv spec compliance.
# Mandatory Log Format Check included.

set -euo pipefail

PING_URL=${1:-}
REPO_DIR=${2:-"."}

if [ -z "$PING_URL" ]; then
    echo "Usage: $0 <ping_url> [repo_dir]"
    echo "Example: bash scripts/validate-submission.sh https://huggingface.co/spaces/user/rust-coder"
    exit 1
fi

echo "--- 🔍 1. Testing Connection to HF Space ---"
# Probe /health first, then fall back to the root path.
# FIX: the previous `curl ... || curl ...` form only fell back on curl
# *transport* failures — a 404 from /health still exits 0, so the root
# URL was never tried. We now fall back whenever the status is not 200.
RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" "$PING_URL/health" || echo "000")
if [ "$RESPONSE" != "200" ]; then
    RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" "$PING_URL/" || echo "000")
fi

if [ "$RESPONSE" != "200" ]; then
    echo "❌ FAILED: Space at $PING_URL returned $RESPONSE (expected 200)"
    echo "   Ensure your Space is 'Running' and public."
    exit 1
fi
echo "✅ PASSED: Connection OK"

echo "--- 🔍 2. Validating OpenEnv Spec ---"
cd "$REPO_DIR"
if command -v openenv &>/dev/null; then
    if ! openenv validate; then
        echo "❌ FAILED: openenv validate failed. Check your openenv.yaml syntax."
        exit 1
    fi
    echo "✅ PASSED: openenv.yaml is valid"
else
    echo "⚠️ WARNING: 'openenv' command not found. Skipping local spec validation."
    echo "   (Ensure you've run 'pip install openenv-core' if you want this check)"
fi

echo "--- 🔍 3. Checking Mandatory Logging Format ---"
# The judge requires [START], [STEP], and [END] in stdout
if grep -q "\[START\]" "inference.py" && grep -q "\[STEP\]" "inference.py" && grep -q "\[END\]" "inference.py"; then
    echo "✅ PASSED: inference.py contains mandatory [START/STEP/END] logs."
else
    echo "❌ FAILED: inference.py is missing mandatory structured logging."
    echo "   See documentation for [START], [STEP], and [END] format."
    exit 1
fi

echo "--- 🔍 4. Verifying File Structure ---"
FILES=("inference.py" "problems.json" "server/Dockerfile" "openenv.yaml")
for f in "${FILES[@]}"; do
    if [ ! -f "$f" ]; then
        echo "❌ FAILED: Missing required file: $f"
        exit 1
    fi
done
echo "✅ PASSED: All core files exist."

echo "--- 🔍 5. Checking Task Count ---"
# grep -c prints the count even on a zero match; `|| true` guards set -e.
TASK_COUNT=$(grep -c "id:" "openenv.yaml" || true)
if [ "$TASK_COUNT" -lt 3 ]; then
    echo "❌ FAILED: Found only $TASK_COUNT tasks in openenv.yaml (minimum 3 required)."
    exit 1
fi
echo "✅ PASSED: Found $TASK_COUNT tasks."

echo ""
echo "🎉 SUCCESS: Your submission has passed all local checks!"
echo "You are ready to submit your Space URL: $PING_URL"
server/Dockerfile ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Multi-stage build for Rust Coder Environment
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

# 1. Environment Setup
USER root
WORKDIR /app

# Install build essentials for Rust (linker, etc.)
RUN apt-get update && \
    apt-get install -y --no-install-recommends git curl build-essential ca-certificates && \
    rm -rf /var/lib/apt/lists/*

# 2. Create the non-root user (Hugging Face default)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.cargo/bin:/home/user/.local/bin:$PATH

# 3. Install Rust toolchain as 'user'
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
RUN rustup toolchain install stable

# 4. Copy environment code and install Python dependencies
WORKDIR $HOME/app/env
COPY --chown=user . $HOME/app/env

# Install uv (if not present) and then the virtual environment
RUN if ! command -v uv >/dev/null 2>&1; then \
        curl -LsSf https://astral.sh/uv/install.sh | sh; \
    fi

RUN --mount=type=cache,target=/home/user/.cache/uv,uid=1000,gid=1000 \
    uv sync --no-editable

# -------------------------------------------------------------
# Final Runtime Stage
# -------------------------------------------------------------
FROM ${BASE_IMAGE}

USER root
# build-essential stays: the environment compiles Rust at runtime and
# rustc needs a system linker (cc).
RUN apt-get update && \
    apt-get install -y --no-install-recommends curl build-essential ca-certificates && \
    rm -rf /var/lib/apt/lists/*

# Create the user again in the final stage
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH="/home/user/app/env/.venv/bin:/home/user/.cargo/bin:$PATH" \
    PYTHONPATH="/home/user/app/env:$PYTHONPATH"

# Copy Cargo/Rustup from builder
COPY --from=builder --chown=user /home/user/.cargo /home/user/.cargo
COPY --from=builder --chown=user /home/user/.rustup /home/user/.rustup

# FIX: copy the code AND the already-synced .venv from the builder stage
# instead of re-copying the source and re-running `uv sync` here — the
# previous version made the builder stage pointless and doubled the
# dependency-install work. The venv path is identical in both stages, so
# the environment remains valid after the copy.
WORKDIR $HOME/app/env
COPY --from=builder --chown=user /home/user/app/env $HOME/app/env

# -------------------------------------------------------------
# Final Config
# -------------------------------------------------------------
EXPOSE 8000
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Start the server directly from the copied virtual environment
CMD ["/home/user/app/env/.venv/bin/uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000", "--log-level", "debug"]
server/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Rust Coder environment server components."""
8
+
9
+ from .rust_coder_environment import RustCoderEnvironment
10
+
11
+ __all__ = ["RustCoderEnvironment"]
server/app.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI application for the Rust Coder Environment.
3
+
4
+ Endpoints:
5
+ POST /reset — Start new episode (loads next problem)
6
+ POST /step — Submit Rust code for evaluation
7
+ GET /state — Get current episode state
8
+ GET /schema — Action/observation JSON schemas
9
+ WS /ws — WebSocket for persistent sessions
10
+ """
11
+
12
+ import os
13
+ import gradio as gr
14
+ from openai import OpenAI
15
+ from dotenv import load_dotenv
16
+ from openenv.core.env_server.http_server import create_app
17
+
18
+ from models import RustCoderAction, RustCoderObservation
19
+ from server.rust_coder_environment import RustCoderEnvironment
20
+
21
+ load_dotenv()
22
+
23
+ # --- Core OpenEnv Server Setup ---
24
+ # Use a distinct name for the OpenEnv FastAPI instance
25
+ openenv_app = create_app(
26
+ RustCoderEnvironment,
27
+ RustCoderAction,
28
+ RustCoderObservation,
29
+ env_name="rust_coder",
30
+ max_concurrent_envs=1,
31
+ )
32
+
33
# Add a health check endpoint for Docker directly to the base app
# (the Dockerfile HEALTHCHECK and validate-submission.sh both probe /health).
@openenv_app.get("/health")
async def health_check():
    """Liveness probe: always returns a static healthy payload."""
    return {"status": "healthy"}
37
+
38
+ # --- Shared Logic ---
39
+ API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
40
+ MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
41
+ HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
42
+
43
def get_llm_solution(problem_desc: str):
    """Ask the configured LLM for a Rust solution to *problem_desc*.

    Returns the code with any surrounding markdown fence removed, or a
    ``// LLM Error: ...`` comment string when the API call fails.
    """
    try:
        client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are an expert Rust developer. Respond ONLY with the code solution, no explanation."},
                {"role": "user", "content": f"Fix the following Rust problem:\n{problem_desc}"},
            ],
            temperature=0.2,
        )
        answer = (response.choices[0].message.content or "").strip()
        # Strip a markdown code fence, preferring an explicit ```rust tag
        # over a bare ``` fence.
        for fence in ("```rust", "```"):
            if fence in answer:
                answer = answer.split(fence)[1].split("```")[0]
                break
        return answer.strip()
    except Exception as e:
        return f"// LLM Error: {e}"
64
+
65
def evaluate_single(problem_id, code=None):
    """Run evaluation for a specific problem. If code is None, it asks the LLM.

    Args:
        problem_id: Dropdown label of the form ``"<id>: <title> (...)"``;
            the numeric prefix is 1-based and maps onto the problem list.
        code: Optional Rust source to evaluate directly instead of
            querying the LLM.

    Returns:
        Tuple of (solution_code, metrics_dict); metrics_dict carries an
        ``"error"`` key when generation or evaluation fails.
    """
    try:
        # FIX: build ONE environment and reuse it — the previous version
        # instantiated RustCoderEnvironment twice, reloading problems.json
        # for the lookup and again for the evaluation.
        env = RustCoderEnvironment()

        idx = int(problem_id.split(":")[0]) - 1
        # Guard against a dropdown id that does not map onto the list
        # (e.g. non-contiguous ids in problems.json).
        if not 0 <= idx < len(env.problems):
            return "// Error: unknown problem id", {"error": f"Problem index {idx} is out of range."}
        problem = env.problems[idx]

        # 1. Get code from LLM if not provided
        solution_code = code if code else get_llm_solution(problem["description"])

        # 2. Guard: If LLM failed, do not evaluate
        if solution_code.startswith("// LLM Error"):
            return solution_code, {"error": "LLM failed to generate a solution. Check your HF_TOKEN."}

        # 3. Evaluate: reset to the requested index, then score the code.
        env.reset(start_index=idx)
        state = env.step(RustCoderAction(code=solution_code))

        metrics = {
            "Total Reward": f"{state.reward:.2f}",
            "Compilation": "Success" if state.compilation_success else "Failed",
            "Metrics": state.reward_breakdown
        }
        return solution_code, metrics
    except Exception as e:
        return f"// Error: {e}", {"error": f"Evaluation system error: {e}"}
92
+
93
def run_benchmark(progress=gr.Progress()):
    """Run every problem through the LLM and report the aggregate score.

    Returns a markdown summary plus one result row per task
    ([id, title, difficulty, reward, compiled]).
    """
    try:
        env = RustCoderEnvironment()
        rows = []
        total_score = 0.0

        # Fail fast when no API credentials are configured.
        test_token = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
        if not test_token:
            return "## Error: HF_TOKEN is not set. Add it to your HF Space secrets or local .env file.", []

        task_total = len(env.problems)
        for i, problem in enumerate(env.problems):
            progress(i/task_total, desc=f"Benchmarking Task {i+1}...")
            code = get_llm_solution(problem["description"])

            # Default outcome when the LLM itself errored out.
            reward = 0.0
            compiled = "Failed (LLM Error)"
            if not code.startswith("// LLM Error"):
                env.reset(start_index=i)
                state = env.step(RustCoderAction(code=code))
                reward = state.reward
                compiled = "Success" if state.compilation_success else "Failed"

            total_score += reward
            rows.append([problem["id"], problem["title"], problem.get("difficulty", "N/A"), f"{reward:.2f}", compiled])

        avg_score = total_score / task_total
        summary_md = f"## Benchmark Summary\n**Final Environment Score: {avg_score:.2f} / 1.0**"
        return summary_md, rows
    except Exception as e:
        return f"### Benchmark Error: {e}", []
127
+
128
+ # --- Build the Gradio UI ---
129
+ def create_dashboard():
130
+ with gr.Blocks(title="Rust Coder Evaluation Dashboard") as demo:
131
+ gr.Markdown("# 🦀 Rust Coder: LLM Evaluation Dashboard")
132
+
133
+ with gr.Tab("Individual Task Evaluation"):
134
+ with gr.Row():
135
+ with gr.Column(scale=1):
136
+ p_env = RustCoderEnvironment()
137
+ p_list = [f"{p['id']}: {p['title']} ({p.get('difficulty', 'N/A')})" for p in p_env.problems]
138
+ dropdown = gr.Dropdown(choices=p_list, label="Select Question", value=p_list[0])
139
+ desc = gr.Markdown(value=f"### Question [{p_env.problems[0].get('difficulty', 'N/A')}]\n{p_env.problems[0]['description']}")
140
+
141
+ with gr.Column(scale=1):
142
+ run_llm_btn = gr.Button("Generate Solution & Evaluate", variant="primary")
143
+ code_display = gr.Code(label="AI Generated Solution", interactive=False)
144
+ results_json = gr.JSON(label="Metric Breakdown")
145
+
146
+ def update_desc(p_str):
147
+ idx = int(p_str.split(":")[0]) - 1
148
+ p = p_env.problems[idx]
149
+ return f"### Question [{p.get('difficulty', 'N/A')}]\n{p['description']}", "" # Clear solution on change
150
+
151
+ dropdown.change(update_desc, inputs=[dropdown], outputs=[desc, code_display])
152
+ run_llm_btn.click(evaluate_single, inputs=[dropdown], outputs=[code_display, results_json])
153
+
154
+ with gr.Tab("Full Environment Benchmark"):
155
+ gr.Markdown("### Complete Environment Suite")
156
+ gr.Markdown("Runs the LLM against all 10 tasks sequentially to determine the global OpenEnv score.")
157
+
158
+ b_summarize = gr.Button("Run Performance Benchmark", variant="stop")
159
+ b_sum = gr.Markdown()
160
+ b_grid = gr.Dataframe(headers=["ID", "Title", "Difficulty", "Reward", "Compiled"], label="Task Results")
161
+
162
+ b_summarize.click(run_benchmark, outputs=[b_sum, b_grid])
163
+
164
+ return demo
165
+
166
+ # Final consolidated Gradio App mounted on the FastAPI server
167
+ app = gr.mount_gradio_app(openenv_app, create_dashboard(), path="/")
168
+
169
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
    """Entry point: uv run server or python -m server.app

    Serves the combined FastAPI + Gradio app built at module level.
    """
    # Imported lazily so importing this module does not require uvicorn.
    import uvicorn
    uvicorn.run(app, host=host, port=port)
173
+
174
+
175
+ if __name__ == "__main__":
176
+ main()
server/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ openenv-core[core]>=0.2.2
2
+ fastapi>=0.115.0
3
+ uvicorn>=0.24.0
4
+
5
+
6
+
server/rust_coder_environment.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Rust Coder Environment Implementation.
3
+
4
+ Evaluates LLM-generated Rust code against 10 sequential coding problems.
5
+ Multi-dimensional reward system: Compilation(40%), Correctness(20%),
6
+ Coverage(20%), Elegance(10%), Efficiency(10%).
7
+ """
8
+
9
+ import json
10
+ import os
11
+ import re
12
+ import subprocess
13
+ import tempfile
14
+ import time
15
+ from typing import Dict, List, Optional, Tuple
16
+
17
+ from openenv.core.env_server.interfaces import Environment
18
+
19
+ from models import RustCoderAction, RustCoderObservation
20
+
21
+
22
+ # Resolve problems.json: look in same dir as this file, then parent
23
+ _HERE = os.path.dirname(os.path.abspath(__file__))
24
+ _PROBLEMS_PATHS = [
25
+ os.path.join(_HERE, "problems.json"), # server/problems.json
26
+ os.path.join(_HERE, "..", "problems.json"), # root problems.json
27
+ "problems.json", # cwd fallback
28
+ ]
29
+
30
+
31
def _find_problems_file() -> str:
    """Return the first existing problems.json path.

    Raises:
        FileNotFoundError: if none of the candidate locations exist.
    """
    found = next(
        (candidate for candidate in _PROBLEMS_PATHS if os.path.exists(candidate)),
        None,
    )
    if found is None:
        raise FileNotFoundError(
            f"problems.json not found. Searched: {_PROBLEMS_PATHS}"
        )
    return os.path.abspath(found)
39
+
40
+
41
+ class RustCoderEnvironment(Environment):
42
+ """
43
+ OpenEnv-compliant environment for evaluating Rust code submissions.
44
+
45
+ Manages 10 sequential coding problems. Each episode is a single problem:
46
+ - reset() → loads the current problem, returns its description
47
+ - step(action) → compiles & tests submitted code, returns reward
48
+ - After step(), the episode is done; next reset() loads the next problem.
49
+
50
+ Reward breakdown (all components normalized to [0, 1]):
51
+ Compilation 40% — code compiles without errors
52
+ Correctness 20% — fraction of test assertions that pass
53
+ Coverage 20% — fraction of tests attempted to run
54
+ Elegance 10% — code quality heuristics
55
+ Efficiency 10% — execution time vs. problem baseline
56
+ """
57
+
58
+ SUPPORTS_CONCURRENT_SESSIONS: bool = True
59
+
60
+ # Compile / run timeouts (seconds)
61
+ COMPILE_TIMEOUT = 30
62
+ RUN_TIMEOUT = 10
63
+
64
    def __init__(self) -> None:
        """Initialize environment and load problems from JSON."""
        # Ordered problem definitions read from problems.json.
        self.problems: List[Dict] = self._load_problems()
        # Index of the problem served by the current episode.
        self.current_problem_idx: int = 0
        # Number of step() calls since the last reset().
        self.step_count: int = 0
69
+
70
+ # ------------------------------------------------------------------
71
+ # Internal helpers
72
+ # ------------------------------------------------------------------
73
+
74
+ def _load_problems(self) -> List[Dict]:
75
+ """Load and validate the problems list from problems.json."""
76
+ path = _find_problems_file()
77
+ with open(path, "r", encoding="utf-8") as f:
78
+ data = json.load(f)
79
+ if not isinstance(data, list) or len(data) == 0:
80
+ raise ValueError("problems.json must be a non-empty JSON array.")
81
+ return data
82
+
83
+ def _current_problem(self) -> Dict:
84
+ idx = self.current_problem_idx % len(self.problems)
85
+ return self.problems[idx]
86
+
87
+ # ------------------------------------------------------------------
88
+ # OpenEnv interface
89
+ # ------------------------------------------------------------------
90
+
91
    @property
    def state(self):
        """Return minimal state info (step count, problem index)."""
        # NOTE(review): local import — openenv is already imported at
        # module level, so this could likely move to the top of the file.
        from openenv.core.env_server.types import State
        return State(episode_id=None, step_count=self.step_count)
96
+
97
+ def reset(self, start_index: int = 0) -> RustCoderObservation:
98
+ """Start a new episode, defaulting to the first problem."""
99
+ self.current_problem_idx = start_index % len(self.problems)
100
+ self.step_count = 0
101
+ problem = self.problems[self.current_problem_idx]
102
+
103
+ return RustCoderObservation(
104
+ problem_description=problem["description"],
105
+ starter_code=problem["starter_code"],
106
+ compilation_success=False,
107
+ compilation_output="",
108
+ test_results=[],
109
+ reward_breakdown={},
110
+ done=False,
111
+ reward=0.0,
112
+ )
113
+
114
    def step(self, action: RustCoderAction) -> RustCoderObservation:
        """Evaluate the submitted code and advance the task index within the single episode.

        Scoring (weights from the class docstring): compilation 40%,
        correctness 20%, coverage 20%, elegance 10%, efficiency 10%.
        The returned observation carries the reward for THIS task plus the
        description/starter code of the NEXT task (or a completion banner
        once every problem has been attempted, with ``done=True``).
        """
        self.step_count += 1
        problem = self.problems[self.current_problem_idx]
        code = action.code

        # ── 1. Compilation (40%) ──────────────────────────────────────
        compilation_success, compilation_output = self._compile_check(code)
        r_compilation = 1.0 if compilation_success else 0.0

        # ── 2. Correctness + Coverage (20% each) ─────────────────────
        # Both default to 0.0 when compilation failed (tests never run).
        test_results: List[Dict] = []
        r_correctness = 0.0
        r_coverage = 0.0

        if compilation_success:
            tests = problem.get("tests", [])
            if tests:
                test_results = self._run_tests(code, tests)
                passed = sum(1 for t in test_results if t.get("passed", False))
                ran = sum(1 for t in test_results if t.get("ran", False))
                r_correctness = passed / len(tests)
                r_coverage = ran / len(tests)
            else:
                # No tests defined — give full credit to both dimensions
                r_correctness = 1.0
                r_coverage = 1.0

        # ── 3. Elegance (10%) ─────────────────────────────────────────
        # NOTE(review): elegance is scored even when compilation failed —
        # presumably intentional (partial credit for style); confirm.
        r_elegance = self._score_elegance(code)

        # ── 4. Efficiency (10%) ───────────────────────────────────────
        baseline_ms: float = problem.get("performance_baseline_ms", 100.0)
        r_efficiency = 0.0
        if compilation_success:
            r_efficiency = self._score_efficiency(code, baseline_ms)

        # ── Total reward ──────────────────────────────────────────────
        reward_breakdown = {
            "Compilation": round(r_compilation, 4),
            "Correctness": round(r_correctness, 4),
            "Coverage": round(r_coverage, 4),
            "Elegance": round(r_elegance, 4),
            "Efficiency": round(r_efficiency, 4),
        }
        # Calculate weighted total reward
        total_reward = round(
            r_compilation * 0.40
            + r_correctness * 0.20
            + r_coverage * 0.20
            + r_elegance * 0.10
            + r_efficiency * 0.10,
            4,
        )

        # ── Advance Logic ─────────────────────────────────────────────
        # Move to the next problem; done once the index runs past the list.
        self.current_problem_idx += 1
        done = self.current_problem_idx >= len(self.problems)

        next_prob_desc = "--- ALL TASKS COMPLETED in this episode ---"
        next_starter = ""
        if not done:
            # NOTE(review): assumes every problem dict has 'title',
            # 'description' and 'starter_code' keys — confirm against the
            # problems.json schema.
            next_prob = self.problems[self.current_problem_idx]
            next_prob_desc = f"--- NEXT TASK: {next_prob['title']} ---\n\n{next_prob['description']}"
            next_starter = next_prob['starter_code']

        return RustCoderObservation(
            problem_description=next_prob_desc,
            starter_code=next_starter,
            compilation_success=compilation_success,
            compilation_output=compilation_output[:2000],  # cap length
            test_results=test_results,
            reward_breakdown=reward_breakdown,
            done=done,
            reward=total_reward,
        )
190
+
191
+ # ------------------------------------------------------------------
192
+ # Compilation
193
+ # ------------------------------------------------------------------
194
+
195
+ def _compile_check(self, code: str) -> Tuple[bool, str]:
196
+ """
197
+ Compile code as a Rust library crate.
198
+ Returns (success, compiler output).
199
+ """
200
+ with tempfile.TemporaryDirectory() as tmpdir:
201
+ src = os.path.join(tmpdir, "submission.rs")
202
+ out = os.path.join(tmpdir, "submission.rlib")
203
+ with open(src, "w", encoding="utf-8") as f:
204
+ f.write(code)
205
+ try:
206
+ proc = subprocess.run(
207
+ ["rustc", "--crate-type=lib", src, "-o", out,
208
+ "--edition=2021"],
209
+ capture_output=True,
210
+ text=True,
211
+ timeout=self.COMPILE_TIMEOUT,
212
+ )
213
+ return proc.returncode == 0, (proc.stdout + proc.stderr).strip()
214
+ except subprocess.TimeoutExpired:
215
+ return False, "Compilation timed out."
216
+ except FileNotFoundError:
217
+ return False, "rustc not found — is the Rust toolchain installed?"
218
+
219
+ # ------------------------------------------------------------------
220
+ # Correctness / Coverage
221
+ # ------------------------------------------------------------------
222
+
223
+ def _strip_main(self, code: str) -> str:
224
+ """
225
+ Remove fn main() { ... } blocks from submitted code so we can
226
+ inject our own test main. Handles simple single-level braces.
227
+ """
228
+ # Remove pub/private fn main() { ... }
229
+ pattern = re.compile(
230
+ r'(pub\s+)?fn\s+main\s*\(\s*\)\s*(?:->\s*[^{]+)?\s*\{',
231
+ re.MULTILINE,
232
+ )
233
+ match = pattern.search(code)
234
+ if not match:
235
+ return code
236
+ start = match.start()
237
+ depth = 0
238
+ i = match.end() - 1 # position of the opening '{'
239
+ while i < len(code):
240
+ if code[i] == '{':
241
+ depth += 1
242
+ elif code[i] == '}':
243
+ depth -= 1
244
+ if depth == 0:
245
+ return code[:start] + code[i + 1:]
246
+ i += 1
247
+ return code # malformed; return as-is
248
+
249
    def _build_test_binary(
        self, code: str, assertion: str, tmpdir: str, test_name: str
    ) -> Tuple[bool, str, str]:
        """
        Build a runnable Rust binary that executes one test assertion.

        The submission's own ``fn main`` (if any) is stripped, then a
        generated main runs *assertion* and prints a ``PASS:<test_name>``
        sentinel on success (checked by the caller).

        Returns (compiled_ok, binary_path, compiler_output).
        """
        body = self._strip_main(code)
        # Leading whitespace in the template is irrelevant to rustc; the
        # #[allow] attribute silences lint noise from unused submission items.
        src_code = f"""
#[allow(unused_imports, dead_code, unused_variables, unused_mut)]
{body}

fn main() {{
    {assertion};
    println!("PASS:{test_name}");
}}
"""
        src_path = os.path.join(tmpdir, f"{test_name}.rs")
        bin_path = os.path.join(tmpdir, test_name)
        with open(src_path, "w", encoding="utf-8") as f:
            f.write(src_code)
        try:
            proc = subprocess.run(
                ["rustc", src_path, "-o", bin_path, "--edition=2021"],
                capture_output=True,
                text=True,
                timeout=self.COMPILE_TIMEOUT,
            )
            return proc.returncode == 0, bin_path, (proc.stdout + proc.stderr).strip()
        except subprocess.TimeoutExpired:
            return False, "", "Compile timed out for test."
        except FileNotFoundError:
            return False, "", "rustc not found."
282
+
283
    def _run_tests(self, code: str, tests: List[Dict]) -> List[Dict]:
        """
        Run each test assertion as a separate Rust binary.

        Each entry of *tests* is expected to carry ``test_assertion``
        (a Rust expression/statement) plus optional ``name`` and
        ``should_compile`` fields.

        Returns list of result dicts with keys: name, passed, ran, error.
        """
        results: List[Dict] = []
        with tempfile.TemporaryDirectory() as tmpdir:
            for i, test in enumerate(tests):
                name = test.get("name", f"test_{i}")
                assertion = test.get("test_assertion", "")
                should_compile = test.get("should_compile", True)

                result: Dict = {
                    "name": name,
                    "passed": False,
                    "ran": False,
                    "error": None,
                }

                # A test without an assertion can never pass or run.
                if not assertion:
                    result["error"] = "No test assertion defined."
                    results.append(result)
                    continue

                # Some tests are expected to fail compilation (should_compile=False)
                # treat successful compilation + correct output as pass
                bin_test_name = f"t{i}_{name[:20]}"
                compiled, bin_path, compiler_out = self._build_test_binary(
                    code, assertion, tmpdir, bin_test_name
                )

                if not compiled:
                    if not should_compile:
                        # The problem's starter code deliberately doesn't compile;
                        # if the submission also doesn't compile this test → skip
                        # NOTE(review): this branch records an error but never
                        # marks the test passed — confirm whether an expected
                        # compile failure should count as a pass instead.
                        result["error"] = "Binary compile failed (expected for broken starter)."
                    else:
                        result["error"] = f"Compile error: {compiler_out[:300]}"
                    result["ran"] = False
                    results.append(result)
                    continue

                # Run the binary
                result["ran"] = True
                try:
                    run_proc = subprocess.run(
                        [bin_path],
                        capture_output=True,
                        text=True,
                        timeout=self.RUN_TIMEOUT,
                    )
                    stdout = run_proc.stdout.strip()
                    # The PASS sentinel proves the generated main reached the
                    # println! after the assertion, i.e. it did not panic.
                    if run_proc.returncode == 0 and f"PASS:{bin_test_name}" in stdout:
                        result["passed"] = True
                    else:
                        result["error"] = (
                            f"Test failed. Exit={run_proc.returncode}. "
                            f"stderr={run_proc.stderr[:200]}"
                        )
                except subprocess.TimeoutExpired:
                    result["error"] = "Test execution timed out."
                except Exception as exc:
                    result["error"] = str(exc)

                results.append(result)

        return results
350
+
351
+ # ------------------------------------------------------------------
352
+ # Elegance scoring
353
+ # ------------------------------------------------------------------
354
+
355
+ def _score_elegance(self, code: str) -> float:
356
+ """
357
+ Heuristic code-quality score in [0, 1].
358
+
359
+ Penalties:
360
+ - Each `.unwrap()` call → -0.15 (max -0.45)
361
+ - Each `.expect(` call → -0.05 (max -0.15)
362
+ - Lines > 100 chars → -0.05 per violation (max -0.20)
363
+ - `unsafe` blocks → -0.20 unless problem requires FFI
364
+
365
+ Bonuses:
366
+ - Uses `?` operator → +0.10
367
+ - Uses `match` expressions → +0.05
368
+ - Has doc comments (`///`) → +0.05
369
+ """
370
+ score = 1.0
371
+
372
+ unwrap_count = len(re.findall(r'\.unwrap\(\)', code))
373
+ score -= min(unwrap_count * 0.15, 0.45)
374
+
375
+ expect_count = len(re.findall(r'\.expect\(', code))
376
+ score -= min(expect_count * 0.05, 0.15)
377
+
378
+ long_lines = sum(1 for line in code.splitlines() if len(line) > 100)
379
+ score -= min(long_lines * 0.05, 0.20)
380
+
381
+ if "unsafe" in code:
382
+ score -= 0.20
383
+
384
+ if "?" in code:
385
+ score += 0.10
386
+ if "match " in code or "match\n" in code:
387
+ score += 0.05
388
+ if "///" in code:
389
+ score += 0.05
390
+
391
+ return round(max(0.0, min(1.0, score)), 4)
392
+
393
+ # ------------------------------------------------------------------
394
+ # Efficiency scoring
395
+ # ------------------------------------------------------------------
396
+
397
+ def _score_efficiency(self, code: str, baseline_ms: float) -> float:
398
+ """
399
+ Time the execution by compiling + running a minimal binary.
400
+ Score = min(1.0, baseline_ms / actual_ms).
401
+ Returns 0.0 if compilation or execution fails.
402
+ """
403
+ body = self._strip_main(code)
404
+ # Build a binary with an empty main to measure startup + run overhead
405
+ test_src = f"""
406
+ #[allow(unused_imports, dead_code, unused_variables)]
407
+ {body}
408
+
409
+ fn main() {{}}
410
+ """
411
+ with tempfile.TemporaryDirectory() as tmpdir:
412
+ src_path = os.path.join(tmpdir, "eff.rs")
413
+ bin_path = os.path.join(tmpdir, "eff")
414
+ with open(src_path, "w", encoding="utf-8") as f:
415
+ f.write(test_src)
416
+ try:
417
+ # Compile
418
+ proc = subprocess.run(
419
+ ["rustc", src_path, "-o", bin_path, "--edition=2021"],
420
+ capture_output=True, text=True, timeout=self.COMPILE_TIMEOUT,
421
+ )
422
+ if proc.returncode != 0:
423
+ return 0.0
424
+ # Time the run
425
+ t0 = time.monotonic()
426
+ run_proc = subprocess.run(
427
+ [bin_path], capture_output=True, timeout=self.RUN_TIMEOUT
428
+ )
429
+ elapsed_ms = (time.monotonic() - t0) * 1000.0
430
+ if run_proc.returncode != 0:
431
+ return 0.0
432
+ return round(min(1.0, baseline_ms / max(elapsed_ms, 0.1)), 4)
433
+ except Exception:
434
+ return 0.0
uv.lock ADDED
The diff for this file is too large to render. See raw diff