Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- Dockerfile +73 -0
- README.md +213 -5
- __init__.py +16 -0
- client.py +67 -0
- inference.py +128 -0
- models.py +31 -0
- openenv.yaml +56 -0
- openenv_rust_coder.egg-info/PKG-INFO +11 -0
- openenv_rust_coder.egg-info/SOURCES.txt +15 -0
- openenv_rust_coder.egg-info/dependency_links.txt +1 -0
- openenv_rust_coder.egg-info/entry_points.txt +2 -0
- openenv_rust_coder.egg-info/requires.txt +7 -0
- openenv_rust_coder.egg-info/top_level.txt +1 -0
- problems.json +254 -0
- pyproject.toml +48 -0
- scripts/validate-submission.sh +73 -0
- server/Dockerfile +72 -0
- server/__init__.py +11 -0
- server/app.py +176 -0
- server/requirements.txt +6 -0
- server/rust_coder_environment.py +434 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Multi-stage build for Rust Coder Environment
# Build context: repo root (contains models.py, problems.json, pyproject.toml, uv.lock)
#
# Stage 1 (builder): installs the Rust toolchain for the non-root user and
#                    resolves the Python dependencies once.
# Stage 2 (runtime): copies the Rust toolchain from the builder, installs the
#                    Python dependencies again and serves the FastAPI app.
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

# 1. Environment Setup
USER root
WORKDIR /app

# Install build essentials for Rust (linker, etc.)
RUN apt-get update && \
    apt-get install -y --no-install-recommends git curl build-essential ca-certificates && \
    rm -rf /var/lib/apt/lists/*

# 2. Create the non-root user (Hugging Face default)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.cargo/bin:/home/user/.local/bin:$PATH

# 3. Install Rust toolchain as 'user'
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
RUN rustup toolchain install stable

# 4. Copy environment code and install Python dependencies
WORKDIR $HOME/app/env
COPY --chown=user . $HOME/app/env

# Install uv (if not present) and then the virtual environment
RUN if ! command -v uv >/dev/null 2>&1; then \
        curl -LsSf https://astral.sh/uv/install.sh | sh; \
    fi

RUN --mount=type=cache,target=/home/user/.cache/uv,uid=1000,gid=1000 \
    uv sync --no-editable

# -------------------------------------------------------------
# Final Runtime Stage
# -------------------------------------------------------------
FROM ${BASE_IMAGE}

USER root
# build-essential stays in the runtime image: rustc needs a system linker to
# compile the submitted Rust code. curl is used by the HEALTHCHECK below.
RUN apt-get update && \
    apt-get install -y --no-install-recommends curl build-essential ca-certificates && \
    rm -rf /var/lib/apt/lists/*

# Create the user again in the final stage
RUN useradd -m -u 1000 user
USER user
# PATH includes ~/.local/bin because that is where the uv installer below puts
# the `uv` binary; without it `uv sync` only works if the base image already
# ships uv. PYTHONPATH appends the inherited value only when one is set, so an
# unset variable no longer leaves a trailing ':' (which Python treats as the
# current working directory).
ENV HOME=/home/user \
    PATH="/home/user/app/env/.venv/bin:/home/user/.cargo/bin:/home/user/.local/bin:$PATH" \
    PYTHONPATH="/home/user/app/env${PYTHONPATH:+:$PYTHONPATH}"

# Copy Cargo/Rustup from builder and then the local code
COPY --from=builder --chown=user /home/user/.cargo /home/user/.cargo
COPY --from=builder --chown=user /home/user/.rustup /home/user/.rustup

WORKDIR $HOME/app/env
COPY --chown=user . $HOME/app/env

# Install uv and Python dependencies in the FINAL stage
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
RUN uv sync --no-editable

# -------------------------------------------------------------
# Final Config
# -------------------------------------------------------------
EXPOSE 8000
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

ENV ENABLE_WEB_INTERFACE=true
CMD ["/home/user/app/env/.venv/bin/uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000", "--log-level", "info"]
|
README.md
CHANGED
|
@@ -1,10 +1,218 @@
|
|
| 1 |
---
|
| 2 |
-
title: Rust Coder
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
|
|
|
|
|
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Rust Coder OpenEnv
|
| 3 |
+
emoji: 🦀
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: yellow
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 8000
|
| 8 |
+
base_path: /web
|
| 9 |
pinned: false
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
+
- software-engineering
|
| 13 |
+
- rust
|
| 14 |
---
|
| 15 |
|
| 16 |
+
# Rust Coder: Systems Engineering Environment
|
| 17 |
+
|
| 18 |
+
Rust Coder is a high-fidelity **OpenEnv** environment designed to evaluate and train LLM agents on real-world Rust systems programming tasks. Unlike toy environments, Rust Coder simulates realistic engineering scenarios involving the borrow checker, concurrency, and memory safety.
|
| 19 |
+
|
| 20 |
+
## Motivation
|
| 21 |
+
|
| 22 |
+
Rust is uniquely challenging for AI agents due to its strict compile-time safety guarantees. This environment provides a 10-task progression that measures an agent's ability to:
|
| 23 |
+
|
| 24 |
+
1. Fix borrow checker violations
|
| 25 |
+
2. Correctly annotate lifetimes
|
| 26 |
+
3. Resolve concurrency deadlocks
|
| 27 |
+
4. Write unsafe FFI code correctly
|
| 28 |
+
5. Identify and prevent memory leaks
|
| 29 |
+
6. Optimize data pipelines for performance
|
| 30 |
+
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
## Action Space
|
| 34 |
+
|
| 35 |
+
**Type**: `RustCoderAction`
|
| 36 |
+
|
| 37 |
+
The agent submits a single string containing the complete, fixed Rust source code.
|
| 38 |
+
|
| 39 |
+
| Field | Type | Description |
|
| 40 |
+
|-------|--------|------------------------------------------|
|
| 41 |
+
| `code` | string | Full Rust source code to compile and test |
|
| 42 |
+
|
| 43 |
+
## Observation Space
|
| 44 |
+
|
| 45 |
+
**Type**: `RustCoderObservation`
|
| 46 |
+
|
| 47 |
+
The environment returns detailed feedback after each submission:
|
| 48 |
+
|
| 49 |
+
| Field | Type | Description |
|
| 50 |
+
|------------------------|-------------|-----------------------------------------------------|
|
| 51 |
+
| `problem_description` | string | Task requirements and context |
|
| 52 |
+
| `starter_code` | string | The intentionally broken code to fix |
|
| 53 |
+
| `compilation_success` | bool | Whether `rustc` compiled the submitted code |
|
| 54 |
+
| `compilation_output` | string | Raw compiler errors and warnings |
|
| 55 |
+
| `test_results` | list[dict] | Per-test pass/fail results with error details |
|
| 56 |
+
| `reward_breakdown` | dict | Weighted score breakdown across 5 dimensions |
|
| 57 |
+
|
| 58 |
+
---
|
| 59 |
+
|
| 60 |
+
## Reward Function
|
| 61 |
+
|
| 62 |
+
Total reward is a weighted sum of 5 dimensions, each normalized to [0, 1]:
|
| 63 |
+
|
| 64 |
+
| Dimension | Weight | Metric |
|
| 65 |
+
|-----------------|--------|---------------------------------------------------|
|
| 66 |
+
| Compilation | 40% | Binary success/failure of `rustc` |
|
| 67 |
+
| Correctness | 20% | Fraction of test assertions that pass |
|
| 68 |
+
| Coverage | 20% | Fraction of tests that successfully ran |
|
| 69 |
+
| Elegance | 10% | Code quality heuristics (avoids `.unwrap()`, long lines, `unsafe`) |
|
| 70 |
+
| Efficiency | 10% | Execution time vs. per-problem baseline |
|
| 71 |
+
|
| 72 |
+
Reward provides partial signal at every step — compilation alone earns 0.40, passing all tests earns up to 1.0.
|
| 73 |
+
|
| 74 |
+
---
|
| 75 |
+
|
| 76 |
+
## Tasks
|
| 77 |
+
|
| 78 |
+
10 sequential problems with increasing difficulty:
|
| 79 |
+
|
| 80 |
+
| ID | Title | Difficulty | Skill Evaluated |
|
| 81 |
+
|----|------------------------------------|------------|-------------------------------|
|
| 82 |
+
| 1 | Broken CLI Argument Parser | Easy | Enums & pattern matching |
|
| 83 |
+
| 2 | Conflicting Borrows | Easy→Med | Borrow checker |
|
| 84 |
+
| 3 | Invalid Lifetime Annotations | Medium | Lifetime annotations |
|
| 85 |
+
| 4 | Business Logic Errors | Medium | Math & correctness |
|
| 86 |
+
| 5 | Linked List Management | Medium | Ownership & data structures |
|
| 87 |
+
| 6 | Multi-threaded Deadlocks | Hard | Mutex & concurrency |
|
| 88 |
+
| 7 | Async Borrowing Conflicts | Hard | Async/await lifetimes |
|
| 89 |
+
| 8 | Unsafe FFI Integration | Hard | `unsafe` & C interop |
|
| 90 |
+
| 9 | Inefficient Data Pipeline | Hard | Performance optimization |
|
| 91 |
+
| 10 | Memory Leak Prevention | Hard+ | Weak pointers & ownership |
|
| 92 |
+
|
| 93 |
+
---
|
| 94 |
+
|
| 95 |
+
## Environment Variables / Secrets
|
| 96 |
+
|
| 97 |
+
The environment reads the following variables. Set them as **HF Space secrets** (Settings → Variables and Secrets) when deploying to Hugging Face, or in a local `.env` file for development.
|
| 98 |
+
|
| 99 |
+
| Variable | Required | Default | Description |
|
| 100 |
+
|----------------|----------|--------------------------------------|--------------------------------------|
|
| 101 |
+
| `HF_TOKEN` | Yes | — | Hugging Face API token for LLM calls |
|
| 102 |
+
| `API_BASE_URL` | No | `https://router.huggingface.co/v1` | Inference endpoint |
|
| 103 |
+
| `MODEL_NAME` | No | `Qwen/Qwen2.5-72B-Instruct` | Model to use for evaluation |
|
| 104 |
+
|
| 105 |
+
> **Note**: The `.env` file is excluded from Docker images by `.dockerignore`. On HF Spaces, secrets are injected as OS environment variables by the platform — `load_dotenv()` silently does nothing if no file is present, and `os.getenv()` reads from the platform-injected vars. This is the correct behavior.
|
| 106 |
+
|
| 107 |
+
---
|
| 108 |
+
|
| 109 |
+
## Setup & Usage
|
| 110 |
+
|
| 111 |
+
### Local Development
|
| 112 |
+
|
| 113 |
+
```bash
|
| 114 |
+
# 1. Clone and enter the repo
|
| 115 |
+
git clone https://github.com/your-username/rust_coder
|
| 116 |
+
cd rust_coder
|
| 117 |
+
|
| 118 |
+
# 2. Create .env with your credentials
|
| 119 |
+
cat > .env << EOF
|
| 120 |
+
HF_TOKEN=hf_your_token_here
|
| 121 |
+
API_BASE_URL=https://router.huggingface.co/v1
|
| 122 |
+
MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
|
| 123 |
+
EOF
|
| 124 |
+
|
| 125 |
+
# 3. Build the Docker image (uses root Dockerfile)
|
| 126 |
+
docker build -t rust_coder:latest .
|
| 127 |
+
|
| 128 |
+
# 4. Run the environment server
|
| 129 |
+
docker run -d -p 8000:8000 --env-file .env --name rust_env rust_coder:latest
|
| 130 |
+
|
| 131 |
+
# 5. Verify it's healthy
|
| 132 |
+
curl http://localhost:8000/health
|
| 133 |
+
# → {"status": "healthy"}
|
| 134 |
+
|
| 135 |
+
# 6. Run the inference benchmark
|
| 136 |
+
python inference.py
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
### Docker Commands Reference
|
| 140 |
+
|
| 141 |
+
```bash
|
| 142 |
+
# Build
|
| 143 |
+
docker build -t rust_coder:latest .
|
| 144 |
+
|
| 145 |
+
# Run with .env file
|
| 146 |
+
docker run -d -p 8000:8000 --env-file .env --name rust_env rust_coder:latest
|
| 147 |
+
|
| 148 |
+
# View logs
|
| 149 |
+
docker logs rust_env
|
| 150 |
+
|
| 151 |
+
# Stop
|
| 152 |
+
docker stop rust_env
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
### Environment API
|
| 156 |
+
|
| 157 |
+
```bash
|
| 158 |
+
# Reset (returns first problem)
|
| 159 |
+
curl -X POST http://localhost:8000/reset
|
| 160 |
+
|
| 161 |
+
# Step (submit Rust code)
|
| 162 |
+
curl -X POST http://localhost:8000/step \
|
| 163 |
+
-H "Content-Type: application/json" \
|
| 164 |
+
-d '{"action": {"code": "fn main() { println!(\"hello\"); }"}}'
|
| 165 |
+
|
| 166 |
+
# Health check
|
| 167 |
+
curl http://localhost:8000/health
|
| 168 |
+
```
|
| 169 |
+
|
| 170 |
+
### HF Spaces Deployment
|
| 171 |
+
|
| 172 |
+
```bash
|
| 173 |
+
# Install HF CLI
|
| 174 |
+
pip install huggingface_hub
|
| 175 |
+
|
| 176 |
+
# Login
|
| 177 |
+
huggingface-cli login
|
| 178 |
+
|
| 179 |
+
# Push to Space
|
| 180 |
+
openenv push --repo-id your-username/rust-coder
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
Then go to your Space settings and add secrets:
|
| 184 |
+
- `HF_TOKEN` → your Hugging Face API token
|
| 185 |
+
- `MODEL_NAME` → e.g. `Qwen/Qwen2.5-72B-Instruct`
|
| 186 |
+
|
| 187 |
+
---
|
| 188 |
+
|
| 189 |
+
## Baseline Scores
|
| 190 |
+
|
| 191 |
+
Baseline using **Qwen/Qwen2.5-72B-Instruct** via Hugging Face router:
|
| 192 |
+
|
| 193 |
+
| Metric | Score |
|
| 194 |
+
|----------------|-------|
|
| 195 |
+
| Average reward | 0.59 |
|
| 196 |
+
| Compilation % | ~85% |
|
| 197 |
+
| Correctness % | ~45% |
|
| 198 |
+
|
| 199 |
+
---
|
| 200 |
+
|
| 201 |
+
## Project Structure
|
| 202 |
+
|
| 203 |
+
```
|
| 204 |
+
rust_coder/
|
| 205 |
+
├── Dockerfile # Root Dockerfile (used by validator + HF Spaces)
|
| 206 |
+
├── server/Dockerfile # Identical copy (used for -f flag builds)
|
| 207 |
+
├── openenv.yaml # OpenEnv spec metadata
|
| 208 |
+
├── pyproject.toml # Python package config
|
| 209 |
+
├── uv.lock # Locked dependencies
|
| 210 |
+
├── problems.json # 10 coding problems dataset
|
| 211 |
+
├── models.py # Pydantic action/observation types
|
| 212 |
+
├── client.py # WebSocket client for RustCoderEnv
|
| 213 |
+
├── inference.py # Baseline inference script (entry point)
|
| 214 |
+
├── __init__.py # Package exports
|
| 215 |
+
└── server/
|
| 216 |
+
├── app.py # FastAPI + Gradio server
|
| 217 |
+
└── rust_coder_environment.py # Core environment logic
|
| 218 |
+
```
|
__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Rust Coder Environment."""
|
| 8 |
+
|
| 9 |
+
from .client import RustCoderEnv
|
| 10 |
+
from .models import RustCoderAction, RustCoderObservation
|
| 11 |
+
|
| 12 |
+
__all__ = [
|
| 13 |
+
"RustCoderAction",
|
| 14 |
+
"RustCoderObservation",
|
| 15 |
+
"RustCoderEnv",
|
| 16 |
+
]
|
client.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Rust Coder Environment Client."""
|
| 2 |
+
|
| 3 |
+
from typing import Dict
|
| 4 |
+
|
| 5 |
+
from openenv.core import EnvClient
|
| 6 |
+
from openenv.core.client_types import StepResult
|
| 7 |
+
from openenv.core.env_server.types import State
|
| 8 |
+
|
| 9 |
+
from models import RustCoderAction, RustCoderObservation
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class RustCoderEnv(
    EnvClient[RustCoderAction, RustCoderObservation, State]
):
    """
    Client for the Rust Coder Environment.

    Maintains a persistent WebSocket connection to the environment server.
    The three private hooks below translate between the typed action /
    observation models and the JSON payloads exchanged with the server.

    Example:
        >>> with RustCoderEnv(base_url="http://localhost:8000") as client:
        ...     result = client.reset()
        ...     print(result.observation.problem_description)
        ...     result = client.step(RustCoderAction(code="fn main() {}"))
        ...     print(result.reward)

    Example with Docker:
        >>> client = RustCoderEnv.from_docker_image("rust_coder-env:latest")
        >>> try:
        ...     result = client.reset()
        ...     result = client.step(RustCoderAction(code="fn main() {}"))
        ... finally:
        ...     client.close()
    """

    def _step_payload(self, action: RustCoderAction) -> Dict:
        """Convert RustCoderAction to JSON payload for step message."""
        return {
            "code": action.code,
        }

    def _parse_result(self, payload: Dict) -> StepResult[RustCoderObservation]:
        """Parse server response into StepResult[RustCoderObservation].

        Missing fields fall back to neutral defaults (empty string / list /
        dict, ``False``, ``0.0``) so a partial payload still yields a usable
        observation.
        """
        # `or {}` also covers an explicit `"observation": null`, which
        # `.get("observation", {})` would pass through as None.
        obs_data = payload.get("observation") or {}
        reward = payload.get("reward", 0.0)
        done = payload.get("done", False)

        observation = RustCoderObservation(
            problem_description=obs_data.get("problem_description", ""),
            starter_code=obs_data.get("starter_code", ""),
            compilation_success=obs_data.get("compilation_success", False),
            compilation_output=obs_data.get("compilation_output", ""),
            test_results=obs_data.get("test_results", []),
            reward_breakdown=obs_data.get("reward_breakdown", {}),
            done=done,
            reward=reward,
        )

        return StepResult(
            observation=observation,
            reward=reward,
            done=done,
        )

    def _parse_state(self, payload: Dict) -> State:
        """Parse server response into State object."""
        return State(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
        )
|
inference.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import json
|
| 4 |
+
import asyncio
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
from openai import OpenAI
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
# Load variables from a local .env file, if one exists, before reading them.
load_dotenv()

# --- Competition Configuration ---
# `or` (rather than a getenv default) also replaces variables that are set
# but empty.
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
ENV_URL = os.getenv("ENV_URL") or "http://localhost:8000"

# Episode constants: 10 problems, each worth max reward 1.0
MAX_STEPS = 10
MAX_TOTAL_REWARD = 10.0
SUCCESS_SCORE_THRESHOLD = 0.5
|
| 21 |
+
|
| 22 |
+
# Import client (ensure rust_coder is in PYTHONPATH)
|
| 23 |
+
from client import RustCoderEnv
|
| 24 |
+
from models import RustCoderAction
|
| 25 |
+
|
| 26 |
+
# --- Strict Logging Helpers ---
|
| 27 |
+
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` marker line that opens an evaluation run.

    Args:
        task: Task identifier written to the ``task`` field.
        env: Environment identifier written to the ``env`` field.
        model: Model name written to the ``model`` field.
    """
    print(f'[START] task="{task}" env="{env}" model="{model}"', flush=True)
|
| 29 |
+
|
| 30 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str] = None) -> None:
    """Print one ``[STEP]`` line describing a single environment step.

    The action is flattened onto one line (newlines become a literal ``\\n``)
    and cut to 100 characters. The ``...`` suffix is added only when text was
    actually cut, so it reliably signals truncation.

    Args:
        step: 1-based step number.
        action: The submitted code (or a short marker such as ``"error"``).
        reward: Reward for the step, printed with four decimals.
        done: Whether the episode ended; printed as ``true``/``false``.
        error: Optional error message, appended as ``error="..."`` when truthy.
    """
    max_len = 100
    escaped_action = action.replace('\n', '\\n')
    if len(escaped_action) > max_len:
        escaped_action = escaped_action[:max_len] + "..."

    log_line = f'[STEP] step={step} action="{escaped_action}" reward={reward:.4f} done={str(done).lower()}'
    if error:
        log_line += f' error="{error}"'
    print(log_line, flush=True)
|
| 36 |
+
|
| 37 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the ``[END]`` summary line that closes an evaluation run.

    Args:
        success: Overall pass/fail flag; printed as ``true``/``false``.
        steps: Number of steps taken.
        score: Final score, printed with four decimals.
        rewards: Per-step rewards, serialised as a JSON array.
    """
    print(
        f'[END] success={str(success).lower()} steps={steps} score={score:.4f} rewards={json.dumps(rewards)}',
        flush=True,
    )
|
| 39 |
+
|
| 40 |
+
# --- LLM Solution Logic ---
|
| 41 |
+
# A fence explicitly tagged as Rust; an unterminated fence runs to end of text.
_RUST_FENCE_RE = re.compile(r"```rust[ \t]*\r?\n?(.*?)(?:```|\Z)", re.DOTALL)
# Any fence; an optional info string (language tag) on the opening line is
# skipped so it does not end up as the first line of the extracted code.
_ANY_FENCE_RE = re.compile(
    r"```(?:[A-Za-z0-9_+#.-]*[ \t]*\r?\n)?(.*?)(?:```|\Z)", re.DOTALL
)


def _extract_fenced_code(text: str) -> str:
    """Return the code inside the first markdown fence, or *text* if unfenced.

    A fence tagged ``rust`` is preferred over any other fence in the reply.
    """
    if "```" not in text:
        return text.strip()
    match = _RUST_FENCE_RE.search(text) or _ANY_FENCE_RE.search(text)
    return (match.group(1) if match else text).strip()


async def get_model_code(prompt: str, client: OpenAI) -> str:
    """Call the LLM to get a Rust solution.

    Args:
        prompt: Task description (optionally including starter code).
        client: Synchronous OpenAI-compatible client used for the request.

    Returns:
        The Rust source extracted from the model reply with markdown fences
        removed. On any failure the error is logged and a Rust comment
        (``// Error: ...``) is returned instead, so the caller can still
        submit something and the episode keeps going.
    """
    try:
        # The client is synchronous; running it in a worker thread keeps the
        # event loop free while waiting on the network.
        completion = await asyncio.to_thread(
            client.chat.completions.create,
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a senior Rust systems engineer. Return ONLY the complete, fixed Rust code. No explanation."},
                {"role": "user", "content": prompt},
            ],
            temperature=0.1,
        )
        text = (completion.choices[0].message.content or "").strip()

        # Extract code from markdown
        return _extract_fenced_code(text)
    except Exception as e:  # deliberate best-effort: never abort the episode
        print(f"[DEBUG] LLM Request failed: {e}", flush=True)
        return f"// Error: {e}"
|
| 63 |
+
|
| 64 |
+
# --- Main Evaluation Loop ---
|
| 65 |
+
async def main() -> None:
    """Run one evaluation episode against the Rust Coder environment.

    Resets the environment, then for up to ``MAX_STEPS`` steps asks the model
    for a solution to the current problem, submits it, and records the
    reward. The final score is the reward sum divided by
    ``MAX_TOTAL_REWARD``, clamped to [0, 1]; the run counts as a success when
    it reaches ``SUCCESS_SCORE_THRESHOLD``.

    Emits ``[START]``, ``[STEP]`` and ``[END]`` log lines. If no API token is
    configured, only an error message is printed and the function returns
    before any of them.
    """
    if not HF_TOKEN:
        print("Error: HF_TOKEN/API_KEY not found in environment.")
        return

    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    env = RustCoderEnv(base_url=ENV_URL)

    log_start(task="rust_coder", env="RustCoder-v1", model=MODEL_NAME)

    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False

    try:
        # Start the single episode (10 problems)
        result = await env.reset()
        obs = result.observation

        for step in range(1, MAX_STEPS + 1):
            if result.done:
                break

            steps_taken = step

            # Format prompt including starter code if available
            prompt = obs.problem_description
            if obs.starter_code:
                prompt += f"\n\nStarter Code:\n```rust\n{obs.starter_code}\n```"

            # 1. Ask model for solution to current task
            code_solution = await get_model_code(prompt, client)

            # 2. Environment step
            result = await env.step(RustCoderAction(code=code_solution))
            obs = result.observation
            reward = result.reward or 0.0  # a missing reward counts as 0.0
            done = result.done

            rewards.append(reward)
            log_step(step=step, action=code_solution, reward=reward, done=done)

            if done:
                break

        # Normalize score to [0, 1] matching sample format
        score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
        score = min(max(score, 0.0), 1.0)
        success = score >= SUCCESS_SCORE_THRESHOLD

    except Exception as e:
        # Report the failure as a terminal step; score/success keep the
        # defaults set above, so the [END] line reports a failed run.
        print(f"[DEBUG] Runtime error: {e}", flush=True)
        log_step(step=steps_taken + 1, action="error", reward=0.0, done=True, error=str(e))

    finally:
        # Closing is best-effort: a failure here must not hide the [END] line.
        try:
            await env.close()
        except Exception as e:
            print(f"[DEBUG] env.close() error: {e}", flush=True)
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)


if __name__ == "__main__":
    asyncio.run(main())
|
models.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Data models for the Rust Coder Environment.
|
| 9 |
+
|
| 10 |
+
The rust_coder environment presents Rust programming tasks, compiles and tests the submitted code, and returns the results together with a reward breakdown.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from openenv.core.env_server.types import Action, Observation
|
| 14 |
+
from pydantic import Field
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class RustCoderAction(Action):
    """Action for the Rust Coder environment - contains the Rust code to evaluate."""

    # Required field: the complete Rust source submitted for the current task.
    code: str = Field(..., description="Rust source code to compile and run")
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class RustCoderObservation(Observation):
    """Observation space for the Rust Coder environment.

    Every field has a default, so an observation can be built from a partial
    payload.
    """

    # --- Task definition ---
    problem_description: str = Field(default="", description="The text description of the current coding task, including requirements.")
    starter_code: str = Field(default="", description="The specific Rust code snippet that needs fixing for this task.")

    # --- Feedback on the last submission ---
    compilation_success: bool = Field(default=False, description="Binary flag indicating if the last submission compiled.")
    compilation_output: str = Field(default="", description="Raw stdout/stderr from the rustc compiler.")
    test_results: list[dict] = Field(default_factory=list, description="A list of results from automated test assertions.")
    reward_breakdown: dict = Field(default_factory=dict, description="Detailed components of the 0.0-1.0 reward.")
|
openenv.yaml
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: rust_coder
|
| 3 |
+
description: "High-fidelity RL environment for evaluating LLM agents on Rust systems programming, including borrow checking, safe concurrency, and memory management."
|
| 4 |
+
type: space
|
| 5 |
+
runtime: fastapi
|
| 6 |
+
app: server.app:app
|
| 7 |
+
port: 8000
|
| 8 |
+
dockerfile: Dockerfile
|
| 9 |
+
tags:
|
| 10 |
+
- openenv
|
| 11 |
+
- software-engineering
|
| 12 |
+
- rust
|
| 13 |
+
- coding-benchmark
|
| 14 |
+
|
| 15 |
+
# Task Definition (Easy -> Medium -> Hard)
|
| 16 |
+
tasks:
|
| 17 |
+
- id: 1
|
| 18 |
+
title: "Broken CLI Argument Parser"
|
| 19 |
+
difficulty: "Easy"
|
| 20 |
+
- id: 2
|
| 21 |
+
title: "Conflicting Borrows"
|
| 22 |
+
difficulty: "Easy"
|
| 23 |
+
- id: 3
|
| 24 |
+
title: "Lifetime Annotations"
|
| 25 |
+
difficulty: "Medium"
|
| 26 |
+
- id: 4
|
| 27 |
+
title: "Business Logic"
|
| 28 |
+
difficulty: "Medium"
|
| 29 |
+
- id: 5
|
| 30 |
+
title: "Linked List Management"
|
| 31 |
+
difficulty: "Medium"
|
| 32 |
+
- id: 6
|
| 33 |
+
title: "Multi-threaded Deadlocks"
|
| 34 |
+
difficulty: "Hard"
|
| 35 |
+
- id: 7
|
| 36 |
+
title: "Async Borrowing"
|
| 37 |
+
difficulty: "Hard"
|
| 38 |
+
- id: 8
|
| 39 |
+
title: "Unsafe FFI Integration"
|
| 40 |
+
difficulty: "Hard"
|
| 41 |
+
- id: 9
|
| 42 |
+
title: "Inefficient Data Pipelines"
|
| 43 |
+
difficulty: "Hard"
|
| 44 |
+
- id: 10
|
| 45 |
+
title: "Memory Leak Prevention"
|
| 46 |
+
difficulty: "Hard+"
|
| 47 |
+
|
| 48 |
+
# Definitions for Documentation and Graders
|
| 49 |
+
action_space:
|
| 50 |
+
type: "RustCoderAction"
|
| 51 |
+
description: "A single string containing the fixed Rust code."
|
| 52 |
+
|
| 53 |
+
observation_space:
|
| 54 |
+
type: "RustCoderObservation"
|
| 55 |
+
description: "Observation containing problem description, compilation logs, test results, and reward breakdown."
|
| 56 |
+
|
openenv_rust_coder.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: openenv-rust_coder
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: Rust Coder environment for OpenEnv
|
| 5 |
+
Requires-Python: >=3.10
|
| 6 |
+
Requires-Dist: openenv-core[core]>=0.2.2
|
| 7 |
+
Requires-Dist: openai>=1.0.0
|
| 8 |
+
Requires-Dist: pydantic>=2.0.0
|
| 9 |
+
Provides-Extra: dev
|
| 10 |
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
| 11 |
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
openenv_rust_coder.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
README.md
|
| 2 |
+
pyproject.toml
|
| 3 |
+
./__init__.py
|
| 4 |
+
./client.py
|
| 5 |
+
./inference.py
|
| 6 |
+
./models.py
|
| 7 |
+
openenv_rust_coder.egg-info/PKG-INFO
|
| 8 |
+
openenv_rust_coder.egg-info/SOURCES.txt
|
| 9 |
+
openenv_rust_coder.egg-info/dependency_links.txt
|
| 10 |
+
openenv_rust_coder.egg-info/entry_points.txt
|
| 11 |
+
openenv_rust_coder.egg-info/requires.txt
|
| 12 |
+
openenv_rust_coder.egg-info/top_level.txt
|
| 13 |
+
server/__init__.py
|
| 14 |
+
server/app.py
|
| 15 |
+
server/rust_coder_environment.py
|
openenv_rust_coder.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
openenv_rust_coder.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
server = rust_coder.server.app:main
|
openenv_rust_coder.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.2
|
| 2 |
+
openai>=1.0.0
|
| 3 |
+
pydantic>=2.0.0
|
| 4 |
+
|
| 5 |
+
[dev]
|
| 6 |
+
pytest>=8.0.0
|
| 7 |
+
pytest-cov>=4.0.0
|
openenv_rust_coder.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
rust_coder
|
problems.json
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"id": 1,
|
| 4 |
+
"title": "Broken CLI Argument Parser",
|
| 5 |
+
"difficulty": "Easy",
|
| 6 |
+
"description": "Fix a command-line tool that parses user input to determine file operations (read, write, append). The implementation uses enums and pattern matching but contains: Mismatched types in enum variants, Incomplete match arms, Incorrect handling of optional arguments. The parser must compile and correctly interpret valid command-line inputs like: 'read file.txt' -> FileOp::Read('file.txt'), 'write file.txt content' -> FileOp::Write('file.txt', Some('content')), 'append file.txt' -> FileOp::Append('file.txt')",
|
| 7 |
+
"starter_code": "#[derive(Debug, PartialEq)]\nenum FileOp {\n Read(String),\n Write(String, Option<String>),\n Append(String),\n}\n\nfn parse_command(input: &str) -> Option<FileOp> {\n let parts: Vec<&str> = input.split_whitespace().collect();\n \n match parts.get(0) {\n Some(&\"read\") => {\n let filename = parts.get(1)?;\n FileOp::Read(filename.to_string()) // BUG: Missing Some()\n }\n Some(&\"write\") => {\n let filename = parts.get(1)?;\n let content = parts.get(2).map(|s| s.to_string());\n Some(FileOp::Write(filename.to_string(), content))\n }\n Some(&\"append\") => {\n let filename = parts.get(1)?;\n // BUG: Missing return statement\n }\n _ => None,\n }\n}\n\nfn main() {\n println!(\"CLI Parser Test\");\n}",
|
| 8 |
+
"tests": [
|
| 9 |
+
{
|
| 10 |
+
"name": "parse_read_command",
|
| 11 |
+
"input_code": "parse_command(\"read file.txt\")",
|
| 12 |
+
"expected_output": "Some\\(FileOp::Read\\(\"file\\.txt\"\\)\\)",
|
| 13 |
+
"should_compile": false,
|
| 14 |
+
"test_assertion": "assert_eq!(parse_command(\"read file.txt\"), Some(FileOp::Read(\"file.txt\".to_string())))"
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"name": "parse_write_command",
|
| 18 |
+
"input_code": "parse_command(\"write file.txt hello\")",
|
| 19 |
+
"expected_output": "Some\\(FileOp::Write\\(\"file\\.txt\", Some\\(\"hello\"\\)\\)\\)",
|
| 20 |
+
"should_compile": false,
|
| 21 |
+
"test_assertion": "assert_eq!(parse_command(\"write file.txt hello\"), Some(FileOp::Write(\"file.txt\".to_string(), Some(\"hello\".to_string()))))"
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"name": "parse_append_command",
|
| 25 |
+
"input_code": "parse_command(\"append file.txt\")",
|
| 26 |
+
"expected_output": "Some\\(FileOp::Append\\(\"file\\.txt\"\\)\\)",
|
| 27 |
+
"should_compile": false,
|
| 28 |
+
"test_assertion": "assert_eq!(parse_command(\"append file.txt\"), Some(FileOp::Append(\"file.txt\".to_string())))"
|
| 29 |
+
}
|
| 30 |
+
],
|
| 31 |
+
"performance_baseline_ms": 10.0
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"id": 2,
|
| 35 |
+
"title": "Conflicting Borrows in Collection Processing",
|
| 36 |
+
"difficulty": "Easy\u2192Medium",
|
| 37 |
+
"description": "Fix a function that processes a vector of strings while conditionally modifying elements and storing references for later use. The implementation mixes mutable and immutable borrows within the same scope, causing borrow checker conflicts. Requirements: Iterate through vector of strings, Store uppercase versions in a results vector, Handle optional transformations without borrowing conflicts, Must compile and execute without panics",
|
| 38 |
+
"starter_code": "fn process_strings(strings: &mut Vec<String>) -> Vec<String> {\n let mut results = Vec::new();\n \n for s in strings {\n // BUG: Cannot borrow as mutable while immutable borrow is active\n let upper = s.to_uppercase();\n s.push_str(\"_processed\"); // Mutable borrow\n results.push(upper);\n }\n \n results\n}\n\nfn main() {\n println!(\"String processing\");\n}",
|
| 39 |
+
"tests": [
|
| 40 |
+
{
|
| 41 |
+
"name": "process_single_string",
|
| 42 |
+
"input_code": "let mut v = vec![\"hello\".to_string()]; process_strings(&mut v);",
|
| 43 |
+
"should_compile": false,
|
| 44 |
+
"test_assertion": "assert_eq!(process_strings(&mut vec![\"hello\".to_string()]), vec![\"HELLO\".to_string()])"
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"name": "process_multiple_strings",
|
| 48 |
+
"input_code": "let mut v = vec![\"a\".to_string(), \"b\".to_string()]; process_strings(&mut v);",
|
| 49 |
+
"should_compile": false,
|
| 50 |
+
"test_assertion": "assert_eq!(process_strings(&mut vec![\"a\".to_string(), \"b\".to_string()]), vec![\"A\".to_string(), \"B\".to_string()])"
|
| 51 |
+
}
|
| 52 |
+
],
|
| 53 |
+
"performance_baseline_ms": 50.0
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"id": 3,
|
| 57 |
+
"title": "Invalid Lifetime Annotations in Text API",
|
| 58 |
+
"difficulty": "Medium",
|
| 59 |
+
"description": "Fix a text-processing utility that accepts multiple string slices and returns a reference derived from them. The function either fails to compile or produces incorrect lifetime relationships, risking references that outlive their input data. Requirements: Function must accept multiple &str parameters, Return a &str derived from the inputs, Properly annotate lifetimes, Must be safe (no dangling references)",
|
| 60 |
+
"starter_code": "// BUG: Invalid lifetime annotations - which lifetime should the return type use?\nfn longest_text<'a>(s1: &'a str, s2: &'a str) -> &'a str {\n if s1.len() > s2.len() {\n s1\n } else {\n s2\n }\n}\n\n// BUG: This function has a lifetime issue\nfn find_first_word(s: &str) -> &str {\n let bytes = s.as_bytes();\n for (i, &byte) in bytes.iter().enumerate() {\n if byte == b' ' {\n return &s[0..i];\n }\n }\n &s[..]\n}\n\nfn main() {\n println!(\"Lifetime test\");\n}",
|
| 61 |
+
"tests": [
|
| 62 |
+
{
|
| 63 |
+
"name": "longest_text_basic",
|
| 64 |
+
"input_code": "longest_text(\"abc\", \"de\")",
|
| 65 |
+
"expected_output": "\"abc\"",
|
| 66 |
+
"should_compile": true,
|
| 67 |
+
"test_assertion": "assert_eq!(longest_text(\"abc\", \"de\"), \"abc\")"
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"name": "find_first_word_with_space",
|
| 71 |
+
"input_code": "find_first_word(\"hello world\")",
|
| 72 |
+
"expected_output": "\"hello\"",
|
| 73 |
+
"should_compile": true,
|
| 74 |
+
"test_assertion": "assert_eq!(find_first_word(\"hello world\"), \"hello\")"
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"name": "find_first_word_no_space",
|
| 78 |
+
"input_code": "find_first_word(\"hello\")",
|
| 79 |
+
"expected_output": "\"hello\"",
|
| 80 |
+
"should_compile": true,
|
| 81 |
+
"test_assertion": "assert_eq!(find_first_word(\"hello\"), \"hello\")"
|
| 82 |
+
}
|
| 83 |
+
],
|
| 84 |
+
"performance_baseline_ms": 10.0
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"id": 4,
|
| 88 |
+
"title": "Business Logic Producing Incorrect Results",
|
| 89 |
+
"difficulty": "Medium",
|
| 90 |
+
"description": "Fix a module implementing order validation logic including pricing, discounts, and boundary conditions. The code compiles but produces incorrect outputs for edge cases such as: Zero values, Overlapping discounts, Large numeric inputs, Negative prices. Requirements: Calculate order total correctly, Apply discounts properly (no double-counting), Handle edge cases (zero items, negative values), Be mathematically sound",
|
| 91 |
+
"starter_code": "#[derive(Debug, Clone)]\nstruct Order {\n quantity: i32,\n unit_price: f64,\n discount_percent: f64,\n}\n\nimpl Order {\n fn new(quantity: i32, unit_price: f64) -> Self {\n Order {\n quantity,\n unit_price,\n discount_percent: 0.0,\n }\n }\n\n fn with_discount(mut self, discount: f64) -> Self {\n self.discount_percent = discount;\n self\n }\n\n fn calculate_total(&self) -> f64 {\n let subtotal = self.quantity as f64 * self.unit_price;\n // BUG: Incorrect discount calculation\n let discount = subtotal * (self.discount_percent / 100.0);\n subtotal - discount // Missing rounding/validation\n }\n}\n\nfn main() {\n println!(\"Order test\");\n}",
|
| 92 |
+
"tests": [
|
| 93 |
+
{
|
| 94 |
+
"name": "simple_order",
|
| 95 |
+
"input_code": "Order::new(10, 5.0).calculate_total()",
|
| 96 |
+
"expected_output": "50\\.0",
|
| 97 |
+
"should_compile": true,
|
| 98 |
+
"test_assertion": "assert_eq!(Order::new(10, 5.0).calculate_total(), 50.0)"
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"name": "order_with_discount",
|
| 102 |
+
"input_code": "Order::new(10, 5.0).with_discount(10.0).calculate_total()",
|
| 103 |
+
"expected_output": "45\\.0",
|
| 104 |
+
"should_compile": true,
|
| 105 |
+
"test_assertion": "assert_eq!(Order::new(10, 5.0).with_discount(10.0).calculate_total(), 45.0)"
|
| 106 |
+
},
|
| 107 |
+
{
|
| 108 |
+
"name": "zero_quantity",
|
| 109 |
+
"input_code": "Order::new(0, 5.0).calculate_total()",
|
| 110 |
+
"expected_output": "0\\.0",
|
| 111 |
+
"should_compile": true,
|
| 112 |
+
"test_assertion": "assert_eq!(Order::new(0, 5.0).calculate_total(), 0.0)"
|
| 113 |
+
}
|
| 114 |
+
],
|
| 115 |
+
"performance_baseline_ms": 10.0
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"id": 5,
|
| 119 |
+
"title": "Corrupted Singly Linked List",
|
| 120 |
+
"difficulty": "Medium\u2192Hard",
|
| 121 |
+
"description": "Fix a custom singly linked list that supports insertion, deletion, and traversal. The implementation incorrectly manages node ownership and pointer transitions, resulting in: Lost nodes, Inconsistent traversal output, Occasional runtime panics. Requirements: Insert elements at head, Delete elements correctly, Traverse without panics, No memory leaks or lost data",
|
| 122 |
+
"starter_code": "use std::ptr;\n\n#[derive(Debug)]\nstruct Node<T> {\n value: T,\n next: Option<Box<Node<T>>>,\n}\n\n#[derive(Debug)]\nstruct LinkedList<T> {\n head: Option<Box<Node<T>>>,\n}\n\nimpl<T> LinkedList<T> {\n fn new() -> Self {\n LinkedList { head: None }\n }\n\n fn insert(&mut self, value: T) {\n let new_node = Box::new(Node {\n value,\n next: None, // BUG: Should move self.head into next\n });\n self.head = Some(new_node);\n }\n\n fn len(&self) -> usize {\n let mut count = 0;\n let mut current = &self.head;\n while let Some(node) = current {\n count += 1;\n current = &node.next; // Correct, but insert is broken\n }\n count\n }\n}\n\nfn main() {\n println!(\"LinkedList test\");\n}",
|
| 123 |
+
"tests": [
|
| 124 |
+
{
|
| 125 |
+
"name": "insert_single_element",
|
| 126 |
+
"input_code": "let mut ll = LinkedList::new(); ll.insert(5); ll.len()",
|
| 127 |
+
"expected_output": "1",
|
| 128 |
+
"should_compile": true,
|
| 129 |
+
"test_assertion": "let mut ll = LinkedList::new(); ll.insert(5); assert_eq!(ll.len(), 1)"
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"name": "insert_multiple_elements",
|
| 133 |
+
"input_code": "let mut ll = LinkedList::new(); ll.insert(1); ll.insert(2); ll.insert(3); ll.len()",
|
| 134 |
+
"expected_output": "3",
|
| 135 |
+
"should_compile": true,
|
| 136 |
+
"test_assertion": "let mut ll = LinkedList::new(); ll.insert(1); ll.insert(2); ll.insert(3); assert_eq!(ll.len(), 3)"
|
| 137 |
+
}
|
| 138 |
+
],
|
| 139 |
+
"performance_baseline_ms": 20.0
|
| 140 |
+
},
|
| 141 |
+
{
|
| 142 |
+
"id": 6,
|
| 143 |
+
"title": "Deadlock in Multi-threaded Worker System",
|
| 144 |
+
"difficulty": "Hard",
|
| 145 |
+
"description": "Fix a worker system using multiple threads to process jobs from a shared queue protected by synchronization primitives. Under certain workloads, threads block indefinitely due to: Improper lock acquisition order, Shared state handling issues, Missing signal/wake mechanisms. Requirements: Spawn N worker threads, Process jobs from shared queue without deadlock, Handle shutdown gracefully, No panics under load",
|
| 146 |
+
"starter_code": "use std::sync::{Arc, Mutex, mpsc};\nuse std::thread;\n\nfn worker_system(num_workers: usize, jobs: Vec<i32>) -> Vec<i32> {\n let (tx, rx) = mpsc::channel();\n let rx = Arc::new(Mutex::new(rx));\n let results = Arc::new(Mutex::new(Vec::new()));\n \n let mut handles = vec![];\n \n for _ in 0..num_workers {\n let rx = Arc::clone(&rx);\n let results = Arc::clone(&results);\n \n let handle = thread::spawn(move || {\n loop {\n // BUG: Lock acquired but never released before trying to acquire results lock\n let receiver = rx.lock().unwrap();\n match receiver.try_recv() {\n Ok(job) => {\n let result = job * 2;\n // BUG: Tries to lock results while still holding rx lock - DEADLOCK\n results.lock().unwrap().push(result);\n }\n Err(_) => break,\n }\n }\n });\n handles.push(handle);\n }\n \n for job in jobs {\n let _ = tx.send(job); // Ignore send errors\n }\n drop(tx);\n \n for handle in handles {\n let _ = handle.join();\n }\n \n Arc::try_unwrap(results)\n .unwrap()\n .into_inner()\n .unwrap()\n}\n\nfn main() {\n println!(\"Worker system test\");\n}",
|
| 147 |
+
"tests": [
|
| 148 |
+
{
|
| 149 |
+
"name": "single_worker_single_job",
|
| 150 |
+
"input_code": "worker_system(1, vec![5])",
|
| 151 |
+
"expected_output": "vec!\\[10\\]",
|
| 152 |
+
"should_compile": true,
|
| 153 |
+
"test_assertion": "assert_eq!(worker_system(1, vec![5]), vec![10])"
|
| 154 |
+
},
|
| 155 |
+
{
|
| 156 |
+
"name": "multiple_workers",
|
| 157 |
+
"input_code": "worker_system(2, vec![1, 2, 3])",
|
| 158 |
+
"expected_output": "vec!\\[(2|4|6)\\].*vec!\\[(2|4|6)\\].*vec!\\[(2|4|6)\\]",
|
| 159 |
+
"should_compile": true,
|
| 160 |
+
"test_assertion": "let mut result = worker_system(2, vec![1, 2, 3]); result.sort(); assert_eq!(result, vec![2, 4, 6])"
|
| 161 |
+
}
|
| 162 |
+
],
|
| 163 |
+
"performance_baseline_ms": 500.0
|
| 164 |
+
},
|
| 165 |
+
{
|
| 166 |
+
"id": 7,
|
| 167 |
+
"title": "Async Function with Borrowing Conflicts",
|
| 168 |
+
"difficulty": "Hard",
|
| 169 |
+
"description": "Fix an asynchronous function that processes input data and performs non-blocking operations while returning references tied to the input. The implementation violates borrowing constraints in an async context, leading to: Compilation errors when using references across await points, Invalid reference usage. Requirements: Accept &str input, Perform async operation, Return derived reference, Must be sound and compile",
|
| 170 |
+
"starter_code": "use std::pin::Pin;\nuse std::future::Future;\n\n// BUG: Cannot return reference that outlives await point\nasync fn process_async(input: &str) -> &str {\n // Simulating async work\n // tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;\n \n // BUG: input reference cannot be returned from async context like this\n input\n}\n\n// Better approach: return owned data or 'static reference\nfn process_sync(input: &str) -> String {\n input.to_uppercase()\n}\n\nfn main() {\n println!(\"Async test\");\n}",
|
| 171 |
+
"tests": [
|
| 172 |
+
{
|
| 173 |
+
"name": "process_sync_basic",
|
| 174 |
+
"input_code": "process_sync(\"hello\")",
|
| 175 |
+
"expected_output": "\"HELLO\"",
|
| 176 |
+
"should_compile": true,
|
| 177 |
+
"test_assertion": "assert_eq!(process_sync(\"hello\"), \"HELLO\")"
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"name": "process_sync_uppercase",
|
| 181 |
+
"input_code": "process_sync(\"Hello World\")",
|
| 182 |
+
"expected_output": "\"HELLO WORLD\"",
|
| 183 |
+
"should_compile": true,
|
| 184 |
+
"test_assertion": "assert_eq!(process_sync(\"Hello World\"), \"HELLO WORLD\")"
|
| 185 |
+
}
|
| 186 |
+
],
|
| 187 |
+
"performance_baseline_ms": 50.0
|
| 188 |
+
},
|
| 189 |
+
{
|
| 190 |
+
"id": 8,
|
| 191 |
+
"title": "Unsafe FFI Integration Causing Crashes",
|
| 192 |
+
"difficulty": "Hard",
|
| 193 |
+
"description": "Fix Rust code that interfaces with an external C library using raw pointers. The implementation incorrectly handles: Pointer ownership, Memory allocation and deallocation, Undefined behavior risks. Requirements: Safely wrap C library calls, Properly manage memory (allocate/deallocate), No undefined behavior, Handle errors gracefully",
|
| 194 |
+
"starter_code": "extern \"C\" {\n fn malloc(size: usize) -> *mut u8;\n fn free(ptr: *mut u8);\n}\n\nfn allocate_and_init(size: usize) -> Vec<u8> {\n unsafe {\n let ptr = malloc(size);\n // BUG: No null check - ptr could be null\n // BUG: Memory not initialized before use\n let slice = std::slice::from_raw_parts_mut(ptr, size);\n \n // Copy to vec and free\n let vec = slice.to_vec();\n free(ptr); // BUG: Freeing memory still referenced in vec\n vec\n }\n}\n\nfn main() {\n println!(\"FFI test\");\n}",
|
| 195 |
+
"tests": [
|
| 196 |
+
{
|
| 197 |
+
"name": "allocate_small_buffer",
|
| 198 |
+
"input_code": "allocate_and_init(10).len()",
|
| 199 |
+
"expected_output": "10",
|
| 200 |
+
"should_compile": false,
|
| 201 |
+
"test_assertion": "assert_eq!(allocate_and_init(10).len(), 10)"
|
| 202 |
+
}
|
| 203 |
+
],
|
| 204 |
+
"performance_baseline_ms": 100.0
|
| 205 |
+
},
|
| 206 |
+
{
|
| 207 |
+
"id": 9,
|
| 208 |
+
"title": "Inefficient Data Processing Pipeline",
|
| 209 |
+
"difficulty": "Hard",
|
| 210 |
+
"description": "Fix a data pipeline that reads large datasets, applies transformations, and aggregates results. While functionally correct, the implementation has: Excessive memory allocations, Redundant iterations, Inefficient data copying. Requirements: Process data efficiently, Minimize allocations and copies, Use iterators when possible, Produce correct results with better performance",
|
| 211 |
+
"starter_code": "fn process_data(numbers: Vec<i32>) -> i32 {\n // BUG: Multiple unnecessary allocations and iterations\n \n // First pass: filter evens (allocates new vector)\n let evens: Vec<i32> = numbers.iter()\n .filter(|n| n % 2 == 0)\n .copied()\n .collect();\n \n // Second pass: double values (allocates another vector)\n let doubled: Vec<i32> = evens.iter()\n .map(|n| n * 2)\n .collect();\n \n // Third pass: sum (unnecessary iteration)\n let sum: i32 = doubled.iter().sum();\n \n // Fourth pass: filter again (redundant)\n let final_sum: i32 = doubled.iter()\n .filter(|n| n % 4 == 0)\n .sum();\n \n final_sum\n}\n\nfn main() {\n println!(\"Efficiency test\");\n}",
|
| 212 |
+
"tests": [
|
| 213 |
+
{
|
| 214 |
+
"name": "simple_pipeline",
|
| 215 |
+
"input_code": "process_data(vec![1, 2, 3, 4, 5, 6])",
|
| 216 |
+
"expected_output": "24",
|
| 217 |
+
"should_compile": true,
|
| 218 |
+
"test_assertion": "assert_eq!(process_data(vec![1, 2, 3, 4, 5, 6]), 24)"
|
| 219 |
+
},
|
| 220 |
+
{
|
| 221 |
+
"name": "all_odd_numbers",
|
| 222 |
+
"input_code": "process_data(vec![1, 3, 5, 7])",
|
| 223 |
+
"expected_output": "0",
|
| 224 |
+
"should_compile": true,
|
| 225 |
+
"test_assertion": "assert_eq!(process_data(vec![1, 3, 5, 7]), 0)"
|
| 226 |
+
}
|
| 227 |
+
],
|
| 228 |
+
"performance_baseline_ms": 50.0
|
| 229 |
+
},
|
| 230 |
+
{
|
| 231 |
+
"id": 10,
|
| 232 |
+
"title": "Reference-counted Cache with Memory Leak",
|
| 233 |
+
"difficulty": "Hard+",
|
| 234 |
+
"description": "Fix a caching system using reference-counted pointers to share data across components. The design creates cyclic references between cached objects, preventing memory from being released and causing memory usage to grow over time. Requirements: Implement caching without memory leaks, Break circular reference patterns, Use Rc/Arc correctly with Weak pointers when needed, Memory should be released when cache is cleared",
|
| 235 |
+
"starter_code": "use std::rc::Rc;\nuse std::cell::RefCell;\n\n#[derive(Debug)]\nstruct CacheNode<T> {\n key: String,\n value: T,\n // BUG: This creates a cycle that prevents garbage collection\n related: RefCell<Option<Rc<CacheNode<T>>>>,\n}\n\n#[derive(Debug)]\nstruct Cache<T> {\n items: RefCell<Vec<Rc<CacheNode<T>>>>,\n}\n\nimpl<T: Clone> Cache<T> {\n fn new() -> Self {\n Cache {\n items: RefCell::new(Vec::new()),\n }\n }\n\n fn insert(&self, key: String, value: T) {\n let node = Rc::new(CacheNode {\n key,\n value,\n related: RefCell::new(None),\n });\n \n // BUG: Creating cyclic references\n if let Some(last) = self.items.borrow().last() {\n // Rc to Rc creates a cycle\n if let Ok(mut r) = last.related.try_borrow_mut() {\n *r = Some(Rc::clone(&node)); // Cycle here!\n }\n }\n \n self.items.borrow_mut().push(node);\n }\n}\n\nfn main() {\n println!(\"Cache test\");\n}",
|
| 236 |
+
"tests": [
|
| 237 |
+
{
|
| 238 |
+
"name": "cache_insert_single",
|
| 239 |
+
"input_code": "let c = Cache::new(); c.insert(\"key\".to_string(), \"value\".to_string()); c.items.borrow().len()",
|
| 240 |
+
"expected_output": "1",
|
| 241 |
+
"should_compile": true,
|
| 242 |
+
"test_assertion": "let c = Cache::new(); c.insert(\"key\".to_string(), \"value\".to_string()); assert_eq!(c.items.borrow().len(), 1)"
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"name": "cache_insert_multiple",
|
| 246 |
+
"input_code": "let c = Cache::new(); c.insert(\"k1\".to_string(), \"v1\".to_string()); c.insert(\"k2\".to_string(), \"v2\".to_string()); c.items.borrow().len()",
|
| 247 |
+
"expected_output": "2",
|
| 248 |
+
"should_compile": true,
|
| 249 |
+
"test_assertion": "let c = Cache::new(); c.insert(\"k1\".to_string(), \"v1\".to_string()); c.insert(\"k2\".to_string(), \"v2\".to_string()); assert_eq!(c.items.borrow().len(), 2)"
|
| 250 |
+
}
|
| 251 |
+
],
|
| 252 |
+
"performance_baseline_ms": 100.0
|
| 253 |
+
}
|
| 254 |
+
]
|
pyproject.toml
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
[build-system]
|
| 8 |
+
requires = ["setuptools>=45", "wheel"]
|
| 9 |
+
build-backend = "setuptools.build_meta"
|
| 10 |
+
|
| 11 |
+
[project]
|
| 12 |
+
name = "openenv-rust_coder"
|
| 13 |
+
version = "0.1.0"
|
| 14 |
+
description = "Rust Coder environment for OpenEnv"
|
| 15 |
+
requires-python = ">=3.10"
|
| 16 |
+
dependencies = [
|
| 17 |
+
# Core OpenEnv runtime (provides FastAPI server + HTTP client types)
|
| 18 |
+
# install from github
|
| 19 |
+
# "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
|
| 20 |
+
"openenv-core[core]>=0.2.2",
|
| 21 |
+
"openai>=1.0.0",
|
| 22 |
+
"pydantic>=2.0.0",
|
| 23 |
+
"gradio>=4.0.0",
|
| 24 |
+
# Environment-specific dependencies
|
| 25 |
+
# Add all dependencies needed for your environment here
|
| 26 |
+
# Examples:
|
| 27 |
+
# "numpy>=1.19.0",
|
| 28 |
+
# "torch>=2.0.0",
|
| 29 |
+
# "gymnasium>=0.29.0",
|
| 30 |
+
# "openspiel>=1.0.0",
|
| 31 |
+
# "smolagents>=1.22.0,<2",
|
| 32 |
+
]
|
| 33 |
+
|
| 34 |
+
[project.optional-dependencies]
|
| 35 |
+
dev = [
|
| 36 |
+
"pytest>=8.0.0",
|
| 37 |
+
"pytest-cov>=4.0.0",
|
| 38 |
+
]
|
| 39 |
+
|
| 40 |
+
[project.scripts]
|
| 41 |
+
# Server entry point - enables running via: uv run --project . server
|
| 42 |
+
# or: python -m rust_coder.server.app
|
| 43 |
+
server = "rust_coder.server.app:main"
|
| 44 |
+
|
| 45 |
+
[tool.setuptools]
|
| 46 |
+
include-package-data = true
|
| 47 |
+
packages = ["rust_coder", "rust_coder.server"]
|
| 48 |
+
package-dir = { "rust_coder" = ".", "rust_coder.server" = "server" }
|
scripts/validate-submission.sh
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# validate-submission.sh — OpenEnv Submission Validator (Hardened)
|
| 4 |
+
#
|
| 5 |
+
# Checks that your HF Space is live, that openenv.yaml validates, that inference.py
# emits the mandatory log markers, that required files exist, and that enough tasks are defined.
|
| 6 |
+
# Mandatory Log Format Check included.
|
| 7 |
+
|
| 8 |
+
set -e
|
| 9 |
+
|
| 10 |
+
PING_URL=$1
|
| 11 |
+
REPO_DIR=${2:-"."}
|
| 12 |
+
|
| 13 |
+
if [ -z "$PING_URL" ]; then
|
| 14 |
+
echo "Usage: $0 <ping_url> [repo_dir]"
|
| 15 |
+
echo "Example: bash scripts/validate-submission.sh https://huggingface.co/spaces/user/rust-coder"
|
| 16 |
+
exit 1
|
| 17 |
+
fi
|
| 18 |
+
|
| 19 |
+
echo "--- 🔍 1. Testing Connection to HF Space ---"
# Probe /health first, then fall back to the root URL.
# curl exits 0 for any HTTP response (including 404) unless -f is given, so the
# fallback has to be driven by the status code rather than curl's exit status.
# `|| true` keeps `set -e` from aborting the script silently when the host is
# unreachable; in that case -w prints "000" and the failure message below runs.
RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" "$PING_URL/health" || true)
if [ "$RESPONSE" != "200" ]; then
    RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" "$PING_URL/" || true)
fi

if [ "$RESPONSE" != "200" ]; then
    echo "❌ FAILED: Space at $PING_URL returned $RESPONSE (expected 200)"
    echo " Ensure your Space is 'Running' and public."
    exit 1
fi
echo "✅ PASSED: Connection OK"
|
| 29 |
+
|
| 30 |
+
echo "--- 🔍 2. Validating OpenEnv Spec ---"
|
| 31 |
+
cd "$REPO_DIR"
|
| 32 |
+
if command -v openenv &>/dev/null; then
|
| 33 |
+
if ! openenv validate; then
|
| 34 |
+
echo "❌ FAILED: openenv validate failed. Check your openenv.yaml syntax."
|
| 35 |
+
exit 1
|
| 36 |
+
fi
|
| 37 |
+
echo "✅ PASSED: openenv.yaml is valid"
|
| 38 |
+
else
|
| 39 |
+
echo "⚠️ WARNING: 'openenv' command not found. Skipping local spec validation."
|
| 40 |
+
echo " (Ensure you've run 'pip install openenv-core' if you want this check)"
|
| 41 |
+
fi
|
| 42 |
+
|
| 43 |
+
echo "--- 🔍 3. Checking Mandatory Logging Format ---"
|
| 44 |
+
# The judge requires [START], [STEP], and [END] in stdout
|
| 45 |
+
if grep -q "\[START\]" "inference.py" && grep -q "\[STEP\]" "inference.py" && grep -q "\[END\]" "inference.py"; then
|
| 46 |
+
echo "✅ PASSED: inference.py contains mandatory [START/STEP/END] logs."
|
| 47 |
+
else
|
| 48 |
+
echo "❌ FAILED: inference.py is missing mandatory structured logging."
|
| 49 |
+
echo " See documentation for [START], [STEP], and [END] format."
|
| 50 |
+
exit 1
|
| 51 |
+
fi
|
| 52 |
+
|
| 53 |
+
echo "--- 🔍 4. Verifying File Structure ---"
|
| 54 |
+
FILES=("inference.py" "problems.json" "server/Dockerfile" "openenv.yaml")
|
| 55 |
+
for f in "${FILES[@]}"; do
|
| 56 |
+
if [ ! -f "$f" ]; then
|
| 57 |
+
echo "❌ FAILED: Missing required file: $f"
|
| 58 |
+
exit 1
|
| 59 |
+
fi
|
| 60 |
+
done
|
| 61 |
+
echo "✅ PASSED: All core files exist."
|
| 62 |
+
|
| 63 |
+
echo "--- 🔍 5. Checking Task Count ---"
|
| 64 |
+
TASK_COUNT=$(grep -c "id:" "openenv.yaml" || true)
|
| 65 |
+
if [ "$TASK_COUNT" -lt 3 ]; then
|
| 66 |
+
echo "❌ FAILED: Found only $TASK_COUNT tasks in openenv.yaml (minimum 3 required)."
|
| 67 |
+
exit 1
|
| 68 |
+
fi
|
| 69 |
+
echo "✅ PASSED: Found $TASK_COUNT tasks."
|
| 70 |
+
|
| 71 |
+
echo ""
|
| 72 |
+
echo "🎉 SUCCESS: Your submission has passed all local checks!"
|
| 73 |
+
echo "You are ready to submit your Space URL: $PING_URL"
|
server/Dockerfile
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Multi-stage build for Rust Coder Environment
|
| 2 |
+
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
|
| 3 |
+
FROM ${BASE_IMAGE} AS builder
|
| 4 |
+
|
| 5 |
+
# 1. Environment Setup
|
| 6 |
+
USER root
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
|
| 9 |
+
# Install build essentials for Rust (linker, etc.)
|
| 10 |
+
RUN apt-get update && \
|
| 11 |
+
apt-get install -y --no-install-recommends git curl build-essential ca-certificates && \
|
| 12 |
+
rm -rf /var/lib/apt/lists/*
|
| 13 |
+
|
| 14 |
+
# 2. Create the non-root user (Hugging Face default)
|
| 15 |
+
RUN useradd -m -u 1000 user
|
| 16 |
+
USER user
|
| 17 |
+
ENV HOME=/home/user \
|
| 18 |
+
PATH=/home/user/.cargo/bin:/home/user/.local/bin:$PATH
|
| 19 |
+
|
| 20 |
+
# 3. Install Rust toolchain as 'user'
|
| 21 |
+
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
|
| 22 |
+
RUN rustup toolchain install stable
|
| 23 |
+
|
| 24 |
+
# 4. Copy environment code and install Python dependencies
|
| 25 |
+
WORKDIR $HOME/app/env
|
| 26 |
+
COPY --chown=user . $HOME/app/env
|
| 27 |
+
|
| 28 |
+
# Install uv (if not present) and then the virtual environment
|
| 29 |
+
RUN if ! command -v uv >/dev/null 2>&1; then \
|
| 30 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh; \
|
| 31 |
+
fi
|
| 32 |
+
|
| 33 |
+
RUN --mount=type=cache,target=/home/user/.cache/uv,uid=1000,gid=1000 \
|
| 34 |
+
uv sync --no-editable
|
| 35 |
+
|
| 36 |
+
# -------------------------------------------------------------
|
| 37 |
+
# Final Runtime Stage
|
| 38 |
+
# -------------------------------------------------------------
|
| 39 |
+
FROM ${BASE_IMAGE}
|
| 40 |
+
|
| 41 |
+
USER root
|
| 42 |
+
RUN apt-get update && \
|
| 43 |
+
apt-get install -y --no-install-recommends curl build-essential ca-certificates && \
|
| 44 |
+
rm -rf /var/lib/apt/lists/*
|
| 45 |
+
|
| 46 |
+
# Create the user again in the final stage
|
| 47 |
+
RUN useradd -m -u 1000 user
|
| 48 |
+
USER user
|
| 49 |
+
# /home/user/.local/bin is included because the uv installer script run later in
# this stage may place the `uv` binary there (the builder stage already has this
# directory on PATH); without it the subsequent `uv sync` can fail to find uv.
ENV HOME=/home/user \
    PATH="/home/user/app/env/.venv/bin:/home/user/.cargo/bin:/home/user/.local/bin:$PATH" \
    PYTHONPATH="/home/user/app/env:$PYTHONPATH"
|
| 52 |
+
|
| 53 |
+
# 4. Copy Cargo/Rustup from builder and then the local code
|
| 54 |
+
COPY --from=builder --chown=user /home/user/.cargo /home/user/.cargo
|
| 55 |
+
COPY --from=builder --chown=user /home/user/.rustup /home/user/.rustup
|
| 56 |
+
|
| 57 |
+
WORKDIR $HOME/app/env
|
| 58 |
+
COPY --chown=user . $HOME/app/env
|
| 59 |
+
|
| 60 |
+
# 5. Install uv and Python dependencies in the FINAL stage
|
| 61 |
+
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 62 |
+
RUN uv sync --no-editable
|
| 63 |
+
|
| 64 |
+
# -------------------------------------------------------------
|
| 65 |
+
# Final Config
|
| 66 |
+
# -------------------------------------------------------------
|
| 67 |
+
EXPOSE 8000
|
| 68 |
+
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
|
| 69 |
+
CMD curl -f http://localhost:8000/health || exit 1
|
| 70 |
+
|
| 71 |
+
# Start the server with the virtualenv's uvicorn binary directly (no 'uv run' wrapper) so the .venv interpreter and packages are used
|
| 72 |
+
CMD ["/home/user/app/env/.venv/bin/uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000", "--log-level", "debug"]
|
server/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Rust Coder environment server components."""
|
| 8 |
+
|
| 9 |
+
from .rust_coder_environment import RustCoderEnvironment
|
| 10 |
+
|
| 11 |
+
__all__ = ["RustCoderEnvironment"]
|
server/app.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI application for the Rust Coder Environment.
|
| 3 |
+
|
| 4 |
+
Endpoints:
|
| 5 |
+
POST /reset — Start new episode (loads next problem)
|
| 6 |
+
POST /step — Submit Rust code for evaluation
|
| 7 |
+
GET /state — Get current episode state
|
| 8 |
+
GET /schema — Action/observation JSON schemas
|
| 9 |
+
WS /ws — WebSocket for persistent sessions
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import gradio as gr
|
| 14 |
+
from openai import OpenAI
|
| 15 |
+
from dotenv import load_dotenv
|
| 16 |
+
from openenv.core.env_server.http_server import create_app
|
| 17 |
+
|
| 18 |
+
from models import RustCoderAction, RustCoderObservation
|
| 19 |
+
from server.rust_coder_environment import RustCoderEnvironment
|
| 20 |
+
|
| 21 |
+
# Load environment variables via python-dotenv before any os.getenv() call below.
load_dotenv()

# --- Core OpenEnv Server Setup ---
# Use a distinct name for the OpenEnv FastAPI instance
# (the module-level name `app` is bound later, after the Gradio UI is mounted).
openenv_app = create_app(
    RustCoderEnvironment,      # environment class
    RustCoderAction,           # action model accepted by step()
    RustCoderObservation,      # observation model returned by reset()/step()
    env_name="rust_coder",
    # NOTE(review): capped at 1 although the environment class declares
    # SUPPORTS_CONCURRENT_SESSIONS = True — confirm this is intentional.
    max_concurrent_envs=1,
)
|
| 32 |
+
|
| 33 |
+
# Add a health check endpoint for Docker directly to the base app
|
| 34 |
+
@openenv_app.get("/health")
async def health_check():
    """Liveness probe: always answer with a fixed "healthy" status payload."""
    status_payload = {"status": "healthy"}
    return status_payload
|
| 37 |
+
|
| 38 |
+
# --- Shared Logic ---
|
| 39 |
+
# Settings for the OpenAI-compatible client. Each value comes from the
# environment; the right-hand default applies when the variable is unset or empty.
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
# HF_TOKEN wins over API_KEY; the result is None when neither is set.
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 42 |
+
|
| 43 |
+
def _strip_code_fence(text: str) -> str:
    """Return the contents of the first Markdown code fence in ``text``.

    A fence tagged ``rust`` is preferred. For any other fence, a single-token
    info string on the opening line (e.g. ``rs``) is dropped so that it does
    not end up as the first line of the extracted code. Text without a fence
    is returned stripped but otherwise unchanged.
    """
    fence = "```"
    if "```rust" in text:
        body = text.split("```rust", 1)[1]
    elif fence in text:
        body = text.split(fence, 1)[1]
        first_line, newline, remainder = body.partition("\n")
        tag = first_line.strip()
        # Only treat the first line as a language tag when it is one short
        # token; anything else is assumed to be code on the fence line.
        if newline and tag and all(ch.isalnum() or ch in "+-_.#" for ch in tag):
            body = remainder
    else:
        return text.strip()
    return body.split(fence, 1)[0].strip()


def get_llm_solution(problem_desc: str):
    """Ask the configured LLM for a Rust solution to ``problem_desc``.

    Args:
        problem_desc: Problem statement sent to the model as the user message.

    Returns:
        The model's answer with any Markdown code fence removed. If anything
        goes wrong (client construction, request, response parsing), a string
        starting with ``"// LLM Error: "`` is returned instead of raising;
        callers detect failure by that prefix.
    """
    try:
        client_llm = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
        completion = client_llm.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are an expert Rust developer. Respond ONLY with the code solution, no explanation."},
                {"role": "user", "content": f"Fix the following Rust problem:\n{problem_desc}"},
            ],
            temperature=0.2,
        )
        # message.content may be None; normalise to an empty string.
        text = (completion.choices[0].message.content or "").strip()
        return _strip_code_fence(text)
    except Exception as e:
        # Deliberate catch-all: the UI shows this marker instead of crashing.
        return f"// LLM Error: {e}"
|
| 64 |
+
|
| 65 |
+
def evaluate_single(problem_id, code=None):
    """Evaluate one problem, generating the solution with the LLM if needed.

    Args:
        problem_id: Dropdown label of the form ``"<id>: <title> (...)"``; the
            integer before the first colon is the 1-based problem number.
        code: Rust source to evaluate. When falsy, the LLM is asked instead.

    Returns:
        A ``(solution_code, metrics)`` tuple. On failure the first element is
        a ``// ...`` comment describing the error and the second a dict with
        an ``"error"`` key; no exception escapes to the UI.
    """
    try:
        idx = int(problem_id.split(":")[0]) - 1

        # One environment instance is enough: it supplies the problem list
        # and performs the evaluation (the problem file is loaded only once).
        env = RustCoderEnvironment()
        if not 0 <= idx < len(env.problems):
            # A negative index would otherwise silently select a problem
            # from the end of the list.
            raise ValueError(f"Unknown problem id in {problem_id!r}")
        problem = env.problems[idx]

        # 1. Get code from LLM if not provided
        solution_code = code if code else get_llm_solution(problem["description"])

        # 2. Guard: If LLM failed, do not evaluate
        if solution_code.startswith("// LLM Error"):
            return solution_code, {"error": "LLM failed to generate a solution. Check your HF_TOKEN."}

        # 3. Evaluate: reset to the requested problem, then score the code
        env.reset(start_index=idx)
        state = env.step(RustCoderAction(code=solution_code))

        metrics = {
            "Total Reward": f"{state.reward:.2f}",
            "Compilation": "Success" if state.compilation_success else "Failed",
            "Metrics": state.reward_breakdown,
        }
        return solution_code, metrics
    except Exception as e:
        return f"// Error: {e}", {"error": f"Evaluation system error: {e}"}
|
| 92 |
+
|
| 93 |
+
def run_benchmark(progress=gr.Progress()):
    """Score the LLM on every problem and summarise the results.

    Returns a ``(markdown_summary, rows)`` pair. Each row is
    ``[id, title, difficulty, reward, compile status]``. Any failure is
    reported in the markdown string together with an empty row list.
    """
    try:
        env = RustCoderEnvironment()

        # Refuse to start without credentials for the LLM endpoint.
        if not (os.getenv("HF_TOKEN") or os.getenv("API_KEY")):
            return "## Error: HF_TOKEN is not set. Add it to your HF Space secrets or local .env file.", []

        task_count = len(env.problems)
        rows = []
        total_score = 0.0

        for position, problem in enumerate(env.problems):
            progress(position/task_count, desc=f"Benchmarking Task {position+1}...")
            code = get_llm_solution(problem["description"])

            if code.startswith("// LLM Error"):
                # No usable solution: score zero without touching the environment.
                reward = 0.0
                compiled = "Failed (LLM Error)"
            else:
                env.reset(start_index=position)
                state = env.step(RustCoderAction(code=code))
                reward = state.reward
                compiled = "Success" if state.compilation_success else "Failed"

            rows.append([
                problem["id"],
                problem["title"],
                problem.get("difficulty", "N/A"),
                f"{reward:.2f}",
                compiled,
            ])
            total_score += reward

        avg_score = total_score / task_count
        return f"## Benchmark Summary\n**Final Environment Score: {avg_score:.2f} / 1.0**", rows
    except Exception as e:
        return f"### Benchmark Error: {e}", []
|
| 127 |
+
|
| 128 |
+
# --- Build the Gradio UI ---
|
| 129 |
+
def create_dashboard():
    """Build and return the Gradio Blocks UI with two tabs.

    "Individual Task Evaluation" lets the user pick one problem and have the
    LLM solve it; "Full Environment Benchmark" runs every problem in turn.
    """
    with gr.Blocks(title="Rust Coder Evaluation Dashboard") as demo:
        gr.Markdown("# 🦀 Rust Coder: LLM Evaluation Dashboard")

        with gr.Tab("Individual Task Evaluation"):
            with gr.Row():
                with gr.Column(scale=1):
                    # A throwaway environment is used only as the source of the problem list.
                    p_env = RustCoderEnvironment()
                    # Labels start with "<id>:"; update_desc and evaluate_single parse that prefix.
                    p_list = [f"{p['id']}: {p['title']} ({p.get('difficulty', 'N/A')})" for p in p_env.problems]
                    dropdown = gr.Dropdown(choices=p_list, label="Select Question", value=p_list[0])
                    desc = gr.Markdown(value=f"### Question [{p_env.problems[0].get('difficulty', 'N/A')}]\n{p_env.problems[0]['description']}")

                with gr.Column(scale=1):
                    run_llm_btn = gr.Button("Generate Solution & Evaluate", variant="primary")
                    code_display = gr.Code(label="AI Generated Solution", interactive=False)
                    results_json = gr.JSON(label="Metric Breakdown")

            def update_desc(p_str):
                # Map the label's leading number to a 0-based list index.
                # NOTE(review): assumes ids are 1..N in list order — confirm against problems.json.
                idx = int(p_str.split(":")[0]) - 1
                p = p_env.problems[idx]
                return f"### Question [{p.get('difficulty', 'N/A')}]\n{p['description']}", ""  # Clear solution on change

            dropdown.change(update_desc, inputs=[dropdown], outputs=[desc, code_display])
            # Only the dropdown value is passed, so evaluate_single's `code` stays None
            # and the solution always comes from the LLM.
            run_llm_btn.click(evaluate_single, inputs=[dropdown], outputs=[code_display, results_json])

        with gr.Tab("Full Environment Benchmark"):
            gr.Markdown("### Complete Environment Suite")
            gr.Markdown("Runs the LLM against all 10 tasks sequentially to determine the global OpenEnv score.")

            b_summarize = gr.Button("Run Performance Benchmark", variant="stop")
            b_sum = gr.Markdown()
            b_grid = gr.Dataframe(headers=["ID", "Title", "Difficulty", "Reward", "Compiled"], label="Task Results")

            b_summarize.click(run_benchmark, outputs=[b_sum, b_grid])

    return demo
|
| 165 |
+
|
| 166 |
+
# Final consolidated Gradio App mounted on the FastAPI server.
# The dashboard is built at import time and attached at path "/"; `app` is the
# object handed to uvicorn (see main() and the Dockerfile CMD "server.app:app").
app = gr.mount_gradio_app(openenv_app, create_dashboard(), path="/")
|
| 168 |
+
|
| 169 |
+
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
    """Serve the combined FastAPI + Gradio application with uvicorn.

    Used as the entry point for ``uv run server`` and ``python -m server.app``.
    """
    from uvicorn import run as serve

    serve(app, host=host, port=port)
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
if __name__ == "__main__":
|
| 176 |
+
main()
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv[core]>=0.2.0
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
uvicorn>=0.24.0
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
|
server/rust_coder_environment.py
ADDED
|
@@ -0,0 +1,434 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Rust Coder Environment Implementation.
|
| 3 |
+
|
| 4 |
+
Evaluates LLM-generated Rust code against 10 sequential coding problems.
|
| 5 |
+
Multi-dimensional reward system: Compilation(40%), Correctness(20%),
|
| 6 |
+
Coverage(20%), Elegance(10%), Efficiency(10%).
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import os
|
| 11 |
+
import re
|
| 12 |
+
import subprocess
|
| 13 |
+
import tempfile
|
| 14 |
+
import time
|
| 15 |
+
from typing import Dict, List, Optional, Tuple
|
| 16 |
+
|
| 17 |
+
from openenv.core.env_server.interfaces import Environment
|
| 18 |
+
|
| 19 |
+
from models import RustCoderAction, RustCoderObservation
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# Resolve problems.json: look in same dir as this file, then parent
_HERE = os.path.dirname(os.path.abspath(__file__))  # directory containing this module
# Candidate locations, tried in this order by _find_problems_file().
_PROBLEMS_PATHS = [
    os.path.join(_HERE, "problems.json"),        # server/problems.json
    os.path.join(_HERE, "..", "problems.json"),  # root problems.json
    "problems.json",                             # cwd fallback
]
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _find_problems_file() -> str:
    """Return the absolute path of the first candidate problems.json that exists.

    Raises:
        FileNotFoundError: if none of the paths in ``_PROBLEMS_PATHS`` exists.
    """
    located = next(
        (candidate for candidate in _PROBLEMS_PATHS if os.path.exists(candidate)),
        None,
    )
    if located is None:
        raise FileNotFoundError(
            f"problems.json not found. Searched: {_PROBLEMS_PATHS}"
        )
    return os.path.abspath(located)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class RustCoderEnvironment(Environment):
|
| 42 |
+
"""
|
| 43 |
+
OpenEnv-compliant environment for evaluating Rust code submissions.
|
| 44 |
+
|
| 45 |
+
Manages 10 sequential coding problems. Each episode is a single problem:
|
| 46 |
+
- reset() → loads the current problem, returns its description
|
| 47 |
+
- step(action) → compiles & tests submitted code, returns reward
|
| 48 |
+
- After step(), the episode is done; next reset() loads the next problem.
|
| 49 |
+
|
| 50 |
+
Reward breakdown (all components normalized to [0, 1]):
|
| 51 |
+
Compilation 40% — code compiles without errors
|
| 52 |
+
Correctness 20% — fraction of test assertions that pass
|
| 53 |
+
Coverage 20% — fraction of tests attempted to run
|
| 54 |
+
Elegance 10% — code quality heuristics
|
| 55 |
+
Efficiency 10% — execution time vs. problem baseline
|
| 56 |
+
"""
|
| 57 |
+
|
| 58 |
+
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
| 59 |
+
|
| 60 |
+
# Compile / run timeouts (seconds)
|
| 61 |
+
COMPILE_TIMEOUT = 30
|
| 62 |
+
RUN_TIMEOUT = 10
|
| 63 |
+
|
| 64 |
+
def __init__(self) -> None:
    """Start at the first problem with a zeroed step counter and load the problem set."""
    self.step_count: int = 0
    self.current_problem_idx: int = 0
    # Reads problems.json; raises if the file is missing or not a non-empty array.
    self.problems: List[Dict] = self._load_problems()
|
| 69 |
+
|
| 70 |
+
# ------------------------------------------------------------------
|
| 71 |
+
# Internal helpers
|
| 72 |
+
# ------------------------------------------------------------------
|
| 73 |
+
|
| 74 |
+
def _load_problems(self) -> List[Dict]:
    """Read problems.json and return its contents.

    Raises:
        FileNotFoundError: if no problems.json can be located.
        ValueError: if the file's top-level value is not a non-empty array.
    """
    with open(_find_problems_file(), "r", encoding="utf-8") as handle:
        loaded = json.load(handle)
    is_usable = isinstance(loaded, list) and len(loaded) > 0
    if not is_usable:
        raise ValueError("problems.json must be a non-empty JSON array.")
    return loaded
|
| 82 |
+
|
| 83 |
+
def _current_problem(self) -> Dict:
|
| 84 |
+
idx = self.current_problem_idx % len(self.problems)
|
| 85 |
+
return self.problems[idx]
|
| 86 |
+
|
| 87 |
+
# ------------------------------------------------------------------
|
| 88 |
+
# OpenEnv interface
|
| 89 |
+
# ------------------------------------------------------------------
|
| 90 |
+
|
| 91 |
+
@property
def state(self):
    """Return an OpenEnv ``State`` carrying the current step count (episode id is always None)."""
    from openenv.core.env_server.types import State as EnvState

    snapshot = EnvState(episode_id=None, step_count=self.step_count)
    return snapshot
|
| 96 |
+
|
| 97 |
+
def reset(self, start_index: int = 0) -> RustCoderObservation:
    """Begin a new episode at ``start_index`` (first problem by default).

    The index wraps modulo the number of problems and the step counter is
    zeroed. The returned observation carries the selected problem's
    description and starter code, with all evaluation fields blank.
    """
    problem_count = len(self.problems)
    self.current_problem_idx = start_index % problem_count
    self.step_count = 0

    selected = self.problems[self.current_problem_idx]
    initial_fields = dict(
        problem_description=selected["description"],
        starter_code=selected["starter_code"],
        compilation_success=False,
        compilation_output="",
        test_results=[],
        reward_breakdown={},
        done=False,
        reward=0.0,
    )
    return RustCoderObservation(**initial_fields)
|
| 113 |
+
|
| 114 |
+
def step(self, action: RustCoderAction) -> RustCoderObservation:
    """Score ``action.code`` against the current problem, then advance to the next one.

    The reward is a weighted sum of five components, each in [0, 1]:
    compilation 40%, correctness 20%, coverage 20%, elegance 10%,
    efficiency 10%. Tests and efficiency are only evaluated when the
    submission compiles.

    The returned observation reports the results for the submitted code but
    carries the description and starter code of the NEXT problem. ``done``
    becomes True once the last problem has been scored. Calling ``step()``
    again after that returns a terminal zero-reward observation instead of
    indexing past the end of the problem list.
    """
    self.step_count += 1

    if self.current_problem_idx >= len(self.problems):
        # Episode already finished: nothing left to score until reset().
        return RustCoderObservation(
            problem_description="--- ALL TASKS COMPLETED in this episode ---",
            starter_code="",
            compilation_success=False,
            compilation_output="Episode already finished; call reset() to start a new one.",
            test_results=[],
            reward_breakdown={},
            done=True,
            reward=0.0,
        )

    problem = self.problems[self.current_problem_idx]
    code = action.code

    # ── 1. Compilation (40%) ──────────────────────────────────────
    compilation_success, compilation_output = self._compile_check(code)
    r_compilation = 1.0 if compilation_success else 0.0

    # ── 2. Correctness + Coverage (20% each) ─────────────────────
    test_results: List[Dict] = []
    r_correctness = 0.0
    r_coverage = 0.0

    if compilation_success:
        tests = problem.get("tests", [])
        if tests:
            test_results = self._run_tests(code, tests)
            passed = sum(1 for t in test_results if t.get("passed", False))
            ran = sum(1 for t in test_results if t.get("ran", False))
            r_correctness = passed / len(tests)
            r_coverage = ran / len(tests)
        else:
            # No tests defined — give full credit to both dimensions
            r_correctness = 1.0
            r_coverage = 1.0

    # ── 3. Elegance (10%) ─────────────────────────────────────────
    # Scored from the source text alone, so it applies even when compilation fails.
    r_elegance = self._score_elegance(code)

    # ── 4. Efficiency (10%) ───────────────────────────────────────
    baseline_ms: float = problem.get("performance_baseline_ms", 100.0)
    r_efficiency = 0.0
    if compilation_success:
        r_efficiency = self._score_efficiency(code, baseline_ms)

    # ── Total reward ──────────────────────────────────────────────
    reward_breakdown = {
        "Compilation": round(r_compilation, 4),
        "Correctness": round(r_correctness, 4),
        "Coverage": round(r_coverage, 4),
        "Elegance": round(r_elegance, 4),
        "Efficiency": round(r_efficiency, 4),
    }
    # Calculate weighted total reward
    total_reward = round(
        r_compilation * 0.40
        + r_correctness * 0.20
        + r_coverage * 0.20
        + r_elegance * 0.10
        + r_efficiency * 0.10,
        4,
    )

    # ── Advance Logic ─────────────────────────────────────────────
    self.current_problem_idx += 1
    done = self.current_problem_idx >= len(self.problems)

    next_prob_desc = "--- ALL TASKS COMPLETED in this episode ---"
    next_starter = ""
    if not done:
        next_prob = self.problems[self.current_problem_idx]
        next_prob_desc = f"--- NEXT TASK: {next_prob['title']} ---\n\n{next_prob['description']}"
        next_starter = next_prob['starter_code']

    return RustCoderObservation(
        problem_description=next_prob_desc,
        starter_code=next_starter,
        compilation_success=compilation_success,
        compilation_output=compilation_output[:2000],  # cap length
        test_results=test_results,
        reward_breakdown=reward_breakdown,
        done=done,
        reward=total_reward,
    )
|
| 190 |
+
|
| 191 |
+
# ------------------------------------------------------------------
|
| 192 |
+
# Compilation
|
| 193 |
+
# ------------------------------------------------------------------
|
| 194 |
+
|
| 195 |
+
def _compile_check(self, code: str) -> Tuple[bool, str]:
|
| 196 |
+
"""
|
| 197 |
+
Compile code as a Rust library crate.
|
| 198 |
+
Returns (success, compiler output).
|
| 199 |
+
"""
|
| 200 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
| 201 |
+
src = os.path.join(tmpdir, "submission.rs")
|
| 202 |
+
out = os.path.join(tmpdir, "submission.rlib")
|
| 203 |
+
with open(src, "w", encoding="utf-8") as f:
|
| 204 |
+
f.write(code)
|
| 205 |
+
try:
|
| 206 |
+
proc = subprocess.run(
|
| 207 |
+
["rustc", "--crate-type=lib", src, "-o", out,
|
| 208 |
+
"--edition=2021"],
|
| 209 |
+
capture_output=True,
|
| 210 |
+
text=True,
|
| 211 |
+
timeout=self.COMPILE_TIMEOUT,
|
| 212 |
+
)
|
| 213 |
+
return proc.returncode == 0, (proc.stdout + proc.stderr).strip()
|
| 214 |
+
except subprocess.TimeoutExpired:
|
| 215 |
+
return False, "Compilation timed out."
|
| 216 |
+
except FileNotFoundError:
|
| 217 |
+
return False, "rustc not found — is the Rust toolchain installed?"
|
| 218 |
+
|
| 219 |
+
# ------------------------------------------------------------------
|
| 220 |
+
# Correctness / Coverage
|
| 221 |
+
# ------------------------------------------------------------------
|
| 222 |
+
|
| 223 |
+
def _strip_main(self, code: str) -> str:
|
| 224 |
+
"""
|
| 225 |
+
Remove fn main() { ... } blocks from submitted code so we can
|
| 226 |
+
inject our own test main. Handles simple single-level braces.
|
| 227 |
+
"""
|
| 228 |
+
# Remove pub/private fn main() { ... }
|
| 229 |
+
pattern = re.compile(
|
| 230 |
+
r'(pub\s+)?fn\s+main\s*\(\s*\)\s*(?:->\s*[^{]+)?\s*\{',
|
| 231 |
+
re.MULTILINE,
|
| 232 |
+
)
|
| 233 |
+
match = pattern.search(code)
|
| 234 |
+
if not match:
|
| 235 |
+
return code
|
| 236 |
+
start = match.start()
|
| 237 |
+
depth = 0
|
| 238 |
+
i = match.end() - 1 # position of the opening '{'
|
| 239 |
+
while i < len(code):
|
| 240 |
+
if code[i] == '{':
|
| 241 |
+
depth += 1
|
| 242 |
+
elif code[i] == '}':
|
| 243 |
+
depth -= 1
|
| 244 |
+
if depth == 0:
|
| 245 |
+
return code[:start] + code[i + 1:]
|
| 246 |
+
i += 1
|
| 247 |
+
return code # malformed; return as-is
|
| 248 |
+
|
| 249 |
+
def _build_test_binary(
    self, code: str, assertion: str, tmpdir: str, test_name: str
) -> Tuple[bool, str, str]:
    """Build a Rust binary that runs one test assertion against the submission.

    The submission's own ``main`` is stripped and replaced by a ``main``
    that executes ``assertion`` and then prints ``PASS:<test_name>``.

    Args:
        code: Submitted Rust source.
        assertion: Rust statement(s) to run inside the generated ``main``.
        tmpdir: Existing directory that receives the source and the binary.
        test_name: Used for the file names and the PASS marker.

    Returns:
        ``(compiled_ok, binary_path, compiler_output)``; ``binary_path`` is
        empty when rustc timed out or is not installed.
    """
    body = self._strip_main(code)
    # The lint allowances are a crate-level *inner* attribute. An outer
    # `#[allow]` here would attach to the first item only and would be a hard
    # error when the submission itself starts with `#![...]` or `//!`.
    src_code = (
        "\n#![allow(unused_imports, dead_code, unused_variables, unused_mut)]\n"
        f"{body}\n"
        "\n"
        "fn main() {\n"
        f"{assertion};\n"
        f'println!("PASS:{test_name}");\n'
        "}\n"
    )
    src_path = os.path.join(tmpdir, f"{test_name}.rs")
    bin_path = os.path.join(tmpdir, test_name)
    with open(src_path, "w", encoding="utf-8") as f:
        f.write(src_code)
    try:
        proc = subprocess.run(
            ["rustc", src_path, "-o", bin_path, "--edition=2021"],
            capture_output=True,
            text=True,
            timeout=self.COMPILE_TIMEOUT,
        )
        return proc.returncode == 0, bin_path, (proc.stdout + proc.stderr).strip()
    except subprocess.TimeoutExpired:
        return False, "", "Compile timed out for test."
    except FileNotFoundError:
        return False, "", "rustc not found."
|
| 282 |
+
|
| 283 |
+
def _run_tests(self, code: str, tests: List[Dict]) -> List[Dict]:
|
| 284 |
+
"""
|
| 285 |
+
Run each test assertion as a separate Rust binary.
|
| 286 |
+
Returns list of result dicts with keys: name, passed, ran, error.
|
| 287 |
+
"""
|
| 288 |
+
results = []
|
| 289 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
| 290 |
+
for i, test in enumerate(tests):
|
| 291 |
+
name = test.get("name", f"test_{i}")
|
| 292 |
+
assertion = test.get("test_assertion", "")
|
| 293 |
+
should_compile = test.get("should_compile", True)
|
| 294 |
+
|
| 295 |
+
result: Dict = {
|
| 296 |
+
"name": name,
|
| 297 |
+
"passed": False,
|
| 298 |
+
"ran": False,
|
| 299 |
+
"error": None,
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
+
if not assertion:
|
| 303 |
+
result["error"] = "No test assertion defined."
|
| 304 |
+
results.append(result)
|
| 305 |
+
continue
|
| 306 |
+
|
| 307 |
+
# Some tests are expected to fail compilation (should_compile=False)
|
| 308 |
+
# treat successful compilation + correct output as pass
|
| 309 |
+
bin_test_name = f"t{i}_{name[:20]}"
|
| 310 |
+
compiled, bin_path, compiler_out = self._build_test_binary(
|
| 311 |
+
code, assertion, tmpdir, bin_test_name
|
| 312 |
+
)
|
| 313 |
+
|
| 314 |
+
if not compiled:
|
| 315 |
+
if not should_compile:
|
| 316 |
+
# The problem's starter code deliberately doesn't compile;
|
| 317 |
+
# if the submission also doesn't compile this test → skip
|
| 318 |
+
result["error"] = "Binary compile failed (expected for broken starter)."
|
| 319 |
+
else:
|
| 320 |
+
result["error"] = f"Compile error: {compiler_out[:300]}"
|
| 321 |
+
result["ran"] = False
|
| 322 |
+
results.append(result)
|
| 323 |
+
continue
|
| 324 |
+
|
| 325 |
+
# Run the binary
|
| 326 |
+
result["ran"] = True
|
| 327 |
+
try:
|
| 328 |
+
run_proc = subprocess.run(
|
| 329 |
+
[bin_path],
|
| 330 |
+
capture_output=True,
|
| 331 |
+
text=True,
|
| 332 |
+
timeout=self.RUN_TIMEOUT,
|
| 333 |
+
)
|
| 334 |
+
stdout = run_proc.stdout.strip()
|
| 335 |
+
if run_proc.returncode == 0 and f"PASS:{bin_test_name}" in stdout:
|
| 336 |
+
result["passed"] = True
|
| 337 |
+
else:
|
| 338 |
+
result["error"] = (
|
| 339 |
+
f"Test failed. Exit={run_proc.returncode}. "
|
| 340 |
+
f"stderr={run_proc.stderr[:200]}"
|
| 341 |
+
)
|
| 342 |
+
except subprocess.TimeoutExpired:
|
| 343 |
+
result["error"] = "Test execution timed out."
|
| 344 |
+
except Exception as exc:
|
| 345 |
+
result["error"] = str(exc)
|
| 346 |
+
|
| 347 |
+
results.append(result)
|
| 348 |
+
|
| 349 |
+
return results
|
| 350 |
+
|
| 351 |
+
# ------------------------------------------------------------------
|
| 352 |
+
# Elegance scoring
|
| 353 |
+
# ------------------------------------------------------------------
|
| 354 |
+
|
| 355 |
+
def _score_elegance(self, code: str) -> float:
|
| 356 |
+
"""
|
| 357 |
+
Heuristic code-quality score in [0, 1].
|
| 358 |
+
|
| 359 |
+
Penalties:
|
| 360 |
+
- Each `.unwrap()` call → -0.15 (max -0.45)
|
| 361 |
+
- Each `.expect(` call → -0.05 (max -0.15)
|
| 362 |
+
- Lines > 100 chars → -0.05 per violation (max -0.20)
|
| 363 |
+
- `unsafe` blocks → -0.20 unless problem requires FFI
|
| 364 |
+
|
| 365 |
+
Bonuses:
|
| 366 |
+
- Uses `?` operator → +0.10
|
| 367 |
+
- Uses `match` expressions → +0.05
|
| 368 |
+
- Has doc comments (`///`) → +0.05
|
| 369 |
+
"""
|
| 370 |
+
score = 1.0
|
| 371 |
+
|
| 372 |
+
unwrap_count = len(re.findall(r'\.unwrap\(\)', code))
|
| 373 |
+
score -= min(unwrap_count * 0.15, 0.45)
|
| 374 |
+
|
| 375 |
+
expect_count = len(re.findall(r'\.expect\(', code))
|
| 376 |
+
score -= min(expect_count * 0.05, 0.15)
|
| 377 |
+
|
| 378 |
+
long_lines = sum(1 for line in code.splitlines() if len(line) > 100)
|
| 379 |
+
score -= min(long_lines * 0.05, 0.20)
|
| 380 |
+
|
| 381 |
+
if "unsafe" in code:
|
| 382 |
+
score -= 0.20
|
| 383 |
+
|
| 384 |
+
if "?" in code:
|
| 385 |
+
score += 0.10
|
| 386 |
+
if "match " in code or "match\n" in code:
|
| 387 |
+
score += 0.05
|
| 388 |
+
if "///" in code:
|
| 389 |
+
score += 0.05
|
| 390 |
+
|
| 391 |
+
return round(max(0.0, min(1.0, score)), 4)
|
| 392 |
+
|
| 393 |
+
# ------------------------------------------------------------------
|
| 394 |
+
# Efficiency scoring
|
| 395 |
+
# ------------------------------------------------------------------
|
| 396 |
+
|
| 397 |
+
def _score_efficiency(self, code: str, baseline_ms: float) -> float:
    """Score run time against ``baseline_ms``; higher is faster.

    The submission's ``main`` is stripped and replaced with an empty one, so
    the timed run covers process start-up of the compiled binary only.

    Returns:
        ``min(1.0, baseline_ms / elapsed_ms)`` rounded to 4 places, or 0.0
        when the binary fails to compile, exits non-zero, times out, or any
        other error occurs.
    """
    body = self._strip_main(code)
    # Build a binary with an empty main to measure startup + run overhead.
    # The lint allowances are a crate-level *inner* attribute: an outer
    # `#[allow]` would be a hard error ahead of a submission that starts
    # with `#![...]` or `//!`.
    test_src = (
        "\n#![allow(unused_imports, dead_code, unused_variables)]\n"
        f"{body}\n"
        "\n"
        "fn main() {}\n"
    )
    with tempfile.TemporaryDirectory() as tmpdir:
        src_path = os.path.join(tmpdir, "eff.rs")
        bin_path = os.path.join(tmpdir, "eff")
        with open(src_path, "w", encoding="utf-8") as f:
            f.write(test_src)
        try:
            # Compile
            proc = subprocess.run(
                ["rustc", src_path, "-o", bin_path, "--edition=2021"],
                capture_output=True, text=True, timeout=self.COMPILE_TIMEOUT,
            )
            if proc.returncode != 0:
                return 0.0
            # Time the run
            t0 = time.monotonic()
            run_proc = subprocess.run(
                [bin_path], capture_output=True, timeout=self.RUN_TIMEOUT
            )
            elapsed_ms = (time.monotonic() - t0) * 1000.0
            if run_proc.returncode != 0:
                return 0.0
            # Floor the elapsed time at 0.1 ms to avoid dividing by ~0.
            return round(min(1.0, baseline_ms / max(elapsed_ms, 0.1)), 4)
        except Exception:
            # Best effort: any failure (timeout, missing rustc, OS error) scores zero.
            return 0.0
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|