Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- README.md +8 -2
- client.py +1 -1
- inference.py +22 -11
- models.py +1 -1
- problems.json +10 -10
- server/app.py +14 -223
- server/rust_coder_environment.py +60 -22
README.md
CHANGED
|
@@ -49,7 +49,7 @@ The environment returns detailed feedback after each submission:
|
|
| 49 |
| Field | Type | Description |
|
| 50 |
|------------------------|-------------|-----------------------------------------------------|
|
| 51 |
| `problem_description` | string | Task requirements and context |
|
| 52 |
-
| `
|
| 53 |
| `compilation_success` | bool | Whether `rustc` compiled the submitted code |
|
| 54 |
| `compilation_output` | string | Raw compiler errors and warnings |
|
| 55 |
| `test_results` | list[dict] | Per-test pass/fail results with error details |
|
|
@@ -213,6 +213,12 @@ rust_coder/
|
|
| 213 |
├── inference.py # Baseline inference script (entry point)
|
| 214 |
├── __init__.py # Package exports
|
| 215 |
└── server/
|
| 216 |
-
    ├── app.py # FastAPI
|
| 217 |
    └── rust_coder_environment.py # Core environment logic
|
| 218 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
| Field | Type | Description |
|
| 50 |
|------------------------|-------------|-----------------------------------------------------|
|
| 51 |
| `problem_description` | string | Task requirements and context |
|
| 52 |
+
| `header_section` | string | LeetCode-style scaffold (imports + signatures/types) |
|
| 53 |
| `compilation_success` | bool | Whether `rustc` compiled the submitted code |
|
| 54 |
| `compilation_output` | string | Raw compiler errors and warnings |
|
| 55 |
| `test_results` | list[dict] | Per-test pass/fail results with error details |
|
|
|
|
| 213 |
├── inference.py # Baseline inference script (entry point)
|
| 214 |
├── __init__.py # Package exports
|
| 215 |
└── server/
|
| 216 |
+
    ├── app.py # FastAPI OpenEnv server entrypoint
|
| 217 |
    └── rust_coder_environment.py # Core environment logic
|
| 218 |
```
|
| 219 |
+
|
| 220 |
+
## HF Space runtime model
|
| 221 |
+
|
| 222 |
+
- The Hugging Face Space serves the environment via `uvicorn server.app:app` (see `openenv.yaml` and `Dockerfile`).
|
| 223 |
+
- The built-in OpenEnv web UI may send an empty action on Step; this environment supports that by auto-calling the LLM when `action.code` is empty (unless disabled via `AUTO_LLM_ON_EMPTY_STEP=0`).
|
| 224 |
+
- `inference.py` is the required baseline runner used by the validator/judge. It connects to the running Space and drives `reset()`/`step()` in a loop, emitting strict `[START]`/`[STEP]`/`[END]` stdout lines.
|
client.py
CHANGED
|
@@ -44,7 +44,7 @@ class RustCoderEnv(
|
|
| 44 |
obs_data = payload.get("observation", {})
|
| 45 |
observation = RustCoderObservation(
|
| 46 |
problem_description=obs_data.get("problem_description", ""),
|
| 47 |
-
|
| 48 |
compilation_success=obs_data.get("compilation_success", False),
|
| 49 |
compilation_output=obs_data.get("compilation_output", ""),
|
| 50 |
test_results=obs_data.get("test_results", []),
|
|
|
|
| 44 |
obs_data = payload.get("observation", {})
|
| 45 |
observation = RustCoderObservation(
|
| 46 |
problem_description=obs_data.get("problem_description", ""),
|
| 47 |
+
header_section=obs_data.get("header_section", ""),
|
| 48 |
compilation_success=obs_data.get("compilation_success", False),
|
| 49 |
compilation_output=obs_data.get("compilation_output", ""),
|
| 50 |
test_results=obs_data.get("test_results", []),
|
inference.py
CHANGED
|
@@ -34,17 +34,28 @@ from models import RustCoderAction
|
|
| 34 |
|
| 35 |
# --- Strict Logging Helpers ---
|
| 36 |
def log_start(task: str, env: str, model: str):
|
| 37 |
-
|
|
|
|
| 38 |
|
| 39 |
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str] = None):
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
def log_end(success: bool, steps: int, score: float, rewards: List[float]):
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
# --- LLM Solution Logic ---
|
| 50 |
async def get_model_code(prompt: str, client: OpenAI) -> str:
|
|
@@ -110,10 +121,10 @@ async def main():
|
|
| 110 |
|
| 111 |
steps_taken = step
|
| 112 |
|
| 113 |
-
# Format prompt including
|
| 114 |
prompt = obs.problem_description
|
| 115 |
-
if obs
|
| 116 |
-
prompt += f"\n\
|
| 117 |
|
| 118 |
# 1. Ask model for solution to current task
|
| 119 |
code_solution = await get_model_code(prompt, client)
|
|
@@ -126,7 +137,7 @@ async def main():
|
|
| 126 |
done = result.done
|
| 127 |
|
| 128 |
rewards.append(reward)
|
| 129 |
-
log_step(step=step, action=code_solution, reward=reward, done=done)
|
| 130 |
|
| 131 |
if done:
|
| 132 |
break
|
|
|
|
| 34 |
|
| 35 |
# --- Strict Logging Helpers ---
|
| 36 |
def log_start(task: str, env: str, model: str):
|
| 37 |
+
# REQUIRED exact stdout format (no quotes)
|
| 38 |
+
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 39 |
|
| 40 |
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str] = None):
|
| 41 |
+
# REQUIRED exact stdout format:
|
| 42 |
+
# [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 43 |
+
action_str = (action or "").replace("\r", "\\r").replace("\n", "\\n")
|
| 44 |
+
action_str = action_str[:200] # keep single-line + bounded
|
| 45 |
+
err_field = "null" if error is None else str(error).replace("\r", "\\r").replace("\n", "\\n")
|
| 46 |
+
reward_2 = f"{float(reward or 0.0):.2f}"
|
| 47 |
+
print(
|
| 48 |
+
f"[STEP] step={step} action={action_str} reward={reward_2} done={str(bool(done)).lower()} error={err_field}",
|
| 49 |
+
flush=True,
|
| 50 |
+
)
|
| 51 |
|
| 52 |
def log_end(success: bool, steps: int, score: float, rewards: List[float]):
|
| 53 |
+
# REQUIRED exact stdout format, rewards as comma-separated 2dp
|
| 54 |
+
rewards_str = ",".join(f"{float(r or 0.0):.2f}" for r in rewards)
|
| 55 |
+
print(
|
| 56 |
+
f"[END] success={str(bool(success)).lower()} steps={steps} score={float(score or 0.0):.2f} rewards={rewards_str}",
|
| 57 |
+
flush=True,
|
| 58 |
+
)
|
| 59 |
|
| 60 |
# --- LLM Solution Logic ---
|
| 61 |
async def get_model_code(prompt: str, client: OpenAI) -> str:
|
|
|
|
| 121 |
|
| 122 |
steps_taken = step
|
| 123 |
|
| 124 |
+
# Format prompt including header_section if available
|
| 125 |
prompt = obs.problem_description
|
| 126 |
+
if getattr(obs, "header_section", ""):
|
| 127 |
+
prompt += f"\n\nHeader Section (must be included verbatim in final code):\n```rust\n{obs.header_section}\n```"
|
| 128 |
|
| 129 |
# 1. Ask model for solution to current task
|
| 130 |
code_solution = await get_model_code(prompt, client)
|
|
|
|
| 137 |
done = result.done
|
| 138 |
|
| 139 |
rewards.append(reward)
|
| 140 |
+
log_step(step=step, action=code_solution, reward=reward, done=done, error=None)
|
| 141 |
|
| 142 |
if done:
|
| 143 |
break
|
models.py
CHANGED
|
@@ -24,7 +24,7 @@ class RustCoderObservation(Observation):
|
|
| 24 |
"""Observation space for the Rust Coder environment."""
|
| 25 |
|
| 26 |
problem_description: str = Field(default="", description="The text description of the current coding task, including requirements.")
|
| 27 |
-
|
| 28 |
compilation_success: bool = Field(default=False, description="Binary flag indicating if the last submission compiled.")
|
| 29 |
compilation_output: str = Field(default="", description="Raw stdout/stderr from the rustc compiler.")
|
| 30 |
test_results: list[dict] = Field(default_factory=list, description="A list of results from automated test assertions.")
|
|
|
|
| 24 |
"""Observation space for the Rust Coder environment."""
|
| 25 |
|
| 26 |
problem_description: str = Field(default="", description="The text description of the current coding task, including requirements.")
|
| 27 |
+
header_section: str = Field(default="", description="LeetCode-style header/scaffold (imports + signatures/types) for deterministic evaluation.")
|
| 28 |
compilation_success: bool = Field(default=False, description="Binary flag indicating if the last submission compiled.")
|
| 29 |
compilation_output: str = Field(default="", description="Raw stdout/stderr from the rustc compiler.")
|
| 30 |
test_results: list[dict] = Field(default_factory=list, description="A list of results from automated test assertions.")
|
problems.json
CHANGED
|
@@ -4,7 +4,7 @@
|
|
| 4 |
"title": "Broken CLI Argument Parser",
|
| 5 |
"difficulty": "Easy",
|
| 6 |
"description": "Fix a command-line tool that parses user input to determine file operations (read, write, append). The implementation uses enums and pattern matching but contains: Mismatched types in enum variants, Incomplete match arms, Incorrect handling of optional arguments. The parser must compile and correctly interpret valid command-line inputs like: 'read file.txt' -> FileOp::Read('file.txt'), 'write file.txt content' -> FileOp::Write('file.txt', Some('content')), 'append file.txt' -> FileOp::Append('file.txt')",
|
| 7 |
-
"
|
| 8 |
"tests": [
|
| 9 |
{
|
| 10 |
"name": "parse_read_command",
|
|
@@ -35,7 +35,7 @@
|
|
| 35 |
"title": "Conflicting Borrows in Collection Processing",
|
| 36 |
"difficulty": "Easy\u2192Medium",
|
| 37 |
"description": "Fix a function that processes a vector of strings while conditionally modifying elements and storing references for later use. The implementation mixes mutable and immutable borrows within the same scope, causing borrow checker conflicts. Requirements: Iterate through vector of strings, Store uppercase versions in a results vector, Handle optional transformations without borrowing conflicts, Must compile and execute without panics",
|
| 38 |
-
"
|
| 39 |
"tests": [
|
| 40 |
{
|
| 41 |
"name": "process_single_string",
|
|
@@ -57,7 +57,7 @@
|
|
| 57 |
"title": "Invalid Lifetime Annotations in Text API",
|
| 58 |
"difficulty": "Medium",
|
| 59 |
"description": "Fix a text-processing utility that accepts multiple string slices and returns a reference derived from them. The function either fails to compile or produces incorrect lifetime relationships, risking references that outlive their input data. Requirements: Function must accept multiple &str parameters, Return a &str derived from the inputs, Properly annotate lifetimes, Must be safe (no dangling references)",
|
| 60 |
-
"
|
| 61 |
"tests": [
|
| 62 |
{
|
| 63 |
"name": "longest_text_basic",
|
|
@@ -88,7 +88,7 @@
|
|
| 88 |
"title": "Business Logic Producing Incorrect Results",
|
| 89 |
"difficulty": "Medium",
|
| 90 |
"description": "Fix a module implementing order validation logic including pricing, discounts, and boundary conditions. The code compiles but produces incorrect outputs for edge cases such as: Zero values, Overlapping discounts, Large numeric inputs, Negative prices. Requirements: Calculate order total correctly, Apply discounts properly (no double-counting), Handle edge cases (zero items, negative values), Be mathematically sound",
|
| 91 |
-
"
|
| 92 |
"tests": [
|
| 93 |
{
|
| 94 |
"name": "simple_order",
|
|
@@ -119,7 +119,7 @@
|
|
| 119 |
"title": "Corrupted Singly Linked List",
|
| 120 |
"difficulty": "Medium\u2192Hard",
|
| 121 |
"description": "Fix a custom singly linked list that supports insertion, deletion, and traversal. The implementation incorrectly manages node ownership and pointer transitions, resulting in: Lost nodes, Inconsistent traversal output, Occasional runtime panics. Requirements: Insert elements at head, Delete elements correctly, Traverse without panics, No memory leaks or lost data",
|
| 122 |
-
"
|
| 123 |
"tests": [
|
| 124 |
{
|
| 125 |
"name": "insert_single_element",
|
|
@@ -143,7 +143,7 @@
|
|
| 143 |
"title": "Deadlock in Multi-threaded Worker System",
|
| 144 |
"difficulty": "Hard",
|
| 145 |
"description": "Fix a worker system using multiple threads to process jobs from a shared queue protected by synchronization primitives. Under certain workloads, threads block indefinitely due to: Improper lock acquisition order, Shared state handling issues, Missing signal/wake mechanisms. Requirements: Spawn N worker threads, Process jobs from shared queue without deadlock, Handle shutdown gracefully, No panics under load",
|
| 146 |
-
"
|
| 147 |
"tests": [
|
| 148 |
{
|
| 149 |
"name": "single_worker_single_job",
|
|
@@ -167,7 +167,7 @@
|
|
| 167 |
"title": "Async Function with Borrowing Conflicts",
|
| 168 |
"difficulty": "Hard",
|
| 169 |
"description": "Fix an asynchronous function that processes input data and performs non-blocking operations while returning references tied to the input. The implementation violates borrowing constraints in an async context, leading to: Compilation errors when using references across await points, Invalid reference usage. Requirements: Accept &str input, Perform async operation, Return derived reference, Must be sound and compile",
|
| 170 |
-
"
|
| 171 |
"tests": [
|
| 172 |
{
|
| 173 |
"name": "process_sync_basic",
|
|
@@ -191,7 +191,7 @@
|
|
| 191 |
"title": "Unsafe FFI Integration Causing Crashes",
|
| 192 |
"difficulty": "Hard",
|
| 193 |
"description": "Fix Rust code that interfaces with an external C library using raw pointers. The implementation incorrectly handles: Pointer ownership, Memory allocation and deallocation, Undefined behavior risks. Requirements: Safely wrap C library calls, Properly manage memory (allocate/deallocate), No undefined behavior, Handle errors gracefully",
|
| 194 |
-
"
|
| 195 |
"tests": [
|
| 196 |
{
|
| 197 |
"name": "allocate_small_buffer",
|
|
@@ -208,7 +208,7 @@
|
|
| 208 |
"title": "Inefficient Data Processing Pipeline",
|
| 209 |
"difficulty": "Hard",
|
| 210 |
"description": "Fix a data pipeline that reads large datasets, applies transformations, and aggregates results. While functionally correct, the implementation has: Excessive memory allocations, Redundant iterations, Inefficient data copying. Requirements: Process data efficiently, Minimize allocations and copies, Use iterators when possible, Produce correct results with better performance",
|
| 211 |
-
"
|
| 212 |
"tests": [
|
| 213 |
{
|
| 214 |
"name": "simple_pipeline",
|
|
@@ -232,7 +232,7 @@
|
|
| 232 |
"title": "Reference-counted Cache with Memory Leak",
|
| 233 |
"difficulty": "Hard+",
|
| 234 |
"description": "Fix a caching system using reference-counted pointers to share data across components. The design creates cyclic references between cached objects, preventing memory from being released and causing memory usage to grow over time. Requirements: Implement caching without memory leaks, Break circular reference patterns, Use Rc/Arc correctly with Weak pointers when needed, Memory should be released when cache is cleared",
|
| 235 |
-
"
|
| 236 |
"tests": [
|
| 237 |
{
|
| 238 |
"name": "cache_insert_single",
|
|
|
|
| 4 |
"title": "Broken CLI Argument Parser",
|
| 5 |
"difficulty": "Easy",
|
| 6 |
"description": "Fix a command-line tool that parses user input to determine file operations (read, write, append). The implementation uses enums and pattern matching but contains: Mismatched types in enum variants, Incomplete match arms, Incorrect handling of optional arguments. The parser must compile and correctly interpret valid command-line inputs like: 'read file.txt' -> FileOp::Read('file.txt'), 'write file.txt content' -> FileOp::Write('file.txt', Some('content')), 'append file.txt' -> FileOp::Append('file.txt')",
|
| 7 |
+
"header_section": "#[derive(Debug, PartialEq)]\nenum FileOp {\n Read(String),\n Write(String, Option<String>),\n Append(String),\n}\n\nfn parse_command(input: &str) -> Option<FileOp> {\n let parts: Vec<&str> = input.split_whitespace().collect();\n \n match parts.get(0) {\n Some(&\"read\") => {\n let filename = parts.get(1)?;\n FileOp::Read(filename.to_string()) // BUG: Missing Some()\n }\n Some(&\"write\") => {\n let filename = parts.get(1)?;\n let content = parts.get(2).map(|s| s.to_string());\n Some(FileOp::Write(filename.to_string(), content))\n }\n Some(&\"append\") => {\n let filename = parts.get(1)?;\n // BUG: Missing return statement\n }\n _ => None,\n }\n}\n\nfn main() {\n println!(\"CLI Parser Test\");\n}",
|
| 8 |
"tests": [
|
| 9 |
{
|
| 10 |
"name": "parse_read_command",
|
|
|
|
| 35 |
"title": "Conflicting Borrows in Collection Processing",
|
| 36 |
"difficulty": "Easy\u2192Medium",
|
| 37 |
"description": "Fix a function that processes a vector of strings while conditionally modifying elements and storing references for later use. The implementation mixes mutable and immutable borrows within the same scope, causing borrow checker conflicts. Requirements: Iterate through vector of strings, Store uppercase versions in a results vector, Handle optional transformations without borrowing conflicts, Must compile and execute without panics",
|
| 38 |
+
"header_section": "fn process_strings(strings: &mut Vec<String>) -> Vec<String> {\n let mut results = Vec::new();\n \n for s in strings {\n // BUG: Cannot borrow as mutable while immutable borrow is active\n let upper = s.to_uppercase();\n s.push_str(\"_processed\"); // Mutable borrow\n results.push(upper);\n }\n \n results\n}\n\nfn main() {\n println!(\"String processing\");\n}",
|
| 39 |
"tests": [
|
| 40 |
{
|
| 41 |
"name": "process_single_string",
|
|
|
|
| 57 |
"title": "Invalid Lifetime Annotations in Text API",
|
| 58 |
"difficulty": "Medium",
|
| 59 |
"description": "Fix a text-processing utility that accepts multiple string slices and returns a reference derived from them. The function either fails to compile or produces incorrect lifetime relationships, risking references that outlive their input data. Requirements: Function must accept multiple &str parameters, Return a &str derived from the inputs, Properly annotate lifetimes, Must be safe (no dangling references)",
|
| 60 |
+
"header_section": "// BUG: Invalid lifetime annotations - which lifetime should the return type use?\nfn longest_text<'a>(s1: &'a str, s2: &'a str) -> &'a str {\n if s1.len() > s2.len() {\n s1\n } else {\n s2\n }\n}\n\n// BUG: This function has a lifetime issue\nfn find_first_word(s: &str) -> &str {\n let bytes = s.as_bytes();\n for (i, &byte) in bytes.iter().enumerate() {\n if byte == b' ' {\n return &s[0..i];\n }\n }\n &s[..]\n}\n\nfn main() {\n println!(\"Lifetime test\");\n}",
|
| 61 |
"tests": [
|
| 62 |
{
|
| 63 |
"name": "longest_text_basic",
|
|
|
|
| 88 |
"title": "Business Logic Producing Incorrect Results",
|
| 89 |
"difficulty": "Medium",
|
| 90 |
"description": "Fix a module implementing order validation logic including pricing, discounts, and boundary conditions. The code compiles but produces incorrect outputs for edge cases such as: Zero values, Overlapping discounts, Large numeric inputs, Negative prices. Requirements: Calculate order total correctly, Apply discounts properly (no double-counting), Handle edge cases (zero items, negative values), Be mathematically sound",
|
| 91 |
+
"header_section": "#[derive(Debug, Clone)]\nstruct Order {\n quantity: i32,\n unit_price: f64,\n discount_percent: f64,\n}\n\nimpl Order {\n fn new(quantity: i32, unit_price: f64) -> Self {\n Order {\n quantity,\n unit_price,\n discount_percent: 0.0,\n }\n }\n\n fn with_discount(mut self, discount: f64) -> Self {\n self.discount_percent = discount;\n self\n }\n\n fn calculate_total(&self) -> f64 {\n let subtotal = self.quantity as f64 * self.unit_price;\n // BUG: Incorrect discount calculation\n let discount = subtotal * (self.discount_percent / 100.0);\n subtotal - discount // Missing rounding/validation\n }\n}\n\nfn main() {\n println!(\"Order test\");\n}",
|
| 92 |
"tests": [
|
| 93 |
{
|
| 94 |
"name": "simple_order",
|
|
|
|
| 119 |
"title": "Corrupted Singly Linked List",
|
| 120 |
"difficulty": "Medium\u2192Hard",
|
| 121 |
"description": "Fix a custom singly linked list that supports insertion, deletion, and traversal. The implementation incorrectly manages node ownership and pointer transitions, resulting in: Lost nodes, Inconsistent traversal output, Occasional runtime panics. Requirements: Insert elements at head, Delete elements correctly, Traverse without panics, No memory leaks or lost data",
|
| 122 |
+
"header_section": "use std::ptr;\n\n#[derive(Debug)]\nstruct Node<T> {\n value: T,\n next: Option<Box<Node<T>>>,\n}\n\n#[derive(Debug)]\nstruct LinkedList<T> {\n head: Option<Box<Node<T>>>,\n}\n\nimpl<T> LinkedList<T> {\n fn new() -> Self {\n LinkedList { head: None }\n }\n\n fn insert(&mut self, value: T) {\n let new_node = Box::new(Node {\n value,\n next: None, // BUG: Should move self.head into next\n });\n self.head = Some(new_node);\n }\n\n fn len(&self) -> usize {\n let mut count = 0;\n let mut current = &self.head;\n while let Some(node) = current {\n count += 1;\n current = &node.next; // Correct, but insert is broken\n }\n count\n }\n}\n\nfn main() {\n println!(\"LinkedList test\");\n}",
|
| 123 |
"tests": [
|
| 124 |
{
|
| 125 |
"name": "insert_single_element",
|
|
|
|
| 143 |
"title": "Deadlock in Multi-threaded Worker System",
|
| 144 |
"difficulty": "Hard",
|
| 145 |
"description": "Fix a worker system using multiple threads to process jobs from a shared queue protected by synchronization primitives. Under certain workloads, threads block indefinitely due to: Improper lock acquisition order, Shared state handling issues, Missing signal/wake mechanisms. Requirements: Spawn N worker threads, Process jobs from shared queue without deadlock, Handle shutdown gracefully, No panics under load",
|
| 146 |
+
"header_section": "use std::sync::{Arc, Mutex, mpsc};\nuse std::thread;\n\nfn worker_system(num_workers: usize, jobs: Vec<i32>) -> Vec<i32> {\n let (tx, rx) = mpsc::channel();\n let rx = Arc::new(Mutex::new(rx));\n let results = Arc::new(Mutex::new(Vec::new()));\n \n let mut handles = vec![];\n \n for _ in 0..num_workers {\n let rx = Arc::clone(&rx);\n let results = Arc::clone(&results);\n \n let handle = thread::spawn(move || {\n loop {\n // BUG: Lock acquired but never released before trying to acquire results lock\n let receiver = rx.lock().unwrap();\n match receiver.try_recv() {\n Ok(job) => {\n let result = job * 2;\n // BUG: Tries to lock results while still holding rx lock - DEADLOCK\n results.lock().unwrap().push(result);\n }\n Err(_) => break,\n }\n }\n });\n handles.push(handle);\n }\n \n for job in jobs {\n let _ = tx.send(job); // Ignore send errors\n }\n drop(tx);\n \n for handle in handles {\n let _ = handle.join();\n }\n \n Arc::try_unwrap(results)\n .unwrap()\n .into_inner()\n .unwrap()\n}\n\nfn main() {\n println!(\"Worker system test\");\n}",
|
| 147 |
"tests": [
|
| 148 |
{
|
| 149 |
"name": "single_worker_single_job",
|
|
|
|
| 167 |
"title": "Async Function with Borrowing Conflicts",
|
| 168 |
"difficulty": "Hard",
|
| 169 |
"description": "Fix an asynchronous function that processes input data and performs non-blocking operations while returning references tied to the input. The implementation violates borrowing constraints in an async context, leading to: Compilation errors when using references across await points, Invalid reference usage. Requirements: Accept &str input, Perform async operation, Return derived reference, Must be sound and compile",
|
| 170 |
+
"header_section": "use std::pin::Pin;\nuse std::future::Future;\n\n// BUG: Cannot return reference that outlives await point\nasync fn process_async(input: &str) -> &str {\n // Simulating async work\n // tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;\n \n // BUG: input reference cannot be returned from async context like this\n input\n}\n\n// Better approach: return owned data or 'static reference\nfn process_sync(input: &str) -> String {\n input.to_uppercase()\n}\n\nfn main() {\n println!(\"Async test\");\n}",
|
| 171 |
"tests": [
|
| 172 |
{
|
| 173 |
"name": "process_sync_basic",
|
|
|
|
| 191 |
"title": "Unsafe FFI Integration Causing Crashes",
|
| 192 |
"difficulty": "Hard",
|
| 193 |
"description": "Fix Rust code that interfaces with an external C library using raw pointers. The implementation incorrectly handles: Pointer ownership, Memory allocation and deallocation, Undefined behavior risks. Requirements: Safely wrap C library calls, Properly manage memory (allocate/deallocate), No undefined behavior, Handle errors gracefully",
|
| 194 |
+
"header_section": "extern \"C\" {\n fn malloc(size: usize) -> *mut u8;\n fn free(ptr: *mut u8);\n}\n\nfn allocate_and_init(size: usize) -> Vec<u8> {\n unsafe {\n let ptr = malloc(size);\n // BUG: No null check - ptr could be null\n // BUG: Memory not initialized before use\n let slice = std::slice::from_raw_parts_mut(ptr, size);\n \n // Copy to vec and free\n let vec = slice.to_vec();\n free(ptr); // BUG: Freeing memory still referenced in vec\n vec\n }\n}\n\nfn main() {\n println!(\"FFI test\");\n}",
|
| 195 |
"tests": [
|
| 196 |
{
|
| 197 |
"name": "allocate_small_buffer",
|
|
|
|
| 208 |
"title": "Inefficient Data Processing Pipeline",
|
| 209 |
"difficulty": "Hard",
|
| 210 |
"description": "Fix a data pipeline that reads large datasets, applies transformations, and aggregates results. While functionally correct, the implementation has: Excessive memory allocations, Redundant iterations, Inefficient data copying. Requirements: Process data efficiently, Minimize allocations and copies, Use iterators when possible, Produce correct results with better performance",
|
| 211 |
+
"header_section": "fn process_data(numbers: Vec<i32>) -> i32 {\n // BUG: Multiple unnecessary allocations and iterations\n \n // First pass: filter evens (allocates new vector)\n let evens: Vec<i32> = numbers.iter()\n .filter(|n| n % 2 == 0)\n .copied()\n .collect();\n \n // Second pass: double values (allocates another vector)\n let doubled: Vec<i32> = evens.iter()\n .map(|n| n * 2)\n .collect();\n \n // Third pass: sum (unnecessary iteration)\n let sum: i32 = doubled.iter().sum();\n \n // Fourth pass: filter again (redundant)\n let final_sum: i32 = doubled.iter()\n .filter(|n| n % 4 == 0)\n .sum();\n \n final_sum\n}\n\nfn main() {\n println!(\"Efficiency test\");\n}",
|
| 212 |
"tests": [
|
| 213 |
{
|
| 214 |
"name": "simple_pipeline",
|
|
|
|
| 232 |
"title": "Reference-counted Cache with Memory Leak",
|
| 233 |
"difficulty": "Hard+",
|
| 234 |
"description": "Fix a caching system using reference-counted pointers to share data across components. The design creates cyclic references between cached objects, preventing memory from being released and causing memory usage to grow over time. Requirements: Implement caching without memory leaks, Break circular reference patterns, Use Rc/Arc correctly with Weak pointers when needed, Memory should be released when cache is cleared",
|
| 235 |
+
"header_section": "use std::rc::Rc;\nuse std::cell::RefCell;\n\n#[derive(Debug)]\nstruct CacheNode<T> {\n key: String,\n value: T,\n // BUG: This creates a cycle that prevents garbage collection\n related: RefCell<Option<Rc<CacheNode<T>>>>,\n}\n\n#[derive(Debug)]\nstruct Cache<T> {\n items: RefCell<Vec<Rc<CacheNode<T>>>>,\n}\n\nimpl<T: Clone> Cache<T> {\n fn new() -> Self {\n Cache {\n items: RefCell::new(Vec::new()),\n }\n }\n\n fn insert(&self, key: String, value: T) {\n let node = Rc::new(CacheNode {\n key,\n value,\n related: RefCell::new(None),\n });\n \n // BUG: Creating cyclic references\n if let Some(last) = self.items.borrow().last() {\n // Rc to Rc creates a cycle\n if let Ok(mut r) = last.related.try_borrow_mut() {\n *r = Some(Rc::clone(&node)); // Cycle here!\n }\n }\n \n self.items.borrow_mut().push(node);\n }\n}\n\nfn main() {\n println!(\"Cache test\");\n}",
|
| 236 |
"tests": [
|
| 237 |
{
|
| 238 |
"name": "cache_insert_single",
|
server/app.py
CHANGED
|
@@ -1,20 +1,19 @@
|
|
| 1 |
"""
|
| 2 |
-
FastAPI application for the Rust Coder
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
| 10 |
"""
|
| 11 |
|
| 12 |
import os
|
| 13 |
import logging
|
| 14 |
-
|
| 15 |
-
import time
|
| 16 |
-
import gradio as gr
|
| 17 |
-
from openai import OpenAI
|
| 18 |
from dotenv import load_dotenv
|
| 19 |
from openenv.core.env_server.http_server import create_app
|
| 20 |
|
|
@@ -23,38 +22,13 @@ from server.rust_coder_environment import RustCoderEnvironment
|
|
| 23 |
|
| 24 |
load_dotenv()
|
| 25 |
|
| 26 |
-
# --- Logging (server/app.py) ---
|
| 27 |
_LOG_LEVEL = (os.getenv("LOG_LEVEL") or "INFO").upper()
|
| 28 |
logging.basicConfig(
|
| 29 |
level=getattr(logging, _LOG_LEVEL, logging.INFO),
|
| 30 |
format="%(asctime)s %(levelname)s %(name)s - %(message)s",
|
| 31 |
)
|
| 32 |
-
logger = logging.getLogger("rust_coder.server")
|
| 33 |
|
| 34 |
-
|
| 35 |
-
_DEBUG_LOG_PATH = os.getenv("DEBUG_LOG_PATH") or "debug-55b5ef.log"
|
| 36 |
-
_DEBUG_SESSION_ID = "55b5ef"
|
| 37 |
-
def _dbg(hypothesis_id: str, location: str, message: str, data: dict, run_id: str = "pre-fix") -> None:
|
| 38 |
-
try:
|
| 39 |
-
payload = {
|
| 40 |
-
"sessionId": _DEBUG_SESSION_ID,
|
| 41 |
-
"runId": run_id,
|
| 42 |
-
"hypothesisId": hypothesis_id,
|
| 43 |
-
"location": location,
|
| 44 |
-
"message": message,
|
| 45 |
-
"data": data,
|
| 46 |
-
"timestamp": int(time.time() * 1000),
|
| 47 |
-
}
|
| 48 |
-
with open(_DEBUG_LOG_PATH, "a", encoding="utf-8") as f:
|
| 49 |
-
f.write(json.dumps(payload, ensure_ascii=False) + "\n")
|
| 50 |
-
except Exception:
|
| 51 |
-
# Never break app for debug logging
|
| 52 |
-
pass
|
| 53 |
-
# #endregion
|
| 54 |
-
|
| 55 |
-
# --- Core OpenEnv Server Setup ---
|
| 56 |
-
# Use a distinct name for the OpenEnv FastAPI instance
|
| 57 |
-
openenv_app = create_app(
|
| 58 |
RustCoderEnvironment,
|
| 59 |
RustCoderAction,
|
| 60 |
RustCoderObservation,
|
|
@@ -62,198 +36,15 @@ openenv_app = create_app(
|
|
| 62 |
max_concurrent_envs=1,
|
| 63 |
)
|
| 64 |
|
| 65 |
-
|
| 66 |
-
@
|
| 67 |
async def health_check():
|
| 68 |
return {"status": "healthy"}
|
| 69 |
|
| 70 |
-
# --- Shared Logic ---
|
| 71 |
-
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
|
| 72 |
-
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
|
| 73 |
-
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 74 |
-
|
| 75 |
-
def get_llm_solution(problem_desc: str):
|
| 76 |
-
"""Call LLM to get a Rust solution"""
|
| 77 |
-
try:
|
| 78 |
-
_dbg(
|
| 79 |
-
"H2",
|
| 80 |
-
"server/app.py:get_llm_solution:entry",
|
| 81 |
-
"LLM call starting",
|
| 82 |
-
{"model": MODEL_NAME, "base_url": API_BASE_URL, "prompt_chars": len(problem_desc or ""), "token_present": bool(HF_TOKEN)},
|
| 83 |
-
)
|
| 84 |
-
logger.info(
|
| 85 |
-
"LLM call start model=%s base_url=%s prompt_chars=%d token_present=%s",
|
| 86 |
-
MODEL_NAME,
|
| 87 |
-
API_BASE_URL,
|
| 88 |
-
len(problem_desc or ""),
|
| 89 |
-
bool(HF_TOKEN),
|
| 90 |
-
)
|
| 91 |
-
client_llm = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
|
| 92 |
-
completion = client_llm.chat.completions.create(
|
| 93 |
-
model=MODEL_NAME,
|
| 94 |
-
messages=[
|
| 95 |
-
{"role": "system", "content": "You are an expert Rust developer. Respond ONLY with the code solution, no explanation."},
|
| 96 |
-
{"role": "user", "content": f"Fix the following Rust problem:\n{problem_desc}"},
|
| 97 |
-
],
|
| 98 |
-
temperature=0.2,
|
| 99 |
-
)
|
| 100 |
-
text = (completion.choices[0].message.content or "").strip()
|
| 101 |
-
logger.debug("LLM raw response chars=%d", len(text))
|
| 102 |
-
# Clean markdown code blocks
|
| 103 |
-
if "```rust" in text:
|
| 104 |
-
text = text.split("```rust")[1].split("```")[0]
|
| 105 |
-
elif "```" in text:
|
| 106 |
-
text = text.split("```")[1].split("```")[0]
|
| 107 |
-
text = text.strip()
|
| 108 |
-
if not text:
|
| 109 |
-
_dbg("H2", "server/app.py:get_llm_solution:empty", "LLM returned empty after cleanup", {"raw_chars": len((completion.choices[0].message.content or ""))})
|
| 110 |
-
logger.warning("LLM returned empty code after cleanup.")
|
| 111 |
-
return "// LLM Error: empty response (no code returned)."
|
| 112 |
-
_dbg("H2", "server/app.py:get_llm_solution:exit", "LLM call finished", {"returned_code_chars": len(text)})
|
| 113 |
-
logger.info("LLM call end: returned_code_chars=%d", len(text))
|
| 114 |
-
return text
|
| 115 |
-
except Exception as e:
|
| 116 |
-
_dbg("H2", "server/app.py:get_llm_solution:error", "LLM call exception", {"error": str(e)})
|
| 117 |
-
logger.exception("LLM call failed.")
|
| 118 |
-
return f"// LLM Error: {e}"
|
| 119 |
-
|
| 120 |
-
def evaluate_single(problem_id, code=None):
    """Evaluate one problem and return ``(solution_code, result)``.

    Args:
        problem_id: Label whose text before the first ``:`` is the problem
            number; it is converted to a 0-based index by subtracting one
            (assumes problem ids are 1..N in list order -- TODO confirm).
        code: Rust source to evaluate. When falsy, a solution is requested
            from ``get_llm_solution`` using the problem description.

    Returns:
        A ``(code, result)`` tuple. ``result`` is the metrics dict on success,
        or ``{"error": ...}`` when the selection is out of range, no usable
        code is available, or evaluation raised.
    """
    try:
        idx = int(problem_id.split(":")[0]) - 1

        # One environment instance serves both the problem lookup and the run.
        env = RustCoderEnvironment()
        if not 0 <= idx < len(env.problems):
            # Without this check a negative index would silently wrap around
            # and evaluate a different problem than the one selected.
            logger.warning("evaluate_single abort: problem index out of range idx=%d", idx)
            message = f"unknown problem id {problem_id!r}"
            return f"// Error: {message}", {"error": f"Evaluation system error: {message}"}

        problem = env.problems[idx]
        _dbg(
            "H2",
            "server/app.py:evaluate_single:entry",
            "evaluate_single called",
            {"problem_id": str(problem_id), "idx": idx, "code_is_none": code is None, "code_chars": len(code or "")},
        )
        logger.info(
            "evaluate_single start problem_id=%s idx=%d code_provided=%s",
            problem_id,
            idx,
            code is not None,
        )

        # 1. Get code from LLM if not provided
        solution_code = code if code else get_llm_solution(problem["description"])

        # 2. Guard: If LLM failed, do not evaluate
        llm_error = solution_code.startswith("// LLM Error")
        if not solution_code.strip() or llm_error:
            _dbg(
                "H2",
                "server/app.py:evaluate_single:abort",
                "evaluate_single abort due to empty/error code",
                {"starts_with_llm_error": llm_error, "solution_code_chars": len(solution_code or "")},
            )
            logger.warning(
                "evaluate_single abort: empty_or_error_code=%s chars=%d",
                llm_error,
                len(solution_code or ""),
            )
            return solution_code, {"error": "LLM failed to generate a solution. Check your HF_TOKEN."}

        # 3. Evaluate properly: reset to the requested index, then submit.
        env.reset(start_index=idx)
        logger.debug("evaluate_single step() submitting chars=%d", len(solution_code))
        state = env.step(RustCoderAction(code=solution_code))

        # Normalise once so logging and formatting agree when reward is unset.
        reward = float(state.reward or 0.0)
        logger.info(
            "evaluate_single end reward=%.4f compilation_success=%s",
            reward,
            bool(state.compilation_success),
        )

        metrics = {
            "Total Reward": f"{reward:.2f}",
            "Compilation": "Success" if state.compilation_success else "Failed",
            "Metrics": state.reward_breakdown,
        }
        return solution_code, metrics
    except Exception as e:
        # UI boundary: report the failure to the caller instead of raising.
        logger.exception("evaluate_single crashed.")
        return f"// Error: {e}", {"error": f"Evaluation system error: {e}"}
|
| 177 |
-
|
| 178 |
-
def run_benchmark(progress=gr.Progress()):
    """Run every loaded problem through the LLM and summarise the scores.

    Args:
        progress: Gradio progress tracker; called once per problem with the
            completed fraction and a description.

    Returns:
        ``(summary_markdown, rows)`` where each row is
        ``[id, title, difficulty, reward, compiled]``. When the token is
        missing, no problems are loaded, or an exception occurs, the first
        element is an error message and ``rows`` is empty.
    """
    try:
        # Fail fast before building the environment when no credential is set.
        if not (os.getenv("HF_TOKEN") or os.getenv("API_KEY")):
            return "## Error: HF_TOKEN is not set. Add it to your HF Space secrets or local .env file.", []

        env = RustCoderEnvironment()
        problems = env.problems
        if not problems:
            # Avoids reporting a bare "division by zero" for an empty suite.
            return "### Benchmark Error: no problems are loaded.", []

        rows = []
        total_score = 0.0
        for i, problem in enumerate(problems):
            progress(i / len(problems), desc=f"Benchmarking Task {i+1}...")
            code = get_llm_solution(problem["description"])

            reward = 0.0
            compiled = "Failed (LLM Error)"

            if not code.startswith("// LLM Error"):
                env.reset(start_index=i)
                state = env.step(RustCoderAction(code=code))
                # Treat an unset reward as zero so formatting and the running
                # total cannot fail on None.
                reward = float(state.reward or 0.0)
                compiled = "Success" if state.compilation_success else "Failed"

            rows.append([problem["id"], problem["title"], problem.get("difficulty", "N/A"), f"{reward:.2f}", compiled])
            total_score += reward

        avg_score = total_score / len(problems)
        summary_md = f"## Benchmark Summary\n**Final Environment Score: {avg_score:.2f} / 1.0**"
        return summary_md, rows
    except Exception as e:
        # UI boundary: keep the traceback in the log, show a short message.
        logger.exception("run_benchmark crashed.")
        return f"### Benchmark Error: {e}", []
|
| 212 |
-
|
| 213 |
-
# --- Build the Gradio UI ---
|
| 214 |
-
def create_dashboard():
    """Build the Gradio dashboard with per-task and full-benchmark tabs.

    Returns:
        The ``gr.Blocks`` object; nothing is launched here.
    """
    p_env = RustCoderEnvironment()
    problems = p_env.problems

    def describe(problem):
        # Markdown shown for the currently selected problem.
        return f"### Question [{problem.get('difficulty', 'N/A')}]\n{problem['description']}"

    def update_desc(p_str):
        # No selection (e.g. the dropdown was cleared): blank both panes
        # instead of failing on ``None.split``.
        if not p_str:
            return "", ""
        idx = int(p_str.split(":")[0]) - 1
        return describe(problems[idx]), ""  # Clear solution on change

    with gr.Blocks(title="Rust Coder Evaluation Dashboard") as demo:
        gr.Markdown("# 🦀 Rust Coder: LLM Evaluation Dashboard")

        with gr.Tab("Individual Task Evaluation"):
            with gr.Row():
                with gr.Column(scale=1):
                    p_list = [f"{p['id']}: {p['title']} ({p.get('difficulty', 'N/A')})" for p in problems]
                    # Guard the initial selection so an empty problem list
                    # still yields a usable (empty) dashboard.
                    dropdown = gr.Dropdown(
                        choices=p_list,
                        label="Select Question",
                        value=p_list[0] if p_list else None,
                    )
                    desc = gr.Markdown(value=describe(problems[0]) if problems else "")

                with gr.Column(scale=1):
                    run_llm_btn = gr.Button("Generate Solution & Evaluate", variant="primary")
                    code_display = gr.Code(label="AI Generated Solution", interactive=False)
                    results_json = gr.JSON(label="Metric Breakdown")

            dropdown.change(update_desc, inputs=[dropdown], outputs=[desc, code_display])
            run_llm_btn.click(evaluate_single, inputs=[dropdown], outputs=[code_display, results_json])

        with gr.Tab("Full Environment Benchmark"):
            gr.Markdown("### Complete Environment Suite")
            # Task count comes from the loaded problems rather than a literal.
            gr.Markdown(
                f"Runs the LLM against all {len(problems)} tasks sequentially to determine the global OpenEnv score."
            )

            b_summarize = gr.Button("Run Performance Benchmark", variant="stop")
            b_sum = gr.Markdown()
            b_grid = gr.Dataframe(headers=["ID", "Title", "Difficulty", "Reward", "Compiled"], label="Task Results")

            b_summarize.click(run_benchmark, outputs=[b_sum, b_grid])

    return demo
|
| 250 |
-
|
| 251 |
-
# Final consolidated Gradio App mounted on the FastAPI server
|
| 252 |
-
app = gr.mount_gradio_app(openenv_app, create_dashboard(), path="/")
|
| 253 |
|
| 254 |
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
    """Serve the module-level ``app`` with uvicorn.

    Entry point: uv run server or python -m server.app

    Args:
        host: Interface address handed to uvicorn.
        port: TCP port handed to uvicorn.
    """
    # Imported at call time, as in the original, rather than at module import.
    from uvicorn import run as serve

    serve(app, host=host, port=port)
|
| 258 |
|
| 259 |
|
|
|
|
| 1 |
"""
|
| 2 |
+
FastAPI application for the Rust Coder OpenEnv environment.
|
| 3 |
|
| 4 |
+
This module is the Hugging Face Space entrypoint (see `openenv.yaml` and Docker `CMD`).
|
| 5 |
+
|
| 6 |
+
Endpoints (provided by OpenEnv `create_app`):
|
| 7 |
+
- POST /reset
|
| 8 |
+
- POST /step
|
| 9 |
+
- GET /state
|
| 10 |
+
- GET /schema
|
| 11 |
+
- WS /ws
|
| 12 |
"""
|
| 13 |
|
| 14 |
import os
|
| 15 |
import logging
|
| 16 |
+
|
|
|
|
|
|
|
|
|
|
| 17 |
from dotenv import load_dotenv
|
| 18 |
from openenv.core.env_server.http_server import create_app
|
| 19 |
|
|
|
|
| 22 |
|
| 23 |
load_dotenv()
|
| 24 |
|
|
|
|
| 25 |
_LOG_LEVEL = (os.getenv("LOG_LEVEL") or "INFO").upper()
|
| 26 |
logging.basicConfig(
|
| 27 |
level=getattr(logging, _LOG_LEVEL, logging.INFO),
|
| 28 |
format="%(asctime)s %(levelname)s %(name)s - %(message)s",
|
| 29 |
)
|
|
|
|
| 30 |
|
| 31 |
+
app = create_app(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
RustCoderEnvironment,
|
| 33 |
RustCoderAction,
|
| 34 |
RustCoderObservation,
|
|
|
|
| 36 |
max_concurrent_envs=1,
|
| 37 |
)
|
| 38 |
|
| 39 |
+
|
| 40 |
+
@app.get("/health")
async def health_check():
    """Return a fixed payload for ``GET /health``."""
    status_payload = {"status": "healthy"}
    return status_payload
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
    """Run the module-level ``app`` under uvicorn on ``host``:``port``."""
    # Imported inside the function so the import only happens when serving.
    from uvicorn import run as run_server

    bind = {"host": host, "port": port}
    run_server(app, **bind)
|
| 49 |
|
| 50 |
|
server/rust_coder_environment.py
CHANGED
|
@@ -129,7 +129,7 @@ class RustCoderEnvironment(Environment):
|
|
| 129 |
|
| 130 |
return RustCoderObservation(
|
| 131 |
problem_description=problem["description"],
|
| 132 |
-
|
| 133 |
compilation_success=False,
|
| 134 |
compilation_output="",
|
| 135 |
test_results=[],
|
|
@@ -143,6 +143,7 @@ class RustCoderEnvironment(Environment):
|
|
| 143 |
self.step_count += 1
|
| 144 |
problem = self.problems[self.current_problem_idx]
|
| 145 |
code = action.code
|
|
|
|
| 146 |
|
| 147 |
self._dbg(
|
| 148 |
"H1",
|
|
@@ -169,9 +170,9 @@ class RustCoderEnvironment(Environment):
|
|
| 169 |
base_url = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
|
| 170 |
token = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 171 |
prompt = problem.get("description", "")
|
| 172 |
-
|
| 173 |
-
if
|
| 174 |
-
prompt += f"\n\
|
| 175 |
|
| 176 |
self._dbg(
|
| 177 |
"H5",
|
|
@@ -191,7 +192,7 @@ class RustCoderEnvironment(Environment):
|
|
| 191 |
self._logger.error("AUTO_LLM_ON_EMPTY_STEP enabled but HF_TOKEN/API_KEY missing.")
|
| 192 |
return RustCoderObservation(
|
| 193 |
problem_description=problem.get("description", ""),
|
| 194 |
-
|
| 195 |
compilation_success=False,
|
| 196 |
compilation_output="Error: AUTO_LLM_ON_EMPTY_STEP enabled but HF_TOKEN/API_KEY is missing.",
|
| 197 |
test_results=[],
|
|
@@ -210,8 +211,8 @@ class RustCoderEnvironment(Environment):
|
|
| 210 |
completion = client_llm.chat.completions.create(
|
| 211 |
model=model,
|
| 212 |
messages=[
|
| 213 |
-
{"role": "system", "content": "You are a senior Rust engineer. Return ONLY the complete
|
| 214 |
-
{"role": "user", "content": prompt},
|
| 215 |
],
|
| 216 |
temperature=0.1,
|
| 217 |
)
|
|
@@ -264,7 +265,7 @@ class RustCoderEnvironment(Environment):
|
|
| 264 |
done = False
|
| 265 |
return RustCoderObservation(
|
| 266 |
problem_description=problem["description"],
|
| 267 |
-
|
| 268 |
compilation_success=False,
|
| 269 |
compilation_output="Error: no code submitted.",
|
| 270 |
test_results=[],
|
|
@@ -279,6 +280,28 @@ class RustCoderEnvironment(Environment):
|
|
| 279 |
reward=0.0,
|
| 280 |
)
|
| 281 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
# ── 1. Compilation (40%) ──────────────────────────────────────
|
| 283 |
compilation_success, compilation_output = self._compile_check(code)
|
| 284 |
r_compilation = 1.0 if compilation_success else 0.0
|
|
@@ -302,7 +325,9 @@ class RustCoderEnvironment(Environment):
|
|
| 302 |
r_coverage = 1.0
|
| 303 |
|
| 304 |
# ── 3. Elegance (10%) ─────────────────────────────────────────
|
| 305 |
-
|
|
|
|
|
|
|
| 306 |
|
| 307 |
# ── 4. Efficiency (10%) ───────────────────────────────────────
|
| 308 |
baseline_ms: float = problem.get("performance_baseline_ms", 100.0)
|
|
@@ -318,30 +343,43 @@ class RustCoderEnvironment(Environment):
|
|
| 318 |
"elegance": round(r_elegance, 4),
|
| 319 |
"efficiency": round(r_efficiency, 4),
|
| 320 |
}
|
| 321 |
-
# Calculate weighted total reward
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
|
| 331 |
# ── Advance Logic ─────────────────────────────────────────────
|
| 332 |
self.current_problem_idx += 1
|
| 333 |
done = self.current_problem_idx >= len(self.problems)
|
| 334 |
|
| 335 |
next_prob_desc = "--- ALL TASKS COMPLETED in this episode ---"
|
| 336 |
-
|
| 337 |
if not done:
|
| 338 |
next_prob = self.problems[self.current_problem_idx]
|
| 339 |
next_prob_desc = f"--- NEXT TASK: {next_prob['title']} ---\n\n{next_prob['description']}"
|
| 340 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
|
| 342 |
return RustCoderObservation(
|
| 343 |
-
problem_description=
|
| 344 |
-
|
| 345 |
compilation_success=compilation_success,
|
| 346 |
compilation_output=compilation_output[:2000], # cap length
|
| 347 |
test_results=test_results,
|
|
|
|
| 129 |
|
| 130 |
return RustCoderObservation(
|
| 131 |
problem_description=problem["description"],
|
| 132 |
+
header_section=problem.get("header_section", ""),
|
| 133 |
compilation_success=False,
|
| 134 |
compilation_output="",
|
| 135 |
test_results=[],
|
|
|
|
| 143 |
self.step_count += 1
|
| 144 |
problem = self.problems[self.current_problem_idx]
|
| 145 |
code = action.code
|
| 146 |
+
header = problem.get("header_section", "")
|
| 147 |
|
| 148 |
self._dbg(
|
| 149 |
"H1",
|
|
|
|
| 170 |
base_url = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
|
| 171 |
token = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 172 |
prompt = problem.get("description", "")
|
| 173 |
+
header = problem.get("header_section", "")
|
| 174 |
+
if header:
|
| 175 |
+
prompt += f"\n\nHeader Section (must be included verbatim in your final code):\n```rust\n{header}\n```"
|
| 176 |
|
| 177 |
self._dbg(
|
| 178 |
"H5",
|
|
|
|
| 192 |
self._logger.error("AUTO_LLM_ON_EMPTY_STEP enabled but HF_TOKEN/API_KEY missing.")
|
| 193 |
return RustCoderObservation(
|
| 194 |
problem_description=problem.get("description", ""),
|
| 195 |
+
header_section=problem.get("header_section", ""),
|
| 196 |
compilation_success=False,
|
| 197 |
compilation_output="Error: AUTO_LLM_ON_EMPTY_STEP enabled but HF_TOKEN/API_KEY is missing.",
|
| 198 |
test_results=[],
|
|
|
|
| 211 |
completion = client_llm.chat.completions.create(
|
| 212 |
model=model,
|
| 213 |
messages=[
|
| 214 |
+
{"role": "system", "content": "You are a senior Rust engineer. Return ONLY the complete Rust code file. No explanation."},
|
| 215 |
+
{"role": "user", "content": prompt + "\n\nReturn the COMPLETE Rust code, including the header section above."},
|
| 216 |
],
|
| 217 |
temperature=0.1,
|
| 218 |
)
|
|
|
|
| 265 |
done = False
|
| 266 |
return RustCoderObservation(
|
| 267 |
problem_description=problem["description"],
|
| 268 |
+
header_section=problem.get("header_section", ""),
|
| 269 |
compilation_success=False,
|
| 270 |
compilation_output="Error: no code submitted.",
|
| 271 |
test_results=[],
|
|
|
|
| 280 |
reward=0.0,
|
| 281 |
)
|
| 282 |
|
| 283 |
+
# Do NOT mutate submissions by injecting header_section.
|
| 284 |
+
# LeetCode-style behavior: the agent/LLM must return a complete Rust file
|
| 285 |
+
# that already includes the required header_section.
|
| 286 |
+
if header and header.strip() and header.strip() not in (code or ""):
|
| 287 |
+
done = False
|
| 288 |
+
return RustCoderObservation(
|
| 289 |
+
problem_description=problem.get("description", ""),
|
| 290 |
+
header_section=header,
|
| 291 |
+
compilation_success=False,
|
| 292 |
+
compilation_output="Error: submission is missing the required header_section. Return the complete Rust code including the header_section.",
|
| 293 |
+
test_results=[],
|
| 294 |
+
reward_breakdown={
|
| 295 |
+
"compilation": 0.0,
|
| 296 |
+
"correctness": 0.0,
|
| 297 |
+
"coverage": 0.0,
|
| 298 |
+
"elegance": 0.0,
|
| 299 |
+
"efficiency": 0.0,
|
| 300 |
+
},
|
| 301 |
+
done=done,
|
| 302 |
+
reward=0.0,
|
| 303 |
+
)
|
| 304 |
+
|
| 305 |
# ── 1. Compilation (40%) ──────────────────────────────────────
|
| 306 |
compilation_success, compilation_output = self._compile_check(code)
|
| 307 |
r_compilation = 1.0 if compilation_success else 0.0
|
|
|
|
| 325 |
r_coverage = 1.0
|
| 326 |
|
| 327 |
# ── 3. Elegance (10%) ─────────────────────────────────────────
|
| 328 |
+
# Only score elegance for code that compiles; otherwise it can
|
| 329 |
+
# incorrectly award points for non-compiling submissions.
|
| 330 |
+
r_elegance = self._score_elegance(code) if compilation_success else 0.0
|
| 331 |
|
| 332 |
# ── 4. Efficiency (10%) ───────────────────────────────────────
|
| 333 |
baseline_ms: float = problem.get("performance_baseline_ms", 100.0)
|
|
|
|
| 343 |
"elegance": round(r_elegance, 4),
|
| 344 |
"efficiency": round(r_efficiency, 4),
|
| 345 |
}
|
| 346 |
+
# Calculate weighted total reward.
|
| 347 |
+
# Hard rule: if it doesn't compile, total reward must be 0.0.
|
| 348 |
+
if not compilation_success:
|
| 349 |
+
total_reward = 0.0
|
| 350 |
+
else:
|
| 351 |
+
total_reward = round(
|
| 352 |
+
r_compilation * 0.40
|
| 353 |
+
+ r_correctness * 0.20
|
| 354 |
+
+ r_coverage * 0.20
|
| 355 |
+
+ r_elegance * 0.10
|
| 356 |
+
+ r_efficiency * 0.10,
|
| 357 |
+
4,
|
| 358 |
+
)
|
| 359 |
|
| 360 |
# ── Advance Logic ─────────────────────────────────────────────
|
| 361 |
self.current_problem_idx += 1
|
| 362 |
done = self.current_problem_idx >= len(self.problems)
|
| 363 |
|
| 364 |
next_prob_desc = "--- ALL TASKS COMPLETED in this episode ---"
|
| 365 |
+
next_header = ""
|
| 366 |
if not done:
|
| 367 |
next_prob = self.problems[self.current_problem_idx]
|
| 368 |
next_prob_desc = f"--- NEXT TASK: {next_prob['title']} ---\n\n{next_prob['description']}"
|
| 369 |
+
next_header = next_prob.get("header_section", "")
|
| 370 |
+
|
| 371 |
+
# IMPORTANT: The compilation/test results correspond to the code evaluated
|
| 372 |
+
# on `problem` (the current task), while the UI should also know what's next.
|
| 373 |
+
# To avoid confusion, include both "evaluated" and "next" in the description.
|
| 374 |
+
response_problem_desc = (
|
| 375 |
+
f"--- EVALUATED TASK: {problem.get('title', '')} ---\n\n"
|
| 376 |
+
f"{problem.get('description', '')}\n\n"
|
| 377 |
+
f"{next_prob_desc}"
|
| 378 |
+
)
|
| 379 |
|
| 380 |
return RustCoderObservation(
|
| 381 |
+
problem_description=response_problem_desc,
|
| 382 |
+
header_section=next_header,
|
| 383 |
compilation_success=compilation_success,
|
| 384 |
compilation_output=compilation_output[:2000], # cap length
|
| 385 |
test_results=test_results,
|