Spaces:

Parthiban007
/

rust_coder

Running

App Files Community

Parthiban007 committed 1 day ago

Commit

e96c0d4

verified ·

1 Parent(s): 83d47a9

Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

README.md +8 -2
client.py +1 -1
inference.py +22 -11
models.py +1 -1
problems.json +10 -10
server/app.py +14 -223
server/rust_coder_environment.py +60 -22

README.md CHANGED Viewed

@@ -49,7 +49,7 @@ The environment returns detailed feedback after each submission:
 | Field                  | Type        | Description                                         |
 |------------------------|-------------|-----------------------------------------------------|
 | `problem_description`  | string      | Task requirements and context                       |
-| `starter_code`         | string      | The intentionally broken code to fix                |
 | `compilation_success`  | bool        | Whether `rustc` compiled the submitted code         |
 | `compilation_output`   | string      | Raw compiler errors and warnings                    |
 | `test_results`         | list[dict]  | Per-test pass/fail results with error details       |
@@ -213,6 +213,12 @@ rust_coder/
 ├── inference.py                   # Baseline inference script (entry point)
 ├── __init__.py                    # Package exports
 └── server/
-    ├── app.py                     # FastAPI + Gradio server
     └── rust_coder_environment.py  # Core environment logic
 ```

 | Field                  | Type        | Description                                         |
 |------------------------|-------------|-----------------------------------------------------|
 | `problem_description`  | string      | Task requirements and context                       |
+| `header_section`       | string      | LeetCode-style scaffold (imports + signatures/types) |
 | `compilation_success`  | bool        | Whether `rustc` compiled the submitted code         |
 | `compilation_output`   | string      | Raw compiler errors and warnings                    |
 | `test_results`         | list[dict]  | Per-test pass/fail results with error details       |
 ├── inference.py                   # Baseline inference script (entry point)
 ├── __init__.py                    # Package exports
 └── server/
+    ├── app.py                     # FastAPI OpenEnv server entrypoint
     └── rust_coder_environment.py  # Core environment logic
 ```
+## HF Space runtime model
+- The Hugging Face Space serves the environment via `uvicorn server.app:app` (see `openenv.yaml` and `Dockerfile`).
+- The built-in OpenEnv web UI may send an empty action on Step; this environment supports that by auto-calling the LLM when `action.code` is empty (unless disabled via `AUTO_LLM_ON_EMPTY_STEP=0`).
+- `inference.py` is the required baseline runner used by the validator/judge. It connects to the running Space and drives `reset()`/`step()` in a loop, emitting strict `[START]`/`[STEP]`/`[END]` stdout lines.

client.py CHANGED Viewed

@@ -44,7 +44,7 @@ class RustCoderEnv(
         obs_data = payload.get("observation", {})
         observation = RustCoderObservation(
             problem_description=obs_data.get("problem_description", ""),
-            starter_code=obs_data.get("starter_code", ""),
             compilation_success=obs_data.get("compilation_success", False),
             compilation_output=obs_data.get("compilation_output", ""),
             test_results=obs_data.get("test_results", []),

         obs_data = payload.get("observation", {})
         observation = RustCoderObservation(
             problem_description=obs_data.get("problem_description", ""),
+            header_section=obs_data.get("header_section", ""),
             compilation_success=obs_data.get("compilation_success", False),
             compilation_output=obs_data.get("compilation_output", ""),
             test_results=obs_data.get("test_results", []),

inference.py CHANGED Viewed

@@ -34,17 +34,28 @@ from models import RustCoderAction
 # --- Strict Logging Helpers ---
 def log_start(task: str, env: str, model: str):
-    print(f'[START] task="{task}" env="{env}" model="{model}"', flush=True)
 def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str] = None):
-    escaped_action = action.replace('\n', '\\n')[:100] + "..."
-    log_line = f'[STEP] step={step} action="{escaped_action}" reward={reward:.4f} done={str(done).lower()}'
-    if error:
-        log_line += f' error="{error}"'
-    print(log_line, flush=True)
 def log_end(success: bool, steps: int, score: float, rewards: List[float]):
-    print(f'[END] success={str(success).lower()} steps={steps} score={score:.4f} rewards={json.dumps(rewards)}', flush=True)
 # --- LLM Solution Logic ---
 async def get_model_code(prompt: str, client: OpenAI) -> str:
@@ -110,10 +121,10 @@ async def main():
             steps_taken = step
-            # Format prompt including starter code if available
             prompt = obs.problem_description
-            if obs.starter_code:
-                prompt += f"\n\nStarter Code:\n```rust\n{obs.starter_code}\n```"
             # 1. Ask model for solution to current task
             code_solution = await get_model_code(prompt, client)
@@ -126,7 +137,7 @@ async def main():
             done = result.done
             rewards.append(reward)
-            log_step(step=step, action=code_solution, reward=reward, done=done)
             if done:
                 break

 # --- Strict Logging Helpers ---
 def log_start(task: str, env: str, model: str):
+    # REQUIRED exact stdout format (no quotes)
+    print(f"[START] task={task} env={env} model={model}", flush=True)
 def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str] = None):
+    # REQUIRED exact stdout format:
+    # [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
+    action_str = (action or "").replace("\r", "\\r").replace("\n", "\\n")
+    action_str = action_str[:200]  # keep single-line + bounded
+    err_field = "null" if error is None else str(error).replace("\r", "\\r").replace("\n", "\\n")
+    reward_2 = f"{float(reward or 0.0):.2f}"
+    print(
+        f"[STEP] step={step} action={action_str} reward={reward_2} done={str(bool(done)).lower()} error={err_field}",
+        flush=True,
+    )
 def log_end(success: bool, steps: int, score: float, rewards: List[float]):
+    # REQUIRED exact stdout format, rewards as comma-separated 2dp
+    rewards_str = ",".join(f"{float(r or 0.0):.2f}" for r in rewards)
+    print(
+        f"[END] success={str(bool(success)).lower()} steps={steps} score={float(score or 0.0):.2f} rewards={rewards_str}",
+        flush=True,
+    )
 # --- LLM Solution Logic ---
 async def get_model_code(prompt: str, client: OpenAI) -> str:
             steps_taken = step
+            # Format prompt including header_section if available
             prompt = obs.problem_description
+            if getattr(obs, "header_section", ""):
+                prompt += f"\n\nHeader Section (must be included verbatim in final code):\n```rust\n{obs.header_section}\n```"
             # 1. Ask model for solution to current task
             code_solution = await get_model_code(prompt, client)
             done = result.done
             rewards.append(reward)
+            log_step(step=step, action=code_solution, reward=reward, done=done, error=None)
             if done:
                 break

models.py CHANGED Viewed

@@ -24,7 +24,7 @@ class RustCoderObservation(Observation):
     """Observation space for the Rust Coder environment."""
     problem_description: str = Field(default="", description="The text description of the current coding task, including requirements.")
-    starter_code: str = Field(default="", description="The specific Rust code snippet that needs fixing for this task.")
     compilation_success: bool = Field(default=False, description="Binary flag indicating if the last submission compiled.")
     compilation_output: str = Field(default="", description="Raw stdout/stderr from the rustc compiler.")
     test_results: list[dict] = Field(default_factory=list, description="A list of results from automated test assertions.")

     """Observation space for the Rust Coder environment."""
     problem_description: str = Field(default="", description="The text description of the current coding task, including requirements.")
+    header_section: str = Field(default="", description="LeetCode-style header/scaffold (imports + signatures/types) for deterministic evaluation.")
     compilation_success: bool = Field(default=False, description="Binary flag indicating if the last submission compiled.")
     compilation_output: str = Field(default="", description="Raw stdout/stderr from the rustc compiler.")
     test_results: list[dict] = Field(default_factory=list, description="A list of results from automated test assertions.")

problems.json CHANGED Viewed

@@ -4,7 +4,7 @@
         "title": "Broken CLI Argument Parser",
         "difficulty": "Easy",
         "description": "Fix a command-line tool that parses user input to determine file operations (read, write, append). The implementation uses enums and pattern matching but contains: Mismatched types in enum variants, Incomplete match arms, Incorrect handling of optional arguments. The parser must compile and correctly interpret valid command-line inputs like: 'read file.txt' -> FileOp::Read('file.txt'), 'write file.txt content' -> FileOp::Write('file.txt', Some('content')), 'append file.txt' -> FileOp::Append('file.txt')",
-        "starter_code": "#[derive(Debug, PartialEq)]\nenum FileOp {\n    Read(String),\n    Write(String, Option<String>),\n    Append(String),\n}\n\nfn parse_command(input: &str) -> Option<FileOp> {\n    let parts: Vec<&str> = input.split_whitespace().collect();\n    \n    match parts.get(0) {\n        Some(&\"read\") => {\n            let filename = parts.get(1)?;\n            FileOp::Read(filename.to_string())  // BUG: Missing Some()\n        }\n        Some(&\"write\") => {\n            let filename = parts.get(1)?;\n            let content = parts.get(2).map(|s| s.to_string());\n            Some(FileOp::Write(filename.to_string(), content))\n        }\n        Some(&\"append\") => {\n            let filename = parts.get(1)?;\n            // BUG: Missing return statement\n        }\n        _ => None,\n    }\n}\n\nfn main() {\n    println!(\"CLI Parser Test\");\n}",
         "tests": [
             {
                 "name": "parse_read_command",
@@ -35,7 +35,7 @@
         "title": "Conflicting Borrows in Collection Processing",
         "difficulty": "Easy\u2192Medium",
         "description": "Fix a function that processes a vector of strings while conditionally modifying elements and storing references for later use. The implementation mixes mutable and immutable borrows within the same scope, causing borrow checker conflicts. Requirements: Iterate through vector of strings, Store uppercase versions in a results vector, Handle optional transformations without borrowing conflicts, Must compile and execute without panics",
-        "starter_code": "fn process_strings(strings: &mut Vec<String>) -> Vec<String> {\n    let mut results = Vec::new();\n    \n    for s in strings {\n        // BUG: Cannot borrow as mutable while immutable borrow is active\n        let upper = s.to_uppercase();\n        s.push_str(\"_processed\");  // Mutable borrow\n        results.push(upper);\n    }\n    \n    results\n}\n\nfn main() {\n    println!(\"String processing\");\n}",
         "tests": [
             {
                 "name": "process_single_string",
@@ -57,7 +57,7 @@
         "title": "Invalid Lifetime Annotations in Text API",
         "difficulty": "Medium",
         "description": "Fix a text-processing utility that accepts multiple string slices and returns a reference derived from them. The function either fails to compile or produces incorrect lifetime relationships, risking references that outlive their input data. Requirements: Function must accept multiple &str parameters, Return a &str derived from the inputs, Properly annotate lifetimes, Must be safe (no dangling references)",
-        "starter_code": "// BUG: Invalid lifetime annotations - which lifetime should the return type use?\nfn longest_text<'a>(s1: &'a str, s2: &'a str) -> &'a str {\n    if s1.len() > s2.len() {\n        s1\n    } else {\n        s2\n    }\n}\n\n// BUG: This function has a lifetime issue\nfn find_first_word(s: &str) -> &str {\n    let bytes = s.as_bytes();\n    for (i, &byte) in bytes.iter().enumerate() {\n        if byte == b' ' {\n            return &s[0..i];\n        }\n    }\n    &s[..]\n}\n\nfn main() {\n    println!(\"Lifetime test\");\n}",
         "tests": [
             {
                 "name": "longest_text_basic",
@@ -88,7 +88,7 @@
         "title": "Business Logic Producing Incorrect Results",
         "difficulty": "Medium",
         "description": "Fix a module implementing order validation logic including pricing, discounts, and boundary conditions. The code compiles but produces incorrect outputs for edge cases such as: Zero values, Overlapping discounts, Large numeric inputs, Negative prices. Requirements: Calculate order total correctly, Apply discounts properly (no double-counting), Handle edge cases (zero items, negative values), Be mathematically sound",
-        "starter_code": "#[derive(Debug, Clone)]\nstruct Order {\n    quantity: i32,\n    unit_price: f64,\n    discount_percent: f64,\n}\n\nimpl Order {\n    fn new(quantity: i32, unit_price: f64) -> Self {\n        Order {\n            quantity,\n            unit_price,\n            discount_percent: 0.0,\n        }\n    }\n\n    fn with_discount(mut self, discount: f64) -> Self {\n        self.discount_percent = discount;\n        self\n    }\n\n    fn calculate_total(&self) -> f64 {\n        let subtotal = self.quantity as f64 * self.unit_price;\n        // BUG: Incorrect discount calculation\n        let discount = subtotal * (self.discount_percent / 100.0);\n        subtotal - discount  // Missing rounding/validation\n    }\n}\n\nfn main() {\n    println!(\"Order test\");\n}",
         "tests": [
             {
                 "name": "simple_order",
@@ -119,7 +119,7 @@
         "title": "Corrupted Singly Linked List",
         "difficulty": "Medium\u2192Hard",
         "description": "Fix a custom singly linked list that supports insertion, deletion, and traversal. The implementation incorrectly manages node ownership and pointer transitions, resulting in: Lost nodes, Inconsistent traversal output, Occasional runtime panics. Requirements: Insert elements at head, Delete elements correctly, Traverse without panics, No memory leaks or lost data",
-        "starter_code": "use std::ptr;\n\n#[derive(Debug)]\nstruct Node<T> {\n    value: T,\n    next: Option<Box<Node<T>>>,\n}\n\n#[derive(Debug)]\nstruct LinkedList<T> {\n    head: Option<Box<Node<T>>>,\n}\n\nimpl<T> LinkedList<T> {\n    fn new() -> Self {\n        LinkedList { head: None }\n    }\n\n    fn insert(&mut self, value: T) {\n        let new_node = Box::new(Node {\n            value,\n            next: None,  // BUG: Should move self.head into next\n        });\n        self.head = Some(new_node);\n    }\n\n    fn len(&self) -> usize {\n        let mut count = 0;\n        let mut current = &self.head;\n        while let Some(node) = current {\n            count += 1;\n            current = &node.next;  // Correct, but insert is broken\n        }\n        count\n    }\n}\n\nfn main() {\n    println!(\"LinkedList test\");\n}",
         "tests": [
             {
                 "name": "insert_single_element",
@@ -143,7 +143,7 @@
         "title": "Deadlock in Multi-threaded Worker System",
         "difficulty": "Hard",
         "description": "Fix a worker system using multiple threads to process jobs from a shared queue protected by synchronization primitives. Under certain workloads, threads block indefinitely due to: Improper lock acquisition order, Shared state handling issues, Missing signal/wake mechanisms. Requirements: Spawn N worker threads, Process jobs from shared queue without deadlock, Handle shutdown gracefully, No panics under load",
-        "starter_code": "use std::sync::{Arc, Mutex, mpsc};\nuse std::thread;\n\nfn worker_system(num_workers: usize, jobs: Vec<i32>) -> Vec<i32> {\n    let (tx, rx) = mpsc::channel();\n    let rx = Arc::new(Mutex::new(rx));\n    let results = Arc::new(Mutex::new(Vec::new()));\n    \n    let mut handles = vec![];\n    \n    for _ in 0..num_workers {\n        let rx = Arc::clone(&rx);\n        let results = Arc::clone(&results);\n        \n        let handle = thread::spawn(move || {\n            loop {\n                // BUG: Lock acquired but never released before trying to acquire results lock\n                let receiver = rx.lock().unwrap();\n                match receiver.try_recv() {\n                    Ok(job) => {\n                        let result = job * 2;\n                        // BUG: Tries to lock results while still holding rx lock - DEADLOCK\n                        results.lock().unwrap().push(result);\n                    }\n                    Err(_) => break,\n                }\n            }\n        });\n        handles.push(handle);\n    }\n    \n    for job in jobs {\n        let _ = tx.send(job);  // Ignore send errors\n    }\n    drop(tx);\n    \n    for handle in handles {\n        let _ = handle.join();\n    }\n    \n    Arc::try_unwrap(results)\n        .unwrap()\n        .into_inner()\n        .unwrap()\n}\n\nfn main() {\n    println!(\"Worker system test\");\n}",
         "tests": [
             {
                 "name": "single_worker_single_job",
@@ -167,7 +167,7 @@
         "title": "Async Function with Borrowing Conflicts",
         "difficulty": "Hard",
         "description": "Fix an asynchronous function that processes input data and performs non-blocking operations while returning references tied to the input. The implementation violates borrowing constraints in an async context, leading to: Compilation errors when using references across await points, Invalid reference usage. Requirements: Accept &str input, Perform async operation, Return derived reference, Must be sound and compile",
-        "starter_code": "use std::pin::Pin;\nuse std::future::Future;\n\n// BUG: Cannot return reference that outlives await point\nasync fn process_async(input: &str) -> &str {\n    // Simulating async work\n    // tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;\n    \n    // BUG: input reference cannot be returned from async context like this\n    input\n}\n\n// Better approach: return owned data or 'static reference\nfn process_sync(input: &str) -> String {\n    input.to_uppercase()\n}\n\nfn main() {\n    println!(\"Async test\");\n}",
         "tests": [
             {
                 "name": "process_sync_basic",
@@ -191,7 +191,7 @@
         "title": "Unsafe FFI Integration Causing Crashes",
         "difficulty": "Hard",
         "description": "Fix Rust code that interfaces with an external C library using raw pointers. The implementation incorrectly handles: Pointer ownership, Memory allocation and deallocation, Undefined behavior risks. Requirements: Safely wrap C library calls, Properly manage memory (allocate/deallocate), No undefined behavior, Handle errors gracefully",
-        "starter_code": "extern \"C\" {\n    fn malloc(size: usize) -> *mut u8;\n    fn free(ptr: *mut u8);\n}\n\nfn allocate_and_init(size: usize) -> Vec<u8> {\n    unsafe {\n        let ptr = malloc(size);\n        // BUG: No null check - ptr could be null\n        // BUG: Memory not initialized before use\n        let slice = std::slice::from_raw_parts_mut(ptr, size);\n        \n        // Copy to vec and free\n        let vec = slice.to_vec();\n        free(ptr);  // BUG: Freeing memory still referenced in vec\n        vec\n    }\n}\n\nfn main() {\n    println!(\"FFI test\");\n}",
         "tests": [
             {
                 "name": "allocate_small_buffer",
@@ -208,7 +208,7 @@
         "title": "Inefficient Data Processing Pipeline",
         "difficulty": "Hard",
         "description": "Fix a data pipeline that reads large datasets, applies transformations, and aggregates results. While functionally correct, the implementation has: Excessive memory allocations, Redundant iterations, Inefficient data copying. Requirements: Process data efficiently, Minimize allocations and copies, Use iterators when possible, Produce correct results with better performance",
-        "starter_code": "fn process_data(numbers: Vec<i32>) -> i32 {\n    // BUG: Multiple unnecessary allocations and iterations\n    \n    // First pass: filter evens (allocates new vector)\n    let evens: Vec<i32> = numbers.iter()\n        .filter(|n| n % 2 == 0)\n        .copied()\n        .collect();\n    \n    // Second pass: double values (allocates another vector)\n    let doubled: Vec<i32> = evens.iter()\n        .map(|n| n * 2)\n        .collect();\n    \n    // Third pass: sum (unnecessary iteration)\n    let sum: i32 = doubled.iter().sum();\n    \n    // Fourth pass: filter again (redundant)\n    let final_sum: i32 = doubled.iter()\n        .filter(|n| n % 4 == 0)\n        .sum();\n    \n    final_sum\n}\n\nfn main() {\n    println!(\"Efficiency test\");\n}",
         "tests": [
             {
                 "name": "simple_pipeline",
@@ -232,7 +232,7 @@
         "title": "Reference-counted Cache with Memory Leak",
         "difficulty": "Hard+",
         "description": "Fix a caching system using reference-counted pointers to share data across components. The design creates cyclic references between cached objects, preventing memory from being released and causing memory usage to grow over time. Requirements: Implement caching without memory leaks, Break circular reference patterns, Use Rc/Arc correctly with Weak pointers when needed, Memory should be released when cache is cleared",
-        "starter_code": "use std::rc::Rc;\nuse std::cell::RefCell;\n\n#[derive(Debug)]\nstruct CacheNode<T> {\n    key: String,\n    value: T,\n    // BUG: This creates a cycle that prevents garbage collection\n    related: RefCell<Option<Rc<CacheNode<T>>>>,\n}\n\n#[derive(Debug)]\nstruct Cache<T> {\n    items: RefCell<Vec<Rc<CacheNode<T>>>>,\n}\n\nimpl<T: Clone> Cache<T> {\n    fn new() -> Self {\n        Cache {\n            items: RefCell::new(Vec::new()),\n        }\n    }\n\n    fn insert(&self, key: String, value: T) {\n        let node = Rc::new(CacheNode {\n            key,\n            value,\n            related: RefCell::new(None),\n        });\n        \n        // BUG: Creating cyclic references\n        if let Some(last) = self.items.borrow().last() {\n            // Rc to Rc creates a cycle\n            if let Ok(mut r) = last.related.try_borrow_mut() {\n                *r = Some(Rc::clone(&node));  // Cycle here!\n            }\n        }\n        \n        self.items.borrow_mut().push(node);\n    }\n}\n\nfn main() {\n    println!(\"Cache test\");\n}",
         "tests": [
             {
                 "name": "cache_insert_single",

         "title": "Broken CLI Argument Parser",
         "difficulty": "Easy",
         "description": "Fix a command-line tool that parses user input to determine file operations (read, write, append). The implementation uses enums and pattern matching but contains: Mismatched types in enum variants, Incomplete match arms, Incorrect handling of optional arguments. The parser must compile and correctly interpret valid command-line inputs like: 'read file.txt' -> FileOp::Read('file.txt'), 'write file.txt content' -> FileOp::Write('file.txt', Some('content')), 'append file.txt' -> FileOp::Append('file.txt')",
+        "header_section": "#[derive(Debug, PartialEq)]\nenum FileOp {\n    Read(String),\n    Write(String, Option<String>),\n    Append(String),\n}\n\nfn parse_command(input: &str) -> Option<FileOp> {\n    let parts: Vec<&str> = input.split_whitespace().collect();\n    \n    match parts.get(0) {\n        Some(&\"read\") => {\n            let filename = parts.get(1)?;\n            FileOp::Read(filename.to_string())  // BUG: Missing Some()\n        }\n        Some(&\"write\") => {\n            let filename = parts.get(1)?;\n            let content = parts.get(2).map(|s| s.to_string());\n            Some(FileOp::Write(filename.to_string(), content))\n        }\n        Some(&\"append\") => {\n            let filename = parts.get(1)?;\n            // BUG: Missing return statement\n        }\n        _ => None,\n    }\n}\n\nfn main() {\n    println!(\"CLI Parser Test\");\n}",
         "tests": [
             {
                 "name": "parse_read_command",
         "title": "Conflicting Borrows in Collection Processing",
         "difficulty": "Easy\u2192Medium",
         "description": "Fix a function that processes a vector of strings while conditionally modifying elements and storing references for later use. The implementation mixes mutable and immutable borrows within the same scope, causing borrow checker conflicts. Requirements: Iterate through vector of strings, Store uppercase versions in a results vector, Handle optional transformations without borrowing conflicts, Must compile and execute without panics",
+        "header_section": "fn process_strings(strings: &mut Vec<String>) -> Vec<String> {\n    let mut results = Vec::new();\n    \n    for s in strings {\n        // BUG: Cannot borrow as mutable while immutable borrow is active\n        let upper = s.to_uppercase();\n        s.push_str(\"_processed\");  // Mutable borrow\n        results.push(upper);\n    }\n    \n    results\n}\n\nfn main() {\n    println!(\"String processing\");\n}",
         "tests": [
             {
                 "name": "process_single_string",
         "title": "Invalid Lifetime Annotations in Text API",
         "difficulty": "Medium",
         "description": "Fix a text-processing utility that accepts multiple string slices and returns a reference derived from them. The function either fails to compile or produces incorrect lifetime relationships, risking references that outlive their input data. Requirements: Function must accept multiple &str parameters, Return a &str derived from the inputs, Properly annotate lifetimes, Must be safe (no dangling references)",
+        "header_section": "// BUG: Invalid lifetime annotations - which lifetime should the return type use?\nfn longest_text<'a>(s1: &'a str, s2: &'a str) -> &'a str {\n    if s1.len() > s2.len() {\n        s1\n    } else {\n        s2\n    }\n}\n\n// BUG: This function has a lifetime issue\nfn find_first_word(s: &str) -> &str {\n    let bytes = s.as_bytes();\n    for (i, &byte) in bytes.iter().enumerate() {\n        if byte == b' ' {\n            return &s[0..i];\n        }\n    }\n    &s[..]\n}\n\nfn main() {\n    println!(\"Lifetime test\");\n}",
         "tests": [
             {
                 "name": "longest_text_basic",
         "title": "Business Logic Producing Incorrect Results",
         "difficulty": "Medium",
         "description": "Fix a module implementing order validation logic including pricing, discounts, and boundary conditions. The code compiles but produces incorrect outputs for edge cases such as: Zero values, Overlapping discounts, Large numeric inputs, Negative prices. Requirements: Calculate order total correctly, Apply discounts properly (no double-counting), Handle edge cases (zero items, negative values), Be mathematically sound",
+        "header_section": "#[derive(Debug, Clone)]\nstruct Order {\n    quantity: i32,\n    unit_price: f64,\n    discount_percent: f64,\n}\n\nimpl Order {\n    fn new(quantity: i32, unit_price: f64) -> Self {\n        Order {\n            quantity,\n            unit_price,\n            discount_percent: 0.0,\n        }\n    }\n\n    fn with_discount(mut self, discount: f64) -> Self {\n        self.discount_percent = discount;\n        self\n    }\n\n    fn calculate_total(&self) -> f64 {\n        let subtotal = self.quantity as f64 * self.unit_price;\n        // BUG: Incorrect discount calculation\n        let discount = subtotal * (self.discount_percent / 100.0);\n        subtotal - discount  // Missing rounding/validation\n    }\n}\n\nfn main() {\n    println!(\"Order test\");\n}",
         "tests": [
             {
                 "name": "simple_order",
         "title": "Corrupted Singly Linked List",
         "difficulty": "Medium\u2192Hard",
         "description": "Fix a custom singly linked list that supports insertion, deletion, and traversal. The implementation incorrectly manages node ownership and pointer transitions, resulting in: Lost nodes, Inconsistent traversal output, Occasional runtime panics. Requirements: Insert elements at head, Delete elements correctly, Traverse without panics, No memory leaks or lost data",
+        "header_section": "use std::ptr;\n\n#[derive(Debug)]\nstruct Node<T> {\n    value: T,\n    next: Option<Box<Node<T>>>,\n}\n\n#[derive(Debug)]\nstruct LinkedList<T> {\n    head: Option<Box<Node<T>>>,\n}\n\nimpl<T> LinkedList<T> {\n    fn new() -> Self {\n        LinkedList { head: None }\n    }\n\n    fn insert(&mut self, value: T) {\n        let new_node = Box::new(Node {\n            value,\n            next: None,  // BUG: Should move self.head into next\n        });\n        self.head = Some(new_node);\n    }\n\n    fn len(&self) -> usize {\n        let mut count = 0;\n        let mut current = &self.head;\n        while let Some(node) = current {\n            count += 1;\n            current = &node.next;  // Correct, but insert is broken\n        }\n        count\n    }\n}\n\nfn main() {\n    println!(\"LinkedList test\");\n}",
         "tests": [
             {
                 "name": "insert_single_element",
         "title": "Deadlock in Multi-threaded Worker System",
         "difficulty": "Hard",
         "description": "Fix a worker system using multiple threads to process jobs from a shared queue protected by synchronization primitives. Under certain workloads, threads block indefinitely due to: Improper lock acquisition order, Shared state handling issues, Missing signal/wake mechanisms. Requirements: Spawn N worker threads, Process jobs from shared queue without deadlock, Handle shutdown gracefully, No panics under load",
+        "header_section": "use std::sync::{Arc, Mutex, mpsc};\nuse std::thread;\n\nfn worker_system(num_workers: usize, jobs: Vec<i32>) -> Vec<i32> {\n    let (tx, rx) = mpsc::channel();\n    let rx = Arc::new(Mutex::new(rx));\n    let results = Arc::new(Mutex::new(Vec::new()));\n    \n    let mut handles = vec![];\n    \n    for _ in 0..num_workers {\n        let rx = Arc::clone(&rx);\n        let results = Arc::clone(&results);\n        \n        let handle = thread::spawn(move || {\n            loop {\n                // BUG: Lock acquired but never released before trying to acquire results lock\n                let receiver = rx.lock().unwrap();\n                match receiver.try_recv() {\n                    Ok(job) => {\n                        let result = job * 2;\n                        // BUG: Tries to lock results while still holding rx lock - DEADLOCK\n                        results.lock().unwrap().push(result);\n                    }\n                    Err(_) => break,\n                }\n            }\n        });\n        handles.push(handle);\n    }\n    \n    for job in jobs {\n        let _ = tx.send(job);  // Ignore send errors\n    }\n    drop(tx);\n    \n    for handle in handles {\n        let _ = handle.join();\n    }\n    \n    Arc::try_unwrap(results)\n        .unwrap()\n        .into_inner()\n        .unwrap()\n}\n\nfn main() {\n    println!(\"Worker system test\");\n}",
         "tests": [
             {
                 "name": "single_worker_single_job",
         "title": "Async Function with Borrowing Conflicts",
         "difficulty": "Hard",
         "description": "Fix an asynchronous function that processes input data and performs non-blocking operations while returning references tied to the input. The implementation violates borrowing constraints in an async context, leading to: Compilation errors when using references across await points, Invalid reference usage. Requirements: Accept &str input, Perform async operation, Return derived reference, Must be sound and compile",
+        "header_section": "use std::pin::Pin;\nuse std::future::Future;\n\n// BUG: Cannot return reference that outlives await point\nasync fn process_async(input: &str) -> &str {\n    // Simulating async work\n    // tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;\n    \n    // BUG: input reference cannot be returned from async context like this\n    input\n}\n\n// Better approach: return owned data or 'static reference\nfn process_sync(input: &str) -> String {\n    input.to_uppercase()\n}\n\nfn main() {\n    println!(\"Async test\");\n}",
         "tests": [
             {
                 "name": "process_sync_basic",
         "title": "Unsafe FFI Integration Causing Crashes",
         "difficulty": "Hard",
         "description": "Fix Rust code that interfaces with an external C library using raw pointers. The implementation incorrectly handles: Pointer ownership, Memory allocation and deallocation, Undefined behavior risks. Requirements: Safely wrap C library calls, Properly manage memory (allocate/deallocate), No undefined behavior, Handle errors gracefully",
+        "header_section": "extern \"C\" {\n    fn malloc(size: usize) -> *mut u8;\n    fn free(ptr: *mut u8);\n}\n\nfn allocate_and_init(size: usize) -> Vec<u8> {\n    unsafe {\n        let ptr = malloc(size);\n        // BUG: No null check - ptr could be null\n        // BUG: Memory not initialized before use\n        let slice = std::slice::from_raw_parts_mut(ptr, size);\n        \n        // Copy to vec and free\n        let vec = slice.to_vec();\n        free(ptr);  // BUG: Freeing memory still referenced in vec\n        vec\n    }\n}\n\nfn main() {\n    println!(\"FFI test\");\n}",
         "tests": [
             {
                 "name": "allocate_small_buffer",
         "title": "Inefficient Data Processing Pipeline",
         "difficulty": "Hard",
         "description": "Fix a data pipeline that reads large datasets, applies transformations, and aggregates results. While functionally correct, the implementation has: Excessive memory allocations, Redundant iterations, Inefficient data copying. Requirements: Process data efficiently, Minimize allocations and copies, Use iterators when possible, Produce correct results with better performance",
+        "header_section": "fn process_data(numbers: Vec<i32>) -> i32 {\n    // BUG: Multiple unnecessary allocations and iterations\n    \n    // First pass: filter evens (allocates new vector)\n    let evens: Vec<i32> = numbers.iter()\n        .filter(|n| n % 2 == 0)\n        .copied()\n        .collect();\n    \n    // Second pass: double values (allocates another vector)\n    let doubled: Vec<i32> = evens.iter()\n        .map(|n| n * 2)\n        .collect();\n    \n    // Third pass: sum (unnecessary iteration)\n    let sum: i32 = doubled.iter().sum();\n    \n    // Fourth pass: filter again (redundant)\n    let final_sum: i32 = doubled.iter()\n        .filter(|n| n % 4 == 0)\n        .sum();\n    \n    final_sum\n}\n\nfn main() {\n    println!(\"Efficiency test\");\n}",
         "tests": [
             {
                 "name": "simple_pipeline",
         "title": "Reference-counted Cache with Memory Leak",
         "difficulty": "Hard+",
         "description": "Fix a caching system using reference-counted pointers to share data across components. The design creates cyclic references between cached objects, preventing memory from being released and causing memory usage to grow over time. Requirements: Implement caching without memory leaks, Break circular reference patterns, Use Rc/Arc correctly with Weak pointers when needed, Memory should be released when cache is cleared",
+        "header_section": "use std::rc::Rc;\nuse std::cell::RefCell;\n\n#[derive(Debug)]\nstruct CacheNode<T> {\n    key: String,\n    value: T,\n    // BUG: This creates a cycle that prevents garbage collection\n    related: RefCell<Option<Rc<CacheNode<T>>>>,\n}\n\n#[derive(Debug)]\nstruct Cache<T> {\n    items: RefCell<Vec<Rc<CacheNode<T>>>>,\n}\n\nimpl<T: Clone> Cache<T> {\n    fn new() -> Self {\n        Cache {\n            items: RefCell::new(Vec::new()),\n        }\n    }\n\n    fn insert(&self, key: String, value: T) {\n        let node = Rc::new(CacheNode {\n            key,\n            value,\n            related: RefCell::new(None),\n        });\n        \n        // BUG: Creating cyclic references\n        if let Some(last) = self.items.borrow().last() {\n            // Rc to Rc creates a cycle\n            if let Ok(mut r) = last.related.try_borrow_mut() {\n                *r = Some(Rc::clone(&node));  // Cycle here!\n            }\n        }\n        \n        self.items.borrow_mut().push(node);\n    }\n}\n\nfn main() {\n    println!(\"Cache test\");\n}",
         "tests": [
             {
                 "name": "cache_insert_single",

server/app.py CHANGED Viewed

@@ -1,20 +1,19 @@
 """
-FastAPI application for the Rust Coder Environment.
-Endpoints:
-    POST /reset  — Start new episode (loads next problem)
-    POST /step   — Submit Rust code for evaluation
-    GET  /state  — Get current episode state
-    GET  /schema — Action/observation JSON schemas
-    WS   /ws     — WebSocket for persistent sessions
 """
 import os
 import logging
-import json
-import time
-import gradio as gr
-from openai import OpenAI
 from dotenv import load_dotenv
 from openenv.core.env_server.http_server import create_app
@@ -23,38 +22,13 @@ from server.rust_coder_environment import RustCoderEnvironment
 load_dotenv()
-# --- Logging (server/app.py) ---
 _LOG_LEVEL = (os.getenv("LOG_LEVEL") or "INFO").upper()
 logging.basicConfig(
     level=getattr(logging, _LOG_LEVEL, logging.INFO),
     format="%(asctime)s %(levelname)s %(name)s - %(message)s",
 )
-logger = logging.getLogger("rust_coder.server")
-# #region agent log
-_DEBUG_LOG_PATH = os.getenv("DEBUG_LOG_PATH") or "debug-55b5ef.log"
-_DEBUG_SESSION_ID = "55b5ef"
-def _dbg(hypothesis_id: str, location: str, message: str, data: dict, run_id: str = "pre-fix") -> None:
-    try:
-        payload = {
-            "sessionId": _DEBUG_SESSION_ID,
-            "runId": run_id,
-            "hypothesisId": hypothesis_id,
-            "location": location,
-            "message": message,
-            "data": data,
-            "timestamp": int(time.time() * 1000),
-        }
-        with open(_DEBUG_LOG_PATH, "a", encoding="utf-8") as f:
-            f.write(json.dumps(payload, ensure_ascii=False) + "\n")
-    except Exception:
-        # Never break app for debug logging
-        pass
-# #endregion
-# --- Core OpenEnv Server Setup ---
-# Use a distinct name for the OpenEnv FastAPI instance
-openenv_app = create_app(
     RustCoderEnvironment,
     RustCoderAction,
     RustCoderObservation,
@@ -62,198 +36,15 @@ openenv_app = create_app(
     max_concurrent_envs=1,
 )
-# Add a health check endpoint for Docker directly to the base app
-@openenv_app.get("/health")
 async def health_check():
     return {"status": "healthy"}
-# --- Shared Logic ---
-API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
-MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
-HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
-def get_llm_solution(problem_desc: str):
-    """Call LLM to get a Rust solution"""
-    try:
-        _dbg(
-            "H2",
-            "server/app.py:get_llm_solution:entry",
-            "LLM call starting",
-            {"model": MODEL_NAME, "base_url": API_BASE_URL, "prompt_chars": len(problem_desc or ""), "token_present": bool(HF_TOKEN)},
-        )
-        logger.info(
-            "LLM call start model=%s base_url=%s prompt_chars=%d token_present=%s",
-            MODEL_NAME,
-            API_BASE_URL,
-            len(problem_desc or ""),
-            bool(HF_TOKEN),
-        )
-        client_llm = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
-        completion = client_llm.chat.completions.create(
-            model=MODEL_NAME,
-            messages=[
-                {"role": "system", "content": "You are an expert Rust developer. Respond ONLY with the code solution, no explanation."},
-                {"role": "user", "content": f"Fix the following Rust problem:\n{problem_desc}"},
-            ],
-            temperature=0.2,
-        )
-        text = (completion.choices[0].message.content or "").strip()
-        logger.debug("LLM raw response chars=%d", len(text))
-        # Clean markdown code blocks
-        if "```rust" in text:
-            text = text.split("```rust")[1].split("```")[0]
-        elif "```" in text:
-            text = text.split("```")[1].split("```")[0]
-        text = text.strip()
-        if not text:
-            _dbg("H2", "server/app.py:get_llm_solution:empty", "LLM returned empty after cleanup", {"raw_chars": len((completion.choices[0].message.content or ""))})
-            logger.warning("LLM returned empty code after cleanup.")
-            return "// LLM Error: empty response (no code returned)."
-        _dbg("H2", "server/app.py:get_llm_solution:exit", "LLM call finished", {"returned_code_chars": len(text)})
-        logger.info("LLM call end: returned_code_chars=%d", len(text))
-        return text
-    except Exception as e:
-        _dbg("H2", "server/app.py:get_llm_solution:error", "LLM call exception", {"error": str(e)})
-        logger.exception("LLM call failed.")
-        return f"// LLM Error: {e}"
-def evaluate_single(problem_id, code=None):
-    """Run evaluation for a specific problem. If code is None, it asks the LLM."""
-    try:
-        idx = int(problem_id.split(":")[0]) - 1
-        problem = RustCoderEnvironment().problems[idx]
-        _dbg(
-            "H2",
-            "server/app.py:evaluate_single:entry",
-            "evaluate_single called",
-            {"problem_id": str(problem_id), "idx": idx, "code_is_none": code is None, "code_chars": len(code or "")},
-        )
-        logger.info(
-            "evaluate_single start problem_id=%s idx=%d code_provided=%s",
-            problem_id,
-            idx,
-            code is not None,
-        )
-        # 1. Get code from LLM if not provided
-        solution_code = code if code else get_llm_solution(problem["description"])
-        # 2. Guard: If LLM failed, do not evaluate
-        if not solution_code.strip() or solution_code.startswith("// LLM Error"):
-            _dbg(
-                "H2",
-                "server/app.py:evaluate_single:abort",
-                "evaluate_single abort due to empty/error code",
-                {"starts_with_llm_error": solution_code.startswith("// LLM Error"), "solution_code_chars": len(solution_code or "")},
-            )
-            logger.warning(
-                "evaluate_single abort: empty_or_error_code=%s chars=%d",
-                solution_code.startswith("// LLM Error"),
-                len(solution_code or ""),
-            )
-            return solution_code, {"error": "LLM failed to generate a solution. Check your HF_TOKEN."}
-        # 3. Evaluate properly
-        env = RustCoderEnvironment()
-        # Reset to the specifically requested index
-        state = env.reset(start_index=idx)
-        logger.debug("evaluate_single step() submitting chars=%d", len(solution_code))
-        state = env.step(RustCoderAction(code=solution_code))
-        logger.info(
-            "evaluate_single end reward=%.4f compilation_success=%s",
-            float(state.reward or 0.0),
-            bool(state.compilation_success),
-        )
-        metrics = {
-            "Total Reward": f"{state.reward:.2f}",
-            "Compilation": "Success" if state.compilation_success else "Failed",
-            "Metrics": state.reward_breakdown
-        }
-        return solution_code, metrics
-    except Exception as e:
-        logger.exception("evaluate_single crashed.")
-        return f"// Error: {e}", {"error": f"Evaluation system error: {e}"}
-def run_benchmark(progress=gr.Progress()):
-    """Run all 10 problems through the LLM and show summary"""
-    try:
-        env = RustCoderEnvironment()
-        rows = []
-        total_score = 0.0
-        # Check if token is actually present
-        test_token = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
-        if not test_token:
-            return "## Error: HF_TOKEN is not set. Add it to your HF Space secrets or local .env file.", []
-        for i in range(len(env.problems)):
-            progress(i/len(env.problems), desc=f"Benchmarking Task {i+1}...")
-            problem = env.problems[i]
-            code = get_llm_solution(problem["description"])
-            reward = 0.0
-            compiled = "Failed (LLM Error)"
-            if not code.startswith("// LLM Error"):
-                env.reset(start_index=i)
-                state = env.step(RustCoderAction(code=code))
-                reward = state.reward
-                compiled = "Success" if state.compilation_success else "Failed"
-            rows.append([problem["id"], problem["title"], problem.get("difficulty", "N/A"), f"{reward:.2f}", compiled])
-            total_score += reward
-        avg_score = total_score / len(env.problems)
-        summary_md = f"## Benchmark Summary\n**Final Environment Score: {avg_score:.2f} / 1.0**"
-        return summary_md, rows
-    except Exception as e:
-        return f"### Benchmark Error: {e}", []
-# --- Build the Gradio UI ---
-def create_dashboard():
-    with gr.Blocks(title="Rust Coder Evaluation Dashboard") as demo:
-        gr.Markdown("# 🦀 Rust Coder: LLM Evaluation Dashboard")
-        with gr.Tab("Individual Task Evaluation"):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    p_env = RustCoderEnvironment()
-                    p_list = [f"{p['id']}: {p['title']} ({p.get('difficulty', 'N/A')})" for p in p_env.problems]
-                    dropdown = gr.Dropdown(choices=p_list, label="Select Question", value=p_list[0])
-                    desc = gr.Markdown(value=f"### Question [{p_env.problems[0].get('difficulty', 'N/A')}]\n{p_env.problems[0]['description']}")
-                with gr.Column(scale=1):
-                    run_llm_btn = gr.Button("Generate Solution & Evaluate", variant="primary")
-                    code_display = gr.Code(label="AI Generated Solution", interactive=False)
-                    results_json = gr.JSON(label="Metric Breakdown")
-            def update_desc(p_str):
-                idx = int(p_str.split(":")[0]) - 1
-                p = p_env.problems[idx]
-                return f"### Question [{p.get('difficulty', 'N/A')}]\n{p['description']}", "" # Clear solution on change
-            dropdown.change(update_desc, inputs=[dropdown], outputs=[desc, code_display])
-            run_llm_btn.click(evaluate_single, inputs=[dropdown], outputs=[code_display, results_json])
-        with gr.Tab("Full Environment Benchmark"):
-            gr.Markdown("### Complete Environment Suite")
-            gr.Markdown("Runs the LLM against all 10 tasks sequentially to determine the global OpenEnv score.")
-            b_summarize = gr.Button("Run Performance Benchmark", variant="stop")
-            b_sum = gr.Markdown()
-            b_grid = gr.Dataframe(headers=["ID", "Title", "Difficulty", "Reward", "Compiled"], label="Task Results")
-            b_summarize.click(run_benchmark, outputs=[b_sum, b_grid])
-    return demo
-# Final consolidated Gradio App mounted on the FastAPI server
-app = gr.mount_gradio_app(openenv_app, create_dashboard(), path="/")
 def main(host: str = "0.0.0.0", port: int = 8000) -> None:
-    """Entry point: uv run server or python -m server.app"""
     import uvicorn
     uvicorn.run(app, host=host, port=port)

 """
+FastAPI application for the Rust Coder OpenEnv environment.
+This module is the Hugging Face Space entrypoint (see `openenv.yaml` and Docker `CMD`).
+Endpoints (provided by OpenEnv `create_app`):
+    - POST /reset
+    - POST /step
+    - GET  /state
+    - GET  /schema
+    - WS   /ws
 """
 import os
 import logging
 from dotenv import load_dotenv
 from openenv.core.env_server.http_server import create_app
 load_dotenv()
 _LOG_LEVEL = (os.getenv("LOG_LEVEL") or "INFO").upper()
 logging.basicConfig(
     level=getattr(logging, _LOG_LEVEL, logging.INFO),
     format="%(asctime)s %(levelname)s %(name)s - %(message)s",
 )
+app = create_app(
     RustCoderEnvironment,
     RustCoderAction,
     RustCoderObservation,
     max_concurrent_envs=1,
 )
+@app.get("/health")
 async def health_check():
     return {"status": "healthy"}
 def main(host: str = "0.0.0.0", port: int = 8000) -> None:
     import uvicorn
     uvicorn.run(app, host=host, port=port)

server/rust_coder_environment.py CHANGED Viewed

@@ -129,7 +129,7 @@ class RustCoderEnvironment(Environment):
         return RustCoderObservation(
             problem_description=problem["description"],
-            starter_code=problem.get("starter_code", ""),
             compilation_success=False,
             compilation_output="",
             test_results=[],
@@ -143,6 +143,7 @@ class RustCoderEnvironment(Environment):
         self.step_count += 1
         problem = self.problems[self.current_problem_idx]
         code = action.code
         self._dbg(
             "H1",
@@ -169,9 +170,9 @@ class RustCoderEnvironment(Environment):
                 base_url = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
                 token = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
                 prompt = problem.get("description", "")
-                starter = problem.get("starter_code", "")
-                if starter:
-                    prompt += f"\n\nStarter Code:\n```rust\n{starter}\n```"
                 self._dbg(
                     "H5",
@@ -191,7 +192,7 @@ class RustCoderEnvironment(Environment):
                     self._logger.error("AUTO_LLM_ON_EMPTY_STEP enabled but HF_TOKEN/API_KEY missing.")
                     return RustCoderObservation(
                         problem_description=problem.get("description", ""),
-                        starter_code=problem.get("starter_code", ""),
                         compilation_success=False,
                         compilation_output="Error: AUTO_LLM_ON_EMPTY_STEP enabled but HF_TOKEN/API_KEY is missing.",
                         test_results=[],
@@ -210,8 +211,8 @@ class RustCoderEnvironment(Environment):
                     completion = client_llm.chat.completions.create(
                         model=model,
                         messages=[
-                            {"role": "system", "content": "You are a senior Rust engineer. Return ONLY the complete fixed Rust code. No explanation."},
-                            {"role": "user", "content": prompt},
                         ],
                         temperature=0.1,
                     )
@@ -264,7 +265,7 @@ class RustCoderEnvironment(Environment):
                 done = False
                 return RustCoderObservation(
                     problem_description=problem["description"],
-                    starter_code=problem.get("starter_code", ""),
                     compilation_success=False,
                     compilation_output="Error: no code submitted.",
                     test_results=[],
@@ -279,6 +280,28 @@ class RustCoderEnvironment(Environment):
                     reward=0.0,
                 )
         # ── 1. Compilation (40%) ──────────────────────────────────────
         compilation_success, compilation_output = self._compile_check(code)
         r_compilation = 1.0 if compilation_success else 0.0
@@ -302,7 +325,9 @@ class RustCoderEnvironment(Environment):
                 r_coverage    = 1.0
         # ── 3. Elegance (10%) ─────────────────────────────────────────
-        r_elegance = self._score_elegance(code)
         # ── 4. Efficiency (10%) ───────────────────────────────────────
         baseline_ms: float = problem.get("performance_baseline_ms", 100.0)
@@ -318,30 +343,43 @@ class RustCoderEnvironment(Environment):
             "elegance":     round(r_elegance,     4),
             "efficiency":   round(r_efficiency,   4),
         }
-        # Calculate weighted total reward
-        total_reward = round(
-            r_compilation * 0.40
-            + r_correctness * 0.20
-            + r_coverage    * 0.20
-            + r_elegance    * 0.10
-            + r_efficiency  * 0.10,
-            4,
-        )
         # ── Advance Logic ─────────────────────────────────────────────
         self.current_problem_idx += 1
         done = self.current_problem_idx >= len(self.problems)
         next_prob_desc = "--- ALL TASKS COMPLETED in this episode ---"
-        next_starter = ""
         if not done:
             next_prob = self.problems[self.current_problem_idx]
             next_prob_desc = f"--- NEXT TASK: {next_prob['title']} ---\n\n{next_prob['description']}"
-            next_starter = next_prob.get("starter_code", "")
         return RustCoderObservation(
-            problem_description=next_prob_desc,
-            starter_code=next_starter,
             compilation_success=compilation_success,
             compilation_output=compilation_output[:2000],   # cap length
             test_results=test_results,

         return RustCoderObservation(
             problem_description=problem["description"],
+            header_section=problem.get("header_section", ""),
             compilation_success=False,
             compilation_output="",
             test_results=[],
         self.step_count += 1
         problem = self.problems[self.current_problem_idx]
         code = action.code
+        header = problem.get("header_section", "")
         self._dbg(
             "H1",
                 base_url = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
                 token = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
                 prompt = problem.get("description", "")
+                header = problem.get("header_section", "")
+                if header:
+                    prompt += f"\n\nHeader Section (must be included verbatim in your final code):\n```rust\n{header}\n```"
                 self._dbg(
                     "H5",
                     self._logger.error("AUTO_LLM_ON_EMPTY_STEP enabled but HF_TOKEN/API_KEY missing.")
                     return RustCoderObservation(
                         problem_description=problem.get("description", ""),
+                        header_section=problem.get("header_section", ""),
                         compilation_success=False,
                         compilation_output="Error: AUTO_LLM_ON_EMPTY_STEP enabled but HF_TOKEN/API_KEY is missing.",
                         test_results=[],
                     completion = client_llm.chat.completions.create(
                         model=model,
                         messages=[
+                            {"role": "system", "content": "You are a senior Rust engineer. Return ONLY the complete Rust code file. No explanation."},
+                            {"role": "user", "content": prompt + "\n\nReturn the COMPLETE Rust code, including the header section above."},
                         ],
                         temperature=0.1,
                     )
                 done = False
                 return RustCoderObservation(
                     problem_description=problem["description"],
+                    header_section=problem.get("header_section", ""),
                     compilation_success=False,
                     compilation_output="Error: no code submitted.",
                     test_results=[],
                     reward=0.0,
                 )
+        # Do NOT mutate submissions by injecting header_section.
+        # LeetCode-style behavior: the agent/LLM must return a complete Rust file
+        # that already includes the required header_section.
+        if header and header.strip() and header.strip() not in (code or ""):
+            done = False
+            return RustCoderObservation(
+                problem_description=problem.get("description", ""),
+                header_section=header,
+                compilation_success=False,
+                compilation_output="Error: submission is missing the required header_section. Return the complete Rust code including the header_section.",
+                test_results=[],
+                reward_breakdown={
+                    "compilation": 0.0,
+                    "correctness": 0.0,
+                    "coverage": 0.0,
+                    "elegance": 0.0,
+                    "efficiency": 0.0,
+                },
+                done=done,
+                reward=0.0,
+            )
         # ── 1. Compilation (40%) ──────────────────────────────────────
         compilation_success, compilation_output = self._compile_check(code)
         r_compilation = 1.0 if compilation_success else 0.0
                 r_coverage    = 1.0
         # ── 3. Elegance (10%) ─────────────────────────────────────────
+        # Only score elegance for code that compiles; otherwise the elegance
+        # scorer could award points to a submission that fails to compile.
+        r_elegance = self._score_elegance(code) if compilation_success else 0.0
         # ── 4. Efficiency (10%) ───────────────────────────────────────
         baseline_ms: float = problem.get("performance_baseline_ms", 100.0)
             "elegance":     round(r_elegance,     4),
             "efficiency":   round(r_efficiency,   4),
         }
+        # Calculate weighted total reward.
+        # Hard rule: if it doesn't compile, total reward must be 0.0.
+        if not compilation_success:
+            total_reward = 0.0
+        else:
+            total_reward = round(
+                r_compilation * 0.40
+                + r_correctness * 0.20
+                + r_coverage    * 0.20
+                + r_elegance    * 0.10
+                + r_efficiency  * 0.10,
+                4,
+            )
         # ── Advance Logic ─────────────────────────────────────────────
         self.current_problem_idx += 1
         done = self.current_problem_idx >= len(self.problems)
         next_prob_desc = "--- ALL TASKS COMPLETED in this episode ---"
+        next_header = ""
         if not done:
             next_prob = self.problems[self.current_problem_idx]
             next_prob_desc = f"--- NEXT TASK: {next_prob['title']} ---\n\n{next_prob['description']}"
+            next_header = next_prob.get("header_section", "")
+        # IMPORTANT: The compilation/test results in this observation belong to
+        # `problem` (the task that was just evaluated), but the UI also needs to
+        # know which task comes next. To avoid confusion, the description labels
+        # both the "evaluated" task and the "next" task.
+        response_problem_desc = (
+            f"--- EVALUATED TASK: {problem.get('title', '')} ---\n\n"
+            f"{problem.get('description', '')}\n\n"
+            f"{next_prob_desc}"
+        )
         return RustCoderObservation(
+            problem_description=response_problem_desc,
+            header_section=next_header,
             compilation_success=compilation_success,
             compilation_output=compilation_output[:2000],   # cap length
             test_results=test_results,