Spaces:

Parthiban007
/

rust_coder

Running

App Files Files Community

Parthiban007 committed about 12 hours ago

Commit

090dc69

verified ·

1 Parent(s): 8a096e2

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

inference.py +1 -1
models.py +13 -5
openenv.yaml +8 -126
server/app.py +120 -69

inference.py CHANGED Viewed

@@ -53,7 +53,7 @@ def log_end(success: bool, steps: int, score: float, rewards: List[float]):
     # REQUIRED exact stdout format, rewards as comma-separated 2dp
     rewards_str = ",".join(f"{float(r or 0.0):.2f}" for r in rewards)
     print(
-        f"[END] success={str(bool(success)).lower()} steps={steps} score={float(score or 0.0):.2f} rewards={rewards_str}",
         flush=True,
     )

     # REQUIRED exact stdout format, rewards as comma-separated 2dp
     rewards_str = ",".join(f"{float(r or 0.0):.2f}" for r in rewards)
     print(
+        f"[END] success={str(bool(success)).lower()} steps={steps} score={float(score or 0.0):.3f} rewards={rewards_str}",
         flush=True,
     )

models.py CHANGED Viewed

@@ -6,12 +6,11 @@
 """
 Data models for the Rust Coder Environment.
-The rust_coder environment is a simple test environment that echoes back messages.
 """
 from openenv.core.env_server.types import Action, Observation
-from pydantic import Field
 class RustCoderAction(Action):
@@ -27,5 +26,14 @@ class RustCoderObservation(Observation):
     header_section: str = Field(default="", description="LeetCode-style header/scaffold (imports + signatures/types) for deterministic evaluation.")
     compilation_success: bool = Field(default=False, description="Binary flag indicating if the last submission compiled.")
     compilation_output: str = Field(default="", description="Raw stdout/stderr from the rustc compiler.")
-    test_results: list[dict] = Field(default_factory=list, description="A list of results from automated test assertions.")
-    reward_breakdown: dict = Field(default_factory=dict, description="Detailed components of the 0.0-1.0 reward.")

 """
 Data models for the Rust Coder Environment.
 """
+from typing import Any, Dict, List
 from openenv.core.env_server.types import Action, Observation
+from pydantic import BaseModel, Field
 class RustCoderAction(Action):
     header_section: str = Field(default="", description="LeetCode-style header/scaffold (imports + signatures/types) for deterministic evaluation.")
     compilation_success: bool = Field(default=False, description="Binary flag indicating if the last submission compiled.")
     compilation_output: str = Field(default="", description="Raw stdout/stderr from the rustc compiler.")
+    test_results: List[Dict] = Field(default_factory=list, description="A list of results from automated test assertions.")
+    reward_breakdown: Dict = Field(default_factory=dict, description="Detailed components of the 0.0-1.0 reward.")
+class TaskInfo(BaseModel):
+    """Metadata for a single task exposed via GET /tasks."""
+    task_id: str
+    difficulty: str
+    description: str
+    action_schema: Dict[str, Any] = Field(default_factory=dict)

openenv.yaml CHANGED Viewed

@@ -1,136 +1,18 @@
 spec_version: 1
 name: rust_coder
-description: "High-fidelity RL environment for evaluating LLM agents on Rust systems programming, including borrow checking, safe concurrency, and memory management."
 type: space
 runtime: fastapi
 app: server.app:app
 port: 8000
-dockerfile: Dockerfile
 tags:
   - openenv
-  - software-engineering
   - rust
   - coding-benchmark
-# Task Definition (Easy -> Medium -> Hard)
-# Each task has a grader that scores submissions 0.0-1.0
-tasks:
-  - id: "task_1"
-    title: "Broken CLI Argument Parser"
-    difficulty: "easy"
-    description: "Fix enum variant mismatches and incomplete match arms in a CLI argument parser."
-    grader:
-      type: "programmatic"
-      endpoint: "/grade/task_1"
-      success_threshold: 0.7
-      reward_range: [0.0, 1.0]
-      description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
-  - id: "task_2"
-    title: "Conflicting Borrows in Collection Processing"
-    difficulty: "easy"
-    description: "Resolve mutable/immutable borrow conflicts in a string collection processor."
-    grader:
-      type: "programmatic"
-      endpoint: "/grade/task_2"
-      success_threshold: 0.7
-      reward_range: [0.0, 1.0]
-      description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
-  - id: "task_3"
-    title: "Lifetime Annotations"
-    difficulty: "medium"
-    description: "Add correct lifetime annotations to enable a struct holding references to work properly."
-    grader:
-      type: "programmatic"
-      endpoint: "/grade/task_3"
-      success_threshold: 0.6
-      reward_range: [0.0, 1.0]
-      description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
-  - id: "task_4"
-    title: "Business Logic Bug"
-    difficulty: "medium"
-    description: "Fix off-by-one errors and logic bugs in a financial calculation module."
-    grader:
-      type: "programmatic"
-      endpoint: "/grade/task_4"
-      success_threshold: 0.6
-      reward_range: [0.0, 1.0]
-      description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
-  - id: "task_5"
-    title: "Linked List Management"
-    difficulty: "medium"
-    description: "Implement a safe singly-linked list with push, pop, and peek operations."
-    grader:
-      type: "programmatic"
-      endpoint: "/grade/task_5"
-      success_threshold: 0.6
-      reward_range: [0.0, 1.0]
-      description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
-  - id: "task_6"
-    title: "Multi-threaded Deadlocks"
-    difficulty: "hard"
-    description: "Identify and fix deadlock conditions in a multi-threaded producer-consumer pattern."
-    grader:
-      type: "programmatic"
-      endpoint: "/grade/task_6"
-      success_threshold: 0.5
-      reward_range: [0.0, 1.0]
-      description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
-  - id: "task_7"
-    title: "Async Borrowing"
-    difficulty: "hard"
-    description: "Fix async/await borrowing conflicts in a concurrent file processor."
-    grader:
-      type: "programmatic"
-      endpoint: "/grade/task_7"
-      success_threshold: 0.5
-      reward_range: [0.0, 1.0]
-      description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
-  - id: "task_8"
-    title: "Unsafe FFI Integration"
-    difficulty: "hard"
-    description: "Write safe Rust wrappers around unsafe FFI calls to a C library."
-    grader:
-      type: "programmatic"
-      endpoint: "/grade/task_8"
-      success_threshold: 0.5
-      reward_range: [0.0, 1.0]
-      description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
-  - id: "task_9"
-    title: "Inefficient Data Pipelines"
-    difficulty: "hard"
-    description: "Optimize a data transformation pipeline using iterators and avoiding unnecessary allocations."
-    grader:
-      type: "programmatic"
-      endpoint: "/grade/task_9"
-      success_threshold: 0.5
-      reward_range: [0.0, 1.0]
-      description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
-  - id: "task_10"
-    title: "Memory Leak Prevention"
-    difficulty: "hard"
-    description: "Fix memory leak patterns in a custom allocator and ensure proper Drop implementations."
-    grader:
-      type: "programmatic"
-      endpoint: "/grade/task_10"
-      success_threshold: 0.4
-      reward_range: [0.0, 1.0]
-      description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
-# Definitions for Documentation and Graders
-action_space:
-  type: "RustCoderAction"
-  description: "A single string containing the fixed Rust code."
-observation_space:
-  type: "RustCoderObservation"
-  description: "Observation containing problem description, compilation logs, test results, and reward breakdown."

 spec_version: 1
 name: rust_coder
 type: space
 runtime: fastapi
 app: server.app:app
 port: 8000
+description: >
+  Rust Coder environment for evaluating LLM agents on real-world Rust systems
+  programming tasks: borrow checking, lifetimes, safe concurrency, and memory
+  management. Multi-dimensional reward: compilation, correctness, coverage,
+  elegance, and efficiency.
 tags:
   - openenv
+  - reinforcement-learning
   - rust
+  - software-engineering
   - coding-benchmark
+  - hackathon-2026

server/app.py CHANGED Viewed

@@ -1,19 +1,19 @@
 """
 FastAPI application for the Rust Coder OpenEnv environment.
-This module is the Hugging Face Space entrypoint (see `openenv.yaml` and Docker `CMD`).
-Endpoints (provided by OpenEnv `create_app`):
-    - POST /reset
-    - POST /step
-    - GET  /state
-    - GET  /schema
-    - WS   /ws
-Additional endpoints:
-    - GET  /health
-    - GET  /tasks           — list all tasks with grader metadata
-    - POST /grade/{task_id} — grade a code submission for a specific task
 """
 import os
@@ -21,10 +21,10 @@ import logging
 from dotenv import load_dotenv
 from fastapi import HTTPException
-from pydantic import BaseModel
 from openenv.core.env_server.http_server import create_app
-from models import RustCoderAction, RustCoderObservation
 from server.rust_coder_environment import RustCoderEnvironment
 load_dotenv()
@@ -43,24 +43,74 @@ app = create_app(
     max_concurrent_envs=1,
 )
 # ---------------------------------------------------------------------------
-# Task metadata — mirrors openenv.yaml tasks section
 # ---------------------------------------------------------------------------
-_TASK_REGISTRY = [
-    {"id": "task_1",  "index": 0,  "title": "Broken CLI Argument Parser",              "difficulty": "easy",   "success_threshold": 0.7},
-    {"id": "task_2",  "index": 1,  "title": "Conflicting Borrows in Collection Processing", "difficulty": "easy",   "success_threshold": 0.7},
-    {"id": "task_3",  "index": 2,  "title": "Lifetime Annotations",                     "difficulty": "medium", "success_threshold": 0.6},
-    {"id": "task_4",  "index": 3,  "title": "Business Logic Bug",                       "difficulty": "medium", "success_threshold": 0.6},
-    {"id": "task_5",  "index": 4,  "title": "Linked List Management",                   "difficulty": "medium", "success_threshold": 0.6},
-    {"id": "task_6",  "index": 5,  "title": "Multi-threaded Deadlocks",                 "difficulty": "hard",   "success_threshold": 0.5},
-    {"id": "task_7",  "index": 6,  "title": "Async Borrowing",                          "difficulty": "hard",   "success_threshold": 0.5},
-    {"id": "task_8",  "index": 7,  "title": "Unsafe FFI Integration",                   "difficulty": "hard",   "success_threshold": 0.5},
-    {"id": "task_9",  "index": 8,  "title": "Inefficient Data Pipelines",               "difficulty": "hard",   "success_threshold": 0.5},
-    {"id": "task_10", "index": 9,  "title": "Memory Leak Prevention",                   "difficulty": "hard",   "success_threshold": 0.4},
-]
-_TASK_BY_ID = {t["id"]: t for t in _TASK_REGISTRY}
 # ---------------------------------------------------------------------------
@@ -74,57 +124,59 @@ async def health_check():
 @app.get("/tasks")
 async def list_tasks():
-    """Return the list of all tasks with their grader metadata."""
-    tasks_out = []
-    for t in _TASK_REGISTRY:
-        tasks_out.append({
-            "id": t["id"],
-            "title": t["title"],
-            "difficulty": t["difficulty"],
-            "grader": {
-                "type": "programmatic",
-                "endpoint": f"/grade/{t['id']}",
-                "success_threshold": t["success_threshold"],
-                "reward_range": [0.0, 1.0],
-                "description": "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)",
-            },
-        })
-    return {"tasks": tasks_out, "total": len(tasks_out)}
-class GradeRequest(BaseModel):
-    code: str = ""
-@app.post("/grade/{task_id}")
-async def grade_task(task_id: str, request: GradeRequest):
     """
-    Grade a Rust code submission for a specific task.
-    Returns a score in [0.0, 1.0] with detailed breakdown.
-    This is the programmatic grader endpoint referenced in openenv.yaml.
     """
-    task_meta = _TASK_BY_ID.get(task_id)
     if task_meta is None:
-        raise HTTPException(status_code=404, detail=f"Task '{task_id}' not found.")
     env = RustCoderEnvironment()
-    # Reset to the specific task
     env.reset(start_index=task_meta["index"])
-    # Submit the code
-    action = RustCoderAction(code=request.code)
     obs = env.step(action)
-    score = float(obs.reward) if obs.reward is not None else 0.0
-    score = max(0.0, min(1.0, score))
     success = score >= task_meta["success_threshold"]
     return {
         "task_id": task_id,
-        "score": round(score, 4),
-        "success": success,
-        "success_threshold": task_meta["success_threshold"],
         "reward_breakdown": obs.reward_breakdown,
         "compilation_success": obs.compilation_success,
         "compilation_output": obs.compilation_output,
@@ -134,7 +186,6 @@ async def grade_task(task_id: str, request: GradeRequest):
 def main(host: str = "0.0.0.0", port: int = 8000) -> None:
     import uvicorn
     uvicorn.run(app, host=host, port=port)

 """
 FastAPI application for the Rust Coder OpenEnv environment.
+Entrypoint: server.app:app  (see openenv.yaml and Dockerfile CMD)
+Standard OpenEnv endpoints (via create_app):
+    POST /reset   — start a new episode
+    POST /step    — submit an action, receive observation + reward
+    GET  /state   — current episode state
+    GET  /schema  — action / observation JSON schemas
+    WS   /ws      — WebSocket interface
+Custom endpoints:
+    GET  /health               — health check
+    GET  /tasks                — list all tasks with action schema
+    POST /grader?task_id=X     — programmatic grader for task X
 """
 import os
 from dotenv import load_dotenv
 from fastapi import HTTPException
 from openenv.core.env_server.http_server import create_app
+from models import RustCoderAction, RustCoderObservation, TaskInfo
 from server.rust_coder_environment import RustCoderEnvironment
 load_dotenv()
     max_concurrent_envs=1,
 )
 # ---------------------------------------------------------------------------
+# Task registry
 # ---------------------------------------------------------------------------
+TASK_REGISTRY = {
+    "task_1": {
+        "index": 0,
+        "difficulty": "easy",
+        "description": "Fix enum variant mismatches and incomplete match arms in a CLI argument parser.",
+        "success_threshold": 0.7,
+    },
+    "task_2": {
+        "index": 1,
+        "difficulty": "easy",
+        "description": "Resolve mutable/immutable borrow conflicts in a string collection processor.",
+        "success_threshold": 0.7,
+    },
+    "task_3": {
+        "index": 2,
+        "difficulty": "medium",
+        "description": "Add correct lifetime annotations so a struct holding references compiles and works.",
+        "success_threshold": 0.6,
+    },
+    "task_4": {
+        "index": 3,
+        "difficulty": "medium",
+        "description": "Fix off-by-one errors and logic bugs in a financial calculation module.",
+        "success_threshold": 0.6,
+    },
+    "task_5": {
+        "index": 4,
+        "difficulty": "medium",
+        "description": "Implement a safe singly-linked list with push, pop, and peek operations.",
+        "success_threshold": 0.6,
+    },
+    "task_6": {
+        "index": 5,
+        "difficulty": "hard",
+        "description": "Identify and fix deadlock conditions in a multi-threaded producer-consumer pattern.",
+        "success_threshold": 0.5,
+    },
+    "task_7": {
+        "index": 6,
+        "difficulty": "hard",
+        "description": "Fix async/await borrowing conflicts in a concurrent file processor.",
+        "success_threshold": 0.5,
+    },
+    "task_8": {
+        "index": 7,
+        "difficulty": "hard",
+        "description": "Write safe Rust wrappers around unsafe FFI calls to a C library.",
+        "success_threshold": 0.5,
+    },
+    "task_9": {
+        "index": 8,
+        "difficulty": "hard",
+        "description": "Optimize a data pipeline using iterators and avoiding unnecessary allocations.",
+        "success_threshold": 0.5,
+    },
+    "task_10": {
+        "index": 9,
+        "difficulty": "hard",
+        "description": "Fix memory leak patterns and ensure correct Drop implementations.",
+        "success_threshold": 0.4,
+    },
+}
+TASK_IDS = list(TASK_REGISTRY.keys())
 # ---------------------------------------------------------------------------
 @app.get("/tasks")
 async def list_tasks():
     """
+    Return all available tasks.
+    The competition platform enumerates this endpoint to discover tasks.
+    Each entry includes task_id, difficulty, description, and action_schema.
+    """
+    return [
+        TaskInfo(
+            task_id=task_id,
+            difficulty=task["difficulty"],
+            description=task["description"],
+            action_schema=RustCoderAction.model_json_schema(),
+        )
+        for task_id, task in TASK_REGISTRY.items()
+    ]
+@app.post("/grader")
+async def grader(task_id: str, action: RustCoderAction):
+    """
+    Programmatic grader for a specific task.
+    Usage:  POST /grader?task_id=task_1
+    Body:   {"code": "<rust source code>"}
+    Scores are strictly in the open interval (0, 1):
+      - Minimum 0.01  — floor for any submission (even empty/non-compiling code)
+      - Maximum 0.99  — ceiling so no submission scores a theoretical perfect 1.0
+      - Natural range based on: Compilation(40%) + Correctness(20%) +
+        Coverage(20%) + Elegance(10%) + Efficiency(10%)
     """
+    task_meta = TASK_REGISTRY.get(task_id)
     if task_meta is None:
+        raise HTTPException(
+            status_code=404,
+            detail=f"Unknown task_id '{task_id}'. Valid IDs: {TASK_IDS}",
+        )
     env = RustCoderEnvironment()
     env.reset(start_index=task_meta["index"])
     obs = env.step(action)
+    raw_score = float(obs.reward) if obs.reward is not None else 0.0
+    # Enforce strictly open interval (0, 1) — never exactly 0.0 or 1.0
+    score = round(max(0.01, min(0.99, raw_score)), 4)
     success = score >= task_meta["success_threshold"]
     return {
         "task_id": task_id,
+        "score": score,
+        "passed": 1 if success else 0,
+        "total": 1,
+        "metric": "rust_code_quality",
         "reward_breakdown": obs.reward_breakdown,
         "compilation_success": obs.compilation_success,
         "compilation_output": obs.compilation_output,
 def main(host: str = "0.0.0.0", port: int = 8000) -> None:
     import uvicorn
     uvicorn.run(app, host=host, port=port)