Parthiban007 committed on
Commit
090dc69
·
verified ·
1 Parent(s): 8a096e2

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. inference.py +1 -1
  2. models.py +13 -5
  3. openenv.yaml +8 -126
  4. server/app.py +120 -69
inference.py CHANGED
@@ -53,7 +53,7 @@ def log_end(success: bool, steps: int, score: float, rewards: List[float]):
53
  # REQUIRED exact stdout format, rewards as comma-separated 2dp
54
  rewards_str = ",".join(f"{float(r or 0.0):.2f}" for r in rewards)
55
  print(
56
- f"[END] success={str(bool(success)).lower()} steps={steps} score={float(score or 0.0):.2f} rewards={rewards_str}",
57
  flush=True,
58
  )
59
 
 
53
  # REQUIRED exact stdout format, rewards as comma-separated 2dp
54
  rewards_str = ",".join(f"{float(r or 0.0):.2f}" for r in rewards)
55
  print(
56
+ f"[END] success={str(bool(success)).lower()} steps={steps} score={float(score or 0.0):.3f} rewards={rewards_str}",
57
  flush=True,
58
  )
59
 
models.py CHANGED
@@ -6,12 +6,11 @@
6
 
7
  """
8
  Data models for the Rust Coder Environment.
9
-
10
- The rust_coder environment is a simple test environment that echoes back messages.
11
  """
12
 
 
13
  from openenv.core.env_server.types import Action, Observation
14
- from pydantic import Field
15
 
16
 
17
  class RustCoderAction(Action):
@@ -27,5 +26,14 @@ class RustCoderObservation(Observation):
27
  header_section: str = Field(default="", description="LeetCode-style header/scaffold (imports + signatures/types) for deterministic evaluation.")
28
  compilation_success: bool = Field(default=False, description="Binary flag indicating if the last submission compiled.")
29
  compilation_output: str = Field(default="", description="Raw stdout/stderr from the rustc compiler.")
30
- test_results: list[dict] = Field(default_factory=list, description="A list of results from automated test assertions.")
31
- reward_breakdown: dict = Field(default_factory=dict, description="Detailed components of the 0.0-1.0 reward.")
 
 
 
 
 
 
 
 
 
 
6
 
7
  """
8
  Data models for the Rust Coder Environment.
 
 
9
  """
10
 
11
+ from typing import Any, Dict, List
12
  from openenv.core.env_server.types import Action, Observation
13
+ from pydantic import BaseModel, Field
14
 
15
 
16
  class RustCoderAction(Action):
 
26
  header_section: str = Field(default="", description="LeetCode-style header/scaffold (imports + signatures/types) for deterministic evaluation.")
27
  compilation_success: bool = Field(default=False, description="Binary flag indicating if the last submission compiled.")
28
  compilation_output: str = Field(default="", description="Raw stdout/stderr from the rustc compiler.")
29
+ test_results: List[Dict] = Field(default_factory=list, description="A list of results from automated test assertions.")
30
+ reward_breakdown: Dict = Field(default_factory=dict, description="Detailed components of the 0.0-1.0 reward.")
31
+
32
+
33
+ class TaskInfo(BaseModel):
34
+ """Metadata for a single task exposed via GET /tasks."""
35
+
36
+ task_id: str
37
+ difficulty: str
38
+ description: str
39
+ action_schema: Dict[str, Any] = Field(default_factory=dict)
openenv.yaml CHANGED
@@ -1,136 +1,18 @@
1
  spec_version: 1
2
  name: rust_coder
3
- description: "High-fidelity RL environment for evaluating LLM agents on Rust systems programming, including borrow checking, safe concurrency, and memory management."
4
  type: space
5
  runtime: fastapi
6
  app: server.app:app
7
  port: 8000
8
- dockerfile: Dockerfile
 
 
 
 
9
  tags:
10
  - openenv
11
- - software-engineering
12
  - rust
 
13
  - coding-benchmark
14
-
15
- # Task Definition (Easy -> Medium -> Hard)
16
- # Each task has a grader that scores submissions 0.0-1.0
17
- tasks:
18
- - id: "task_1"
19
- title: "Broken CLI Argument Parser"
20
- difficulty: "easy"
21
- description: "Fix enum variant mismatches and incomplete match arms in a CLI argument parser."
22
- grader:
23
- type: "programmatic"
24
- endpoint: "/grade/task_1"
25
- success_threshold: 0.7
26
- reward_range: [0.0, 1.0]
27
- description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
28
-
29
- - id: "task_2"
30
- title: "Conflicting Borrows in Collection Processing"
31
- difficulty: "easy"
32
- description: "Resolve mutable/immutable borrow conflicts in a string collection processor."
33
- grader:
34
- type: "programmatic"
35
- endpoint: "/grade/task_2"
36
- success_threshold: 0.7
37
- reward_range: [0.0, 1.0]
38
- description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
39
-
40
- - id: "task_3"
41
- title: "Lifetime Annotations"
42
- difficulty: "medium"
43
- description: "Add correct lifetime annotations to enable a struct holding references to work properly."
44
- grader:
45
- type: "programmatic"
46
- endpoint: "/grade/task_3"
47
- success_threshold: 0.6
48
- reward_range: [0.0, 1.0]
49
- description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
50
-
51
- - id: "task_4"
52
- title: "Business Logic Bug"
53
- difficulty: "medium"
54
- description: "Fix off-by-one errors and logic bugs in a financial calculation module."
55
- grader:
56
- type: "programmatic"
57
- endpoint: "/grade/task_4"
58
- success_threshold: 0.6
59
- reward_range: [0.0, 1.0]
60
- description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
61
-
62
- - id: "task_5"
63
- title: "Linked List Management"
64
- difficulty: "medium"
65
- description: "Implement a safe singly-linked list with push, pop, and peek operations."
66
- grader:
67
- type: "programmatic"
68
- endpoint: "/grade/task_5"
69
- success_threshold: 0.6
70
- reward_range: [0.0, 1.0]
71
- description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
72
-
73
- - id: "task_6"
74
- title: "Multi-threaded Deadlocks"
75
- difficulty: "hard"
76
- description: "Identify and fix deadlock conditions in a multi-threaded producer-consumer pattern."
77
- grader:
78
- type: "programmatic"
79
- endpoint: "/grade/task_6"
80
- success_threshold: 0.5
81
- reward_range: [0.0, 1.0]
82
- description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
83
-
84
- - id: "task_7"
85
- title: "Async Borrowing"
86
- difficulty: "hard"
87
- description: "Fix async/await borrowing conflicts in a concurrent file processor."
88
- grader:
89
- type: "programmatic"
90
- endpoint: "/grade/task_7"
91
- success_threshold: 0.5
92
- reward_range: [0.0, 1.0]
93
- description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
94
-
95
- - id: "task_8"
96
- title: "Unsafe FFI Integration"
97
- difficulty: "hard"
98
- description: "Write safe Rust wrappers around unsafe FFI calls to a C library."
99
- grader:
100
- type: "programmatic"
101
- endpoint: "/grade/task_8"
102
- success_threshold: 0.5
103
- reward_range: [0.0, 1.0]
104
- description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
105
-
106
- - id: "task_9"
107
- title: "Inefficient Data Pipelines"
108
- difficulty: "hard"
109
- description: "Optimize a data transformation pipeline using iterators and avoiding unnecessary allocations."
110
- grader:
111
- type: "programmatic"
112
- endpoint: "/grade/task_9"
113
- success_threshold: 0.5
114
- reward_range: [0.0, 1.0]
115
- description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
116
-
117
- - id: "task_10"
118
- title: "Memory Leak Prevention"
119
- difficulty: "hard"
120
- description: "Fix memory leak patterns in a custom allocator and ensure proper Drop implementations."
121
- grader:
122
- type: "programmatic"
123
- endpoint: "/grade/task_10"
124
- success_threshold: 0.4
125
- reward_range: [0.0, 1.0]
126
- description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
127
-
128
- # Definitions for Documentation and Graders
129
- action_space:
130
- type: "RustCoderAction"
131
- description: "A single string containing the fixed Rust code."
132
-
133
- observation_space:
134
- type: "RustCoderObservation"
135
- description: "Observation containing problem description, compilation logs, test results, and reward breakdown."
136
-
 
1
  spec_version: 1
2
  name: rust_coder
 
3
  type: space
4
  runtime: fastapi
5
  app: server.app:app
6
  port: 8000
7
+ description: >
8
+ Rust Coder environment for evaluating LLM agents on real-world Rust systems
9
+ programming tasks: borrow checking, lifetimes, safe concurrency, and memory
10
+ management. Multi-dimensional reward: compilation, correctness, coverage,
11
+ elegance, and efficiency.
12
  tags:
13
  - openenv
14
+ - reinforcement-learning
15
  - rust
16
+ - software-engineering
17
  - coding-benchmark
18
+ - hackathon-2026
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server/app.py CHANGED
@@ -1,19 +1,19 @@
1
  """
2
  FastAPI application for the Rust Coder OpenEnv environment.
3
 
4
- This module is the Hugging Face Space entrypoint (see `openenv.yaml` and Docker `CMD`).
5
-
6
- Endpoints (provided by OpenEnv `create_app`):
7
- - POST /reset
8
- - POST /step
9
- - GET /state
10
- - GET /schema
11
- - WS /ws
12
-
13
- Additional endpoints:
14
- - GET /health
15
- - GET /tasks — list all tasks with grader metadata
16
- - POST /grade/{task_id} — grade a code submission for a specific task
17
  """
18
 
19
  import os
@@ -21,10 +21,10 @@ import logging
21
 
22
  from dotenv import load_dotenv
23
  from fastapi import HTTPException
24
- from pydantic import BaseModel
25
  from openenv.core.env_server.http_server import create_app
26
 
27
- from models import RustCoderAction, RustCoderObservation
28
  from server.rust_coder_environment import RustCoderEnvironment
29
 
30
  load_dotenv()
@@ -43,24 +43,74 @@ app = create_app(
43
  max_concurrent_envs=1,
44
  )
45
 
46
-
47
  # ---------------------------------------------------------------------------
48
- # Task metadata — mirrors openenv.yaml tasks section
49
  # ---------------------------------------------------------------------------
50
 
51
- _TASK_REGISTRY = [
52
- {"id": "task_1", "index": 0, "title": "Broken CLI Argument Parser", "difficulty": "easy", "success_threshold": 0.7},
53
- {"id": "task_2", "index": 1, "title": "Conflicting Borrows in Collection Processing", "difficulty": "easy", "success_threshold": 0.7},
54
- {"id": "task_3", "index": 2, "title": "Lifetime Annotations", "difficulty": "medium", "success_threshold": 0.6},
55
- {"id": "task_4", "index": 3, "title": "Business Logic Bug", "difficulty": "medium", "success_threshold": 0.6},
56
- {"id": "task_5", "index": 4, "title": "Linked List Management", "difficulty": "medium", "success_threshold": 0.6},
57
- {"id": "task_6", "index": 5, "title": "Multi-threaded Deadlocks", "difficulty": "hard", "success_threshold": 0.5},
58
- {"id": "task_7", "index": 6, "title": "Async Borrowing", "difficulty": "hard", "success_threshold": 0.5},
59
- {"id": "task_8", "index": 7, "title": "Unsafe FFI Integration", "difficulty": "hard", "success_threshold": 0.5},
60
- {"id": "task_9", "index": 8, "title": "Inefficient Data Pipelines", "difficulty": "hard", "success_threshold": 0.5},
61
- {"id": "task_10", "index": 9, "title": "Memory Leak Prevention", "difficulty": "hard", "success_threshold": 0.4},
62
- ]
63
- _TASK_BY_ID = {t["id"]: t for t in _TASK_REGISTRY}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
 
66
  # ---------------------------------------------------------------------------
@@ -74,57 +124,59 @@ async def health_check():
74
 
75
  @app.get("/tasks")
76
  async def list_tasks():
77
- """Return the list of all tasks with their grader metadata."""
78
- tasks_out = []
79
- for t in _TASK_REGISTRY:
80
- tasks_out.append({
81
- "id": t["id"],
82
- "title": t["title"],
83
- "difficulty": t["difficulty"],
84
- "grader": {
85
- "type": "programmatic",
86
- "endpoint": f"/grade/{t['id']}",
87
- "success_threshold": t["success_threshold"],
88
- "reward_range": [0.0, 1.0],
89
- "description": "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)",
90
- },
91
- })
92
- return {"tasks": tasks_out, "total": len(tasks_out)}
93
-
94
-
95
- class GradeRequest(BaseModel):
96
- code: str = ""
97
-
98
-
99
- @app.post("/grade/{task_id}")
100
- async def grade_task(task_id: str, request: GradeRequest):
101
  """
102
- Grade a Rust code submission for a specific task.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
- Returns a score in [0.0, 1.0] with detailed breakdown.
105
- This is the programmatic grader endpoint referenced in openenv.yaml.
 
 
 
106
  """
107
- task_meta = _TASK_BY_ID.get(task_id)
108
  if task_meta is None:
109
- raise HTTPException(status_code=404, detail=f"Task '{task_id}' not found.")
 
 
 
110
 
111
  env = RustCoderEnvironment()
112
- # Reset to the specific task
113
  env.reset(start_index=task_meta["index"])
114
-
115
- # Submit the code
116
- action = RustCoderAction(code=request.code)
117
  obs = env.step(action)
118
 
119
- score = float(obs.reward) if obs.reward is not None else 0.0
120
- score = max(0.0, min(1.0, score))
 
121
  success = score >= task_meta["success_threshold"]
122
 
123
  return {
124
  "task_id": task_id,
125
- "score": round(score, 4),
126
- "success": success,
127
- "success_threshold": task_meta["success_threshold"],
 
128
  "reward_breakdown": obs.reward_breakdown,
129
  "compilation_success": obs.compilation_success,
130
  "compilation_output": obs.compilation_output,
@@ -134,7 +186,6 @@ async def grade_task(task_id: str, request: GradeRequest):
134
 
135
  def main(host: str = "0.0.0.0", port: int = 8000) -> None:
136
  import uvicorn
137
-
138
  uvicorn.run(app, host=host, port=port)
139
 
140
 
 
1
  """
2
  FastAPI application for the Rust Coder OpenEnv environment.
3
 
4
+ Entrypoint: server.app:app (see openenv.yaml and Dockerfile CMD)
5
+
6
+ Standard OpenEnv endpoints (via create_app):
7
+ POST /reset — start a new episode
8
+ POST /step — submit an action, receive observation + reward
9
+ GET /state — current episode state
10
+ GET /schema — action / observation JSON schemas
11
+ WS /ws — WebSocket interface
12
+
13
+ Custom endpoints:
14
+ GET /health — health check
15
+ GET /tasks — list all tasks with action schema
16
+ POST /grader?task_id=X — programmatic grader for task X
17
  """
18
 
19
  import os
 
21
 
22
  from dotenv import load_dotenv
23
  from fastapi import HTTPException
24
+
25
  from openenv.core.env_server.http_server import create_app
26
 
27
+ from models import RustCoderAction, RustCoderObservation, TaskInfo
28
  from server.rust_coder_environment import RustCoderEnvironment
29
 
30
  load_dotenv()
 
43
  max_concurrent_envs=1,
44
  )
45
 
 
46
  # ---------------------------------------------------------------------------
47
+ # Task registry
48
  # ---------------------------------------------------------------------------
49
 
50
+ TASK_REGISTRY = {
51
+ "task_1": {
52
+ "index": 0,
53
+ "difficulty": "easy",
54
+ "description": "Fix enum variant mismatches and incomplete match arms in a CLI argument parser.",
55
+ "success_threshold": 0.7,
56
+ },
57
+ "task_2": {
58
+ "index": 1,
59
+ "difficulty": "easy",
60
+ "description": "Resolve mutable/immutable borrow conflicts in a string collection processor.",
61
+ "success_threshold": 0.7,
62
+ },
63
+ "task_3": {
64
+ "index": 2,
65
+ "difficulty": "medium",
66
+ "description": "Add correct lifetime annotations so a struct holding references compiles and works.",
67
+ "success_threshold": 0.6,
68
+ },
69
+ "task_4": {
70
+ "index": 3,
71
+ "difficulty": "medium",
72
+ "description": "Fix off-by-one errors and logic bugs in a financial calculation module.",
73
+ "success_threshold": 0.6,
74
+ },
75
+ "task_5": {
76
+ "index": 4,
77
+ "difficulty": "medium",
78
+ "description": "Implement a safe singly-linked list with push, pop, and peek operations.",
79
+ "success_threshold": 0.6,
80
+ },
81
+ "task_6": {
82
+ "index": 5,
83
+ "difficulty": "hard",
84
+ "description": "Identify and fix deadlock conditions in a multi-threaded producer-consumer pattern.",
85
+ "success_threshold": 0.5,
86
+ },
87
+ "task_7": {
88
+ "index": 6,
89
+ "difficulty": "hard",
90
+ "description": "Fix async/await borrowing conflicts in a concurrent file processor.",
91
+ "success_threshold": 0.5,
92
+ },
93
+ "task_8": {
94
+ "index": 7,
95
+ "difficulty": "hard",
96
+ "description": "Write safe Rust wrappers around unsafe FFI calls to a C library.",
97
+ "success_threshold": 0.5,
98
+ },
99
+ "task_9": {
100
+ "index": 8,
101
+ "difficulty": "hard",
102
+ "description": "Optimize a data pipeline using iterators and avoiding unnecessary allocations.",
103
+ "success_threshold": 0.5,
104
+ },
105
+ "task_10": {
106
+ "index": 9,
107
+ "difficulty": "hard",
108
+ "description": "Fix memory leak patterns and ensure correct Drop implementations.",
109
+ "success_threshold": 0.4,
110
+ },
111
+ }
112
+
113
+ TASK_IDS = list(TASK_REGISTRY.keys())
114
 
115
 
116
  # ---------------------------------------------------------------------------
 
124
 
125
  @app.get("/tasks")
126
  async def list_tasks():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  """
128
+ Return all available tasks.
129
+
130
+ The competition platform enumerates this endpoint to discover tasks.
131
+ Each entry includes task_id, difficulty, description, and action_schema.
132
+ """
133
+ return [
134
+ TaskInfo(
135
+ task_id=task_id,
136
+ difficulty=task["difficulty"],
137
+ description=task["description"],
138
+ action_schema=RustCoderAction.model_json_schema(),
139
+ )
140
+ for task_id, task in TASK_REGISTRY.items()
141
+ ]
142
+
143
+
144
+ @app.post("/grader")
145
+ async def grader(task_id: str, action: RustCoderAction):
146
+ """
147
+ Programmatic grader for a specific task.
148
+
149
+ Usage: POST /grader?task_id=task_1
150
+ Body: {"code": "<rust source code>"}
151
 
152
+ Scores are strictly in the open interval (0, 1):
153
+ - Minimum 0.01 — floor for any submission (even empty/non-compiling code)
154
+ - Maximum 0.99 — ceiling so no submission scores a theoretical perfect 1.0
155
+ - Natural range based on: Compilation(40%) + Correctness(20%) +
156
+ Coverage(20%) + Elegance(10%) + Efficiency(10%)
157
  """
158
+ task_meta = TASK_REGISTRY.get(task_id)
159
  if task_meta is None:
160
+ raise HTTPException(
161
+ status_code=404,
162
+ detail=f"Unknown task_id '{task_id}'. Valid IDs: {TASK_IDS}",
163
+ )
164
 
165
  env = RustCoderEnvironment()
 
166
  env.reset(start_index=task_meta["index"])
 
 
 
167
  obs = env.step(action)
168
 
169
+ raw_score = float(obs.reward) if obs.reward is not None else 0.0
170
+ # Enforce strictly open interval (0, 1) — never exactly 0.0 or 1.0
171
+ score = round(max(0.01, min(0.99, raw_score)), 4)
172
  success = score >= task_meta["success_threshold"]
173
 
174
  return {
175
  "task_id": task_id,
176
+ "score": score,
177
+ "passed": 1 if success else 0,
178
+ "total": 1,
179
+ "metric": "rust_code_quality",
180
  "reward_breakdown": obs.reward_breakdown,
181
  "compilation_success": obs.compilation_success,
182
  "compilation_output": obs.compilation_output,
 
186
 
187
  def main(host: str = "0.0.0.0", port: int = 8000) -> None:
188
  import uvicorn
 
189
  uvicorn.run(app, host=host, port=port)
190
 
191