ncncomplete committed on
Commit
d145b94
·
verified ·
1 Parent(s): 092031c

Upload folder using huggingface_hub

Browse files
Files changed (7) hide show
  1. README.md +35 -8
  2. client.py +7 -0
  3. inference.py +153 -121
  4. models.py +17 -9
  5. server/app.py +26 -2
  6. server/python_codeact_env.py +53 -55
  7. server/task_bank.py +157 -0
README.md CHANGED
@@ -13,7 +13,9 @@ tags:
13
 
14
  # Coding Environment
15
 
16
- A Python code execution environment that runs arbitrary Python code and returns results. Perfect for testing code execution infrastructure and demonstrating environment usage patterns.
 
 
17
 
18
  ## Quick Start
19
 
@@ -77,20 +79,45 @@ docker build -t coding-env:latest -f envs/coding_env/server/Dockerfile .
77
  ## Environment Details
78
 
79
  ### Action
80
- **CodeAction**: Contains a single field
81
- - `code` (str) - The Python code to execute
 
 
 
82
 
83
  ### Observation
84
- **CodeObservation**: Contains the execution results
85
- - `stdout` (str) - Standard output from code execution
86
- - `stderr` (str) - Standard error from code execution
87
- - `exit_code` (int) - Exit code (0 for success, non-zero for errors)
 
 
 
 
88
 
89
  ### State
90
  **CodeState**: Tracks execution state
91
  - `episode_id` (str) - Unique identifier for the episode
92
  - `step_count` (int) - Number of steps taken
93
- - `last_exit_code` (int) - Exit code from the last execution
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  ## Advanced Usage
96
 
 
13
 
14
  # Coding Environment
15
 
16
+ A code-review benchmark environment with three graded tasks (easy/medium/hard).
17
+ Each episode provides a buggy snippet and asks the agent to return a structured
18
+ review (`bug_type`, `line_number`, `review`, `confidence`).
19
 
20
  ## Quick Start
21
 
 
79
  ## Environment Details
80
 
81
  ### Action
82
+ **CodeAction** fields:
83
+ - `review` (str) - Human-readable review summary
84
+ - `bug_type` (str) - One of `syntax | logic | security | none`
85
+ - `line_number` (int) - Suspected faulty line
86
+ - `confidence` (float) - Confidence score in `[0.0, 1.0]`
87
 
88
  ### Observation
89
+ **CodeObservation** fields:
90
+ - `task_id` (str) - Current task id
91
+ - `difficulty` (str) - Task difficulty (`easy|medium|hard`)
92
+ - `task_description` (str) - Review instructions
93
+ - `code_snippet` (str) - Code to analyze
94
+ - `previous_feedback` (str) - Grader feedback from latest step
95
+ - `reward` (float) - Normalized score contribution `[0.0, 1.0]`
96
+ - `done` (bool) - Episode termination flag
97
 
98
  ### State
99
  **CodeState**: Tracks execution state
100
  - `episode_id` (str) - Unique identifier for the episode
101
  - `step_count` (int) - Number of steps taken
102
+ - `task_id` (str) - Active task id
103
+ - `difficulty` (str) - Active task difficulty
104
+ - `last_score` (float) - Last normalized score
105
+
106
+ ## Built-in Tasks and Graders
107
+
108
+ The server exposes:
109
+ - `GET /tasks` to list all benchmark tasks.
110
+ - `GET /grader?task_id=<id>&episode_id=<id>` to read final normalized score.
111
+
112
+ Shipped tasks:
113
+ - `task_easy_1` (logic)
114
+ - `task_medium_1` (security)
115
+ - `task_hard_1` (logic/performance-concurrency)
116
+
117
+ Rewards are in `[0.0, 1.0]`, with partial credit awarded for:
118
+ - bug type correctness (+0.50)
119
+ - line number accuracy (exact match +0.30, off by one line +0.15)
120
+ - review evidence keywords (+0.10 per expected keyword found, capped at +0.20)
121
 
122
  ## Advanced Usage
123
 
client.py CHANGED
@@ -27,6 +27,10 @@ class CodingEnv(EnvClient[CodeAction, CodeObservation, CodeState]):
27
  def _step_payload(self, action: CodeAction) -> dict:
28
  # Shape expected by the server's /step endpoint under "action"
29
  return {
 
 
 
 
30
  "code": action.code,
31
  }
32
 
@@ -53,4 +57,7 @@ class CodingEnv(EnvClient[CodeAction, CodeObservation, CodeState]):
53
  episode_id=payload.get("episode_id"),
54
  step_count=payload.get("step_count", 0),
55
  last_exit_code=payload.get("last_exit_code", 0),
 
 
 
56
  )
 
27
  def _step_payload(self, action: CodeAction) -> dict:
28
  # Shape expected by the server's /step endpoint under "action"
29
  return {
30
+ "review": action.review,
31
+ "bug_type": action.bug_type,
32
+ "line_number": action.line_number,
33
+ "confidence": action.confidence,
34
  "code": action.code,
35
  }
36
 
 
57
  episode_id=payload.get("episode_id"),
58
  step_count=payload.get("step_count", 0),
59
  last_exit_code=payload.get("last_exit_code", 0),
60
+ task_id=payload.get("task_id", ""),
61
+ difficulty=payload.get("difficulty", ""),
62
+ last_score=float(payload.get("last_score", 0.0)),
63
  )
inference.py CHANGED
@@ -1,171 +1,203 @@
1
  #!/usr/bin/env python3
2
- """Code Review Environment Baseline Evaluation.
3
 
4
- This script is hardened for validator compatibility:
5
- - Always prints [START]/[STEP]/[END] to stdout with flush=True
6
- - Avoids failing before first [START] due to optional deps/credentials
7
- - Never redirects stdout
 
8
  """
9
 
10
  from __future__ import annotations
11
 
12
  import json
13
  import os
14
- from typing import Any, Dict, Optional
15
 
16
- try:
17
- import requests
18
- except Exception:
19
- requests = None # type: ignore[assignment]
20
 
21
 
22
- # ---------------------------------------------------------------------------
23
- # Configuration
24
- # ---------------------------------------------------------------------------
25
-
26
- # Required checklist variables:
27
- # - API_BASE_URL and MODEL_NAME have defaults
28
- # - HF_TOKEN has no default
29
- # - LOCAL_IMAGE_NAME is optional
30
- API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000")
31
- MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")
32
  HF_TOKEN = os.getenv("HF_TOKEN")
33
  LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
34
 
35
- # List of task IDs to evaluate
36
- TASKS = os.getenv("TASKS", "task_1,task_2,task_3").split(",")
 
 
37
 
38
- # ---------------------------------------------------------------------------
39
- # Main Task Runner
40
- # ---------------------------------------------------------------------------
41
 
 
 
42
 
43
- def _build_action(task_description: str, code_snippet: str) -> Dict[str, Any]:
44
- """Build an action via LLM when available; otherwise return safe fallback."""
45
- fallback_action: Dict[str, Any] = {
46
- "review": "Unable to run model; submitting safe fallback review.",
47
- "bug_type": "none",
48
- "line_number": -1,
49
- "confidence": 0.0,
50
- }
51
 
52
- if not HF_TOKEN:
53
- return fallback_action
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- try:
56
- from openai import OpenAI # Lazy import to avoid failing at module import time
57
 
58
- client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  except Exception:
60
- return fallback_action
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- prompt = f"""You are a code reviewer. {task_description}
 
63
 
64
- Code to review:
 
 
 
65
  ```python
66
  {code_snippet}
67
  ```
68
 
69
- Respond ONLY with valid JSON, no markdown:
70
- {{
71
- "review": "your detailed analysis",
72
- "bug_type": "syntax or logic or security or none",
73
- "line_number": <integer>,
74
- "confidence": <float 0.0-1.0>
75
- }}"""
76
-
77
  try:
78
  response = client.chat.completions.create(
79
  model=MODEL_NAME,
80
- messages=[{"role": "user", "content": prompt}],
81
  temperature=0.0,
 
82
  )
83
  raw = (response.choices[0].message.content or "").strip()
84
  raw = raw.replace("```json", "").replace("```", "").strip()
85
  parsed = json.loads(raw)
86
- if isinstance(parsed, dict):
87
- return parsed
88
- return fallback_action
89
- except Exception:
90
- return fallback_action
91
-
92
-
93
- def _safe_post_json(url: str, payload: Dict[str, Any]) -> Optional[Dict[str, Any]]:
94
- """Return JSON body or None on any network/JSON failure."""
95
- if requests is None:
96
- return None
97
- try:
98
- response = requests.post(url, json=payload, timeout=30)
99
- return response.json()
100
- except Exception:
101
- return None
102
-
103
-
104
- def _safe_get_json(url: str) -> Optional[Dict[str, Any]]:
105
- """Return JSON body or None on any network/JSON failure."""
106
- if requests is None:
107
- return None
108
- try:
109
- response = requests.get(url, timeout=30)
110
- return response.json()
111
  except Exception:
112
- return None
113
 
114
 
115
- def run_task(task_id: str) -> float:
116
- """Run a single code review task and return the score."""
117
- print(f"[START] task={task_id}", flush=True)
118
-
119
  score = 0.0
120
- steps = 1
121
-
122
- reset_data = _safe_post_json(f"{API_BASE_URL}/reset", {"task_id": task_id}) or {}
123
- obs = reset_data.get("observation", {}) if isinstance(reset_data, dict) else {}
124
-
125
- code_snippet = obs.get("code_snippet", "")
126
- task_description = obs.get("task_description", "Review the provided code.")
127
- action = _build_action(str(task_description), str(code_snippet))
128
-
129
- # If stepping fails, we still emit structured output with reward=0.0
130
- _safe_post_json(f"{API_BASE_URL}/step", {"action": action})
131
 
132
- grader_data = _safe_get_json(
133
- f"{API_BASE_URL}/grader?task_id={task_id}&episode_id=baseline"
134
- ) or {}
135
- if isinstance(grader_data, dict):
136
- try:
137
- score = float(grader_data.get("score", 0.0))
138
- except Exception:
139
- score = 0.0
140
 
141
- print(f"[STEP] step=1 reward={score}", flush=True)
142
- print(f"[END] task={task_id} score={score} steps={steps}", flush=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
  return score
145
 
146
 
147
- # ---------------------------------------------------------------------------
148
- # Entrypoint
149
- # ---------------------------------------------------------------------------
150
-
151
-
152
- def main():
153
- scores = {}
154
- normalized_tasks = [t.strip() for t in TASKS if t.strip()]
155
- if not normalized_tasks:
156
- normalized_tasks = ["task_1"]
157
-
158
- for task_id in normalized_tasks:
159
- scores[task_id] = run_task(task_id)
160
-
161
- average = round(sum(scores.values()) / len(scores), 4)
162
- scores["average"] = average
163
-
164
- print(f"\nBaseline Results: {json.dumps(scores, indent=2)}", flush=True)
165
 
166
- with open("baseline_scores.json", "w") as f:
167
- json.dump(scores, f, indent=2)
 
168
 
 
 
 
169
  return scores
170
 
171
 
 
1
  #!/usr/bin/env python3
2
+ """Hackathon baseline inference for coding_env.
3
 
4
+ MANDATORY environment variables handled here:
5
+ - API_BASE_URL (defaulted)
6
+ - MODEL_NAME (defaulted)
7
+ - HF_TOKEN (no default)
8
+ - LOCAL_IMAGE_NAME (optional, for local Docker workflows)
9
  """
10
 
11
  from __future__ import annotations
12
 
13
  import json
14
  import os
15
+ from typing import Any, Dict, List
16
 
17
+ import requests
18
+ from openai import OpenAI
 
 
19
 
20
 
21
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
22
+ MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
 
 
 
 
 
 
 
 
23
  HF_TOKEN = os.getenv("HF_TOKEN")
24
  LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
25
 
26
+ ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:8000")
27
+ BENCHMARK = os.getenv("BENCHMARK", "coding_env")
28
+ MAX_STEPS = int(os.getenv("MAX_STEPS", "1"))
29
+ SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.60"))
30
 
 
 
 
31
 
32
+ def _bool_text(value: bool) -> str:
33
+ return "true" if value else "false"
34
 
 
 
 
 
 
 
 
 
35
 
36
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` record that opens a task run (flushed immediately)."""
    record = f"[START] task={task} env={env} model={model}"
    print(record, flush=True)
38
+
39
+
40
def log_step(
    step: int, action: str, reward: float, done: bool, error: str | None
) -> None:
    """Print one ``[STEP]`` record on a single line.

    Args:
        step: 1-based index of the step being reported.
        action: Compact textual summary of the submitted action.
        reward: Reward for this step; printed with two decimals.
        done: Whether the episode ended on this step.
        error: Error text for the step, or ``None``/empty when there was none
            (printed as ``null``).
    """
    # Exception messages can span several lines; collapse every whitespace run
    # so the record always stays on exactly one line.
    error_value = " ".join(error.split()) if error else ""
    if not error_value:
        error_value = "null"
    print(
        f"[STEP] step={step} action={action} reward={reward:.2f} "
        f"done={_bool_text(done)} error={error_value}",
        flush=True,
    )
49
 
 
 
50
 
51
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the closing ``[END]`` record for a task run.

    ``score`` and every entry of ``rewards`` are printed with two decimals;
    the rewards are joined with commas.
    """
    formatted_rewards = [f"{value:.2f}" for value in rewards]
    rewards_str = ",".join(formatted_rewards)
    record = (
        f"[END] success={_bool_text(success)} steps={steps} "
        f"score={score:.2f} rewards={rewards_str}"
    )
    print(record, flush=True)
58
+
59
+
60
def _safe_json(method: str, url: str, **kwargs: Any) -> Dict[str, Any]:
    """Send an HTTP request and return its JSON object body, or ``{}`` on failure.

    Args:
        method: HTTP verb passed to ``requests.request``.
        url: Target URL.
        **kwargs: Extra keyword arguments forwarded to ``requests.request``
            (for example ``json=`` or ``params=``). ``timeout`` defaults to 30
            seconds but may be overridden by the caller.

    Returns:
        The decoded body when it is a JSON object; otherwise an empty dict.
        Any failure (transport error, non-2xx status, invalid JSON) also
        yields an empty dict.
    """
    # Use 30s as a default instead of a fixed argument: passing ``timeout`` both
    # explicitly and via **kwargs would raise a TypeError that the handler below
    # swallows, so a caller-supplied timeout used to make every call return {}.
    kwargs.setdefault("timeout", 30)
    try:
        response = requests.request(method, url, **kwargs)
        response.raise_for_status()
        data = response.json()
    except Exception:
        # Deliberate best-effort boundary: callers treat {} as "no data".
        return {}
    return data if isinstance(data, dict) else {}
70
+
71
+
72
def _task_list() -> List[str]:
    """Return the task ids advertised by ``GET /tasks``.

    Falls back to the three built-in task ids when the endpoint returns
    nothing usable.
    """
    payload = _safe_json("GET", f"{ENV_BASE_URL}/tasks")
    entries = payload.get("tasks", [])
    if isinstance(entries, list):
        # Keep only well-formed entries that carry a non-empty task_id.
        task_ids = [
            str(entry["task_id"])
            for entry in entries
            if isinstance(entry, dict) and entry.get("task_id")
        ]
        if task_ids:
            return task_ids
    return ["task_easy_1", "task_medium_1", "task_hard_1"]
83
+
84
+
85
+ def _build_action(client: OpenAI | None, task_description: str, code_snippet: str) -> Dict[str, Any]:
86
+ fallback = {
87
+ "review": "Potential logic issue found; needs targeted fix.",
88
+ "bug_type": "logic",
89
+ "line_number": 1,
90
+ "confidence": 0.20,
91
+ }
92
 
93
+ if client is None:
94
+ return fallback
95
 
96
+ prompt = f"""You are a strict code reviewer.
97
+ Task: {task_description}
98
+
99
+ Code:
100
  ```python
101
  {code_snippet}
102
  ```
103
 
104
+ Return ONLY valid JSON with keys:
105
+ review (string), bug_type (one of syntax|logic|security|none),
106
+ line_number (integer), confidence (0.0-1.0 float)
107
+ """
 
 
 
 
108
  try:
109
  response = client.chat.completions.create(
110
  model=MODEL_NAME,
 
111
  temperature=0.0,
112
+ messages=[{"role": "user", "content": prompt}],
113
  )
114
  raw = (response.choices[0].message.content or "").strip()
115
  raw = raw.replace("```json", "").replace("```", "").strip()
116
  parsed = json.loads(raw)
117
+ if not isinstance(parsed, dict):
118
+ return fallback
119
+ return {
120
+ "review": str(parsed.get("review", fallback["review"])),
121
+ "bug_type": str(parsed.get("bug_type", fallback["bug_type"])),
122
+ "line_number": int(parsed.get("line_number", fallback["line_number"])),
123
+ "confidence": float(parsed.get("confidence", fallback["confidence"])),
124
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  except Exception:
126
+ return fallback
127
 
128
 
129
def run_task(task_id: str, client: OpenAI | None) -> float:
    """Run one benchmark episode for ``task_id`` and return its final score.

    Emits a ``[START]`` record, one ``[STEP]`` record per submitted action and
    a closing ``[END]`` record, even when the episode fails part-way through.

    Args:
        task_id: Identifier of the task to reset the environment with.
        client: Chat-completions client forwarded to ``_build_action``; ``None``
            makes every action the fallback action.

    Returns:
        The score reported by ``GET /grader``, or the last step reward when the
        grader reply carries no score; ``0.0`` if the episode raised.
    """
    episode_id = f"baseline-{task_id}"
    rewards: List[float] = []
    score = 0.0
    success = False
    last_error: str | None = None
    steps_taken = 0

    log_start(task_id, BENCHMARK, MODEL_NAME)

    try:
        reset_data = _safe_json(
            "POST",
            f"{ENV_BASE_URL}/reset",
            json={"task_id": task_id, "episode_id": episode_id},
        )
        obs = reset_data.get("observation")
        if not isinstance(obs, dict):
            # A missing or null observation is treated as "no details available".
            obs = {}
        task_description = str(obs.get("task_description", "Review code quality and bugs."))
        code_snippet = str(obs.get("code_snippet", ""))

        for step_num in range(1, MAX_STEPS + 1):
            action = _build_action(client, task_description, code_snippet)
            action_str = (
                f"bug_type={action['bug_type']};"
                f"line={action['line_number']};"
                f"confidence={float(action['confidence']):.2f}"
            )

            step_data = _safe_json("POST", f"{ENV_BASE_URL}/step", json={"action": action})
            reward = float(step_data.get("reward", 0.0) or 0.0)
            done = bool(step_data.get("done", False))
            obs_after = step_data.get("observation")
            if not isinstance(obs_after, dict):
                obs_after = {}
            raw_error = obs_after.get("last_action_error")
            last_error = str(raw_error) if raw_error else None

            rewards.append(reward)
            steps_taken = step_num
            log_step(step_num, action_str, reward, done, last_error)

            if done:
                break

        # Let the HTTP layer encode the query string so ids containing
        # characters such as "&", "#" or spaces cannot corrupt the request.
        grader_data = _safe_json(
            "GET",
            f"{ENV_BASE_URL}/grader",
            params={"task_id": task_id, "episode_id": episode_id},
        )
        raw_score = grader_data.get("score")
        if raw_score is None:
            raw_score = rewards[-1] if rewards else 0.0
        score = float(raw_score)
        success = score >= SUCCESS_SCORE_THRESHOLD
    except Exception as exc:
        last_error = str(exc)
        if steps_taken == 0:
            # Always report at least one step so the record sequence is complete.
            log_step(1, "bug_type=none;line=-1;confidence=0.00", 0.0, True, last_error)
            rewards.append(0.0)
            steps_taken = 1
        score = 0.0
        success = False
    finally:
        log_end(success, max(1, steps_taken), score, rewards or [0.0])

    return score
188
 
189
 
190
def main() -> Dict[str, float]:
    """Run every listed task and print a compact JSON summary.

    Returns:
        Mapping of task id to score, plus an ``"average"`` entry rounded to
        four decimals (``0.0`` when no task was run).
    """
    client = None
    if HF_TOKEN:
        client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)

    scores: Dict[str, float] = {
        task_id: run_task(task_id, client) for task_id in _task_list()
    }

    mean = sum(scores.values()) / len(scores) if scores else 0.0
    scores["average"] = round(mean, 4)

    summary = json.dumps({"summary": scores}, separators=(",", ":"))
    print(summary, flush=True)
    return scores
202
 
203
 
models.py CHANGED
@@ -10,25 +10,33 @@ from openenv.core.env_server.interfaces import Action, Observation, State
10
 
11
 
12
  class CodeAction(Action):
13
- """
14
- Represents a single code execution request.
15
- """
16
 
17
- code: str
18
- # Optional: future fields like 'lint': bool, 'timeout_s': float, etc.
 
 
 
 
19
 
20
 
21
  class CodeObservation(Observation):
22
- """
23
- Result of executing code in the environment.
24
- """
25
 
26
  stdout: str = ""
27
  stderr: str = ""
28
  exit_code: int = 0
 
 
 
 
 
29
 
30
 
31
  class CodeState(State):
32
- """State for CodeAct environment with persistent execution context."""
33
 
34
  last_exit_code: int = 0
 
 
 
 
10
 
11
 
12
class CodeAction(Action):
    """Represents a single code-review submission."""

    # Human-readable review summary.
    review: str = ""
    # Reported bug category; the README lists syntax | logic | security | none.
    bug_type: str = "none"
    # Suspected faulty line; defaults to -1.
    line_number: int = -1
    # Confidence score; documented range is [0.0, 1.0] (not enforced here).
    confidence: float = 0.0
    # Optional fallback for compatibility with earlier code-exec flows.
    code: str = ""
21
 
22
 
23
class CodeObservation(Observation):
    """Observation returned by the code-review environment."""

    # Fields kept from the earlier code-execution observation.
    stdout: str = ""
    stderr: str = ""
    exit_code: int = 0
    # Current task id.
    task_id: str = ""
    # Task difficulty (easy | medium | hard per the README).
    difficulty: str = ""
    # Review instructions for the task.
    task_description: str = ""
    # Code to analyze.
    code_snippet: str = ""
    # Grader feedback from the latest step.
    previous_feedback: str = ""
34
 
35
 
36
class CodeState(State):
    """State for code-review episodes."""

    # Kept from the earlier code-execution state.
    last_exit_code: int = 0
    # Active task id.
    task_id: str = ""
    # Active task difficulty.
    difficulty: str = ""
    # Last normalized score.
    last_score: float = 0.0
server/app.py CHANGED
@@ -21,8 +21,16 @@ Usage:
21
  python -m envs.coding_env.server.app
22
  """
23
 
24
- from coding_env.models import CodeAction, CodeObservation
25
- from coding_env.server.python_codeact_env import PythonCodeActEnv
 
 
 
 
 
 
 
 
26
  from openenv.core.env_server import create_app
27
 
28
  # Create the app with web interface and README integration
@@ -30,6 +38,22 @@ from openenv.core.env_server import create_app
30
  app = create_app(PythonCodeActEnv, CodeAction, CodeObservation, env_name="coding_env")
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  if __name__ == "__main__":
34
  import uvicorn
35
 
 
21
  python -m envs.coding_env.server.app
22
  """
23
 
24
+ from fastapi import Query
25
+
26
+ try:
27
+ from coding_env.models import CodeAction, CodeObservation
28
+ from coding_env.server.python_codeact_env import PythonCodeActEnv
29
+ from coding_env.server.task_bank import get_episode_score, list_tasks
30
+ except ImportError:
31
+ from ..models import CodeAction, CodeObservation
32
+ from .python_codeact_env import PythonCodeActEnv
33
+ from .task_bank import get_episode_score, list_tasks
34
  from openenv.core.env_server import create_app
35
 
36
  # Create the app with web interface and README integration
 
38
  app = create_app(PythonCodeActEnv, CodeAction, CodeObservation, env_name="coding_env")
39
 
40
 
41
@app.get("/tasks", tags=["Environment Info"])
def tasks():
    """List every benchmark task together with its difficulty."""
    available = list_tasks()
    return {"tasks": available}
45
+
46
+
47
@app.get("/grader", tags=["Environment Info"])
def grader(
    task_id: str = Query(..., description="Task identifier"),
    episode_id: str = Query(..., description="Episode identifier"),
):
    """Report the recorded score for a task/episode pair as a float."""
    recorded = float(get_episode_score(task_id, episode_id))
    payload = {"task_id": task_id, "episode_id": episode_id}
    payload["score"] = recorded
    return payload
55
+
56
+
57
  if __name__ == "__main__":
58
  import uvicorn
59
 
server/python_codeact_env.py CHANGED
@@ -4,75 +4,68 @@
4
  # This source code is licensed under the BSD-style license found in the
5
  # LICENSE file in the root directory of this source tree.
6
 
7
- """
8
- Python Code Action Environment.
9
-
10
- This module provides a server-side environment implementation for executing
11
- Python code actions using PyExecutor.
12
- """
13
 
14
  import uuid
 
15
 
16
  from openenv.core.env_server.interfaces import Action, Environment, Observation
17
 
18
  from ..models import CodeAction, CodeObservation, CodeState
19
- from .python_executor import PyExecutor
20
- from .transforms import create_safe_coding_transform
21
 
22
 
23
  class PythonCodeActEnv(Environment):
24
  """
25
- Python Code Action Environment for executing code and tracking state.
26
-
27
- This environment executes Python code submitted as CodeAction during step,
28
- maintains the last exit code in its state, and returns results wrapped
29
- in CodeObservation.
30
-
31
- Args:
32
- transform: Optional transform to apply to observations
33
- additional_imports: List of additional module imports to authorize
34
- (e.g., ["numpy", "pandas", "matplotlib"])
35
-
36
- Example:
37
- >>> env = PythonCodeActEnv()
38
- >>> obs = env.reset()
39
- >>> action = CodeAction(code="print('Hello, World!')")
40
- >>> obs = env.step(action)
41
- >>> print(obs.stdout) # "Hello, World!\n"
42
- >>> print(obs.exit_code) # 0
43
- >>> print(env.state.last_exit_code) # 0
44
  """
45
 
46
  def __init__(
47
  self,
48
  ):
49
- self.transform = create_safe_coding_transform()
50
- self._executor = PyExecutor()
51
  self._state = CodeState()
 
52
 
53
- def reset(self) -> Observation:
 
 
 
 
 
54
  """
55
- Reset environment and start fresh execution session.
56
-
57
- Returns:
58
- Initial observation with empty stdout/stderr and exit_code=0
59
  """
60
- # Initialize fresh state
61
- self._state = CodeState(episode_id=str(uuid.uuid4()), step_count=0)
62
- # Add last_exit_code to state
 
 
 
 
 
 
 
 
63
  self._state.last_exit_code = 0
64
 
65
- # Reset executor to clear any previously defined variables/functions
66
- self._executor = PyExecutor()
67
-
68
- # Reset transform to clear any accumulated state
69
- self.transform = create_safe_coding_transform()
70
-
71
- # Return initial observation
72
  observation = CodeObservation(
73
- stdout="",
74
  stderr="",
75
  exit_code=0,
 
 
 
 
 
 
 
 
76
  )
77
 
78
  return self._apply_transform(observation)
@@ -93,20 +86,25 @@ class PythonCodeActEnv(Environment):
93
  if not isinstance(action, CodeAction):
94
  raise ValueError(f"Expected CodeAction, got {type(action)}")
95
 
96
- # Execute the code using PyExecutor
97
- result = self._executor.run(action.code)
98
 
99
- # Update state
100
  self._state.step_count += 1
101
- self._state.last_exit_code = result.exit_code
 
 
102
 
103
- # Create observation from execution result
104
- # Include code in metadata for transform reward calculation
105
  observation = CodeObservation(
106
- stdout=result.stdout,
107
- stderr=result.stderr,
108
- exit_code=result.exit_code,
109
- metadata={"last_code": action.code},
 
 
 
 
 
 
110
  )
111
 
112
  return self._apply_transform(observation)
 
4
  # This source code is licensed under the BSD-style license found in the
5
  # LICENSE file in the root directory of this source tree.
6
 
7
+ """Code review environment with task-based grading and normalized rewards."""
 
 
 
 
 
8
 
9
  import uuid
10
+ from typing import Any
11
 
12
  from openenv.core.env_server.interfaces import Action, Environment, Observation
13
 
14
  from ..models import CodeAction, CodeObservation, CodeState
15
+ from .task_bank import get_task, grade_action, list_tasks, record_episode_score
 
16
 
17
 
18
  class PythonCodeActEnv(Environment):
19
  """
20
+ Task-driven code-review environment.
21
+
22
+ Episodes are single-step:
23
+ 1. `reset(task_id=...)` returns a code snippet + task description.
24
+ 2. Agent submits CodeAction(review, bug_type, line_number, confidence).
25
+ 3. `step()` returns graded reward in [0.0, 1.0] and done=True.
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  """
27
 
28
def __init__(self):
    """Create the environment with no transform, an empty state and the easy task selected."""
    super().__init__(transform=None)
    # Task used by reset() when the caller does not name one.
    self._current_task_id = "task_easy_1"
    self._state = CodeState()
34
 
35
def reset(
    self,
    seed: int | None = None,
    episode_id: str | None = None,
    **kwargs: Any,
) -> Observation:
    """
    Reset environment and pick a task (easy/medium/hard).

    Args:
        seed: Accepted but not used by this method.
        episode_id: Identifier for the new episode; a random UUID is
            generated when omitted or empty.
        **kwargs: May carry ``task_id``. When it is absent or empty the
            most recently selected task is reused.

    Returns:
        The initial observation for the selected task (``done=False``,
        ``reward=0.0``), passed through ``_apply_transform``.

    Raises:
        ValueError: From ``get_task`` when ``task_id`` is not a known task.
    """
    # ``task_id`` may be present but None/empty (e.g. a JSON null); treat
    # that like "not given" rather than looking up the literal "None".
    requested_task_id = kwargs.get("task_id") or self._current_task_id
    task = get_task(str(requested_task_id))
    self._current_task_id = task.task_id

    self._state = CodeState(
        episode_id=episode_id or str(uuid.uuid4()),
        step_count=0,
        task_id=task.task_id,
        difficulty=task.difficulty,
        last_score=0.0,
    )
    self._state.last_exit_code = 0

    observation = CodeObservation(
        stdout="Task initialized.",
        stderr="",
        exit_code=0,
        task_id=task.task_id,
        difficulty=task.difficulty,
        task_description=task.task_description,
        code_snippet=task.code_snippet,
        previous_feedback="",
        done=False,
        reward=0.0,
        metadata={"available_tasks": list_tasks()},
    )

    return self._apply_transform(observation)
 
86
  if not isinstance(action, CodeAction):
87
  raise ValueError(f"Expected CodeAction, got {type(action)}")
88
 
89
+ task = get_task(self._state.task_id or self._current_task_id)
90
+ reward, feedback = grade_action(action, task)
91
 
 
92
  self._state.step_count += 1
93
+ self._state.last_exit_code = 0
94
+ self._state.last_score = reward
95
+ record_episode_score(task.task_id, self._state.episode_id or "default", reward)
96
 
 
 
97
  observation = CodeObservation(
98
+ stdout=feedback,
99
+ stderr="",
100
+ exit_code=0,
101
+ task_id=task.task_id,
102
+ difficulty=task.difficulty,
103
+ task_description=task.task_description,
104
+ code_snippet=task.code_snippet,
105
+ previous_feedback=feedback,
106
+ reward=reward,
107
+ done=True,
108
  )
109
 
110
  return self._apply_transform(observation)
server/task_bank.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Task definitions and grading utilities for coding_env."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Dict, List, Tuple
7
+
8
+ try:
9
+ from coding_env.models import CodeAction
10
+ except ImportError:
11
+ from ..models import CodeAction
12
+
13
+
14
@dataclass(frozen=True)
class CodeReviewTask:
    """Immutable definition of one review task and its expected answer."""

    # Public task metadata shown to the agent.
    task_id: str
    difficulty: str
    task_description: str
    code_snippet: str
    # Expected answer used for grading.
    expected_bug_type: str
    expected_line_number: int
    expected_keywords: Tuple[str, ...]
23
+
24
+
25
+ TASKS: Dict[str, CodeReviewTask] = {
26
+ "task_easy_1": CodeReviewTask(
27
+ task_id="task_easy_1",
28
+ difficulty="easy",
29
+ task_description=(
30
+ "Find the primary bug in this function and report bug_type, line_number, "
31
+ "and a concise explanation."
32
+ ),
33
+ code_snippet=(
34
+ "def average(nums):\n"
35
+ " total = 0\n"
36
+ " for n in nums:\n"
37
+ " total += n\n"
38
+ " return total / len(total)\n"
39
+ ),
40
+ expected_bug_type="logic",
41
+ expected_line_number=5,
42
+ expected_keywords=("len", "total", "typeerror"),
43
+ ),
44
+ "task_medium_1": CodeReviewTask(
45
+ task_id="task_medium_1",
46
+ difficulty="medium",
47
+ task_description=(
48
+ "Review for a security issue. Identify the vulnerability type and precise line."
49
+ ),
50
+ code_snippet=(
51
+ "import sqlite3\n"
52
+ "\n"
53
+ "def login(conn, username, password):\n"
54
+ " query = f\"SELECT * FROM users WHERE name='{username}' AND pw='{password}'\"\n"
55
+ " return conn.execute(query).fetchone() is not None\n"
56
+ ),
57
+ expected_bug_type="security",
58
+ expected_line_number=4,
59
+ expected_keywords=("sql", "injection", "parameterized"),
60
+ ),
61
+ "task_hard_1": CodeReviewTask(
62
+ task_id="task_hard_1",
63
+ difficulty="hard",
64
+ task_description=(
65
+ "Find the concurrency/performance bug and explain why it impacts production latency."
66
+ ),
67
+ code_snippet=(
68
+ "from threading import Lock\n"
69
+ "lock = Lock()\n"
70
+ "cache = {}\n"
71
+ "\n"
72
+ "def get_user(user_id, db):\n"
73
+ " with lock:\n"
74
+ " if user_id in cache:\n"
75
+ " return cache[user_id]\n"
76
+ " data = db.fetch_user(user_id)\n"
77
+ " cache[user_id] = data\n"
78
+ " return data\n"
79
+ ),
80
+ expected_bug_type="logic",
81
+ expected_line_number=9,
82
+ expected_keywords=("lock", "critical section", "latency"),
83
+ ),
84
+ }
85
+
86
+
87
+ EPISODE_SCORES: Dict[tuple[str, str], float] = {}
88
+
89
+
90
def list_tasks() -> List[Dict[str, str]]:
    """Return the id and difficulty of every task, ordered by task id."""
    ordered = sorted(TASKS.values(), key=lambda task: task.task_id)
    public: List[Dict[str, str]] = []
    for task in ordered:
        public.append({"task_id": task.task_id, "difficulty": task.difficulty})
    return public
96
+
97
+
98
def get_task(task_id: str) -> CodeReviewTask:
    """Look up a task by id.

    Raises:
        ValueError: If ``task_id`` is not a key of ``TASKS``; the message
            lists the available ids.
    """
    if task_id in TASKS:
        return TASKS[task_id]
    known_ids = ", ".join(sorted(TASKS))
    raise ValueError(f"Unknown task_id '{task_id}'. Available tasks: {known_ids}")
105
+
106
+
107
+ def _normalize(value: str) -> str:
108
+ return value.strip().lower().replace("-", "_")
109
+
110
+
111
def grade_action(action: CodeAction, task: CodeReviewTask) -> tuple[float, str]:
    """Grade a review against a task's expected answer.

    Credit is additive: 0.50 for the bug type, 0.30 for the exact line (0.15
    when off by at most one), and 0.10 per expected keyword found in the
    review, capped at 0.20.

    Returns:
        ``(score, feedback)`` where ``score`` is rounded to four decimals and
        clamped to [0.0, 1.0], and ``feedback`` joins one note per criterion
        with ``"; "``.
    """
    total = 0.0
    notes: List[str] = []

    # Criterion 1: bug category.
    if _normalize(action.bug_type) != _normalize(task.expected_bug_type):
        notes.append(
            f"bug_type mismatch (expected {task.expected_bug_type}, got {action.bug_type})"
        )
    else:
        total += 0.5
        notes.append("bug_type matched (+0.50)")

    # Criterion 2: line number, with half credit for a one-line miss.
    distance = abs(action.line_number - task.expected_line_number)
    if distance == 0:
        total += 0.3
        notes.append("line_number matched (+0.30)")
    elif distance <= 1:
        total += 0.15
        notes.append("line_number near miss (+0.15)")
    else:
        notes.append(
            f"line_number mismatch (expected {task.expected_line_number}, got {action.line_number})"
        )

    # Criterion 3: case-insensitive substring search for the expected keywords.
    review_text = (action.review or "").lower()
    found = [kw for kw in task.expected_keywords if kw.lower() in review_text]
    if found:
        keyword_bonus = min(0.2, len(found) * 0.1)
        total += keyword_bonus
        notes.append(f"review evidence matched (+{keyword_bonus:.2f})")
    else:
        notes.append("review lacks key evidence (+0.00)")

    total = max(0.0, min(1.0, round(total, 4)))
    return total, "; ".join(notes)
148
+
149
+
150
def record_episode_score(task_id: str, episode_id: str, score: float) -> None:
    """Store the score for a task/episode pair, clamped to [0.0, 1.0]."""
    clamped = max(0.0, min(1.0, float(score)))
    EPISODE_SCORES[(task_id, episode_id)] = clamped
153
+
154
+
155
def get_episode_score(task_id: str, episode_id: str) -> float:
    """Return the stored score for a task/episode pair, or 0.0 when none is recorded."""
    key = (task_id, episode_id)
    if key in EPISODE_SCORES:
        return EPISODE_SCORES[key]
    return 0.0