h1manshu committed on
Commit
d0b56d7
·
verified ·
1 Parent(s): 9e4673c

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. openenv.yaml +0 -3
  2. server/code_review_environment.py +81 -90
openenv.yaml CHANGED
@@ -4,7 +4,6 @@ type: space
4
  runtime: fastapi
5
  app: server.app:app
6
  port: 8000
7
-
8
  tasks:
9
  - id: task_1
10
  description: "Easy — missing import detection"
@@ -30,9 +29,7 @@ tasks:
30
  description: "Hard — cross-file null handling bug"
31
  max_steps: 3
32
  grader: graders:CodeReviewGrader
33
-
34
  endpoints:
35
  reset: /reset
36
  step: /step
37
- state: /state # ✅ added
38
  health: /health
 
4
  runtime: fastapi
5
  app: server.app:app
6
  port: 8000
 
7
  tasks:
8
  - id: task_1
9
  description: "Easy — missing import detection"
 
29
  description: "Hard — cross-file null handling bug"
30
  max_steps: 3
31
  grader: graders:CodeReviewGrader
 
32
  endpoints:
33
  reset: /reset
34
  step: /step
 
35
  health: /health
server/code_review_environment.py CHANGED
@@ -3,19 +3,19 @@
3
  #
4
  # This source code is licensed under the BSD-style license found in the
5
  # LICENSE file in the root directory of this source tree.
6
-
7
  """
8
  Code Review Environment Implementation.
9
-
10
  A simple test environment that echoes back messages sent to it.
11
  Perfect for testing HTTP server infrastructure.
12
  """
13
-
14
  from uuid import uuid4
15
-
16
  from openenv.core.env_server.interfaces import Environment
17
  from openenv.core.env_server.types import State
18
-
19
  try:
20
  from ..models import (
21
  CodeReviewAction,
@@ -32,14 +32,14 @@ except ImportError:
32
  CodeReviewPullRequest,
33
  CodeReviewStepResponse,
34
  )
35
-
36
  import json
37
  from pathlib import Path
38
  import re
39
  from difflib import SequenceMatcher
40
-
41
  dataset_path = Path(__file__).parent.parent / "dataset" / "dataset.json"
42
-
43
  STOP_WORDS = {
44
  "use",
45
  "the",
@@ -60,15 +60,15 @@ STOP_WORDS = {
60
  "from",
61
  "that",
62
  }
63
-
64
-
65
  class CodeReviewEnvironment(Environment):
66
  """
67
  A simple echo environment that echoes back messages.
68
-
69
  This environment is designed for testing the HTTP server infrastructure.
70
  It maintains minimal state and simply echoes back whatever message it receives.
71
-
72
  Example:
73
  >>> env = CodeReviewEnvironment()
74
  >>> obs = env.reset()
@@ -78,57 +78,48 @@ class CodeReviewEnvironment(Environment):
78
  >>> print(obs.echoed_message) # "Hello"
79
  >>> print(obs.message_length) # 5
80
  """
81
-
82
  # Enable concurrent WebSocket sessions.
83
  # Set to True if your environment isolates state between instances.
84
  # When True, multiple WebSocket clients can connect simultaneously, each
85
  # getting their own environment instance (when using factory mode in app.py).
86
  SUPPORTS_CONCURRENT_SESSIONS: bool = True
87
-
88
- def __init__(self , task=None):
89
  """Initialize the code_review environment."""
90
  self._state = State(episode_id=str(uuid4()), step_count=0)
91
  self._reset_count = 0
92
  self.max_steps = 3
93
  self.task_index = 0
94
- self.task = task
95
  with open(dataset_path) as f:
96
  self.dataset = json.load(f)
97
  self.reset()
98
-
99
  def reset(self) -> CodeReviewObservation:
100
  """
101
  Reset the environment.
102
-
103
  Returns:
104
  CodeReviewObservation with a ready message
105
  """
106
  self._state = State(episode_id=str(uuid4()), step_count=0)
107
  self._reset_count += 1
108
- if self.task:
109
- task_id = self.task.get("id")
110
-
111
- for sample in self.dataset:
112
- if sample["id"] == task_id:
113
- self.sample = sample
114
- break
115
- else:
116
- self.sample = self.dataset[0]
117
- else:
118
- self.sample = self.dataset[0]
119
-
120
  self.pr = CodeReviewPullRequest(**self.sample["pr"])
121
  self.gt = self.sample["ground_truth"]
122
  self.task_type = self.sample.get("task_type", "unknown")
123
-
124
  self.history = []
125
  self.step_count = 0
126
  self.done = False
127
-
128
  # State evolution variables
129
  self.issues_identified = []
130
  self.fix_attempted = False
131
-
132
  return CodeReviewObservation(
133
  # echoed_message="Code Review environment ready!",
134
  pr=self.pr,
@@ -138,25 +129,25 @@ class CodeReviewEnvironment(Environment):
138
  reward=0.0,
139
  done=False,
140
  )
141
-
142
  def step(self, action: CodeReviewAction) -> CodeReviewObservation: # type: ignore[override]
143
  """
144
  Execute a step in the environment by echoing the message.
145
-
146
  Args:
147
  action: CodeReviewAction containing the message to echo
148
-
149
  Returns:
150
  CodeReviewObservation with the echoed message and its length
151
  """
152
  self._state.step_count += 1
153
  # print("RAW ACTION TYPE:", type(action))
154
  # print("RAW ACTION:", action)
155
-
156
  try:
157
  if isinstance(action, dict):
158
  action = CodeReviewAction(**action)
159
-
160
  elif isinstance(action, (list, tuple)):
161
  action = CodeReviewAction(
162
  action_type=action[0],
@@ -164,60 +155,60 @@ class CodeReviewEnvironment(Environment):
164
  suggested_code=action[2] if len(action) > 2 else None,
165
  decision=action[3] if len(action) > 3 else None,
166
  )
167
-
168
  elif isinstance(action, CodeReviewAction):
169
  pass
170
-
171
  else:
172
  raise ValueError(f"Unsupported action type: {type(action)}")
173
  except Exception as e:
174
  print(f"Error occurred while processing action: {e}")
175
  return self._invalid_step()
176
-
177
  self.step_count += 1
178
  self.history.append(action)
179
-
180
  if action.action_type == "comment" and action.comment:
181
  self.issues_identified.append(action.comment)
182
-
183
  if action.action_type == "suggest_fix":
184
  self.fix_attempted = True
185
-
186
  score = self.grade_action(action, self.gt)
187
  # print(f"Step {self.step_count} - Score: {score:.4f}")
188
-
189
  bonus = 0.0
190
-
191
  # Encourage meaningful comments
192
  if action.comment and len(action.comment) > 30:
193
  bonus += 0.1
194
-
195
  # Encourage early correct decisions
196
  if action.action_type == "final_decision" and self.step_count <= 2:
197
  bonus += 0.1
198
-
199
  # Penalize useless steps
200
  if not action.comment and action.action_type != "final_decision":
201
  bonus -= 0.1
202
-
203
  # Penalize long trajectories
204
  if self.step_count > 3:
205
  bonus -= 0.05
206
-
207
  score += bonus
208
  score = max(0.0, min(score, 1.0))
209
  # print("Final Score == " , score)
210
-
211
  done = (
212
  action.action_type == "final_decision" or self.step_count >= self.max_steps
213
  )
214
-
215
  if done:
216
  score = max([self.grade_action(a, self.gt) for a in self.history] or [0.0])
217
-
218
  # print(type(CodeReviewObservation))
219
  # print(type(CodeReviewReward))
220
-
221
  obs = CodeReviewObservation(
222
  pr=self.pr,
223
  previous_comments=[a.comment for a in self.history if a.comment],
@@ -225,14 +216,14 @@ class CodeReviewEnvironment(Environment):
225
  max_steps=self.max_steps,
226
  )
227
  # print("Obs == " , obs)
228
-
229
  rew = CodeReviewReward(score=score, feedback="graded")
230
  print("Score == ", type(rew.score), " --- ", rew.score)
231
-
232
  # print("FINAL REWARD TYPE:", type(rew))
233
  # print("FINAL REWARD:", rew)
234
  # print("Got the culprit I guess....")
235
-
236
  return CodeReviewStepResponse(
237
  observation=obs,
238
  reward=rew.score,
@@ -243,17 +234,17 @@ class CodeReviewEnvironment(Environment):
243
  "fix_attempted": self.fix_attempted,
244
  },
245
  )
246
-
247
  @property
248
  def state(self) -> State:
249
  """
250
  Get the current environment state.
251
-
252
  Returns:
253
  Current State with episode_id and step_count
254
  """
255
  return self._state
256
-
257
  def _invalid_step(self):
258
  rew = CodeReviewReward(score=0.0, feedback="invalid action")
259
  obs = CodeReviewObservation(
@@ -269,46 +260,46 @@ class CodeReviewEnvironment(Environment):
269
  done=True,
270
  info={"error": "invalid_action"},
271
  )
272
-
273
  def grade_action(self, action, ground_truth):
274
  score = 0.0
275
-
276
  # print("Action === ", action)
277
  # print("Ground truth === ", ground_truth)
278
-
279
  # ------------------------------
280
  # ISSUE DETECTION (40%)
281
  # ------------------------------
282
  issue_score = self.score_issues(action.comment, ground_truth)
283
  score += 0.4 * issue_score
284
  # print("After Issue Score == ", issue_score)
285
-
286
  # ------------------------------
287
  # FIX QUALITY (30%)
288
  # ------------------------------
289
  fix_score = self.score_fix(action.suggested_code, ground_truth)
290
  score += 0.3 * fix_score
291
-
292
  # print("After Fix Score == ", fix_score)
293
-
294
  # ------------------------------
295
  # DECISION (30%)
296
  # ------------------------------
297
  decision_score = self.score_decision(action, ground_truth)
298
  score += 0.3 * decision_score
299
-
300
  # print("After Decision Score == ", decision_score)
301
-
302
  # ------------------------------
303
  # CLAMP SCORE
304
  # ------------------------------
305
  score = max(0.0, min(score, 1.0))
306
-
307
  return score
308
-
309
  def normalize(self, text):
310
  return (text or "").lower().strip()
311
-
312
  # ==============================
313
  # ISSUE MATCH (PARTIAL CREDIT)
314
  # ==============================
@@ -316,68 +307,68 @@ class CodeReviewEnvironment(Environment):
316
  issues = ground_truth.get("issues", [])
317
  if not comment or not issues:
318
  return 0.0
319
-
320
  comment = self.normalize(comment)
321
-
322
  matches = sum(1 for issue in issues if self.normalize(issue) in comment)
323
-
324
  return matches / len(issues)
325
-
326
  # ==============================
327
  # FIX MATCH (FUZZY)
328
  # ==============================
329
  def score_fix(self, suggested_code: str, ground_truth: dict) -> float:
330
  if not suggested_code:
331
  return 0.0
332
-
333
  expected_fix = self.normalize(ground_truth.get("fix", ""))
334
  suggested_code = self.normalize(suggested_code)
335
-
336
  if not expected_fix:
337
  return 0.0
338
-
339
  # 1. Exact / substring match — full score
340
  if expected_fix in suggested_code:
341
  return 1.0
342
-
343
  # 2. Token overlap ignoring stop words
344
  def code_tokens(text: str) -> list[str]:
345
  tokens = re.findall(r"[a-zA-Z_]\w*|\d+|[=<>!+\-*/]+", text)
346
  return [t for t in tokens if t.lower() not in STOP_WORDS]
347
-
348
  expected_tokens = code_tokens(expected_fix)
349
  suggested_tokens = set(code_tokens(suggested_code))
350
-
351
  if not expected_tokens:
352
  return 0.0
353
-
354
  token_score = sum(1 for t in expected_tokens if t in suggested_tokens) / len(
355
  expected_tokens
356
  )
357
-
358
  # 3. Sequence similarity as a secondary signal
359
  seq_score = SequenceMatcher(None, expected_fix, suggested_code).ratio()
360
-
361
  # Weighted: token overlap matters more than character similarity
362
  return round(0.7 * token_score + 0.3 * seq_score, 4)
363
-
364
  # ==============================
365
  # DECISION MATCH
366
  # ==============================
367
  def score_decision(self, action, ground_truth):
368
  expected = ground_truth.get("decision")
369
-
370
  # Not a decision step → no contribution
371
  if action.action_type != "final_decision":
372
  return 0.0
373
-
374
  # Missing decision → small penalty
375
  if not action.decision:
376
  return 0.0
377
-
378
  # Correct decision
379
  if action.decision == expected:
380
  return 1.0
381
-
382
  # Wrong decision → partial penalty (not negative)
383
- return 0.2
 
3
  #
4
  # This source code is licensed under the BSD-style license found in the
5
  # LICENSE file in the root directory of this source tree.
6
+
7
  """
8
  Code Review Environment Implementation.
9
+
10
  A simple test environment that echoes back messages sent to it.
11
  Perfect for testing HTTP server infrastructure.
12
  """
13
+
14
  from uuid import uuid4
15
+
16
  from openenv.core.env_server.interfaces import Environment
17
  from openenv.core.env_server.types import State
18
+
19
  try:
20
  from ..models import (
21
  CodeReviewAction,
 
32
  CodeReviewPullRequest,
33
  CodeReviewStepResponse,
34
  )
35
+
36
  import json
37
  from pathlib import Path
38
  import re
39
  from difflib import SequenceMatcher
40
+
41
  dataset_path = Path(__file__).parent.parent / "dataset" / "dataset.json"
42
+
43
  STOP_WORDS = {
44
  "use",
45
  "the",
 
60
  "from",
61
  "that",
62
  }
63
+
64
+
65
  class CodeReviewEnvironment(Environment):
66
  """
67
  A simple echo environment that echoes back messages.
68
+
69
  This environment is designed for testing the HTTP server infrastructure.
70
  It maintains minimal state and simply echoes back whatever message it receives.
71
+
72
  Example:
73
  >>> env = CodeReviewEnvironment()
74
  >>> obs = env.reset()
 
78
  >>> print(obs.echoed_message) # "Hello"
79
  >>> print(obs.message_length) # 5
80
  """
81
+
82
  # Enable concurrent WebSocket sessions.
83
  # Set to True if your environment isolates state between instances.
84
  # When True, multiple WebSocket clients can connect simultaneously, each
85
  # getting their own environment instance (when using factory mode in app.py).
86
  SUPPORTS_CONCURRENT_SESSIONS: bool = True
87
+
88
+ def __init__(self):
89
  """Initialize the code_review environment."""
90
  self._state = State(episode_id=str(uuid4()), step_count=0)
91
  self._reset_count = 0
92
  self.max_steps = 3
93
  self.task_index = 0
 
94
  with open(dataset_path) as f:
95
  self.dataset = json.load(f)
96
  self.reset()
97
+
98
  def reset(self) -> CodeReviewObservation:
99
  """
100
  Reset the environment.
101
+
102
  Returns:
103
  Initial CodeReviewObservation for the selected pull request (reward 0.0, done False)
104
  """
105
  self._state = State(episode_id=str(uuid4()), step_count=0)
106
  self._reset_count += 1
107
+ self.task_index += 1
108
+
109
+ self.sample = self.dataset[self.task_index % len(self.dataset)]
110
+
 
 
 
 
 
 
 
 
111
  self.pr = CodeReviewPullRequest(**self.sample["pr"])
112
  self.gt = self.sample["ground_truth"]
113
  self.task_type = self.sample.get("task_type", "unknown")
114
+
115
  self.history = []
116
  self.step_count = 0
117
  self.done = False
118
+
119
  # State evolution variables
120
  self.issues_identified = []
121
  self.fix_attempted = False
122
+
123
  return CodeReviewObservation(
124
  # echoed_message="Code Review environment ready!",
125
  pr=self.pr,
 
129
  reward=0.0,
130
  done=False,
131
  )
132
+
133
  def step(self, action: CodeReviewAction) -> CodeReviewObservation: # type: ignore[override]
134
  """
135
  Execute a step in the environment by grading the submitted review action.
136
+
137
  Args:
138
  action: CodeReviewAction (or a dict / list / tuple convertible to one) carrying the comment, suggested code, and/or decision
139
+
140
  Returns:
141
  CodeReviewStepResponse carrying the updated observation and the graded reward
142
  """
143
  self._state.step_count += 1
144
  # print("RAW ACTION TYPE:", type(action))
145
  # print("RAW ACTION:", action)
146
+
147
  try:
148
  if isinstance(action, dict):
149
  action = CodeReviewAction(**action)
150
+
151
  elif isinstance(action, (list, tuple)):
152
  action = CodeReviewAction(
153
  action_type=action[0],
 
155
  suggested_code=action[2] if len(action) > 2 else None,
156
  decision=action[3] if len(action) > 3 else None,
157
  )
158
+
159
  elif isinstance(action, CodeReviewAction):
160
  pass
161
+
162
  else:
163
  raise ValueError(f"Unsupported action type: {type(action)}")
164
  except Exception as e:
165
  print(f"Error occurred while processing action: {e}")
166
  return self._invalid_step()
167
+
168
  self.step_count += 1
169
  self.history.append(action)
170
+
171
  if action.action_type == "comment" and action.comment:
172
  self.issues_identified.append(action.comment)
173
+
174
  if action.action_type == "suggest_fix":
175
  self.fix_attempted = True
176
+
177
  score = self.grade_action(action, self.gt)
178
  # print(f"Step {self.step_count} - Score: {score:.4f}")
179
+
180
  bonus = 0.0
181
+
182
  # Encourage meaningful comments
183
  if action.comment and len(action.comment) > 30:
184
  bonus += 0.1
185
+
186
  # Encourage early correct decisions
187
  if action.action_type == "final_decision" and self.step_count <= 2:
188
  bonus += 0.1
189
+
190
  # Penalize useless steps
191
  if not action.comment and action.action_type != "final_decision":
192
  bonus -= 0.1
193
+
194
  # Penalize long trajectories
195
  if self.step_count > 3:
196
  bonus -= 0.05
197
+
198
  score += bonus
199
  score = max(0.0, min(score, 1.0))
200
  # print("Final Score == " , score)
201
+
202
  done = (
203
  action.action_type == "final_decision" or self.step_count >= self.max_steps
204
  )
205
+
206
  if done:
207
  score = max([self.grade_action(a, self.gt) for a in self.history] or [0.0])
208
+
209
  # print(type(CodeReviewObservation))
210
  # print(type(CodeReviewReward))
211
+
212
  obs = CodeReviewObservation(
213
  pr=self.pr,
214
  previous_comments=[a.comment for a in self.history if a.comment],
 
216
  max_steps=self.max_steps,
217
  )
218
  # print("Obs == " , obs)
219
+
220
  rew = CodeReviewReward(score=score, feedback="graded")
221
  print("Score == ", type(rew.score), " --- ", rew.score)
222
+
223
  # print("FINAL REWARD TYPE:", type(rew))
224
  # print("FINAL REWARD:", rew)
225
  # print("Got the culprit I guess....")
226
+
227
  return CodeReviewStepResponse(
228
  observation=obs,
229
  reward=rew.score,
 
234
  "fix_attempted": self.fix_attempted,
235
  },
236
  )
237
+
238
  @property
239
  def state(self) -> State:
240
  """
241
  Get the current environment state.
242
+
243
  Returns:
244
  Current State with episode_id and step_count
245
  """
246
  return self._state
247
+
248
  def _invalid_step(self):
249
  rew = CodeReviewReward(score=0.0, feedback="invalid action")
250
  obs = CodeReviewObservation(
 
260
  done=True,
261
  info={"error": "invalid_action"},
262
  )
263
+
264
  def grade_action(self, action, ground_truth):
265
  score = 0.0
266
+
267
  # print("Action === ", action)
268
  # print("Ground truth === ", ground_truth)
269
+
270
  # ------------------------------
271
  # ISSUE DETECTION (40%)
272
  # ------------------------------
273
  issue_score = self.score_issues(action.comment, ground_truth)
274
  score += 0.4 * issue_score
275
  # print("After Issue Score == ", issue_score)
276
+
277
  # ------------------------------
278
  # FIX QUALITY (30%)
279
  # ------------------------------
280
  fix_score = self.score_fix(action.suggested_code, ground_truth)
281
  score += 0.3 * fix_score
282
+
283
  # print("After Fix Score == ", fix_score)
284
+
285
  # ------------------------------
286
  # DECISION (30%)
287
  # ------------------------------
288
  decision_score = self.score_decision(action, ground_truth)
289
  score += 0.3 * decision_score
290
+
291
  # print("After Decision Score == ", decision_score)
292
+
293
  # ------------------------------
294
  # CLAMP SCORE
295
  # ------------------------------
296
  score = max(0.0, min(score, 1.0))
297
+
298
  return score
299
+
300
  def normalize(self, text):
301
  return (text or "").lower().strip()
302
+
303
  # ==============================
304
  # ISSUE MATCH (PARTIAL CREDIT)
305
  # ==============================
 
307
  issues = ground_truth.get("issues", [])
308
  if not comment or not issues:
309
  return 0.0
310
+
311
  comment = self.normalize(comment)
312
+
313
  matches = sum(1 for issue in issues if self.normalize(issue) in comment)
314
+
315
  return matches / len(issues)
316
+
317
  # ==============================
318
  # FIX MATCH (FUZZY)
319
  # ==============================
320
  def score_fix(self, suggested_code: str, ground_truth: dict) -> float:
321
  if not suggested_code:
322
  return 0.0
323
+
324
  expected_fix = self.normalize(ground_truth.get("fix", ""))
325
  suggested_code = self.normalize(suggested_code)
326
+
327
  if not expected_fix:
328
  return 0.0
329
+
330
  # 1. Exact / substring match — full score
331
  if expected_fix in suggested_code:
332
  return 1.0
333
+
334
  # 2. Token overlap ignoring stop words
335
  def code_tokens(text: str) -> list[str]:
336
  tokens = re.findall(r"[a-zA-Z_]\w*|\d+|[=<>!+\-*/]+", text)
337
  return [t for t in tokens if t.lower() not in STOP_WORDS]
338
+
339
  expected_tokens = code_tokens(expected_fix)
340
  suggested_tokens = set(code_tokens(suggested_code))
341
+
342
  if not expected_tokens:
343
  return 0.0
344
+
345
  token_score = sum(1 for t in expected_tokens if t in suggested_tokens) / len(
346
  expected_tokens
347
  )
348
+
349
  # 3. Sequence similarity as a secondary signal
350
  seq_score = SequenceMatcher(None, expected_fix, suggested_code).ratio()
351
+
352
  # Weighted: token overlap matters more than character similarity
353
  return round(0.7 * token_score + 0.3 * seq_score, 4)
354
+
355
  # ==============================
356
  # DECISION MATCH
357
  # ==============================
358
  def score_decision(self, action, ground_truth):
359
  expected = ground_truth.get("decision")
360
+
361
  # Not a decision step → no contribution
362
  if action.action_type != "final_decision":
363
  return 0.0
364
+
365
  # Missing decision → no credit
366
  if not action.decision:
367
  return 0.0
368
+
369
  # Correct decision
370
  if action.decision == expected:
371
  return 1.0
372
+
373
  # Wrong decision → small partial credit (never negative)
374
+ return 0.2