uvpatel7271 committed on
Commit
4ae018d
·
verified ·
1 Parent(s): 566a172

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. server/env.py +32 -296
server/env.py CHANGED
@@ -32,17 +32,17 @@ class PythonCodeReviewEnvironment(
32
 
33
  SUPPORTS_CONCURRENT_SESSIONS = True
34
 
35
- def __init__(self) -> None:
36
- super().__init__()
37
- self._task_order = list(task_ids())
38
- self._task_cursor = -1
39
- self._task: Optional[TaskSpec] = None
40
- self._state = PythonCodeReviewState()
41
- self._done = False
42
- self._last_status = "Call reset() to start."
43
- self._last_reward = RewardDetails(value=0.0, reason="Environment initialized.")
44
- self._best_visible_test_fraction = 0.0
45
- self._best_quality_score = 0.0
46
  self._full_correctness_awarded = False
47
  self._syntax_reward_awarded = False
48
 
@@ -151,9 +151,13 @@ class PythonCodeReviewEnvironment(
151
  """Return the current environment state."""
152
  return self._state.model_copy(deep=True)
153
 
154
- def list_task_summaries(self) -> List[object]:
155
- """Return public task metadata."""
156
- return list_task_summaries()
 
 
 
 
157
 
158
  def get_task(self, task_id: str) -> object:
159
  """Return a single task descriptor."""
@@ -356,285 +360,17 @@ class PythonCodeReviewEnvironment(
356
  return "Test execution timed out."
357
  return f"Tests: {grade.tests_passed}/{grade.tests_total} passing"
358
 
359
- def _append_history(self, action_type: str, status: str, reward: float) -> None:
360
- """Append action to history."""
361
- entry = HistoryEntry(
362
- step=self._state.step_count,
363
- action_type=action_type,
364
- status=status,
365
- reward=reward,
366
- )
367
- self._state.history.append(entry)
368
-
369
- return self.reset()
370
- if self._done:
371
- self._last_reward = RewardDetails(
372
- value=-INVALID_ACTION_PENALTY,
373
- invalid_action_penalty=INVALID_ACTION_PENALTY,
374
- reason="Episode already completed.",
375
- )
376
- self._last_status = "Episode already completed. Call reset() to continue."
377
- return self._build_observation()
378
-
379
- self._state.step_count += 1
380
- status = ""
381
- reward = RewardDetails(reason="Action processed.")
382
-
383
- if action.action_type == "analyze_code":
384
- reward, status = self._handle_analyze()
385
- elif action.action_type == "edit_code":
386
- reward, status = self._handle_edit(action)
387
- elif action.action_type == "run_tests":
388
- reward, status = self._handle_run_tests()
389
- elif action.action_type == "submit_solution":
390
- reward, status = self._handle_submit()
391
- else: # pragma: no cover
392
- reward = RewardDetails(
393
- value=-INVALID_ACTION_PENALTY,
394
- invalid_action_penalty=INVALID_ACTION_PENALTY,
395
- reason=f"Unsupported action_type {action.action_type}.",
396
- )
397
- status = f"Unsupported action_type {action.action_type}."
398
-
399
- self._last_reward = reward
400
- self._last_status = status
401
- self._state.attempts_remaining = max(self._task.max_steps - self._state.step_count, 0)
402
-
403
- if self._state.attempts_remaining == 0 and not self._done:
404
- self._finalize_episode(auto_submit=True)
405
-
406
- self._state.done = self._done
407
- return self._build_observation()
408
-
409
- @property
410
- def state(self) -> PythonCodeReviewState:
411
- """Return the current environment state."""
412
-
413
- return self._state.model_copy(deep=True)
414
-
415
- def list_tasks(self) -> List[TaskDescriptor]:
416
- """Return all task descriptors."""
417
-
418
- return list_task_descriptors()
419
-
420
- def list_task_summaries(self) -> List[TaskDescriptor]:
421
- """Return public task metadata."""
422
-
423
- return list_task_summaries()
424
-
425
- def get_task(self, task_id: str) -> TaskDescriptor:
426
- """Return a single task descriptor."""
427
-
428
- return get_task(task_id).to_descriptor()
429
-
430
- def health(self) -> HealthResponse:
431
- """Return a simple health model."""
432
-
433
- return HealthResponse(task_count=len(self._task_order))
434
-
435
- def grade_task_submission(self, task_id: str, code: str) -> TaskGrade:
436
- """Expose deterministic grading outside of an active episode."""
437
-
438
- return grade_task(code, get_task(task_id), include_hidden=True)
439
-
440
- def _handle_analyze(self) -> tuple[RewardDetails, str]:
441
- grade = grade_task(self._state.current_code, self._task, include_hidden=False)
442
- error = grade.details.get("compile_error", "")
443
- if error:
444
- self._state.errors = f"Syntax analysis failed: {error}"
445
- self._state.test_results = "Tests skipped because the code does not compile."
446
- summary = self._state.errors
447
- else:
448
- self._state.errors = ""
449
- if self._task.task_kind == "syntax_fix":
450
- self._state.test_results = "Compilation succeeds."
451
- else:
452
- visible_total = len(self._task.visible_tests)
453
- visible_passed = min(grade.tests_passed, visible_total)
454
- self._state.test_results = (
455
- f"Visible checks preview: {visible_passed}/{visible_total} passing."
456
- )
457
- summary = "Static analysis refreshed."
458
-
459
- reward = RewardDetails(value=0.0, reason=summary)
460
- self._append_history("analyze_code", summary, reward.value)
461
- self._sync_score(include_hidden=False)
462
- return reward, summary
463
-
464
- def _handle_edit(self, action: PythonCodeReviewAction) -> tuple[RewardDetails, str]:
465
- code = (action.code or "").strip("\n")
466
- if not code:
467
- reward = RewardDetails(
468
- value=-INVALID_ACTION_PENALTY,
469
- invalid_action_penalty=INVALID_ACTION_PENALTY,
470
- reason="edit_code requires non-empty code.",
471
- )
472
- status = "Invalid action: edit_code requires code."
473
- self._append_history("edit_code", status, reward.value)
474
- return reward, status
475
-
476
- previous_visible = grade_task(self._state.current_code, self._task, include_hidden=False)
477
- new_visible = grade_task(code, self._task, include_hidden=False)
478
- self._state.current_code = code
479
- self._state.errors = new_visible.details.get("compile_error", "")
480
- self._state.test_results = self._format_test_results(new_visible, include_hidden=False)
481
-
482
- syntax_reward = 0.0
483
- if previous_visible.syntax_score < 1.0 and new_visible.syntax_score == 1.0:
484
- syntax_reward = 0.2
485
-
486
- quality_bonus = 0.0
487
- quality_delta = max(new_visible.quality_score - self._best_quality_score, 0.0)
488
- if quality_delta > 0:
489
- quality_bonus = round(min(quality_delta * QUALITY_BONUS_SCALE, 0.1), 6)
490
- self._best_quality_score = max(self._best_quality_score, new_visible.quality_score)
491
-
492
- reward_value = syntax_reward + quality_bonus
493
- status = "Code updated."
494
- if self._state.errors:
495
- status = f"Code updated, but syntax issues remain: {self._state.errors}"
496
- elif new_visible.tests_total:
497
- status = self._state.test_results
498
-
499
- reward = RewardDetails(
500
- value=reward_value,
501
- syntax_reward=syntax_reward,
502
- quality_bonus=quality_bonus,
503
- reason=status,
504
- )
505
- self._append_history("edit_code", status, reward.value)
506
- self._sync_score(include_hidden=False)
507
- return reward, status
508
-
509
- def _handle_run_tests(self) -> tuple[RewardDetails, str]:
510
- grade = grade_task(self._state.current_code, self._task, include_hidden=False)
511
- self._state.errors = grade.details.get("compile_error", "")
512
- self._state.test_results = self._format_test_results(grade, include_hidden=False)
513
- reward = self._reward_from_grade(grade, include_hidden=False)
514
- status = self._state.test_results if not self._state.errors else self._state.errors
515
- self._append_history("run_tests", status, reward.value)
516
- self._sync_score(include_hidden=False)
517
- return reward, status
518
-
519
- def _handle_submit(self) -> tuple[RewardDetails, str]:
520
- grade = grade_task(self._state.current_code, self._task, include_hidden=True)
521
- self._state.errors = grade.details.get("compile_error", "")
522
- self._state.test_results = self._format_test_results(grade, include_hidden=True)
523
- reward = self._reward_from_grade(grade, include_hidden=True)
524
- self._finalize_episode(auto_submit=False, grade=grade)
525
- status = f"Solution submitted. Final score: {grade.score:.2f}."
526
- self._append_history("submit_solution", status, reward.value)
527
- return reward, status
528
-
529
- def _finalize_episode(self, auto_submit: bool, grade: Optional[TaskGrade] = None) -> None:
530
- if grade is None:
531
- grade = grade_task(self._state.current_code, self._task, include_hidden=True)
532
- self._state.errors = grade.details.get("compile_error", "")
533
- self._state.test_results = self._format_test_results(grade, include_hidden=True)
534
- self._state.score = grade.score
535
- self._done = True
536
- self._state.done = True
537
- if auto_submit:
538
- self._last_status = f"Step budget exhausted. Final score: {grade.score:.2f}."
539
- self._last_reward = self._reward_from_grade(grade, include_hidden=True)
540
-
541
- def _reward_from_grade(self, grade: TaskGrade, include_hidden: bool) -> RewardDetails:
542
- syntax_reward = 0.0
543
- if grade.syntax_score == 1.0 and not self._state.errors and not self._syntax_reward_awarded:
544
- syntax_reward = 0.2
545
- self._syntax_reward_awarded = True
546
- test_fraction = grade.tests_passed / grade.tests_total if grade.tests_total else grade.score
547
- test_gain = max(test_fraction - self._best_visible_test_fraction, 0.0)
548
- test_reward = 0.3 * test_gain
549
- if test_gain > 0:
550
- self._best_visible_test_fraction = test_fraction
551
-
552
- quality_bonus = 0.0
553
- quality_delta = max(grade.quality_score - self._best_quality_score, 0.0)
554
- if quality_delta > 0:
555
- quality_bonus = min(quality_delta * QUALITY_BONUS_SCALE, 0.1)
556
- self._best_quality_score = grade.quality_score
557
-
558
- correctness_bonus = 0.0
559
- if include_hidden and grade.score >= 0.999999 and not self._full_correctness_awarded:
560
- correctness_bonus = 0.5
561
- self._full_correctness_awarded = True
562
-
563
- timeout_penalty = TIMEOUT_PENALTY if grade.timed_out else 0.0
564
- reward_value = round(
565
- syntax_reward + test_reward + quality_bonus + correctness_bonus - timeout_penalty,
566
- 6,
567
- )
568
- return RewardDetails(
569
- value=reward_value,
570
- syntax_reward=syntax_reward,
571
- test_reward=round(test_reward, 6),
572
- correctness_bonus=correctness_bonus,
573
- quality_bonus=round(quality_bonus, 6),
574
- timeout_penalty=timeout_penalty,
575
- reason=self._format_test_results(grade, include_hidden=include_hidden),
576
- )
577
-
578
- def _format_test_results(self, grade: TaskGrade, include_hidden: bool) -> str:
579
- if grade.details.get("compile_error"):
580
- return f"Compilation failed: {grade.details['compile_error']}"
581
- scope = "full grader" if include_hidden else "visible checks"
582
- parts = [f"{scope}: score={grade.score:.2f}"]
583
- if grade.tests_total:
584
- parts.append(f"tests={grade.tests_passed}/{grade.tests_total}")
585
- if grade.runtime_score:
586
- parts.append(f"runtime={grade.runtime_score:.2f}")
587
- if grade.quality_score:
588
- parts.append(f"quality={grade.quality_score:.2f}")
589
- if grade.style_score:
590
- parts.append(f"style={grade.style_score:.2f}")
591
- if grade.timed_out:
592
- parts.append("timed_out=True")
593
- return " | ".join(parts)
594
-
595
- def _sync_score(self, include_hidden: bool) -> None:
596
- grade = grade_task(self._state.current_code, self._task, include_hidden=include_hidden)
597
- self._state.score = grade.score
598
-
599
- def _append_history(self, action_type: str, summary: str, reward: float) -> None:
600
- self._state.history.append(
601
- HistoryEntry(
602
- step=self._state.step_count,
603
- action_type=action_type, # type: ignore[arg-type]
604
- summary=summary,
605
- reward=reward,
606
- )
607
- )
608
-
609
- def _build_observation(self) -> PythonCodeReviewObservation:
610
- return PythonCodeReviewObservation(
611
- task_id=self._task.task_id,
612
- title=self._task.title,
613
- difficulty=self._task.difficulty,
614
- task_kind=self._task.task_kind,
615
- task_description=self._task.task_description,
616
- current_code=self._state.current_code,
617
- errors=self._state.errors,
618
- test_results=self._state.test_results,
619
- history=list(self._state.history),
620
- attempts_remaining=self._state.attempts_remaining,
621
- last_action_status=self._last_status,
622
- score=self._state.score,
623
- reward_details=self._last_reward,
624
- done=self._done,
625
- reward=self._last_reward.value,
626
- metadata={
627
- "episode_id": self._state.episode_id,
628
- "step_count": self._state.step_count,
629
- "task_kind": self._task.task_kind,
630
- "visible_tests": list(self._task.visible_tests),
631
- "info": {
632
- "reward": reward_metadata(self._last_reward),
633
- },
634
- },
635
- )
636
-
637
-
638
- # Backwards-compatible aliases used elsewhere in the repo.
639
- PythonEnvironment = PythonCodeReviewEnvironment
640
- CodeReviewEnvironment = PythonCodeReviewEnvironment
 
32
 
33
  SUPPORTS_CONCURRENT_SESSIONS = True
34
 
35
+ def __init__(self) -> None:
36
+ super().__init__()
37
+ self._task_order = list(task_ids())
38
+ self._task_cursor = -1
39
+ self._task: Optional[TaskSpec] = None
40
+ self._state = PythonCodeReviewState(episode_id=str(uuid4()))
41
+ self._done = False
42
+ self._last_status = "Call reset() to start."
43
+ self._last_reward = RewardDetails(value=0.0, reason="Environment initialized.")
44
+ self._best_visible_test_fraction = 0.0
45
+ self._best_quality_score = 0.0
46
  self._full_correctness_awarded = False
47
  self._syntax_reward_awarded = False
48
 
 
151
  """Return the current environment state."""
152
  return self._state.model_copy(deep=True)
153
 
154
+ def list_task_summaries(self) -> List[object]:
155
+ """Return public task metadata."""
156
+ return list_task_summaries()
157
+
158
+ def list_tasks(self) -> List[object]:
159
+ """Return all public task descriptors."""
160
+ return list_task_descriptors()
161
 
162
  def get_task(self, task_id: str) -> object:
163
  """Return a single task descriptor."""
 
360
  return "Test execution timed out."
361
  return f"Tests: {grade.tests_passed}/{grade.tests_total} passing"
362
 
363
+ def _append_history(self, action_type: str, status: str, reward: float) -> None:
364
+ """Append action to history."""
365
+ entry = HistoryEntry(
366
+ step=self._state.step_count,
367
+ action_type=action_type,
368
+ status=status,
369
+ reward=reward,
370
+ )
371
+ self._state.history.append(entry)
372
+
373
+
374
+ # Backwards-compatible aliases used elsewhere in the repo.
375
+ PythonEnvironment = PythonCodeReviewEnvironment
376
+ CodeReviewEnvironment = PythonCodeReviewEnvironment