import random
from typing import Any, Dict, Optional, Tuple

from environment.models import Observation, Action, Reward
from environment.tasks import TASKS
from environment.graders import grade_easy, grade_medium, grade_hard
from environment.rewards import compute_reward


class CodeReviewEnv:
    """Episodic code-review environment with a reset/step/state interface.

    Each instance is bound to one entry of ``TASKS``. An episode starts with
    :meth:`reset`, advances with :meth:`step`, and ends when the action is
    flagged ``final`` or the task's ``max_steps`` is reached.
    """

    def __init__(self, task_id: str):
        """Bind the environment to ``task_id``.

        Raises:
            ValueError: If ``task_id`` is not a key of ``TASKS``.
        """
        if task_id not in TASKS:
            raise ValueError(f"Unknown task: {task_id}")

        task = TASKS[task_id]
        self.task_id = task_id
        # Stays None until reset() is called for the first time.
        self._state: Optional[Dict[str, Any]] = None
        self._step_count = 0
        self._done = False
        self._final_f1 = None
        self._ground_truth = task["ground_truth"]
        self._max_steps = task["max_steps"]
        # Use a local random instance for isolation
        self._rng = random.Random(42)

    def reset(self) -> Observation:
        """Start a new episode and return its initial observation."""
        # Re-seed the instance RNG on every reset.
        self._rng.seed(42)
        self._step_count = 0
        self._done = False
        self._final_f1 = None

        task = TASKS[self.task_id]
        self._state = {
            "code": task["code"],
            "instructions": task["instructions"],
            "issues_reported": [],
        }
        return Observation(
            code=self._state["code"],
            step_count=self._step_count,
            previous_feedback="",
            done=False,
        )

    def step(self, action: Action) -> Tuple[Observation, Reward, bool, Dict[str, Any]]:
        """Apply ``action`` and advance the episode by one step.

        Returns:
            A ``(observation, reward, done, info)`` tuple. On the terminal
            step the reward value is the grader's final score and ``info``
            carries it under ``"final_f1"``; otherwise ``info`` is empty.

        Raises:
            RuntimeError: If the episode has already finished, or if
                :meth:`reset` has never been called.
        """
        if self._done:
            raise RuntimeError("Episode already done. Call reset().")
        if self._state is None:
            # Fail before mutating any counters; without this guard the
            # item assignment below raises an opaque TypeError on None.
            raise RuntimeError("Environment not reset. Call reset() before step().")

        self._step_count += 1
        self._state["issues_reported"] = action.issues

        # Compute reward
        reward_obj = compute_reward(
            action=action,
            ground_truth=self._ground_truth,
            step_count=self._step_count,
            max_steps=self._max_steps,
        )

        # Check episode termination
        done = False
        info: Dict[str, Any] = {}
        if action.final or self._step_count >= self._max_steps:
            # Grade the final attempt
            final_score = self._grade(action.issues)
            self._final_f1 = final_score
            done = True
            info["final_f1"] = final_score
            # Override reward: give final F1 as reward for the terminal step
            reward_obj = Reward(
                value=final_score,
                reason=f"Episode finished. F1={final_score}",
            )

        self._done = done
        obs = Observation(
            code=self._state["code"],
            step_count=self._step_count,
            previous_feedback=reward_obj.reason,
            done=done,
        )
        return obs, reward_obj, done, info

    def _grade(self, issues):
        """Score ``issues`` with the grader matching this task's id."""
        if self.task_id == "easy":
            return grade_easy(issues)
        if self.task_id == "medium":
            return grade_medium(issues)
        # Any other task id falls through to the hard grader.
        return grade_hard(issues)

    def state(self) -> Dict[str, Any]:
        """Return a shallow copy of the current episode state.

        Returns an empty dict when :meth:`reset` has not been called yet.
        The copy is shallow, so nested values (such as the reported issues)
        are shared with the environment.
        """
        if self._state is None:
            return {}
        return self._state.copy()