Spaces:

h1manshu
/

code_review

Sleeping

App Files Files Community

h1manshu committed on Apr 7

Commit

d0b56d7

verified ·

1 Parent(s): 9e4673c

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

openenv.yaml +0 -3
server/code_review_environment.py +81 -90

openenv.yaml CHANGED Viewed

@@ -4,7 +4,6 @@ type: space
 runtime: fastapi
 app: server.app:app
 port: 8000
 tasks:
   - id: task_1
     description: "Easy — missing import detection"
@@ -30,9 +29,7 @@ tasks:
     description: "Hard — cross-file null handling bug"
     max_steps: 3
     grader: graders:CodeReviewGrader
 endpoints:
   reset: /reset
   step: /step
-  state: /state   # ✅ added
   health: /health

 runtime: fastapi
 app: server.app:app
 port: 8000
 tasks:
   - id: task_1
     description: "Easy — missing import detection"
     description: "Hard — cross-file null handling bug"
     max_steps: 3
     grader: graders:CodeReviewGrader
 endpoints:
   reset: /reset
   step: /step
   health: /health

server/code_review_environment.py CHANGED Viewed

@@ -3,19 +3,19 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 """
 Code Review Environment Implementation.
 An environment that serves pull requests from a JSON dataset and grades
 review actions on issue detection, fix quality, and the final decision.
 """
 from uuid import uuid4
 from openenv.core.env_server.interfaces import Environment
 from openenv.core.env_server.types import State
 try:
     from ..models import (
         CodeReviewAction,
@@ -32,14 +32,14 @@ except ImportError:
         CodeReviewPullRequest,
         CodeReviewStepResponse,
     )
 import json
 from pathlib import Path
 import re
 from difflib import SequenceMatcher
 dataset_path = Path(__file__).parent.parent / "dataset" / "dataset.json"
 STOP_WORDS = {
     "use",
     "the",
@@ -60,15 +60,15 @@ STOP_WORDS = {
     "from",
     "that",
 }
 class CodeReviewEnvironment(Environment):
     """
     A simple echo environment that echoes back messages.
     This environment is designed for testing the HTTP server infrastructure.
     It maintains minimal state and simply echoes back whatever message it receives.
     Example:
         >>> env = CodeReviewEnvironment()
         >>> obs = env.reset()
@@ -78,57 +78,48 @@ class CodeReviewEnvironment(Environment):
         >>> print(obs.echoed_message)  # "Hello"
         >>> print(obs.message_length)  # 5
     """
     # Enable concurrent WebSocket sessions.
     # Set to True if your environment isolates state between instances.
     # When True, multiple WebSocket clients can connect simultaneously, each
     # getting their own environment instance (when using factory mode in app.py).
     SUPPORTS_CONCURRENT_SESSIONS: bool = True
-    def __init__(self , task=None):
         """Initialize the code_review environment.

         Loads the review dataset from ``dataset_path`` and immediately calls
         ``reset()`` so the instance starts with an active episode.

         Args:
             task: Optional task descriptor stored on ``self.task``; ``reset()``
                 reads its ``"id"`` to pick the matching dataset sample.
         """
         # Fresh episode bookkeeping: random episode id, zero steps taken.
         self._state = State(episode_id=str(uuid4()), step_count=0)
         self._reset_count = 0
         # An episode ends once this many steps have been taken (see step()).
         self.max_steps = 3
         self.task_index = 0
-        self.task = task
         # NOTE(review): no explicit encoding is passed to open(); the platform
         # default is used -- confirm dataset.json is readable that way.
         with open(dataset_path) as f:
             self.dataset = json.load(f)
         # Populate the per-episode attributes (sample, pr, gt, history, ...).
         self.reset()
     def reset(self) -> CodeReviewObservation:
         """
         Reset the environment.
         Returns:
             CodeReviewObservation with a ready message
         """
         self._state = State(episode_id=str(uuid4()), step_count=0)
         self._reset_count += 1
-        if self.task:
-            task_id = self.task.get("id")
-            for sample in self.dataset:
-                if sample["id"] == task_id:
-                    self.sample = sample
-                    break
-            else:
-                self.sample = self.dataset[0]
-        else:
-            self.sample = self.dataset[0]
         self.pr = CodeReviewPullRequest(**self.sample["pr"])
         self.gt = self.sample["ground_truth"]
         self.task_type = self.sample.get("task_type", "unknown")
         self.history = []
         self.step_count = 0
         self.done = False
         # State evolution variables
         self.issues_identified = []
         self.fix_attempted = False
         return CodeReviewObservation(
             # echoed_message="Code Review environment ready!",
             pr=self.pr,
@@ -138,25 +129,25 @@ class CodeReviewEnvironment(Environment):
             reward=0.0,
             done=False,
         )
     def step(self, action: CodeReviewAction) -> CodeReviewObservation:  # type: ignore[override]
         """
         Execute a step in the environment by echoing the message.
         Args:
             action: CodeReviewAction containing the message to echo
         Returns:
             CodeReviewObservation with the echoed message and its length
         """
         self._state.step_count += 1
         # print("RAW ACTION TYPE:", type(action))
         # print("RAW ACTION:", action)
         try:
             if isinstance(action, dict):
                 action = CodeReviewAction(**action)
             elif isinstance(action, (list, tuple)):
                 action = CodeReviewAction(
                     action_type=action[0],
@@ -164,60 +155,60 @@ class CodeReviewEnvironment(Environment):
                     suggested_code=action[2] if len(action) > 2 else None,
                     decision=action[3] if len(action) > 3 else None,
                 )
             elif isinstance(action, CodeReviewAction):
                 pass
             else:
                 raise ValueError(f"Unsupported action type: {type(action)}")
         except Exception as e:
             print(f"Error occurred while processing action: {e}")
             return self._invalid_step()
         self.step_count += 1
         self.history.append(action)
         if action.action_type == "comment" and action.comment:
             self.issues_identified.append(action.comment)
         if action.action_type == "suggest_fix":
             self.fix_attempted = True
         score = self.grade_action(action, self.gt)
         # print(f"Step {self.step_count} - Score: {score:.4f}")
         bonus = 0.0
         # Encourage meaningful comments
         if action.comment and len(action.comment) > 30:
             bonus += 0.1
         # Encourage early correct decisions
         if action.action_type == "final_decision" and self.step_count <= 2:
             bonus += 0.1
         # Penalize useless steps
         if not action.comment and action.action_type != "final_decision":
             bonus -= 0.1
         # Penalize long trajectories
         if self.step_count > 3:
             bonus -= 0.05
         score += bonus
         score = max(0.0, min(score, 1.0))
         # print("Final Score == " , score)
         done = (
             action.action_type == "final_decision" or self.step_count >= self.max_steps
         )
         if done:
             score = max([self.grade_action(a, self.gt) for a in self.history] or [0.0])
         # print(type(CodeReviewObservation))
         # print(type(CodeReviewReward))
         obs = CodeReviewObservation(
             pr=self.pr,
             previous_comments=[a.comment for a in self.history if a.comment],
@@ -225,14 +216,14 @@ class CodeReviewEnvironment(Environment):
             max_steps=self.max_steps,
         )
         # print("Obs == " , obs)
         rew = CodeReviewReward(score=score, feedback="graded")
         print("Score == ", type(rew.score), " --- ", rew.score)
         # print("FINAL REWARD TYPE:", type(rew))
         # print("FINAL REWARD:", rew)
         # print("Got the culprit I guess....")
         return CodeReviewStepResponse(
             observation=obs,
             reward=rew.score,
@@ -243,17 +234,17 @@ class CodeReviewEnvironment(Environment):
                 "fix_attempted": self.fix_attempted,
             },
         )
     @property
     def state(self) -> State:
         """Expose the episode-tracking ``State`` object held by this environment.

         Returns:
             The ``State`` currently stored in ``self._state`` (it carries the
             episode id and the step counter).
         """
         current_state = self._state
         return current_state
     def _invalid_step(self):
         rew = CodeReviewReward(score=0.0, feedback="invalid action")
         obs = CodeReviewObservation(
@@ -269,46 +260,46 @@ class CodeReviewEnvironment(Environment):
             done=True,
             info={"error": "invalid_action"},
         )
     def grade_action(self, action, ground_truth):
         """Grade one review action against the ground truth.

         The result is a weighted sum of three sub-scores: issue detection
         (40%), fix quality (30%) and the final decision (30%).

         Args:
             action: Review action exposing ``comment`` and ``suggested_code``;
                 it is also handed as a whole to ``score_decision``.
             ground_truth: Mapping with the expected review outcome.

         Returns:
             The combined score, clamped to the range [0.0, 1.0].
         """
         # Weighted contributions, evaluated in the order issue -> fix -> decision.
         issue_part = 0.4 * self.score_issues(action.comment, ground_truth)
         fix_part = 0.3 * self.score_fix(action.suggested_code, ground_truth)
         decision_part = 0.3 * self.score_decision(action, ground_truth)
         # Accumulate left to right from 0.0 so float rounding stays unchanged.
         total = 0.0 + issue_part + fix_part + decision_part
         # Keep the result inside the valid reward range.
         return max(0.0, min(total, 1.0))
     def normalize(self, text):
         """Return ``text`` lower-cased with surrounding whitespace removed.

         Falsy input such as ``None`` or ``""`` yields an empty string.
         """
         if not text:
             return ""
         return text.lower().strip()
     # ==============================
     # ISSUE MATCH (PARTIAL CREDIT)
     # ==============================
@@ -316,68 +307,68 @@ class CodeReviewEnvironment(Environment):
         issues = ground_truth.get("issues", [])
         if not comment or not issues:
             return 0.0
         comment = self.normalize(comment)
         matches = sum(1 for issue in issues if self.normalize(issue) in comment)
         return matches / len(issues)
     # ==============================
     # FIX MATCH (FUZZY)
     # ==============================
     def score_fix(self, suggested_code: str, ground_truth: dict) -> float:
         """Rate how closely a suggested fix matches the reference fix.

         Both texts are normalized first. A reference that appears verbatim
         inside the suggestion earns full credit; otherwise the score blends
         token overlap (70%) with character-level similarity (30%).

         Args:
             suggested_code: Code proposed by the reviewer; may be empty or None.
             ground_truth: Mapping whose ``"fix"`` entry holds the reference fix.

         Returns:
             A score between 0.0 and 1.0. It is 0.0 when the suggestion or the
             reference is empty, or when the reference yields no tokens; a
             blended score is rounded to four decimals.
         """
         if not suggested_code:
             return 0.0
         reference = self.normalize(ground_truth.get("fix", ""))
         candidate = self.normalize(suggested_code)
         if not reference:
             return 0.0
         # Full credit when the reference fix is contained in the suggestion.
         if reference in candidate:
             return 1.0

         def code_tokens(text: str) -> list[str]:
             # Identifiers, integers and operator runs, minus stop words.
             return [
                 tok
                 for tok in re.findall(r"[a-zA-Z_]\w*|\d+|[=<>!+\-*/]+", text)
                 if tok.lower() not in STOP_WORDS
             ]

         reference_tokens = code_tokens(reference)
         candidate_tokens = set(code_tokens(candidate))
         if not reference_tokens:
             return 0.0
         # Share of reference tokens (duplicates counted) found in the suggestion.
         hits = len([tok for tok in reference_tokens if tok in candidate_tokens])
         token_score = hits / len(reference_tokens)
         # Character-level similarity acts as the secondary signal.
         seq_score = SequenceMatcher(None, reference, candidate).ratio()
         blended = 0.7 * token_score + 0.3 * seq_score
         return round(blended, 4)
     # ==============================
     # DECISION MATCH
     # ==============================
     def score_decision(self, action, ground_truth):
         """Score the final approve/reject style decision of an action.

         Args:
             action: Review action exposing ``action_type`` and ``decision``.
             ground_truth: Mapping whose ``"decision"`` entry is the expected
                 decision (``None`` when the key is absent).

         Returns:
             1.0 for a decision equal to the expected one, 0.2 for any other
             non-empty decision, and 0.0 when the action is not a
             ``"final_decision"`` or carries no decision.
         """
         expected = ground_truth.get("decision")
         # Not a decision step → no contribution
         if action.action_type != "final_decision":
             return 0.0
         # Missing decision → no credit
         if not action.decision:
             return 0.0
         # Correct decision (exact equality; no normalization is applied)
         if action.decision == expected:
             return 1.0
         # Wrong decision → small partial credit rather than zero
-        return 0.2

 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 """
 Code Review Environment Implementation.
 An environment that serves pull requests from a JSON dataset and grades
 review actions on issue detection, fix quality, and the final decision.
 """
 from uuid import uuid4
 from openenv.core.env_server.interfaces import Environment
 from openenv.core.env_server.types import State
 try:
     from ..models import (
         CodeReviewAction,
         CodeReviewPullRequest,
         CodeReviewStepResponse,
     )
 import json
 from pathlib import Path
 import re
 from difflib import SequenceMatcher
 dataset_path = Path(__file__).parent.parent / "dataset" / "dataset.json"
 STOP_WORDS = {
     "use",
     "the",
     "from",
     "that",
 }
 class CodeReviewEnvironment(Environment):
     """
     A simple echo environment that echoes back messages.
     This environment is designed for testing the HTTP server infrastructure.
     It maintains minimal state and simply echoes back whatever message it receives.
     Example:
         >>> env = CodeReviewEnvironment()
         >>> obs = env.reset()
         >>> print(obs.echoed_message)  # "Hello"
         >>> print(obs.message_length)  # 5
     """
     # Enable concurrent WebSocket sessions.
     # Set to True if your environment isolates state between instances.
     # When True, multiple WebSocket clients can connect simultaneously, each
     # getting their own environment instance (when using factory mode in app.py).
     SUPPORTS_CONCURRENT_SESSIONS: bool = True
+    def __init__(self):
         """Initialize the code_review environment.

         Loads the review dataset from ``dataset_path`` and immediately calls
         ``reset()`` so the instance starts with an active episode.
         """
         # Fresh episode bookkeeping: random episode id, zero steps taken.
         self._state = State(episode_id=str(uuid4()), step_count=0)
         self._reset_count = 0
         # An episode ends once this many steps have been taken (see step()).
         self.max_steps = 3
         # reset() increments this before indexing into the dataset, so the
         # constructor's own reset() call selects index 1 % len(dataset).
         self.task_index = 0
         # NOTE(review): no explicit encoding is passed to open(); the platform
         # default is used -- confirm dataset.json is readable that way.
         with open(dataset_path) as f:
             self.dataset = json.load(f)
         # Populate the per-episode attributes (sample, pr, gt, history, ...).
         self.reset()
     def reset(self) -> CodeReviewObservation:
         """
         Reset the environment.
         Returns:
             CodeReviewObservation with a ready message
         """
         self._state = State(episode_id=str(uuid4()), step_count=0)
         self._reset_count += 1
+        self.task_index += 1
+        self.sample = self.dataset[self.task_index % len(self.dataset)]
         self.pr = CodeReviewPullRequest(**self.sample["pr"])
         self.gt = self.sample["ground_truth"]
         self.task_type = self.sample.get("task_type", "unknown")
         self.history = []
         self.step_count = 0
         self.done = False
         # State evolution variables
         self.issues_identified = []
         self.fix_attempted = False
         return CodeReviewObservation(
             # echoed_message="Code Review environment ready!",
             pr=self.pr,
             reward=0.0,
             done=False,
         )
     def step(self, action: CodeReviewAction) -> CodeReviewObservation:  # type: ignore[override]
         """
         Execute a step in the environment by echoing the message.
         Args:
             action: CodeReviewAction containing the message to echo
         Returns:
             CodeReviewObservation with the echoed message and its length
         """
         self._state.step_count += 1
         # print("RAW ACTION TYPE:", type(action))
         # print("RAW ACTION:", action)
         try:
             if isinstance(action, dict):
                 action = CodeReviewAction(**action)
             elif isinstance(action, (list, tuple)):
                 action = CodeReviewAction(
                     action_type=action[0],
                     suggested_code=action[2] if len(action) > 2 else None,
                     decision=action[3] if len(action) > 3 else None,
                 )
             elif isinstance(action, CodeReviewAction):
                 pass
             else:
                 raise ValueError(f"Unsupported action type: {type(action)}")
         except Exception as e:
             print(f"Error occurred while processing action: {e}")
             return self._invalid_step()
         self.step_count += 1
         self.history.append(action)
         if action.action_type == "comment" and action.comment:
             self.issues_identified.append(action.comment)
         if action.action_type == "suggest_fix":
             self.fix_attempted = True
         score = self.grade_action(action, self.gt)
         # print(f"Step {self.step_count} - Score: {score:.4f}")
         bonus = 0.0
         # Encourage meaningful comments
         if action.comment and len(action.comment) > 30:
             bonus += 0.1
         # Encourage early correct decisions
         if action.action_type == "final_decision" and self.step_count <= 2:
             bonus += 0.1
         # Penalize useless steps
         if not action.comment and action.action_type != "final_decision":
             bonus -= 0.1
         # Penalize long trajectories
         if self.step_count > 3:
             bonus -= 0.05
         score += bonus
         score = max(0.0, min(score, 1.0))
         # print("Final Score == " , score)
         done = (
             action.action_type == "final_decision" or self.step_count >= self.max_steps
         )
         if done:
             score = max([self.grade_action(a, self.gt) for a in self.history] or [0.0])
         # print(type(CodeReviewObservation))
         # print(type(CodeReviewReward))
         obs = CodeReviewObservation(
             pr=self.pr,
             previous_comments=[a.comment for a in self.history if a.comment],
             max_steps=self.max_steps,
         )
         # print("Obs == " , obs)
         rew = CodeReviewReward(score=score, feedback="graded")
         print("Score == ", type(rew.score), " --- ", rew.score)
         # print("FINAL REWARD TYPE:", type(rew))
         # print("FINAL REWARD:", rew)
         # print("Got the culprit I guess....")
         return CodeReviewStepResponse(
             observation=obs,
             reward=rew.score,
                 "fix_attempted": self.fix_attempted,
             },
         )
     @property
     def state(self) -> State:
         """Expose the episode-tracking ``State`` object held by this environment.

         Returns:
             The ``State`` currently stored in ``self._state`` (it carries the
             episode id and the step counter).
         """
         current_state = self._state
         return current_state
     def _invalid_step(self):
         rew = CodeReviewReward(score=0.0, feedback="invalid action")
         obs = CodeReviewObservation(
             done=True,
             info={"error": "invalid_action"},
         )
     def grade_action(self, action, ground_truth):
         """Grade one review action against the ground truth.

         The result is a weighted sum of three sub-scores: issue detection
         (40%), fix quality (30%) and the final decision (30%).

         Args:
             action: Review action exposing ``comment`` and ``suggested_code``;
                 it is also handed as a whole to ``score_decision``.
             ground_truth: Mapping with the expected review outcome.

         Returns:
             The combined score, clamped to the range [0.0, 1.0].
         """
         # Weighted contributions, evaluated in the order issue -> fix -> decision.
         issue_part = 0.4 * self.score_issues(action.comment, ground_truth)
         fix_part = 0.3 * self.score_fix(action.suggested_code, ground_truth)
         decision_part = 0.3 * self.score_decision(action, ground_truth)
         # Accumulate left to right from 0.0 so float rounding stays unchanged.
         total = 0.0 + issue_part + fix_part + decision_part
         # Keep the result inside the valid reward range.
         return max(0.0, min(total, 1.0))
     def normalize(self, text):
         """Return ``text`` lower-cased with surrounding whitespace removed.

         Falsy input such as ``None`` or ``""`` yields an empty string.
         """
         if not text:
             return ""
         return text.lower().strip()
     # ==============================
     # ISSUE MATCH (PARTIAL CREDIT)
     # ==============================
         issues = ground_truth.get("issues", [])
         if not comment or not issues:
             return 0.0
         comment = self.normalize(comment)
         matches = sum(1 for issue in issues if self.normalize(issue) in comment)
         return matches / len(issues)
     # ==============================
     # FIX MATCH (FUZZY)
     # ==============================
     def score_fix(self, suggested_code: str, ground_truth: dict) -> float:
         """Rate how closely a suggested fix matches the reference fix.

         Both texts are normalized first. A reference that appears verbatim
         inside the suggestion earns full credit; otherwise the score blends
         token overlap (70%) with character-level similarity (30%).

         Args:
             suggested_code: Code proposed by the reviewer; may be empty or None.
             ground_truth: Mapping whose ``"fix"`` entry holds the reference fix.

         Returns:
             A score between 0.0 and 1.0. It is 0.0 when the suggestion or the
             reference is empty, or when the reference yields no tokens; a
             blended score is rounded to four decimals.
         """
         if not suggested_code:
             return 0.0
         reference = self.normalize(ground_truth.get("fix", ""))
         candidate = self.normalize(suggested_code)
         if not reference:
             return 0.0
         # Full credit when the reference fix is contained in the suggestion.
         if reference in candidate:
             return 1.0

         def code_tokens(text: str) -> list[str]:
             # Identifiers, integers and operator runs, minus stop words.
             return [
                 tok
                 for tok in re.findall(r"[a-zA-Z_]\w*|\d+|[=<>!+\-*/]+", text)
                 if tok.lower() not in STOP_WORDS
             ]

         reference_tokens = code_tokens(reference)
         candidate_tokens = set(code_tokens(candidate))
         if not reference_tokens:
             return 0.0
         # Share of reference tokens (duplicates counted) found in the suggestion.
         hits = len([tok for tok in reference_tokens if tok in candidate_tokens])
         token_score = hits / len(reference_tokens)
         # Character-level similarity acts as the secondary signal.
         seq_score = SequenceMatcher(None, reference, candidate).ratio()
         blended = 0.7 * token_score + 0.3 * seq_score
         return round(blended, 4)
     # ==============================
     # DECISION MATCH
     # ==============================
     def score_decision(self, action, ground_truth):
         """Score the final approve/reject style decision of an action.

         Args:
             action: Review action exposing ``action_type`` and ``decision``.
             ground_truth: Mapping whose ``"decision"`` entry is the expected
                 decision (``None`` when the key is absent).

         Returns:
             1.0 for a decision equal to the expected one, 0.2 for any other
             non-empty decision, and 0.0 when the action is not a
             ``"final_decision"`` or carries no decision.
         """
         expected = ground_truth.get("decision")
         # Not a decision step → no contribution
         if action.action_type != "final_decision":
             return 0.0
         # Missing decision → no credit
         if not action.decision:
             return 0.0
         # Correct decision (exact equality; no normalization is applied)
         if action.decision == expected:
             return 1.0
         # Wrong decision → small partial credit rather than zero
+        return 0.2