Spaces:

varb15
/

dataqa-env

Sleeping

varb15 committed on Apr 8

Commit

c5b540e

verified ·

1 Parent(s): 9f1cf04

Upload folder using huggingface_hub

Files changed (2) hide show

dataqa_env/server/environment.py CHANGED Viewed

@@ -26,6 +26,14 @@ from .tasks import PlantedIssue, Task, get_task, list_tasks
 IDENTIFY_WEIGHT = 0.6
 FIX_WEIGHT = 0.4
 def parse_issue_key(raw: str) -> Optional[str]:
     """
@@ -416,7 +424,7 @@ class DataQAEnvironment(Environment):
             num_issues_hint=len(self._current_task.planted_issues),
             max_steps=self._current_task.max_steps,
             done=False,
-            reward=0.0,
         )
     def step(
@@ -596,7 +604,7 @@ class DataQAEnvironment(Environment):
             num_issues_hint=len(self._current_task.planted_issues),
             max_steps=self._state.max_steps,
             done=is_done,
-            reward=self._best_score,
             metadata={
                 "identify_f1": identify_f1,
                 "identify_score": identify_score,

 IDENTIFY_WEIGHT = 0.6
 FIX_WEIGHT = 0.4
+# Clamp reward to strict (0, 1) — validators reject exactly 0.0 and 1.0
+REWARD_MIN = 0.001
+REWARD_MAX = 0.999
+def _clamp_reward(r: float) -> float:
+    return max(REWARD_MIN, min(REWARD_MAX, r))
 def parse_issue_key(raw: str) -> Optional[str]:
     """
             num_issues_hint=len(self._current_task.planted_issues),
             max_steps=self._current_task.max_steps,
             done=False,
+            reward=_clamp_reward(0.0),
         )
     def step(
             num_issues_hint=len(self._current_task.planted_issues),
             max_steps=self._state.max_steps,
             done=is_done,
+            reward=_clamp_reward(self._best_score),
             metadata={
                 "identify_f1": identify_f1,
                 "identify_score": identify_score,

tests/test_environment.py CHANGED Viewed

@@ -262,7 +262,7 @@ class TestDataQAEnvironment:
         assert obs.num_issues_hint == 6
         assert obs.max_steps == 3
         assert obs.done is False
-        assert obs.reward == 0.0
         assert "fix" in obs.feedback.lower()  # mentions fix phase
     def test_reset_medium(self, env):
@@ -318,7 +318,7 @@ class TestDataQAEnvironment:
         env.reset(task_id="easy")
         action = DataQAAction(issues=[], task_id="easy")
         obs = env.step(action)
-        assert obs.reward == 0.0
     def test_step_exhausts_max_steps(self, env):
         env.reset(task_id="easy")

         assert obs.num_issues_hint == 6
         assert obs.max_steps == 3
         assert obs.done is False
+        assert obs.reward < 0.01  # clamped to 0.001, not exactly 0.0
         assert "fix" in obs.feedback.lower()  # mentions fix phase
     def test_reset_medium(self, env):
         env.reset(task_id="easy")
         action = DataQAAction(issues=[], task_id="easy")
         obs = env.step(action)
+        assert obs.reward < 0.01  # clamped to 0.001, not exactly 0.0
     def test_step_exhausts_max_steps(self, env):
         env.reset(task_id="easy")