Sid8421 committed on
Commit
1d7df11
·
1 Parent(s): b63619e

Fix structural phase 2 validation: clamp all scores strictly within the open interval (0, 1)

Browse files
Files changed (3) hide show
  1. env/graders.py +8 -8
  2. inference.py +1 -1
  3. tests/test_environment.py +1 -1
env/graders.py CHANGED
@@ -13,7 +13,7 @@ def grade_easy(state: EnvironmentState) -> float:
13
 
14
  if "escalate" in actions:
15
  reward -= 0.5 # penalty for unnecessary escalation
16
- return max(0.0, min(1.0, reward))
17
 
18
  def grade_medium(state: EnvironmentState) -> float:
19
  # Requires: check_policy, reply_to_customer (explaining policy), close_ticket
@@ -28,10 +28,10 @@ def grade_medium(state: EnvironmentState) -> float:
28
  if "close_ticket" in actions:
29
  reward += 0.3
30
 
31
- if "issue_refund" in actions: # fatal mistake
32
- return 0.0
33
 
34
- return max(0.0, min(1.0, reward))
35
 
36
  def grade_hard(state: EnvironmentState) -> float:
37
  # Requires: fetch_user_data, escalate to "billing_tier2", reply_to_customer
@@ -57,7 +57,7 @@ def grade_hard(state: EnvironmentState) -> float:
57
  if "close_ticket" in actions:
58
  reward -= 0.3 # can't close without resolving escalate
59
 
60
- return max(0.0, min(1.0, reward))
61
 
62
  def grade_fraud_detection(state: EnvironmentState) -> float:
63
  # Requires: fetch_user_data, check_policy, deny refund, close_ticket
@@ -77,9 +77,9 @@ def grade_fraud_detection(state: EnvironmentState) -> float:
77
  print("Reward after close_ticket:", reward)
78
 
79
  if "issue_refund" in actions: # fatal mistake
80
- return 0.0
81
 
82
- return max(0.0, min(1.0, reward))
83
 
84
  def grade(state: EnvironmentState) -> float:
85
  if state.current_task_id == "task_fraud_detection":
@@ -90,4 +90,4 @@ def grade(state: EnvironmentState) -> float:
90
  return grade_medium(state)
91
  elif state.task_difficulty == "hard":
92
  return grade_hard(state)
93
- return 0.0
 
13
 
14
  if "escalate" in actions:
15
  reward -= 0.5 # penalty for unnecessary escalation
16
+ return max(0.01, min(0.99, reward))
17
 
18
  def grade_medium(state: EnvironmentState) -> float:
19
  # Requires: check_policy, reply_to_customer (explaining policy), close_ticket
 
28
  if "close_ticket" in actions:
29
  reward += 0.3
30
 
31
+ if "issue_refund" in actions:
32
+ return 0.01
33
 
34
+ return max(0.01, min(0.99, reward))
35
 
36
  def grade_hard(state: EnvironmentState) -> float:
37
  # Requires: fetch_user_data, escalate to "billing_tier2", reply_to_customer
 
57
  if "close_ticket" in actions:
58
  reward -= 0.3 # can't close without resolving escalate
59
 
60
+ return max(0.01, min(0.99, reward))
61
 
62
  def grade_fraud_detection(state: EnvironmentState) -> float:
63
  # Requires: fetch_user_data, check_policy, deny refund, close_ticket
 
77
  print("Reward after close_ticket:", reward)
78
 
79
  if "issue_refund" in actions: # fatal mistake
80
+ return 0.01
81
 
82
+ return max(0.01, min(0.99, reward))
83
 
84
  def grade(state: EnvironmentState) -> float:
85
  if state.current_task_id == "task_fraud_detection":
 
90
  return grade_medium(state)
91
  elif state.task_difficulty == "hard":
92
  return grade_hard(state)
93
+ return 0.01
inference.py CHANGED
@@ -162,7 +162,7 @@ async def run_task(task_id: str, client: OpenAI) -> None:
162
  score = actual_reward
163
  break
164
 
165
- score = min(max(score, 0.0), 1.0)
166
  success = score >= SUCCESS_SCORE_THRESHOLD
167
  finally:
168
  log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
 
162
  score = actual_reward
163
  break
164
 
165
+ score = min(max(score, 0.01), 0.99)
166
  success = score >= SUCCESS_SCORE_THRESHOLD
167
  finally:
168
  log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
tests/test_environment.py CHANGED
@@ -102,4 +102,4 @@ def test_fraud_detection_task():
102
  action4 = Action(action_type="close_ticket", parameters={"resolution": "Refund denied due to chargebacks."})
103
  obs4, reward4, done4, info4 = env.step(action4)
104
  assert done4 is True
105
- assert info4.get("current_reward", -1.0) == 0.0
 
102
  action4 = Action(action_type="close_ticket", parameters={"resolution": "Refund denied due to chargebacks."})
103
  obs4, reward4, done4, info4 = env.step(action4)
104
  assert done4 is True
105
+ assert info4.get("current_reward", -1.0) == 0.01