Spaces:
Sleeping
Sleeping
Fix structural phase 2 validation: clamp all scores to lie strictly within the open interval (0, 1)
Browse files
- env/graders.py +8 -8
- inference.py +1 -1
- tests/test_environment.py +1 -1
env/graders.py
CHANGED
|
@@ -13,7 +13,7 @@ def grade_easy(state: EnvironmentState) -> float:
|
|
| 13 |
|
| 14 |
if "escalate" in actions:
|
| 15 |
reward -= 0.5 # penalty for unnecessary escalation
|
| 16 |
-
return max(0.
|
| 17 |
|
| 18 |
def grade_medium(state: EnvironmentState) -> float:
|
| 19 |
# Requires: check_policy, reply_to_customer (explaining policy), close_ticket
|
|
@@ -28,10 +28,10 @@ def grade_medium(state: EnvironmentState) -> float:
|
|
| 28 |
if "close_ticket" in actions:
|
| 29 |
reward += 0.3
|
| 30 |
|
| 31 |
-
if "issue_refund" in actions:
|
| 32 |
-
return 0.
|
| 33 |
|
| 34 |
-
return max(0.
|
| 35 |
|
| 36 |
def grade_hard(state: EnvironmentState) -> float:
|
| 37 |
# Requires: fetch_user_data, escalate to "billing_tier2", reply_to_customer
|
|
@@ -57,7 +57,7 @@ def grade_hard(state: EnvironmentState) -> float:
|
|
| 57 |
if "close_ticket" in actions:
|
| 58 |
reward -= 0.3 # can't close without resolving escalate
|
| 59 |
|
| 60 |
-
return max(0.
|
| 61 |
|
| 62 |
def grade_fraud_detection(state: EnvironmentState) -> float:
|
| 63 |
# Requires: fetch_user_data, check_policy, deny refund, close_ticket
|
|
@@ -77,9 +77,9 @@ def grade_fraud_detection(state: EnvironmentState) -> float:
|
|
| 77 |
print("Reward after close_ticket:", reward)
|
| 78 |
|
| 79 |
if "issue_refund" in actions: # fatal mistake
|
| 80 |
-
return 0.
|
| 81 |
|
| 82 |
-
return max(0.
|
| 83 |
|
| 84 |
def grade(state: EnvironmentState) -> float:
|
| 85 |
if state.current_task_id == "task_fraud_detection":
|
|
@@ -90,4 +90,4 @@ def grade(state: EnvironmentState) -> float:
|
|
| 90 |
return grade_medium(state)
|
| 91 |
elif state.task_difficulty == "hard":
|
| 92 |
return grade_hard(state)
|
| 93 |
-
return 0.
|
|
|
|
| 13 |
|
| 14 |
if "escalate" in actions:
|
| 15 |
reward -= 0.5 # penalty for unnecessary escalation
|
| 16 |
+
return max(0.01, min(0.99, reward))
|
| 17 |
|
| 18 |
def grade_medium(state: EnvironmentState) -> float:
|
| 19 |
# Requires: check_policy, reply_to_customer (explaining policy), close_ticket
|
|
|
|
| 28 |
if "close_ticket" in actions:
|
| 29 |
reward += 0.3
|
| 30 |
|
| 31 |
+
if "issue_refund" in actions:
|
| 32 |
+
return 0.01
|
| 33 |
|
| 34 |
+
return max(0.01, min(0.99, reward))
|
| 35 |
|
| 36 |
def grade_hard(state: EnvironmentState) -> float:
|
| 37 |
# Requires: fetch_user_data, escalate to "billing_tier2", reply_to_customer
|
|
|
|
| 57 |
if "close_ticket" in actions:
|
| 58 |
reward -= 0.3 # can't close without resolving escalate
|
| 59 |
|
| 60 |
+
return max(0.01, min(0.99, reward))
|
| 61 |
|
| 62 |
def grade_fraud_detection(state: EnvironmentState) -> float:
|
| 63 |
# Requires: fetch_user_data, check_policy, deny refund, close_ticket
|
|
|
|
| 77 |
print("Reward after close_ticket:", reward)
|
| 78 |
|
| 79 |
if "issue_refund" in actions: # fatal mistake
|
| 80 |
+
return 0.01
|
| 81 |
|
| 82 |
+
return max(0.01, min(0.99, reward))
|
| 83 |
|
| 84 |
def grade(state: EnvironmentState) -> float:
|
| 85 |
if state.current_task_id == "task_fraud_detection":
|
|
|
|
| 90 |
return grade_medium(state)
|
| 91 |
elif state.task_difficulty == "hard":
|
| 92 |
return grade_hard(state)
|
| 93 |
+
return 0.01
|
inference.py
CHANGED
|
@@ -162,7 +162,7 @@ async def run_task(task_id: str, client: OpenAI) -> None:
|
|
| 162 |
score = actual_reward
|
| 163 |
break
|
| 164 |
|
| 165 |
-
score = min(max(score, 0.
|
| 166 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 167 |
finally:
|
| 168 |
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
|
|
|
| 162 |
score = actual_reward
|
| 163 |
break
|
| 164 |
|
| 165 |
+
score = min(max(score, 0.01), 0.99)
|
| 166 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 167 |
finally:
|
| 168 |
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
tests/test_environment.py
CHANGED
|
@@ -102,4 +102,4 @@ def test_fraud_detection_task():
|
|
| 102 |
action4 = Action(action_type="close_ticket", parameters={"resolution": "Refund denied due to chargebacks."})
|
| 103 |
obs4, reward4, done4, info4 = env.step(action4)
|
| 104 |
assert done4 is True
|
| 105 |
-
assert info4.get("current_reward", -1.0) == 0.
|
|
|
|
| 102 |
action4 = Action(action_type="close_ticket", parameters={"resolution": "Refund denied due to chargebacks."})
|
| 103 |
obs4, reward4, done4, info4 = env.step(action4)
|
| 104 |
assert done4 is True
|
| 105 |
+
assert info4.get("current_reward", -1.0) == 0.01
|