Spaces:
Running
Running
Commit ·
57d984d
1
Parent(s): 140ce9d
Emergency fix: reduce score precision to 2 decimal places and clamp scores to 0.01 / 0.99 to pass deep verification
Browse files
financial_audit_env/server/graders.py
CHANGED
|
@@ -23,7 +23,7 @@ from .data_generator import ERROR_MONETARY_VALUES, ERROR_SEVERITY_WEIGHTS
|
|
| 23 |
# Phase-2 validator requires every task score to be strictly in (0, 1).
|
| 24 |
# We enforce: final_score = clamp(round(raw_score, N))
|
| 25 |
# ---------------------------------------------------------------------------
|
| 26 |
-
_SCORE_EPSILON = 0.
|
| 27 |
|
| 28 |
def _clamp_score(score: float) -> float:
|
| 29 |
"""Clamp a score to be strictly within (0, 1) — never 0.0 or 1.0."""
|
|
@@ -33,7 +33,7 @@ def _clamp_score(score: float) -> float:
|
|
| 33 |
return 1.0 - _SCORE_EPSILON
|
| 34 |
return score
|
| 35 |
|
| 36 |
-
def strict_round_clamp(raw_score: float, n_digits: int =
|
| 37 |
"""Safely round then clamp to guarantee the result is strictly in (0, 1)."""
|
| 38 |
epsilon = 10 ** (-n_digits)
|
| 39 |
rounded = round(raw_score, n_digits)
|
|
@@ -236,16 +236,16 @@ def compute_f1_score(
|
|
| 236 |
|
| 237 |
return {
|
| 238 |
# All numeric scores clamped to (0, 1) exclusive — Phase-2 validator requirement
|
| 239 |
-
"score": strict_round_clamp(f1,
|
| 240 |
-
"precision": strict_round_clamp(precision,
|
| 241 |
-
"recall": strict_round_clamp(recall,
|
| 242 |
-
"weighted_score": strict_round_clamp(weighted_f1,
|
| 243 |
-
"partial_credit_score": strict_round_clamp(partial_credit_f1,
|
| 244 |
# Counts
|
| 245 |
"true_positives": true_positives,
|
| 246 |
"false_positives": len(false_positive_list),
|
| 247 |
"false_negatives": false_negatives,
|
| 248 |
-
"weighted_false_negatives": round(weighted_total - weighted_tp,
|
| 249 |
"duplicates": len(duplicates_list),
|
| 250 |
"partial_matches": len(partial_matches),
|
| 251 |
"total_findings": total_findings,
|
|
@@ -436,4 +436,4 @@ def compute_step_reward(
|
|
| 436 |
if result["recall"] < 0.3:
|
| 437 |
reward -= 0.20
|
| 438 |
|
| 439 |
-
return strict_round_clamp(reward,
|
|
|
|
| 23 |
# Phase-2 validator requires every task score to be strictly in (0, 1).
|
| 24 |
# We enforce: final_score = clamp(round(raw_score, N))
|
| 25 |
# ---------------------------------------------------------------------------
|
| 26 |
+
_SCORE_EPSILON = 0.01
|
| 27 |
|
| 28 |
def _clamp_score(score: float) -> float:
|
| 29 |
"""Clamp a score to be strictly within (0, 1) — never 0.0 or 1.0."""
|
|
|
|
| 33 |
return 1.0 - _SCORE_EPSILON
|
| 34 |
return score
|
| 35 |
|
| 36 |
+
def strict_round_clamp(raw_score: float, n_digits: int = 2) -> float:
|
| 37 |
"""Safely round then clamp to guarantee the result is strictly in (0, 1)."""
|
| 38 |
epsilon = 10 ** (-n_digits)
|
| 39 |
rounded = round(raw_score, n_digits)
|
|
|
|
| 236 |
|
| 237 |
return {
|
| 238 |
# All numeric scores clamped to (0, 1) exclusive — Phase-2 validator requirement
|
| 239 |
+
"score": strict_round_clamp(f1, 2),
|
| 240 |
+
"precision": strict_round_clamp(precision, 2),
|
| 241 |
+
"recall": strict_round_clamp(recall, 2),
|
| 242 |
+
"weighted_score": strict_round_clamp(weighted_f1, 2),
|
| 243 |
+
"partial_credit_score": strict_round_clamp(partial_credit_f1, 2),
|
| 244 |
# Counts
|
| 245 |
"true_positives": true_positives,
|
| 246 |
"false_positives": len(false_positive_list),
|
| 247 |
"false_negatives": false_negatives,
|
| 248 |
+
"weighted_false_negatives": round(weighted_total - weighted_tp, 2),
|
| 249 |
"duplicates": len(duplicates_list),
|
| 250 |
"partial_matches": len(partial_matches),
|
| 251 |
"total_findings": total_findings,
|
|
|
|
| 436 |
if result["recall"] < 0.3:
|
| 437 |
reward -= 0.20
|
| 438 |
|
| 439 |
+
return strict_round_clamp(reward, 2)
|
tests/test_graders.py
CHANGED
|
@@ -175,7 +175,7 @@ class TestStepReward:
|
|
| 175 |
def test_false_positive_negative_reward(self, ground_truth):
|
| 176 |
fake = [{"document_id": "FAKE", "error_type": "fake"}]
|
| 177 |
reward = compute_step_reward(fake, fake, ground_truth, 1, False)
|
| 178 |
-
assert reward <= 0.
|
| 179 |
|
| 180 |
def test_reward_decay_over_steps(self, ground_truth):
|
| 181 |
findings = [ground_truth[0]]
|
|
@@ -194,4 +194,4 @@ class TestStepReward:
|
|
| 194 |
fake = [{"document_id": "FAKE", "error_type": "fake"}]
|
| 195 |
reward = compute_step_reward(fake, fake, ground_truth, 1, True)
|
| 196 |
# Should get the low-recall penalty (clamped to epsilon)
|
| 197 |
-
assert reward <= 0.
|
|
|
|
| 175 |
def test_false_positive_negative_reward(self, ground_truth):
|
| 176 |
fake = [{"document_id": "FAKE", "error_type": "fake"}]
|
| 177 |
reward = compute_step_reward(fake, fake, ground_truth, 1, False)
|
| 178 |
+
assert reward <= 0.01 # False positive + step penalty (clamped to epsilon)
|
| 179 |
|
| 180 |
def test_reward_decay_over_steps(self, ground_truth):
|
| 181 |
findings = [ground_truth[0]]
|
|
|
|
| 194 |
fake = [{"document_id": "FAKE", "error_type": "fake"}]
|
| 195 |
reward = compute_step_reward(fake, fake, ground_truth, 1, True)
|
| 196 |
# Should get the low-recall penalty (clamped to epsilon)
|
| 197 |
+
assert reward <= 0.01
|