balloonmann committed on
Commit
57d984d
·
1 Parent(s): 140ce9d

Emergency fix: reduce score rounding to 2 decimal places and clamp scores to the range 0.01–0.99 (epsilon 0.01) to pass deep verification

Browse files
financial_audit_env/server/graders.py CHANGED
@@ -23,7 +23,7 @@ from .data_generator import ERROR_MONETARY_VALUES, ERROR_SEVERITY_WEIGHTS
23
  # Phase-2 validator requires every task score to be strictly in (0, 1).
24
  # We enforce: final_score = clamp(round(raw_score, N))
25
  # ---------------------------------------------------------------------------
26
- _SCORE_EPSILON = 0.0001
27
 
28
  def _clamp_score(score: float) -> float:
29
  """Clamp a score to be strictly within (0, 1) — never 0.0 or 1.0."""
@@ -33,7 +33,7 @@ def _clamp_score(score: float) -> float:
33
  return 1.0 - _SCORE_EPSILON
34
  return score
35
 
36
- def strict_round_clamp(raw_score: float, n_digits: int = 4) -> float:
37
  """Safely round then clamp to guarantee the result is strictly in (0, 1)."""
38
  epsilon = 10 ** (-n_digits)
39
  rounded = round(raw_score, n_digits)
@@ -236,16 +236,16 @@ def compute_f1_score(
236
 
237
  return {
238
  # All numeric scores clamped to (0, 1) exclusive — Phase-2 validator requirement
239
- "score": strict_round_clamp(f1, 4),
240
- "precision": strict_round_clamp(precision, 4),
241
- "recall": strict_round_clamp(recall, 4),
242
- "weighted_score": strict_round_clamp(weighted_f1, 4),
243
- "partial_credit_score": strict_round_clamp(partial_credit_f1, 4),
244
  # Counts
245
  "true_positives": true_positives,
246
  "false_positives": len(false_positive_list),
247
  "false_negatives": false_negatives,
248
- "weighted_false_negatives": round(weighted_total - weighted_tp, 4),
249
  "duplicates": len(duplicates_list),
250
  "partial_matches": len(partial_matches),
251
  "total_findings": total_findings,
@@ -436,4 +436,4 @@ def compute_step_reward(
436
  if result["recall"] < 0.3:
437
  reward -= 0.20
438
 
439
- return strict_round_clamp(reward, 4)
 
23
  # Phase-2 validator requires every task score to be strictly in (0, 1).
24
  # We enforce: final_score = clamp(round(raw_score, N))
25
  # ---------------------------------------------------------------------------
26
+ _SCORE_EPSILON = 0.01
27
 
28
  def _clamp_score(score: float) -> float:
29
  """Clamp a score to be strictly within (0, 1) — never 0.0 or 1.0."""
 
33
  return 1.0 - _SCORE_EPSILON
34
  return score
35
 
36
+ def strict_round_clamp(raw_score: float, n_digits: int = 2) -> float:
37
  """Safely round then clamp to guarantee the result is strictly in (0, 1)."""
38
  epsilon = 10 ** (-n_digits)
39
  rounded = round(raw_score, n_digits)
 
236
 
237
  return {
238
  # All numeric scores clamped to (0, 1) exclusive — Phase-2 validator requirement
239
+ "score": strict_round_clamp(f1, 2),
240
+ "precision": strict_round_clamp(precision, 2),
241
+ "recall": strict_round_clamp(recall, 2),
242
+ "weighted_score": strict_round_clamp(weighted_f1, 2),
243
+ "partial_credit_score": strict_round_clamp(partial_credit_f1, 2),
244
  # Counts
245
  "true_positives": true_positives,
246
  "false_positives": len(false_positive_list),
247
  "false_negatives": false_negatives,
248
+ "weighted_false_negatives": round(weighted_total - weighted_tp, 2),
249
  "duplicates": len(duplicates_list),
250
  "partial_matches": len(partial_matches),
251
  "total_findings": total_findings,
 
436
  if result["recall"] < 0.3:
437
  reward -= 0.20
438
 
439
+ return strict_round_clamp(reward, 2)
tests/test_graders.py CHANGED
@@ -175,7 +175,7 @@ class TestStepReward:
175
  def test_false_positive_negative_reward(self, ground_truth):
176
  fake = [{"document_id": "FAKE", "error_type": "fake"}]
177
  reward = compute_step_reward(fake, fake, ground_truth, 1, False)
178
- assert reward <= 0.0001 # False positive + step penalty (clamped to epsilon)
179
 
180
  def test_reward_decay_over_steps(self, ground_truth):
181
  findings = [ground_truth[0]]
@@ -194,4 +194,4 @@ class TestStepReward:
194
  fake = [{"document_id": "FAKE", "error_type": "fake"}]
195
  reward = compute_step_reward(fake, fake, ground_truth, 1, True)
196
  # Should get the low-recall penalty (clamped to epsilon)
197
- assert reward <= 0.0001
 
175
  def test_false_positive_negative_reward(self, ground_truth):
176
  fake = [{"document_id": "FAKE", "error_type": "fake"}]
177
  reward = compute_step_reward(fake, fake, ground_truth, 1, False)
178
+ assert reward <= 0.01 # False positive + step penalty (clamped to epsilon)
179
 
180
  def test_reward_decay_over_steps(self, ground_truth):
181
  findings = [ground_truth[0]]
 
194
  fake = [{"document_id": "FAKE", "error_type": "fake"}]
195
  reward = compute_step_reward(fake, fake, ground_truth, 1, True)
196
  # Should get the low-recall penalty (clamped to epsilon)
197
+ assert reward <= 0.01