DevikaJ2005 committed on
Commit
df73ff6
·
1 Parent(s): 723cc80

Clamp task scores to strict open interval

Browse files
Files changed (2) hide show
  1. graders.py +18 -5
  2. inference.py +45 -16
graders.py CHANGED
@@ -9,7 +9,9 @@ from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_
9
 
10
 
11
  class FraudShieldGrader:
12
- """Task graders returning scores in the inclusive range [0.0, 1.0]."""
 
 
13
 
14
  @staticmethod
15
  def _validate(predictions: List[str], ground_truth: List[str], confidences: List[float]) -> bool:
@@ -25,6 +27,17 @@ class FraudShieldGrader:
25
  def _score_confidence(predictions: List[str], confidences: List[float]) -> List[float]:
26
  return [confidence if pred == "fraud" else 1.0 - confidence for pred, confidence in zip(predictions, confidences)]
27
 
 
 
 
 
 
 
 
 
 
 
 
28
  @staticmethod
29
  def _classification_metrics(
30
  predictions: List[str],
@@ -69,7 +82,7 @@ class FraudShieldGrader:
69
  """Easy task emphasizes obvious-case accuracy and false-positive control."""
70
 
71
  if not FraudShieldGrader._validate(predictions, ground_truth, confidences):
72
- return {"score": 0.0, "reason": "Invalid predictions"}
73
 
74
  metrics = FraudShieldGrader._classification_metrics(predictions, ground_truth, confidences)
75
  score = (
@@ -89,7 +102,7 @@ class FraudShieldGrader:
89
  """Medium task rewards balanced classification and calibrated confidence."""
90
 
91
  if not FraudShieldGrader._validate(predictions, ground_truth, confidences):
92
- return {"score": 0.0, "reason": "Invalid predictions"}
93
 
94
  metrics = FraudShieldGrader._classification_metrics(predictions, ground_truth, confidences)
95
  score = (
@@ -109,7 +122,7 @@ class FraudShieldGrader:
109
  """Hard task weights fraud capture, precision, and ranking quality."""
110
 
111
  if not FraudShieldGrader._validate(predictions, ground_truth, confidences):
112
- return {"score": 0.0, "reason": "Invalid predictions"}
113
 
114
  metrics = FraudShieldGrader._classification_metrics(predictions, ground_truth, confidences)
115
  score = (
@@ -182,7 +195,7 @@ class FraudShieldGrader:
182
  ground_truth: List[str],
183
  ) -> Dict[str, Any]:
184
  return {
185
- "score": float(max(0.0, min(1.0, score))),
186
  "task": task_name,
187
  "metrics": metrics,
188
  "num_transactions": len(ground_truth),
 
9
 
10
 
11
  class FraudShieldGrader:
12
+ """Task graders returning scores in the strict range (0.0, 1.0)."""
13
+
14
+ STRICT_SCORE_EPSILON = 1e-4
15
 
16
  @staticmethod
17
  def _validate(predictions: List[str], ground_truth: List[str], confidences: List[float]) -> bool:
 
27
  def _score_confidence(predictions: List[str], confidences: List[float]) -> List[float]:
28
  return [confidence if pred == "fraud" else 1.0 - confidence for pred, confidence in zip(predictions, confidences)]
29
 
30
+ @staticmethod
31
+ def _strict_score(score: float) -> float:
32
+ """Clamp task scores to the open interval required by the submission validator."""
33
+
34
+ return float(
35
+ max(
36
+ FraudShieldGrader.STRICT_SCORE_EPSILON,
37
+ min(1.0 - FraudShieldGrader.STRICT_SCORE_EPSILON, score),
38
+ )
39
+ )
40
+
41
  @staticmethod
42
  def _classification_metrics(
43
  predictions: List[str],
 
82
  """Easy task emphasizes obvious-case accuracy and false-positive control."""
83
 
84
  if not FraudShieldGrader._validate(predictions, ground_truth, confidences):
85
+ return {"score": FraudShieldGrader._strict_score(0.0), "reason": "Invalid predictions"}
86
 
87
  metrics = FraudShieldGrader._classification_metrics(predictions, ground_truth, confidences)
88
  score = (
 
102
  """Medium task rewards balanced classification and calibrated confidence."""
103
 
104
  if not FraudShieldGrader._validate(predictions, ground_truth, confidences):
105
+ return {"score": FraudShieldGrader._strict_score(0.0), "reason": "Invalid predictions"}
106
 
107
  metrics = FraudShieldGrader._classification_metrics(predictions, ground_truth, confidences)
108
  score = (
 
122
  """Hard task weights fraud capture, precision, and ranking quality."""
123
 
124
  if not FraudShieldGrader._validate(predictions, ground_truth, confidences):
125
+ return {"score": FraudShieldGrader._strict_score(0.0), "reason": "Invalid predictions"}
126
 
127
  metrics = FraudShieldGrader._classification_metrics(predictions, ground_truth, confidences)
128
  score = (
 
195
  ground_truth: List[str],
196
  ) -> Dict[str, Any]:
197
  return {
198
+ "score": FraudShieldGrader._strict_score(score),
199
  "task": task_name,
200
  "metrics": metrics,
201
  "num_transactions": len(ground_truth),
inference.py CHANGED
@@ -111,7 +111,7 @@ def run_task(
111
  env: FraudShieldEnvironment,
112
  agent: object,
113
  task_name: str,
114
- ) -> Tuple[List[str], List[str], List[float], object]:
115
  """Run one task episode and capture the full prediction trace.
116
 
117
  This function executes a complete episode for a single task difficulty,
@@ -128,6 +128,7 @@ def run_task(
128
  - ground_truth: List[str] of true labels
129
  - confidences: List[float] of confidence values [0.0, 1.0]
130
  - agent: Possibly updated agent if a fallback was needed
 
131
 
132
  Workflow:
133
  1. Call env.reset(task_name) to initialize episode
@@ -201,14 +202,7 @@ def run_task(
201
  accuracy,
202
  env.cumulative_reward,
203
  )
204
- emit_event(
205
- "END",
206
- task=task_name,
207
- score=f"{accuracy:.4f}",
208
- reward=f"{env.cumulative_reward:.4f}",
209
- steps=env.step_count,
210
- )
211
- return predictions, list(env.ground_truth_labels), confidences, agent
212
 
213
 
214
  def main() -> Dict[str, object]:
@@ -264,21 +258,56 @@ def main() -> Dict[str, object]:
264
  getattr(agent, "model_name", get_env("MODEL_NAME", "MODELNAME", default="<offline-heuristic>")),
265
  )
266
 
267
- easy_predictions, easy_ground_truth, easy_confidences, agent = run_task(env, agent, "easy")
268
- medium_predictions, medium_ground_truth, medium_confidences, agent = run_task(env, agent, "medium")
269
- hard_predictions, hard_ground_truth, hard_confidences, agent = run_task(env, agent, "hard")
 
 
 
 
 
 
270
 
271
- grading_result = FraudShieldGrader.grade_all_tasks(
272
- easy_predictions,
273
- easy_ground_truth,
274
- easy_confidences,
275
  medium_predictions,
276
  medium_ground_truth,
277
  medium_confidences,
 
 
 
 
 
 
 
 
 
 
 
278
  hard_predictions,
279
  hard_ground_truth,
280
  hard_confidences,
281
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  grading_result["metadata"] = {
283
  "agent_name": getattr(agent, "name", agent.__class__.__name__),
284
  "api_base_url": getattr(agent, "api_base_url", get_env("API_BASE_URL", "APIBASEURL", default="https://router.huggingface.co/v1")),
 
111
  env: FraudShieldEnvironment,
112
  agent: object,
113
  task_name: str,
114
+ ) -> Tuple[List[str], List[str], List[float], object, float]:
115
  """Run one task episode and capture the full prediction trace.
116
 
117
  This function executes a complete episode for a single task difficulty,
 
128
  - ground_truth: List[str] of true labels
129
  - confidences: List[float] of confidence values [0.0, 1.0]
130
  - agent: Possibly updated agent if a fallback was needed
131
+ - cumulative_reward: Total episode reward for the task
132
 
133
  Workflow:
134
  1. Call env.reset(task_name) to initialize episode
 
202
  accuracy,
203
  env.cumulative_reward,
204
  )
205
+ return predictions, list(env.ground_truth_labels), confidences, agent, env.cumulative_reward
 
 
 
 
 
 
 
206
 
207
 
208
  def main() -> Dict[str, object]:
 
258
  getattr(agent, "model_name", get_env("MODEL_NAME", "MODELNAME", default="<offline-heuristic>")),
259
  )
260
 
261
+ easy_predictions, easy_ground_truth, easy_confidences, agent, easy_reward = run_task(env, agent, "easy")
262
+ easy_result = FraudShieldGrader.grade_easy_task(easy_predictions, easy_ground_truth, easy_confidences)
263
+ emit_event(
264
+ "END",
265
+ task="easy",
266
+ score=f"{easy_result['score']:.4f}",
267
+ reward=f"{easy_reward:.4f}",
268
+ steps=len(easy_ground_truth),
269
+ )
270
 
271
+ medium_predictions, medium_ground_truth, medium_confidences, agent, medium_reward = run_task(env, agent, "medium")
272
+ medium_result = FraudShieldGrader.grade_medium_task(
 
 
273
  medium_predictions,
274
  medium_ground_truth,
275
  medium_confidences,
276
+ )
277
+ emit_event(
278
+ "END",
279
+ task="medium",
280
+ score=f"{medium_result['score']:.4f}",
281
+ reward=f"{medium_reward:.4f}",
282
+ steps=len(medium_ground_truth),
283
+ )
284
+
285
+ hard_predictions, hard_ground_truth, hard_confidences, agent, hard_reward = run_task(env, agent, "hard")
286
+ hard_result = FraudShieldGrader.grade_hard_task(
287
  hard_predictions,
288
  hard_ground_truth,
289
  hard_confidences,
290
  )
291
+ emit_event(
292
+ "END",
293
+ task="hard",
294
+ score=f"{hard_result['score']:.4f}",
295
+ reward=f"{hard_reward:.4f}",
296
+ steps=len(hard_ground_truth),
297
+ )
298
+
299
+ final_score = (easy_result["score"] + medium_result["score"] + hard_result["score"]) / 3.0
300
+ grading_result = {
301
+ "final_score": float(final_score),
302
+ "easy": easy_result,
303
+ "medium": medium_result,
304
+ "hard": hard_result,
305
+ "breakdown": {
306
+ "easy_weight": 1 / 3,
307
+ "medium_weight": 1 / 3,
308
+ "hard_weight": 1 / 3,
309
+ },
310
+ }
311
  grading_result["metadata"] = {
312
  "agent_name": getattr(agent, "name", agent.__class__.__name__),
313
  "api_base_url": getattr(agent, "api_base_url", get_env("API_BASE_URL", "APIBASEURL", default="https://router.huggingface.co/v1")),