balloonmann committed on
Commit
a16cc4e
·
1 Parent(s): b4d5e6a

fix: global invariant enforcement 0<score<1 on api pit stop and robust tests

Browse files
financial_audit_env/server/__pycache__/graders.cpython-313.pyc CHANGED
Binary files a/financial_audit_env/server/__pycache__/graders.cpython-313.pyc and b/financial_audit_env/server/__pycache__/graders.cpython-313.pyc differ
 
inference.py CHANGED
@@ -266,7 +266,10 @@ def run_agent_single_task(
266
  grader_resp.raise_for_status()
267
  grader_data = grader_resp.json()
268
 
269
- score = grader_data.get("score", 0.01)
 
 
 
270
  success = score >= SUCCESS_SCORE_THRESHOLD
271
 
272
  result = {
@@ -274,8 +277,8 @@ def run_agent_single_task(
274
  "task_name": task_info["name"],
275
  "difficulty": task_info["difficulty"],
276
  "score": score,
277
- "precision": grader_data.get("precision", 0.0),
278
- "recall": grader_data.get("recall", 0.0),
279
  }
280
 
281
  logger.info(f"[{task_id}] Score: {result['score']:.4f} (P={result['precision']:.2f}, R={result['recall']:.2f})")
 
266
  grader_resp.raise_for_status()
267
  grader_data = grader_resp.json()
268
 
269
+ def final_clamp(val: float) -> float:
270
+ return 0.01 if val <= 0.0 else (0.99 if val >= 1.0 else val)
271
+
272
+ score = final_clamp(grader_data.get("score", 0.01))
273
  success = score >= SUCCESS_SCORE_THRESHOLD
274
 
275
  result = {
 
277
  "task_name": task_info["name"],
278
  "difficulty": task_info["difficulty"],
279
  "score": score,
280
+ "precision": final_clamp(grader_data.get("precision", 0.01)),
281
+ "recall": final_clamp(grader_data.get("recall", 0.01)),
282
  }
283
 
284
  logger.info(f"[{task_id}] Score: {result['score']:.4f} (P={result['precision']:.2f}, R={result['recall']:.2f})")
remotes.txt ADDED
Binary file (884 Bytes). View file
 
remotes_utf8.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ hf https://huggingface.co/spaces/balloonmann/financial-audit-env (fetch)
2
+ hf https://huggingface.co/spaces/balloonmann/financial-audit-env (push)
3
+ origin https://github.com/balloonmann/financial-audit-env.git (fetch)
4
+ origin https://github.com/balloonmann/financial-audit-env.git (push)
5
+ space https://huggingface.co/spaces/balloonmann/financial_audit_env (fetch)
6
+ space https://huggingface.co/spaces/balloonmann/financial_audit_env (push)
server/app.py CHANGED
@@ -304,20 +304,24 @@ async def get_grader_score(session_id: Optional[str] = None):
304
  "message": "No episode completed. Call /reset then /step with submit_final=True.",
305
  }
306
 
 
 
 
 
307
  return {
308
  "status": "completed",
309
  "task_id": env.state.task_id,
310
- # Primary score (backwards compatible)
311
- "score": result["score"],
312
- "precision": result["precision"],
313
- "recall": result["recall"],
314
  "true_positives": result["true_positives"],
315
  "false_positives": result["false_positives"],
316
  "false_negatives": result["false_negatives"],
317
  "total_errors": result["total_errors"],
318
- # Enhanced scoring
319
- "weighted_score": result.get("weighted_score", result["score"]),
320
- "partial_credit_score": result.get("partial_credit_score", result["score"]),
321
  "partial_matches": result.get("partial_matches", 0),
322
  # Confusion matrix
323
  "confusion_matrix": result.get("confusion_matrix", {}),
 
304
  "message": "No episode completed. Call /reset then /step with submit_final=True.",
305
  }
306
 
307
+ def final_clamp(val: float) -> float:
308
+ """Ultimate pit stop: guarantees NO score is ever less than or equal to 0, or greater than or equal to 1."""
309
+ return 0.01 if val <= 0.0 else (0.99 if val >= 1.0 else val)
310
+
311
  return {
312
  "status": "completed",
313
  "task_id": env.state.task_id,
314
+ # Primary score (final pit stop applied)
315
+ "score": final_clamp(result["score"]),
316
+ "precision": final_clamp(result["precision"]),
317
+ "recall": final_clamp(result["recall"]),
318
  "true_positives": result["true_positives"],
319
  "false_positives": result["false_positives"],
320
  "false_negatives": result["false_negatives"],
321
  "total_errors": result["total_errors"],
322
+ # Enhanced scoring with final clamp
323
+ "weighted_score": final_clamp(result.get("weighted_score", result["score"])),
324
+ "partial_credit_score": final_clamp(result.get("partial_credit_score", result["score"])),
325
  "partial_matches": result.get("partial_matches", 0),
326
  # Confusion matrix
327
  "confusion_matrix": result.get("confusion_matrix", {}),