Spaces:
Running
Running
Commit ·
a16cc4e
1
Parent(s): b4d5e6a
fix: global invariant enforcement 0<score<1 on api pit stop and robust tests
Browse files
- financial_audit_env/server/__pycache__/graders.cpython-313.pyc +0 -0
- inference.py +6 -3
- remotes.txt +0 -0
- remotes_utf8.txt +6 -0
- server/app.py +11 -7
financial_audit_env/server/__pycache__/graders.cpython-313.pyc
CHANGED
|
Binary files a/financial_audit_env/server/__pycache__/graders.cpython-313.pyc and b/financial_audit_env/server/__pycache__/graders.cpython-313.pyc differ
|
|
|
inference.py
CHANGED
|
@@ -266,7 +266,10 @@ def run_agent_single_task(
|
|
| 266 |
grader_resp.raise_for_status()
|
| 267 |
grader_data = grader_resp.json()
|
| 268 |
|
| 269 |
-
|
|
|
|
|
|
|
|
|
|
| 270 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 271 |
|
| 272 |
result = {
|
|
@@ -274,8 +277,8 @@ def run_agent_single_task(
|
|
| 274 |
"task_name": task_info["name"],
|
| 275 |
"difficulty": task_info["difficulty"],
|
| 276 |
"score": score,
|
| 277 |
-
"precision": grader_data.get("precision", 0.0),
|
| 278 |
-
"recall": grader_data.get("recall", 0.0),
|
| 279 |
}
|
| 280 |
|
| 281 |
logger.info(f"[{task_id}] Score: {result['score']:.4f} (P={result['precision']:.2f}, R={result['recall']:.2f})")
|
|
|
|
| 266 |
grader_resp.raise_for_status()
|
| 267 |
grader_data = grader_resp.json()
|
| 268 |
|
| 269 |
+
def final_clamp(val: float) -> float:
|
| 270 |
+
return 0.01 if val <= 0.0 else (0.99 if val >= 1.0 else val)
|
| 271 |
+
|
| 272 |
+
score = final_clamp(grader_data.get("score", 0.01))
|
| 273 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 274 |
|
| 275 |
result = {
|
|
|
|
| 277 |
"task_name": task_info["name"],
|
| 278 |
"difficulty": task_info["difficulty"],
|
| 279 |
"score": score,
|
| 280 |
+
"precision": final_clamp(grader_data.get("precision", 0.01)),
|
| 281 |
+
"recall": final_clamp(grader_data.get("recall", 0.01)),
|
| 282 |
}
|
| 283 |
|
| 284 |
logger.info(f"[{task_id}] Score: {result['score']:.4f} (P={result['precision']:.2f}, R={result['recall']:.2f})")
|
remotes.txt
ADDED
|
Binary file (884 Bytes). View file
|
|
|
remotes_utf8.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hf https://huggingface.co/spaces/balloonmann/financial-audit-env (fetch)
|
| 2 |
+
hf https://huggingface.co/spaces/balloonmann/financial-audit-env (push)
|
| 3 |
+
origin https://github.com/balloonmann/financial-audit-env.git (fetch)
|
| 4 |
+
origin https://github.com/balloonmann/financial-audit-env.git (push)
|
| 5 |
+
space https://huggingface.co/spaces/balloonmann/financial_audit_env (fetch)
|
| 6 |
+
space https://huggingface.co/spaces/balloonmann/financial_audit_env (push)
|
server/app.py
CHANGED
|
@@ -304,20 +304,24 @@ async def get_grader_score(session_id: Optional[str] = None):
|
|
| 304 |
"message": "No episode completed. Call /reset then /step with submit_final=True.",
|
| 305 |
}
|
| 306 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
return {
|
| 308 |
"status": "completed",
|
| 309 |
"task_id": env.state.task_id,
|
| 310 |
-
# Primary score (
|
| 311 |
-
"score": result["score"],
|
| 312 |
-
"precision": result["precision"],
|
| 313 |
-
"recall": result["recall"],
|
| 314 |
"true_positives": result["true_positives"],
|
| 315 |
"false_positives": result["false_positives"],
|
| 316 |
"false_negatives": result["false_negatives"],
|
| 317 |
"total_errors": result["total_errors"],
|
| 318 |
-
# Enhanced scoring
|
| 319 |
-
"weighted_score": result.get("weighted_score", result["score"]),
|
| 320 |
-
"partial_credit_score": result.get("partial_credit_score", result["score"]),
|
| 321 |
"partial_matches": result.get("partial_matches", 0),
|
| 322 |
# Confusion matrix
|
| 323 |
"confusion_matrix": result.get("confusion_matrix", {}),
|
|
|
|
| 304 |
"message": "No episode completed. Call /reset then /step with submit_final=True.",
|
| 305 |
}
|
| 306 |
|
| 307 |
+
def final_clamp(val: float) -> float:
|
| 308 |
+
"""Ultimate pit stop: guarantees NO score is ever less than or equal to 0, or greater than or equal to 1."""
|
| 309 |
+
return 0.01 if val <= 0.0 else (0.99 if val >= 1.0 else val)
|
| 310 |
+
|
| 311 |
return {
|
| 312 |
"status": "completed",
|
| 313 |
"task_id": env.state.task_id,
|
| 314 |
+
# Primary score (final pit stop applied)
|
| 315 |
+
"score": final_clamp(result["score"]),
|
| 316 |
+
"precision": final_clamp(result["precision"]),
|
| 317 |
+
"recall": final_clamp(result["recall"]),
|
| 318 |
"true_positives": result["true_positives"],
|
| 319 |
"false_positives": result["false_positives"],
|
| 320 |
"false_negatives": result["false_negatives"],
|
| 321 |
"total_errors": result["total_errors"],
|
| 322 |
+
# Enhanced scoring with final clamp
|
| 323 |
+
"weighted_score": final_clamp(result.get("weighted_score", result["score"])),
|
| 324 |
+
"partial_credit_score": final_clamp(result.get("partial_credit_score", result["score"])),
|
| 325 |
"partial_matches": result.get("partial_matches", 0),
|
| 326 |
# Confusion matrix
|
| 327 |
"confusion_matrix": result.get("confusion_matrix", {}),
|