Spaces:

balloonmann
/

financial_audit_env

Running

App Files Files Community

balloonmann committed on Apr 7

Commit

a16cc4e

1 Parent(s): b4d5e6a

fix: global invariant enforcement 0<score<1 on api pit stop and robust tests

Browse files

Files changed (5) hide show

financial_audit_env/server/__pycache__/graders.cpython-313.pyc +0 -0
inference.py +6 -3
remotes.txt +0 -0
remotes_utf8.txt +6 -0
server/app.py +11 -7

financial_audit_env/server/__pycache__/graders.cpython-313.pyc CHANGED Viewed

Binary files a/financial_audit_env/server/__pycache__/graders.cpython-313.pyc and b/financial_audit_env/server/__pycache__/graders.cpython-313.pyc differ

inference.py CHANGED Viewed

@@ -266,7 +266,10 @@ def run_agent_single_task(
         grader_resp.raise_for_status()
         grader_data = grader_resp.json()
-        score = grader_data.get("score", 0.01)
         success = score >= SUCCESS_SCORE_THRESHOLD
         result = {
@@ -274,8 +277,8 @@ def run_agent_single_task(
             "task_name": task_info["name"],
             "difficulty": task_info["difficulty"],
             "score": score,
-            "precision": grader_data.get("precision", 0.0),
-            "recall": grader_data.get("recall", 0.0),
         }
         logger.info(f"[{task_id}] Score: {result['score']:.4f} (P={result['precision']:.2f}, R={result['recall']:.2f})")

         grader_resp.raise_for_status()
         grader_data = grader_resp.json()
+        def final_clamp(val: float) -> float:
+            return 0.01 if val <= 0.0 else (0.99 if val >= 1.0 else val)
+        score = final_clamp(grader_data.get("score", 0.01))
         success = score >= SUCCESS_SCORE_THRESHOLD
         result = {
             "task_name": task_info["name"],
             "difficulty": task_info["difficulty"],
             "score": score,
+            "precision": final_clamp(grader_data.get("precision", 0.01)),
+            "recall": final_clamp(grader_data.get("recall", 0.01)),
         }
         logger.info(f"[{task_id}] Score: {result['score']:.4f} (P={result['precision']:.2f}, R={result['recall']:.2f})")

remotes.txt ADDED Viewed

Binary file (884 Bytes). View file

remotes_utf8.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+hf	https://huggingface.co/spaces/balloonmann/financial-audit-env (fetch)
+hf	https://huggingface.co/spaces/balloonmann/financial-audit-env (push)
+origin	https://github.com/balloonmann/financial-audit-env.git (fetch)
+origin	https://github.com/balloonmann/financial-audit-env.git (push)
+space	https://huggingface.co/spaces/balloonmann/financial_audit_env (fetch)
+space	https://huggingface.co/spaces/balloonmann/financial_audit_env (push)

server/app.py CHANGED Viewed

@@ -304,20 +304,24 @@ async def get_grader_score(session_id: Optional[str] = None):
             "message": "No episode completed. Call /reset then /step with submit_final=True.",
         }
     return {
         "status": "completed",
         "task_id": env.state.task_id,
-        # Primary score (backwards compatible)
-        "score": result["score"],
-        "precision": result["precision"],
-        "recall": result["recall"],
         "true_positives": result["true_positives"],
         "false_positives": result["false_positives"],
         "false_negatives": result["false_negatives"],
         "total_errors": result["total_errors"],
-        # Enhanced scoring
-        "weighted_score": result.get("weighted_score", result["score"]),
-        "partial_credit_score": result.get("partial_credit_score", result["score"]),
         "partial_matches": result.get("partial_matches", 0),
         # Confusion matrix
         "confusion_matrix": result.get("confusion_matrix", {}),

             "message": "No episode completed. Call /reset then /step with submit_final=True.",
         }
+    def final_clamp(val: float) -> float:
+        """Ultimate pit stop: guarantees NO score is ever less than or equal to 0, or greater than or equal to 1."""
+        return 0.01 if val <= 0.0 else (0.99 if val >= 1.0 else val)
     return {
         "status": "completed",
         "task_id": env.state.task_id,
+        # Primary score (final pit stop applied)
+        "score": final_clamp(result["score"]),
+        "precision": final_clamp(result["precision"]),
+        "recall": final_clamp(result["recall"]),
         "true_positives": result["true_positives"],
         "false_positives": result["false_positives"],
         "false_negatives": result["false_negatives"],
         "total_errors": result["total_errors"],
+        # Enhanced scoring with final clamp
+        "weighted_score": final_clamp(result.get("weighted_score", result["score"])),
+        "partial_credit_score": final_clamp(result.get("partial_credit_score", result["score"])),
         "partial_matches": result.get("partial_matches", 0),
         # Confusion matrix
         "confusion_matrix": result.get("confusion_matrix", {}),