Spaces:

echoboi
/

discovery-env

Sleeping

echoboi Claude Sonnet 4.6 committed on Mar 3

Commit

5931b5f

1 Parent(s): 86a79eb

Add cell_errors, error_regions, common_error_patterns to submit_rule response

Tracks the worst-performing test state during evaluation and computes from it:
- cell_errors: up to 10 wrong cells, each with its 8-neighbor context
- error_regions: per-quadrant error counts, total error count, and worst-state accuracy
- common_error_patterns: the 5 most frequent (predicted, expected) pairs with their counts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show

app.py +54 -1

app.py CHANGED Viewed

@@ -14,6 +14,7 @@ import random
 import threading
 import time
 import uuid
 from typing import Any
 import numpy as np
@@ -269,18 +270,67 @@ def submit_rule(session_id: str, body: SubmitRequest, _: None = Depends(_auth)):
     test_states = _generate_test_states(env, n=500, seed=eval_seed)
     total_cell_acc = 0.0
     exact_matches = 0
     for state in test_states:
         expected = env.get_true_next(state)
         try:
             predicted = fn(state.copy())
             if isinstance(predicted, np.ndarray) and predicted.shape == expected.shape:
                 # Partial credit: fraction of cells predicted correctly
-                total_cell_acc += float(np.mean(predicted == expected))
                 if np.array_equal(predicted, expected):
                     exact_matches += 1
         except Exception:
             pass
     # functional_accuracy = mean cell-level accuracy across all test states
     accuracy = total_cell_acc / len(test_states)
@@ -326,6 +376,9 @@ def submit_rule(session_id: str, body: SubmitRequest, _: None = Depends(_auth)):
         "agent_dl": agent_dl,
         "reference_dl": ref_dl,
         "delta_dl": delta_dl,
     })

 import threading
 import time
 import uuid
+from collections import Counter
 from typing import Any
 import numpy as np
     test_states = _generate_test_states(env, n=500, seed=eval_seed)
     total_cell_acc = 0.0
     exact_matches = 0
+    worst_state_acc = 1.0
+    worst_pred = None
+    worst_exp = None
+    worst_inp = None
     for state in test_states:
         expected = env.get_true_next(state)
         try:
             predicted = fn(state.copy())
             if isinstance(predicted, np.ndarray) and predicted.shape == expected.shape:
                 # Partial credit: fraction of cells predicted correctly
+                state_acc = float(np.mean(predicted == expected))
+                total_cell_acc += state_acc
+                if state_acc < worst_state_acc:
+                    worst_state_acc = state_acc
+                    worst_pred = predicted
+                    worst_exp = expected
+                    worst_inp = state
                 if np.array_equal(predicted, expected):
                     exact_matches += 1
         except Exception:
             pass
+    # Compute cell-level diagnostics from worst-performing state
+    cell_errors = []
+    error_regions = {}
+    common_error_patterns = []
+    if worst_pred is not None:
+        rows, cols = worst_pred.shape
+        wrong_mask = worst_pred != worst_exp
+        wrong_indices = np.argwhere(wrong_mask)
+        for idx in wrong_indices[:10]:
+            r, c = int(idx[0]), int(idx[1])
+            cell_errors.append({
+                "pos": [r, c],
+                "center": int(worst_inp[r, c]),
+                "N":  int(worst_inp[(r - 1) % rows, c]),
+                "S":  int(worst_inp[(r + 1) % rows, c]),
+                "E":  int(worst_inp[r, (c + 1) % cols]),
+                "W":  int(worst_inp[r, (c - 1) % cols]),
+                "NW": int(worst_inp[(r - 1) % rows, (c - 1) % cols]),
+                "NE": int(worst_inp[(r - 1) % rows, (c + 1) % cols]),
+                "SW": int(worst_inp[(r + 1) % rows, (c - 1) % cols]),
+                "SE": int(worst_inp[(r + 1) % rows, (c + 1) % cols]),
+                "predicted": int(worst_pred[r, c]),
+                "expected":  int(worst_exp[r, c]),
+            })
+        mid_r, mid_c = rows // 2, cols // 2
+        error_regions = {
+            "top_left":     int(wrong_mask[:mid_r, :mid_c].sum()),
+            "top_right":    int(wrong_mask[:mid_r, mid_c:].sum()),
+            "bottom_left":  int(wrong_mask[mid_r:, :mid_c].sum()),
+            "bottom_right": int(wrong_mask[mid_r:, mid_c:].sum()),
+            "total":        int(wrong_mask.sum()),
+            "worst_state_accuracy": round(worst_state_acc, 4),
+        }
+        wrong_pairs = list(zip(worst_pred[wrong_mask].tolist(), worst_exp[wrong_mask].tolist()))
+        common_error_patterns = [
+            {"predicted": p, "expected": e, "count": c}
+            for (p, e), c in Counter(wrong_pairs).most_common(5)
+        ]
     # functional_accuracy = mean cell-level accuracy across all test states
     accuracy = total_cell_acc / len(test_states)
         "agent_dl": agent_dl,
         "reference_dl": ref_dl,
         "delta_dl": delta_dl,
+        "cell_errors": cell_errors,
+        "error_regions": error_regions,
+        "common_error_patterns": common_error_patterns,
     })