Spaces:

lanczos
/

graphtestbed

Running

Zhu Jiajun (jz28583) Claude Opus 4.7 (1M context) committed on Apr 20

Commit

b3dbec5

1 Parent(s): 640f1df

ibm-aml → binary submission for minority F1 (server: no threshold)

Rationale: F1 computed by thresholding predicted probabilities at 0.5 is unfair
when the positive rate is 0.19% — a well-calibrated model gets F1 ≈ 0 because
almost no probabilities exceed 0.5. Switch to binary submission so the agent owns
the threshold (typically tuned on val), which is what the IBM Multi-GNN paper reports.

- manifest: set pred_dtype to binary; drop the auc_roc secondary metric (it is
degenerate on binary input, so reporting it would be misleading)
- submit.py + server/api.py: validate that predictions are ints in {0,1}; score
F1 directly from the raw 0/1 predictions (no server-side threshold)
- agents/common/tasks.py: parameterize pred_type_desc so binary tasks
tell the agent to pick its own threshold

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (5) hide show

.claude/scheduled_tasks.lock +1 -0
agents/common/tasks.py +12 -1
datasets/manifest.yaml +6 -7
graphtestbed/submit.py +13 -1
server/api.py +19 -3

.claude/scheduled_tasks.lock ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"sessionId":"4b8c2006-8200-4510-a2fc-d6fd597095ce","pid":41708,"acquiredAt":1776710497823}

agents/common/tasks.py CHANGED Viewed

@@ -34,7 +34,7 @@ Write a CSV with **exactly two columns**, in this order:
 | column | type | meaning |
 | --- | --- | --- |
 | `{id_col}` | id | matches `test_features.csv[{id_col}]` 100% |
-| `{pred_col}` | float in [0, 1] | predicted score |
 Row count: **{n_rows}**.
@@ -44,6 +44,12 @@ You will be evaluated on `{primary}` (primary). Secondary: {secondary}.
 Optimize for the primary metric.
 """
 def task_instruction(task: str) -> str:
     override = Path(__file__).parent / "tasks_md" / f"{task}.md"
@@ -52,12 +58,17 @@ def task_instruction(task: str) -> str:
     cfg = task_config(task)
     s = cfg["submission_schema"]
     m = cfg["metric"]
     return _TEMPLATE.format(
         task=task,
         description=str(cfg.get("description", "")).strip(),
         id_col=s["id_col"],
         pred_col=s["pred_col"],
         n_rows=s.get("n_rows", "?"),
         primary=m["primary"],
         secondary=", ".join(m.get("secondary", [])) or "(none)",
     )

 | column | type | meaning |
 | --- | --- | --- |
 | `{id_col}` | id | matches `test_features.csv[{id_col}]` 100% |
+| `{pred_col}` | {pred_type_desc} | {pred_meaning} |
 Row count: **{n_rows}**.
 Optimize for the primary metric.
 """
+_DTYPE_DESC = {
+    "float": ("float in [0, 1]", "predicted score (probability)"),
+    "binary": ("0 or 1", "predicted hard class — pick your own threshold "
+                          "(e.g. tune on val to maximize the primary metric)"),
+}
 def task_instruction(task: str) -> str:
     override = Path(__file__).parent / "tasks_md" / f"{task}.md"
     cfg = task_config(task)
     s = cfg["submission_schema"]
     m = cfg["metric"]
+    type_desc, meaning = _DTYPE_DESC.get(
+        s.get("pred_dtype", "float"), _DTYPE_DESC["float"]
+    )
     return _TEMPLATE.format(
         task=task,
         description=str(cfg.get("description", "")).strip(),
         id_col=s["id_col"],
         pred_col=s["pred_col"],
         n_rows=s.get("n_rows", "?"),
+        pred_type_desc=type_desc,
+        pred_meaning=meaning,
         primary=m["primary"],
         secondary=", ".join(m.get("secondary", [])) or "(none)",
     )

datasets/manifest.yaml CHANGED Viewed

@@ -145,21 +145,20 @@ ibm-aml:
     id_col: transaction_id
     pred_col: is_laundering
     n_rows: 863900
-    pred_dtype: float
   metric:
     primary: f1
     secondary:
     - auc_pr
-    - auc_roc
   description: 'Predict whether each transaction is part of a money-laundering pattern.
     Source: IBM Transactions for AML (ealtman2019/ibm-transactions-for-anti-money-laundering-aml
     on Kaggle), HI-Small_Trans.csv variant (~5M total rows). Split: per IBM Multi-GNN
     convention (github.com/IBM/Multi-GNN), sort by Timestamp, partition by day to
     ~[0.6, 0.2, 0.2]. transaction_id = row index after the global sort. Test rows:
-    863,900 (~0.19% positive — heavy class imbalance, hence AUC-PR primary).
-    Metric: F1 on the minority (laundering) class as primary — the dataset is ~0.19%
-    positive, so a model that predicts all-zeros gets a trivially high accuracy but
-    useless F1. AUC-PR and AUC-ROC reported as secondary; AUC-PR is the IBM Multi-GNN
-    paper baseline metric and stays useful for reference.'

     id_col: transaction_id
     pred_col: is_laundering
     n_rows: 863900
+    pred_dtype: binary
   metric:
     primary: f1
     secondary:
     - auc_pr
   description: 'Predict whether each transaction is part of a money-laundering pattern.
     Source: IBM Transactions for AML (ealtman2019/ibm-transactions-for-anti-money-laundering-aml
     on Kaggle), HI-Small_Trans.csv variant (~5M total rows). Split: per IBM Multi-GNN
     convention (github.com/IBM/Multi-GNN), sort by Timestamp, partition by day to
     ~[0.6, 0.2, 0.2]. transaction_id = row index after the global sort. Test rows:
+    863,900 (~0.19% positive — heavy class imbalance).
+    Metric: F1 on the minority (laundering) class as primary. Submission must be
+    binary 0/1 (you pick the threshold yourself — typically by maximizing F1 on val).
+    AUC-PR (computed from your binary submission, so degenerates to a single point)
+    is reported as secondary for reference vs the IBM Multi-GNN paper baseline.'

graphtestbed/submit.py CHANGED Viewed

@@ -52,13 +52,25 @@ def validate_submission(task: str, csv_path: Path) -> dict:
     if df[s["id_col"]].duplicated().any():
         raise SystemExit(f"Duplicate IDs in {s['id_col']} column")
-    if s.get("pred_dtype") == "float":
         try:
             preds = df[s["pred_col"]].astype(float)
         except (TypeError, ValueError) as e:
             raise SystemExit(f"pred_col not float-castable: {e}")
         if (preds < 0).any() or (preds > 1).any():
             raise SystemExit("predictions must lie in [0, 1]")
     return {
         "n_rows": len(df),

     if df[s["id_col"]].duplicated().any():
         raise SystemExit(f"Duplicate IDs in {s['id_col']} column")
+    dtype = s.get("pred_dtype")
+    if dtype == "float":
         try:
             preds = df[s["pred_col"]].astype(float)
         except (TypeError, ValueError) as e:
             raise SystemExit(f"pred_col not float-castable: {e}")
         if (preds < 0).any() or (preds > 1).any():
             raise SystemExit("predictions must lie in [0, 1]")
+    elif dtype == "binary":
+        try:
+            preds = df[s["pred_col"]].astype(int)
+        except (TypeError, ValueError) as e:
+            raise SystemExit(f"pred_col not int-castable: {e}")
+        bad = ~preds.isin([0, 1])
+        if bad.any():
+            raise SystemExit(
+                f"binary submission must contain only 0 or 1; "
+                f"got {int(bad.sum())} other values"
+            )
     return {
         "n_rows": len(df),

server/api.py CHANGED Viewed

@@ -110,8 +110,12 @@ def _score(task: str, sub_df: pd.DataFrame, cfg: dict) -> dict:
         )
     y_true = merged["Label"].astype(int)
-    y_score = merged["_pred"].astype(float)
-    y_pred = (y_score >= 0.5).astype(int)
     funcs = {
         "auc_roc": lambda: roc_auc_score(y_true, y_score),
         "auc_pr": lambda: average_precision_score(y_true, y_score),
@@ -207,13 +211,25 @@ def _validate_schema(sub_df: pd.DataFrame, cfg: dict) -> None:
         )
     if sub_df[s["id_col"]].duplicated().any():
         raise ValueError(f"duplicate IDs in {s['id_col']}")
-    if s.get("pred_dtype") == "float":
         try:
             preds = sub_df[s["pred_col"]].astype(float)
         except (TypeError, ValueError) as e:
             raise ValueError(f"pred_col not float-castable: {e}")
         if (preds < 0).any() or (preds > 1).any():
             raise ValueError("predictions must lie in [0, 1]")
 @app.post("/submit")

         )
     y_true = merged["Label"].astype(int)
+    if schema.get("pred_dtype") == "binary":
+        y_pred = merged["_pred"].astype(int)
+        y_score = y_pred.astype(float)
+    else:
+        y_score = merged["_pred"].astype(float)
+        y_pred = (y_score >= 0.5).astype(int)
     funcs = {
         "auc_roc": lambda: roc_auc_score(y_true, y_score),
         "auc_pr": lambda: average_precision_score(y_true, y_score),
         )
     if sub_df[s["id_col"]].duplicated().any():
         raise ValueError(f"duplicate IDs in {s['id_col']}")
+    dtype = s.get("pred_dtype")
+    if dtype == "float":
         try:
             preds = sub_df[s["pred_col"]].astype(float)
         except (TypeError, ValueError) as e:
             raise ValueError(f"pred_col not float-castable: {e}")
         if (preds < 0).any() or (preds > 1).any():
             raise ValueError("predictions must lie in [0, 1]")
+    elif dtype == "binary":
+        try:
+            preds = sub_df[s["pred_col"]].astype(int)
+        except (TypeError, ValueError) as e:
+            raise ValueError(f"pred_col not int-castable: {e}")
+        bad = ~preds.isin([0, 1])
+        if bad.any():
+            raise ValueError(
+                f"binary submission must contain only 0 or 1; "
+                f"got {int(bad.sum())} other values"
+            )
 @app.post("/submit")