Spaces:
Running
Running
Zhu Jiajun (jz28583) Claude Opus 4.7 (1M context) committed on
Commit ·
b3dbec5
1
Parent(s): 640f1df
ibm-aml → binary submission for minority F1 (server: no threshold)
Browse files
Rationale: F1 with prob+0.5 threshold is unfair when positive rate is 0.19% —
a well-calibrated model gets F1 ≈ 0 because almost no probs exceed 0.5.
Switch to binary submission so the agent owns the threshold (typically tuned
on val), which is what the IBM Multi-GNN paper reports.
- manifest: pred_dtype binary, drop auc_roc secondary (it's degenerate
on binary input — would just be weird)
- submit.py + server/api.py: validate ints in {0,1}; score F1 directly from
the raw 0/1 predictions (no server-side 0.5 threshold)
- agents/common/tasks.py: parameterize pred_type_desc so binary tasks
tell the agent to pick its own threshold
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- .claude/scheduled_tasks.lock +1 -0
- agents/common/tasks.py +12 -1
- datasets/manifest.yaml +6 -7
- graphtestbed/submit.py +13 -1
- server/api.py +19 -3
.claude/scheduled_tasks.lock
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sessionId":"4b8c2006-8200-4510-a2fc-d6fd597095ce","pid":41708,"acquiredAt":1776710497823}
|
agents/common/tasks.py
CHANGED
|
@@ -34,7 +34,7 @@ Write a CSV with **exactly two columns**, in this order:
|
|
| 34 |
| column | type | meaning |
|
| 35 |
| --- | --- | --- |
|
| 36 |
| `{id_col}` | id | matches `test_features.csv[{id_col}]` 100% |
|
| 37 |
-
| `{pred_col}` |
|
| 38 |
|
| 39 |
Row count: **{n_rows}**.
|
| 40 |
|
|
@@ -44,6 +44,12 @@ You will be evaluated on `{primary}` (primary). Secondary: {secondary}.
|
|
| 44 |
Optimize for the primary metric.
|
| 45 |
"""
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
def task_instruction(task: str) -> str:
|
| 49 |
override = Path(__file__).parent / "tasks_md" / f"{task}.md"
|
|
@@ -52,12 +58,17 @@ def task_instruction(task: str) -> str:
|
|
| 52 |
cfg = task_config(task)
|
| 53 |
s = cfg["submission_schema"]
|
| 54 |
m = cfg["metric"]
|
|
|
|
|
|
|
|
|
|
| 55 |
return _TEMPLATE.format(
|
| 56 |
task=task,
|
| 57 |
description=str(cfg.get("description", "")).strip(),
|
| 58 |
id_col=s["id_col"],
|
| 59 |
pred_col=s["pred_col"],
|
| 60 |
n_rows=s.get("n_rows", "?"),
|
|
|
|
|
|
|
| 61 |
primary=m["primary"],
|
| 62 |
secondary=", ".join(m.get("secondary", [])) or "(none)",
|
| 63 |
)
|
|
|
|
| 34 |
| column | type | meaning |
|
| 35 |
| --- | --- | --- |
|
| 36 |
| `{id_col}` | id | matches `test_features.csv[{id_col}]` 100% |
|
| 37 |
+
| `{pred_col}` | {pred_type_desc} | {pred_meaning} |
|
| 38 |
|
| 39 |
Row count: **{n_rows}**.
|
| 40 |
|
|
|
|
| 44 |
Optimize for the primary metric.
|
| 45 |
"""
|
| 46 |
|
| 47 |
+
_DTYPE_DESC = {
|
| 48 |
+
"float": ("float in [0, 1]", "predicted score (probability)"),
|
| 49 |
+
"binary": ("0 or 1", "predicted hard class — pick your own threshold "
|
| 50 |
+
"(e.g. tune on val to maximize the primary metric)"),
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
|
| 54 |
def task_instruction(task: str) -> str:
|
| 55 |
override = Path(__file__).parent / "tasks_md" / f"{task}.md"
|
|
|
|
| 58 |
cfg = task_config(task)
|
| 59 |
s = cfg["submission_schema"]
|
| 60 |
m = cfg["metric"]
|
| 61 |
+
type_desc, meaning = _DTYPE_DESC.get(
|
| 62 |
+
s.get("pred_dtype", "float"), _DTYPE_DESC["float"]
|
| 63 |
+
)
|
| 64 |
return _TEMPLATE.format(
|
| 65 |
task=task,
|
| 66 |
description=str(cfg.get("description", "")).strip(),
|
| 67 |
id_col=s["id_col"],
|
| 68 |
pred_col=s["pred_col"],
|
| 69 |
n_rows=s.get("n_rows", "?"),
|
| 70 |
+
pred_type_desc=type_desc,
|
| 71 |
+
pred_meaning=meaning,
|
| 72 |
primary=m["primary"],
|
| 73 |
secondary=", ".join(m.get("secondary", [])) or "(none)",
|
| 74 |
)
|
datasets/manifest.yaml
CHANGED
|
@@ -145,21 +145,20 @@ ibm-aml:
|
|
| 145 |
id_col: transaction_id
|
| 146 |
pred_col: is_laundering
|
| 147 |
n_rows: 863900
|
| 148 |
-
pred_dtype:
|
| 149 |
metric:
|
| 150 |
primary: f1
|
| 151 |
secondary:
|
| 152 |
- auc_pr
|
| 153 |
-
- auc_roc
|
| 154 |
description: 'Predict whether each transaction is part of a money-laundering pattern.
|
| 155 |
Source: IBM Transactions for AML (ealtman2019/ibm-transactions-for-anti-money-laundering-aml
|
| 156 |
on Kaggle), HI-Small_Trans.csv variant (~5M total rows). Split: per IBM Multi-GNN
|
| 157 |
convention (github.com/IBM/Multi-GNN), sort by Timestamp, partition by day to
|
| 158 |
~[0.6, 0.2, 0.2]. transaction_id = row index after the global sort. Test rows:
|
| 159 |
-
863,900 (~0.19% positive — heavy class imbalance
|
| 160 |
|
| 161 |
|
| 162 |
-
Metric: F1 on the minority (laundering) class as primary
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
|
|
|
| 145 |
id_col: transaction_id
|
| 146 |
pred_col: is_laundering
|
| 147 |
n_rows: 863900
|
| 148 |
+
pred_dtype: binary
|
| 149 |
metric:
|
| 150 |
primary: f1
|
| 151 |
secondary:
|
| 152 |
- auc_pr
|
|
|
|
| 153 |
description: 'Predict whether each transaction is part of a money-laundering pattern.
|
| 154 |
Source: IBM Transactions for AML (ealtman2019/ibm-transactions-for-anti-money-laundering-aml
|
| 155 |
on Kaggle), HI-Small_Trans.csv variant (~5M total rows). Split: per IBM Multi-GNN
|
| 156 |
convention (github.com/IBM/Multi-GNN), sort by Timestamp, partition by day to
|
| 157 |
~[0.6, 0.2, 0.2]. transaction_id = row index after the global sort. Test rows:
|
| 158 |
+
863,900 (~0.19% positive — heavy class imbalance).
|
| 159 |
|
| 160 |
|
| 161 |
+
Metric: F1 on the minority (laundering) class as primary. Submission must be
|
| 162 |
+
binary 0/1 (you pick the threshold yourself — typically by maximizing F1 on val).
|
| 163 |
+
AUC-PR (computed from your binary submission, so degenerates to a single point)
|
| 164 |
+
is reported as secondary for reference vs the IBM Multi-GNN paper baseline.'
|
graphtestbed/submit.py
CHANGED
|
@@ -52,13 +52,25 @@ def validate_submission(task: str, csv_path: Path) -> dict:
|
|
| 52 |
if df[s["id_col"]].duplicated().any():
|
| 53 |
raise SystemExit(f"Duplicate IDs in {s['id_col']} column")
|
| 54 |
|
| 55 |
-
|
|
|
|
| 56 |
try:
|
| 57 |
preds = df[s["pred_col"]].astype(float)
|
| 58 |
except (TypeError, ValueError) as e:
|
| 59 |
raise SystemExit(f"pred_col not float-castable: {e}")
|
| 60 |
if (preds < 0).any() or (preds > 1).any():
|
| 61 |
raise SystemExit("predictions must lie in [0, 1]")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
return {
|
| 64 |
"n_rows": len(df),
|
|
|
|
| 52 |
if df[s["id_col"]].duplicated().any():
|
| 53 |
raise SystemExit(f"Duplicate IDs in {s['id_col']} column")
|
| 54 |
|
| 55 |
+
dtype = s.get("pred_dtype")
|
| 56 |
+
if dtype == "float":
|
| 57 |
try:
|
| 58 |
preds = df[s["pred_col"]].astype(float)
|
| 59 |
except (TypeError, ValueError) as e:
|
| 60 |
raise SystemExit(f"pred_col not float-castable: {e}")
|
| 61 |
if (preds < 0).any() or (preds > 1).any():
|
| 62 |
raise SystemExit("predictions must lie in [0, 1]")
|
| 63 |
+
elif dtype == "binary":
|
| 64 |
+
try:
|
| 65 |
+
preds = df[s["pred_col"]].astype(int)
|
| 66 |
+
except (TypeError, ValueError) as e:
|
| 67 |
+
raise SystemExit(f"pred_col not int-castable: {e}")
|
| 68 |
+
bad = ~preds.isin([0, 1])
|
| 69 |
+
if bad.any():
|
| 70 |
+
raise SystemExit(
|
| 71 |
+
f"binary submission must contain only 0 or 1; "
|
| 72 |
+
f"got {int(bad.sum())} other values"
|
| 73 |
+
)
|
| 74 |
|
| 75 |
return {
|
| 76 |
"n_rows": len(df),
|
server/api.py
CHANGED
|
@@ -110,8 +110,12 @@ def _score(task: str, sub_df: pd.DataFrame, cfg: dict) -> dict:
|
|
| 110 |
)
|
| 111 |
|
| 112 |
y_true = merged["Label"].astype(int)
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
funcs = {
|
| 116 |
"auc_roc": lambda: roc_auc_score(y_true, y_score),
|
| 117 |
"auc_pr": lambda: average_precision_score(y_true, y_score),
|
|
@@ -207,13 +211,25 @@ def _validate_schema(sub_df: pd.DataFrame, cfg: dict) -> None:
|
|
| 207 |
)
|
| 208 |
if sub_df[s["id_col"]].duplicated().any():
|
| 209 |
raise ValueError(f"duplicate IDs in {s['id_col']}")
|
| 210 |
-
|
|
|
|
| 211 |
try:
|
| 212 |
preds = sub_df[s["pred_col"]].astype(float)
|
| 213 |
except (TypeError, ValueError) as e:
|
| 214 |
raise ValueError(f"pred_col not float-castable: {e}")
|
| 215 |
if (preds < 0).any() or (preds > 1).any():
|
| 216 |
raise ValueError("predictions must lie in [0, 1]")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
|
| 218 |
|
| 219 |
@app.post("/submit")
|
|
|
|
| 110 |
)
|
| 111 |
|
| 112 |
y_true = merged["Label"].astype(int)
|
| 113 |
+
if schema.get("pred_dtype") == "binary":
|
| 114 |
+
y_pred = merged["_pred"].astype(int)
|
| 115 |
+
y_score = y_pred.astype(float)
|
| 116 |
+
else:
|
| 117 |
+
y_score = merged["_pred"].astype(float)
|
| 118 |
+
y_pred = (y_score >= 0.5).astype(int)
|
| 119 |
funcs = {
|
| 120 |
"auc_roc": lambda: roc_auc_score(y_true, y_score),
|
| 121 |
"auc_pr": lambda: average_precision_score(y_true, y_score),
|
|
|
|
| 211 |
)
|
| 212 |
if sub_df[s["id_col"]].duplicated().any():
|
| 213 |
raise ValueError(f"duplicate IDs in {s['id_col']}")
|
| 214 |
+
dtype = s.get("pred_dtype")
|
| 215 |
+
if dtype == "float":
|
| 216 |
try:
|
| 217 |
preds = sub_df[s["pred_col"]].astype(float)
|
| 218 |
except (TypeError, ValueError) as e:
|
| 219 |
raise ValueError(f"pred_col not float-castable: {e}")
|
| 220 |
if (preds < 0).any() or (preds > 1).any():
|
| 221 |
raise ValueError("predictions must lie in [0, 1]")
|
| 222 |
+
elif dtype == "binary":
|
| 223 |
+
try:
|
| 224 |
+
preds = sub_df[s["pred_col"]].astype(int)
|
| 225 |
+
except (TypeError, ValueError) as e:
|
| 226 |
+
raise ValueError(f"pred_col not int-castable: {e}")
|
| 227 |
+
bad = ~preds.isin([0, 1])
|
| 228 |
+
if bad.any():
|
| 229 |
+
raise ValueError(
|
| 230 |
+
f"binary submission must contain only 0 or 1; "
|
| 231 |
+
f"got {int(bad.sum())} other values"
|
| 232 |
+
)
|
| 233 |
|
| 234 |
|
| 235 |
@app.post("/submit")
|