Zhu Jiajun (jz28583) Claude Opus 4.7 (1M context) committed on
Commit
b3dbec5
·
1 Parent(s): 640f1df

ibm-aml → binary submission for minority F1 (server: no threshold)

Browse files

Rationale: F1 with prob+0.5 threshold is unfair when positive rate is 0.19% —
a well-calibrated model gets F1 ≈ 0 because almost no probs exceed 0.5.
Switch to binary submission so the agent owns the threshold (typically tuned
on val), which is what the IBM Multi-GNN paper reports.

- manifest: pred_dtype binary, drop auc_roc secondary (it's degenerate
on binary input — the ROC curve collapses to a single operating point)
- submit.py + server/api.py: validate ints in {0,1}; score f1 from the raw 0/1 predictions
- agents/common/tasks.py: parameterize pred_type_desc so binary tasks
tell the agent to pick its own threshold

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

.claude/scheduled_tasks.lock ADDED
@@ -0,0 +1 @@
 
 
1
+ {"sessionId":"4b8c2006-8200-4510-a2fc-d6fd597095ce","pid":41708,"acquiredAt":1776710497823}
agents/common/tasks.py CHANGED
@@ -34,7 +34,7 @@ Write a CSV with **exactly two columns**, in this order:
34
  | column | type | meaning |
35
  | --- | --- | --- |
36
  | `{id_col}` | id | matches `test_features.csv[{id_col}]` 100% |
37
- | `{pred_col}` | float in [0, 1] | predicted score |
38
 
39
  Row count: **{n_rows}**.
40
 
@@ -44,6 +44,12 @@ You will be evaluated on `{primary}` (primary). Secondary: {secondary}.
44
  Optimize for the primary metric.
45
  """
46
 
 
 
 
 
 
 
47
 
48
  def task_instruction(task: str) -> str:
49
  override = Path(__file__).parent / "tasks_md" / f"{task}.md"
@@ -52,12 +58,17 @@ def task_instruction(task: str) -> str:
52
  cfg = task_config(task)
53
  s = cfg["submission_schema"]
54
  m = cfg["metric"]
 
 
 
55
  return _TEMPLATE.format(
56
  task=task,
57
  description=str(cfg.get("description", "")).strip(),
58
  id_col=s["id_col"],
59
  pred_col=s["pred_col"],
60
  n_rows=s.get("n_rows", "?"),
 
 
61
  primary=m["primary"],
62
  secondary=", ".join(m.get("secondary", [])) or "(none)",
63
  )
 
34
  | column | type | meaning |
35
  | --- | --- | --- |
36
  | `{id_col}` | id | matches `test_features.csv[{id_col}]` 100% |
37
+ | `{pred_col}` | {pred_type_desc} | {pred_meaning} |
38
 
39
  Row count: **{n_rows}**.
40
 
 
44
  Optimize for the primary metric.
45
  """
46
 
47
+ _DTYPE_DESC = {
48
+ "float": ("float in [0, 1]", "predicted score (probability)"),
49
+ "binary": ("0 or 1", "predicted hard class — pick your own threshold "
50
+ "(e.g. tune on val to maximize the primary metric)"),
51
+ }
52
+
53
 
54
  def task_instruction(task: str) -> str:
55
  override = Path(__file__).parent / "tasks_md" / f"{task}.md"
 
58
  cfg = task_config(task)
59
  s = cfg["submission_schema"]
60
  m = cfg["metric"]
61
+ type_desc, meaning = _DTYPE_DESC.get(
62
+ s.get("pred_dtype", "float"), _DTYPE_DESC["float"]
63
+ )
64
  return _TEMPLATE.format(
65
  task=task,
66
  description=str(cfg.get("description", "")).strip(),
67
  id_col=s["id_col"],
68
  pred_col=s["pred_col"],
69
  n_rows=s.get("n_rows", "?"),
70
+ pred_type_desc=type_desc,
71
+ pred_meaning=meaning,
72
  primary=m["primary"],
73
  secondary=", ".join(m.get("secondary", [])) or "(none)",
74
  )
datasets/manifest.yaml CHANGED
@@ -145,21 +145,20 @@ ibm-aml:
145
  id_col: transaction_id
146
  pred_col: is_laundering
147
  n_rows: 863900
148
- pred_dtype: float
149
  metric:
150
  primary: f1
151
  secondary:
152
  - auc_pr
153
- - auc_roc
154
  description: 'Predict whether each transaction is part of a money-laundering pattern.
155
  Source: IBM Transactions for AML (ealtman2019/ibm-transactions-for-anti-money-laundering-aml
156
  on Kaggle), HI-Small_Trans.csv variant (~5M total rows). Split: per IBM Multi-GNN
157
  convention (github.com/IBM/Multi-GNN), sort by Timestamp, partition by day to
158
  ~[0.6, 0.2, 0.2]. transaction_id = row index after the global sort. Test rows:
159
- 863,900 (~0.19% positive — heavy class imbalance, hence AUC-PR primary).
160
 
161
 
162
- Metric: F1 on the minority (laundering) class as primary the dataset is ~0.19%
163
- positive, so a model that predicts all-zeros gets a trivially high accuracy but
164
- useless F1. AUC-PR and AUC-ROC reported as secondary; AUC-PR is the IBM Multi-GNN
165
- paper baseline metric and stays useful for reference.'
 
145
  id_col: transaction_id
146
  pred_col: is_laundering
147
  n_rows: 863900
148
+ pred_dtype: binary
149
  metric:
150
  primary: f1
151
  secondary:
152
  - auc_pr
 
153
  description: 'Predict whether each transaction is part of a money-laundering pattern.
154
  Source: IBM Transactions for AML (ealtman2019/ibm-transactions-for-anti-money-laundering-aml
155
  on Kaggle), HI-Small_Trans.csv variant (~5M total rows). Split: per IBM Multi-GNN
156
  convention (github.com/IBM/Multi-GNN), sort by Timestamp, partition by day to
157
  ~[0.6, 0.2, 0.2]. transaction_id = row index after the global sort. Test rows:
158
+ 863,900 (~0.19% positive — heavy class imbalance).
159
 
160
 
161
+ Metric: F1 on the minority (laundering) class as primary. Submission must be
162
+ binary 0/1 (you pick the threshold yourself typically by maximizing F1 on val).
163
+ AUC-PR (computed from your binary submission, so degenerates to a single point)
164
+ is reported as secondary for reference vs the IBM Multi-GNN paper baseline.'
graphtestbed/submit.py CHANGED
@@ -52,13 +52,25 @@ def validate_submission(task: str, csv_path: Path) -> dict:
52
  if df[s["id_col"]].duplicated().any():
53
  raise SystemExit(f"Duplicate IDs in {s['id_col']} column")
54
 
55
- if s.get("pred_dtype") == "float":
 
56
  try:
57
  preds = df[s["pred_col"]].astype(float)
58
  except (TypeError, ValueError) as e:
59
  raise SystemExit(f"pred_col not float-castable: {e}")
60
  if (preds < 0).any() or (preds > 1).any():
61
  raise SystemExit("predictions must lie in [0, 1]")
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  return {
64
  "n_rows": len(df),
 
52
  if df[s["id_col"]].duplicated().any():
53
  raise SystemExit(f"Duplicate IDs in {s['id_col']} column")
54
 
55
+ dtype = s.get("pred_dtype")
56
+ if dtype == "float":
57
  try:
58
  preds = df[s["pred_col"]].astype(float)
59
  except (TypeError, ValueError) as e:
60
  raise SystemExit(f"pred_col not float-castable: {e}")
61
  if (preds < 0).any() or (preds > 1).any():
62
  raise SystemExit("predictions must lie in [0, 1]")
63
+ elif dtype == "binary":
64
+ try:
65
+ preds = df[s["pred_col"]].astype(int)
66
+ except (TypeError, ValueError) as e:
67
+ raise SystemExit(f"pred_col not int-castable: {e}")
68
+ bad = ~preds.isin([0, 1])
69
+ if bad.any():
70
+ raise SystemExit(
71
+ f"binary submission must contain only 0 or 1; "
72
+ f"got {int(bad.sum())} other values"
73
+ )
74
 
75
  return {
76
  "n_rows": len(df),
server/api.py CHANGED
@@ -110,8 +110,12 @@ def _score(task: str, sub_df: pd.DataFrame, cfg: dict) -> dict:
110
  )
111
 
112
  y_true = merged["Label"].astype(int)
113
- y_score = merged["_pred"].astype(float)
114
- y_pred = (y_score >= 0.5).astype(int)
 
 
 
 
115
  funcs = {
116
  "auc_roc": lambda: roc_auc_score(y_true, y_score),
117
  "auc_pr": lambda: average_precision_score(y_true, y_score),
@@ -207,13 +211,25 @@ def _validate_schema(sub_df: pd.DataFrame, cfg: dict) -> None:
207
  )
208
  if sub_df[s["id_col"]].duplicated().any():
209
  raise ValueError(f"duplicate IDs in {s['id_col']}")
210
- if s.get("pred_dtype") == "float":
 
211
  try:
212
  preds = sub_df[s["pred_col"]].astype(float)
213
  except (TypeError, ValueError) as e:
214
  raise ValueError(f"pred_col not float-castable: {e}")
215
  if (preds < 0).any() or (preds > 1).any():
216
  raise ValueError("predictions must lie in [0, 1]")
 
 
 
 
 
 
 
 
 
 
 
217
 
218
 
219
  @app.post("/submit")
 
110
  )
111
 
112
  y_true = merged["Label"].astype(int)
113
+ if schema.get("pred_dtype") == "binary":
114
+ y_pred = merged["_pred"].astype(int)
115
+ y_score = y_pred.astype(float)
116
+ else:
117
+ y_score = merged["_pred"].astype(float)
118
+ y_pred = (y_score >= 0.5).astype(int)
119
  funcs = {
120
  "auc_roc": lambda: roc_auc_score(y_true, y_score),
121
  "auc_pr": lambda: average_precision_score(y_true, y_score),
 
211
  )
212
  if sub_df[s["id_col"]].duplicated().any():
213
  raise ValueError(f"duplicate IDs in {s['id_col']}")
214
+ dtype = s.get("pred_dtype")
215
+ if dtype == "float":
216
  try:
217
  preds = sub_df[s["pred_col"]].astype(float)
218
  except (TypeError, ValueError) as e:
219
  raise ValueError(f"pred_col not float-castable: {e}")
220
  if (preds < 0).any() or (preds > 1).any():
221
  raise ValueError("predictions must lie in [0, 1]")
222
+ elif dtype == "binary":
223
+ try:
224
+ preds = sub_df[s["pred_col"]].astype(int)
225
+ except (TypeError, ValueError) as e:
226
+ raise ValueError(f"pred_col not int-castable: {e}")
227
+ bad = ~preds.isin([0, 1])
228
+ if bad.any():
229
+ raise ValueError(
230
+ f"binary submission must contain only 0 or 1; "
231
+ f"got {int(bad.sum())} other values"
232
+ )
233
 
234
 
235
  @app.post("/submit")