"""GraphTestbed task → mle-bench-shaped data tree.

mle-bench expects, per experiment ID:

    <root>/<exp_id>/prepared/public/{train.csv,test.csv,description.md,sample_submission.csv}

GraphTestbed's test labels live only on the scoring server, so agent
predictions cannot be auto-scored against `test_features.csv` locally.
v1 strategy:

    - Stage `val_features.csv` (with labels) as the "test" the agent
      searches against. MLEvolve's grader can score val predictions locally,
      which is what drives MCGS exploration.
    - Stash the real `test_features.csv` next to the staged tree as
      `<root>/<exp_id>/REAL_TEST_FEATURES.csv` so users can re-execute the
      best runfile.py against it after the search finishes.

This is documented as a known limitation in agents/mlevolve/README.md.
"""

from __future__ import annotations

import shutil
from pathlib import Path

import pandas as pd

from agents.common.tasks import task_instruction
from graphtestbed._manifest import task_config
from graphtestbed.fetch import cache_dir


def stage(task: str, root: Path) -> Path:
    """Build <root>/<task>/prepared/{public,private}/. Return the prepared dir."""
    cfg = task_config(task)
    s = cfg["submission_schema"]

    src = cache_dir() / task
    if not src.exists():
        raise SystemExit(
            f"No cached dataset at {src}. Run `gtb fetch {task}` first."
        )

    base = root / task / "prepared"
    pub = base / "public"
    priv = base / "private"
    pub.mkdir(parents=True, exist_ok=True)
    priv.mkdir(parents=True, exist_ok=True)

    train = pd.read_csv(src / "train_features.csv")
    val = pd.read_csv(src / "val_features.csv")
    test = pd.read_csv(src / "test_features.csv")

    if s["pred_col"] not in val.columns:
        raise SystemExit(
            f"val_features.csv has no `{s['pred_col']}` column — cannot use "
            f"val as the local-grading split for task {task}."
        )

    # Public tree (what the agent sees). val_no_label = val minus label →
    # served as `test.csv` so the agent's runfile predicts on it.
    val_no_label = val.drop(columns=[s["pred_col"]])
    train.to_csv(pub / "train.csv", index=False)
    val_no_label.to_csv(pub / "test.csv", index=False)

    sample = val_no_label[[s["id_col"]]].copy()
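    # 0.5 is a neutral placeholder so the file parses as a valid submission
    # (assumes a probability-style prediction column; adjust per task schema).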
    sample[s["pred_col"]] = 0.5
    sample.to_csv(pub / "sample_submission.csv", index=False)

    (pub / "description.md").write_text(task_instruction(task))

    # Private tree: val with labels — the local grader checks submission
    # against this.
    val[[s["id_col"], s["pred_col"]]].rename(
        columns={s["pred_col"]: "Label"}
    ).to_csv(priv / "test.csv", index=False)

    # Stash the real test set for post-search re-execution by the user.
    test.to_csv(root / task / "REAL_TEST_FEATURES.csv", index=False)

    # Forward any additional task data files declared in the manifest (graph
    # edges, relation tables, …) into the public tree so the agent can build
    # a real graph model instead of treating the task as purely tabular.
    canonical = {"train_features.csv", "val_features.csv",
                 "test_features.csv", "sample_submission.csv"}
    for spec in cfg["files"].values():
        fn = spec["filename"]
        if fn in canonical:
            continue
        src_path = src / fn
        if src_path.exists():
            # Streamed copy; avoids loading large edge lists into memory.
            shutil.copyfile(src_path, pub / fn)

    return base
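

if __name__ == "__main__":
    # Hypothetical convenience entry point (not part of the documented
    # interface); stages one task by hand so the tree can be inspected:
    #
    #     python <this file> <task> <root>
    import sys

    if len(sys.argv) != 3:
        raise SystemExit(f"usage: {sys.argv[0]} <task> <root>")
    print(stage(sys.argv[1], Path(sys.argv[2])))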