Spaces:

lanczos
/

graphtestbed

Running

File size: 5,747 Bytes

bd3e9ac

"""One-shot uploader for the agent-visible features (train/val/test) to a
single public HF dataset repo, organized GLUE-style as one subdir per task.

Layout in the repo:

    lanczos/graphtestbed-data/
    ├── README.md
    ├── arxiv-citation/{train,val,test}_features.csv + sample_submission.csv
    ├── figraph/...
    ├── ibm-aml/...
    └── ieee-fraud-detection/...

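The test_features.csv MUST already have its label column stripped out — this
script does NOT strip it. Running with --dry-run lists the files (and their
sizes) that would be uploaded, but it does not inspect their contents, so
check the test_features.csv header yourself before uploading.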

Usage:
    HF_TOKEN=hf_xxx python server/space/push_data.py \
        --repo lanczos/graphtestbed-data --src ~/.graphtestbed/data
    # or one task at a time:
    HF_TOKEN=hf_xxx python server/space/push_data.py \
        --repo lanczos/graphtestbed-data --src ~/.graphtestbed/data \
        --tasks figraph arxiv-citation
"""

from __future__ import annotations

import argparse
import os
import sys
import tempfile
from pathlib import Path

import yaml
from huggingface_hub import HfApi, create_repo

# Third ancestor of this file's resolved path (file -> dir -> parent -> here);
# with the script at server/space/push_data.py this is the repository root.
REPO_ROOT = Path(__file__).resolve().parents[2]
# YAML manifest describing every task (schema, metric, description).
MANIFEST = REPO_ROOT.joinpath("datasets", "manifest.yaml")

# The CSVs each task subdirectory must contain; also used as the upload
# allow-list so nothing else in the task folder is pushed.
FILES = [f"{split}_features.csv" for split in ("train", "val", "test")]
FILES.append("sample_submission.csv")


def _readme(tasks: list[str], cfg: dict) -> str:
    lines = [
        "---",
        "license: mit",
        "tags: [graph, benchmark, fraud-detection, graph-ml]",
        "---",
        "",
        "# GraphTestbed Datasets",
        "",
        "Public train/val/test features for the four [GraphTestbed]"
        "(https://github.com/zhuconv/GraphTestbed) tasks. Test labels are"
        " held privately by the scoring server.",
        "",
        "## Why a single repo",
        "",
        "GLUE-style: one repo, one subdir per task, one README. Adding a"
        " new task is a `git push` of one folder, not a new HF repo.",
        "",
        "## Subsets",
        "",
        "| Task | id col | metric | rows (train/val/test) | Source |",
        "| --- | --- | --- | --- | --- |",
    ]
    for t in tasks:
        c = cfg[t]
        s = c["submission_schema"]
        m = c["metric"]
        # Pull the first sentence of the description as the source line
        desc = (c.get("description", "") or "").split(".")[0]
        lines.append(
            f"| `{t}` | `{s['id_col']}` | `{m['primary']}` | "
            f"see csv | {desc.strip()[:60]} |"
        )
    lines += [
        "",
        "## Use",
        "",
        "```python",
        "from huggingface_hub import hf_hub_download",
        "import pandas as pd",
        "",
        "p = hf_hub_download(",
        "    'lanczos/graphtestbed-data', 'arxiv-citation/train_features.csv',",
        "    repo_type='dataset',",
        ")",
        "train = pd.read_csv(p)",
        "```",
        "",
        "**Contract:** treat upstream sources (e.g. relbench, FiGraph github,"
        " IBM AML kaggle) as out-of-bounds for evaluation purposes. Train +"
        " HPO on what's in this repo only.",
        "",
        "Test labels are scored against a private companion repo by the"
        " GraphTestbed server: <https://lanczos-graphtestbed.hf.space/>.",
    ]
    return "\n".join(lines)


def main() -> None:
    """CLI entry point: validate local task folders, then push them to HF.

    Steps, in order:
      1. Parse ``--repo``, ``--src``, ``--tasks`` and ``--dry-run``.
      2. Load the manifest and resolve the task list (``--tasks`` or every
         manifest key, sorted).
      3. Exit if a requested task is not in the manifest or if any of the
         expected CSVs (``FILES``) is missing under ``<src>/<task>/``.
      4. With ``--dry-run``: print each file and its size, then return. No
         token is needed and nothing is sent to the Hub.
      5. Otherwise: require ``HF_TOKEN`` / ``HUGGINGFACE_HUB_TOKEN``, create
         the public dataset repo if absent, upload the generated README, and
         upload one folder per task.

    Exits via ``sys.exit`` with a message on any validation failure.
    """
    ap = argparse.ArgumentParser(prog="push_data")
    ap.add_argument("--repo", required=True,
                    help="HF dataset repo id, e.g. lanczos/graphtestbed-data")
    ap.add_argument("--src", required=True, type=Path,
                    help="Local source root (e.g. ~/.graphtestbed/data) — "
                         "must contain a subdir per task with the 4 CSVs.")
    ap.add_argument("--tasks", nargs="+", default=None,
                    help="Limit to these task names (default: all in manifest)")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()

    # Read as UTF-8 explicitly so the result does not depend on the locale;
    # an empty manifest parses to None, which is treated as "no tasks".
    cfg = yaml.safe_load(MANIFEST.read_text(encoding="utf-8")) or {}
    if not isinstance(cfg, dict):
        sys.exit(f"Manifest {MANIFEST} must be a mapping of task name -> config.")
    tasks = args.tasks or sorted(cfg)

    # Reject unknown names up front: the README builder indexes the manifest
    # by task, so a typo would otherwise fail only after the repo is created.
    unknown = [t for t in tasks if t not in cfg]
    if unknown:
        sys.exit("Unknown task(s), not in manifest: " + ", ".join(unknown))

    src_root = args.src.expanduser()
    missing = [
        f"{t}/{f}"
        for t in tasks
        for f in FILES
        if not (src_root / t / f).exists()
    ]
    if missing:
        sys.exit("Missing files:\n  " + "\n  ".join(missing))

    # Dry-run is purely local, so it is handled before any credential check.
    if args.dry_run:
        print(f"[dry-run] would push to {args.repo}:")
        for t in tasks:
            for f in FILES:
                p = (src_root / t / f).resolve()
                size_mb = p.stat().st_size / (1024 * 1024)
                print(f"  {t}/{f}  ({size_mb:.1f} MB)")
        return

    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
    if not token:
        sys.exit("Set HF_TOKEN env var with write scope on the namespace.")

    api = HfApi(token=token)

    create_repo(args.repo, repo_type="dataset", token=token,
                exist_ok=True, private=False)

    # Upload the README straight from memory (UTF-8 bytes) so nothing is
    # written to the source root or a temp file.
    # NOTE(review): with --tasks, the table lists only the selected tasks and
    # this upload replaces the repo's existing README.md — confirm intended.
    api.upload_file(
        path_or_fileobj=_readme(tasks, cfg).encode("utf-8"),
        path_in_repo="README.md",
        repo_id=args.repo,
        repo_type="dataset",
        commit_message="Update README (auto from push_data.py)",
    )

    for t in tasks:
        # Only the expected CSVs are pushed (allow_patterns=FILES).
        # NOTE(review): the original comment says upload_folder follows
        # symlinks via the underlying open() calls — confirm against the
        # installed huggingface_hub version.
        api.upload_folder(
            folder_path=str(src_root / t),
            path_in_repo=t,
            repo_id=args.repo,
            repo_type="dataset",
            allow_patterns=FILES,
            commit_message=f"Push {t} train/val/test features",
        )
        print(f"  ✓ {t}/")
    print("Done.")


if __name__ == "__main__":
    main()