graphtestbed / server /space /push_data.py
Zhu Jiajun (jz28583)
Single-repo dataset hosting on HF (GLUE-style subdirs)
bd3e9ac
"""One-shot uploader for the agent-visible features (train/val/test) to a
single public HF dataset repo, organized GLUE-style as one subdir per task.
Layout in the repo:
lanczos/graphtestbed-data/
├── README.md
├── arxiv-citation/{train,val,test}_features.csv + sample_submission.csv
├── figraph/...
├── ibm-aml/...
└── ieee-fraud-detection/...
The test_features.csv MUST already have its label column stripped out — this
script does NOT strip it. Spot-check before upload by running with --dry-run.
Usage:
HF_TOKEN=hf_xxx python server/space/push_data.py \
--repo lanczos/graphtestbed-data --src ~/.graphtestbed/data
# or one task at a time:
HF_TOKEN=hf_xxx python server/space/push_data.py \
--repo lanczos/graphtestbed-data --src ~/.graphtestbed/data \
--tasks figraph arxiv-citation
"""
from __future__ import annotations
import argparse
import os
import sys
import tempfile
from pathlib import Path
import yaml
from huggingface_hub import HfApi, create_repo
# Third ancestor directory of this file; for server/space/push_data.py that is
# the repository checkout root.
REPO_ROOT = Path(__file__).resolve().parents[2]

# YAML manifest that main() loads to discover the task names and their config.
MANIFEST = REPO_ROOT.joinpath("datasets", "manifest.yaml")

# File names every task subdirectory must contain; also used as the upload
# allow-list, so nothing else in a task folder is pushed.
FILES = [
    *(f"{split}_features.csv" for split in ("train", "val", "test")),
    "sample_submission.csv",
]
def _readme(tasks: list[str], cfg: dict) -> str:
lines = [
"---",
"license: mit",
"tags: [graph, benchmark, fraud-detection, graph-ml]",
"---",
"",
"# GraphTestbed Datasets",
"",
"Public train/val/test features for the four [GraphTestbed]"
"(https://github.com/zhuconv/GraphTestbed) tasks. Test labels are"
" held privately by the scoring server.",
"",
"## Why a single repo",
"",
"GLUE-style: one repo, one subdir per task, one README. Adding a"
" new task is a `git push` of one folder, not a new HF repo.",
"",
"## Subsets",
"",
"| Task | id col | metric | rows (train/val/test) | Source |",
"| --- | --- | --- | --- | --- |",
]
for t in tasks:
c = cfg[t]
s = c["submission_schema"]
m = c["metric"]
# Pull the first sentence of the description as the source line
desc = (c.get("description", "") or "").split(".")[0]
lines.append(
f"| `{t}` | `{s['id_col']}` | `{m['primary']}` | "
f"see csv | {desc.strip()[:60]} |"
)
lines += [
"",
"## Use",
"",
"```python",
"from huggingface_hub import hf_hub_download",
"import pandas as pd",
"",
"p = hf_hub_download(",
" 'lanczos/graphtestbed-data', 'arxiv-citation/train_features.csv',",
" repo_type='dataset',",
")",
"train = pd.read_csv(p)",
"```",
"",
"**Contract:** treat upstream sources (e.g. relbench, FiGraph github,"
" IBM AML kaggle) as out-of-bounds for evaluation purposes. Train +"
" HPO on what's in this repo only.",
"",
"Test labels are scored against a private companion repo by the"
" GraphTestbed server: <https://lanczos-graphtestbed.hf.space/>.",
]
return "\n".join(lines)
def main() -> None:
    """CLI entry point: validate the local task folders, then push them to HF.

    Steps, in order:
      1. Parse ``--repo``, ``--src``, ``--tasks`` and ``--dry-run``.
      2. Load the manifest and resolve the task list (all manifest tasks when
         ``--tasks`` is omitted).
      3. Abort if a requested task is not in the manifest or if any of the
         expected CSVs (``FILES``) is missing under ``<src>/<task>/``.
      4. With ``--dry-run``: print what would be uploaded and return. No token
         is needed because nothing is sent.
      5. Otherwise: require ``HF_TOKEN`` / ``HUGGINGFACE_HUB_TOKEN``, create
         the public dataset repo if needed, upload the generated README, then
         upload each task folder (restricted to ``FILES``).

    Raises:
        SystemExit: On invalid arguments, an empty manifest, unknown tasks,
            missing files, or a missing token.
    """
    ap = argparse.ArgumentParser(prog="push_data")
    ap.add_argument("--repo", required=True,
                    help="HF dataset repo id, e.g. lanczos/graphtestbed-data")
    ap.add_argument("--src", required=True, type=Path,
                    help="Local source root (e.g. ~/.graphtestbed/data) — "
                         "must contain a subdir per task with the 4 CSVs.")
    ap.add_argument("--tasks", nargs="+", default=None,
                    help="Limit to these task names (default: all in manifest)")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()

    # An empty manifest parses to None; normalise it so the checks below can
    # report the problem instead of crashing in sorted().
    cfg = yaml.safe_load(MANIFEST.read_text(encoding="utf-8")) or {}
    tasks = args.tasks or sorted(cfg)
    if not tasks:
        sys.exit(f"No tasks found in {MANIFEST}.")

    # Reject unknown names up front; otherwise the failure would surface as a
    # bare KeyError during README generation, after the repo has been created.
    unknown = [t for t in tasks if t not in cfg]
    if unknown:
        sys.exit("Unknown tasks (not in manifest):\n " + "\n ".join(unknown))

    src_root = args.src.expanduser()
    missing = [
        f"{t}/{f}"
        for t in tasks
        for f in FILES
        if not (src_root / t / f).exists()
    ]
    if missing:
        sys.exit("Missing files:\n " + "\n ".join(missing))

    # A dry run only inspects local files, so it is handled before the token
    # check and before any Hub client is created.
    if args.dry_run:
        print(f"[dry-run] would push to {args.repo}:")
        for t in tasks:
            for f in FILES:
                p = (src_root / t / f).resolve()
                size_mb = p.stat().st_size / (1024 * 1024)
                print(f" {t}/{f} ({size_mb:.1f} MB)")
        return

    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
    if not token:
        sys.exit("Set HF_TOKEN env var with write scope on the namespace.")
    api = HfApi(token=token)

    create_repo(args.repo, repo_type="dataset", token=token,
                exist_ok=True, private=False)

    # Write the README into a tempdir so we don't dirty the source root.
    with tempfile.TemporaryDirectory() as td:
        readme = Path(td) / "README.md"
        # The README describes the whole repo, so its table is built from
        # every manifest task. Building it from a --tasks subset would make a
        # partial push silently drop the other tasks from the table.
        readme.write_text(_readme(sorted(cfg), cfg), encoding="utf-8")
        api.upload_file(
            path_or_fileobj=str(readme),
            path_in_repo="README.md",
            repo_id=args.repo,
            repo_type="dataset",
            commit_message="Update README (auto from push_data.py)",
        )

    for t in tasks:
        # NOTE(review): the original author states that upload_folder follows
        # symlinks via the underlying open() calls; confirm against the
        # huggingface_hub version in use if the source root holds symlinks.
        api.upload_folder(
            folder_path=str(src_root / t),
            path_in_repo=t,
            repo_id=args.repo,
            repo_type="dataset",
            allow_patterns=FILES,
            commit_message=f"Push {t} train/val/test features",
        )
        print(f" ✓ {t}/")
    print("Done.")
# Run the uploader only when this file is executed as a script; importing the
# module must not trigger argument parsing or any upload.
if __name__ == "__main__":
    main()