"""One-shot uploader for the agent-visible features (train/val/test) to a single public HF dataset repo, organized GLUE-style as one subdir per task. Layout in the repo: lanczos/graphtestbed-data/ ├── README.md ├── arxiv-citation/{train,val,test}_features.csv + sample_submission.csv ├── figraph/... ├── ibm-aml/... └── ieee-fraud-detection/... The test_features.csv MUST already have its label column stripped out — this script does NOT strip it. Spot-check before upload by running with --dry-run. Usage: HF_TOKEN=hf_xxx python server/space/push_data.py \ --repo lanczos/graphtestbed-data --src ~/.graphtestbed/data # or one task at a time: HF_TOKEN=hf_xxx python server/space/push_data.py \ --repo lanczos/graphtestbed-data --src ~/.graphtestbed/data \ --tasks figraph arxiv-citation """ from __future__ import annotations import argparse import os import sys import tempfile from pathlib import Path import yaml from huggingface_hub import HfApi, create_repo REPO_ROOT = Path(__file__).resolve().parents[2] MANIFEST = REPO_ROOT / "datasets" / "manifest.yaml" FILES = ["train_features.csv", "val_features.csv", "test_features.csv", "sample_submission.csv"] def _readme(tasks: list[str], cfg: dict) -> str: lines = [ "---", "license: mit", "tags: [graph, benchmark, fraud-detection, graph-ml]", "---", "", "# GraphTestbed Datasets", "", "Public train/val/test features for the four [GraphTestbed]" "(https://github.com/zhuconv/GraphTestbed) tasks. Test labels are" " held privately by the scoring server.", "", "## Why a single repo", "", "GLUE-style: one repo, one subdir per task, one README. Adding a" " new task is a `git push` of one folder, not a new HF repo.", "", "## Subsets", "", "| Task | id col | metric | rows (train/val/test) | Source |", "| --- | --- | --- | --- | --- |", ] for t in tasks: c = cfg[t] s = c["submission_schema"] m = c["metric"] # Pull the first sentence of the description as the source line desc = (c.get("description", "") or "").split(".")[0] lines.append( f"| `{t}` | `{s['id_col']}` | `{m['primary']}` | " f"see csv | {desc.strip()[:60]} |" ) lines += [ "", "## Use", "", "```python", "from huggingface_hub import hf_hub_download", "import pandas as pd", "", "p = hf_hub_download(", " 'lanczos/graphtestbed-data', 'arxiv-citation/train_features.csv',", " repo_type='dataset',", ")", "train = pd.read_csv(p)", "```", "", "**Contract:** treat upstream sources (e.g. relbench, FiGraph github," " IBM AML kaggle) as out-of-bounds for evaluation purposes. Train +" " HPO on what's in this repo only.", "", "Test labels are scored against a private companion repo by the" " GraphTestbed server: .", ] return "\n".join(lines) def main() -> None: ap = argparse.ArgumentParser(prog="push_data") ap.add_argument("--repo", required=True, help="HF dataset repo id, e.g. lanczos/graphtestbed-data") ap.add_argument("--src", required=True, type=Path, help="Local source root (e.g. 

def _readme(tasks: list[str], cfg: dict) -> str:
    lines = [
        "---",
        "license: mit",
        "tags: [graph, benchmark, fraud-detection, graph-ml]",
        "---",
        "",
        "# GraphTestbed Datasets",
        "",
        "Public train/val/test features for the four [GraphTestbed]"
        "(https://github.com/zhuconv/GraphTestbed) tasks. Test labels are"
        " held privately by the scoring server.",
        "",
        "## Why a single repo",
        "",
        "GLUE-style: one repo, one subdir per task, one README. Adding a"
        " new task is a `git push` of one folder, not a new HF repo.",
        "",
        "## Subsets",
        "",
        "| Task | id col | metric | rows (train/val/test) | Source |",
        "| --- | --- | --- | --- | --- |",
    ]
    for t in tasks:
        c = cfg[t]
        s = c["submission_schema"]
        m = c["metric"]
        # Pull the first sentence of the description as the source line.
        desc = (c.get("description", "") or "").split(".")[0]
        lines.append(
            f"| `{t}` | `{s['id_col']}` | `{m['primary']}` | "
            f"see csv | {desc.strip()[:60]} |"
        )
    lines += [
        "",
        "## Use",
        "",
        "```python",
        "from huggingface_hub import hf_hub_download",
        "import pandas as pd",
        "",
        "p = hf_hub_download(",
        "    'lanczos/graphtestbed-data', 'arxiv-citation/train_features.csv',",
        "    repo_type='dataset',",
        ")",
        "train = pd.read_csv(p)",
        "```",
        "",
        "**Contract:** treat upstream sources (e.g. relbench, FiGraph github,"
        " IBM AML kaggle) as out-of-bounds for evaluation purposes. Train +"
        " HPO on what's in this repo only.",
        "",
        "Test labels are scored against a private companion repo by the"
        " GraphTestbed server: .",
    ]
    return "\n".join(lines)


def main() -> None:
    ap = argparse.ArgumentParser(prog="push_data")
    ap.add_argument("--repo", required=True,
                    help="HF dataset repo id, e.g. lanczos/graphtestbed-data")
    ap.add_argument("--src", required=True, type=Path,
                    help="Local source root (e.g. ~/.graphtestbed/data) — "
                         "must contain a subdir per task with the 4 CSVs.")
    ap.add_argument("--tasks", nargs="+", default=None,
                    help="Limit to these task names (default: all in manifest)")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()

    cfg = yaml.safe_load(MANIFEST.read_text())
    tasks = args.tasks or sorted(cfg)
    # Fail early on task names that aren't in the manifest; cfg[t] would
    # otherwise raise an opaque KeyError inside _readme().
    unknown = [t for t in tasks if t not in cfg]
    if unknown:
        sys.exit("Tasks not in manifest: " + ", ".join(unknown))
    src_root = args.src.expanduser()

    # Verify all four per-task CSVs exist locally before any network work.
    missing = []
    for t in tasks:
        for f in FILES:
            if not (src_root / t / f).exists():
                missing.append(f"{t}/{f}")
    if missing:
        sys.exit("Missing files:\n " + "\n ".join(missing))

    # Accept either of the two common HF token env var names.
    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
    if not token:
        sys.exit("Set HF_TOKEN env var with write scope on the namespace.")
    api = HfApi(token=token)

    if args.dry_run:
        print(f"[dry-run] would push to {args.repo}:")
        for t in tasks:
            for f in FILES:
                p = (src_root / t / f).resolve()
                size_mb = p.stat().st_size / (1024 * 1024)
                print(f"  {t}/{f} ({size_mb:.1f} MB)")
        return

    create_repo(args.repo, repo_type="dataset", token=token,
                exist_ok=True, private=False)

    # Write the README into a tempdir so we don't dirty the source root.
    with tempfile.TemporaryDirectory() as td:
        readme = Path(td) / "README.md"
        readme.write_text(_readme(tasks, cfg))
        api.upload_file(
            path_or_fileobj=str(readme),
            path_in_repo="README.md",
            repo_id=args.repo,
            repo_type="dataset",
            commit_message="Update README (auto from push_data.py)",
        )

    for t in tasks:
        # upload_folder follows symlinks via the underlying open() calls.
        api.upload_folder(
            folder_path=str(src_root / t),
            path_in_repo=t,
            repo_id=args.repo,
            repo_type="dataset",
            allow_patterns=FILES,
            commit_message=f"Push {t} train/val/test features",
        )
        print(f" ✓ {t}/")

    print("Done.")


if __name__ == "__main__":
    main()
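
# One quick way to spot-check a finished push (a sketch, assuming the repo
# is public; this is not invoked by the script itself):
#
#   from huggingface_hub import HfApi
#   print(HfApi().list_repo_files("lanczos/graphtestbed-data",
#                                 repo_type="dataset"))
#
# Expect README.md plus the four CSVs under each task subdir.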