"""One-shot uploader for the agent-visible features (train/val/test) to a
single public HF dataset repo, organized GLUE-style as one subdir per task.
Layout in the repo:
lanczos/graphtestbed-data/
├── README.md
├── arxiv-citation/{train,val,test}_features.csv + sample_submission.csv
├── figraph/...
├── ibm-aml/...
└── ieee-fraud-detection/...
The test_features.csv MUST already have its label column stripped out — this
script does NOT strip it. Spot-check before upload by running with --dry-run.
Usage:
HF_TOKEN=hf_xxx python server/space/push_data.py \
--repo lanczos/graphtestbed-data --src ~/.graphtestbed/data
# or one task at a time:
HF_TOKEN=hf_xxx python server/space/push_data.py \
--repo lanczos/graphtestbed-data --src ~/.graphtestbed/data \
--tasks figraph arxiv-citation
"""
from __future__ import annotations
import argparse
import os
import sys
import tempfile
from pathlib import Path
import yaml
from huggingface_hub import HfApi, create_repo
# Directory two levels above the one containing this file (parents[0] is the
# file's own directory), taken after resolving symlinks.
REPO_ROOT = Path(__file__).resolve().parents[2]
# YAML manifest loaded by main(); its top-level keys are used as task names.
MANIFEST = REPO_ROOT / "datasets" / "manifest.yaml"
# Files every task subdirectory must contain. main() checks that each one
# exists and also passes this list as the upload allow-list.
FILES = ["train_features.csv", "val_features.csv",
         "test_features.csv", "sample_submission.csv"]
def _readme(tasks: list[str], cfg: dict) -> str:
lines = [
"---",
"license: mit",
"tags: [graph, benchmark, fraud-detection, graph-ml]",
"---",
"",
"# GraphTestbed Datasets",
"",
"Public train/val/test features for the four [GraphTestbed]"
"(https://github.com/zhuconv/GraphTestbed) tasks. Test labels are"
" held privately by the scoring server.",
"",
"## Why a single repo",
"",
"GLUE-style: one repo, one subdir per task, one README. Adding a"
" new task is a `git push` of one folder, not a new HF repo.",
"",
"## Subsets",
"",
"| Task | id col | metric | rows (train/val/test) | Source |",
"| --- | --- | --- | --- | --- |",
]
for t in tasks:
c = cfg[t]
s = c["submission_schema"]
m = c["metric"]
# Pull the first sentence of the description as the source line
desc = (c.get("description", "") or "").split(".")[0]
lines.append(
f"| `{t}` | `{s['id_col']}` | `{m['primary']}` | "
f"see csv | {desc.strip()[:60]} |"
)
lines += [
"",
"## Use",
"",
"```python",
"from huggingface_hub import hf_hub_download",
"import pandas as pd",
"",
"p = hf_hub_download(",
" 'lanczos/graphtestbed-data', 'arxiv-citation/train_features.csv',",
" repo_type='dataset',",
")",
"train = pd.read_csv(p)",
"```",
"",
"**Contract:** treat upstream sources (e.g. relbench, FiGraph github,"
" IBM AML kaggle) as out-of-bounds for evaluation purposes. Train +"
" HPO on what's in this repo only.",
"",
"Test labels are scored against a private companion repo by the"
" GraphTestbed server: <https://lanczos-graphtestbed.hf.space/>.",
]
return "\n".join(lines)
def main() -> None:
    """Push each task's feature CSVs and a generated README to a HF dataset repo.

    Steps, in order:

    1. Parse ``--repo``, ``--src``, ``--tasks`` and ``--dry-run``.
    2. Load the manifest and select tasks (``--tasks``, else every manifest key).
    3. Exit if a requested task is not in the manifest, or if any file named
       in ``FILES`` is missing from ``<src>/<task>/``.
    4. With ``--dry-run``: print the files and their sizes, then return. No
       token is needed and the Hub is not contacted.
    5. Otherwise: require ``HF_TOKEN`` (or ``HUGGINGFACE_HUB_TOKEN``), create
       the public dataset repo if absent, upload the README, then upload each
       task folder restricted to ``FILES``.

    Raises:
        SystemExit: On unknown tasks, missing files, or a missing token.
    """
    ap = argparse.ArgumentParser(prog="push_data")
    ap.add_argument("--repo", required=True,
                    help="HF dataset repo id, e.g. lanczos/graphtestbed-data")
    ap.add_argument("--src", required=True, type=Path,
                    help="Local source root (e.g. ~/.graphtestbed/data) — "
                         "must contain a subdir per task with the 4 CSVs.")
    ap.add_argument("--tasks", nargs="+", default=None,
                    help="Limit to these task names (default: all in manifest)")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()

    # Read as UTF-8 explicitly so the result does not depend on the locale;
    # an empty manifest parses to None, so fall back to an empty mapping.
    cfg = yaml.safe_load(MANIFEST.read_text(encoding="utf-8")) or {}
    tasks = args.tasks or sorted(cfg)

    # Reject unknown task names before any network call. _readme() indexes
    # cfg[task], so without this check a typo would only surface as a
    # KeyError after the repo had already been created.
    unknown = [t for t in tasks if t not in cfg]
    if unknown:
        sys.exit("Unknown tasks (not in manifest):\n " + "\n ".join(unknown))

    src_root = args.src.expanduser()
    missing = [
        f"{t}/{f}"
        for t in tasks
        for f in FILES
        if not (src_root / t / f).exists()
    ]
    if missing:
        sys.exit("Missing files:\n " + "\n ".join(missing))

    # The dry run is purely local, so it is handled before the token check.
    if args.dry_run:
        print(f"[dry-run] would push to {args.repo}:")
        for t in tasks:
            for f in FILES:
                p = (src_root / t / f).resolve()
                size_mb = p.stat().st_size / (1024 * 1024)
                print(f" {t}/{f} ({size_mb:.1f} MB)")
        return

    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
    if not token:
        sys.exit("Set HF_TOKEN env var with write scope on the namespace.")
    api = HfApi(token=token)

    create_repo(args.repo, repo_type="dataset", token=token,
                exist_ok=True, private=False)

    # Write the README into a tempdir so we don't dirty the source root.
    # NOTE(review): the README table lists only `tasks`, so a partial push
    # with --tasks replaces the repo README with one that omits the other
    # tasks. Confirm whether it should always list every manifest task.
    with tempfile.TemporaryDirectory() as td:
        readme = Path(td) / "README.md"
        readme.write_text(_readme(tasks, cfg), encoding="utf-8")
        api.upload_file(
            path_or_fileobj=str(readme),
            path_in_repo="README.md",
            repo_id=args.repo,
            repo_type="dataset",
            commit_message="Update README (auto from push_data.py)",
        )

    for t in tasks:
        # upload_folder follows symlinks via the underlying open() calls.
        # allow_patterns restricts the commit to the expected files only.
        api.upload_folder(
            folder_path=str(src_root / t),
            path_in_repo=t,
            repo_id=args.repo,
            repo_type="dataset",
            allow_patterns=FILES,
            commit_message=f"Push {t} train/val/test features",
        )
        print(f" ✓ {t}/")
    print("Done.")
# Script entry point: run the uploader only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|