Spaces:

lanczos
/

graphtestbed

Running

graphtestbed / server /space /push_data.py

Zhu Jiajun (jz28583)

Single-repo dataset hosting on HF (GLUE-style subdirs)

bd3e9ac 16 days ago

5.75 kB

	"""One-shot uploader for the agent-visible features (train/val/test) to a
	single public HF dataset repo, organized GLUE-style as one subdir per task.

	Layout in the repo:

	lanczos/graphtestbed-data/
	├── README.md
	├── arxiv-citation/{train,val,test}_features.csv + sample_submission.csv
	├── figraph/...
	├── ibm-aml/...
	└── ieee-fraud-detection/...

	The test_features.csv MUST already have its label column stripped out — this
	script does NOT strip or verify it. Inspect the CSVs yourself before upload;
	--dry-run only lists the files (with their sizes) that would be pushed.

	Usage:
	    HF_TOKEN=hf_xxx python server/space/push_data.py \
	        --repo lanczos/graphtestbed-data --src ~/.graphtestbed/data
	    # or one task at a time:
	    HF_TOKEN=hf_xxx python server/space/push_data.py \
	        --repo lanczos/graphtestbed-data --src ~/.graphtestbed/data \
	        --tasks figraph arxiv-citation
	"""

	from __future__ import annotations

	import argparse
	import os
	import sys
	import tempfile
	from pathlib import Path

	import yaml
	from huggingface_hub import HfApi, create_repo

	# Third ancestor of this file; for server/space/push_data.py that is the
	# checkout root.
	REPO_ROOT = Path(__file__).resolve().parents[2]

	# Task manifest, read by main() and keyed by task name.
	MANIFEST = REPO_ROOT.joinpath("datasets", "manifest.yaml")

	# The files every task subdirectory must contain, and the only ones uploaded.
	FILES = [
	    "train_features.csv",
	    "val_features.csv",
	    "test_features.csv",
	    "sample_submission.csv",
	]


	def _readme(tasks: list[str], cfg: dict) -> str:
	lines = [
	"---",
	"license: mit",
	"tags: [graph, benchmark, fraud-detection, graph-ml]",
	"---",
	"",
	"# GraphTestbed Datasets",
	"",
	"Public train/val/test features for the four [GraphTestbed]"
	"(https://github.com/zhuconv/GraphTestbed) tasks. Test labels are"
	" held privately by the scoring server.",
	"",
	"## Why a single repo",
	"",
	"GLUE-style: one repo, one subdir per task, one README. Adding a"
	" new task is a `git push` of one folder, not a new HF repo.",
	"",
	"## Subsets",
	"",
	"\| Task \| id col \| metric \| rows (train/val/test) \| Source \|",
	"\| --- \| --- \| --- \| --- \| --- \|",
	]
	for t in tasks:
	c = cfg[t]
	s = c["submission_schema"]
	m = c["metric"]
	# Pull the first sentence of the description as the source line
	desc = (c.get("description", "") or "").split(".")[0]
	lines.append(
	f"\| `{t}` \| `{s['id_col']}` \| `{m['primary']}` \| "
	f"see csv \| {desc.strip()[:60]} \|"
	)
	lines += [
	"",
	"## Use",
	"",
	"```python",
	"from huggingface_hub import hf_hub_download",
	"import pandas as pd",
	"",
	"p = hf_hub_download(",
	" 'lanczos/graphtestbed-data', 'arxiv-citation/train_features.csv',",
	" repo_type='dataset',",
	")",
	"train = pd.read_csv(p)",
	"```",
	"",
	"Contract: treat upstream sources (e.g. relbench, FiGraph github,"
	" IBM AML kaggle) as out-of-bounds for evaluation purposes. Train +"
	" HPO on what's in this repo only.",
	"",
	"Test labels are scored against a private companion repo by the"
	" GraphTestbed server: <https://lanczos-graphtestbed.hf.space/>.",
	]
	return "\n".join(lines)


	def main() -> None:
	    """Parse CLI arguments and push per-task feature CSVs to a HF dataset repo.

	    Steps, in order:

	    1. Load the manifest and resolve the task list (``--tasks`` if given,
	       otherwise every manifest key, sorted).
	    2. Abort if a requested task is not in the manifest, or if any of the
	       ``FILES`` is missing under ``<src>/<task>/``.
	    3. With ``--dry-run``: print each file with its size and return. No
	       token is required and nothing is created or uploaded.
	    4. Otherwise require ``HF_TOKEN`` (or ``HUGGINGFACE_HUB_TOKEN``), call
	       ``create_repo(..., exist_ok=True, private=False)``, upload the
	       generated README, then upload each task folder as its own commit.

	    All failures detected here terminate the process through ``sys.exit``
	    with a human-readable message.
	    """
	    ap = argparse.ArgumentParser(prog="push_data")
	    ap.add_argument("--repo", required=True,
	                    help="HF dataset repo id, e.g. lanczos/graphtestbed-data")
	    ap.add_argument("--src", required=True, type=Path,
	                    help="Local source root (e.g. ~/.graphtestbed/data) — "
	                         "must contain a subdir per task with the 4 CSVs.")
	    ap.add_argument("--tasks", nargs="+", default=None,
	                    help="Limit to these task names (default: all in manifest)")
	    ap.add_argument("--dry-run", action="store_true")
	    args = ap.parse_args()

	    # An empty manifest parses to None; normalise so the checks below can
	    # report it instead of crashing in sorted().
	    cfg = yaml.safe_load(MANIFEST.read_text(encoding="utf-8")) or {}
	    # dict.fromkeys drops repeated --tasks entries while keeping their order,
	    # so no task is uploaded twice.
	    tasks = list(dict.fromkeys(args.tasks or sorted(cfg)))
	    if not tasks:
	        sys.exit(f"No tasks to push: {MANIFEST} is empty.")

	    # Catch typos in --tasks here rather than as a KeyError while rendering
	    # the README.
	    unknown = [t for t in tasks if t not in cfg]
	    if unknown:
	        sys.exit("Unknown tasks (not in manifest): " + ", ".join(unknown))

	    src_root = args.src.expanduser()
	    missing = [
	        f"{t}/{f}"
	        for t in tasks
	        for f in FILES
	        if not (src_root / t / f).exists()
	    ]
	    if missing:
	        sys.exit("Missing files:\n " + "\n ".join(missing))

	    # The dry run is purely local, so it is handled before the token check.
	    if args.dry_run:
	        print(f"[dry-run] would push to {args.repo}:")
	        for t in tasks:
	            for f in FILES:
	                p = (src_root / t / f).resolve()
	                size_mb = p.stat().st_size / (1024 * 1024)
	                print(f" {t}/{f} ({size_mb:.1f} MB)")
	        return

	    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
	    if not token:
	        sys.exit("Set HF_TOKEN env var with write scope on the namespace.")

	    api = HfApi(token=token)

	    create_repo(args.repo, repo_type="dataset", token=token,
	                exist_ok=True, private=False)

	    # Write the README into a tempdir so we don't dirty the source root.
	    # The table is rendered from the full manifest, not just the tasks
	    # pushed in this run; otherwise a one-task push would overwrite the
	    # README with a table that omits every other task.
	    with tempfile.TemporaryDirectory() as td:
	        readme = Path(td) / "README.md"
	        readme.write_text(_readme(sorted(cfg), cfg), encoding="utf-8")
	        api.upload_file(
	            path_or_fileobj=str(readme),
	            path_in_repo="README.md",
	            repo_id=args.repo,
	            repo_type="dataset",
	            commit_message="Update README (auto from push_data.py)",
	        )

	    for t in tasks:
	        # upload_folder follows symlinks via the underlying open() calls.
	        api.upload_folder(
	            folder_path=str(src_root / t),
	            path_in_repo=t,
	            repo_id=args.repo,
	            repo_type="dataset",
	            allow_patterns=FILES,
	            commit_message=f"Push {t} train/val/test features",
	        )
	        print(f" ✓ {t}/")
	    print("Done.")


	# Run the uploader only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()