| """One-shot uploader for the agent-visible features (train/val/test) to a | |
| single public HF dataset repo, organized GLUE-style as one subdir per task. | |
| Layout in the repo: | |
| lanczos/graphtestbed-data/ | |
| βββ README.md | |
| βββ arxiv-citation/{train,val,test}_features.csv + sample_submission.csv | |
| βββ figraph/... | |
| βββ ibm-aml/... | |
| βββ ieee-fraud-detection/... | |
| The test_features.csv MUST already have its label column stripped out β this | |
| script does NOT strip it. Spot-check before upload by running with --dry-run. | |
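
A minimal spot-check sketch (assumptions, not part of this script: pandas is
available, and "label" stands in for the task's real label column named in
datasets/manifest.yaml):

    import pandas as pd
    cols = pd.read_csv("~/.graphtestbed/data/figraph/test_features.csv").columns
    assert "label" not in cols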

Usage:

    HF_TOKEN=hf_xxx python server/space/push_data.py \
        --repo lanczos/graphtestbed-data --src ~/.graphtestbed/data

    # or one task at a time:
    HF_TOKEN=hf_xxx python server/space/push_data.py \
        --repo lanczos/graphtestbed-data --src ~/.graphtestbed/data \
        --tasks figraph arxiv-citation
"""
from __future__ import annotations

import argparse
import os
import sys
import tempfile
from pathlib import Path

import yaml
from huggingface_hub import HfApi, create_repo
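# Third-party deps (assumed installed alongside the server): pyyaml provides
# the `yaml` import; huggingface_hub provides HfApi/create_repo.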

REPO_ROOT = Path(__file__).resolve().parents[2]
MANIFEST = REPO_ROOT / "datasets" / "manifest.yaml"

FILES = ["train_features.csv", "val_features.csv",
         "test_features.csv", "sample_submission.csv"]


def _readme(tasks: list[str], cfg: dict) -> str:
    lines = [
        "---",
        "license: mit",
        "tags: [graph, benchmark, fraud-detection, graph-ml]",
        "---",
        "",
        "# GraphTestbed Datasets",
        "",
        "Public train/val/test features for the four [GraphTestbed]"
        "(https://github.com/zhuconv/GraphTestbed) tasks. Test labels are"
        " held privately by the scoring server.",
        "",
        "## Why a single repo",
        "",
        "GLUE-style: one repo, one subdir per task, one README. Adding a"
        " new task is a `git push` of one folder, not a new HF repo.",
        "",
        "## Subsets",
        "",
        "| Task | id col | metric | rows (train/val/test) | Source |",
        "| --- | --- | --- | --- | --- |",
    ]
    for t in tasks:
        c = cfg[t]
        s = c["submission_schema"]
        m = c["metric"]
        # Pull the first sentence of the description as the source line
        desc = (c.get("description", "") or "").split(".")[0]
        lines.append(
            f"| `{t}` | `{s['id_col']}` | `{m['primary']}` | "
            f"see csv | {desc.strip()[:60]} |"
        )
    lines += [
        "",
        "## Use",
        "",
        "```python",
        "from huggingface_hub import hf_hub_download",
        "import pandas as pd",
        "",
        "p = hf_hub_download(",
        "    'lanczos/graphtestbed-data', 'arxiv-citation/train_features.csv',",
        "    repo_type='dataset',",
        ")",
        "train = pd.read_csv(p)",
        "```",
        "",
        "**Contract:** treat upstream sources (e.g. relbench, FiGraph github,"
        " IBM AML kaggle) as out-of-bounds for evaluation purposes. Train +"
        " HPO on what's in this repo only.",
        "",
        "Test predictions are scored against the held-out labels in a private"
        " companion repo by the GraphTestbed server:"
        " <https://lanczos-graphtestbed.hf.space/>.",
    ]
    return "\n".join(lines)


def main() -> None:
    ap = argparse.ArgumentParser(prog="push_data")
    ap.add_argument("--repo", required=True,
                    help="HF dataset repo id, e.g. lanczos/graphtestbed-data")
    ap.add_argument("--src", required=True, type=Path,
                    help="Local source root (e.g. ~/.graphtestbed/data); "
                         "must contain a subdir per task with the 4 CSVs.")
    ap.add_argument("--tasks", nargs="+", default=None,
                    help="Limit to these task names (default: all in manifest)")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()

    cfg = yaml.safe_load(MANIFEST.read_text())
    tasks = args.tasks or sorted(cfg)
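    # Guard: reject task names not present in the manifest up front;
    # _readme() indexes cfg[t] and would otherwise die with a bare KeyError.
    unknown = [t for t in tasks if t not in cfg]
    if unknown:
        sys.exit("Unknown tasks (not in manifest): " + ", ".join(unknown))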
    src_root = args.src.expanduser()

    missing = []
    for t in tasks:
        for f in FILES:
            if not (src_root / t / f).exists():
                missing.append(f"{t}/{f}")
    if missing:
        sys.exit("Missing files:\n  " + "\n  ".join(missing))

    if args.dry_run:
        print(f"[dry-run] would push to {args.repo}:")
        for t in tasks:
            for f in FILES:
                p = (src_root / t / f).resolve()
                size_mb = p.stat().st_size / (1024 * 1024)
                print(f"  {t}/{f} ({size_mb:.1f} MB)")
        return

    # Only require credentials once we actually intend to push.
    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
    if not token:
        sys.exit("Set HF_TOKEN env var with write scope on the namespace.")
    api = HfApi(token=token)

    create_repo(args.repo, repo_type="dataset", token=token,
                exist_ok=True, private=False)

    # Write the README into a tempdir so we don't dirty the source root
    with tempfile.TemporaryDirectory() as td:
        readme = Path(td) / "README.md"
        readme.write_text(_readme(tasks, cfg))
        api.upload_file(
            path_or_fileobj=str(readme),
            path_in_repo="README.md",
            repo_id=args.repo,
            repo_type="dataset",
            commit_message="Update README (auto from push_data.py)",
        )
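
    # One commit per task: each upload_folder() call below creates its own
    # commit, which keeps the repo history readable per dataset.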
    for t in tasks:
        # upload_folder follows symlinks via the underlying open() calls.
        api.upload_folder(
            folder_path=str(src_root / t),
            path_in_repo=t,
            repo_id=args.repo,
            repo_type="dataset",
            allow_patterns=FILES,
            commit_message=f"Push {t} train/val/test features",
        )
        print(f"  ✓ {t}/")
    print("Done.")
| if __name__ == "__main__": | |
| main() | |