| |
| """Upload project data and code to separate Hugging Face repositories. |
| |
| Recommended layout: |
| - data repo: Hugging Face Dataset repo, e.g. USER/decode-iblend-data |
| - code repo: Hugging Face Model repo, e.g. USER/decode-iblend-code |
| |
| Usage: |
| export HF_TOKEN=hf_xxx |
| python3 scripts/upload_to_huggingface.py \ |
| --data-repo-id USER/decode-iblend-data \ |
| --code-repo-id USER/decode-iblend-code |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import os |
| from pathlib import Path |
|
|
|
|
# Absolute, symlink-resolved location of this script.
_SCRIPT_PATH = Path(__file__).resolve()

# Project root: two levels above this file (parents[0] is the script's folder).
ROOT = _SCRIPT_PATH.parents[1]

# Directory containing the project root; the shared dataset folders are
# looked up relative to it.
DATA_MINING_ROOT = ROOT.parent
|
|
|
|
def import_hf():
    """Import and return the ``HfApi`` class from ``huggingface_hub``.

    The import is deferred to call time so the rest of the script can run
    without the package installed.

    Returns:
        The ``huggingface_hub.HfApi`` class (not an instance).

    Raises:
        SystemExit: If ``huggingface_hub`` cannot be imported. The message
            contains the pip command to install it, and the original
            ``ImportError`` is chained as the cause.
    """
    install_hint = (
        "Missing dependency: huggingface_hub\n"
        "Install with:\n"
        " python3 -m pip install huggingface_hub\n"
    )
    try:
        from huggingface_hub import HfApi as hf_api_class
    except ImportError as import_error:
        raise SystemExit(install_hint) from import_error
    else:
        return hf_api_class
|
|
|
|
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the uploader's command-line options.

    Args:
        argv: Optional sequence of argument strings to parse instead of the
            process arguments. When ``None`` (the default) argparse reads
            ``sys.argv[1:]``, so existing ``parse_args()`` callers are
            unaffected.

    Returns:
        A namespace with the attributes ``data_repo_id``, ``code_repo_id``,
        ``data_repo_type``, ``code_repo_type``, ``token_file``, ``private``
        and ``dry_run``.

    Raises:
        SystemExit: Raised by argparse on invalid or missing arguments (and
            for ``--help``).
    """
    parser = argparse.ArgumentParser(description="Upload data and code to separate Hugging Face repos.")
    parser.add_argument("--data-repo-id", required=True, help="Dataset repo id, e.g. username/decode-iblend-data")
    parser.add_argument("--code-repo-id", required=True, help="Code repo id, e.g. username/decode-iblend-code")
    parser.add_argument("--data-repo-type", default="dataset", choices=["dataset", "model"], help="HF type for data repo.")
    parser.add_argument("--code-repo-type", default="model", choices=["dataset", "model"], help="HF type for code repo.")
    parser.add_argument("--token-file", help="Path to a file containing the Hugging Face token. Overrides HF_TOKEN.")
    parser.add_argument("--private", action="store_true", help="Create both repos as private.")
    parser.add_argument("--dry-run", action="store_true", help="Print files that would be uploaded.")
    # Passing None makes argparse fall back to sys.argv[1:].
    return parser.parse_args(argv)
|
|
|
|
def iter_data_files():
    """Yield ``(local_path, path_in_repo)`` pairs for every data file to upload.

    Walks each configured data folder recursively, in sorted order. A folder
    that does not exist is reported on stdout and skipped. Hidden entries are
    excluded: any file whose name, or any directory component below the data
    folder, starts with ``"."`` is not yielded.

    Yields:
        Tuples of (``Path`` of the local file, destination path inside the
        repository as a forward-slash separated string).
    """
    data_roots = [
        (ROOT / "energy_dataset", "energy_dataset"),
        (DATA_MINING_ROOT / "IIITD_occupancy_dataset", "IIITD_occupancy_dataset"),
        (DATA_MINING_ROOT / "iiitd_calender_schedule", "iiitd_calender_schedule"),
        (DATA_MINING_ROOT / "weather_comparison", "weather_comparison"),
    ]
    for local_root, repo_root in data_roots:
        if not local_root.is_dir():
            print(f"Skip missing data folder: {local_root}")
            continue
        for path in sorted(local_root.rglob("*")):
            if not path.is_file():
                continue
            relative = path.relative_to(local_root)
            # Check every component, not just the file name, so the contents
            # of hidden directories (.git, .ipynb_checkpoints, ...) are
            # skipped as well.
            if any(part.startswith(".") for part in relative.parts):
                continue
            # as_posix() keeps the repo path forward-slash separated even
            # when the script runs on Windows.
            yield path, (Path(repo_root) / relative).as_posix()
|
|
|
|
def iter_code_files():
    """Yield ``(local_path, path_in_repo)`` pairs for the code files to upload.

    The set of files is fixed. Each entry is resolved relative to ``ROOT``;
    entries that do not exist locally are reported on stdout and skipped.

    Yields:
        Tuples of (``Path`` of the local file, destination path inside the
        repository).
    """
    # Repository-relative destinations; the local location mirrors the same
    # layout underneath ROOT.
    repo_paths = (
        "DECODE_Reimplementation.ipynb",
        "HUGGINGFACE.md",
        "scripts/decode_reimplementation.py",
        "scripts/upload_to_huggingface.py",
        "scripts/preprocess_and_eda_by_building.py",
        "decode_reimplementation_outputs/README.md",
    )
    for repo_path in repo_paths:
        local_path = ROOT.joinpath(*repo_path.split("/"))
        if not local_path.exists():
            print(f"Skip missing code file: {local_path}")
            continue
        yield local_path, repo_path
|
|
|
|
def summarize(label: str, files: list[tuple[Path, str]]) -> None:
    """Print a summary of the files selected for one upload target.

    Writes the file count, the combined on-disk size in GiB (three decimal
    places) and one ``local -> repo`` line per file to stdout.

    Args:
        label: Short tag shown in square brackets on the header lines.
        files: ``(local_path, path_in_repo)`` pairs to describe.

    Raises:
        OSError: If a listed file cannot be stat'ed.
    """
    # Sizes are collected before anything is printed, so a stat failure
    # produces no partial output.
    byte_count = 0
    for local_path, _ in files:
        byte_count += local_path.stat().st_size
    size_gib = byte_count / (1024 ** 3)

    print(f"\n[{label}] files: {len(files)}")
    print(f"[{label}] total size: {size_gib:.3f} GiB")
    for local_path, destination in files:
        print(f"{local_path} -> {destination}")
|
|
|
|
def upload_files(api, repo_id: str, repo_type: str, files: list[tuple[Path, str]], label: str) -> None:
    """Upload each file to the given repository, one ``upload_file`` call per file.

    A progress line is printed to stdout before every upload.

    Args:
        api: Object exposing an ``upload_file`` method that accepts the
            keyword arguments ``path_or_fileobj``, ``path_in_repo``,
            ``repo_id`` and ``repo_type``.
        repo_id: Target repository id.
        repo_type: Target repository type, passed through unchanged.
        files: ``(local_path, path_in_repo)`` pairs to upload, in order.
        label: Short tag shown in the progress lines.
    """
    total = len(files)
    for position, entry in enumerate(files, start=1):
        local_path, destination = entry
        print(f"[{label} {position}/{total}] Uploading {destination}")
        request = {
            # The local path is handed over as a plain string.
            "path_or_fileobj": str(local_path),
            "path_in_repo": destination,
            "repo_id": repo_id,
            "repo_type": repo_type,
        }
        api.upload_file(**request)
|
|
|
|
def _read_token(token_file):
    """Return the token from *token_file* if given, else from ``HF_TOKEN``."""
    if not token_file:
        return os.environ.get("HF_TOKEN")
    try:
        return Path(token_file).read_text(encoding="utf-8").strip()
    except (OSError, UnicodeDecodeError) as exc:
        # Exit with a readable message instead of a raw traceback.
        raise SystemExit(f"Cannot read token file {token_file}: {exc}") from exc


def _repo_url(repo_id: str, repo_type: str) -> str:
    """Return the huggingface.co URL for a repo; dataset repos live under ``datasets/``."""
    prefix = "datasets/" if repo_type == "dataset" else ""
    return f"https://huggingface.co/{prefix}{repo_id}"


def main() -> int:
    """Run the uploader: collect files, print a summary, then upload.

    The token is taken from ``--token-file`` when given, otherwise from the
    ``HF_TOKEN`` environment variable. With ``--dry-run`` only the summary is
    printed; no token is required and nothing is created or uploaded.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: If the token file cannot be read, or if no token is
            available for a real (non dry-run) upload.
    """
    args = parse_args()
    token = _read_token(args.token_file)
    if not token and not args.dry_run:
        # Name both token sources: an empty --token-file also lands here.
        raise SystemExit(
            "No Hugging Face token found. Export HF_TOKEN or pass --token-file before uploading."
        )

    data_files = list(iter_data_files())
    code_files = list(iter_code_files())
    summarize("data", data_files)
    summarize("code", code_files)

    if args.dry_run:
        return 0

    # Imported only now so that --dry-run works without huggingface_hub.
    HfApi = import_hf()
    api = HfApi(token=token)
    # exist_ok=True lets the script be re-run against existing repos.
    api.create_repo(repo_id=args.data_repo_id, repo_type=args.data_repo_type, private=args.private, exist_ok=True)
    api.create_repo(repo_id=args.code_repo_id, repo_type=args.code_repo_type, private=args.private, exist_ok=True)
    upload_files(api, args.data_repo_id, args.data_repo_type, data_files, "data")
    upload_files(api, args.code_repo_id, args.code_repo_type, code_files, "code")

    print("\nUploaded:")
    print(f"Data: {_repo_url(args.data_repo_id, args.data_repo_type)}")
    print(f"Code: {_repo_url(args.code_repo_id, args.code_repo_type)}")
    return 0
|
|
|
|
# Script entry point: run main() and use its return value as the process
# exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|