#!/usr/bin/env python3
"""Upload project data and code to separate Hugging Face repositories.

Recommended layout:
- data repo: Hugging Face Dataset repo, e.g. USER/decode-iblend-data
- code repo: Hugging Face Model repo, e.g. USER/decode-iblend-code

Usage:
    export HF_TOKEN=hf_xxx
    python3 scripts/upload_to_huggingface.py \
        --data-repo-id USER/decode-iblend-data \
        --code-repo-id USER/decode-iblend-code
"""

from __future__ import annotations

import argparse
import os
from pathlib import Path

# This script lives in <ROOT>/scripts/, so the project root is one level up
# from the script's directory; sibling data folders live next to the project.
ROOT = Path(__file__).resolve().parents[1]
DATA_MINING_ROOT = ROOT.parent

HF_BASE_URL = "https://huggingface.co"


def import_hf():
    """Return the ``HfApi`` class, importing ``huggingface_hub`` lazily.

    The import is deferred so that ``--dry-run`` works without the package.

    Raises:
        SystemExit: with install instructions if ``huggingface_hub`` is missing.
    """
    try:
        from huggingface_hub import HfApi
    except ImportError as exc:
        raise SystemExit(
            "Missing dependency: huggingface_hub\n"
            "Install with:\n"
            "  python3 -m pip install huggingface_hub\n"
        ) from exc
    return HfApi


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options.

    Args:
        argv: Argument list to parse; ``None`` (the default) reads ``sys.argv``.

    Returns:
        The parsed namespace (``data_repo_id``, ``code_repo_id``,
        ``data_repo_type``, ``code_repo_type``, ``token_file``, ``private``,
        ``dry_run``).
    """
    parser = argparse.ArgumentParser(
        description="Upload data and code to separate Hugging Face repos."
    )
    parser.add_argument(
        "--data-repo-id",
        required=True,
        help="Dataset repo id, e.g. username/decode-iblend-data",
    )
    parser.add_argument(
        "--code-repo-id",
        required=True,
        help="Code repo id, e.g. username/decode-iblend-code",
    )
    parser.add_argument(
        "--data-repo-type",
        default="dataset",
        choices=["dataset", "model"],
        help="HF type for data repo.",
    )
    parser.add_argument(
        "--code-repo-type",
        default="model",
        choices=["dataset", "model"],
        help="HF type for code repo.",
    )
    parser.add_argument(
        "--token-file",
        help="Path to a file containing the Hugging Face token. Overrides HF_TOKEN.",
    )
    parser.add_argument("--private", action="store_true", help="Create both repos as private.")
    parser.add_argument("--dry-run", action="store_true", help="Print files that would be uploaded.")
    return parser.parse_args(argv)


def iter_folder_files(local_root: Path, repo_root: str):
    """Yield ``(local_path, repo_path)`` for every visible file under *local_root*.

    Files are visited in sorted order. Any file whose path below *local_root*
    has a component starting with ``.`` is skipped, so hidden files and the
    contents of hidden directories (``.git``, ``.ipynb_checkpoints``, ...) are
    left out. ``repo_path`` always uses forward slashes, whatever the local OS
    separator is, because it names a path inside the remote repository.
    """
    for path in sorted(local_root.rglob("*")):
        if not path.is_file():
            continue
        relative = path.relative_to(local_root)
        if any(part.startswith(".") for part in relative.parts):
            continue
        yield path, f"{repo_root}/{relative.as_posix()}"


def iter_data_files():
    """Yield ``(local_path, repo_path)`` for every file in the known data folders.

    Missing folders are reported on stdout and skipped rather than treated as
    errors.
    """
    data_roots = [
        (ROOT / "energy_dataset", "energy_dataset"),
        (DATA_MINING_ROOT / "IIITD_occupancy_dataset", "IIITD_occupancy_dataset"),
        (DATA_MINING_ROOT / "iiitd_calender_schedule", "iiitd_calender_schedule"),
        (DATA_MINING_ROOT / "weather_comparison", "weather_comparison"),
    ]
    for local_root, repo_root in data_roots:
        if not local_root.exists():
            print(f"Skip missing data folder: {local_root}")
            continue
        yield from iter_folder_files(local_root, repo_root)


def iter_code_files():
    """Yield ``(local_path, repo_path)`` for each code file that exists locally.

    Entries that are missing (or are not regular files) are reported on stdout
    and skipped.
    """
    code_files = [
        (ROOT / "DECODE_Reimplementation.ipynb", "DECODE_Reimplementation.ipynb"),
        (ROOT / "HUGGINGFACE.md", "HUGGINGFACE.md"),
        (ROOT / "scripts" / "decode_reimplementation.py", "scripts/decode_reimplementation.py"),
        (ROOT / "scripts" / "upload_to_huggingface.py", "scripts/upload_to_huggingface.py"),
        (
            ROOT / "scripts" / "preprocess_and_eda_by_building.py",
            "scripts/preprocess_and_eda_by_building.py",
        ),
        (
            ROOT / "decode_reimplementation_outputs" / "README.md",
            "decode_reimplementation_outputs/README.md",
        ),
    ]
    for path, repo_path in code_files:
        # is_file() rather than exists(): a directory here cannot be uploaded
        # as a single file.
        if path.is_file():
            yield path, repo_path
        else:
            print(f"Skip missing code file: {path}")


def summarize(label: str, files: list[tuple[Path, str]]) -> None:
    """Print the file count, total size in GiB, and the local -> repo mapping."""
    total_size = sum(path.stat().st_size for path, _ in files)
    print(f"\n[{label}] files: {len(files)}")
    print(f"[{label}] total size: {total_size / (1024 ** 3):.3f} GiB")
    for path, repo_path in files:
        print(f"{path} -> {repo_path}")


def upload_files(api, repo_id: str, repo_type: str, files: list[tuple[Path, str]], label: str) -> None:
    """Upload *files* one at a time to *repo_id*, printing progress per file.

    Args:
        api: Object exposing ``upload_file`` with the keyword arguments used
            below (an ``HfApi`` instance in normal use).
        repo_id: Target repository id.
        repo_type: Repository type passed through to ``upload_file``.
        files: ``(local_path, repo_path)`` pairs to upload.
        label: Short tag used in the progress output.
    """
    total = len(files)
    for index, (path, repo_path) in enumerate(files, start=1):
        print(f"[{label} {index}/{total}] Uploading {repo_path}")
        api.upload_file(
            path_or_fileobj=str(path),
            path_in_repo=repo_path,
            repo_id=repo_id,
            repo_type=repo_type,
        )


def build_repo_url(repo_id: str, repo_type: str) -> str:
    """Return the web URL for a repo; dataset repos live under ``/datasets/``."""
    prefix = "datasets/" if repo_type == "dataset" else ""
    return f"{HF_BASE_URL}/{prefix}{repo_id}"


def main(argv: list[str] | None = None) -> int:
    """Run the uploader and return the process exit code.

    Args:
        argv: Argument list to parse; ``None`` (the default) reads ``sys.argv``.

    Raises:
        SystemExit: if no token is available for a real (non-dry-run) upload,
            or if ``huggingface_hub`` is not installed.
    """
    args = parse_args(argv)

    # --token-file takes precedence over the HF_TOKEN environment variable.
    token = os.environ.get("HF_TOKEN")
    if args.token_file:
        token = Path(args.token_file).read_text(encoding="utf-8").strip()
    if not token and not args.dry_run:
        if args.token_file:
            # Name the real culprit instead of blaming the environment variable.
            raise SystemExit(f"Token file is empty: {args.token_file}")
        raise SystemExit("HF_TOKEN is not set. Export it before uploading.")

    data_files = list(iter_data_files())
    code_files = list(iter_code_files())
    summarize("data", data_files)
    summarize("code", code_files)
    if args.dry_run:
        return 0

    HfApi = import_hf()
    api = HfApi(token=token)
    api.create_repo(
        repo_id=args.data_repo_id,
        repo_type=args.data_repo_type,
        private=args.private,
        exist_ok=True,
    )
    api.create_repo(
        repo_id=args.code_repo_id,
        repo_type=args.code_repo_type,
        private=args.private,
        exist_ok=True,
    )

    upload_files(api, args.data_repo_id, args.data_repo_type, data_files, "data")
    upload_files(api, args.code_repo_id, args.code_repo_type, code_files, "code")

    print("\nUploaded:")
    print(f"Data: {build_repo_url(args.data_repo_id, args.data_repo_type)}")
    print(f"Code: {build_repo_url(args.code_repo_id, args.code_repo_type)}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())