File size: 5,432 Bytes

626bb3f

#!/usr/bin/env python3
"""Upload project data and code to separate Hugging Face repositories.

Recommended layout:
- data repo: Hugging Face Dataset repo, e.g. USER/decode-iblend-data
- code repo: Hugging Face Model repo, e.g. USER/decode-iblend-code

Usage:
  export HF_TOKEN=hf_xxx
  python3 scripts/upload_to_huggingface.py \
    --data-repo-id USER/decode-iblend-data \
    --code-repo-id USER/decode-iblend-code
"""

from __future__ import annotations

import argparse
import os
from pathlib import Path


# Directory two levels above this file, i.e. the parent of the folder that
# contains this script (the usage text runs it as scripts/upload_to_huggingface.py).
ROOT = Path(__file__).resolve().parents[1]
# Parent of ROOT; iter_data_files() looks here for the datasets that are not
# stored inside ROOT itself.
DATA_MINING_ROOT = ROOT.parent


def import_hf():
    """Return the ``HfApi`` class from ``huggingface_hub``.

    The import happens at call time rather than at module import, so the
    script can be loaded without the package being installed.

    Raises:
        SystemExit: If ``huggingface_hub`` cannot be imported. The message
            carries the pip command that installs it, and the original
            ``ImportError`` is chained as the cause.
    """
    install_hint = (
        "Missing dependency: huggingface_hub\n"
        "Install with:\n"
        "  python3 -m pip install huggingface_hub\n"
    )
    try:
        from huggingface_hub import HfApi
    except ImportError as import_error:
        raise SystemExit(install_hint) from import_error
    else:
        return HfApi


def parse_args():
    """Parse the command-line options from ``sys.argv``.

    Returns:
        argparse.Namespace with ``data_repo_id``, ``code_repo_id``,
        ``data_repo_type``, ``code_repo_type``, ``token_file``, ``private``
        and ``dry_run``.
    """
    repo_types = ["dataset", "model"]
    # One (flag, add_argument keyword arguments) entry per option; the order
    # here is the order in which the options appear in --help.
    option_table = (
        ("--data-repo-id", {"required": True, "help": "Dataset repo id, e.g. username/decode-iblend-data"}),
        ("--code-repo-id", {"required": True, "help": "Code repo id, e.g. username/decode-iblend-code"}),
        ("--data-repo-type", {"default": "dataset", "choices": repo_types, "help": "HF type for data repo."}),
        ("--code-repo-type", {"default": "model", "choices": repo_types, "help": "HF type for code repo."}),
        ("--token-file", {"help": "Path to a file containing the Hugging Face token. Overrides HF_TOKEN."}),
        ("--private", {"action": "store_true", "help": "Create both repos as private."}),
        ("--dry-run", {"action": "store_true", "help": "Print files that would be uploaded."}),
    )

    cli = argparse.ArgumentParser(description="Upload data and code to separate Hugging Face repos.")
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()


def iter_data_files():
    """Yield ``(local_path, path_in_repo)`` pairs for every data file to upload.

    Each configured data folder is walked recursively in sorted order. A
    folder that does not exist is reported on stdout and skipped. Hidden
    entries are excluded: any file whose own name, or any directory between
    it and the data folder, starts with ``"."``.

    Yields:
        tuple[Path, str]: The file on disk and its destination path inside
        the repo. The destination always uses ``/`` as separator, whatever
        the local platform uses.
    """
    data_roots = [
        (ROOT / "energy_dataset", "energy_dataset"),
        (DATA_MINING_ROOT / "IIITD_occupancy_dataset", "IIITD_occupancy_dataset"),
        (DATA_MINING_ROOT / "iiitd_calender_schedule", "iiitd_calender_schedule"),
        (DATA_MINING_ROOT / "weather_comparison", "weather_comparison"),
    ]
    for local_root, repo_root in data_roots:
        if not local_root.exists():
            print(f"Skip missing data folder: {local_root}")
            continue
        for path in sorted(local_root.rglob("*")):
            relative = path.relative_to(local_root)
            # Checking every component (not just the file name) also drops
            # files that sit inside hidden directories such as .git or
            # .ipynb_checkpoints.
            if any(part.startswith(".") for part in relative.parts):
                continue
            if path.is_file():
                # as_posix() keeps the repo path "/"-separated on Windows,
                # where str(Path) would produce backslashes.
                yield path, (Path(repo_root) / relative).as_posix()


def iter_code_files():
    """Yield ``(local_path, path_in_repo)`` for each code file that exists.

    The files are a fixed list of paths relative to ``ROOT``; the same
    relative path is used as the destination inside the repo. Entries that
    are missing on disk are reported on stdout and skipped.
    """
    repo_paths = (
        "DECODE_Reimplementation.ipynb",
        "HUGGINGFACE.md",
        "scripts/decode_reimplementation.py",
        "scripts/upload_to_huggingface.py",
        "scripts/preprocess_and_eda_by_building.py",
        "decode_reimplementation_outputs/README.md",
    )
    for repo_path in repo_paths:
        local_path = ROOT / repo_path
        if not local_path.exists():
            print(f"Skip missing code file: {local_path}")
            continue
        yield local_path, repo_path


def summarize(label: str, files: list[tuple[Path, str]]) -> None:
    """Print a short report for one group of files.

    The report is a blank line, the file count, the combined on-disk size in
    GiB (three decimals), and then one ``local -> repo`` line per file.

    Args:
        label: Tag shown in square brackets on the two header lines.
        files: ``(local_path, path_in_repo)`` pairs; every local path is
            stat-ed to obtain its size.
    """
    byte_count = 0
    mapping_lines = []
    for local_path, destination in files:
        byte_count += local_path.stat().st_size
        mapping_lines.append(f"{local_path} -> {destination}")

    size_gib = byte_count / (1024 ** 3)
    report = [
        f"\n[{label}] files: {len(files)}",
        f"[{label}] total size: {size_gib:.3f} GiB",
        *mapping_lines,
    ]
    print("\n".join(report))


def upload_files(api, repo_id: str, repo_type: str, files: list[tuple[Path, str]], label: str) -> None:
    """Upload every file in ``files`` to one repo, one ``upload_file`` call each.

    A progress line ``[label i/total] Uploading <path_in_repo>`` is printed
    before each call.

    Args:
        api: Object exposing ``upload_file(path_or_fileobj=..., path_in_repo=...,
            repo_id=..., repo_type=...)``.
        repo_id: Target repo id, passed through to ``upload_file``.
        repo_type: Target repo type, passed through to ``upload_file``.
        files: ``(local_path, path_in_repo)`` pairs, uploaded in list order.
        label: Tag shown in the progress lines.
    """
    total = len(files)
    for position, entry in enumerate(files, 1):
        local_path, destination = entry
        print(f"[{label} {position}/{total}] Uploading {destination}")
        request = {
            "path_or_fileobj": str(local_path),
            "path_in_repo": destination,
            "repo_id": repo_id,
            "repo_type": repo_type,
        }
        api.upload_file(**request)


def _repo_url(repo_id: str, repo_type: str) -> str:
    """Return the huggingface.co URL printed for a repo of the given type."""
    # "dataset" repos get a "datasets/" path prefix; any other type gets none.
    prefix = "datasets/" if repo_type == "dataset" else ""
    return f"https://huggingface.co/{prefix}{repo_id}"


def main() -> int:
    """Run the upload: resolve the token, list the files, create repos, upload.

    The token comes from ``HF_TOKEN`` unless ``--token-file`` is given, in
    which case the file's stripped content replaces it. With ``--dry-run``
    only the file summaries are printed and nothing is uploaded (no token is
    required in that mode).

    Returns:
        0 on success.

    Raises:
        SystemExit: If the token file cannot be read, if no token is available
            for a real upload, or (via ``import_hf``) if ``huggingface_hub``
            is not installed.
    """
    args = parse_args()
    token = os.environ.get("HF_TOKEN")
    if args.token_file:
        try:
            token = Path(args.token_file).read_text(encoding="utf-8").strip()
        except (OSError, UnicodeDecodeError) as exc:
            # Exit with a readable message instead of a traceback when the
            # path is wrong, unreadable, or not valid UTF-8.
            raise SystemExit(f"Cannot read token file {args.token_file}: {exc}") from exc
    if not token and not args.dry_run:
        if args.token_file:
            # The file overrides HF_TOKEN, so blaming the environment
            # variable here would point the user at the wrong thing.
            raise SystemExit(f"Token file is empty: {args.token_file}")
        raise SystemExit("HF_TOKEN is not set. Export it before uploading.")

    data_files = list(iter_data_files())
    code_files = list(iter_code_files())
    summarize("data", data_files)
    summarize("code", code_files)

    if args.dry_run:
        return 0

    HfApi = import_hf()
    api = HfApi(token=token)
    # exist_ok=True lets the script be re-run against repos that already exist.
    api.create_repo(repo_id=args.data_repo_id, repo_type=args.data_repo_type, private=args.private, exist_ok=True)
    api.create_repo(repo_id=args.code_repo_id, repo_type=args.code_repo_type, private=args.private, exist_ok=True)
    upload_files(api, args.data_repo_id, args.data_repo_type, data_files, "data")
    upload_files(api, args.code_repo_id, args.code_repo_type, code_files, "code")

    print("\nUploaded:")
    print(f"Data: {_repo_url(args.data_repo_id, args.data_repo_type)}")
    print(f"Code: {_repo_url(args.code_repo_id, args.code_repo_type)}")
    return 0


# Run main() only when executed as a script; its return value becomes the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())