#!/usr/bin/env python3
# Source: decode-iblend-code/scripts/upload_to_huggingface.py
# (uploaded by HoangTrungNguyen with huggingface_hub, revision 626bb3f).
"""Upload project data and code to separate Hugging Face repositories.
Recommended layout:
- data repo: Hugging Face Dataset repo, e.g. USER/decode-iblend-data
- code repo: Hugging Face Model repo, e.g. USER/decode-iblend-code
Usage:
export HF_TOKEN=hf_xxx
python3 scripts/upload_to_huggingface.py \
--data-repo-id USER/decode-iblend-data \
--code-repo-id USER/decode-iblend-code
"""
from __future__ import annotations
import argparse
import os
from pathlib import Path
# Project root: the parent of the directory that contains this file
# (this script is expected to live at <project>/scripts/).
ROOT = Path(__file__).resolve().parents[1]
# Directory that contains the project root; several data folders are looked up
# here as siblings of the project directory.
DATA_MINING_ROOT = ROOT.parent
def import_hf():
    """Return the ``HfApi`` class from ``huggingface_hub``, imported on demand.

    The import is deferred to call time so the rest of the script (for
    example ``--dry-run``) works without the package installed.

    Raises:
        SystemExit: with installation instructions when ``huggingface_hub``
            cannot be imported. The original ``ImportError`` is chained as
            the cause.
    """
    install_hint = (
        "Missing dependency: huggingface_hub\n"
        "Install with:\n"
        " python3 -m pip install huggingface_hub\n"
    )
    try:
        from huggingface_hub import HfApi
    except ImportError as import_error:
        raise SystemExit(install_hint) from import_error
    else:
        return HfApi
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the upload script.

    Args:
        argv: Argument strings to parse (without the program name). When
            ``None`` (the default) ``sys.argv[1:]`` is used, which is the
            behaviour existing callers rely on.

    Returns:
        A namespace with ``data_repo_id``, ``code_repo_id``,
        ``data_repo_type``, ``code_repo_type``, ``token_file``, ``private``
        and ``dry_run`` attributes.

    Raises:
        SystemExit: raised by ``argparse`` on invalid or missing arguments
            and for ``--help``.
    """
    parser = argparse.ArgumentParser(description="Upload data and code to separate Hugging Face repos.")
    parser.add_argument("--data-repo-id", required=True, help="Dataset repo id, e.g. username/decode-iblend-data")
    parser.add_argument("--code-repo-id", required=True, help="Code repo id, e.g. username/decode-iblend-code")
    parser.add_argument("--data-repo-type", default="dataset", choices=["dataset", "model"], help="HF type for data repo.")
    parser.add_argument("--code-repo-type", default="model", choices=["dataset", "model"], help="HF type for code repo.")
    parser.add_argument("--token-file", help="Path to a file containing the Hugging Face token. Overrides HF_TOKEN.")
    parser.add_argument("--private", action="store_true", help="Create both repos as private.")
    parser.add_argument("--dry-run", action="store_true", help="Print files that would be uploaded.")
    # Passing None makes argparse fall back to sys.argv[1:].
    return parser.parse_args(argv)
def iter_data_files(data_roots: list[tuple[Path, str]] | None = None):
    """Yield ``(local_path, path_in_repo)`` pairs for every data file to upload.

    Each root folder is walked recursively in sorted order. Folders that do
    not exist are reported on stdout and skipped. Any entry whose path below
    the root has a component starting with ``.`` is skipped, so hidden files
    and files inside hidden directories are both excluded.

    Args:
        data_roots: Optional ``(local_folder, repo_folder)`` pairs to walk.
            When ``None`` (the default) the project's standard data folders
            under ``ROOT`` and ``DATA_MINING_ROOT`` are used.

    Yields:
        Tuples of the local file path and its destination path inside the
        repository. The destination always uses forward slashes.
    """
    if data_roots is None:
        data_roots = [
            (ROOT / "energy_dataset", "energy_dataset"),
            (DATA_MINING_ROOT / "IIITD_occupancy_dataset", "IIITD_occupancy_dataset"),
            (DATA_MINING_ROOT / "iiitd_calender_schedule", "iiitd_calender_schedule"),
            (DATA_MINING_ROOT / "weather_comparison", "weather_comparison"),
        ]
    for local_root, repo_root in data_roots:
        local_root = Path(local_root)
        if not local_root.exists():
            print(f"Skip missing data folder: {local_root}")
            continue
        for path in sorted(local_root.rglob("*")):
            relative = path.relative_to(local_root)
            # Check every component, not just the file name, so content of
            # hidden directories (e.g. .ipynb_checkpoints) is not uploaded.
            if any(part.startswith(".") for part in relative.parts):
                continue
            if path.is_file():
                # as_posix() keeps the repo path '/'-separated on Windows,
                # where str(Path) would produce backslashes.
                yield path, (Path(repo_root) / relative).as_posix()
def iter_code_files():
    """Yield ``(local_path, path_in_repo)`` pairs for the code files to upload.

    The list of files is fixed. Each one is looked up relative to ``ROOT``
    and keeps the same relative path inside the repository. Files that do
    not exist locally are reported on stdout and skipped.
    """
    relative_paths = (
        "DECODE_Reimplementation.ipynb",
        "HUGGINGFACE.md",
        "scripts/decode_reimplementation.py",
        "scripts/upload_to_huggingface.py",
        "scripts/preprocess_and_eda_by_building.py",
        "decode_reimplementation_outputs/README.md",
    )
    for repo_path in relative_paths:
        local_path = ROOT / repo_path
        if not local_path.exists():
            print(f"Skip missing code file: {local_path}")
            continue
        yield local_path, repo_path
def summarize(label: str, files: list[tuple[Path, str]]) -> None:
    """Print a short report of the files selected for one upload target.

    The report has the number of files, their combined size in GiB, and one
    ``local -> repo`` line per file.

    Args:
        label: Tag shown in square brackets on the summary lines.
        files: ``(local_path, path_in_repo)`` pairs. Every local path is
            ``stat``-ed to obtain its size, before anything is printed.
    """
    byte_count = 0
    for local_path, _ in files:
        byte_count += local_path.stat().st_size
    size_gib = byte_count / (1024 ** 3)

    print(f"\n[{label}] files: {len(files)}")
    print(f"[{label}] total size: {size_gib:.3f} GiB")
    for local_path, destination in files:
        print(f"{local_path} -> {destination}")
def upload_files(api, repo_id: str, repo_type: str, files: list[tuple[Path, str]], label: str) -> None:
    """Upload each file to the given repository, one ``upload_file`` call per file.

    A progress line is printed before every upload.

    Args:
        api: Object exposing ``upload_file(path_or_fileobj=..., path_in_repo=...,
            repo_id=..., repo_type=...)``.
        repo_id: Target repository id.
        repo_type: Target repository type, passed through unchanged.
        files: ``(local_path, path_in_repo)`` pairs to upload, in order.
        label: Tag shown in the progress lines.
    """
    total = len(files)
    for position, (local_path, destination) in enumerate(files, start=1):
        print(f"[{label} {position}/{total}] Uploading {destination}")
        api.upload_file(
            path_or_fileobj=str(local_path),
            path_in_repo=destination,
            repo_id=repo_id,
            repo_type=repo_type,
        )
def main() -> int:
    """Run the upload workflow and return the process exit code.

    Steps: parse arguments, resolve the token, collect and summarize the
    data and code files, then (unless ``--dry-run``) create both
    repositories, upload the files and print the resulting URLs.

    Returns:
        ``0`` on success, including dry runs.

    Raises:
        SystemExit: when no token is available for a real upload, or when
            ``huggingface_hub`` is not installed.
    """
    args = parse_args()

    # A token file, when given, takes precedence over the HF_TOKEN variable.
    if args.token_file:
        token = Path(args.token_file).read_text(encoding="utf-8").strip()
    else:
        token = os.environ.get("HF_TOKEN")
    if not (token or args.dry_run):
        raise SystemExit("HF_TOKEN is not set. Export it before uploading.")

    data_files = list(iter_data_files())
    code_files = list(iter_code_files())
    summarize("data", data_files)
    summarize("code", code_files)
    if args.dry_run:
        return 0

    api = import_hf()(token=token)

    # (label, heading, repo id, repo type, files) for each upload target.
    targets = (
        ("data", "Data", args.data_repo_id, args.data_repo_type, data_files),
        ("code", "Code", args.code_repo_id, args.code_repo_type, code_files),
    )
    # Create both repositories before uploading anything.
    for _, _, repo_id, repo_type, _ in targets:
        api.create_repo(repo_id=repo_id, repo_type=repo_type, private=args.private, exist_ok=True)
    for label, _, repo_id, repo_type, files in targets:
        upload_files(api, repo_id, repo_type, files, label)

    print("\nUploaded:")
    for _, heading, repo_id, repo_type, _ in targets:
        # Dataset repos are printed under /datasets/; any other type at the root.
        url_prefix = "datasets/" if repo_type == "dataset" else ""
        print(f"{heading}: https://huggingface.co/{url_prefix}{repo_id}")
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_code = main()
    raise SystemExit(exit_code)