decode-iblend-code / scripts /upload_to_huggingface.py

Upload scripts/upload_to_huggingface.py with huggingface_hub

626bb3f verified 4 days ago

5.43 kB

	#!/usr/bin/env python3
	"""Upload project data and code to separate Hugging Face repositories.

	Recommended layout:
	- data repo: Hugging Face Dataset repo, e.g. USER/decode-iblend-data
	- code repo: Hugging Face Model repo, e.g. USER/decode-iblend-code

	Usage:
	export HF_TOKEN=hf_xxx
	python3 scripts/upload_to_huggingface.py \
	--data-repo-id USER/decode-iblend-data \
	--code-repo-id USER/decode-iblend-code
	"""

	from __future__ import annotations

	import argparse
	import os
	from pathlib import Path


	# Project root: two levels above this file's resolved path (the script is
	# expected to live in <ROOT>/scripts/, as the Usage example and
	# iter_code_files() indicate).
	ROOT = Path(__file__).resolve().parents[1]
	# Directory that contains the project root; iter_data_files() looks here for
	# the dataset folders that sit next to the project.
	DATA_MINING_ROOT = ROOT.parent


	def import_hf():
	    """Return the ``HfApi`` class, importing ``huggingface_hub`` on demand.

	    The import happens at call time rather than at module load, so the
	    script can be imported without the package installed.

	    Raises:
	        SystemExit: if ``huggingface_hub`` cannot be imported. The message
	            contains installation instructions and the original
	            ``ImportError`` is attached as the cause.
	    """
	    install_hint = (
	        "Missing dependency: huggingface_hub\n"
	        "Install with:\n"
	        " python3 -m pip install huggingface_hub\n"
	    )
	    try:
	        from huggingface_hub import HfApi as hf_api_class
	    except ImportError as exc:
	        raise SystemExit(install_hint) from exc
	    else:
	        return hf_api_class


	def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
	    """Parse the command-line options of the upload script.

	    Args:
	        argv: Optional list of argument strings, without the program name.
	            When ``None`` (the default) argparse reads ``sys.argv[1:]``,
	            which is what the script did before this parameter existed.
	            Passing a list allows programmatic use and testing.

	    Returns:
	        A namespace with the attributes ``data_repo_id``, ``code_repo_id``,
	        ``data_repo_type``, ``code_repo_type``, ``token_file``, ``private``
	        and ``dry_run``.

	    Raises:
	        SystemExit: raised by argparse for ``--help`` and for missing or
	            invalid arguments.
	    """
	    parser = argparse.ArgumentParser(description="Upload data and code to separate Hugging Face repos.")

	    # Target repositories (both required).
	    parser.add_argument("--data-repo-id", required=True, help="Dataset repo id, e.g. username/decode-iblend-data")
	    parser.add_argument("--code-repo-id", required=True, help="Code repo id, e.g. username/decode-iblend-code")

	    # Repository kinds; the defaults follow the layout recommended in the module docstring.
	    parser.add_argument("--data-repo-type", default="dataset", choices=["dataset", "model"], help="HF type for data repo.")
	    parser.add_argument("--code-repo-type", default="model", choices=["dataset", "model"], help="HF type for code repo.")

	    # Authentication and run-mode switches.
	    parser.add_argument("--token-file", help="Path to a file containing the Hugging Face token. Overrides HF_TOKEN.")
	    parser.add_argument("--private", action="store_true", help="Create both repos as private.")
	    parser.add_argument("--dry-run", action="store_true", help="Print files that would be uploaded.")

	    return parser.parse_args(argv)


	def iter_data_files(data_roots=None):
	    """Yield ``(local_path, repo_path)`` pairs for every data file to upload.

	    Each data folder is walked recursively in sorted order. Folders that do
	    not exist are reported on stdout and skipped.

	    Args:
	        data_roots: Optional iterable of ``(local_folder, repo_folder)``
	            pairs, where ``local_folder`` is a path on disk and
	            ``repo_folder`` is the folder name to use inside the repo.
	            When ``None`` (the default) the project's own dataset folders
	            under ``ROOT`` and ``DATA_MINING_ROOT`` are used.

	    Yields:
	        Tuples of the local file ``Path`` and its destination path inside
	        the repository as a string with forward slashes.
	    """
	    if data_roots is None:
	        data_roots = [
	            (ROOT / "energy_dataset", "energy_dataset"),
	            (DATA_MINING_ROOT / "IIITD_occupancy_dataset", "IIITD_occupancy_dataset"),
	            (DATA_MINING_ROOT / "iiitd_calender_schedule", "iiitd_calender_schedule"),
	            (DATA_MINING_ROOT / "weather_comparison", "weather_comparison"),
	        ]
	    for local_root, repo_root in data_roots:
	        local_root = Path(local_root)
	        if not local_root.exists():
	            print(f"Skip missing data folder: {local_root}")
	            continue
	        for path in sorted(local_root.rglob("*")):
	            if not path.is_file():
	                continue
	            relative = path.relative_to(local_root)
	            # Skip hidden files AND anything below a hidden directory
	            # (e.g. .git/ or .ipynb_checkpoints/); checking only the file
	            # name would let those directory contents through.
	            if any(part.startswith(".") for part in relative.parts):
	                continue
	            # Repo paths must use "/" regardless of the local OS separator.
	            yield path, (Path(repo_root) / relative).as_posix()


	def iter_code_files():
	    """Yield ``(local_path, repo_path)`` pairs for the code files to upload.

	    The set of files is a fixed list of paths relative to ``ROOT``; the same
	    relative path is used as the destination inside the repository. Files
	    that are not present on disk are reported on stdout and skipped.
	    """
	    repo_paths = (
	        "DECODE_Reimplementation.ipynb",
	        "HUGGINGFACE.md",
	        "scripts/decode_reimplementation.py",
	        "scripts/upload_to_huggingface.py",
	        "scripts/preprocess_and_eda_by_building.py",
	        "decode_reimplementation_outputs/README.md",
	    )
	    for repo_path in repo_paths:
	        local_path = ROOT / repo_path
	        if not local_path.exists():
	            print(f"Skip missing code file: {local_path}")
	            continue
	        yield local_path, repo_path


	def summarize(label: str, files: list[tuple[Path, str]]) -> None:
	    """Print a summary of one upload set to stdout.

	    The output is a blank line, the file count, the combined size in GiB
	    (three decimals) and then one ``local -> repo`` line per file.

	    Args:
	        label: Tag shown in square brackets on the header lines.
	        files: ``(local_path, repo_path)`` pairs; every local path is
	            stat'ed to obtain its size.
	    """
	    # Sizes are collected before anything is printed, so a failing stat()
	    # leaves no partial report behind.
	    byte_count = 0
	    for local_path, _ in files:
	        byte_count += local_path.stat().st_size
	    size_gib = byte_count / (1024 ** 3)

	    report = [
	        f"\n[{label}] files: {len(files)}",
	        f"[{label}] total size: {size_gib:.3f} GiB",
	    ]
	    report.extend(f"{local_path} -> {remote_path}" for local_path, remote_path in files)
	    print(*report, sep="\n")

	def upload_files(api, repo_id: str, repo_type: str, files: list[tuple[Path, str]], label: str) -> None:
	    """Upload each file to the given repository, one ``upload_file`` call per file.

	    A progress line ``[label i/total] Uploading <repo_path>`` is printed
	    before every upload.

	    Args:
	        api: Object exposing ``upload_file(path_or_fileobj=..., path_in_repo=...,
	            repo_id=..., repo_type=...)``; ``main`` passes an ``HfApi`` instance.
	        repo_id: Destination repository id.
	        repo_type: Destination repository type, forwarded unchanged.
	        files: ``(local_path, repo_path)`` pairs to upload, in order.
	        label: Tag used in the progress output.
	    """
	    total = len(files)
	    for position, entry in enumerate(files, start=1):
	        local_path, remote_path = entry
	        print(f"[{label} {position}/{total}] Uploading {remote_path}")
	        # The local path is passed as a plain string, not a Path object.
	        request = {
	            "path_or_fileobj": str(local_path),
	            "path_in_repo": remote_path,
	            "repo_id": repo_id,
	            "repo_type": repo_type,
	        }
	        api.upload_file(**request)


	def main() -> int:
	    """Run the upload workflow and return the process exit code.

	    Steps: parse the arguments, resolve the token, collect and summarize
	    the data and code files, then (unless ``--dry-run`` was given) create
	    both repositories, upload the files and print the resulting URLs.

	    Returns:
	        ``0`` on success, including after a dry run.

	    Raises:
	        SystemExit: if no token is available and this is not a dry run, or
	            (via ``import_hf``) if ``huggingface_hub`` is not installed.
	    """
	    args = parse_args()

	    # A token file, when given, takes precedence over the HF_TOKEN variable.
	    if args.token_file:
	        token = Path(args.token_file).read_text(encoding="utf-8").strip()
	    else:
	        token = os.environ.get("HF_TOKEN")
	    if not (token or args.dry_run):
	        raise SystemExit("HF_TOKEN is not set. Export it before uploading.")

	    # Both listings are materialized before any summary is printed.
	    data_files = list(iter_data_files())
	    code_files = list(iter_code_files())
	    for label, files in (("data", data_files), ("code", code_files)):
	        summarize(label, files)

	    if args.dry_run:
	        return 0

	    # huggingface_hub is only needed from this point on.
	    api = import_hf()(token=token)
	    targets = (
	        ("data", "Data", args.data_repo_id, args.data_repo_type, data_files),
	        ("code", "Code", args.code_repo_id, args.code_repo_type, code_files),
	    )

	    # Create both repositories first, then upload data followed by code.
	    for _, _, repo_id, repo_type, _ in targets:
	        api.create_repo(repo_id=repo_id, repo_type=repo_type, private=args.private, exist_ok=True)
	    for label, _, repo_id, repo_type, files in targets:
	        upload_files(api, repo_id, repo_type, files, label)

	    print("\nUploaded:")
	    for _, title, repo_id, repo_type, _ in targets:
	        # Dataset repos get a "datasets/" URL segment; model repos get none.
	        url_prefix = "datasets/" if repo_type == "dataset" else ""
	        print(f"{title}: https://huggingface.co/{url_prefix}{repo_id}")
	    return 0


	if __name__ == "__main__":
	    # Run only when executed as a script; main()'s return value becomes the
	    # process exit status.
	    exit_code = main()
	    raise SystemExit(exit_code)