#!/usr/bin/env python3
r"""Upload project data and code to separate Hugging Face repositories.

Recommended layout:
- data repo: Hugging Face Dataset repo, e.g. USER/decode-iblend-data
- code repo: Hugging Face Model repo, e.g. USER/decode-iblend-code

Usage:
    export HF_TOKEN=hf_xxx
    python3 scripts/upload_to_huggingface.py \
        --data-repo-id USER/decode-iblend-data \
        --code-repo-id USER/decode-iblend-code
"""
from __future__ import annotations
import argparse
import os
from pathlib import Path
# Project root: two levels up from this file, i.e. the parent of the directory
# that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Parent directory of the project root; iter_data_files() looks for several
# dataset folders directly under it.
DATA_MINING_ROOT = ROOT.parent
def import_hf():
    """Return the ``HfApi`` class, importing ``huggingface_hub`` on demand.

    The import happens inside this function rather than at module level, so
    the package is only required when this function is actually called.

    Returns:
        The ``huggingface_hub.HfApi`` class (not an instance).

    Raises:
        SystemExit: If ``huggingface_hub`` cannot be imported. The message
            contains the pip command that installs it.
    """
    install_hint = (
        "Missing dependency: huggingface_hub\n"
        "Install with:\n"
        " python3 -m pip install huggingface_hub\n"
    )
    try:
        from huggingface_hub import HfApi
    except ImportError as exc:
        raise SystemExit(install_hint) from exc
    else:
        return HfApi
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the upload script.

    Args:
        argv: Argument list to parse. When ``None`` (the default), argparse
            falls back to ``sys.argv[1:]``, which is the behaviour existing
            callers rely on. Passing an explicit list allows programmatic use.

    Returns:
        Namespace with ``data_repo_id``, ``code_repo_id``, ``data_repo_type``,
        ``code_repo_type``, ``token_file``, ``private`` and ``dry_run``.

    Raises:
        SystemExit: Raised by argparse on invalid or missing arguments.
    """
    repo_types = ["dataset", "model"]
    parser = argparse.ArgumentParser(description="Upload data and code to separate Hugging Face repos.")
    parser.add_argument("--data-repo-id", required=True, help="Dataset repo id, e.g. username/decode-iblend-data")
    parser.add_argument("--code-repo-id", required=True, help="Code repo id, e.g. username/decode-iblend-code")
    parser.add_argument("--data-repo-type", default="dataset", choices=repo_types, help="HF type for data repo.")
    parser.add_argument("--code-repo-type", default="model", choices=repo_types, help="HF type for code repo.")
    parser.add_argument("--token-file", help="Path to a file containing the Hugging Face token. Overrides HF_TOKEN.")
    parser.add_argument("--private", action="store_true", help="Create both repos as private.")
    parser.add_argument("--dry-run", action="store_true", help="Print files that would be uploaded.")
    return parser.parse_args(argv)
def iter_data_files():
    """Yield ``(local_path, path_in_repo)`` pairs for every data file to upload.

    Each configured data folder is walked recursively in sorted order. A
    folder that does not exist is reported on stdout and skipped.

    Yields:
        Tuples of the local ``Path`` and the destination path inside the
        repository. The destination always uses forward slashes, regardless
        of the host operating system.
    """
    data_roots = [
        (ROOT / "energy_dataset", "energy_dataset"),
        (DATA_MINING_ROOT / "IIITD_occupancy_dataset", "IIITD_occupancy_dataset"),
        (DATA_MINING_ROOT / "iiitd_calender_schedule", "iiitd_calender_schedule"),
        (DATA_MINING_ROOT / "weather_comparison", "weather_comparison"),
    ]
    for local_root, repo_root in data_roots:
        if not local_root.exists():
            print(f"Skip missing data folder: {local_root}")
            continue
        for path in sorted(local_root.rglob("*")):
            if not path.is_file():
                continue
            relative = path.relative_to(local_root)
            # Skip dotfiles AND anything below a hidden directory (for example
            # .ipynb_checkpoints/ or .git/). Only components below local_root
            # are inspected, so a hidden ancestor of the data folder itself
            # does not exclude everything.
            if any(part.startswith(".") for part in relative.parts):
                continue
            # Build the repo path with "/" explicitly: str(Path(...)) would
            # produce backslashes on Windows.
            yield path, f"{repo_root}/{relative.as_posix()}"
def iter_code_files():
    """Yield ``(local_path, path_in_repo)`` pairs for the code files to upload.

    The set of files is a fixed list of paths relative to ``ROOT``; the same
    relative path is used as the destination inside the repository. A file
    that does not exist locally is reported on stdout and skipped.

    Yields:
        Tuples of the local ``Path`` and the forward-slash repo path.
    """
    repo_paths = (
        "DECODE_Reimplementation.ipynb",
        "HUGGINGFACE.md",
        "scripts/decode_reimplementation.py",
        "scripts/upload_to_huggingface.py",
        "scripts/preprocess_and_eda_by_building.py",
        "decode_reimplementation_outputs/README.md",
    )
    for repo_path in repo_paths:
        # The local location mirrors the repo layout underneath ROOT.
        local_path = ROOT.joinpath(*repo_path.split("/"))
        if not local_path.exists():
            print(f"Skip missing code file: {local_path}")
            continue
        yield local_path, repo_path
def summarize(label: str, files: list[tuple[Path, str]]) -> None:
    """Print the file count, total size and local-to-repo mapping of *files*.

    Args:
        label: Tag shown in square brackets on the summary lines.
        files: ``(local_path, path_in_repo)`` pairs; every local path is
            stat'ed to obtain its size.

    Raises:
        OSError: If a local path cannot be stat'ed. Sizes are collected
            before anything is printed, so no partial summary is emitted.
    """
    bytes_per_gib = 1024 ** 3
    total_bytes = 0
    for local_path, _ in files:
        total_bytes += local_path.stat().st_size

    print(f"\n[{label}] files: {len(files)}")
    print(f"[{label}] total size: {total_bytes / bytes_per_gib:.3f} GiB")
    for local_path, repo_path in files:
        print(f"{local_path} -> {repo_path}")
def upload_files(api, repo_id: str, repo_type: str, files: list[tuple[Path, str]], label: str) -> None:
    """Upload *files* one at a time via ``api.upload_file``, logging progress.

    Args:
        api: Object exposing ``upload_file(path_or_fileobj=..., path_in_repo=...,
            repo_id=..., repo_type=...)``.
        repo_id: Destination repository id.
        repo_type: Destination repository type, forwarded unchanged.
        files: ``(local_path, path_in_repo)`` pairs, uploaded in order.
        label: Tag shown in the progress line of every file.
    """
    total = len(files)
    position = 0
    for local_path, repo_path in files:
        position += 1
        print(f"[{label} {position}/{total}] Uploading {repo_path}")
        upload_kwargs = {
            "path_or_fileobj": str(local_path),
            "path_in_repo": repo_path,
            "repo_id": repo_id,
            "repo_type": repo_type,
        }
        api.upload_file(**upload_kwargs)
def main() -> int:
    """Run the upload workflow and return the process exit code.

    Steps:
        1. Parse the command line and resolve the token (``--token-file``
           takes precedence over the ``HF_TOKEN`` environment variable).
        2. Collect and summarize the data and code files.
        3. Unless ``--dry-run`` is set: create both repositories if needed,
           upload the files, and print the resulting repository URLs.

    Returns:
        ``0`` on success (including a dry run).

    Raises:
        SystemExit: If the token file cannot be read or is empty, if no token
            is available for a real upload, or if ``huggingface_hub`` is
            missing.
    """
    args = parse_args()

    token = os.environ.get("HF_TOKEN")
    if args.token_file:
        try:
            token = Path(args.token_file).read_text(encoding="utf-8").strip()
        except (OSError, UnicodeError) as exc:
            # Give a clean message instead of a traceback for a bad path,
            # a permission problem or an undecodable file.
            raise SystemExit(f"Cannot read token file {args.token_file}: {exc}") from exc
    if not token and not args.dry_run:
        if args.token_file:
            # The token came from the file, so blaming HF_TOKEN would mislead.
            raise SystemExit(f"Token file is empty: {args.token_file}")
        raise SystemExit("HF_TOKEN is not set. Export it before uploading.")

    data_files = list(iter_data_files())
    code_files = list(iter_code_files())
    summarize("data", data_files)
    summarize("code", code_files)
    if args.dry_run:
        return 0

    HfApi = import_hf()
    api = HfApi(token=token)

    targets = [
        ("data", "Data", args.data_repo_id, args.data_repo_type, data_files),
        ("code", "Code", args.code_repo_id, args.code_repo_type, code_files),
    ]
    # Create both repositories before uploading anything, so a repo-creation
    # failure surfaces before any file transfer starts.
    for _, _, repo_id, repo_type, _ in targets:
        api.create_repo(repo_id=repo_id, repo_type=repo_type, private=args.private, exist_ok=True)
    for label, _, repo_id, repo_type, files in targets:
        upload_files(api, repo_id, repo_type, files, label)

    print("\nUploaded:")
    for _, title, repo_id, repo_type, _ in targets:
        # Dataset repos live under /datasets/; model repos sit at the site root.
        url_prefix = "datasets/" if repo_type == "dataset" else ""
        print(f"{title}: https://huggingface.co/{url_prefix}{repo_id}")
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())