File size: 5,432 Bytes
626bb3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python3
"""Upload project data and code to separate Hugging Face repositories.

Recommended layout:
- data repo: Hugging Face Dataset repo, e.g. USER/decode-iblend-data
- code repo: Hugging Face Model repo, e.g. USER/decode-iblend-code

Usage:
  export HF_TOKEN=hf_xxx
  python3 scripts/upload_to_huggingface.py \
    --data-repo-id USER/decode-iblend-data \
    --code-repo-id USER/decode-iblend-code
"""

from __future__ import annotations

import argparse
import os
from pathlib import Path


# Parent of the directory that contains this script. The usage example in the
# module docstring runs the script from scripts/, which makes this the project
# folder.
ROOT = Path(__file__).resolve().parents[1]
# Directory that holds ROOT; iter_data_files() looks here for the data folders
# that live next to the project folder.
DATA_MINING_ROOT = ROOT.parent


def import_hf():
    """Import ``huggingface_hub`` lazily and return its ``HfApi`` class.

    The import is deferred to call time so that the rest of the script (for
    example ``--dry-run``) works without the package installed.

    Returns:
        The ``HfApi`` class object (not an instance).

    Raises:
        SystemExit: With installation instructions as the message when the
            import fails; the original ``ImportError`` is chained as the cause.
    """
    install_hint = (
        "Missing dependency: huggingface_hub\n"
        "Install with:\n"
        "  python3 -m pip install huggingface_hub\n"
    )
    try:
        from huggingface_hub import HfApi as hf_api_class
    except ImportError as import_error:
        raise SystemExit(install_hint) from import_error
    else:
        return hf_api_class


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the upload script.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default) argparse reads ``sys.argv[1:]``, which is
            what the script entry point relies on. Passing an explicit list
            allows the parser to be driven programmatically.

    Returns:
        The parsed namespace with the attributes ``data_repo_id``,
        ``code_repo_id``, ``data_repo_type``, ``code_repo_type``,
        ``token_file``, ``private`` and ``dry_run``.

    Raises:
        SystemExit: Raised by argparse when a required option is missing or a
            value is not among the allowed choices.
    """
    parser = argparse.ArgumentParser(description="Upload data and code to separate Hugging Face repos.")
    parser.add_argument("--data-repo-id", required=True, help="Dataset repo id, e.g. username/decode-iblend-data")
    parser.add_argument("--code-repo-id", required=True, help="Code repo id, e.g. username/decode-iblend-code")
    parser.add_argument("--data-repo-type", default="dataset", choices=["dataset", "model"], help="HF type for data repo.")
    parser.add_argument("--code-repo-type", default="model", choices=["dataset", "model"], help="HF type for code repo.")
    parser.add_argument("--token-file", help="Path to a file containing the Hugging Face token. Overrides HF_TOKEN.")
    parser.add_argument("--private", action="store_true", help="Create both repos as private.")
    parser.add_argument("--dry-run", action="store_true", help="Print files that would be uploaded.")
    # argparse treats args=None as "use sys.argv[1:]", so existing callers
    # that pass nothing keep their behaviour.
    return parser.parse_args(argv)


def iter_data_files():
    """Yield ``(local_path, repo_path)`` pairs for every data file to upload.

    Each configured data folder is walked recursively in sorted order. A
    folder that does not exist is reported on stdout and skipped.

    Hidden entries are excluded: a file is skipped when its own name or any
    directory component below the data folder starts with "." (for example
    ``.DS_Store`` or anything inside ``.ipynb_checkpoints/``).

    Yields:
        Tuples of the local ``Path`` and the destination path inside the
        repository. The destination always uses forward slashes, regardless of
        the separator of the local platform.
    """
    data_roots = [
        (ROOT / "energy_dataset", "energy_dataset"),
        (DATA_MINING_ROOT / "IIITD_occupancy_dataset", "IIITD_occupancy_dataset"),
        (DATA_MINING_ROOT / "iiitd_calender_schedule", "iiitd_calender_schedule"),
        (DATA_MINING_ROOT / "weather_comparison", "weather_comparison"),
    ]
    for local_root, repo_root in data_roots:
        if not local_root.exists():
            print(f"Skip missing data folder: {local_root}")
            continue
        for path in sorted(local_root.rglob("*")):
            if not path.is_file():
                continue
            relative = path.relative_to(local_root)
            # Check every component, not just the file name, so the contents
            # of hidden directories are filtered out as well.
            if any(part.startswith(".") for part in relative.parts):
                continue
            # as_posix() keeps the repo path forward-slash separated even
            # when the local path uses backslashes.
            yield path, (Path(repo_root) / relative).as_posix()


def iter_code_files():
    """Yield ``(local_path, repo_path)`` pairs for the code files to upload.

    The candidate list is fixed. Each entry is looked up below ``ROOT``; an
    entry that does not exist is reported on stdout and skipped.

    Yields:
        Tuples of the local ``Path`` and the destination path inside the
        repository.
    """
    repo_relative_paths = (
        "DECODE_Reimplementation.ipynb",
        "HUGGINGFACE.md",
        "scripts/decode_reimplementation.py",
        "scripts/upload_to_huggingface.py",
        "scripts/preprocess_and_eda_by_building.py",
        "decode_reimplementation_outputs/README.md",
    )
    for repo_path in repo_relative_paths:
        # The local layout below ROOT mirrors the layout inside the repo.
        local_path = ROOT.joinpath(*repo_path.split("/"))
        if not local_path.exists():
            print(f"Skip missing code file: {local_path}")
            continue
        yield local_path, repo_path


def summarize(label: str, files: list[tuple[Path, str]]) -> None:
    """Print a report of the files selected for one upload target.

    The report consists of a blank line, the file count, the combined size in
    GiB (three decimals) and one ``local -> repo`` line per file.

    Args:
        label: Tag shown in square brackets on the header lines.
        files: ``(local_path, repo_path)`` pairs; every local path is
            ``stat``-ed to obtain its size.
    """
    bytes_per_gib = 1024 ** 3
    byte_count = 0
    for local_path, _ in files:
        byte_count += local_path.stat().st_size

    report = [
        f"\n[{label}] files: {len(files)}",
        f"[{label}] total size: {byte_count / bytes_per_gib:.3f} GiB",
    ]
    report.extend(f"{local_path} -> {destination}" for local_path, destination in files)
    print("\n".join(report))


def upload_files(api, repo_id: str, repo_type: str, files: list[tuple[Path, str]], label: str) -> None:
    """Upload the given files one at a time, printing progress for each.

    Args:
        api: Object exposing ``upload_file(path_or_fileobj=..., path_in_repo=...,
            repo_id=..., repo_type=...)``; ``main`` passes an ``HfApi`` instance.
        repo_id: Identifier of the destination repository.
        repo_type: Repository type forwarded unchanged to ``upload_file``.
        files: ``(local_path, repo_path)`` pairs to upload, in order.
        label: Tag shown in the progress lines.
    """
    total = len(files)
    position = 0
    for local_path, destination in files:
        position += 1
        print(f"[{label} {position}/{total}] Uploading {destination}")
        request = {
            "path_or_fileobj": str(local_path),
            "path_in_repo": destination,
            "repo_id": repo_id,
            "repo_type": repo_type,
        }
        api.upload_file(**request)


def main() -> int:
    """Run the upload workflow and return the process exit status.

    Steps, in order: parse the command line, resolve the access token, collect
    and summarize the data and code files, then (unless ``--dry-run`` is set)
    create both repositories, upload every file and print the resulting URLs.

    Returns:
        ``0`` after a dry run or a completed upload.

    Raises:
        SystemExit: When no token is available and this is not a dry run, or
            when ``huggingface_hub`` cannot be imported.
    """
    args = parse_args()

    # A token file, when given, takes precedence over the HF_TOKEN variable.
    if args.token_file:
        token = Path(args.token_file).read_text(encoding="utf-8").strip()
    else:
        token = os.environ.get("HF_TOKEN")
    if not (token or args.dry_run):
        raise SystemExit("HF_TOKEN is not set. Export it before uploading.")

    # One row per upload target: (label, repo id, repo type, files).
    # Both file lists are fully collected before anything is summarized.
    targets = (
        ("data", args.data_repo_id, args.data_repo_type, list(iter_data_files())),
        ("code", args.code_repo_id, args.code_repo_type, list(iter_code_files())),
    )
    for label, _, _, files in targets:
        summarize(label, files)

    if args.dry_run:
        return 0

    api = import_hf()(token=token)
    # Make sure both repositories exist before the first file is uploaded.
    for _, repo_id, repo_type, _ in targets:
        api.create_repo(repo_id=repo_id, repo_type=repo_type, private=args.private, exist_ok=True)
    for label, repo_id, repo_type, files in targets:
        upload_files(api, repo_id, repo_type, files, label)

    def repo_url(repo_id: str, repo_type: str) -> str:
        # Only dataset repos get a "datasets/" path segment; other types do not.
        segment = "datasets/" if repo_type == "dataset" else ""
        return f"https://huggingface.co/{segment}{repo_id}"

    print("\nUploaded:")
    print(f"Data: {repo_url(args.data_repo_id, args.data_repo_type)}")
    print(f"Code: {repo_url(args.code_repo_id, args.code_repo_type)}")
    return 0


# Run the workflow only when this file is executed as a script; the integer
# returned by main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())