File size: 4,359 Bytes
28b13fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""
upload_to_hf.py
---------------
One-time uploader: push project code + IU-Xray data to HuggingFace Hub as
two private dataset repos so non-Kaggle platforms (Colab / Lightning / GCP /
local) can pull them via snapshot_download.

Creates (if missing) and uploads to:
  <HF_USER>/cxr-vlm-code    — full project source (excludes checkpoints, data/, __pycache__)
  <HF_USER>/cxr-vlm-data    — IU-Xray/ (images + labels)

Usage (from project root):
    set HF_TOKEN=hf_xxx                                  # Windows cmd
    $env:HF_TOKEN='hf_xxx'                               # PowerShell
    export HF_TOKEN=hf_xxx                               # bash
    python scripts/upload_to_hf.py --user <your-username>

Re-run any time after editing code to re-sync. Large unchanged files are
deduped server-side so re-uploads are fast.
"""
import argparse
import os
import sys
from pathlib import Path

from huggingface_hub import HfApi, create_repo

# Project root = the directory one level above the folder holding this script
# (the script is run as scripts/upload_to_hf.py from the project root).
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Default IU-Xray location. The "IU-Xray.zip" path component is a directory
# name kept for historic reasons on the author's machine, not an archive.
# Pass --iu_xray_dir to point at a different location.
DEFAULT_IU_XRAY_DIR = PROJECT_ROOT.joinpath("data", "IU-Xray.zip", "IU-Xray")


def parse_args():
    """Parse the uploader's command-line flags from ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``user``, ``code_repo``,
        ``data_repo``, ``iu_xray_dir``, ``private``, ``skip_code`` and
        ``skip_data``. ``private`` is True unless ``--public`` is given.
    """
    parser = argparse.ArgumentParser()

    # Repo owner (mandatory) followed by the two overridable repo names.
    parser.add_argument("--user", help="HF username (repo owner)", required=True)
    for flag, fallback in (
        ("--code_repo", "cxr-vlm-code"),
        ("--data_repo", "cxr-vlm-data"),
    ):
        parser.add_argument(flag, default=fallback)

    parser.add_argument(
        "--iu_xray_dir",
        default=str(DEFAULT_IU_XRAY_DIR),
        help="Local path to IU-Xray folder containing images/ and labels/",
    )

    # Both switches write to ``private``; it already defaults to True, so
    # --private is effectively a no-op and --public is the real toggle.
    parser.add_argument("--private", action="store_true", default=True)
    parser.add_argument("--public", action="store_false", dest="private")

    # Opt-out switches for each of the two upload phases.
    for flag in ("--skip_code", "--skip_data"):
        parser.add_argument(flag, action="store_true")

    return parser.parse_args()


def main():
    """Upload the project source and the IU-Xray data to the HuggingFace Hub.

    The access token is read from the ``HF_TOKEN`` environment variable; all
    other options come from the command line (see ``parse_args``). Unless the
    corresponding ``--skip_*`` flag is set, each target is created as a
    dataset repo if it does not exist yet and the local folder is uploaded
    into it:

      1. ``<user>/<code_repo>`` receives ``PROJECT_ROOT`` minus the ignore list.
      2. ``<user>/<data_repo>`` receives the IU-Xray folder under ``IU-Xray/``.

    Raises:
        SystemExit: if ``HF_TOKEN`` is unset or empty, or if the data upload
            is requested and the IU-Xray directory does not exist. Both checks
            run before any network call, so a wrong ``--iu_xray_dir`` can no
            longer abort the run after the code upload already went through.
    """
    args = parse_args()
    token = os.environ.get("HF_TOKEN")
    if not token:
        sys.exit("ERROR: HF_TOKEN env var not set. Paste a write-scope token first.")

    # Fail fast: validate local inputs before touching the Hub. Previously the
    # directory check ran only after the (potentially long) code upload.
    iu_dir = Path(args.iu_xray_dir)
    if not args.skip_data and not iu_dir.is_dir():
        sys.exit(f"ERROR: IU-Xray dir not found: {iu_dir}")

    api = HfApi(token=token)
    code_id = f"{args.user}/{args.code_repo}"
    data_id = f"{args.user}/{args.data_repo}"

    # ── 1. Code repo ──────────────────────────────────────────────────
    if not args.skip_code:
        print(f"\n[1/2] Uploading code → {code_id}")
        # exist_ok=True makes re-runs reuse the repo instead of failing.
        create_repo(code_id, repo_type="dataset", private=args.private,
                    token=token, exist_ok=True)
        api.upload_folder(
            folder_path=str(PROJECT_ROOT),
            repo_id=code_id,
            repo_type="dataset",
            token=token,
            # NOTE(review): .env / credential files are not excluded here —
            # confirm none live under the project root before uploading.
            ignore_patterns=[
                # Training artefacts and run outputs.
                "checkpoints/**",
                "results/**",
                "data/**",               # don't ship data with code
                "wandb/**",
                # Bytecode caches.
                "**/__pycache__/**",
                "*.pyc",
                # VCS, virtualenv and editor folders.
                ".git/**",
                ".venv/**",
                "venv/**",
                ".idea/**",
                ".vscode/**",
                # Model weight files, wherever they sit.
                "*.pt",
                "*.bin",
                "*.safetensors",
            ],
        )
        print(f"✓ code uploaded: https://huggingface.co/datasets/{code_id}")

    # ── 2. Data repo (IU-Xray) ────────────────────────────────────────
    if not args.skip_data:
        print(f"\n[2/2] Uploading IU-Xray → {data_id}")
        create_repo(data_id, repo_type="dataset", private=args.private,
                    token=token, exist_ok=True)
        api.upload_folder(
            folder_path=str(iu_dir),
            path_in_repo="IU-Xray",      # keep the folder name inside the repo
            repo_id=data_id,
            repo_type="dataset",
            token=token,
            ignore_patterns=["**/__pycache__/**", "*.pyc"],
        )
        print(f"✓ data uploaded: https://huggingface.co/datasets/{data_id}")

    print("\nAll done. On non-Kaggle platforms set HF_USER =", args.user, "in cell-paths.")


# Run the upload only when executed as a script, so importing this module
# (e.g. to reuse parse_args or the path constants) triggers no network calls.
if __name__ == "__main__":
    main()