"""
upload_to_hf.py
---------------
One-time uploader: push project code + IU-Xray data to HuggingFace Hub as two
private dataset repos so non-Kaggle platforms (Colab / Lightning / GCP / local)
can pull them via snapshot_download.

Creates (if missing) and uploads to:
    <user>/cxr-vlm-code — full project source (excludes checkpoints, data/, __pycache__)
    <user>/cxr-vlm-data — IU-Xray/ (images + labels)

Usage (from project root):
    set HF_TOKEN=hf_xxx            # Windows cmd
    $env:HF_TOKEN='hf_xxx'         # PowerShell
    export HF_TOKEN=hf_xxx         # bash
    python scripts/upload_to_hf.py --user <hf_username>

Re-run any time after editing code to re-sync. Large unchanged files are
deduped server-side so re-uploads are fast.
"""

import argparse
import os
import sys
from pathlib import Path
from typing import Optional, Sequence

try:
    from huggingface_hub import HfApi, create_repo
except ImportError:
    # Reported from main() with an install hint, so that `--help` and argument
    # validation still work on a machine without the package.
    HfApi = None
    create_repo = None

PROJECT_ROOT = Path(__file__).resolve().parents[1]

# IU-Xray lives under data/IU-Xray.zip/IU-Xray on this machine (historic folder
# naming). Override with --iu_xray_dir if yours differs.
DEFAULT_IU_XRAY_DIR = PROJECT_ROOT / "data" / "IU-Xray.zip" / "IU-Xray"

# Paths left out of the code repo (passed to upload_folder as ignore_patterns).
CODE_IGNORE_PATTERNS = [
    "checkpoints/**",
    "results/**",
    "data/**",  # don't ship data with code
    "wandb/**",
    "**/__pycache__/**",
    "*.pyc",
    ".git/**",
    ".venv/**",
    "venv/**",
    ".idea/**",
    ".vscode/**",
    "*.pt",
    "*.bin",
    "*.safetensors",
]

# Paths left out of the data repo.
DATA_IGNORE_PATTERNS = ["**/__pycache__/**", "*.pyc"]


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the uploader's command-line options.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Returns:
        Namespace with ``user``, ``code_repo``, ``data_repo``, ``iu_xray_dir``,
        ``private``, ``skip_code`` and ``skip_data``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--user", required=True, help="HF username (repo owner)")
    ap.add_argument("--code_repo", default="cxr-vlm-code")
    ap.add_argument("--data_repo", default="cxr-vlm-data")
    ap.add_argument(
        "--iu_xray_dir",
        default=str(DEFAULT_IU_XRAY_DIR),
        help="Local path to IU-Xray folder containing images/ and labels/",
    )
    # Repos are private unless --public is given; --private is accepted so the
    # default can be spelled out explicitly on the command line.
    ap.add_argument("--private", action="store_true", default=True)
    ap.add_argument("--public", dest="private", action="store_false")
    ap.add_argument("--skip_code", action="store_true")
    ap.add_argument("--skip_data", action="store_true")
    return ap.parse_args(argv)


def _push_folder(api, token, repo_id, folder, private, ignore_patterns,
                 path_in_repo=None):
    """Create dataset repo ``repo_id`` if it is missing, then upload ``folder`` to it."""
    create_repo(repo_id, repo_type="dataset", private=private,
                token=token, exist_ok=True)
    api.upload_folder(
        folder_path=str(folder),
        path_in_repo=path_in_repo,
        repo_id=repo_id,
        repo_type="dataset",
        token=token,
        ignore_patterns=ignore_patterns,
    )


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Upload the project code and/or the IU-Xray data to two HF dataset repos.

    All preconditions (token, data directory, client library) are checked
    before the first upload, so a bad ``--iu_xray_dir`` is reported immediately
    instead of after the code upload has already completed.

    Args:
        argv: Argument list forwarded to :func:`parse_args`; ``None`` means
            use ``sys.argv[1:]``.

    Raises:
        SystemExit: If ``HF_TOKEN`` is unset, the IU-Xray directory does not
            exist (and ``--skip_data`` was not given), or ``huggingface_hub``
            cannot be imported.
    """
    args = parse_args(argv)

    # ── 0. Pre-flight checks (nothing touches the network yet) ──────────
    token = os.environ.get("HF_TOKEN")
    if not token:
        sys.exit("ERROR: HF_TOKEN env var not set. Paste a write-scope token first.")

    iu_dir = Path(args.iu_xray_dir)
    if not args.skip_data and not iu_dir.is_dir():
        sys.exit(f"ERROR: IU-Xray dir not found: {iu_dir}")

    if HfApi is None or create_repo is None:
        sys.exit("ERROR: huggingface_hub is not installed. "
                 "Run: pip install huggingface_hub")

    api = HfApi(token=token)
    code_id = f"{args.user}/{args.code_repo}"
    data_id = f"{args.user}/{args.data_repo}"

    # ── 1. Code repo ──────────────────────────────────────────────────
    if not args.skip_code:
        print(f"\n[1/2] Uploading code → {code_id}")
        _push_folder(api, token, code_id, PROJECT_ROOT, args.private,
                     CODE_IGNORE_PATTERNS)
        print(f"✓ code uploaded: https://huggingface.co/datasets/{code_id}")

    # ── 2. Data repo (IU-Xray) ────────────────────────────────────────
    if not args.skip_data:
        print(f"\n[2/2] Uploading IU-Xray → {data_id}")
        # Files land under IU-Xray/ inside the data repo.
        _push_folder(api, token, data_id, iu_dir, args.private,
                     DATA_IGNORE_PATTERNS, path_in_repo="IU-Xray")
        print(f"✓ data uploaded: https://huggingface.co/datasets/{data_id}")

    print("\nAll done. On non-Kaggle platforms set HF_USER =", args.user, "in cell-paths.")


if __name__ == "__main__":
    main()