cxr-vlm-code / scripts /upload_to_hf.py
convitom
initial commit
28b13fc
"""
upload_to_hf.py
---------------
One-time uploader: push project code + IU-Xray data to HuggingFace Hub as
two dataset repos (private by default; pass --public to make them public) so
non-Kaggle platforms (Colab / Lightning / GCP / local) can pull them via
snapshot_download.
Creates (if missing) and uploads to:
<HF_USER>/cxr-vlm-code — full project source (excludes checkpoints/, results/, data/, wandb/, .git/, virtualenvs, IDE folders, __pycache__, and *.pt / *.bin / *.safetensors weight files)
<HF_USER>/cxr-vlm-data — IU-Xray/ (images + labels)
Usage (from project root):
set HF_TOKEN=hf_xxx # Windows cmd
$env:HF_TOKEN='hf_xxx' # PowerShell
export HF_TOKEN=hf_xxx # bash
python scripts/upload_to_hf.py --user <your-username>
Re-run any time after editing code to re-sync. Large unchanged files are
deduped server-side so re-uploads are fast.
"""
import argparse
import os
import sys
from pathlib import Path
from huggingface_hub import HfApi, create_repo
# Repository root: two levels up from this file (the usage notes run it as
# scripts/upload_to_hf.py from the project root).
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Default location of the IU-Xray folder. The "IU-Xray.zip" path component is
# a directory whose name is a historic leftover on the author's machine; pass
# --iu_xray_dir when your layout is different.
DEFAULT_IU_XRAY_DIR = PROJECT_ROOT.joinpath("data", "IU-Xray.zip", "IU-Xray")
def parse_args(argv=None):
    """Parse the uploader's command-line options.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is what the
            script does when run from the command line.

    Returns:
        argparse.Namespace with the attributes ``user``, ``code_repo``,
        ``data_repo``, ``iu_xray_dir``, ``private``, ``skip_code`` and
        ``skip_data``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--user", required=True, help="HF username (repo owner)")
    ap.add_argument("--code_repo", default="cxr-vlm-code")
    ap.add_argument("--data_repo", default="cxr-vlm-data")
    ap.add_argument("--iu_xray_dir", default=str(DEFAULT_IU_XRAY_DIR),
                    help="Local path to IU-Xray folder containing images/ and labels/")
    # --private and --public write to the same destination; the default is
    # already True, so --private only matters when it follows --public.
    ap.add_argument("--private", action="store_true", default=True,
                    help="Create the repos as private (this is the default)")
    ap.add_argument("--public", dest="private", action="store_false",
                    help="Create the repos as public instead of private")
    ap.add_argument("--skip_code", action="store_true",
                    help="Do not upload the code repo")
    ap.add_argument("--skip_data", action="store_true",
                    help="Do not upload the IU-Xray data repo")
    return ap.parse_args(argv)
def _upload_code(api, token, repo_id, private):
    """Create the code dataset repo if missing and push PROJECT_ROOT to it."""
    print(f"\n[1/2] Uploading code → {repo_id}")
    create_repo(repo_id, repo_type="dataset", private=private,
                token=token, exist_ok=True)
    api.upload_folder(
        folder_path=str(PROJECT_ROOT),
        repo_id=repo_id,
        repo_type="dataset",
        token=token,
        ignore_patterns=[
            "checkpoints/**",
            "results/**",
            "data/**",            # don't ship data with code
            "wandb/**",
            "**/__pycache__/**",
            "*.pyc",
            ".git/**",
            ".venv/**",
            "venv/**",
            ".idea/**",
            ".vscode/**",
            "*.pt",
            "*.bin",
            "*.safetensors",
        ],
    )
    print(f"✓ code uploaded: https://huggingface.co/datasets/{repo_id}")


def _upload_data(api, token, repo_id, private, iu_dir):
    """Create the data dataset repo if missing and push iu_dir under IU-Xray/."""
    print(f"\n[2/2] Uploading IU-Xray → {repo_id}")
    create_repo(repo_id, repo_type="dataset", private=private,
                token=token, exist_ok=True)
    api.upload_folder(
        folder_path=str(iu_dir),
        path_in_repo="IU-Xray",
        repo_id=repo_id,
        repo_type="dataset",
        token=token,
        ignore_patterns=["**/__pycache__/**", "*.pyc"],
    )
    print(f"✓ data uploaded: https://huggingface.co/datasets/{repo_id}")


def main():
    """Upload the project code and the IU-Xray data to HuggingFace Hub.

    Reads the access token from the ``HF_TOKEN`` environment variable and the
    remaining options from the command line (see ``parse_args``). The code
    upload is skipped with ``--skip_code`` and the data upload with
    ``--skip_data``.

    Raises:
        SystemExit: if ``HF_TOKEN`` is unset or empty, or if the data upload
            is requested and ``--iu_xray_dir`` is not an existing directory.
    """
    args = parse_args()
    token = os.environ.get("HF_TOKEN")
    if not token:
        sys.exit("ERROR: HF_TOKEN env var not set. Paste a write-scope token first.")

    # Check the local data directory before any network call, so a wrong path
    # is reported immediately instead of after the code upload has finished.
    iu_dir = None
    if not args.skip_data:
        iu_dir = Path(args.iu_xray_dir)
        if not iu_dir.is_dir():
            sys.exit(f"ERROR: IU-Xray dir not found: {iu_dir}")

    api = HfApi(token=token)
    code_id = f"{args.user}/{args.code_repo}"
    data_id = f"{args.user}/{args.data_repo}"

    # 1. Code repo
    if not args.skip_code:
        _upload_code(api, token, code_id, args.private)

    # 2. Data repo (IU-Xray)
    if not args.skip_data:
        _upload_data(api, token, data_id, args.private, iu_dir)

    print("\nAll done. On non-Kaggle platforms set HF_USER =", args.user, "in cell-paths.")
# Run the uploader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()