| """ |
| upload_to_hf.py |
| --------------- |
| One-time uploader: push project code + IU-Xray data to HuggingFace Hub as |
| two private dataset repos so non-Kaggle platforms (Colab / Lightning / GCP / |
| local) can pull them via snapshot_download. |
| |
| Creates (if missing) and uploads to: |
| <HF_USER>/cxr-vlm-code — full project source (excludes checkpoints, data/, __pycache__) |
| <HF_USER>/cxr-vlm-data — IU-Xray/ (images + labels) |
| |
| Usage (from project root): |
| set HF_TOKEN=hf_xxx # Windows cmd |
| $env:HF_TOKEN='hf_xxx' # PowerShell |
| export HF_TOKEN=hf_xxx # bash |
| python scripts/upload_to_hf.py --user <your-username> |
| |
| Re-run any time after editing code to re-sync. Large unchanged files are |
| deduped server-side so re-uploads are fast. |
| """ |
| import argparse |
| import os |
| import sys |
| from pathlib import Path |
|
|
| from huggingface_hub import HfApi, create_repo |
|
|
# Repository root. The usage notes run this file as scripts/upload_to_hf.py
# from the project root, so the root is one directory above this file's folder.
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Default local IU-Xray folder (overridable with --iu_xray_dir).
# NOTE(review): "IU-Xray.zip" is used as a *directory* component here --
# presumably the folder the archive was extracted into; confirm on disk.
DEFAULT_IU_XRAY_DIR = PROJECT_ROOT.joinpath("data", "IU-Xray.zip", "IU-Xray")
|
|
|
|
def parse_args(argv=None):
    """Parse the uploader's command-line options.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the script entry
            point relies on; passing a list allows programmatic use and tests.

    Returns:
        argparse.Namespace with the attributes ``user``, ``code_repo``,
        ``data_repo``, ``iu_xray_dir``, ``private``, ``skip_code`` and
        ``skip_data``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--user", required=True, help="HF username (repo owner)")
    ap.add_argument("--code_repo", default="cxr-vlm-code",
                    help="Name of the dataset repo that receives the project code")
    ap.add_argument("--data_repo", default="cxr-vlm-data",
                    help="Name of the dataset repo that receives the IU-Xray data")
    ap.add_argument("--iu_xray_dir", default=str(DEFAULT_IU_XRAY_DIR),
                    help="Local path to IU-Xray folder containing images/ and labels/")
    # Repos are private unless --public is given; --private only exists so the
    # default can be spelled out explicitly on the command line.
    ap.add_argument("--private", action="store_true", default=True,
                    help="Create the repos as private (default)")
    ap.add_argument("--public", dest="private", action="store_false",
                    help="Create the repos as public instead of private")
    ap.add_argument("--skip_code", action="store_true",
                    help="Do not upload the project code")
    ap.add_argument("--skip_data", action="store_true",
                    help="Do not upload the IU-Xray data")
    return ap.parse_args(argv)
|
|
|
|
# Patterns left out of the code repo: training artefacts, local data, caches,
# VCS/IDE/virtualenv folders and model weight files.
# NOTE(review): local secret files (e.g. a .env) are not excluded here --
# confirm none live under the project root before uploading.
_CODE_IGNORE_PATTERNS = [
    "checkpoints/**",
    "results/**",
    "data/**",
    "wandb/**",
    "**/__pycache__/**",
    "*.pyc",
    ".git/**",
    ".venv/**",
    "venv/**",
    ".idea/**",
    ".vscode/**",
    "*.pt",
    "*.bin",
    "*.safetensors",
]


def _upload_code(api, token, repo_id, *, private):
    """Create the code dataset repo if missing and push the project source."""
    print(f"\n[1/2] Uploading code → {repo_id}")
    create_repo(repo_id, repo_type="dataset", private=private,
                token=token, exist_ok=True)
    api.upload_folder(
        folder_path=str(PROJECT_ROOT),
        repo_id=repo_id,
        repo_type="dataset",
        token=token,
        ignore_patterns=_CODE_IGNORE_PATTERNS,
    )
    print(f"✓ code uploaded: https://huggingface.co/datasets/{repo_id}")


def _upload_data(api, token, repo_id, iu_dir, *, private):
    """Create the data dataset repo if missing and push ``iu_dir`` as IU-Xray/."""
    print(f"\n[2/2] Uploading IU-Xray → {repo_id}")
    create_repo(repo_id, repo_type="dataset", private=private,
                token=token, exist_ok=True)
    api.upload_folder(
        folder_path=str(iu_dir),
        path_in_repo="IU-Xray",
        repo_id=repo_id,
        repo_type="dataset",
        token=token,
        ignore_patterns=["**/__pycache__/**", "*.pyc"],
    )
    print(f"✓ data uploaded: https://huggingface.co/datasets/{repo_id}")


def main():
    """Upload the project code and/or the IU-Xray data to the HuggingFace Hub.

    Reads the access token from the ``HF_TOKEN`` environment variable and the
    remaining options from the command line (see ``parse_args``). Each upload
    can be disabled with ``--skip_code`` / ``--skip_data``.

    Raises:
        SystemExit: if ``HF_TOKEN`` is unset or empty, or if the data upload is
            requested and ``--iu_xray_dir`` is not an existing directory.
    """
    args = parse_args()
    token = os.environ.get("HF_TOKEN")
    if not token:
        sys.exit("ERROR: HF_TOKEN env var not set. Paste a write-scope token first.")

    # Check local inputs before any network call, so a wrong --iu_xray_dir is
    # reported immediately instead of only after the code upload has finished.
    iu_dir = None
    if not args.skip_data:
        iu_dir = Path(args.iu_xray_dir)
        if not iu_dir.is_dir():
            sys.exit(f"ERROR: IU-Xray dir not found: {iu_dir}")

    api = HfApi(token=token)
    code_id = f"{args.user}/{args.code_repo}"
    data_id = f"{args.user}/{args.data_repo}"

    if not args.skip_code:
        _upload_code(api, token, code_id, private=args.private)

    if not args.skip_data:
        _upload_data(api, token, data_id, iu_dir, private=args.private)

    print("\nAll done. On non-Kaggle platforms set HF_USER =", args.user, "in cell-paths.")
|
|
|
|
# Run the uploader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|