f

Files changed (7) hide show

docker/Dockerfile +44 -0
docker/requirements_docker.txt +38 -0
evaluation/evaluate.py +1 -1
requirements.txt +40 -19
scripts/cxrvlm_colab_train.ipynb +2 -59
training/train.py +4 -2
utils/checkpoint.py +13 -0

docker/Dockerfile ADDED Viewed

	@@ -0,0 +1,44 @@

+# CXR-VLM training environment.
+# Matches the Colab GPU runtime fingerprint verified on 2026-05:
+#   Python   3.12.13
+#   torch    2.10.0+cu128
+#   CUDA     12.8  (nvcc 12.8.93)
+#   cuDNN    9.10.2
+#   glibc    2.35  (Ubuntu 22.04 base)
+#   bnb      0.49.2  (4-bit quantize verified)
+#
+# ─── Host requirements ────────────────────────────────────────────────────────
+# This image requires NVIDIA driver >= 550.54 on the host (CUDA 12.8 runtime).
+#   • Colab           — driver 580+, OK
+#   • Vast.ai         — filter "CUDA Driver >= 550" when picking an instance
+#   • Lightning AI    — A10G / A100 / H100 OK; check older T4 Studios
+#   • RunPod          — pick a 12.8-compatible template or BYO image
+#
+# T4 (sm_75) note: torch.cuda.is_bf16_supported() returns True via emulation,
+# but T4 has no hardware BF16. Keep train_cfg.training.fp16=True / bf16=False
+# on T4. On A100/L4/H100 (sm_80+) you can flip to bf16.
+#
+# ─── Build & push ─────────────────────────────────────────────────────────────
+#   docker build -t <hub>/cxr-vlm-env:cu128 docker/
+#   docker push  <hub>/cxr-vlm-env:cu128
+#
+# ─── Base image fallbacks (if 2.10.0-cuda12.8 tag is missing on Docker Hub) ──
+#   FROM nvcr.io/nvidia/pytorch:25.04-py3              # NVIDIA NGC, always cu128
+#   FROM pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel   # stable, slightly older
+# Base image: torch + torchvision come from here (they are deliberately absent
+# from requirements_docker.txt), built against CUDA 12.8 / cuDNN 9.
+FROM pytorch/pytorch:2.10.0-cuda12.8-cudnn9-devel
+# Non-interactive apt, unbuffered Python stdout/stderr, and settings intended
+# to quiet bitsandbytes / tokenizers / HF Hub / transformers log output.
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    BITSANDBYTES_NOWELCOME=1 \
+    TOKENIZERS_PARALLELISM=false \
+    HF_HUB_DISABLE_PROGRESS_BARS=1 \
+    TRANSFORMERS_VERBOSITY=warning
+# Install basic CLI tools, then delete the apt package lists in the same layer
+# so they are not kept in the image.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends git wget curl && \
+    rm -rf /var/lib/apt/lists/*
+# Install the pinned Python dependencies; --no-cache-dir keeps pip's download
+# cache out of the image layer.
+COPY requirements_docker.txt /tmp/requirements_docker.txt
+RUN pip install --no-cache-dir -r /tmp/requirements_docker.txt
+# Default working directory for containers started from this image.
+WORKDIR /workspace

docker/requirements_docker.txt ADDED Viewed

	@@ -0,0 +1,38 @@

+# Versions match Colab GPU runtime (verified 2026-05). torch + torchvision
+# are NOT listed here — they come from the pytorch/pytorch base image with
+# the right CUDA build baked in.
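+# ── Core HF stack ─────────────────────────────────────────────────────────────
+# NOTE(review): the Colab notebook install cell constrains
+# 'huggingface_hub>=0.27,<1.0' next to 'transformers>=4.46,<4.50'. Confirm that
+# transformers==4.49.0 and huggingface_hub==1.11.0 below actually resolve
+# together (pip may reject the pair if transformers declares huggingface-hub<1.0).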
+transformers==4.49.0
+peft==0.14.0
+accelerate==1.13.0
+bitsandbytes==0.49.2
+huggingface_hub==1.11.0
+httpx==0.28.1          # utils/_httpx_compat.py handles the allow_redirects removal
+# ── Vision encoder ────────────────────────────────────────────────────────────
+# rad_dino loads via transformers AutoModel — no extra dep needed.
+# hi-ml-multimodal (BioViL-T) intentionally omitted; model/rad_dino.py wraps
+# its import in try/except and falls back to timm/transformers cleanly.
+timm==1.0.26
+Pillow==11.3.0
+# ── Config / data ─────────────────────────────────────────────────────────────
+omegaconf==2.3.0
+sentencepiece==0.2.1
+protobuf==5.29.6
+numpy==2.0.2
+pandas==2.2.2
+# ── Eval metrics ──────────────────────────────────────────────────────────────
+nltk==3.9.1
+rouge-score==0.1.2
+bert-score==0.3.12
+scikit-learn==1.6.1
+# ── Training / experiment tracking ────────────────────────────────────────────
+wandb==0.26.1
+tqdm==4.67.3
+# ── Optional: LLM-as-judge for VQA ────────────────────────────────────────────
+openai==2.32.0

evaluation/evaluate.py CHANGED Viewed

@@ -47,7 +47,7 @@ from typing import List, Dict, Optional
 import torch
 from torch.utils.data import DataLoader
 from omegaconf import OmegaConf
-from tqdm import tqdm
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

 import torch
 from torch.utils.data import DataLoader
 from omegaconf import OmegaConf
+from tqdm.auto import tqdm
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

requirements.txt CHANGED Viewed

@@ -1,21 +1,42 @@
-torch==2.0.1
-torchvision==0.15.2
-transformers==4.35.0
-peft==0.6.0
-accelerate==0.24.0
-bitsandbytes==0.41.0
-einops==0.7.0
-sentencepiece==0.1.99
-hi-ml-multimodal==0.2.0
 omegaconf==2.3.0
-wandb==0.16.0
 rouge-score==0.1.2
-nltk==3.8.1
-bert-score==0.3.13
-openai>=1.30.0  # optional: LLM-as-judge for VQA (also works with Gemini/Ollama via base_url)
-scikit-learn==1.3.2
-pandas==2.1.0
-numpy==1.24.0
-Pillow==10.0.0
-tqdm==4.66.1
-fire==0.5.0

+# CXR-VLM dependencies — versions match Colab GPU runtime (verified 2026-05).
+#
+# NOTE: torch / torchvision are deliberately NOT pinned here. Install them
+# separately matching your CUDA driver, e.g. on Colab they come pre-installed,
+# and for local/Vast.ai/Lightning use:
+#     pip install torch torchvision --index-url https://download.pytorch.org/whl/cu128
+# (or whichever CUDA the host driver supports).
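+# ── Core HF stack ─────────────────────────────────────────────────────────────
+# NOTE(review): the Colab notebook install cell constrains
+# 'huggingface_hub>=0.27,<1.0' next to 'transformers>=4.46,<4.50'. Confirm that
+# transformers==4.49.0 and huggingface_hub==1.11.0 below actually resolve
+# together (pip may reject the pair if transformers declares huggingface-hub<1.0).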
+transformers==4.49.0
+peft==0.14.0
+accelerate==1.13.0
+bitsandbytes==0.49.2
+huggingface_hub==1.11.0
+httpx==0.28.1          # utils/_httpx_compat.py handles the allow_redirects removal
+# ── Vision encoder ────────────────────────────────────────────────────────────
+# rad_dino (default) loads via transformers AutoModel — no extra dep.
+# biovilt backend (hi-ml-multimodal) intentionally OMITTED — code falls back
+# to timm/transformers automatically when health_multimodal isn't installed.
+timm==1.0.26
+Pillow==11.3.0
+# ── Config / data ─────────────────────────────────────────────────────────────
 omegaconf==2.3.0
+sentencepiece==0.2.1
+protobuf==5.29.6
+numpy==2.0.2
+pandas==2.2.2
+# ── Eval metrics ──────────────────────────────────────────────────────────────
+nltk==3.9.1
 rouge-score==0.1.2
+bert-score==0.3.12
+scikit-learn==1.6.1
+# ── Training / experiment tracking ────────────────────────────────────────────
+wandb==0.26.1
+tqdm==4.67.3
+# ── Optional: LLM-as-judge for VQA (set --judge_model on evaluation/evaluate.py)
+openai==2.32.0

scripts/cxrvlm_colab_train.ipynb CHANGED Viewed

@@ -299,64 +299,7 @@
    },
    "outputs": [],
    "source": [
-    "!pip uninstall -y -q torchao transformers bitsandbytes peft accelerate\n",
-    "\n",
-    "# Let pip pick latest bnb that matches Colab's CUDA 12.8 + triton 3.x\n",
-    "!pip install -q -U bitsandbytes\n",
-    "\n",
-    "# Install everything. We DON'T pin httpx anymore — Colab's firebase-admin and\n",
-    "# google-genai hard-pin httpx==0.28.1, so the resolver always wins. Instead\n",
-    "# we monkey-patch httpx 0.28+ below to keep accepting the legacy\n",
-    "# `allow_redirects` kwarg that transformers ≤4.50 still passes.\n",
-    "!pip install -q \\\n",
-    "    'transformers>=4.46,<4.50' \\\n",
-    "    'peft>=0.13,<0.15' \\\n",
-    "    'accelerate>=1.0' \\\n",
-    "    'huggingface_hub>=0.27,<1.0' \\\n",
-    "    omegaconf sentencepiece 'protobuf>=3.20' \\\n",
-    "    nltk rouge-score bert-score sacrebleu\n",
-    "\n",
-    "import torch, transformers, bitsandbytes, peft, accelerate, huggingface_hub, httpx\n",
-    "print('torch          :', torch.__version__, '| cuda:', torch.cuda.is_available())\n",
-    "print('transformers   :', transformers.__version__)\n",
-    "print('bitsandbytes   :', bitsandbytes.__version__)\n",
-    "print('peft           :', peft.__version__)\n",
-    "print('accelerate     :', accelerate.__version__)\n",
-    "print('huggingface_hub:', huggingface_hub.__version__)\n",
-    "print('httpx          :', httpx.__version__)\n",
-    "\n",
-    "# ── httpx 0.28+ compat shim ───────────────────────────────────────────────\n",
-    "# transformers ≤4.50 calls httpx.Client.head(..., allow_redirects=True) which\n",
-    "# httpx 0.28 removed → \"Client.head() got an unexpected keyword argument\n",
-    "# 'allow_redirects'\". Translate the kwarg at the call site so the rest of\n",
-    "# the stack keeps working. No-op on httpx <0.28.\n",
-    "#\n",
-    "# The same patch is auto-applied inside the train.py subprocess via\n",
-    "# utils._quiet → utils._httpx_compat. Here we apply it in the NOTEBOOK\n",
-    "# kernel too, so the smoke test cell (which runs in-kernel) benefits.\n",
-    "def _patch_httpx():\n",
-    "    if tuple(int(x) for x in httpx.__version__.split('.')[:2]) < (0, 28):\n",
-    "        return\n",
-    "    if getattr(httpx.Client, '_cxr_vlm_compat_patched', False):\n",
-    "        return\n",
-    "    def _make(orig):\n",
-    "        def patched(self, *args, **kwargs):\n",
-    "            if 'allow_redirects' in kwargs:\n",
-    "                kwargs['follow_redirects'] = kwargs.pop('allow_redirects')\n",
-    "            # httpx 0.28+ removed per-request `proxies=` too — transformers ≤4.49\n",
-    "            # still passes it via huggingface_hub.has_file → drop it silently.\n",
-    "            kwargs.pop('proxies', None)\n",
-    "            return orig(self, *args, **kwargs)\n",
-    "        return patched\n",
-    "    for cls in (httpx.Client, httpx.AsyncClient):\n",
-    "        for m in ('request', 'get', 'head', 'post', 'put',\n",
-    "                  'patch', 'delete', 'options'):\n",
-    "            if hasattr(cls, m):\n",
-    "                setattr(cls, m, _make(getattr(cls, m)))\n",
-    "    httpx.Client._cxr_vlm_compat_patched = True\n",
-    "    print(f'httpx {httpx.__version__}: monkey-patched allow_redirects → follow_redirects')\n",
-    "\n",
-    "_patch_httpx()\n"
    ]
   },
   {
@@ -1723,4 +1666,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}

    },
    "outputs": [],
    "source": [
+    "import os as _os\n_in_docker = _os.path.exists('/.dockerenv')\nif _in_docker:\n    print('Running inside Docker image -- skipping pip install (env pre-baked).')\nelse:\n    !pip uninstall -y -q torchao transformers bitsandbytes peft accelerate\n    \n    # Let pip pick latest bnb that matches Colab's CUDA 12.8 + triton 3.x\n    !pip install -q -U bitsandbytes\n    \n    # Install everything. We DON'T pin httpx anymore — Colab's firebase-admin and\n    # google-genai hard-pin httpx==0.28.1, so the resolver always wins. Instead\n    # we monkey-patch httpx 0.28+ below to keep accepting the legacy\n    # `allow_redirects` kwarg that transformers ≤4.50 still passes.\n    !pip install -q \\\n        'transformers>=4.46,<4.50' \\\n        'peft>=0.13,<0.15' \\\n        'accelerate>=1.0' \\\n        'huggingface_hub>=0.27,<1.0' \\\n        omegaconf sentencepiece 'protobuf>=3.20' \\\n        nltk rouge-score bert-score sacrebleu\n\n# Everything below runs in BOTH environments (Docker and Colab). The Docker\n# image also pins httpx==0.28.1 with transformers==4.49.0, so the in-kernel\n# smoke test needs the version report and the httpx shim there as well.\nimport torch, transformers, bitsandbytes, peft, accelerate, huggingface_hub, httpx\nprint('torch          :', torch.__version__, '| cuda:', torch.cuda.is_available())\nprint('transformers   :', transformers.__version__)\nprint('bitsandbytes   :', bitsandbytes.__version__)\nprint('peft           :', peft.__version__)\nprint('accelerate     :', accelerate.__version__)\nprint('huggingface_hub:', huggingface_hub.__version__)\nprint('httpx          :', httpx.__version__)\n\n# ── httpx 0.28+ compat shim ───────────────────────────────────────────────\n# transformers ≤4.50 calls httpx.Client.head(..., allow_redirects=True) which\n# httpx 0.28 removed → \"Client.head() got an unexpected keyword argument\n# 'allow_redirects'\". Translate the kwarg at the call site so the rest of\n# the stack keeps working. No-op on httpx <0.28.\n#\n# The same patch is auto-applied inside the train.py subprocess via\n# utils._quiet → utils._httpx_compat. Here we apply it in the NOTEBOOK\n# kernel too, so the smoke test cell (which runs in-kernel) benefits.\ndef _patch_httpx():\n    if tuple(int(x) for x in httpx.__version__.split('.')[:2]) < (0, 28):\n        return\n    if getattr(httpx.Client, '_cxr_vlm_compat_patched', False):\n        return\n    def _make(orig):\n        def patched(self, *args, **kwargs):\n            if 'allow_redirects' in kwargs:\n                kwargs['follow_redirects'] = kwargs.pop('allow_redirects')\n            # httpx 0.28+ removed per-request `proxies=` too — transformers ≤4.49\n            # still passes it via huggingface_hub.has_file → drop it silently.\n            kwargs.pop('proxies', None)\n            return orig(self, *args, **kwargs)\n        return patched\n    for cls in (httpx.Client, httpx.AsyncClient):\n        for m in ('request', 'get', 'head', 'post', 'put',\n                  'patch', 'delete', 'options'):\n            if hasattr(cls, m):\n                setattr(cls, m, _make(getattr(cls, m)))\n    httpx.Client._cxr_vlm_compat_patched = True\n    print(f'httpx {httpx.__version__}: monkey-patched allow_redirects → follow_redirects')\n\n_patch_httpx()"
    ]
   },
   {
  },
  "nbformat": 4,
  "nbformat_minor": 5
+}

training/train.py CHANGED Viewed

@@ -34,7 +34,7 @@ torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32       = True
 import transformers
-from transformers import TrainingArguments, Trainer, TrainerCallback
 # Add project root to path
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
@@ -353,13 +353,15 @@ def get_trainer(
                     )
             return super()._get_train_sampler(*args, **kwargs)
-    return CXRTrainer(
         model           = model,
         args            = training_args,
         train_dataset   = train_dataset,
         eval_dataset    = val_dataset,
         data_collator   = collator,
     )
 def _cfg(stage_cfg, tr, key, default=None):

 torch.backends.cudnn.allow_tf32       = True
 import transformers
+from transformers import TrainingArguments, Trainer, TrainerCallback, PrinterCallback
 # Add project root to path
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
                     )
             return super()._get_train_sampler(*args, **kwargs)
+    trainer = CXRTrainer(
         model           = model,
         args            = training_args,
         train_dataset   = train_dataset,
         eval_dataset    = val_dataset,
         data_collator   = collator,
     )
+    trainer.remove_callback(PrinterCallback)
+    return trainer
 def _cfg(stage_cfg, tr, key, default=None):

utils/checkpoint.py CHANGED Viewed

@@ -99,6 +99,19 @@ def load_checkpoint(
     # Load LoRA — skipped when llm not loaded (ITC Stage-1) or no dir present.
     if load_lora and getattr(model, "llm", None) is not None:
         lora_dir = ckpt_dir / f"{ckpt_name}_lora"
         if lora_dir.exists():
             from peft import PeftModel
             model.llm = PeftModel.from_pretrained(

     # Load LoRA — skipped when llm not loaded (ITC Stage-1) or no dir present.
     if load_lora and getattr(model, "llm", None) is not None:
         lora_dir = ckpt_dir / f"{ckpt_name}_lora"
+        # Defensive: PEFT raises an opaque HFValidationError when the dir
+        # exists but `adapter_config.json` is missing (a partially-written
+        # or partially-downloaded checkpoint). Surface a clearer message so
+        # the user knows the fix: delete the dir and resume from HF Hub.
+        if lora_dir.is_dir() and not (lora_dir / "adapter_config.json").is_file():
+            raise FileNotFoundError(
+                f"[load_checkpoint] {lora_dir} exists but adapter_config.json "
+                f"is missing — checkpoint is partially-written/downloaded. "
+                f"Fix: delete the parent checkpoint folder "
+                f"({lora_dir.parent}) and rerun with --mode resume so it "
+                f"gets re-pulled from HF Hub, OR rm -rf the stage2_instruct "
+                f"folder to train Stage 2 fresh from stage1_final."
+            )
         if lora_dir.exists():
             from peft import PeftModel
             model.llm = PeftModel.from_pretrained(