feat(chexpert): U-MultiClass PNU abnormality guidance + abnormality-guided VQA

- chexpert_classifier.py: 14 binary heads → 14×3 softmax (negative/
positive/uncertain per pathology, META-CXR / CheXpert U-MultiClass).
Add format_pnu/buckets_to_pnu as the single source of truth for the
PNU 3-section prompt string (shared with the oracle builder so GT and
predicted prompts are byte-identical).
- mimic_cxr_builder.py: GT chexpert.csv → PNU string (1→pos, 0→neg,
-1→uncertain, blank/NaN→neg). VQA now carries the SAME PNU context
(abnormality-guided VQA, RaDialog-style). O(1) dict lookup per VQA row
(was a linear suffix scan over every indexed image per row); index every
image so report-less studies still serve VQA.
- Drop the obsolete uncertain_policy knob (U-MultiClass is the only
behaviour now) from builder, dataset.py, dataset_resolver, config.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (6)

configs/train_config.yaml +5 -6
data/dataset.py +14 -13
data/mimic_cxr_builder.py +86 -84
model/chexpert_classifier.py +97 -55
model/{image_encoder.py → rad_dino.py} +0 -0
utils/dataset_resolver.py +6 -7

configs/train_config.yaml CHANGED Viewed

@@ -59,15 +59,14 @@ data:
   mimic_cxr_root:    "/path/to/MIMIC-CXR"
   instruct_json:     "data/data_files/mimic_cxr_instruct_unified.json"
-  # RaDialog abnormality guidance: the 14 CheXpert labels (oracle / GT) are
-  # read from this CSV and baked into the prompt as
-  # "Predicted Findings: ...". If left null the builder auto-discovers any
   # *chexpert*.csv under mimic_cxr_root; if none is found, structured_findings
   # is null and abnormality guidance is DISABLED (a loud warning is printed).
   mimic_chexpert_csv:     null
-  # How CheXpert -1.0 (uncertain) is mapped: "ignore" (only 1.0 positive,
-  # default, matches the classifier head) | "positive" (treat -1.0 as positive).
-  mimic_uncertain_policy: "ignore"
   # Optional VQA pairs dir with {train,valid,test}.json. null → skip VQA.
   mimic_vqa_root:         null
   # Auto-build the unified JSON (with CheXpert labels) when the cached

   mimic_cxr_root:    "/path/to/MIMIC-CXR"
   instruct_json:     "data/data_files/mimic_cxr_instruct_unified.json"
+  # RaDialog abnormality guidance (U-MultiClass / META-CXR): the 14 CheXpert
+  # labels (oracle / GT) are read from this CSV and baked into the prompt as
+  # the PNU 3-section string ("Positive Abnormalities: ... / Negative ... /
+  # Uncertain ..."). CSV value → class: 1→positive, 0→negative, -1→uncertain,
+  # blank/NaN→negative. If left null the builder auto-discovers any
   # *chexpert*.csv under mimic_cxr_root; if none is found, structured_findings
   # is null and abnormality guidance is DISABLED (a loud warning is printed).
   mimic_chexpert_csv:     null
   # Optional VQA pairs dir with {train,valid,test}.json. null → skip VQA.
   mimic_vqa_root:         null
   # Auto-build the unified JSON (with CheXpert labels) when the cached

data/dataset.py CHANGED Viewed

@@ -26,7 +26,7 @@ from torch.utils.data import Dataset
 from PIL import Image
 from .prompt_templates import build_training_sample
-from model.image_encoder import BioViLTEncoder
 TaskType = Literal["findings", "impression", "report", "vqa", "mixed"]
@@ -289,7 +289,6 @@ def build_instruct_json(
     vqa_data_root:   Optional[str] = None,
     report_mode:     str = "split",
     image_mode:      str = "all_views_split",
-    uncertain_policy: str = "ignore",
 ) -> str:
     """
     Build the unified MIMIC-CXR instruction JSON.
@@ -297,25 +296,27 @@ def build_instruct_json(
     Thin delegate to `data.mimic_cxr_builder.build_mimic_cxr_instruct_json`,
     which walks the pre-split MIMIC layout (train/valid/test), parses
     findings/impression from the report .txt files, and bakes the 14 CheXpert
-    labels (oracle, from `*chexpert*.csv`) into `structured_findings` as
-    "Predicted Findings: ..." — the RaDialog image + abnormality-guidance
-    setup. `report_mode` / `image_mode` mirror the IU builder.
     Output entries match the shared schema, e.g.:
         {"image_path": "train/p10/p10000032/s50414267/02aa804e.jpg",
          "task": "findings", "target": "The lungs are clear...",
          "question": null,
-         "structured_findings": "Predicted Findings: No Finding",
          "split": "train", "study_id": "s50414267",
          "subject_id": "p10000032"}
     """
     from .mimic_cxr_builder import build_mimic_cxr_instruct_json
     return build_mimic_cxr_instruct_json(
-        mimic_root       = mimic_cxr_root,
-        output_path      = output_path,
-        chexpert_csv     = chexpert_csv,
-        vqa_root         = vqa_data_root,
-        report_mode      = report_mode,
-        image_mode       = image_mode,
-        uncertain_policy = uncertain_policy,
     )

 from PIL import Image
 from .prompt_templates import build_training_sample
+from model.rad_dino import BioViLTEncoder
 TaskType = Literal["findings", "impression", "report", "vqa", "mixed"]
     vqa_data_root:   Optional[str] = None,
     report_mode:     str = "split",
     image_mode:      str = "all_views_split",
 ) -> str:
     """
     Build the unified MIMIC-CXR instruction JSON.
     Thin delegate to `data.mimic_cxr_builder.build_mimic_cxr_instruct_json`,
     which walks the pre-split MIMIC layout (train/valid/test), parses
     findings/impression from the report .txt files, and bakes the 14 CheXpert
+    labels (oracle, from `*chexpert*.csv`) into `structured_findings` as the
+    PNU 3-section string (U-MultiClass, META-CXR format) — the RaDialog
+    image + abnormality-guidance setup. `report_mode` / `image_mode` mirror
+    the IU builder.
     Output entries match the shared schema, e.g.:
         {"image_path": "train/p10/p10000032/s50414267/02aa804e.jpg",
          "task": "findings", "target": "The lungs are clear...",
          "question": null,
+         "structured_findings": "Positive Abnormalities: None\\n
+             Negative Abnormalities: No Finding, ...\\n
+             Uncertain Abnormalities: None",
          "split": "train", "study_id": "s50414267",
          "subject_id": "p10000032"}
     """
     from .mimic_cxr_builder import build_mimic_cxr_instruct_json
     return build_mimic_cxr_instruct_json(
+        mimic_root   = mimic_cxr_root,
+        output_path  = output_path,
+        chexpert_csv = chexpert_csv,
+        vqa_root     = vqa_data_root,
+        report_mode  = report_mode,
+        image_mode   = image_mode,
     )

data/mimic_cxr_builder.py CHANGED Viewed

@@ -15,25 +15,34 @@ NOT the raw PhysioNet tree):
       └── test /pNN/...
     {anywhere under mimic_root}/  *chexpert*.csv   (optional, auto-discovered)
-RaDialog-style abnormality guidance
------------------------------------
 The 14 CheXpert labels are read from `mimic-cxr-2.0.0-chexpert.csv`
 (CheXbert run on the ground-truth reports) and baked into the prompt as
-`structured_findings`:
-    "Predicted Findings: Cardiomegaly, Pleural Effusion"
-    "Predicted Findings: No Finding"           (when no positive label)
 This is the *oracle* setting — GT labels, no trained image classifier and
-no model change. The CheXpert classifier module stays unused; the existing
-`structured_findings` prompt plumbing carries the string through train
-(dataset.py) and eval (evaluate.py) untouched.
 VQA
 ---
-VQA pairs live in a separate dataset and are attached by passing
-`vqa_root` (mirrors the notebook). Omit it to build findings/impression
-only.
 """
 import argparse
@@ -50,23 +59,11 @@ from typing import Dict, List, Optional, Tuple
 _FINDINGS_RE   = re.compile(r"FINDINGS\s*:\s*(.*?)(?=\n\s*[A-Z ]{3,}\s*:|\Z)", re.S | re.I)
 _IMPRESSION_RE = re.compile(r"IMPRESSION\s*:\s*(.*?)(?=\n\s*[A-Z ]{3,}\s*:|\Z)", re.S | re.I)
-# 14 CheXpert columns, in the canonical order used by the classifier head.
-CHEXPERT_LABELS = [
-    "No Finding",
-    "Enlarged Cardiomediastinum",
-    "Cardiomegaly",
-    "Lung Opacity",
-    "Lung Lesion",
-    "Edema",
-    "Consolidation",
-    "Pneumonia",
-    "Atelectasis",
-    "Pneumothorax",
-    "Pleural Effusion",
-    "Pleural Other",
-    "Fracture",
-    "Support Devices",
-]
 def _clean(txt: str) -> str:
@@ -83,7 +80,7 @@ def _parse_report(txt_path: Path) -> Tuple[Optional[str], Optional[str]]:
     )
-# ─── CheXpert CSV → "Predicted Findings: ..." string ────────────────────────
 def _discover_chexpert_csv(mimic_root: Path, explicit: Optional[str]) -> Optional[Path]:
     if explicit:
@@ -97,17 +94,26 @@ def _discover_chexpert_csv(mimic_root: Path, explicit: Optional[str]) -> Optiona
     return None
-def _load_chexpert_map(
-    csv_path: Path,
-    uncertain_policy: str = "ignore",   # "ignore" → only 1.0 positive | "positive" → -1.0 also positive
-) -> Dict[Tuple[str, str], str]:
     """
-    Return {(subject_id, study_id): "Predicted Findings: A, B"} where the ids
-    are the bare integers as strings (CSV stores them without the p/s prefix).
     """
-    pos_threshold = {"1", "1.0"}
-    if uncertain_policy == "positive":
-        pos_threshold = pos_threshold | {"-1", "-1.0"}
     out: Dict[Tuple[str, str], str] = {}
     with open(csv_path, newline="") as f:
@@ -121,25 +127,17 @@ def _load_chexpert_map(
                 f"{csv_path} missing subject_id/study_id columns "
                 f"(have: {reader.fieldnames})"
             )
-        label_cols = [(name, col[name.lower()]) for name in CHEXPERT_LABELS
                       if name.lower() in col]
         for row in reader:
             subj  = str(row[subj_c]).strip().lstrip("p").split(".")[0]
             study = str(row[study_c]).strip().lstrip("s").split(".")[0]
-            positives = [
-                name for name, c in label_cols
-                if str(row.get(c, "")).strip() in pos_threshold
-            ]
-            # "No Finding" alone is reported as such; otherwise list the
-            # genuine positives (drop a redundant "No Finding" if any
-            # pathology is also positive).
-            real = [p for p in positives if p != "No Finding"]
-            if real:
-                txt = ", ".join(real)
-            else:
-                txt = "No Finding"
-            out[(subj, study)] = f"Predicted Findings: {txt}"
     return out
@@ -152,18 +150,17 @@ def build_mimic_cxr_instruct_json(
     vqa_root:     Optional[str] = None,
     report_mode:  str = "split",                  # "split" | "merged" | "split_cascade"
     image_mode:   str = "all_views_split",        # "all_views_split" | "frontal_only_split" | "multi_image_merged"
-    uncertain_policy: str = "ignore",             # how CheXpert -1.0 (uncertain) is treated
 ) -> str:
     """
     Build the unified MIMIC-CXR instruction JSON.
     report_mode mirrors iu_xray_builder:
       "split"         → findings + impression samples; BOTH carry the CheXpert
-                        "Predicted Findings: ..." string in structured_findings
-                        (RaDialog: image + 14 labels → text).
       "merged"        → one task=report sample, target "Findings: ...\n\n
-                        Impression: ...", carries the CheXpert string.
-      "split_cascade" → findings sample carries the CheXpert string; the
                         impression sample instead carries "Findings: <GT
                         findings>" as context (findings→impression). Same
                         convention as the IU builder.
@@ -201,9 +198,9 @@ def build_mimic_cxr_instruct_json(
     # ── CheXpert labels ───────────────────────────────────────────────────
     csv_path = _discover_chexpert_csv(mimic_root, chexpert_csv)
     if csv_path is not None:
-        chexpert_map = _load_chexpert_map(csv_path, uncertain_policy)
         print(f"[mimic_cxr_builder] CheXpert CSV: {csv_path} "
-              f"({len(chexpert_map):,} studies, uncertain={uncertain_policy})")
     else:
         chexpert_map = {}
         print("[mimic_cxr_builder] WARNING: no *chexpert*.csv found under "
@@ -213,23 +210,24 @@ def build_mimic_cxr_instruct_json(
     # ── Pass 1: index studies ─────────────────────────────────────────────
     samples: List[Dict] = []
-    image_index: Dict[str, str] = {}     # subject-relative path → split label
     n_studies = n_missing_report = n_no_chexpert = 0
     skipped_merged_no_impression = skipped_cascade_no_findings = 0
     def _structured_for(subj: str, study: str) -> Optional[str]:
         return chexpert_map.get((subj.lstrip("p"), study.lstrip("s")))
-    def _image_groups(study_dir: Path, split_sub: str, subj: str, study: str):
         """Yield path_fields dicts honouring image_mode (same rules as IU)."""
-        imgs = sorted(study_dir.glob("*.jpg"))
-        if not imgs:
-            return
-        def _rel(img: Path) -> str:
-            return f"{split_sub}/{img.parent.parent.parent.name}/{subj}/{study}/{img.name}"
-        rels = [_rel(im) for im in imgs]
-        for r in rels:
-            image_index[r] = split_dirs[split_sub]
         if image_mode == "all_views_split":
             for r in rels:
                 yield {"image_path": r, "image_paths": None}
@@ -242,11 +240,15 @@ def build_mimic_cxr_instruct_json(
         for p_dir in sorted(split_dir.glob("p*")):
             for pat_dir in p_dir.glob("p*"):
                 for study_dir in pat_dir.glob("s*"):
-                    jpgs = list(study_dir.glob("*.jpg"))
-                    if not jpgs:
                         continue
                     n_studies += 1
-                    subj, study = pat_dir.name, study_dir.name
                     txts = list(study_dir.glob("*.txt"))
                     if not txts:
                         n_missing_report += 1
@@ -257,7 +259,7 @@ def build_mimic_cxr_instruct_json(
                         n_no_chexpert += 1
                     split_label = split_dirs[split_sub]
-                    for path_fields in _image_groups(study_dir, split_sub, subj, study):
                         base = {
                             **path_fields,
                             "question":   None,
@@ -310,22 +312,26 @@ def build_mimic_cxr_instruct_json(
                 sub_rel = str(row["image_path"]).lstrip("/")
                 if sub_rel.startswith("files/"):
                     sub_rel = sub_rel[len("files/"):]
-                # match against any indexed image whose tail equals sub_rel
-                hit = next((k for k in image_index if k.endswith(sub_rel)), None)
-                if hit is None:
                     n_vqa_dropped += 1
                     continue
                 ans = row.get("answer", [])
                 answer = (", ".join(map(str, ans)) if isinstance(ans, list)
                           else str(ans)) or "No."
                 samples.append({
-                    "image_path": hit, "image_paths": None,
                     "task": "vqa", "target": answer,
                     "question": row["question"],
-                    "structured_findings": None,
                     "split": split_label,
-                    "study_id": row.get("study_id"),
-                    "subject_id": row.get("subject_id"),
                 })
                 n_vqa += 1
@@ -372,9 +378,6 @@ def _parse_args():
                    choices=["split", "merged", "split_cascade"])
     p.add_argument("--image_mode",   default="all_views_split",
                    choices=["all_views_split", "frontal_only_split", "multi_image_merged"])
-    p.add_argument("--uncertain_policy", default="ignore",
-                   choices=["ignore", "positive"],
-                   help="CheXpert -1.0 (uncertain): ignore (default) or treat as positive.")
     return p.parse_args()
@@ -387,5 +390,4 @@ if __name__ == "__main__":
         vqa_root     = a.vqa_root,
         report_mode  = a.report_mode,
         image_mode   = a.image_mode,
-        uncertain_policy = a.uncertain_policy,
     )

       └── test /pNN/...
     {anywhere under mimic_root}/  *chexpert*.csv   (optional, auto-discovered)
+RaDialog-style abnormality guidance (U-MultiClass / META-CXR)
+-------------------------------------------------------------
 The 14 CheXpert labels are read from `mimic-cxr-2.0.0-chexpert.csv`
 (CheXbert run on the ground-truth reports) and baked into the prompt as
+`structured_findings` in the PNU 3-section format:
+    Positive Abnormalities: Cardiomegaly, Pleural Effusion
+    Negative Abnormalities: No Finding, Edema, ...
+    Uncertain Abnormalities: Atelectasis
+CSV value → class: 1 → positive, 0 → negative, -1 → uncertain,
+blank/NaN → negative (META-CXR convention: missing == negative).
 This is the *oracle* setting — GT labels, no trained image classifier and
+no model change. The string format is shared verbatim with
+`model.chexpert_classifier.format_pnu`, so the learned-classifier path
+(at inference) produces byte-identical prompts. The existing
+`structured_findings` plumbing carries it through train (dataset.py) and
+eval (evaluate.py) untouched.
 VQA
 ---
+VQA pairs live in 3 files {train,valid,test}.json (MIMIC-Ext-CXR-VQA);
+attach them by passing `vqa_root`. Each row is one (image, question,
+answer) sample — one image can yield many rows. VQA samples get the SAME
+PNU CheXpert context as findings/impression (abnormality-guided VQA, à la
+RaDialog), looked up by subject_id/study_id. Omit `vqa_root` to build
+findings/impression only.
 """
 import argparse
 _FINDINGS_RE   = re.compile(r"FINDINGS\s*:\s*(.*?)(?=\n\s*[A-Z ]{3,}\s*:|\Z)", re.S | re.I)
 _IMPRESSION_RE = re.compile(r"IMPRESSION\s*:\s*(.*?)(?=\n\s*[A-Z ]{3,}\s*:|\Z)", re.S | re.I)
+# The 14-label list, PNU string formatter and class indices live in
+# model.chexpert_classifier — single source of truth shared with the learned
+# classifier so GT-oracle and predicted prompts are byte-identical. Imported
+# lazily inside _load_chexpert_map (it pulls the model package, which is
+# always available in the train/eval env where JSON building runs).
 def _clean(txt: str) -> str:
     )
+# ─── CheXpert CSV → PNU structured-findings string ──────────────────────────
 def _discover_chexpert_csv(mimic_root: Path, explicit: Optional[str]) -> Optional[Path]:
     if explicit:
     return None
+def _load_chexpert_map(csv_path: Path) -> Dict[Tuple[str, str], str]:
     """
+    Return {(subject_id, study_id): <PNU string>} where the ids are the bare
+    integers as strings (CSV stores them without the p/s prefix).
+    U-MultiClass mapping of each CheXpert cell:
+        1 / 1.0   → positive
+        0 / 0.0   → negative
+       -1 / -1.0  → uncertain
+        blank/NaN → negative   (META-CXR convention: missing == negative;
+                                any other unrecognised value also → negative)
     """
+    from model.chexpert_classifier import (
+        PATHOLOGIES, buckets_to_pnu,
+        CLASS_NEGATIVE, CLASS_POSITIVE, CLASS_UNCERTAIN,
+    )
+    val_to_cls = {
+        "1": CLASS_POSITIVE,  "1.0": CLASS_POSITIVE,
+        "0": CLASS_NEGATIVE,  "0.0": CLASS_NEGATIVE,
+        "-1": CLASS_UNCERTAIN, "-1.0": CLASS_UNCERTAIN,
+    }
     out: Dict[Tuple[str, str], str] = {}
     with open(csv_path, newline="") as f:
                 f"{csv_path} missing subject_id/study_id columns "
                 f"(have: {reader.fieldnames})"
             )
+        label_cols = [(name, col[name.lower()]) for name in PATHOLOGIES
                       if name.lower() in col]
         for row in reader:
             subj  = str(row[subj_c]).strip().lstrip("p").split(".")[0]
             study = str(row[study_c]).strip().lstrip("s").split(".")[0]
+            mapping = {
+                name: val_to_cls.get(str(row.get(c, "")).strip(), CLASS_NEGATIVE)
+                for name, c in label_cols
+            }
+            out[(subj, study)] = buckets_to_pnu(mapping)
     return out
     vqa_root:     Optional[str] = None,
     report_mode:  str = "split",                  # "split" | "merged" | "split_cascade"
     image_mode:   str = "all_views_split",        # "all_views_split" | "frontal_only_split" | "multi_image_merged"
 ) -> str:
     """
     Build the unified MIMIC-CXR instruction JSON.
     report_mode mirrors iu_xray_builder:
       "split"         → findings + impression samples; BOTH carry the CheXpert
+                        PNU string in structured_findings (RaDialog: image +
+                        14 labels → text).
       "merged"        → one task=report sample, target "Findings: ...\n\n
+                        Impression: ...", carries the CheXpert PNU string.
+      "split_cascade" → findings sample carries the CheXpert PNU string; the
                         impression sample instead carries "Findings: <GT
                         findings>" as context (findings→impression). Same
                         convention as the IU builder.
     # ── CheXpert labels ───────────────────────────────────────────────────
     csv_path = _discover_chexpert_csv(mimic_root, chexpert_csv)
     if csv_path is not None:
+        chexpert_map = _load_chexpert_map(csv_path)
         print(f"[mimic_cxr_builder] CheXpert CSV: {csv_path} "
+              f"({len(chexpert_map):,} studies, PNU U-MultiClass)")
     else:
         chexpert_map = {}
         print("[mimic_cxr_builder] WARNING: no *chexpert*.csv found under "
     # ── Pass 1: index studies ─────────────────────────────────────────────
     samples: List[Dict] = []
+    # sub_rel ("pXX/pXXXX/sYYYY/img.jpg") → full stored image_path
+    # ("{split}/pXX/pXXXX/sYYYY/img.jpg"). O(1) VQA lookup.
+    image_index: Dict[str, str] = {}
     n_studies = n_missing_report = n_no_chexpert = 0
     skipped_merged_no_impression = skipped_cascade_no_findings = 0
     def _structured_for(subj: str, study: str) -> Optional[str]:
         return chexpert_map.get((subj.lstrip("p"), study.lstrip("s")))
+    def _rels_for(study_dir: Path, split_sub: str, subj: str, study: str) -> List[str]:
+        """Split-prefixed relative image paths for one study, sorted."""
+        return [
+            f"{split_sub}/{im.parent.parent.parent.name}/{subj}/{study}/{im.name}"
+            for im in sorted(study_dir.glob("*.jpg"))
+        ]
+    def _image_groups(rels: List[str]):
         """Yield path_fields dicts honouring image_mode (same rules as IU)."""
         if image_mode == "all_views_split":
             for r in rels:
                 yield {"image_path": r, "image_paths": None}
         for p_dir in sorted(split_dir.glob("p*")):
             for pat_dir in p_dir.glob("p*"):
                 for study_dir in pat_dir.glob("s*"):
+                    subj, study = pat_dir.name, study_dir.name
+                    rels = _rels_for(study_dir, split_sub, subj, study)
+                    if not rels:
                         continue
                     n_studies += 1
+                    # Index EVERY image up front — a VQA row may reference a
+                    # study that has images but no findings/impression report.
+                    for r in rels:
+                        image_index[r.split("/", 1)[1]] = r
                     txts = list(study_dir.glob("*.txt"))
                     if not txts:
                         n_missing_report += 1
                         n_no_chexpert += 1
                     split_label = split_dirs[split_sub]
+                    for path_fields in _image_groups(rels):
                         base = {
                             **path_fields,
                             "question":   None,
                 sub_rel = str(row["image_path"]).lstrip("/")
                 if sub_rel.startswith("files/"):
                     sub_rel = sub_rel[len("files/"):]
+                full = image_index.get(sub_rel)            # O(1)
+                if full is None:
                     n_vqa_dropped += 1
                     continue
                 ans = row.get("answer", [])
                 answer = (", ".join(map(str, ans)) if isinstance(ans, list)
                           else str(ans)) or "No."
+                subj  = str(row.get("subject_id", ""))
+                study = str(row.get("study_id", ""))
                 samples.append({
+                    "image_path": full, "image_paths": None,
                     "task": "vqa", "target": answer,
                     "question": row["question"],
+                    # Abnormality-guided VQA (RaDialog): same PNU CheXpert
+                    # context as findings/impression. None if there is no
+                    # chexpert.csv or the study is missing from it (graceful —
+                    # falls back to image + question only).
+                    "structured_findings": _structured_for(subj, study),
                     "split": split_label,
+                    "study_id": study,
+                    "subject_id": subj,
                 })
                 n_vqa += 1
                    choices=["split", "merged", "split_cascade"])
     p.add_argument("--image_mode",   default="all_views_split",
                    choices=["all_views_split", "frontal_only_split", "multi_image_merged"])
     return p.parse_args()
         vqa_root     = a.vqa_root,
         report_mode  = a.report_mode,
         image_mode   = a.image_mode,
     )

model/chexpert_classifier.py CHANGED Viewed

@@ -1,21 +1,28 @@
 """
 chexpert_classifier.py
 ----------------------
-Multi-label CheXpert pathology classifier.
-Trained separately on MIMIC-CXR with CheXbert labels.
-This component provides structured findings (e.g. "Pleural Effusion: Positive")
-that are appended to the LLM prompt alongside image tokens, improving clinical
-accuracy of generated reports.
-Reference: RaDialog (Pellegrini et al., 2023) — CheXpert Classifier provides
-structured findings to the LLM prompt to improve clinical correctness.
 """
 import torch
 import torch.nn as nn
-from pathlib import Path
-from typing import Optional, List, Dict
 PATHOLOGIES = [
@@ -35,40 +42,77 @@ PATHOLOGIES = [
     "Support Devices",
 ]
 class CheXpertClassifier(nn.Module):
     """
-    Lightweight multi-label classifier on top of BioViL-T global embeddings.
-    Trained separately (Stage 0) with binary cross-entropy loss.
-    Frozen during Stage 1 and Stage 2 of main model training.
     Args:
-        input_dim:   BioViL-T global embedding dim (512 for BioViL-T global)
-        num_classes: number of pathology classes (14)
-        threshold:   classification threshold for positive predictions
-        checkpoint:  path to trained weights (None = random init / not loaded)
     """
     def __init__(
         self,
-        input_dim:  int = 512,
         num_classes: int = 14,
-        threshold:  float = 0.5,
-        checkpoint: Optional[str] = None,
     ):
         super().__init__()
         self.num_classes = num_classes
-        self.threshold   = threshold
         self.pathologies = PATHOLOGIES
-        # Simple MLP classifier head
         self.classifier = nn.Sequential(
             nn.Linear(input_dim, 256),
             nn.ReLU(),
             nn.Dropout(0.2),
-            nn.Linear(256, num_classes),
         )
         if checkpoint is not None:
@@ -82,46 +126,44 @@ class CheXpertClassifier(nn.Module):
     def forward(self, global_features: torch.Tensor) -> torch.Tensor:
         """
         Args:
-            global_features: (B, input_dim) — global CXR embedding from BioViL-T
         Returns:
-            logits: (B, 14)
         """
-        return self.classifier(global_features)
     @torch.no_grad()
     def predict(self, global_features: torch.Tensor) -> List[Dict[str, str]]:
         """
-        Run inference and return human-readable findings per sample.
-        Returns:
-            List of dicts like {"Pleural Effusion": "Positive", "Cardiomegaly": "Negative", ...}
-        """
-        logits = self.forward(global_features)       # (B, 14)
-        probs  = torch.sigmoid(logits)               # (B, 14)
-        preds  = (probs > self.threshold).cpu()      # (B, 14) bool
-        results = []
-        for i in range(preds.size(0)):
-            finding = {}
-            for j, name in enumerate(self.pathologies):
-                finding[name] = "Positive" if preds[i, j].item() else "Negative"
-            results.append(finding)
-        return results
-    def findings_to_text(self, findings: Dict[str, str]) -> str:
         """
-        Convert findings dict to a structured text string for LLM prompt.
-        Example output:
-            "Predicted Findings: Pleural Effusion: Positive, Cardiomegaly: Negative, ..."
         """
-        positive = [k for k, v in findings.items() if v == "Positive"]
-        negative = [k for k, v in findings.items() if v == "Negative"]
-        if not positive:
-            pos_str = "No Finding"
-        else:
-            pos_str = ", ".join(positive)
-        return f"Predicted Findings: {pos_str}"

 """
 chexpert_classifier.py
 ----------------------
+Multi-label, multi-CLASS CheXpert pathology classifier (U-MultiClass).
+Each of the 14 pathologies is predicted as one of THREE classes —
+negative / positive / uncertain — via a per-pathology softmax, mirroring
+META-CXR's MHCAC head and the CheXpert "U-MultiClass" uncertainty policy.
+The structured findings injected into the LLM prompt use the PNU
+(Positive / Negative / Uncertain) 3-section format. `format_pnu()` is the
+single source of truth for that string so the oracle path
+(data/mimic_cxr_builder.py, GT from chexpert.csv) and the learned path
+(this classifier at inference) produce byte-identical prompts.
+Trained separately (Stage 0) on MIMIC-CXR CheXbert labels; frozen during
+Stage 1 / Stage 2 of the main VLM.
+Reference: RaDialog (Pellegrini et al., 2023) for the prompt-conditioning
+idea; META-CXR (Edirisinghe et al., 2025) for the explicit uncertain class.
 """
 import torch
 import torch.nn as nn
+from typing import Optional, List, Dict, Sequence
 PATHOLOGIES = [
     "Support Devices",
 ]
+# Per-pathology class indices (softmax dim order). Keep this stable: the
+# trained checkpoint and the GT-label mapping in mimic_cxr_builder.py both
+# rely on it.
+CLASS_NEGATIVE  = 0
+CLASS_POSITIVE  = 1
+CLASS_UNCERTAIN = 2
+NUM_STATES      = 3
+CLASS_NAMES     = {CLASS_NEGATIVE: "negative",
+                   CLASS_POSITIVE: "positive",
+                   CLASS_UNCERTAIN: "uncertain"}
+def format_pnu(positive: Sequence[str],
+               negative: Sequence[str],
+               uncertain: Sequence[str]) -> str:
+    """
+    Build the PNU structured-findings string (META-CXR prompt format).
+        Positive Abnormalities: Cardiomegaly, Pleural Effusion
+        Negative Abnormalities: No Finding, Edema, ...
+        Uncertain Abnormalities: Atelectasis
+    Empty sections render as "None" so the three lines are always present
+    (the LLM sees a fixed structure regardless of the case).
+    """
+    def _fmt(xs: Sequence[str]) -> str:
+        return ", ".join(xs) if xs else "None"
+    return (f"Positive Abnormalities: {_fmt(positive)}\n"
+            f"Negative Abnormalities: {_fmt(negative)}\n"
+            f"Uncertain Abnormalities: {_fmt(uncertain)}")
+def buckets_to_pnu(class_by_pathology: Dict[str, int]) -> str:
+    """Group a {pathology: class_idx} dict into the PNU string."""
+    pos = [p for p, c in class_by_pathology.items() if c == CLASS_POSITIVE]
+    neg = [p for p, c in class_by_pathology.items() if c == CLASS_NEGATIVE]
+    unc = [p for p, c in class_by_pathology.items() if c == CLASS_UNCERTAIN]
+    return format_pnu(pos, neg, unc)
 class CheXpertClassifier(nn.Module):
     """
+    Multi-label, 3-class-per-label classifier on BioViL-T global embeddings.
+    Output logits have shape (B, 14, 3); a per-pathology softmax/argmax
+    yields negative / positive / uncertain.
     Args:
+        input_dim:   global CXR embedding dim
+        num_classes: number of pathologies (14)
+        checkpoint:  trained weights (None = not loaded)
     """
     def __init__(
         self,
+        input_dim:   int = 512,
         num_classes: int = 14,
+        checkpoint:  Optional[str] = None,
     ):
         super().__init__()
         self.num_classes = num_classes
+        self.num_states  = NUM_STATES
         self.pathologies = PATHOLOGIES
+        # MLP head → num_classes * 3 logits, reshaped to (B, num_classes, 3)
         self.classifier = nn.Sequential(
             nn.Linear(input_dim, 256),
             nn.ReLU(),
             nn.Dropout(0.2),
+            nn.Linear(256, num_classes * NUM_STATES),
         )
         if checkpoint is not None:
     def forward(self, global_features: torch.Tensor) -> torch.Tensor:
         """
         Args:
+            global_features: (B, input_dim)
         Returns:
+            logits: (B, num_classes, 3)  — softmax over the last dim gives
+                    P(negative), P(positive), P(uncertain) per pathology.
+                    Train with cross-entropy over the last dim (the natural
+                    U-MultiClass objective).
         """
+        flat = self.classifier(global_features)              # (B, 14*3)
+        return flat.view(-1, self.num_classes, NUM_STATES)   # (B, 14, 3)
     @torch.no_grad()
     def predict(self, global_features: torch.Tensor) -> List[Dict[str, str]]:
         """
+        Returns a list (per sample) of {pathology: "negative"|"positive"|
+        "uncertain"} using argmax over the 3-state softmax.
         """
+        logits = self.forward(global_features)        # (B, 14, 3)
+        cls    = logits.argmax(dim=-1).cpu()          # (B, 14)
+        out: List[Dict[str, str]] = []
+        for i in range(cls.size(0)):
+            out.append({
+                name: CLASS_NAMES[int(cls[i, j].item())]
+                for j, name in enumerate(self.pathologies)
+            })
+        return out
+    @torch.no_grad()
+    def findings_to_text(self, global_features: torch.Tensor) -> List[str]:
         """
+        Per-sample PNU structured-findings string, identical in format to the
+        GT oracle path (data/mimic_cxr_builder.py). One string per sample.
+        """
+        logits = self.forward(global_features)        # (B, 14, 3)
+        cls    = logits.argmax(dim=-1).cpu()          # (B, 14)
+        texts: List[str] = []
+        for i in range(cls.size(0)):
+            mapping = {name: int(cls[i, j].item())
+                       for j, name in enumerate(self.pathologies)}
+            texts.append(buckets_to_pnu(mapping))
+        return texts

model/{image_encoder.py → rad_dino.py} RENAMED Viewed

File without changes

utils/dataset_resolver.py CHANGED Viewed

@@ -223,13 +223,12 @@ def _ensure_mimic_json_exists(data_cfg,
     print(f"[dataset_resolver] MIMIC JSON not found → auto-building "
           f"(report_mode={report_mode}, image_mode={image_mode}) …")
     build_mimic_cxr_instruct_json(
-        mimic_root       = str(_get(data_cfg, "mimic_cxr_root")),
-        output_path      = str(out),
-        chexpert_csv     = _get(data_cfg, "mimic_chexpert_csv"),
-        vqa_root         = _get(data_cfg, "mimic_vqa_root"),
-        report_mode      = report_mode,
-        image_mode       = image_mode,
-        uncertain_policy = str(_get(data_cfg, "mimic_uncertain_policy", "ignore")),
     )
     return str(out)

     print(f"[dataset_resolver] MIMIC JSON not found → auto-building "
           f"(report_mode={report_mode}, image_mode={image_mode}) …")
     build_mimic_cxr_instruct_json(
+        mimic_root   = str(_get(data_cfg, "mimic_cxr_root")),
+        output_path  = str(out),
+        chexpert_csv = _get(data_cfg, "mimic_chexpert_csv"),
+        vqa_root     = _get(data_cfg, "mimic_vqa_root"),
+        report_mode  = report_mode,
+        image_mode   = image_mode,
     )
     return str(out)