""" Loader for MIMIC-CXR benchmark examples. Reads the curated 10-sample CSV and loads any precomputed attribution results (same directory structure as the PubMedVision benchmark). """ from __future__ import annotations import csv import hashlib from pathlib import Path from typing import Any, Dict, List, Optional # Re-use parsers from the existing medical_loader from .medical_loader import ( parse_summary_txt, parse_vllm_summary, _build_all_cross_modal_pairs, apply_method_to_clip_summary, apply_method_to_vllm_summary, load_mobius_sidecar, load_vllm_result_json, rename_summary_patch_labels_in_place, rename_cross_pair_patch_labels_in_place, ) # ── Path resolution ────────────────────────────────────────────────────── _VIZ_DIR = Path(__file__).resolve().parent _PROJECT_ROOT = _VIZ_DIR.parent def _resolve_mimic_dataset_dir() -> Path: return _PROJECT_ROOT / "results" / "mimic" / "dataset" def _resolve_mimic_results_dir(method_suffix: str = "") -> Optional[Path]: """Resolve a MIMIC results directory. For the BiomedCLIP and LLaVA-Med UnSAM slots, prefers the 4×4 patch-grid variant (`mimic_biomedclip_patch/`, `mimic_llavamed_patch/`) when present and non-empty, then falls back to the UnSAM directory. The in-memory keys and UI labels still use the `_unsam` slot name for historical compatibility — only the on-disk source differs. method_suffix examples: "", "_biomedclip", "_llavamed", "_llavamed_unsam", "_vlm_unsam" """ _PATCH_MAP = { "_biomedclip": "_biomedclip_patch", "_llavamed_unsam": "_llavamed_patch", } patch_suffix = _PATCH_MAP.get(method_suffix) if patch_suffix: patch_dir = _PROJECT_ROOT / "results" / f"mimic{patch_suffix}" if patch_dir.exists() and any(patch_dir.iterdir()): return patch_dir d = _PROJECT_ROOT / "results" / f"mimic{method_suffix}" return d if d.exists() else None # ── Example registry ───────────────────────────────────────────────────── MIMIC_EXAMPLES: Dict[str, Dict[str, Any]] = {} def _load_mimic_examples_from_csv() -> Dict[str, Dict[str, Any]]: """Load the MIMIC-CXR curated CSV into a registry dict.""" csv_path = _resolve_mimic_dataset_dir() / "mimic_cxr_10.csv" if not csv_path.exists(): return {} examples = {} with open(csv_path, "r", encoding="utf-8") as f: reader = csv.DictReader(f) for row in reader: cocoid = row["cocoid"] example_id = f"coco_{cocoid}" category = row.get("category", "") caption = row.get("caption", "") findings = row.get("findings", "") # Display title is just the category name (short and scannable) cap_short = caption[:60] + "..." if len(caption) > 60 else caption title = category examples[example_id] = { "title": title, "short": cap_short, "category": category, "caption": caption, "findings": findings, "img_name": row.get("img_name", ""), "matched_keyword": row.get("matched_keyword", ""), "cocoid": cocoid, "source": "MIMIC-CXR", "has_results": False, # updated below } # Check which examples have precomputed results for eid, meta in examples.items(): for suffix in ["", "_biomedclip", "_llavamed_unsam"]: rdir = _resolve_mimic_results_dir(suffix) if rdir and (rdir / eid).exists(): meta["has_results"] = True break return examples # Load at import time MIMIC_EXAMPLES = _load_mimic_examples_from_csv() def get_mimic_examples_by_category( category: Optional[str] = None, ) -> Dict[str, Dict[str, Any]]: """Filter MIMIC examples by pathology category.""" if not category or category.lower() == "all": return MIMIC_EXAMPLES return { k: v for k, v in MIMIC_EXAMPLES.items() if v.get("category", "").lower() == category.lower() } def list_mimic_categories() -> List[str]: """Return sorted list of unique pathology categories.""" cats = sorted({v["category"] for v in MIMIC_EXAMPLES.values() if v.get("category")}) return cats # ── Image loading ──────────────────────────────────────────────────────── def get_mimic_image_path(example_id: str) -> Optional[str]: """Return the path to the original chest X-ray image.""" meta = MIMIC_EXAMPLES.get(example_id) if not meta: return None img_name = meta.get("img_name", "") if not img_name: return None img_path = _resolve_mimic_dataset_dir() / "images" / img_name return str(img_path) if img_path.exists() else None # ── Result loading ─────────────────────────────────────────────────────── def load_mimic_example(example_id: str, *, method: str = "shapley") -> Dict[str, Any]: """Load all available precomputed results for a MIMIC-CXR example. Returns a dict with the same structure as load_benchmark_example() from medical_loader.py, so the UI handler can use the same logic. """ meta = MIMIC_EXAMPLES.get(example_id, {}) caption = meta.get("caption", "") findings = meta.get("findings", "") data: Dict[str, Any] = { "example_id": example_id, "meta": meta, "caption": caption, "findings": findings, "method": method, "original_image_path": get_mimic_image_path(example_id), "has_mobius": {}, # Flags "has_clip": False, "has_biomedclip": False, "has_vllm_logprob": False, "has_vllm_gen": False, "has_llavamed_logprob": False, "has_llavamed_gen": False, "has_vlm_unsam_logprob": False, "has_vlm_unsam_gen": False, "has_llavamed_unsam_logprob": False, "has_llavamed_unsam_gen": False, } # ── CLIP cross-modal ───────────────────────────────────────────── clip_dir = _resolve_mimic_results_dir("_tok30_dotmask") if not clip_dir: clip_dir = _resolve_mimic_results_dir("_tok30") if not clip_dir: clip_dir = _resolve_mimic_results_dir() if clip_dir: edir = clip_dir / example_id summary_path = edir / "summary.txt" if summary_path.exists(): summary = parse_summary_txt(summary_path) clip_mobius = load_mobius_sidecar(edir) apply_method_to_clip_summary(summary, clip_mobius, method) rename_summary_patch_labels_in_place(summary) data["has_clip"] = True data["has_mobius"]["clip"] = clip_mobius is not None data["clip"] = { "summary": summary, "mobius_sidecar": clip_mobius, "image_paths": { "original": str(edir / "original.png") if (edir / "original.png").exists() else "", "overlay": str(edir / "overlay.png") if (edir / "overlay.png").exists() else "", "segmap": str(edir / "segmap.png") if (edir / "segmap.png").exists() else "", }, "image_b64": {}, } data["clip"]["all_cross_modal_pairs"] = _build_all_cross_modal_pairs( data["clip"], mobius_sidecar=clip_mobius, method=method, ) rename_cross_pair_patch_labels_in_place(data["clip"]["all_cross_modal_pairs"]) # Load base64 images for interactive view for key in ("original", "overlay", "segmap"): fpath = edir / f"{key}.png" if fpath.exists(): import base64 with open(fpath, "rb") as f: data["clip"].setdefault("image_b64", {})[key] = base64.b64encode(f.read()).decode("ascii") # ── BiomedCLIP cross-modal ─────────────────────────────────────── bc_dir = _resolve_mimic_results_dir("_biomedclip") if bc_dir: edir = bc_dir / example_id summary_path = edir / "summary.txt" if summary_path.exists(): summary = parse_summary_txt(summary_path) bc_mobius = load_mobius_sidecar(edir) apply_method_to_clip_summary(summary, bc_mobius, method) rename_summary_patch_labels_in_place(summary) data["has_biomedclip"] = True data["has_mobius"]["biomedclip"] = bc_mobius is not None data["biomedclip"] = { "summary": summary, "mobius_sidecar": bc_mobius, "image_paths": { "original": str(edir / "original.png") if (edir / "original.png").exists() else "", "overlay": str(edir / "overlay.png") if (edir / "overlay.png").exists() else "", "segmap": str(edir / "segmap.png") if (edir / "segmap.png").exists() else "", }, "image_b64": {}, } data["biomedclip"]["all_cross_modal_pairs"] = _build_all_cross_modal_pairs( data["biomedclip"], mobius_sidecar=bc_mobius, method=method, ) rename_cross_pair_patch_labels_in_place(data["biomedclip"]["all_cross_modal_pairs"]) for key in ("original", "overlay", "segmap"): fpath = edir / f"{key}.png" if fpath.exists(): import base64 with open(fpath, "rb") as f: data["biomedclip"].setdefault("image_b64", {})[key] = base64.b64encode(f.read()).decode("ascii") # ── VLM (Qwen2-VL) logprob + gen ──────────────────────────────── vlm_dir = _resolve_mimic_results_dir() if vlm_dir: edir = vlm_dir / example_id for prefix, flag_key, json_key in [ ("vllm_logprob", "has_vllm_logprob", "vllm_logprob"), ("vllm_gen", "has_vllm_gen", "vllm_gen"), ]: summary_path = edir / f"{prefix}_summary.txt" if summary_path.exists(): parsed = parse_vllm_summary(summary_path) if parsed: json_data = load_vllm_result_json(edir, prefix, method=method) apply_method_to_vllm_summary(parsed, json_data, method) data[flag_key] = True data[json_key] = parsed data[f"{json_key}_json"] = json_data data["has_mobius"][json_key] = bool(json_data.get("mobius_dict")) overlay = edir / f"{prefix}_overlay.png" if overlay.exists(): data[json_key]["overlay_path"] = str(overlay) # ── LLaVA-Med logprob + gen ────────────────────────────────────── lm_dir = _resolve_mimic_results_dir("_llavamed") if lm_dir: edir = lm_dir / example_id for prefix, flag_key, json_key in [ ("vllm_logprob", "has_llavamed_logprob", "llavamed_logprob"), ("vllm_gen", "has_llavamed_gen", "llavamed_gen"), ]: summary_path = edir / f"{prefix}_summary.txt" if summary_path.exists(): parsed = parse_vllm_summary(summary_path) if parsed: json_data = load_vllm_result_json(edir, prefix, method=method) apply_method_to_vllm_summary(parsed, json_data, method) data[flag_key] = True data[json_key] = parsed data[f"{json_key}_json"] = json_data data["has_mobius"][json_key] = bool(json_data.get("mobius_dict")) overlay = edir / f"{prefix}_overlay.png" if overlay.exists(): data[json_key]["overlay_path"] = str(overlay) # ── VLM UnSAM (Qwen2-VL + UnSAM segments) ────────────────────── vu_dir = _resolve_mimic_results_dir("_vlm_unsam") if vu_dir: edir = vu_dir / example_id for prefix, flag_key, json_key in [ ("vllm_logprob", "has_vlm_unsam_logprob", "vlm_unsam_logprob"), ("vllm_gen", "has_vlm_unsam_gen", "vlm_unsam_gen"), ]: summary_path = edir / f"{prefix}_summary.txt" if summary_path.exists(): parsed = parse_vllm_summary(summary_path) if parsed: json_data = load_vllm_result_json(edir, prefix, method=method) apply_method_to_vllm_summary(parsed, json_data, method) data[flag_key] = True data[json_key] = parsed data[f"{json_key}_json"] = json_data data["has_mobius"][json_key] = bool(json_data.get("mobius_dict")) overlay = edir / f"{prefix}_overlay.png" if overlay.exists(): data[json_key]["overlay_path"] = str(overlay) segmap = edir / "segmap.png" original = edir / "original.png" if segmap.exists(): data["vlm_unsam_segmap_path"] = str(segmap) if original.exists(): data["vlm_unsam_original_path"] = str(original) # ── LLaVA-Med UnSAM ───────────────────────────────────────────── lu_dir = _resolve_mimic_results_dir("_llavamed_unsam") if lu_dir: edir = lu_dir / example_id for prefix, flag_key, json_key in [ ("vllm_logprob", "has_llavamed_unsam_logprob", "llavamed_unsam_logprob"), ("vllm_gen", "has_llavamed_unsam_gen", "llavamed_unsam_gen"), ]: summary_path = edir / f"{prefix}_summary.txt" if summary_path.exists(): parsed = parse_vllm_summary(summary_path) if parsed: json_data = load_vllm_result_json(edir, prefix, method=method) apply_method_to_vllm_summary(parsed, json_data, method) data[flag_key] = True data[json_key] = parsed data[f"{json_key}_json"] = json_data data["has_mobius"][json_key] = bool(json_data.get("mobius_dict")) overlay = edir / f"{prefix}_overlay.png" if overlay.exists(): data[json_key]["overlay_path"] = str(overlay) segmap = edir / "segmap.png" original = edir / "original.png" if segmap.exists(): data["llavamed_unsam_segmap_path"] = str(segmap) if original.exists(): data["llavamed_unsam_original_path"] = str(original) return data