chore: ignore .claude worktrees

Files changed (8) hide show

.claude/worktrees/strange-agnesi-73641a +1 -0
.gitignore +0 -0
configs/model_config.yaml +10 -3
data/eda_full.ipynb +874 -0
data/eda_p18.ipynb +797 -0
data/eda_reports.ipynb +741 -0
model/cxr_vlm.py +5 -1
model/image_encoder.py +187 -57

.claude/worktrees/strange-agnesi-73641a ADDED Viewed

	@@ -0,0 +1 @@


1	+ Subproject commit 9dadb472ab6ab5dee7a656bf525b249a605a68ff

.gitignore CHANGED Viewed

Binary files a/.gitignore and b/.gitignore differ

configs/model_config.yaml CHANGED Viewed

@@ -3,11 +3,18 @@
 # ─────────────────────────────────────────────
 # ── Vision Encoder ──────────────────────────
 image_encoder:
-  name: "microsoft/BioViL-T"          # BioViL-T from hi-ml-multimodal
   frozen: true                         # freeze encoder during training
-  img_size: 448                        # input image resolution
-  output_dim: 768                      # patch feature dimension
 # ── MLP Projection (Alignment Layer) ────────
 projection:

 # ─────────────────────────────────────────────
 # ── Vision Encoder ──────────────────────────
+# `backend` chooses the underlying model. "auto" tries rad_dino → biovilt → vit
+# in priority order and uses the first one that loads.
+#   - rad_dino : microsoft/rad-dino, chest-X-ray DINOv2 (HF transformers).
+#                Works on Python 3.12, recommended for CXR.
+#   - biovilt  : Microsoft BioViL-T (needs hi-ml-multimodal, Python <3.11).
+#   - vit      : timm ViT-B/16 ImageNet — generic fallback if the above fail.
 image_encoder:
+  name: "microsoft/rad-dino"          # informational; backend below drives loading
+  backend: "auto"                      # "auto" | "rad_dino" | "biovilt" | "vit"
   frozen: true                         # freeze encoder during training
+  img_size: 448                        # input image resolution (RAD-DINO native is 518)
+  output_dim: 768                      # patch feature dimension (768 for all backends)
 # ── MLP Projection (Alignment Layer) ────────
 projection:

data/eda_full.ipynb ADDED Viewed

	@@ -0,0 +1,874 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# EDA — MIMIC-CXR Full Dataset\n",
+    "\n",
+    "**Datasets used:**\n",
+    "- `MIMIC-CXR-JPG` (v2.1.0) — ảnh JPG + CSV metadata\n",
+    "- `MIMIC-CXR` (v2.1.0) — report `.txt` (Findings / Impression)\n",
+    "- `MIMIC-Ext-MIMIC-CXR-VQA` (v1.0.0) — câu hỏi/đáp VQA\n",
+    "\n",
+    "**Scope:** toàn bộ dataset (tất cả subset p10–p19).\n",
+    "\n",
+    "> ℹ️ **Không cần tải ảnh JPG** để chạy notebook này — toàn bộ EDA dựa trên CSV, .txt reports và .json VQA."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 0. Cấu hình đường dẫn"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "from pathlib import Path\n\nDATA_DIR = Path(r\"D:\\USTH\\KLTN\\cxr-vlm-data\")\nCXR_ROOT = DATA_DIR / \"mimic-cxr-reports\"   # files/p10…p19/pXXXXXX/sYYYYYY.txt — toàn bộ dataset\n\nSPLIT_CSV    = DATA_DIR / \"mimic-cxr-2.0.0-split.csv\"\nMETA_CSV     = DATA_DIR / \"mimic-cxr-2.0.0-metadata.csv\"\nCHEXPERT_CSV = DATA_DIR / \"mimic-cxr-2.0.0-chexpert.csv\"\n\n_VQA_DIR = (DATA_DIR\n    / \"mimic-ext-mimic-cxr-vqa-a-complex-diverse-and-large-scale-visual-question-answering-dataset-for-chest-x-ray-images-1.0.0\"\n    / \"MIMIC-Ext-MIMIC-CXR-VQA\"\n    / \"dataset\")\nVQA_TRAIN = _VQA_DIR / \"train.json\"\nVQA_VALID = _VQA_DIR / \"valid.json\"\nVQA_TEST  = _VQA_DIR / \"test.json\"\n\n# None = parse hết toàn bộ (~227k studies, mất 10-20 phút)\n# Số nguyên = sample ngẫu nhiên để chạy nhanh\nREPORT_SAMPLE_SIZE = 10000\n\n# Kiểm tra nhanh\nfor name, p in [(\"SPLIT_CSV\",    SPLIT_CSV),\n                (\"META_CSV\",     META_CSV),\n                (\"CHEXPERT_CSV\", CHEXPERT_CSV),\n                (\"CXR_ROOT\",     CXR_ROOT),\n                (\"VQA_TRAIN\",    VQA_TRAIN)]:\n    status = \"✓\" if p.exists() else \"✗ NOT FOUND\"\n    print(f\"  {status}  {name}: {p}\")\n\nprint(\"\\nPaths configured.\")"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import json\n",
+    "import re\n",
+    "import matplotlib.pyplot as plt\n",
+    "import matplotlib.ticker as mticker\n",
+    "import seaborn as sns\n",
+    "from collections import Counter\n",
+    "\n",
+    "sns.set_theme(style=\"whitegrid\", palette=\"muted\")\n",
+    "plt.rcParams[\"figure.dpi\"] = 120\n",
+    "plt.rcParams[\"figure.figsize\"] = (11, 4)\n",
+    "\n",
+    "CHEXPERT_LABELS = [\n",
+    "    \"Atelectasis\", \"Cardiomegaly\", \"Consolidation\", \"Edema\",\n",
+    "    \"Enlarged Cardiomediastinum\", \"Fracture\", \"Lung Lesion\",\n",
+    "    \"Lung Opacity\", \"No Finding\", \"Pleural Effusion\",\n",
+    "    \"Pleural Other\", \"Pneumonia\", \"Pneumothorax\", \"Support Devices\"\n",
+    "]\n",
+    "\n",
+    "# Subset folders p10–p19\n",
+    "ALL_SUBSETS = [f\"p{i}\" for i in range(10, 20)]\n",
+    "\n",
+    "print(\"Libraries imported.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Load CSV files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "split_df    = pd.read_csv(SPLIT_CSV)\n",
+    "meta_df     = pd.read_csv(META_CSV)\n",
+    "chexpert_df = pd.read_csv(CHEXPERT_CSV)\n",
+    "\n",
+    "# Tạo cột subset folder (p10, p11, ..., p19)\n",
+    "def get_subset(subject_id):\n",
+    "    return \"p\" + str(subject_id)[:2]\n",
+    "\n",
+    "for df_ in [split_df, meta_df, chexpert_df]:\n",
+    "    df_[\"subset\"] = df_[\"subject_id\"].astype(str).str[:2].apply(lambda x: f\"p{x}\")\n",
+    "\n",
+    "print(f\"split.csv   — total images  : {len(split_df):,}\")\n",
+    "print(f\"metadata    — total images  : {len(meta_df):,}\")\n",
+    "print(f\"chexpert    — total studies : {len(chexpert_df):,}\")\n",
+    "print(f\"\\nSubsets found in split.csv:\")\n",
+    "print(split_df[\"subset\"].value_counts().sort_index().to_string())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Merge split + metadata\n",
+    "df = split_df.merge(\n",
+    "    meta_df[[\"dicom_id\", \"ViewPosition\", \"Rows\", \"Columns\"]],\n",
+    "    on=\"dicom_id\", how=\"left\"\n",
+    ")\n",
+    "# Giữ lại cột subset từ split_df\n",
+    "if \"subset_y\" in df.columns:\n",
+    "    df = df.drop(columns=[\"subset_y\"]).rename(columns={\"subset_x\": \"subset\"})\n",
+    "\n",
+    "print(f\"Merged shape: {df.shape}\")\n",
+    "df.head(3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Tổng quan: số ảnh & report theo split + subset folder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tổng theo split\n",
+    "img_per_split   = df[\"split\"].value_counts().reindex([\"train\",\"validate\",\"test\"])\n",
+    "study_per_split = (\n",
+    "    df.drop_duplicates(\"study_id\")[\"split\"]\n",
+    "    .value_counts().reindex([\"train\",\"validate\",\"test\"])\n",
+    ")\n",
+    "\n",
+    "summary_total = pd.DataFrame({\n",
+    "    \"Images\": img_per_split,\n",
+    "    \"Studies/Reports\": study_per_split\n",
+    "})\n",
+    "summary_total.loc[\"TOTAL\"] = summary_total.sum()\n",
+    "print(\"=== Overall split summary ===\")\n",
+    "print(summary_total.to_string())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── Breakdown theo từng subset folder ────────────────────────────────────────\n",
+    "img_subset_split = (\n",
+    "    df.groupby([\"subset\", \"split\"])[\"dicom_id\"]\n",
+    "    .count()\n",
+    "    .unstack(fill_value=0)\n",
+    "    .reindex(columns=[\"train\",\"validate\",\"test\"], fill_value=0)\n",
+    "    .reindex(ALL_SUBSETS, fill_value=0)\n",
+    ")\n",
+    "img_subset_split[\"TOTAL\"] = img_subset_split.sum(axis=1)\n",
+    "\n",
+    "study_subset_split = (\n",
+    "    df.drop_duplicates(\"study_id\")\n",
+    "    .groupby([\"subset\", \"split\"])[\"study_id\"]\n",
+    "    .count()\n",
+    "    .unstack(fill_value=0)\n",
+    "    .reindex(columns=[\"train\",\"validate\",\"test\"], fill_value=0)\n",
+    "    .reindex(ALL_SUBSETS, fill_value=0)\n",
+    ")\n",
+    "study_subset_split[\"TOTAL\"] = study_subset_split.sum(axis=1)\n",
+    "\n",
+    "print(\"=== Images per subset × split ===\")\n",
+    "print(img_subset_split.to_string())\n",
+    "print(\"\\n=== Studies/Reports per subset × split ===\")\n",
+    "print(study_subset_split.to_string())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, axes = plt.subplots(1, 2, figsize=(15, 5))\n",
+    "palette = {\"train\": \"#4C72B0\", \"validate\": \"#DD8452\", \"test\": \"#55A868\"}\n",
+    "\n",
+    "for ax, data, title in zip(\n",
+    "    axes,\n",
+    "    [img_subset_split[[\"train\",\"validate\",\"test\"]], study_subset_split[[\"train\",\"validate\",\"test\"]]],\n",
+    "    [\"Số ảnh theo subset × split\", \"Số study/report theo subset × split\"]\n",
+    "):\n",
+    "    data.plot(kind=\"bar\", ax=ax, color=[palette[c] for c in data.columns], width=0.75)\n",
+    "    ax.set_title(title, fontsize=12)\n",
+    "    ax.set_xlabel(\"Subset folder\")\n",
+    "    ax.set_ylabel(\"Count\")\n",
+    "    ax.tick_params(axis=\"x\", rotation=0)\n",
+    "    ax.legend(title=\"Split\")\n",
+    "\n",
+    "plt.suptitle(\"MIMIC-CXR Full Dataset — Split × Subset\", fontsize=14, y=1.02)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Heatmap: tỉ lệ % train/val/test trong mỗi subset\n",
+    "img_pct = img_subset_split[[\"train\",\"validate\",\"test\"]].div(\n",
+    "    img_subset_split[\"TOTAL\"], axis=0\n",
+    ") * 100\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(8, 5))\n",
+    "sns.heatmap(\n",
+    "    img_pct.round(1), annot=True, fmt=\".1f\", cmap=\"YlGnBu\",\n",
+    "    linewidths=0.5, ax=ax, cbar_kws={\"label\": \"%\"}\n",
+    ")\n",
+    "ax.set_title(\"Tỉ lệ (%) train/val/test trong mỗi subset folder\")\n",
+    "ax.set_xlabel(\"Split\")\n",
+    "ax.set_ylabel(\"Subset\")\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Số ảnh mỗi study"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "imgs_per_study = df.groupby(\"study_id\")[\"dicom_id\"].count()\n",
+    "count_dist     = imgs_per_study.value_counts().sort_index()\n",
+    "\n",
+    "print(\"Images per study distribution:\")\n",
+    "print(count_dist.to_string())\n",
+    "print(f\"\\nMax : {imgs_per_study.max()}\")\n",
+    "print(f\"Mean: {imgs_per_study.mean():.2f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
+    "\n",
+    "# Full distribution\n",
+    "axes[0].bar(count_dist.index.astype(str), count_dist.values,\n",
+    "            color=sns.color_palette(\"Blues_d\", len(count_dist)))\n",
+    "axes[0].set_title(\"Số ảnh mỗi study (toàn bộ)\")\n",
+    "axes[0].set_xlabel(\"Số ảnh trong study\")\n",
+    "axes[0].set_ylabel(\"Số study\")\n",
+    "for x, v in zip(count_dist.index, count_dist.values):\n",
+    "    axes[0].text(str(x), v * 1.01, f\"{v:,}\", ha=\"center\", va=\"bottom\", fontsize=8)\n",
+    "\n",
+    "# Per-subset: mean images per study\n",
+    "mean_imgs = df.groupby(\"subset\").apply(\n",
+    "    lambda g: g.groupby(\"study_id\")[\"dicom_id\"].count().mean()\n",
+    ").reindex(ALL_SUBSETS)\n",
+    "axes[1].bar(mean_imgs.index, mean_imgs.values, color=\"steelblue\")\n",
+    "axes[1].set_title(\"Trung bình số ảnh/study theo subset\")\n",
+    "axes[1].set_xlabel(\"Subset\")\n",
+    "axes[1].set_ylabel(\"Mean images/study\")\n",
+    "axes[1].set_ylim(0, mean_imgs.max() * 1.2)\n",
+    "for x, v in zip(mean_imgs.index, mean_imgs.values):\n",
+    "    axes[1].text(x, v * 1.01, f\"{v:.2f}\", ha=\"center\", va=\"bottom\", fontsize=9)\n",
+    "\n",
+    "plt.suptitle(\"Images per Study Distribution\", fontsize=13)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. View Position"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "view_counts = df[\"ViewPosition\"].fillna(\"Unknown\").value_counts()\n",
+    "print(\"View position counts (total):\")\n",
+    "print(view_counts.to_string())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, axes = plt.subplots(1, 2, figsize=(13, 4))\n",
+    "\n",
+    "bars = axes[0].bar(view_counts.index, view_counts.values,\n",
+    "                   color=sns.color_palette(\"Set2\", len(view_counts)))\n",
+    "axes[0].bar_label(bars, fmt=\"%d\")\n",
+    "axes[0].set_title(\"Số ảnh theo View Position\")\n",
+    "axes[0].set_ylabel(\"Count\")\n",
+    "\n",
+    "axes[1].pie(view_counts.values, labels=view_counts.index, autopct=\"%1.1f%%\",\n",
+    "            colors=sns.color_palette(\"Set2\", len(view_counts)))\n",
+    "axes[1].set_title(\"Tỉ lệ View Position\")\n",
+    "\n",
+    "plt.suptitle(\"View Position Distribution — Full Dataset\", fontsize=13)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# View per subset\n",
+    "view_subset = (\n",
+    "    df.fillna({\"ViewPosition\": \"Unknown\"})\n",
+    "    .groupby([\"subset\", \"ViewPosition\"])[\"dicom_id\"]\n",
+    "    .count()\n",
+    "    .unstack(fill_value=0)\n",
+    "    .reindex(ALL_SUBSETS, fill_value=0)\n",
+    ")\n",
+    "\n",
+    "view_subset.plot(kind=\"bar\", figsize=(14, 4),\n",
+    "                 color=sns.color_palette(\"Set2\", view_subset.shape[1]),\n",
+    "                 width=0.8)\n",
+    "plt.title(\"View Position theo subset folder\")\n",
+    "plt.xlabel(\"Subset\")\n",
+    "plt.ylabel(\"Count\")\n",
+    "plt.xticks(rotation=0)\n",
+    "plt.legend(title=\"ViewPosition\", bbox_to_anchor=(1.01, 1), loc=\"upper left\")\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# View split breakdown\n",
+    "view_split = df.groupby([\"split\", \"ViewPosition\"]).size().unstack(fill_value=0)\n",
+    "view_split = view_split.reindex([\"train\",\"validate\",\"test\"])\n",
+    "view_split.plot(kind=\"bar\", figsize=(10, 4),\n",
+    "                color=sns.color_palette(\"Set2\", view_split.shape[1]))\n",
+    "plt.title(\"View Position theo split\")\n",
+    "plt.xlabel(\"Split\")\n",
+    "plt.xticks(rotation=0)\n",
+    "plt.legend(title=\"ViewPosition\")\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8845a29f",
+   "source": "## 4b. Frontal-Only Sampling Strategy (AP > PA)\n\nChiến lược train: **1 report + 1 ảnh frontal** mỗi study.\n- Chỉ giữ AP hoặc PA; nếu study có cả hai thì **ưu tiên AP**.\n- Study không có ảnh frontal nào → loại khỏi tập train.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "22a327eb",
+   "source": "frontal = df[df[\"ViewPosition\"].isin([\"AP\", \"PA\"])].copy()\n\ndef pick_frontal_view(group):\n    ap = group[group[\"ViewPosition\"] == \"AP\"]\n    if len(ap) > 0:\n        return ap.iloc[[0]]\n    return group[group[\"ViewPosition\"] == \"PA\"].iloc[[0]]\n\nfrontal_1img = (\n    frontal.groupby(\"study_id\", group_keys=False)\n    .apply(pick_frontal_view)\n    .reset_index(drop=True)\n)\n\nn_study_total    = df[\"study_id\"].nunique()\nn_study_frontal  = frontal_1img[\"study_id\"].nunique()\nn_study_no_front = n_study_total - n_study_frontal\n\nprint(\"=== Frontal-Only Sampling (Full Dataset) ===\")\nprint(f\"Tổng số study                   : {n_study_total:,}\")\nprint(f\"Study có ảnh frontal (AP/PA)    : {n_study_frontal:,}  ({n_study_frontal/n_study_total*100:.1f}%)\")\nprint(f\"Study bị loại (không có frontal): {n_study_no_front:,}  ({n_study_no_front/n_study_total*100:.1f}%)\")\nprint()\nprint(\"Ảnh được chọn theo view:\")\nprint(frontal_1img[\"ViewPosition\"].value_counts().to_string())\nprint()\n\nsplit_frontal = frontal_1img[\"split\"].value_counts().reindex([\"train\", \"validate\", \"test\"])\nsplit_all     = df.drop_duplicates(\"study_id\")[\"split\"].value_counts().reindex([\"train\", \"validate\", \"test\"])\ncompare = pd.DataFrame({\n    \"All studies\": split_all,\n    \"Frontal-only\": split_frontal,\n    \"Giảm (%)\": ((split_all - split_frontal) / split_all * 100).round(1)\n})\nprint(\"=== Mẫu train sau khi filter (split) ===\")\nprint(compare.to_string())",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "712ff838",
+   "source": "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n\n# 1. All vs Frontal-only\nbars = axes[0].bar([\"All studies\", \"Frontal-only\"],\n                   [n_study_total, n_study_frontal],\n                   color=[\"#4C72B0\", \"#55A868\"], width=0.5)\naxes[0].bar_label(bars, fmt=\"%d\")\naxes[0].set_title(\"Study count: All vs Frontal-only\")\naxes[0].set_ylabel(\"Số study\")\n\n# 2. Pie: view được chọn\nvc = frontal_1img[\"ViewPosition\"].value_counts()\naxes[1].pie(vc.values, labels=vc.index, autopct=\"%1.1f%%\",\n            colors=[\"#4C72B0\", \"#DD8452\"])\naxes[1].set_title(\"View được chọn (AP ưu tiên)\")\n\n# 3. Per-split comparison\nx = np.arange(3)\nw = 0.35\naxes[2].bar(x - w/2, split_all.values,     w, label=\"All\",          color=\"#4C72B0\", alpha=0.85)\naxes[2].bar(x + w/2, split_frontal.values, w, label=\"Frontal-only\", color=\"#55A868\", alpha=0.85)\naxes[2].set_xticks(x)\naxes[2].set_xticklabels([\"train\", \"validate\", \"test\"])\naxes[2].set_title(\"Frontal-only vs All (per split)\")\naxes[2].set_ylabel(\"Số study\")\naxes[2].legend()\n\nplt.suptitle(\"Frontal-Only Sampling Strategy — Full Dataset\", fontsize=13)\nplt.tight_layout()\nplt.show()\n\n# 4. Frontal-only per subset\nfrontal_subset = (\n    frontal_1img.groupby([\"subset\", \"split\"]).size()\n    .unstack(fill_value=0)\n    .reindex(columns=[\"train\", \"validate\", \"test\"], fill_value=0)\n    .reindex(ALL_SUBSETS, fill_value=0)\n)\nfrontal_subset[\"TOTAL\"] = frontal_subset.sum(axis=1)\nprint(\"Frontal-only samples per subset:\")\nprint(frontal_subset.to_string())\n\nfrontal_subset[[\"train\",\"validate\",\"test\"]].plot(\n    kind=\"bar\", figsize=(13, 4),\n    color=[\"#4C72B0\", \"#DD8452\", \"#55A868\"], width=0.75\n)\nplt.title(\"Frontal-Only samples theo subset × split\")\nplt.xlabel(\"Subset\")\nplt.ylabel(\"Số study\")\nplt.xticks(rotation=0)\nplt.legend(title=\"Split\")\nplt.tight_layout()\nplt.show()",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. CheXpert Labels — 14 nhãn bệnh lý"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "label_cols = [c for c in chexpert_df.columns if c in CHEXPERT_LABELS]\n",
+    "\n",
+    "positive_counts  = (chexpert_df[label_cols] == 1).sum().sort_values(ascending=False)\n",
+    "uncertain_counts = (chexpert_df[label_cols] == -1).sum()\n",
+    "negative_counts  = (chexpert_df[label_cols] == 0).sum()\n",
+    "\n",
+    "label_summary = pd.DataFrame({\n",
+    "    \"Positive\":      positive_counts,\n",
+    "    \"Uncertain\":     uncertain_counts,\n",
+    "    \"Negative\":      negative_counts,\n",
+    "    \"Not Mentioned\": chexpert_df[label_cols].isna().sum()\n",
+    "})\n",
+    "label_summary[\"Positive %\"] = (label_summary[\"Positive\"] / len(chexpert_df) * 100).round(1)\n",
+    "print(label_summary.sort_values(\"Positive\", ascending=False).to_string())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ordered_labels = label_summary.sort_values(\"Positive\", ascending=False).index.tolist()\n",
+    "x  = np.arange(len(ordered_labels))\n",
+    "w  = 0.25\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(14, 5))\n",
+    "ax.bar(x - w, label_summary.loc[ordered_labels, \"Positive\"],  w, label=\"Positive\",  color=\"#e74c3c\")\n",
+    "ax.bar(x,     label_summary.loc[ordered_labels, \"Uncertain\"], w, label=\"Uncertain\", color=\"#f39c12\")\n",
+    "ax.bar(x + w, label_summary.loc[ordered_labels, \"Negative\"],  w, label=\"Negative\",  color=\"#2ecc71\")\n",
+    "ax.set_xticks(x)\n",
+    "ax.set_xticklabels(ordered_labels, rotation=40, ha=\"right\", fontsize=9)\n",
+    "ax.set_ylabel(\"Số study\")\n",
+    "ax.set_title(\"CheXpert Labels — Positive / Uncertain / Negative (Full Dataset)\")\n",
+    "ax.legend()\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "ADMIN_HEADERS = {\n    'EXAMINATION', 'INDICATION', 'CLINICAL INDICATION', 'TECHNIQUE',\n    'COMPARISON', 'HISTORY', 'REASON', 'REASON FOR EXAM',\n    'REASON FOR EXAMINATION', 'PROCEDURE', 'FINAL REPORT',\n    'NOTIFICATION', 'RECOMMENDATION', 'ADDENDUM'\n}\n\nSECTION_RE = re.compile(r'^[ \\t]*([A-Z][A-Z ,/()\\-]{1,70}?):\\s*', re.MULTILINE)\n\ndef parse_report(txt_path: Path) -> dict:\n    \"\"\"\n    Quy luật detect section: mọi header đều VIẾT HOA TOÀN BỘ và kết thúc bằng ':'.\n    Fallback: nếu không có FINDINGS tường minh, lấy section descriptive đầu tiên.\n    \"\"\"\n    try:\n        text = txt_path.read_text(encoding=\"utf-8\", errors=\"ignore\")\n    except FileNotFoundError:\n        return {\"findings\": None, \"impression\": None}\n\n    matches = list(SECTION_RE.finditer(text))\n    if not matches:\n        return {\"findings\": None, \"impression\": None}\n\n    sections = []\n    for i, m in enumerate(matches):\n        header  = m.group(1).strip()\n        start   = m.end()\n        end     = matches[i + 1].start() if i + 1 < len(matches) else len(text)\n        content = text[start:end].strip()\n        sections.append((header, content))\n\n    findings = impression = None\n    for header, content in sections:\n        h = header.upper()\n        if \"FINDING\" in h and findings is None:\n            findings = content or None\n        elif \"IMPRESSION\" in h and impression is None:\n            impression = content or None\n\n    if findings is None:\n        for header, content in sections:\n            h = header.upper()\n            if h not in ADMIN_HEADERS and \"IMPRESSION\" not in h and content:\n                findings = content\n                break\n\n    return {\"findings\": findings, \"impression\": impression}\n\n\nall_studies = (\n    df[[\"subject_id\", \"study_id\", \"subset\"]]\n    .drop_duplicates(\"study_id\")\n    .reset_index(drop=True)\n)\n\nif REPORT_SAMPLE_SIZE is not None:\n    parse_studies = 
all_studies.sample(\n        n=min(REPORT_SAMPLE_SIZE, len(all_studies)), random_state=42\n    ).reset_index(drop=True)\n    print(f\"Sample {len(parse_studies):,} / {len(all_studies):,} studies\")\nelse:\n    parse_studies = all_studies\n    print(f\"Parsing ALL {len(parse_studies):,} studies...\")\n\nrecords = []\nfor _, row in parse_studies.iterrows():\n    sid  = str(row[\"subject_id\"])\n    stid = str(row[\"study_id\"])\n    sub  = row[\"subset\"]\n    txt_path = CXR_ROOT / \"files\" / sub / f\"p{sid}\" / f\"s{stid}.txt\"\n    records.append({\"study_id\": stid, \"subset\": sub, **parse_report(txt_path)})\n\nreport_df = pd.DataFrame(records)\nreport_df[\"findings_len\"]   = report_df[\"findings\"].str.split().str.len()\nreport_df[\"impression_len\"] = report_df[\"impression\"].str.split().str.len()\n\ntotal = len(report_df)\nprint(f\"\\nFindings   found : {report_df['findings'].notna().sum():,} / {total:,}  ({report_df['findings'].notna().mean()*100:.1f}%)\")\nprint(f\"Impression found : {report_df['impression'].notna().sum():,} / {total:,}  ({report_df['impression'].notna().mean()*100:.1f}%)\")\nboth    = (report_df['findings'].notna() & report_df['impression'].notna()).sum()\nneither = (report_df['findings'].isna()  & report_df['impression'].isna()).sum()\nprint(f\"Cả hai           : {both:,} / {total:,}  ({both/total*100:.1f}%)\")\nprint(f\"Không có cả hai  : {neither:,} / {total:,}  ({neither/total*100:.1f}%)\")"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Số nhãn positive mỗi study\n",
+    "labels_per_study = (chexpert_df[label_cols] == 1).sum(axis=1)\n",
+    "lps_counts = labels_per_study.value_counts().sort_index()\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(9, 4))\n",
+    "ax.bar(lps_counts.index.astype(str), lps_counts.values,\n",
+    "       color=sns.color_palette(\"Blues_d\", len(lps_counts)))\n",
+    "ax.set_xlabel(\"Số nhãn positive\")\n",
+    "ax.set_ylabel(\"Số study\")\n",
+    "ax.set_title(\"Phân bố số nhãn positive mỗi study (Full Dataset)\")\n",
+    "for x_, v in zip(lps_counts.index, lps_counts.values):\n",
+    "    ax.text(str(x_), v * 1.01, f\"{v:,}\", ha=\"center\", va=\"bottom\", fontsize=8)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
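+   "source": "## 6. Phân tích Report — Findings & Impression\n\n> ℹ️ Report parsing chỉ hoạt động với subset **đã tải về**. Report của các subset chưa tải sẽ không gây lỗi: `findings` và `impression` được ghi là `None` và study đó vẫn được tính vào tổng số khi thống kê."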
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "def parse_report(txt_path: Path) -> dict:\n    try:\n        text = txt_path.read_text(encoding=\"utf-8\", errors=\"ignore\")\n    except FileNotFoundError:\n        return {\"findings\": None, \"impression\": None}\n\n    text = re.sub(r\"[\\r\\n]+\", \" \", text)\n\n    def extract_section(pattern, text):\n        m = re.search(pattern, text, re.IGNORECASE)\n        if not m:\n            return None\n        start = m.end()\n        nxt = re.search(\n            r\"(IMPRESSION|FINDINGS|CONCLUSION|RECOMMENDATION|NOTIFICATION)\",\n            text[start:], re.IGNORECASE\n        )\n        end = start + nxt.start() if nxt else len(text)\n        return text[start:end].strip()\n\n    return {\n        \"findings\":   extract_section(r\"FINDINGS\\s*:\", text),\n        \"impression\": extract_section(r\"IMPRESSION\\s*:\", text)\n    }\n\n\nall_studies = (\n    df[[\"subject_id\", \"study_id\", \"subset\"]]\n    .drop_duplicates(\"study_id\")\n    .reset_index(drop=True)\n)\n\nif REPORT_SAMPLE_SIZE is not None:\n    parse_studies = all_studies.sample(\n        n=min(REPORT_SAMPLE_SIZE, len(all_studies)), random_state=42\n    ).reset_index(drop=True)\n    print(f\"Sample {len(parse_studies):,} / {len(all_studies):,} studies\")\nelse:\n    parse_studies = all_studies\n    print(f\"Parsing ALL {len(parse_studies):,} studies... 
(có thể mất 10-20 phút)\")\n\nrecords = []\nfor _, row in parse_studies.iterrows():\n    sid  = str(row[\"subject_id\"])\n    stid = str(row[\"study_id\"])\n    sub  = row[\"subset\"]\n    txt_path = CXR_ROOT / \"files\" / sub / f\"p{sid}\" / f\"s{stid}.txt\"\n    records.append({\"study_id\": stid, \"subset\": sub, **parse_report(txt_path)})\n\nreport_df = pd.DataFrame(records)\nreport_df[\"findings_len\"]   = report_df[\"findings\"].str.split().str.len()\nreport_df[\"impression_len\"] = report_df[\"impression\"].str.split().str.len()\n\nprint(f\"Findings   found : {report_df['findings'].notna().sum():,} / {len(report_df):,}\")\nprint(f\"Impression found : {report_df['impression'].notna().sum():,} / {len(report_df):,}\")"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"=== Findings word count ===\")\n",
+    "print(report_df[\"findings_len\"].describe().round(1).to_string())\n",
+    "print(\"\\n=== Impression word count ===\")\n",
+    "print(report_df[\"impression_len\"].describe().round(1).to_string())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
+    "\n",
+    "for ax, col, title, color in zip(\n",
+    "    axes,\n",
+    "    [\"findings_len\", \"impression_len\"],\n",
+    "    [\"Findings — độ dài (số từ)\", \"Impression — độ dài (số từ)\"],\n",
+    "    [\"steelblue\", \"tomato\"]\n",
+    "):\n",
+    "    data = report_df[col].dropna()\n",
+    "    p99  = data.quantile(0.99)\n",
+    "    ax.hist(data[data <= p99], bins=50, color=color, edgecolor=\"white\", alpha=0.85)\n",
+    "    ax.axvline(data.median(), color=\"black\",  ls=\"--\", lw=1.3, label=f\"Median={data.median():.0f}\")\n",
+    "    ax.axvline(data.mean(),   color=\"gray\",   ls=\":\",  lw=1.3, label=f\"Mean={data.mean():.0f}\")\n",
+    "    ax.set_title(title)\n",
+    "    ax.set_xlabel(\"Số từ\")\n",
+    "    ax.set_ylabel(\"Số report\")\n",
+    "    ax.legend(fontsize=9)\n",
+    "    ax.text(0.97, 0.95, f\"n={len(data):,}\\n(≤p99={p99:.0f}w)\",\n",
+    "            transform=ax.transAxes, ha=\"right\", va=\"top\", fontsize=8, color=\"gray\")\n",
+    "\n",
+    "plt.suptitle(\"Phân bố độ dài report — Full Dataset\", fontsize=13)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Box plot Findings vs Impression\n",
+    "combined = pd.DataFrame({\n",
+    "    \"word_count\": pd.concat([report_df[\"findings_len\"], report_df[\"impression_len\"]], ignore_index=True),\n",
+    "    \"section\":    [\"Findings\"] * len(report_df) + [\"Impression\"] * len(report_df)\n",
+    "}).dropna()\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(7, 4))\n",
+    "sns.boxplot(data=combined, x=\"section\", y=\"word_count\",\n",
+    "            palette=[\"steelblue\", \"tomato\"], showfliers=False, ax=ax)\n",
+    "ax.set_title(\"Findings vs Impression — độ dài (no outliers)\")\n",
+    "ax.set_ylabel(\"Số từ\")\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Median report length theo subset\n",
+    "rep_by_subset = report_df.groupby(\"subset\")[[\"findings_len\",\"impression_len\"]].median().reindex(ALL_SUBSETS)\n",
+    "\n",
+    "rep_by_subset.plot(kind=\"bar\", figsize=(12, 4),\n",
+    "                   color=[\"steelblue\", \"tomato\"], width=0.7)\n",
+    "plt.title(\"Median độ dài Findings & Impression theo subset\")\n",
+    "plt.xlabel(\"Subset\")\n",
+    "plt.ylabel(\"Median số từ\")\n",
+    "plt.xticks(rotation=0)\n",
+    "plt.legend([\"Findings\", \"Impression\"])\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 7. VQA — phân tích câu hỏi & đáp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vqa_dfs = []\n",
+    "for fpath, sname in [(VQA_TRAIN, \"train\"), (VQA_VALID, \"valid\"), (VQA_TEST, \"test\")]:\n",
+    "    if fpath.exists():\n",
+    "        with open(fpath, encoding=\"utf-8\") as f:\n",
+    "            data = json.load(f)\n",
+    "        tmp = pd.DataFrame(data)\n",
+    "        tmp[\"split\"] = sname\n",
+    "        vqa_dfs.append(tmp)\n",
+    "    else:\n",
+    "        print(f\"[WARNING] Not found: {fpath}\")\n",
+    "\n",
+    "vqa_all = pd.concat(vqa_dfs, ignore_index=True)\n",
+    "vqa_all[\"subset\"] = \"p\" + vqa_all[\"subject_id\"].astype(str).str[:2]\n",
+    "\n",
+    "print(f\"VQA total: {len(vqa_all):,}\")\n",
+    "print(f\"\\nPer split:\")\n",
+    "print(vqa_all[\"split\"].value_counts().to_string())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# VQA per subset × split\n",
+    "vqa_subset_split = (\n",
+    "    vqa_all.groupby([\"subset\", \"split\"]).size()\n",
+    "    .unstack(fill_value=0)\n",
+    "    .reindex(columns=[\"train\",\"valid\",\"test\"], fill_value=0)\n",
+    "    .reindex(ALL_SUBSETS, fill_value=0)\n",
+    ")\n",
+    "vqa_subset_split[\"TOTAL\"] = vqa_subset_split.sum(axis=1)\n",
+    "print(\"VQA samples per subset × split:\")\n",
+    "print(vqa_subset_split.to_string())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vqa_subset_split[[\"train\",\"valid\",\"test\"]].plot(\n",
+    "    kind=\"bar\", figsize=(13, 4),\n",
+    "    color=[palette[\"train\"], palette[\"validate\"], palette[\"test\"]],\n",
+    "    width=0.75\n",
+    ")\n",
+    "plt.title(\"VQA samples theo subset × split\")\n",
+    "plt.xlabel(\"Subset\")\n",
+    "plt.ylabel(\"Count\")\n",
+    "plt.xticks(rotation=0)\n",
+    "plt.legend(title=\"Split\")\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "63f3247e",
+   "source": "### VQA × View Position — mẫu hỏi đáp thuộc ảnh view nào",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "d5e6a532",
+   "source": "# image_id trong VQA = dicom_id trong metadata\nvqa_view = vqa_all.merge(\n    meta_df[[\"dicom_id\", \"ViewPosition\"]],\n    left_on=\"image_id\", right_on=\"dicom_id\",\n    how=\"left\"\n)\n\nmissing_view_vqa = vqa_view[\"ViewPosition\"].isna().sum()\nvqa_view[\"ViewPosition\"] = vqa_view[\"ViewPosition\"].fillna(\"Unknown\")\n\nview_vqa_counts = vqa_view[\"ViewPosition\"].value_counts()\nprint(\"=== VQA samples theo View Position (Full Dataset) ===\")\nprint(view_vqa_counts.to_string())\nprint(f\"\\nKhông map được ViewPosition: {missing_view_vqa:,} ({missing_view_vqa/len(vqa_view)*100:.1f}%)\")\n\nfig, axes = plt.subplots(1, 3, figsize=(15, 4))\n\n# 1. Bar\nbars = axes[0].bar(view_vqa_counts.index, view_vqa_counts.values,\n                   color=sns.color_palette(\"Set2\", len(view_vqa_counts)))\naxes[0].bar_label(bars, fmt=\"%d\")\naxes[0].set_title(\"Số mẫu VQA theo View Position\")\naxes[0].set_ylabel(\"Số mẫu\")\n\n# 2. Pie\naxes[1].pie(view_vqa_counts.values, labels=view_vqa_counts.index,\n            autopct=\"%1.1f%%\", colors=sns.color_palette(\"Set2\", len(view_vqa_counts)))\naxes[1].set_title(\"Tỉ lệ VQA theo View Position\")\n\n# 3. 
Semantic type × View (stacked bar)\nsem_view = vqa_view.groupby([\"ViewPosition\", \"semantic_type\"]).size().unstack(fill_value=0)\nsem_view.plot(kind=\"bar\", ax=axes[2], color=sns.color_palette(\"Set1\", sem_view.shape[1]),\n              width=0.7, stacked=True)\naxes[2].set_title(\"Semantic Type × View Position\")\naxes[2].set_xlabel(\"View Position\")\naxes[2].set_ylabel(\"Số mẫu\")\naxes[2].tick_params(axis=\"x\", rotation=30)\naxes[2].legend(title=\"Semantic Type\", fontsize=8)\n\nplt.suptitle(\"VQA × View Position — Full Dataset\", fontsize=13)\nplt.tight_layout()\nplt.show()\n\n# Content type × View (heatmap)\ncontent_view = (vqa_view.groupby([\"ViewPosition\", \"content_type\"]).size()\n                .unstack(fill_value=0))\nprint(\"\\nContent type theo View Position:\")\nprint(content_view.to_string())\n\nfig, ax = plt.subplots(figsize=(11, 4))\nsns.heatmap(content_view, annot=True, fmt=\"d\", cmap=\"YlGnBu\",\n            linewidths=0.4, ax=ax)\nax.set_title(\"VQA — Content Type × View Position\")\nplt.tight_layout()\nplt.show()",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Semantic type & Content type\n",
+    "sem_counts = vqa_all[\"semantic_type\"].value_counts()\n",
+    "con_counts = vqa_all[\"content_type\"].value_counts()\n",
+    "\n",
+    "fig, axes = plt.subplots(1, 2, figsize=(13, 4))\n",
+    "for ax, counts, title in zip(\n",
+    "    axes,\n",
+    "    [sem_counts, con_counts],\n",
+    "    [\"VQA — Semantic Type\", \"VQA — Content Type\"]\n",
+    "):\n",
+    "    bars = ax.bar(counts.index, counts.values,\n",
+    "                  color=sns.color_palette(\"Set2\", len(counts)))\n",
+    "    ax.bar_label(bars, fmt=\"%d\")\n",
+    "    ax.set_title(title)\n",
+    "    ax.set_ylabel(\"Count\")\n",
+    "    ax.tick_params(axis=\"x\", rotation=30)\n",
+    "\n",
+    "plt.suptitle(\"VQA Question Analysis — Full Dataset\", fontsize=13)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cross-tab semantic × content\n",
+    "cross = pd.crosstab(vqa_all[\"semantic_type\"], vqa_all[\"content_type\"])\n",
+    "fig, ax = plt.subplots(figsize=(10, 3))\n",
+    "sns.heatmap(cross, annot=True, fmt=\"d\", cmap=\"YlOrRd\", ax=ax)\n",
+    "ax.set_title(\"VQA — Semantic Type × Content Type\")\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Answer type\n",
+    "def classify_answer(ans_list):\n",
+    "    if not isinstance(ans_list, list) or len(ans_list) == 0:\n",
+    "        return \"no answer\"\n",
+    "    a = ans_list[0].strip().lower()\n",
+    "    return a if a in [\"yes\", \"no\"] else \"open\"\n",
+    "\n",
+    "vqa_all[\"ans_type\"] = vqa_all[\"answer\"].apply(classify_answer)\n",
+    "ans_counts = vqa_all[\"ans_type\"].value_counts()\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(6, 3))\n",
+    "bars = ax.bar(ans_counts.index, ans_counts.values,\n",
+    "              color=sns.color_palette(\"Pastel1\", len(ans_counts)))\n",
+    "ax.bar_label(bars, fmt=\"%d\")\n",
+    "ax.set_title(\"VQA — Answer Type (Full Dataset)\")\n",
+    "ax.set_ylabel(\"Count\")\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 8. Data Quality & Missing Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Missing ViewPosition\n",
+    "missing_view = df[\"ViewPosition\"].isna().sum()\n",
+    "print(f\"Ảnh thiếu ViewPosition: {missing_view:,} / {len(df):,} ({missing_view/len(df)*100:.2f}%)\")\n",
+    "\n",
+    "# Missing view per subset\n",
+    "mv_subset = df[df[\"ViewPosition\"].isna()].groupby(\"subset\").size().reindex(ALL_SUBSETS, fill_value=0)\n",
+    "print(\"\\nMissing ViewPosition per subset:\")\n",
+    "print(mv_subset.to_string())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Missing findings/impression (từ sample)\n",
+    "no_findings   = report_df[\"findings\"].isna().sum()\n",
+    "no_impression = report_df[\"impression\"].isna().sum()\n",
+    "n = len(report_df)\n",
+    "print(f\"Reports thiếu Findings   : {no_findings:,}/{n:,} ({no_findings/n*100:.1f}%)\")\n",
+    "print(f\"Reports thiếu Impression : {no_impression:,}/{n:,} ({no_impression/n*100:.1f}%)\")\n",
+    "print(f\"Reports thiếu CẢ HAI     : {(report_df['findings'].isna() & report_df['impression'].isna()).sum():,}/{n:,}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Bệnh nhân / study / ảnh tổng quan\n",
+    "n_subjects = df[\"subject_id\"].nunique()\n",
+    "n_studies  = df[\"study_id\"].nunique()\n",
+    "n_images   = df[\"dicom_id\"].nunique()\n",
+    "\n",
+    "print(f\"Bệnh nhân : {n_subjects:,}\")\n",
+    "print(f\"Studies   : {n_studies:,}\")\n",
+    "print(f\"Ảnh       : {n_images:,}\")\n",
+    "print(f\"Trung bình study/patient : {n_studies/n_subjects:.2f}\")\n",
+    "print(f\"Trung bình ảnh/patient   : {n_images/n_subjects:.2f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Study per patient distribution\n",
+    "spp = df.groupby(\"subject_id\")[\"study_id\"].nunique()\n",
+    "print(\"Studies per patient:\")\n",
+    "print(spp.describe().round(1).to_string())\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(10, 4))\n",
+    "spp_vc = spp.value_counts().sort_index()\n",
+    "# clip tails\n",
+    "spp_vc_clip = spp_vc[spp_vc.index <= spp.quantile(0.99)]\n",
+    "ax.bar(spp_vc_clip.index.astype(str), spp_vc_clip.values, color=\"mediumpurple\")\n",
+    "ax.set_xlabel(\"Số study mỗi bệnh nhân\")\n",
+    "ax.set_ylabel(\"Số bệnh nhân\")\n",
+    "ax.set_title(\"Phân bố số lần khám mỗi bệnh nhân (≤p99)\")\n",
+    "ax.xaxis.set_major_locator(mticker.MaxNLocator(integer=True, nbins=20))\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Số bệnh nhân và study per subset\n",
+    "patient_subset = df.groupby(\"subset\")[\"subject_id\"].nunique().reindex(ALL_SUBSETS)\n",
+    "study_subset   = df.groupby(\"subset\")[\"study_id\"].nunique().reindex(ALL_SUBSETS)\n",
+    "image_subset   = df.groupby(\"subset\")[\"dicom_id\"].nunique().reindex(ALL_SUBSETS)\n",
+    "\n",
+    "subset_overview = pd.DataFrame({\n",
+    "    \"Patients\":  patient_subset,\n",
+    "    \"Studies\":   study_subset,\n",
+    "    \"Images\":    image_subset\n",
+    "})\n",
+    "print(subset_overview.to_string())\n",
+    "\n",
+    "subset_overview.plot(kind=\"bar\", figsize=(13, 4),\n",
+    "                     color=[\"#5e81ac\", \"#88c0d0\", \"#a3be8c\"], width=0.75)\n",
+    "plt.title(\"Patients / Studies / Images theo subset\")\n",
+    "plt.xlabel(\"Subset\")\n",
+    "plt.ylabel(\"Count\")\n",
+    "plt.xticks(rotation=0)\n",
+    "plt.legend()\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 9. Summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"=\"*60)\n",
+    "print(\"  SUMMARY — MIMIC-CXR Full Dataset\")\n",
+    "print(\"=\"*60)\n",
+    "print(f\"  Bệnh nhân            : {n_subjects:,}\")\n",
+    "print(f\"  Studies (reports)    : {n_studies:,}\")\n",
+    "print(f\"  Ảnh (dicom/jpg)      : {n_images:,}\")\n",
+    "print()\n",
+    "for sp in [\"train\", \"validate\", \"test\"]:\n",
+    "    ni = img_per_split.get(sp, 0)\n",
+    "    ns = study_per_split.get(sp, 0)\n",
+    "    print(f\"  [{sp:>8}]  ảnh={ni:>6,}   studies={ns:>6,}\")\n",
+    "print()\n",
+    "print(f\"  Frontal (PA+AP)      : {view_counts.get('PA',0)+view_counts.get('AP',0):,} ảnh\")\n",
+    "print(f\"  Lateral (LL)         : {view_counts.get('LL',view_counts.get('LATERAL',0)):,} ảnh\")\n",
+    "print(f\"  VQA total samples    : {len(vqa_all):,}\")\n",
+    "print(\"=\"*60)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

data/eda_p18.ipynb ADDED Viewed

	@@ -0,0 +1,797 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# EDA — MIMIC-CXR Subset p18\n",
+    "\n",
+    "**Datasets used:**\n",
+    "- `MIMIC-CXR-JPG` (v2.1.0) — ảnh JPG + CSV metadata\n",
+    "- `MIMIC-CXR` (v2.1.0) — report `.txt` (Findings / Impression)\n",
+    "- `MIMIC-Ext-MIMIC-CXR-VQA` (v1.0.0) — câu hỏi/đáp VQA\n",
+    "\n",
+    "**Scope:** chỉ phân tích bệnh nhân có `subject_id` bắt đầu bằng `18` (folder `p18`).\n",
+    "\n",
+    "> ℹ️ **Không cần tải ảnh JPG** để chạy notebook này — toàn bộ EDA dựa trên CSV, .txt reports và .json VQA."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 0. Cấu hình đường dẫn"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "from pathlib import Path\n\nDATA_DIR = Path(r\"D:\\USTH\\KLTN\\cxr-vlm-data\")\nCXR_ROOT = DATA_DIR / \"mimic-cxr-reports\"   # files/p10…p19/pXXXXXX/sYYYYYY.txt\n\nSPLIT_CSV    = DATA_DIR / \"mimic-cxr-2.0.0-split.csv\"\nMETA_CSV     = DATA_DIR / \"mimic-cxr-2.0.0-metadata.csv\"\nCHEXPERT_CSV = DATA_DIR / \"mimic-cxr-2.0.0-chexpert.csv\"\n\n_VQA_DIR = (DATA_DIR\n    / \"mimic-ext-mimic-cxr-vqa-a-complex-diverse-and-large-scale-visual-question-answering-dataset-for-chest-x-ray-images-1.0.0\"\n    / \"MIMIC-Ext-MIMIC-CXR-VQA\"\n    / \"dataset\")\nVQA_TRAIN = _VQA_DIR / \"train.json\"\nVQA_VALID = _VQA_DIR / \"valid.json\"\nVQA_TEST  = _VQA_DIR / \"test.json\"\n\n# Kiểm tra nhanh\nfor name, p in [(\"SPLIT_CSV\",    SPLIT_CSV),\n                (\"META_CSV\",     META_CSV),\n                (\"CHEXPERT_CSV\", CHEXPERT_CSV),\n                (\"CXR_ROOT\",     CXR_ROOT),\n                (\"VQA_TRAIN\",    VQA_TRAIN)]:\n    status = \"✓\" if p.exists() else \"✗ NOT FOUND\"\n    print(f\"  {status}  {name}: {p}\")\n\nprint(\"\\nPaths configured.\")"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import json\n",
+    "import re\n",
+    "import matplotlib.pyplot as plt\n",
+    "import matplotlib.ticker as mticker\n",
+    "import seaborn as sns\n",
+    "from collections import Counter\n",
+    "\n",
+    "sns.set_theme(style=\"whitegrid\", palette=\"muted\")\n",
+    "plt.rcParams[\"figure.dpi\"] = 120\n",
+    "plt.rcParams[\"figure.figsize\"] = (10, 4)\n",
+    "\n",
+    "CHEXPERT_LABELS = [\n",
+    "    \"Atelectasis\", \"Cardiomegaly\", \"Consolidation\", \"Edema\",\n",
+    "    \"Enlarged Cardiomediastinum\", \"Fracture\", \"Lung Lesion\",\n",
+    "    \"Lung Opacity\", \"No Finding\", \"Pleural Effusion\",\n",
+    "    \"Pleural Other\", \"Pneumonia\", \"Pneumothorax\", \"Support Devices\"\n",
+    "]\n",
+    "\n",
+    "print(\"Libraries imported.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Load & lọc subset p18"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "split_df    = pd.read_csv(SPLIT_CSV)\n",
+    "meta_df     = pd.read_csv(META_CSV)\n",
+    "chexpert_df = pd.read_csv(CHEXPERT_CSV)\n",
+    "\n",
+    "# Lọc p18\n",
+    "p18_split = split_df[split_df[\"subject_id\"].astype(str).str.startswith(\"18\")].copy()\n",
+    "p18_meta  = meta_df[meta_df[\"subject_id\"].astype(str).str.startswith(\"18\")].copy()\n",
+    "p18_chex  = chexpert_df[chexpert_df[\"subject_id\"].astype(str).str.startswith(\"18\")].copy()\n",
+    "\n",
+    "print(f\"split.csv   — p18 images  : {len(p18_split):,}\")\n",
+    "print(f\"metadata    — p18 images  : {len(p18_meta):,}\")\n",
+    "print(f\"chexpert    — p18 studies : {len(p18_chex):,}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Merge split + metadata (by dicom_id)\n",
+    "df = p18_split.merge(\n",
+    "    p18_meta[[\"dicom_id\", \"ViewPosition\", \"Rows\", \"Columns\"]],\n",
+    "    on=\"dicom_id\", how=\"left\"\n",
+    ")\n",
+    "\n",
+    "print(f\"Merged shape: {df.shape}\")\n",
+    "df.head(3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Tổng quan số lượng ảnh & report theo split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Số ảnh theo split\n",
+    "img_per_split = df[\"split\"].value_counts().reindex([\"train\", \"validate\", \"test\"])\n",
+    "\n",
+    "# Số study (≈ report) theo split  (mỗi study_id = 1 report)\n",
+    "study_per_split = (\n",
+    "    df.drop_duplicates(\"study_id\")[\"split\"]\n",
+    "    .value_counts()\n",
+    "    .reindex([\"train\", \"validate\", \"test\"])\n",
+    ")\n",
+    "\n",
+    "summary = pd.DataFrame({\n",
+    "    \"Images (dicom_id)\": img_per_split,\n",
+    "    \"Studies / Reports\": study_per_split\n",
+    "})\n",
+    "summary.loc[\"TOTAL\"] = summary.sum()\n",
+    "print(summary.to_string())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, axes = plt.subplots(1, 2, figsize=(11, 4))\n",
+    "for ax, col, title in zip(axes, summary.columns, [\"Số ảnh theo split\", \"Số study/report theo split\"]):\n",
+    "    vals = summary.loc[[\"train\",\"validate\",\"test\"], col]\n",
+    "    bars = ax.bar(vals.index, vals.values, color=sns.color_palette(\"muted\", 3))\n",
+    "    ax.bar_label(bars, fmt=\"%d\")\n",
+    "    ax.set_title(title)\n",
+    "    ax.set_ylabel(\"Count\")\n",
+    "plt.suptitle(\"p18 subset — images vs reports per split\", fontsize=13, y=1.02)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Số ảnh mỗi study (1 study → bao nhiêu ảnh?)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "imgs_per_study = df.groupby(\"study_id\")[\"dicom_id\"].count()\n",
+    "count_dist = imgs_per_study.value_counts().sort_index()\n",
+    "\n",
+    "print(\"Images per study distribution:\")\n",
+    "print(count_dist.to_string())\n",
+    "print(f\"\\nMax images in a single study: {imgs_per_study.max()}\")\n",
+    "print(f\"Mean images per study       : {imgs_per_study.mean():.2f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, ax = plt.subplots(figsize=(8, 4))\n",
+    "ax.bar(count_dist.index.astype(str), count_dist.values, color=sns.color_palette(\"Blues_d\", len(count_dist)))\n",
+    "ax.set_xlabel(\"Số ảnh trong study\")\n",
+    "ax.set_ylabel(\"Số study\")\n",
+    "ax.set_title(\"Distribution: số ảnh mỗi study (p18)\")\n",
+    "for i, v in zip(count_dist.index, count_dist.values):\n",
+    "    ax.text(str(i), v + max(count_dist)*0.01, str(v), ha=\"center\", va=\"bottom\", fontsize=9)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Phân bố View Position (AP, PA, Lateral, ...)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "view_counts = df[\"ViewPosition\"].fillna(\"Unknown\").value_counts()\n",
+    "print(\"View position counts:\")\n",
+    "print(view_counts.to_string())\n",
+    "print(f\"\\nTotal images: {len(df):,}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, axes = plt.subplots(1, 2, figsize=(12, 4))\n",
+    "\n",
+    "# Bar chart\n",
+    "bars = axes[0].bar(view_counts.index, view_counts.values,\n",
+    "                   color=sns.color_palette(\"Set2\", len(view_counts)))\n",
+    "axes[0].bar_label(bars, fmt=\"%d\")\n",
+    "axes[0].set_title(\"Số ảnh theo View Position\")\n",
+    "axes[0].set_ylabel(\"Count\")\n",
+    "\n",
+    "# Pie chart\n",
+    "axes[1].pie(view_counts.values, labels=view_counts.index, autopct=\"%1.1f%%\",\n",
+    "            colors=sns.color_palette(\"Set2\", len(view_counts)))\n",
+    "axes[1].set_title(\"Tỉ lệ View Position\")\n",
+    "\n",
+    "plt.suptitle(\"View Position Distribution — p18\", fontsize=13)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# View distribution theo split\n",
+    "view_split = df.groupby([\"split\", \"ViewPosition\"]).size().unstack(fill_value=0)\n",
+    "view_split = view_split.reindex([\"train\", \"validate\", \"test\"])\n",
+    "view_split.plot(kind=\"bar\", figsize=(10, 4), color=sns.color_palette(\"Set2\", view_split.shape[1]))\n",
+    "plt.title(\"View Position theo split — p18\")\n",
+    "plt.xlabel(\"Split\")\n",
+    "plt.ylabel(\"Count\")\n",
+    "plt.xticks(rotation=0)\n",
+    "plt.legend(title=\"ViewPosition\")\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ae9f3d3c",
+   "source": "## 4b. Frontal-Only Sampling Strategy (AP > PA)\n\nChiến lược train: **1 report + 1 ảnh frontal** mỗi study.\n- Chỉ giữ AP hoặc PA; nếu study có cả hai thì **ưu tiên AP**.\n- Study không có ảnh frontal nào → loại khỏi tập train.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "d2ce6beb",
+   "source": "frontal = df[df[\"ViewPosition\"].isin([\"AP\", \"PA\"])].copy()\n\n# Với mỗi study: chọn AP trước, nếu không có thì chọn PA (lấy 1 ảnh duy nhất)\ndef pick_frontal_view(group):\n    ap = group[group[\"ViewPosition\"] == \"AP\"]\n    if len(ap) > 0:\n        return ap.iloc[[0]]\n    return group[group[\"ViewPosition\"] == \"PA\"].iloc[[0]]\n\nfrontal_1img = (\n    frontal.groupby(\"study_id\", group_keys=False)\n    .apply(pick_frontal_view)\n    .reset_index(drop=True)\n)\n\n# Thống kê tổng quan\nn_study_total    = df[\"study_id\"].nunique()\nn_study_frontal  = frontal_1img[\"study_id\"].nunique()\nn_study_no_front = n_study_total - n_study_frontal\n\nprint(\"=== Frontal-Only Sampling (p18) ===\")\nprint(f\"Tổng số study                   : {n_study_total:,}\")\nprint(f\"Study có ảnh frontal (AP/PA)    : {n_study_frontal:,}  ({n_study_frontal/n_study_total*100:.1f}%)\")\nprint(f\"Study bị loại (không có frontal): {n_study_no_front:,}  ({n_study_no_front/n_study_total*100:.1f}%)\")\nprint()\nprint(f\"Ảnh được chọn theo view:\")\nprint(frontal_1img[\"ViewPosition\"].value_counts().to_string())\nprint()\nprint(\"=== Mẫu train sau khi filter (split) ===\")\nsplit_frontal = frontal_1img[\"split\"].value_counts().reindex([\"train\", \"validate\", \"test\"])\nsplit_all     = df.drop_duplicates(\"study_id\")[\"split\"].value_counts().reindex([\"train\", \"validate\", \"test\"])\ncompare = pd.DataFrame({\n    \"All studies\": split_all,\n    \"Frontal-only\": split_frontal,\n    \"Giảm (%)\": ((split_all - split_frontal) / split_all * 100).round(1)\n})\nprint(compare.to_string())",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "9d4aaf5c",
+   "source": "fig, axes = plt.subplots(1, 3, figsize=(14, 4))\n\n# 1. All vs Frontal-only (study count)\ncats = [\"All studies\", \"Frontal-only\"]\nvals = [n_study_total, n_study_frontal]\nbars = axes[0].bar(cats, vals, color=[\"#4C72B0\", \"#55A868\"], width=0.5)\naxes[0].bar_label(bars, fmt=\"%d\")\naxes[0].set_title(\"Study count: All vs Frontal-only\")\naxes[0].set_ylabel(\"Số study\")\n\n# 2. View breakdown của ảnh được chọn\nvc = frontal_1img[\"ViewPosition\"].value_counts()\naxes[1].pie(vc.values, labels=vc.index, autopct=\"%1.1f%%\",\n            colors=[\"#4C72B0\", \"#DD8452\"])\naxes[1].set_title(\"View được chọn (AP ưu tiên)\")\n\n# 3. So sánh train/val/test\nx = np.arange(3)\nw = 0.35\nsplits = [\"train\", \"validate\", \"test\"]\naxes[2].bar(x - w/2, split_all.values,     w, label=\"All\",          color=\"#4C72B0\", alpha=0.85)\naxes[2].bar(x + w/2, split_frontal.values, w, label=\"Frontal-only\", color=\"#55A868\", alpha=0.85)\naxes[2].set_xticks(x)\naxes[2].set_xticklabels(splits)\naxes[2].set_title(\"Frontal-only vs All (per split)\")\naxes[2].set_ylabel(\"Số study\")\naxes[2].legend()\n\nplt.suptitle(\"Frontal-Only Sampling Strategy — p18\", fontsize=13)\nplt.tight_layout()\nplt.show()",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. CheXpert Labels — 14 nhãn bệnh lý"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Chỉ lấy cột labels (1 = positive, 0 = negative, -1 = uncertain, NaN = not mentioned)\n",
+    "label_cols = [c for c in p18_chex.columns if c in CHEXPERT_LABELS]\n",
+    "\n",
+    "# Số study có nhãn Positive (=1) mỗi bệnh\n",
+    "positive_counts = (p18_chex[label_cols] == 1).sum().sort_values(ascending=False)\n",
+    "uncertain_counts = (p18_chex[label_cols] == -1).sum().sort_values(ascending=False)\n",
+    "negative_counts = (p18_chex[label_cols] == 0).sum().sort_values(ascending=False)\n",
+    "\n",
+    "label_summary = pd.DataFrame({\n",
+    "    \"Positive\": positive_counts,\n",
+    "    \"Uncertain\": uncertain_counts,\n",
+    "    \"Negative\": negative_counts,\n",
+    "    \"Not Mentioned\": p18_chex[label_cols].isna().sum()\n",
+    "})\n",
+    "label_summary[\"Total Studies\"] = len(p18_chex)\n",
+    "label_summary[\"Positive %\"] = (label_summary[\"Positive\"] / len(p18_chex) * 100).round(1)\n",
+    "print(label_summary[[\"Positive\",\"Uncertain\",\"Negative\",\"Not Mentioned\",\"Positive %\"]]\n",
+    "      .sort_values(\"Positive\", ascending=False).to_string())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "# Headers hành chính — không phải findings\nADMIN_HEADERS = {\n    'EXAMINATION', 'INDICATION', 'CLINICAL INDICATION', 'TECHNIQUE',\n    'COMPARISON', 'HISTORY', 'REASON', 'REASON FOR EXAM',\n    'REASON FOR EXAMINATION', 'PROCEDURE', 'FINAL REPORT',\n    'NOTIFICATION', 'RECOMMENDATION', 'ADDENDUM'\n}\n\n# Detect section header: dòng bắt đầu bằng ALL-CAPS (có thể có space/dấu câu) rồi đến \":\"\nSECTION_RE = re.compile(r'^[ \\t]*([A-Z][A-Z ,/()\\-]{1,70}?):\\s*', re.MULTILINE)\n\ndef parse_report(txt_path: Path) -> dict:\n    \"\"\"\n    Parse report .txt thành dict {'findings': str|None, 'impression': str|None}.\n\n    Quy luật detect section: mọi header đều VIẾT HOA TOÀN BỘ và kết thúc bằng ':',\n    ví dụ: FINDINGS:, IMPRESSION:, FRONTAL AND LATERAL VIEWS OF THE CHEST:\n    → dùng regex bắt pattern đó, không hardcode từng keyword.\n\n    Nếu không có section FINDINGS tường minh, fallback lấy section\n    descriptive đầu tiên (không phải admin header).\n    \"\"\"\n    try:\n        text = txt_path.read_text(encoding=\"utf-8\", errors=\"ignore\")\n    except FileNotFoundError:\n        return {\"findings\": None, \"impression\": None}\n\n    matches = list(SECTION_RE.finditer(text))\n    if not matches:\n        return {\"findings\": None, \"impression\": None}\n\n    # Tách từng section thành (header, content)\n    sections = []\n    for i, m in enumerate(matches):\n        header  = m.group(1).strip()\n        start   = m.end()\n        end     = matches[i + 1].start() if i + 1 < len(matches) else len(text)\n        content = text[start:end].strip()\n        sections.append((header, content))\n\n    findings = impression = None\n    for header, content in sections:\n        h = header.upper()\n        if \"FINDING\" in h and findings is None:\n            findings = content or None\n        elif \"IMPRESSION\" in h and impression is None:\n            impression = content or None\n\n    # Fallback: không có FINDINGS tường minh → lấy section 
descriptive đầu tiên\n    if findings is None:\n        for header, content in sections:\n            h = header.upper()\n            if h not in ADMIN_HEADERS and \"IMPRESSION\" not in h and content:\n                findings = content\n                break\n\n    return {\"findings\": findings, \"impression\": impression}\n\n\n# Lấy danh sách unique studies trong p18\np18_studies = (\n    df[[\"subject_id\", \"study_id\"]]\n    .drop_duplicates(\"study_id\")\n    .reset_index(drop=True)\n)\n\nprint(f\"Số study cần parse: {len(p18_studies):,}\")\nprint(\"Parsing reports...\")\n\nrecords = []\nfor _, row in p18_studies.iterrows():\n    sid  = str(row[\"subject_id\"])\n    stid = str(row[\"study_id\"])\n    txt_path = CXR_ROOT / \"files\" / \"p18\" / f\"p{sid}\" / f\"s{stid}.txt\"\n    parsed = parse_report(txt_path)\n    records.append({\"study_id\": stid, **parsed})\n\nreport_df = pd.DataFrame(records)\nreport_df[\"findings_len\"]   = report_df[\"findings\"].str.split().str.len()\nreport_df[\"impression_len\"] = report_df[\"impression\"].str.split().str.len()\n\ntotal = len(report_df)\nprint(f\"\\nFindings   found : {report_df['findings'].notna().sum():,} / {total:,}  ({report_df['findings'].notna().mean()*100:.1f}%)\")\nprint(f\"Impression found : {report_df['impression'].notna().sum():,} / {total:,}  ({report_df['impression'].notna().mean()*100:.1f}%)\")\nboth    = (report_df['findings'].notna() & report_df['impression'].notna()).sum()\nneither = (report_df['findings'].isna()  & report_df['impression'].isna()).sum()\nprint(f\"Cả hai           : {both:,} / {total:,}  ({both/total*100:.1f}%)\")\nprint(f\"Không có cả hai  : {neither:,} / {total:,}  ({neither/total*100:.1f}%)\")"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Số nhãn positive mỗi study (label co-occurrence)\n",
+    "labels_per_study = (p18_chex[label_cols] == 1).sum(axis=1)\n",
+    "print(\"Số nhãn positive mỗi study:\")\n",
+    "print(labels_per_study.value_counts().sort_index().to_string())\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(9, 4))\n",
+    "lps_counts = labels_per_study.value_counts().sort_index()\n",
+    "ax.bar(lps_counts.index.astype(str), lps_counts.values, color=sns.color_palette(\"Blues_d\", len(lps_counts)))\n",
+    "ax.set_xlabel(\"Số nhãn positive\")\n",
+    "ax.set_ylabel(\"Số study\")\n",
+    "ax.set_title(\"Phân bố số nhãn positive mỗi study (p18)\")\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. Phân tích Report — Findings & Impression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def parse_report(txt_path: Path) -> dict:\n",
+    "    \"\"\"Extract the FINDINGS and IMPRESSION sections of one report file.\n",
+    "\n",
+    "    Returns a dict with keys 'findings' and 'impression'. Each value is the\n",
+    "    stripped section text (possibly empty), or None when the file does not\n",
+    "    exist or the section header is not found.\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        text = txt_path.read_text(encoding=\"utf-8\", errors=\"ignore\")\n",
+    "    except FileNotFoundError:\n",
+    "        return {\"findings\": None, \"impression\": None}\n",
+    "\n",
+    "    text = re.sub(r\"[\\r\\n]+\", \" \", text)  # flatten newlines\n",
+    "\n",
+    "    def extract_section(pattern, text):\n",
+    "        m = re.search(pattern, text, re.IGNORECASE)\n",
+    "        if not m:\n",
+    "            return None\n",
+    "        start = m.end()\n",
+    "        # Cut at the next section header, or at the end of the string.\n",
+    "        # A header must be a whole keyword followed by ':' so that the same\n",
+    "        # word used inside a sentence (e.g. \"no acute findings\") does not\n",
+    "        # truncate the section early.\n",
+    "        next_sec = re.search(\n",
+    "            r\"\\b(IMPRESSION|FINDINGS|CONCLUSION|RECOMMENDATION|NOTIFICATION)\"\n",
+    "            r\"(?:S|\\(S\\))?\\s*:\",\n",
+    "            text[start:], re.IGNORECASE\n",
+    "        )\n",
+    "        end = start + next_sec.start() if next_sec else len(text)\n",
+    "        return text[start:end].strip()\n",
+    "\n",
+    "    findings   = extract_section(r\"FINDINGS\\s*:\", text)\n",
+    "    impression = extract_section(r\"IMPRESSION\\s*:\", text)\n",
+    "    return {\"findings\": findings, \"impression\": impression}\n",
+    "\n",
+    "\n",
+    "# Lấy danh sách unique studies trong p18\n",
+    "p18_studies = (\n",
+    "    df[[\"subject_id\", \"study_id\"]]\n",
+    "    .drop_duplicates(\"study_id\")\n",
+    "    .reset_index(drop=True)\n",
+    ")\n",
+    "\n",
+    "print(f\"Số study cần parse: {len(p18_studies):,}\")\n",
+    "print(\"Parsing reports... (có thể mất vài giây)\")\n",
+    "\n",
+    "records = []\n",
+    "for _, row in p18_studies.iterrows():\n",
+    "    sid  = str(row[\"subject_id\"])\n",
+    "    stid = str(row[\"study_id\"])\n",
+    "    txt_path = CXR_ROOT / \"files\" / \"p18\" / f\"p{sid}\" / f\"s{stid}.txt\"\n",
+    "    parsed = parse_report(txt_path)\n",
+    "    records.append({\"study_id\": stid, **parsed})\n",
+    "\n",
+    "report_df = pd.DataFrame(records)\n",
+    "report_df[\"findings_len\"]   = report_df[\"findings\"].dropna().str.split().str.len()\n",
+    "report_df[\"impression_len\"] = report_df[\"impression\"].dropna().str.split().str.len()\n",
+    "\n",
+    "print(f\"\\nFindings   found : {report_df['findings'].notna().sum():,} / {len(report_df):,}\")\n",
+    "print(f\"Impression found : {report_df['impression'].notna().sum():,} / {len(report_df):,}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Descriptive stats\n",
+    "print(\"=== Findings word count ===\")\n",
+    "print(report_df[\"findings_len\"].describe().round(1).to_string())\n",
+    "print(\"\\n=== Impression word count ===\")\n",
+    "print(report_df[\"impression_len\"].describe().round(1).to_string())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, axes = plt.subplots(1, 2, figsize=(13, 4))\n",
+    "\n",
+    "for ax, col, title, color in zip(\n",
+    "    axes,\n",
+    "    [\"findings_len\", \"impression_len\"],\n",
+    "    [\"Findings — phân bố độ dài (số từ)\", \"Impression — phân bố độ dài (số từ)\"],\n",
+    "    [\"steelblue\", \"tomato\"]\n",
+    "):\n",
+    "    data = report_df[col].dropna()\n",
+    "    # clip outliers để biểu đồ dễ nhìn\n",
+    "    p99 = data.quantile(0.99)\n",
+    "    data_clipped = data[data <= p99]\n",
+    "    ax.hist(data_clipped, bins=40, color=color, edgecolor=\"white\", alpha=0.85)\n",
+    "    ax.axvline(data.median(), color=\"black\", linestyle=\"--\", linewidth=1.2, label=f\"Median={data.median():.0f}\")\n",
+    "    ax.axvline(data.mean(),   color=\"gray\",  linestyle=\":\",  linewidth=1.2, label=f\"Mean={data.mean():.0f}\")\n",
+    "    ax.set_title(title)\n",
+    "    ax.set_xlabel(\"Số từ\")\n",
+    "    ax.set_ylabel(\"Số report\")\n",
+    "    ax.legend(fontsize=9)\n",
+    "    ax.text(0.97, 0.95, f\"n={len(data):,}\\n(hiển thị ≤p99={p99:.0f}w)\",\n",
+    "            transform=ax.transAxes, ha=\"right\", va=\"top\", fontsize=8, color=\"gray\")\n",
+    "\n",
+    "plt.suptitle(\"Phân bố độ dài report — p18\", fontsize=13)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Box plot so sánh Findings vs Impression\n",
+    "combined = pd.DataFrame({\n",
+    "    \"word_count\": pd.concat([report_df[\"findings_len\"], report_df[\"impression_len\"]], ignore_index=True),\n",
+    "    \"section\": [\"Findings\"] * len(report_df) + [\"Impression\"] * len(report_df)\n",
+    "}).dropna()\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(7, 4))\n",
+    "sns.boxplot(data=combined, x=\"section\", y=\"word_count\",\n",
+    "            palette=[\"steelblue\", \"tomato\"], showfliers=False, ax=ax)\n",
+    "ax.set_title(\"Findings vs Impression — độ dài (box plot, no outliers)\")\n",
+    "ax.set_ylabel(\"Số từ\")\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 7. VQA — phân tích câu hỏi & đáp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vqa_dfs = []\n",
+    "for fpath, split_name in [(VQA_TRAIN, \"train\"), (VQA_VALID, \"valid\"), (VQA_TEST, \"test\")]:\n",
+    "    if fpath.exists():\n",
+    "        with open(fpath, encoding=\"utf-8\") as f:\n",
+    "            data = json.load(f)\n",
+    "        tmp = pd.DataFrame(data)\n",
+    "        tmp[\"split\"] = split_name\n",
+    "        vqa_dfs.append(tmp)\n",
+    "    else:\n",
+    "        print(f\"[WARNING] File not found: {fpath}\")\n",
+    "\n",
+    "vqa_all = pd.concat(vqa_dfs, ignore_index=True)\n",
+    "\n",
+    "# Lọc p18\n",
+    "vqa_p18 = vqa_all[vqa_all[\"subject_id\"].astype(str).str.startswith(\"18\")].copy()\n",
+    "\n",
+    "print(f\"VQA total records   : {len(vqa_all):,}\")\n",
+    "print(f\"VQA p18 records     : {len(vqa_p18):,}\")\n",
+    "print(f\"\\nColumns: {list(vqa_p18.columns)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Số VQA mẫu theo split\n",
+    "print(\"VQA p18 per split:\")\n",
+    "print(vqa_p18[\"split\"].value_counts().to_string())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Semantic type: verify / choose / query\n",
+    "sem_counts = vqa_p18[\"semantic_type\"].value_counts()\n",
+    "print(\"Semantic type (verify/choose/query):\")\n",
+    "print(sem_counts.to_string())\n",
+    "\n",
+    "# Content type: presence / anatomy / attribute / abnormality / size / plane / gender\n",
+    "con_counts = vqa_p18[\"content_type\"].value_counts()\n",
+    "print(\"\\nContent type:\")\n",
+    "print(con_counts.to_string())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, axes = plt.subplots(1, 2, figsize=(13, 4))\n",
+    "\n",
+    "# Semantic type\n",
+    "bars = axes[0].bar(sem_counts.index, sem_counts.values,\n",
+    "                   color=sns.color_palette(\"Set1\", len(sem_counts)))\n",
+    "axes[0].bar_label(bars, fmt=\"%d\")\n",
+    "axes[0].set_title(\"VQA — Semantic Type (p18)\")\n",
+    "axes[0].set_ylabel(\"Count\")\n",
+    "\n",
+    "# Content type\n",
+    "bars2 = axes[1].bar(con_counts.index, con_counts.values,\n",
+    "                    color=sns.color_palette(\"Set2\", len(con_counts)))\n",
+    "axes[1].bar_label(bars2, fmt=\"%d\")\n",
+    "axes[1].set_title(\"VQA — Content Type (p18)\")\n",
+    "axes[1].set_ylabel(\"Count\")\n",
+    "axes[1].tick_params(axis=\"x\", rotation=30)\n",
+    "\n",
+    "plt.suptitle(\"VQA Question Analysis — p18\", fontsize=13)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c313b9c3",
+   "source": "### VQA × View Position — mẫu hỏi đáp thuộc ảnh view nào",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "0791482f",
+   "source": "# image_id trong VQA = dicom_id trong metadata\nvqa_view = vqa_p18.merge(\n    p18_meta[[\"dicom_id\", \"ViewPosition\"]],\n    left_on=\"image_id\", right_on=\"dicom_id\",\n    how=\"left\"\n)\n\nmissing_view_vqa = vqa_view[\"ViewPosition\"].isna().sum()\nvqa_view[\"ViewPosition\"] = vqa_view[\"ViewPosition\"].fillna(\"Unknown\")\n\nview_vqa_counts = vqa_view[\"ViewPosition\"].value_counts()\nprint(\"=== VQA samples theo View Position (p18) ===\")\nprint(view_vqa_counts.to_string())\nprint(f\"\\nKhông map được ViewPosition: {missing_view_vqa:,} ({missing_view_vqa/len(vqa_view)*100:.1f}%)\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "049baaef",
+   "source": "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n\n# 1. Bar: số mẫu VQA theo view\nbars = axes[0].bar(view_vqa_counts.index, view_vqa_counts.values,\n                   color=sns.color_palette(\"Set2\", len(view_vqa_counts)))\naxes[0].bar_label(bars, fmt=\"%d\")\naxes[0].set_title(\"Số mẫu VQA theo View Position\")\naxes[0].set_ylabel(\"Số mẫu\")\n\n# 2. Pie\naxes[1].pie(view_vqa_counts.values, labels=view_vqa_counts.index,\n            autopct=\"%1.1f%%\", colors=sns.color_palette(\"Set2\", len(view_vqa_counts)))\naxes[1].set_title(\"Tỉ lệ VQA theo View Position\")\n\n# 3. Semantic type × View (stacked bar)\nsem_view = vqa_view.groupby([\"ViewPosition\", \"semantic_type\"]).size().unstack(fill_value=0)\nsem_view.plot(kind=\"bar\", ax=axes[2], color=sns.color_palette(\"Set1\", sem_view.shape[1]),\n              width=0.7, stacked=True)\naxes[2].set_title(\"Semantic Type × View Position\")\naxes[2].set_xlabel(\"View Position\")\naxes[2].set_ylabel(\"Số mẫu\")\naxes[2].tick_params(axis=\"x\", rotation=30)\naxes[2].legend(title=\"Semantic Type\", fontsize=8)\n\nplt.suptitle(\"VQA × View Position — p18\", fontsize=13)\nplt.tight_layout()\nplt.show()\n\n# Content type × View\nprint(\"\\nContent type theo View Position:\")\nprint(vqa_view.groupby([\"ViewPosition\", \"content_type\"]).size()\n      .unstack(fill_value=0).to_string())",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cross-tab: semantic_type × content_type\n",
+    "cross = pd.crosstab(vqa_p18[\"semantic_type\"], vqa_p18[\"content_type\"])\n",
+    "print(\"Cross-tab semantic × content:\")\n",
+    "print(cross.to_string())\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(10, 3))\n",
+    "sns.heatmap(cross, annot=True, fmt=\"d\", cmap=\"YlOrRd\", ax=ax)\n",
+    "ax.set_title(\"VQA — Semantic Type × Content Type (p18)\")\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Phân bố độ dài câu hỏi (số từ)\n",
+    "vqa_p18[\"q_len\"] = vqa_p18[\"question\"].str.split().str.len()\n",
+    "\n",
+    "print(\"Question length stats:\")\n",
+    "print(vqa_p18[\"q_len\"].describe().round(1).to_string())\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(9, 3))\n",
+    "ax.hist(vqa_p18[\"q_len\"].clip(upper=vqa_p18[\"q_len\"].quantile(0.99)),\n",
+    "        bins=30, color=\"slateblue\", edgecolor=\"white\")\n",
+    "ax.axvline(vqa_p18[\"q_len\"].median(), color=\"black\", linestyle=\"--\",\n",
+    "           label=f\"Median={vqa_p18['q_len'].median():.0f}\")\n",
+    "ax.set_title(\"Phân bố độ dài câu hỏi VQA (p18)\")\n",
+    "ax.set_xlabel(\"Số từ\")\n",
+    "ax.set_ylabel(\"Count\")\n",
+    "ax.legend()\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Phân bố dạng câu trả lời: yes/no vs. khác\n",
+    "def classify_answer(ans_list):\n",
+    "    if not isinstance(ans_list, list) or len(ans_list) == 0:\n",
+    "        return \"no answer\"\n",
+    "    a = ans_list[0].strip().lower()\n",
+    "    if a in [\"yes\", \"no\"]:\n",
+    "        return a\n",
+    "    return \"open\"\n",
+    "\n",
+    "vqa_p18[\"ans_type\"] = vqa_p18[\"answer\"].apply(classify_answer)\n",
+    "\n",
+    "ans_counts = vqa_p18[\"ans_type\"].value_counts()\n",
+    "print(\"Answer type distribution:\")\n",
+    "print(ans_counts.to_string())\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(6, 3))\n",
+    "bars = ax.bar(ans_counts.index, ans_counts.values,\n",
+    "              color=sns.color_palette(\"Pastel1\", len(ans_counts)))\n",
+    "ax.bar_label(bars, fmt=\"%d\")\n",
+    "ax.set_title(\"VQA — Answer Type Distribution (p18)\")\n",
+    "ax.set_ylabel(\"Count\")\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 8. Gợi ý thêm — Missing data & Data Quality"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 8.1 Tỉ lệ study không có findings / không có impression\n",
+    "no_findings   = report_df[\"findings\"].isna().sum()\n",
+    "no_impression = report_df[\"impression\"].isna().sum()\n",
+    "total_studies = len(report_df)\n",
+    "\n",
+    "print(f\"Studies thiếu Findings   : {no_findings:,} / {total_studies:,} ({no_findings/total_studies*100:.1f}%)\")\n",
+    "print(f\"Studies thiếu Impression : {no_impression:,} / {total_studies:,} ({no_impression/total_studies*100:.1f}%)\")\n",
+    "both_missing = (report_df[\"findings\"].isna() & report_df[\"impression\"].isna()).sum()\n",
+    "print(f\"Studies thiếu CẢ HAI     : {both_missing:,} / {total_studies:,} ({both_missing/total_studies*100:.1f}%)\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 8.2 Tỉ lệ ảnh thiếu ViewPosition\n",
+    "missing_view = df[\"ViewPosition\"].isna().sum()\n",
+    "print(f\"Ảnh thiếu ViewPosition: {missing_view:,} / {len(df):,} ({missing_view/len(df)*100:.1f}%)\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 8.3 Số bệnh nhân (subject_id) trong p18\n",
+    "n_subjects = df[\"subject_id\"].nunique()\n",
+    "n_studies  = df[\"study_id\"].nunique()\n",
+    "n_images   = df[\"dicom_id\"].nunique()\n",
+    "\n",
+    "print(f\"Bệnh nhân (subject_id) : {n_subjects:,}\")\n",
+    "print(f\"Lần khám   (study_id)  : {n_studies:,}\")\n",
+    "print(f\"Ảnh        (dicom_id)  : {n_images:,}\")\n",
+    "print(f\"\\nTrung bình study/bệnh nhân  : {n_studies/n_subjects:.2f}\")\n",
+    "print(f\"Trung bình ảnh/bệnh nhân    : {n_images/n_subjects:.2f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 8.4 Study distribution per patient\n",
+    "studies_per_patient = df.groupby(\"subject_id\")[\"study_id\"].nunique()\n",
+    "print(\"Studies per patient stats:\")\n",
+    "print(studies_per_patient.describe().round(1).to_string())\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(9, 3))\n",
+    "spp = studies_per_patient.value_counts().sort_index()\n",
+    "ax.bar(spp.index.astype(str), spp.values, color=\"mediumpurple\")\n",
+    "ax.set_xlabel(\"Số study mỗi bệnh nhân\")\n",
+    "ax.set_ylabel(\"Số bệnh nhân\")\n",
+    "ax.set_title(\"Phân bố số lần khám mỗi bệnh nhân — p18\")\n",
+    "ax.xaxis.set_major_locator(mticker.MaxNLocator(integer=True, nbins=20))\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 8.5 Image resolution distribution (nếu có cột Rows/Columns trong metadata)\n",
+    "if \"Rows\" in df.columns and \"Columns\" in df.columns:\n",
+    "    print(\"Image resolution stats:\")\n",
+    "    print(df[[\"Rows\", \"Columns\"]].describe().round(0).to_string())\n",
+    "\n",
+    "    res_counts = df.groupby([\"Rows\", \"Columns\"]).size().sort_values(ascending=False).head(15)\n",
+    "    print(\"\\nTop-15 resolutions:\")\n",
+    "    print(res_counts.to_string())\nelse:\n",
+    "    print(\"Cột Rows/Columns không có trong metadata.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 9. Tóm tắt (Summary)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"=\"*55)\n",
+    "print(\"  SUMMARY — MIMIC-CXR Subset p18\")\n",
+    "print(\"=\"*55)\n",
+    "print(f\"  Bệnh nhân            : {n_subjects:,}\")\n",
+    "print(f\"  Studies (reports)    : {n_studies:,}\")\n",
+    "print(f\"  Ảnh (dicom/jpg)      : {n_images:,}\")\n",
+    "print()\n",
+    "for sp in [\"train\", \"validate\", \"test\"]:\n",
+    "    ni = img_per_split.get(sp, 0)\n",
+    "    ns = study_per_split.get(sp, 0)\n",
+    "    print(f\"  [{sp:>8}]  ảnh={ni:>5,}   studies={ns:>5,}\")\n",
+    "print()\n",
+    "print(f\"  Frontal (PA+AP)      : {view_counts.get('PA',0)+view_counts.get('AP',0):,} ảnh\")\n",
+    "print(f\"  Lateral              : {view_counts.get('LL',0)+view_counts.get('LATERAL',0):,} ảnh\")\n",
+    "print(f\"  Findings available   : {report_df['findings'].notna().sum():,}/{total_studies:,}\")\n",
+    "print(f\"  Impression available : {report_df['impression'].notna().sum():,}/{total_studies:,}\")\n",
+    "print(f\"  VQA samples (p18)    : {len(vqa_p18):,}\")\n",
+    "print(\"=\"*55)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

data/eda_reports.ipynb ADDED Viewed

	@@ -0,0 +1,741 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# EDA — MIMIC-CXR Reports\n",
+    "\n",
+    "Phân tích chuyên sâu toàn bộ report `.txt` trong MIMIC-CXR:\n",
+    "- Thống kê tất cả loại section header thực tế\n",
+    "- Tỉ lệ report có/thiếu findings, impression theo subset\n",
+    "- Phân phối độ dài findings & impression\n",
+    "- Parser cập nhật xử lý đầy đủ alias (CONCLUSION, FINDINGS AND IMPRESSION, v.v.)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 0. Config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "CXR_ROOT  = Path(r\"D:\\USTH\\KLTN\\cxr-vlm-data\\mimic-cxr-reports\")  # files/p10…p19/\n",
+    "SPLIT_CSV = Path(r\"D:\\USTH\\KLTN\\cxr-vlm-data\\mimic-cxr-2.0.0-split.csv\")\n",
+    "\n",
+    "# None = parse hết ~227k, số nguyên = sample nhanh\n",
+    "SAMPLE_SIZE = None\n",
+    "\n",
+    "for name, p in [(\"CXR_ROOT\", CXR_ROOT), (\"SPLIT_CSV\", SPLIT_CSV)]:\n",
+    "    print(f\"  {'✓' if p.exists() else '✗ NOT FOUND'}  {name}: {p}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import matplotlib.ticker as mticker\n",
+    "import seaborn as sns\n",
+    "from collections import Counter\n",
+    "\n",
+    "sns.set_theme(style=\"whitegrid\", palette=\"muted\")\n",
+    "plt.rcParams[\"figure.dpi\"] = 120\n",
+    "\n",
+    "ALL_SUBSETS = [f\"p{i}\" for i in range(10, 20)]\n",
+    "print(\"Ready.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Parser — ALL-CAPS header detection\n",
+    "\n",
+    "Quy luật: mọi section header trong MIMIC-CXR đều **VIẾT HOA TOÀN BỘ** và kết thúc bằng `:`  \n",
+    "→ dùng regex detect tất cả, sau đó phân loại theo nhóm."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Regex: dòng bắt đầu bằng chuỗi ALL-CAPS rồi đến \":\"\n",
+    "SECTION_RE = re.compile(r'^[ \\t]*([A-Z][A-Z ,/()\\.\\-]{1,70}?):\\s*', re.MULTILINE)\n",
+    "\n",
+    "# ── Nhóm IMPRESSION (nội dung kết luận) ─────────────────────────────────────\n",
+    "IMPRESSION_KEYWORDS = {\n",
+    "    \"IMPRESSION\",\n",
+    "    \"CONCLUSION\",\n",
+    "    \"CONCLUSIONS\",\n",
+    "    \"FINDINGS AND IMPRESSION\",\n",
+    "    \"FINDINGS/IMPRESSION\",\n",
+    "    \"PROVISIONAL FINDINGS IMPRESSION (PFI)\",\n",
+    "    \"PFI\",\n",
+    "    \"WET READ\",               # quick impression trước khi có final report\n",
+    "    \"RECOMMENDATION\",\n",
+    "    \"RECOMMENDATION(S)\",\n",
+    "    \"RECOMMENDATIONS\",\n",
+    "}\n",
+    "\n",
+    "# ── Nhóm FINDINGS (mô tả hình ảnh) ──────────────────────────────────────────\n",
+    "FINDINGS_KEYWORDS = {\n",
+    "    \"FINDINGS\",\n",
+    "    \"REPORT\",\n",
+    "}\n",
+    "\n",
+    "# Patterns dạng view description (findings không tường minh)\n",
+    "FINDINGS_VIEW_RE = re.compile(\n",
+    "    r'(VIEW|VIEWS|RADIOGRAPH|RADIOGRAPHS|CHEST|PORTABLE|FRONTAL|LATERAL|PA AND|AP AND|UPRIGHT|SUPINE|SEMI)',\n",
+    "    re.IGNORECASE\n",
+    ")\n",
+    "\n",
+    "# ── Admin headers (bỏ qua khi fallback) ─────────────────────────────────────\n",
+    "ADMIN_KEYWORDS = {\n",
+    "    \"EXAMINATION\", \"EXAM\", \"INDICATION\", \"INDICATIONS\",\n",
+    "    \"CLINICAL INDICATION\", \"CLINICAL HISTORY\", \"CLINICAL INFORMATION\",\n",
+    "    \"TECHNIQUE\", \"COMPARISON\", \"COMPARISONS\", \"COMPARISON EXAM\",\n",
+    "    \"COMPARISON FILM\", \"COMPARISON STUDY\", \"REFERENCE EXAM\",\n",
+    "    \"HISTORY\", \"PATIENT HISTORY\",\n",
+    "    \"REASON\", \"REASON FOR EXAM\", \"REASON FOR EXAMINATION\",\n",
+    "    \"TYPE OF EXAMINATION\", \"PROCEDURE\",\n",
+    "    \"NOTIFICATION\", \"NOTIFICATIONS\", \"ADDENDUM\",\n",
+    "    \"STUDY\", \"DATE\", \"CC\", \"NOTE\", \"COMMENT\", \"COMMENTS\",\n",
+    "    \"FINAL REPORT\",\n",
+    "}\n",
+    "\n",
+    "\n",
+    "def classify_header(h: str) -> str:\n",
+    "    \"\"\"Classify a header as findings / impression / admin / view_desc / other.\n",
+    "\n",
+    "    Exact membership in IMPRESSION_KEYWORDS is tested first, so combined\n",
+    "    headers listed there (e.g. \"FINDINGS AND IMPRESSION\") are not captured\n",
+    "    by the looser \"FINDING\" substring rule below.\n",
+    "    \"\"\"\n",
+    "    h = h.upper().strip()\n",
+    "    if h in IMPRESSION_KEYWORDS:\n",
+    "        return \"impression\"\n",
+    "    if h in FINDINGS_KEYWORDS or \"FINDING\" in h:\n",
+    "        return \"findings\"\n",
+    "    # Substring fallback for impression-like headers not listed explicitly.\n",
+    "    if \"IMPRESSION\" in h or \"CONCLUSION\" in h:\n",
+    "        return \"impression\"\n",
+    "    if h in ADMIN_KEYWORDS:\n",
+    "        return \"admin\"\n",
+    "    if FINDINGS_VIEW_RE.search(h):\n",
+    "        return \"view_desc\"   # potential findings\n",
+    "    return \"other\"\n",
+    "\n",
+    "\n",
+    "def parse_report(txt_path: Path) -> dict:\n",
+    "    \"\"\"\n",
+    "    Trả về dict:\n",
+    "      findings   : str | None\n",
+    "      impression : str | None\n",
+    "      sections   : list of (header, category, content)\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        text = txt_path.read_text(encoding=\"utf-8\", errors=\"ignore\")\n",
+    "    except FileNotFoundError:\n",
+    "        return {\"findings\": None, \"impression\": None, \"sections\": []}\n",
+    "\n",
+    "    matches = list(SECTION_RE.finditer(text))\n",
+    "    if not matches:\n",
+    "        return {\"findings\": None, \"impression\": None, \"sections\": []}\n",
+    "\n",
+    "    sections = []\n",
+    "    for i, m in enumerate(matches):\n",
+    "        header  = m.group(1).strip()\n",
+    "        start   = m.end()\n",
+    "        end     = matches[i + 1].start() if i + 1 < len(matches) else len(text)\n",
+    "        content = text[start:end].strip()\n",
+    "        cat     = classify_header(header)\n",
+    "        sections.append((header, cat, content))\n",
+    "\n",
+    "    findings = impression = None\n",
+    "\n",
+    "    # Pass 1: tìm tường minh\n",
+    "    for header, cat, content in sections:\n",
+    "        if cat == \"findings\" and findings is None:\n",
+    "            findings = content or None\n",
+    "        elif cat == \"impression\" and impression is None:\n",
+    "            impression = content or None\n",
+    "\n",
+    "    # Pass 2: fallback findings từ view_desc\n",
+    "    if findings is None:\n",
+    "        for header, cat, content in sections:\n",
+    "            if cat == \"view_desc\" and content:\n",
+    "                findings = content\n",
+    "                break\n",
+    "\n",
+    "    return {\"findings\": findings, \"impression\": impression, \"sections\": sections}\n",
+    "\n",
+    "\n",
+    "print(\"Parser defined.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Load & parse tất cả reports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Lấy danh sách study từ split.csv để biết subset của từng study\n",
+    "split_df = pd.read_csv(SPLIT_CSV)\n",
+    "split_df[\"subset\"] = \"p\" + split_df[\"subject_id\"].astype(str).str[:2]\n",
+    "\n",
+    "studies = (\n",
+    "    split_df[[\"subject_id\", \"study_id\", \"subset\", \"split\"]]\n",
+    "    .drop_duplicates(\"study_id\")\n",
+    "    .reset_index(drop=True)\n",
+    ")\n",
+    "\n",
+    "if SAMPLE_SIZE:\n",
+    "    studies = studies.sample(n=min(SAMPLE_SIZE, len(studies)), random_state=42).reset_index(drop=True)\n",
+    "    print(f\"Sample: {len(studies):,} studies\")\n",
+    "else:\n",
+    "    print(f\"Total studies to parse: {len(studies):,}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "records      = []\n",
+    "header_counter = Counter()\n",
+    "\n",
+    "for _, row in studies.iterrows():\n",
+    "    sid    = str(row[\"subject_id\"])\n",
+    "    stid   = str(row[\"study_id\"])\n",
+    "    subset = row[\"subset\"]\n",
+    "    split  = row[\"split\"]\n",
+    "    path   = CXR_ROOT / \"files\" / subset / f\"p{sid}\" / f\"s{stid}.txt\"\n",
+    "\n",
+    "    result = parse_report(path)\n",
+    "\n",
+    "    for header, cat, _ in result[\"sections\"]:\n",
+    "        header_counter[header.upper()] += 1\n",
+    "\n",
+    "    records.append({\n",
+    "        \"study_id\":      stid,\n",
+    "        \"subject_id\":    sid,\n",
+    "        \"subset\":        subset,\n",
+    "        \"split\":         split,\n",
+    "        \"findings\":      result[\"findings\"],\n",
+    "        \"impression\":    result[\"impression\"],\n",
+    "        \"n_sections\":    len(result[\"sections\"]),\n",
+    "        \"section_headers\": \"|\".join(h for h, _, _ in result[\"sections\"]),\n",
+    "    })\n",
+    "\n",
+    "df = pd.DataFrame(records)\n",
+    "df[\"findings_len\"]   = df[\"findings\"].str.split().str.len()\n",
+    "df[\"impression_len\"] = df[\"impression\"].str.split().str.len()\n",
+    "\n",
+    "total = len(df)\n",
+    "has_f = df[\"findings\"].notna().sum()\n",
+    "has_i = df[\"impression\"].notna().sum()\n",
+    "has_both    = (df[\"findings\"].notna() & df[\"impression\"].notna()).sum()\n",
+    "has_neither = (df[\"findings\"].isna()  & df[\"impression\"].isna()).sum()\n",
+    "\n",
+    "print(f\"Total studies parsed  : {total:,}\")\n",
+    "print(f\"Has findings          : {has_f:,}  ({has_f/total*100:.1f}%)\")\n",
+    "print(f\"Has impression        : {has_i:,}  ({has_i/total*100:.1f}%)\")\n",
+    "print(f\"Has both              : {has_both:,}  ({has_both/total*100:.1f}%)\")\n",
+    "print(f\"Has neither           : {has_neither:,}  ({has_neither/total*100:.1f}%)\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Thống kê tất cả section headers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Bảng đầy đủ tất cả headers + category\n",
+    "header_rows = []\n",
+    "for h, cnt in header_counter.most_common():\n",
+    "    header_rows.append({\n",
+    "        \"header\":   h,\n",
+    "        \"count\":    cnt,\n",
+    "        \"category\": classify_header(h),\n",
+    "        \"pct\":      cnt / total * 100\n",
+    "    })\n",
+    "\n",
+    "header_df = pd.DataFrame(header_rows)\n",
+    "\n",
+    "print(f\"Distinct section headers: {len(header_df)}\")\n",
+    "print(\"\\n=== Top 50 headers ===\")\n",
+    "print(header_df.head(50).to_string(index=False))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Phân bố theo category\n",
+    "cat_summary = header_df.groupby(\"category\")[\"count\"].sum().sort_values(ascending=False)\n",
+    "print(\"=== Tổng count theo category ===\")\n",
+    "print(cat_summary.to_string())\n",
+    "\n",
+    "fig, axes = plt.subplots(1, 2, figsize=(13, 4))\n",
+    "\n",
+    "# Bar: category totals\n",
+    "colors = {\"findings\": \"#4C72B0\", \"impression\": \"#DD8452\",\n",
+    "          \"admin\": \"#8c8c8c\", \"view_desc\": \"#55A868\", \"other\": \"#C44E52\"}\n",
+    "cat_colors = [colors.get(c, \"gray\") for c in cat_summary.index]\n",
+    "bars = axes[0].bar(cat_summary.index, cat_summary.values, color=cat_colors)\n",
+    "axes[0].bar_label(bars, fmt=\"%d\")\n",
+    "axes[0].set_title(\"Tổng số lần xuất hiện theo category\")\n",
+    "axes[0].set_ylabel(\"Count\")\n",
+    "axes[0].tick_params(axis=\"x\", rotation=20)\n",
+    "\n",
+    "# Bar: số header distinct mỗi category\n",
+    "cat_distinct = header_df.groupby(\"category\").size().sort_values(ascending=False)\n",
+    "bars2 = axes[1].bar(cat_distinct.index, cat_distinct.values,\n",
+    "                    color=[colors.get(c, \"gray\") for c in cat_distinct.index])\n",
+    "axes[1].bar_label(bars2, fmt=\"%d\")\n",
+    "axes[1].set_title(\"Số header phân biệt mỗi category\")\n",
+    "axes[1].set_ylabel(\"Distinct headers\")\n",
+    "axes[1].tick_params(axis=\"x\", rotation=20)\n",
+    "\n",
+    "plt.suptitle(\"Section Header Categories\", fontsize=13)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Top headers mỗi category\n",
+    "for cat in [\"findings\", \"impression\", \"view_desc\", \"other\"]:\n",
+    "    sub = header_df[header_df[\"category\"] == cat].head(15)\n",
+    "    print(f\"\\n=== [{cat}] Top headers ===\")\n",
+    "    print(sub[[\"header\", \"count\", \"pct\"]].to_string(index=False))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Top 20 headers — horizontal bar\n",
+    "top20 = header_df.head(20).copy()\n",
+    "top20_colors = [colors.get(c, \"gray\") for c in top20[\"category\"]]\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(10, 7))\n",
+    "bars = ax.barh(top20[\"header\"][::-1], top20[\"count\"][::-1], color=top20_colors[::-1])\n",
+    "ax.bar_label(bars, fmt=\"%d\", padding=3, fontsize=8)\n",
+    "ax.set_xlabel(\"Count\")\n",
+    "ax.set_title(\"Top 20 Section Headers (tô màu theo category)\")\n",
+    "\n",
+    "from matplotlib.patches import Patch\n",
+    "legend_elements = [Patch(facecolor=v, label=k) for k, v in colors.items()]\n",
+    "ax.legend(handles=legend_elements, loc=\"lower right\", fontsize=9)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Tỉ lệ có/thiếu Findings & Impression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tạo cột status\n",
+    "def report_status(row):\n",
+    "    \"\"\"Label a row by which of 'findings' / 'impression' are present.\"\"\"\n",
+    "    # pd.notna treats both None and NaN as missing, so this agrees with the\n",
+    "    # .notna() based counts even when missing values are stored as NaN.\n",
+    "    f = pd.notna(row[\"findings\"])\n",
+    "    i = pd.notna(row[\"impression\"])\n",
+    "    if f and i:\n",
+    "        return \"both\"\n",
+    "    if f:\n",
+    "        return \"findings only\"\n",
+    "    if i:\n",
+    "        return \"impression only\"\n",
+    "    return \"neither\"\n",
+    "\n",
+    "df[\"status\"] = df.apply(report_status, axis=1)\n",
+    "\n",
+    "status_counts = df[\"status\"].value_counts()\n",
+    "print(\"=== Report completeness (full dataset) ===\")\n",
+    "for s, c in status_counts.items():\n",
+    "    print(f\"  {s:<20}: {c:>7,}  ({c/total*100:.1f}%)\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, axes = plt.subplots(1, 2, figsize=(12, 4))\n",
+    "status_order  = [\"both\", \"impression only\", \"findings only\", \"neither\"]\n",
+    "status_colors = [\"#55A868\", \"#DD8452\", \"#4C72B0\", \"#C44E52\"]\n",
+    "vals = [status_counts.get(s, 0) for s in status_order]\n",
+    "\n",
+    "bars = axes[0].bar(status_order, vals, color=status_colors)\n",
+    "axes[0].bar_label(bars, fmt=\"%d\")\n",
+    "axes[0].set_title(\"Report completeness (count)\")\n",
+    "axes[0].set_ylabel(\"Số report\")\n",
+    "axes[0].tick_params(axis=\"x\", rotation=15)\n",
+    "\n",
+    "axes[1].pie(vals, labels=status_order, autopct=\"%1.1f%%\", colors=status_colors,\n",
+    "            startangle=140)\n",
+    "axes[1].set_title(\"Report completeness (%)\")\n",
+    "\n",
+    "plt.suptitle(\"Findings & Impression Availability\", fontsize=13)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Breakdown theo Subset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "subset_stats = df.groupby(\"subset\").apply(lambda g: pd.Series({\n",
+    "    \"total\":           len(g),\n",
+    "    \"has_findings\":    g[\"findings\"].notna().sum(),\n",
+    "    \"has_impression\":  g[\"impression\"].notna().sum(),\n",
+    "    \"has_both\":        (g[\"findings\"].notna() & g[\"impression\"].notna()).sum(),\n",
+    "    \"has_neither\":     (g[\"findings\"].isna()  & g[\"impression\"].isna()).sum(),\n",
+    "})).reindex(ALL_SUBSETS)\n",
+    "\n",
+    "subset_pct = subset_stats.div(subset_stats[\"total\"], axis=0).mul(100).round(1)\n",
+    "\n",
+    "print(\"=== Count per subset ===\")\n",
+    "print(subset_stats.to_string())\n",
+    "print(\"\\n=== % per subset ===\")\n",
+    "print(subset_pct[[\"has_findings\",\"has_impression\",\"has_both\",\"has_neither\"]].to_string())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, axes = plt.subplots(1, 2, figsize=(15, 5))\n",
+    "\n",
+    "# Stacked % bar\n",
+    "status_per_subset = (\n",
+    "    df.groupby([\"subset\", \"status\"]).size()\n",
+    "    .unstack(fill_value=0)\n",
+    "    .reindex(ALL_SUBSETS, fill_value=0)\n",
+    ")\n",
+    "# Tỉ lệ %\n",
+    "status_pct_subset = status_per_subset.div(status_per_subset.sum(axis=1), axis=0) * 100\n",
+    "status_pct_subset = status_pct_subset.reindex(\n",
+    "    columns=[c for c in status_order if c in status_pct_subset.columns]\n",
+    ")\n",
+    "status_pct_subset.plot(\n",
+    "    kind=\"bar\", stacked=True, ax=axes[0],\n",
+    "    color=[status_colors[status_order.index(c)] for c in status_pct_subset.columns],\n",
+    "    width=0.75\n",
+    ")\n",
+    "axes[0].set_title(\"Report completeness (%) theo subset\")\n",
+    "axes[0].set_ylabel(\"%\")\n",
+    "axes[0].set_ylim(0, 105)\n",
+    "axes[0].tick_params(axis=\"x\", rotation=0)\n",
+    "axes[0].legend(loc=\"lower right\", fontsize=8)\n",
+    "\n",
+    "# Heatmap tỉ lệ % has_both / has_neither\n",
+    "heatmap_data = subset_pct[[\"has_findings\",\"has_impression\",\"has_both\",\"has_neither\"]]\n",
+    "sns.heatmap(heatmap_data, annot=True, fmt=\".1f\", cmap=\"RdYlGn\",\n",
+    "            linewidths=0.5, ax=axes[1], vmin=0, vmax=100,\n",
+    "            cbar_kws={\"label\": \"%\"})\n",
+    "axes[1].set_title(\"Tỉ lệ (%) completeness mỗi subset\")\n",
+    "\n",
+    "plt.suptitle(\"Report Completeness per Subset\", fontsize=13)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Breakdown theo split (train/validate/test)\n",
+    "split_stats = df.groupby(\"split\").apply(lambda g: pd.Series({\n",
+    "    \"total\":          len(g),\n",
+    "    \"has_findings %\":  g[\"findings\"].notna().mean() * 100,\n",
+    "    \"has_impression %\": g[\"impression\"].notna().mean() * 100,\n",
+    "    \"has_both %\":     (g[\"findings\"].notna() & g[\"impression\"].notna()).mean() * 100,\n",
+    "    \"has_neither %\":  (g[\"findings\"].isna()  & g[\"impression\"].isna()).mean() * 100,\n",
+    "})).reindex([\"train\", \"validate\", \"test\"])\n",
+    "\n",
+    "print(\"=== Completeness by split ===\")\n",
+    "print(split_stats.round(1).to_string())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. Phân phối độ dài Findings & Impression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"=== Findings word count ===\")\n",
+    "print(df[\"findings_len\"].describe().round(1).to_string())\n",
+    "print(\"\\n=== Impression word count ===\")\n",
+    "print(df[\"impression_len\"].describe().round(1).to_string())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
+    "for ax, col, title, color in zip(\n",
+    "    axes,\n",
+    "    [\"findings_len\", \"impression_len\"],\n",
+    "    [\"Findings — độ dài (số từ)\", \"Impression — độ dài (số từ)\"],\n",
+    "    [\"#4C72B0\", \"#DD8452\"]\n",
+    "):\n",
+    "    data = df[col].dropna()\n",
+    "    p99  = data.quantile(0.99)\n",
+    "    ax.hist(data[data <= p99], bins=60, color=color, edgecolor=\"white\", alpha=0.85)\n",
+    "    ax.axvline(data.median(), color=\"black\", ls=\"--\", lw=1.3, label=f\"Median={data.median():.0f}\")\n",
+    "    ax.axvline(data.mean(),   color=\"gray\",  ls=\":\",  lw=1.3, label=f\"Mean={data.mean():.0f}\")\n",
+    "    ax.set_title(title)\n",
+    "    ax.set_xlabel(\"Số từ\")\n",
+    "    ax.set_ylabel(\"Số report\")\n",
+    "    ax.legend(fontsize=9)\n",
+    "    ax.text(0.97, 0.95, f\"n={len(data):,}\\n(≤p99={p99:.0f}w)\",\n",
+    "            transform=ax.transAxes, ha=\"right\", va=\"top\", fontsize=8, color=\"gray\")\n",
+    "\n",
+    "plt.suptitle(\"Phân phối độ dài Findings & Impression\", fontsize=13)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Boxplot: Findings vs Impression\n",
+    "combined = pd.concat([\n",
+    "    df[[\"findings_len\"]].rename(columns={\"findings_len\": \"words\"}).assign(section=\"Findings\"),\n",
+    "    df[[\"impression_len\"]].rename(columns={\"impression_len\": \"words\"}).assign(section=\"Impression\"),\n",
+    "]).dropna()\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(7, 4))\n",
+    "sns.boxplot(data=combined, x=\"section\", y=\"words\",\n",
+    "            palette=[\"#4C72B0\", \"#DD8452\"], showfliers=False, ax=ax)\n",
+    "ax.set_title(\"Findings vs Impression — độ dài (no outliers)\")\n",
+    "ax.set_ylabel(\"Số từ\")\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Median length per subset\n",
+    "med_subset = df.groupby(\"subset\")[[\"findings_len\",\"impression_len\"]].median().reindex(ALL_SUBSETS)\n",
+    "\n",
+    "med_subset.plot(kind=\"bar\", figsize=(12, 4),\n",
+    "                color=[\"#4C72B0\", \"#DD8452\"], width=0.7)\n",
+    "plt.title(\"Median độ dài Findings & Impression theo subset\")\n",
+    "plt.xlabel(\"Subset\")\n",
+    "plt.ylabel(\"Median (số từ)\")\n",
+    "plt.xticks(rotation=0)\n",
+    "plt.legend([\"Findings\", \"Impression\"])\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Heatmap percentile độ dài theo subset\n",
+    "for col, label in [(\"findings_len\", \"Findings\"), (\"impression_len\", \"Impression\")]:\n",
+    "    pct_data = df.groupby(\"subset\")[col].describe(\n",
+    "        percentiles=[.25, .5, .75, .95]\n",
+    "    )[[\"count\", \"mean\", \"25%\", \"50%\", \"75%\", \"95%\", \"max\"]].reindex(ALL_SUBSETS).round(0)\n",
+    "    print(f\"\\n=== {label} length per subset ===\")\n",
+    "    print(pct_data.to_string())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 7. Reports \"has neither\" — phân tích thêm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "neither_df = df[df[\"status\"] == \"neither\"].copy()\n",
+    "print(f\"Reports không có cả findings lẫn impression: {len(neither_df):,}\")\n",
+    "print(f\"\\nPhân bố n_sections của những report này:\")\n",
+    "print(neither_df[\"n_sections\"].value_counts().sort_index().head(10).to_string())\n",
+    "\n",
+    "# Xem top headers trong những report này\n",
+    "neither_headers = Counter()\n",
+    "for row in neither_df[\"section_headers\"]:\n",
+    "    if isinstance(row, str):\n",
+    "        for h in row.split(\"|\"):\n",
+    "            if h:\n",
+    "                neither_headers[h] += 1\n",
+    "\n",
+    "print(\"\\nTop section headers trong reports 'neither':\")\n",
+    "for h, c in neither_headers.most_common(15):\n",
+    "    print(f\"  {c:>6,}  {h}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Vài ví dụ report có neither\n",
+    "import random\n",
+    "random.seed(0)\n",
+    "sample_neither = neither_df.sample(min(3, len(neither_df)), random_state=0)\n",
+    "for _, row in sample_neither.iterrows():\n",
+    "    path = CXR_ROOT / \"files\" / row[\"subset\"] / f\"p{row['subject_id']}\" / f\"s{row['study_id']}.txt\"\n",
+    "    print(f\"\\n{'='*60}\")\n",
+    "    print(f\"s{row['study_id']}.txt  (sections: {row['section_headers']})\")\n",
+    "    try:\n",
+    "        print(path.read_text(encoding=\"utf-8\", errors=\"ignore\")[:600])\n",
+    "    except:\n",
+    "        print(\"[file not found]\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 8. Số section mỗi report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sec_dist = df[\"n_sections\"].value_counts().sort_index()\n",
+    "print(\"Phân bố số sections mỗi report:\")\n",
+    "print(sec_dist.head(20).to_string())\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(11, 4))\n",
+    "sec_clip = sec_dist[sec_dist.index <= df[\"n_sections\"].quantile(0.99)]\n",
+    "ax.bar(sec_clip.index.astype(str), sec_clip.values, color=\"steelblue\")\n",
+    "ax.set_xlabel(\"Số sections\")\n",
+    "ax.set_ylabel(\"Số report\")\n",
+    "ax.set_title(\"Phân bố số sections mỗi report (≤p99)\")\n",
+    "ax.axvline(str(int(df[\"n_sections\"].median())), color=\"black\", ls=\"--\",\n",
+    "           label=f\"Median={df['n_sections'].median():.0f}\")\n",
+    "ax.legend()\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 9. Summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"=\"*60)\n",
+    "print(\"  MIMIC-CXR Report EDA Summary\")\n",
+    "print(\"=\"*60)\n",
+    "print(f\"  Total reports parsed      : {total:,}\")\n",
+    "print(f\"  Distinct section headers  : {len(header_df)}\")\n",
+    "print()\n",
+    "print(f\"  Has findings              : {has_f:,}  ({has_f/total*100:.1f}%)\")\n",
+    "print(f\"  Has impression            : {has_i:,}  ({has_i/total*100:.1f}%)\")\n",
+    "print(f\"  Has BOTH (usable)         : {has_both:,}  ({has_both/total*100:.1f}%)\")\n",
+    "print(f\"  Has neither               : {has_neither:,}  ({has_neither/total*100:.1f}%)\")\n",
+    "print()\n",
+    "print(f\"  Findings  median length   : {df['findings_len'].median():.0f} words\")\n",
+    "print(f\"  Impression median length  : {df['impression_len'].median():.0f} words\")\n",
+    "print(\"=\"*60)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

model/cxr_vlm.py CHANGED Viewed

@@ -53,10 +53,14 @@ class CXRVisionLanguageModel(nn.Module):
         super().__init__()
         self.cfg = model_cfg
-        # ── 1. Image Encoder (BioViL-T, frozen) ─────────────────────────────
         self.image_encoder = BioViLTEncoder(
             frozen   = model_cfg.image_encoder.frozen,
             img_size = model_cfg.image_encoder.img_size,
         )
         # ── 2. MLP Projection (trained) ──────────────────────────────────────

         super().__init__()
         self.cfg = model_cfg
+        # ── 1. Image Encoder (rad_dino / biovilt / vit, frozen) ─────────────
+        # `backend` defaults to "auto" → tries rad_dino → biovilt → vit and
+        # uses the first that loads (see model/image_encoder.py docstring).
+        _enc_backend = getattr(model_cfg.image_encoder, "backend", "auto")
         self.image_encoder = BioViLTEncoder(
             frozen   = model_cfg.image_encoder.frozen,
             img_size = model_cfg.image_encoder.img_size,
+            backend  = _enc_backend,
         )
         # ── 2. MLP Projection (trained) ──────────────────────────────────────

model/image_encoder.py CHANGED Viewed

@@ -1,16 +1,24 @@
 """
 image_encoder.py
 ----------------
-Vision encoder wrapper.
-Originally this used BioViL-T via Microsoft's `hi-ml-multimodal` package.
-That package has not been updated for Python 3.12 and fails to install on
-recent Kaggle images, so we fall back to a ViT-B/16 from `timm` (ImageNet
-pretrained, 768-dim patch features — same output shape contract as BioViL-T,
-so the rest of the model is unchanged).
-If you want real BioViL-T weights, install `hi-ml-multimodal` in a Python 3.10
-environment and set `backend="biovilt"` in the constructor.
 """
 import torch
@@ -23,6 +31,12 @@ try:
 except ImportError:
     TIMM_AVAILABLE = False
 try:
     from health_multimodal.image import get_biovil_t_image_encoder
     from health_multimodal.image.data.transforms import create_chest_xray_transform_for_inference
@@ -31,12 +45,17 @@ except ImportError:
     BIOVIL_AVAILABLE = False
 class BioViLTEncoder(nn.Module):
     """
-    Image encoder. Name kept for backward compatibility; actual backbone
-    depends on `backend`:
-      - "biovilt": microsoft BioViL-T (needs hi-ml-multimodal)
-      - "vit":     timm ViT-B/16 ImageNet pretrained  ← default fallback
     Output contract: (B, num_patches, 768)
     """
@@ -44,51 +63,115 @@ class BioViLTEncoder(nn.Module):
     PATCH_DIM = 768
     IMG_SIZE  = 448
     def __init__(
         self,
         frozen:   bool = True,
         img_size: int  = 448,
-        backend:  str  = "auto",         # "auto" | "biovilt" | "vit"
         device:   Optional[str] = None,
     ):
         super().__init__()
-        self.img_size = img_size
-        self.frozen   = frozen
-        if backend == "auto":
-            backend = "biovilt" if BIOVIL_AVAILABLE else "vit"
-        self.backend = backend
-        if backend == "biovilt":
-            if not BIOVIL_AVAILABLE:
-                raise ImportError("hi-ml-multimodal not installed; choose backend='vit'")
-            self.encoder = get_biovil_t_image_encoder()
-        elif backend == "vit":
-            if not TIMM_AVAILABLE:
-                raise ImportError("timm is required for vit backend. pip install timm")
-            # ViT-B/16 — 768-dim patch features. img_size overridden to 224 (ViT's native);
-            # the image transform handles resize, so downstream code doesn't change.
-            self.encoder = timm.create_model(
-                "vit_base_patch16_224",
-                pretrained   = True,
-                num_classes  = 0,           # drop classifier head
-                global_pool  = "",          # keep patch tokens
             )
-            # ViT native input size is 224 — override our own stored img_size
-            self.img_size = 224
-        else:
-            raise ValueError(f"Unknown backend: {backend}")
         if frozen:
             self._freeze()
         print(f"[BioViLTEncoder] backend={self.backend} frozen={frozen} img_size={self.img_size}")
     def _freeze(self):
         for p in self.encoder.parameters():
             p.requires_grad = False
         self.encoder.eval()
     def forward(self, images: torch.Tensor) -> torch.Tensor:
         """
         Args:
@@ -98,38 +181,85 @@ class BioViLTEncoder(nn.Module):
         """
         ctx = torch.no_grad() if self.frozen else torch.enable_grad()
         with ctx:
-            if self.backend == "biovilt":
                 out = self.encoder(images)
                 feats = out.patch_embedding                   # (B, 768, H', W')
                 B, C, H, W = feats.shape
                 feats = feats.flatten(2).transpose(1, 2)       # (B, H'*W', 768)
             else:  # vit
                 # timm ViT with num_classes=0, global_pool="" returns (B, N+1, 768)
                 # where token 0 is [CLS]. Drop it.
-                feats = self.encoder.forward_features(images)  # (B, N+1, 768)
                 if feats.ndim == 3 and feats.shape[1] > 1:
-                    feats = feats[:, 1:, :]                    # drop CLS
         return feats
     @staticmethod
-    def get_transform(split: str = "train"):
         """
-        Return an image transform. Uses BioViL-T's transform if available,
-        otherwise a generic ViT-compatible transform.
         """
-        if BIOVIL_AVAILABLE:
-            return create_chest_xray_transform_for_inference(
-                width  = BioViLTEncoder.IMG_SIZE,
-                height = BioViLTEncoder.IMG_SIZE,
-            )
-        # Fallback ViT transform — 224×224, ImageNet norm
-        from torchvision import transforms
-        return transforms.Compose([
-            transforms.Resize((224, 224)),
-            transforms.ToTensor(),
-            transforms.Normalize(mean=[0.485, 0.456, 0.406],
-                                 std=[0.229, 0.224, 0.225]),
-        ])
     @property
     def output_dim(self) -> int:

 """
 image_encoder.py
 ----------------
+Vision encoder wrapper. Tries multiple backends in this priority order:
+  1. "rad_dino" — Microsoft RAD-DINO (microsoft/rad-dino, HF Hub).
+                  Chest-X-ray self-supervised DINOv2. Loaded via the
+                  `transformers` library, so works on Python 3.12.
+                  ⭐ recommended for CXR.
+  2. "biovilt"  — Microsoft BioViL-T via `hi-ml-multimodal`.
+                  Original choice; package requires Python <3.11 so
+                  it doesn't install on recent Colab/Kaggle images.
+  3. "vit"      — timm ViT-B/16 ImageNet-pretrained.
+                  Generic fallback (not domain-pretrained).
+All backends output (B, num_patches, 768). The downstream MLP Projection
+layer cross-attention-pools that to 32 visual tokens regardless of
+num_patches, so swapping backends does not break anything else.
+To force a specific backend, set `image_encoder.backend` in
+`configs/model_config.yaml` to one of the names above.
 """
 import torch
 except ImportError:
     TIMM_AVAILABLE = False
+try:
+    from transformers import AutoModel, AutoImageProcessor
+    HF_TRANSFORMERS_AVAILABLE = True
+except ImportError:
+    HF_TRANSFORMERS_AVAILABLE = False
 try:
     from health_multimodal.image import get_biovil_t_image_encoder
     from health_multimodal.image.data.transforms import create_chest_xray_transform_for_inference
     BIOVIL_AVAILABLE = False
+RAD_DINO_ID = "microsoft/rad-dino"
 class BioViLTEncoder(nn.Module):
     """
+    Image encoder. Name kept for backward compatibility with existing
+    checkpoints; actual backbone depends on `backend`:
+      - "auto":     try rad_dino → biovilt → vit, first that loads wins
+      - "rad_dino": Microsoft RAD-DINO (HF Hub) ⭐ recommended for CXR
+      - "biovilt":  Microsoft BioViL-T (hi-ml-multimodal)
+      - "vit":      timm ViT-B/16 ImageNet pretrained
     Output contract: (B, num_patches, 768)
     """
     PATCH_DIM = 768
     IMG_SIZE  = 448
+    # Native input size per backend (used when caller passes img_size=None;
+    # the "vit" backend always uses its native size regardless of img_size)
+    _DEFAULT_SIZE = {
+        "rad_dino": 518,   # RAD-DINO trained at 518×518 (patch 14)
+        "biovilt":  448,   # BioViL-T trained at 448×448
+        "vit":      224,   # ViT-B/16 native 224×224 (patch 16)
+    }
     def __init__(
         self,
         frozen:   bool = True,
         img_size: int  = 448,
+        backend:  str  = "auto",         # "auto" | "rad_dino" | "biovilt" | "vit"
         device:   Optional[str] = None,
     ):
         super().__init__()
+        self.frozen = frozen
+        # `img_size` may be overridden by the chosen backend's native size if
+        # the caller didn't pass anything specific.
+        self._requested_img_size = img_size
+        # ── Resolve backend ──────────────────────────────────────────────
+        # "auto" tries each candidate in priority order and uses the first
+        # one that successfully loads. Per-backend load failures are caught
+        # and logged so a missing dependency on one path doesn't kill the
+        # run; only if EVERY backend fails do we raise.
+        candidates = (
+            ("rad_dino", "biovilt", "vit") if backend == "auto" else (backend,)
+        )
+        last_error = None
+        chosen = None
+        for cand in candidates:
+            ok, err = self._try_load_backend(cand, img_size)
+            if ok:
+                chosen = cand
+                break
+            last_error = err
+            print(f"[BioViLTEncoder] backend '{cand}' unavailable: {err}")
+        if chosen is None:
+            raise RuntimeError(
+                f"No image encoder backend could be loaded. Last error: {last_error}"
             )
+        self.backend = chosen
         if frozen:
             self._freeze()
         print(f"[BioViLTEncoder] backend={self.backend} frozen={frozen} img_size={self.img_size}")
+    # ────────────────────────────────────────────────────────────────────
+    # Backend loading
+    # ────────────────────────────────────────────────────────────────────
+    def _try_load_backend(self, backend: str, img_size_hint: Optional[int]):
+        """
+        Try to load `backend`. Returns (success: bool, error_or_None).
+        On success, sets self.encoder and self.img_size.
+        """
+        try:
+            if backend == "rad_dino":
+                if not HF_TRANSFORMERS_AVAILABLE:
+                    return False, "transformers not installed"
+                # Load via HF transformers — works on any Python version
+                # that runs `transformers`. Weights download from HF Hub on
+                # first use (~340MB, cached afterwards).
+                self.encoder = AutoModel.from_pretrained(RAD_DINO_ID)
+                # img_size_hint=448 will be honoured if user set it; otherwise
+                # default to 518 (RAD-DINO's native training resolution).
+                self.img_size = img_size_hint or self._DEFAULT_SIZE["rad_dino"]
+                return True, None
+            elif backend == "biovilt":
+                if not BIOVIL_AVAILABLE:
+                    return False, "hi-ml-multimodal not installed"
+                self.encoder = get_biovil_t_image_encoder()
+                self.img_size = img_size_hint or self._DEFAULT_SIZE["biovilt"]
+                return True, None
+            elif backend == "vit":
+                if not TIMM_AVAILABLE:
+                    return False, "timm not installed"
+                self.encoder = timm.create_model(
+                    "vit_base_patch16_224",
+                    pretrained   = True,
+                    num_classes  = 0,        # drop classifier head
+                    global_pool  = "",       # keep patch tokens
+                )
+                # ViT-B/16 is locked to 224 by its position embeddings
+                self.img_size = self._DEFAULT_SIZE["vit"]
+                return True, None
+            else:
+                return False, f"unknown backend name: {backend!r}"
+        except Exception as e:
+            # AutoModel.from_pretrained may fail on network / auth / disk.
+            # Treat it as "backend unavailable" so auto-fallback can proceed.
+            return False, f"{type(e).__name__}: {e}"
     def _freeze(self):
         for p in self.encoder.parameters():
             p.requires_grad = False
         self.encoder.eval()
+    # ────────────────────────────────────────────────────────────────────
+    # Forward
+    # ────────────────────────────────────────────────────────────────────
     def forward(self, images: torch.Tensor) -> torch.Tensor:
         """
         Args:
         """
         ctx = torch.no_grad() if self.frozen else torch.enable_grad()
         with ctx:
+            if self.backend == "rad_dino":
+                # HF AutoModel returns BaseModelOutput; last_hidden_state has
+                # shape (B, N+1, 768) where token 0 is the CLS — drop it.
+                out = self.encoder(pixel_values=images)
+                feats = out.last_hidden_state
+                if feats.ndim == 3 and feats.shape[1] > 1:
+                    feats = feats[:, 1:, :]
+            elif self.backend == "biovilt":
                 out = self.encoder(images)
                 feats = out.patch_embedding                   # (B, 768, H', W')
                 B, C, H, W = feats.shape
                 feats = feats.flatten(2).transpose(1, 2)       # (B, H'*W', 768)
             else:  # vit
                 # timm ViT with num_classes=0, global_pool="" returns (B, N+1, 768)
                 # where token 0 is [CLS]. Drop it.
+                feats = self.encoder.forward_features(images)
                 if feats.ndim == 3 and feats.shape[1] > 1:
+                    feats = feats[:, 1:, :]
         return feats
+    # ────────────────────────────────────────────────────────────────────
+    # Image transform (preprocessing)
+    # ────────────────────────────────────────────────────────────────────
     @staticmethod
+    def get_transform(split: str = "train", backend: str = "auto"):
         """
+        Return an image transform that matches the chosen backend's expected
+        normalization and input size.
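+        Priority is the same as backend selection (rad_dino → biovilt → vit).
+        `backend="auto"` picks whichever transform we can construct; this is
+        resolved independently of the encoder's own fallback, so it may not
+        match the backbone that actually loaded. Pass an explicit backend name
+        (e.g. the encoder instance's `backend` attribute) to force one.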
+        The returned object is callable: `transform(pil_image) -> tensor`.
         """
+        candidates = (
+            ("rad_dino", "biovilt", "vit") if backend == "auto" else (backend,)
+        )
+        for cand in candidates:
+            try:
+                if cand == "rad_dino" and HF_TRANSFORMERS_AVAILABLE:
+                    # RAD-DINO ships its own preprocessor (correct chest-X-ray
+                    # specific mean/std, native 518×518 resize, RGB channels).
+                    proc = AutoImageProcessor.from_pretrained(RAD_DINO_ID)
+                    def _rad_dino_transform(pil_img):
+                        return proc(images=pil_img, return_tensors="pt")["pixel_values"][0]
+                    return _rad_dino_transform
+                if cand == "biovilt" and BIOVIL_AVAILABLE:
+                    return create_chest_xray_transform_for_inference(
+                        width  = BioViLTEncoder._DEFAULT_SIZE["biovilt"],
+                        height = BioViLTEncoder._DEFAULT_SIZE["biovilt"],
+                    )
+                if cand == "vit":
+                    from torchvision import transforms
+                    size = BioViLTEncoder._DEFAULT_SIZE["vit"]
+                    return transforms.Compose([
+                        transforms.Resize((size, size)),
+                        transforms.ToTensor(),
+                        transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                             std=[0.229, 0.224, 0.225]),
+                    ])
+            except Exception as e:
+                print(f"[BioViLTEncoder.get_transform] '{cand}' transform "
+                      f"unavailable: {type(e).__name__}: {e}")
+                continue
+        raise RuntimeError(
+            "No image transform could be constructed (rad_dino/biovilt/vit all failed)"
+        )
     @property
     def output_dim(self) -> int: