convitom committed on
Commit
78b85ff
·
1 Parent(s): 9dadb47

chore: ignore .claude worktrees

Browse files
.claude/worktrees/strange-agnesi-73641a ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 9dadb472ab6ab5dee7a656bf525b249a605a68ff
.gitignore CHANGED
Binary files a/.gitignore and b/.gitignore differ
 
configs/model_config.yaml CHANGED
@@ -3,11 +3,18 @@
3
  # ─────────────────────────────────────────────
4
 
5
  # ── Vision Encoder ──────────────────────────
 
 
 
 
 
 
6
  image_encoder:
7
- name: "microsoft/BioViL-T" # BioViL-T from hi-ml-multimodal
 
8
  frozen: true # freeze encoder during training
9
- img_size: 448 # input image resolution
10
- output_dim: 768 # patch feature dimension
11
 
12
  # ── MLP Projection (Alignment Layer) ────────
13
  projection:
 
3
  # ─────────────────────────────────────────────
4
 
5
  # ── Vision Encoder ──────────────────────────
6
+ # `backend` chooses the underlying model. "auto" tries rad_dino → biovilt → vit
7
+ # in priority order and uses the first one that loads.
8
+ # - rad_dino : microsoft/rad-dino, chest-X-ray DINOv2 (HF transformers).
9
+ # Works on Python 3.12, recommended for CXR.
10
+ # - biovilt : Microsoft BioViL-T (needs hi-ml-multimodal, Python <3.11).
11
+ # - vit : timm ViT-B/16 ImageNet — generic fallback if the backends above fail to load.
12
  image_encoder:
13
+ name: "microsoft/rad-dino" # informational; backend below drives loading
14
+ backend: "auto" # "auto" | "rad_dino" | "biovilt" | "vit"
15
  frozen: true # freeze encoder during training
16
+ img_size: 448 # input image resolution (RAD-DINO native is 518)
17
+ output_dim: 768 # patch feature dimension (768 for all backends)
18
 
19
  # ── MLP Projection (Alignment Layer) ────────
20
  projection:
data/eda_full.ipynb ADDED
@@ -0,0 +1,874 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# EDA — MIMIC-CXR Full Dataset\n",
8
+ "\n",
9
+ "**Datasets used:**\n",
10
+ "- `MIMIC-CXR-JPG` (v2.1.0) — ảnh JPG + CSV metadata\n",
11
+ "- `MIMIC-CXR` (v2.1.0) — report `.txt` (Findings / Impression)\n",
12
+ "- `MIMIC-Ext-MIMIC-CXR-VQA` (v1.0.0) — câu hỏi/đáp VQA\n",
13
+ "\n",
14
+ "**Scope:** toàn bộ dataset (tất cả subset p10–p19).\n",
15
+ "\n",
16
+ "> ℹ️ **Không cần tải ảnh JPG** để chạy notebook này — toàn bộ EDA dựa trên CSV, .txt reports và .json VQA."
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "markdown",
21
+ "metadata": {},
22
+ "source": [
23
+ "## 0. Cấu hình đường dẫn"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": null,
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": "from pathlib import Path\n\nDATA_DIR = Path(r\"D:\\USTH\\KLTN\\cxr-vlm-data\")\nCXR_ROOT = DATA_DIR / \"mimic-cxr-reports\" # files/p10…p19/pXXXXXX/sYYYYYY.txt — toàn bộ dataset\n\nSPLIT_CSV = DATA_DIR / \"mimic-cxr-2.0.0-split.csv\"\nMETA_CSV = DATA_DIR / \"mimic-cxr-2.0.0-metadata.csv\"\nCHEXPERT_CSV = DATA_DIR / \"mimic-cxr-2.0.0-chexpert.csv\"\n\n_VQA_DIR = (DATA_DIR\n / \"mimic-ext-mimic-cxr-vqa-a-complex-diverse-and-large-scale-visual-question-answering-dataset-for-chest-x-ray-images-1.0.0\"\n / \"MIMIC-Ext-MIMIC-CXR-VQA\"\n / \"dataset\")\nVQA_TRAIN = _VQA_DIR / \"train.json\"\nVQA_VALID = _VQA_DIR / \"valid.json\"\nVQA_TEST = _VQA_DIR / \"test.json\"\n\n# None = parse hết toàn bộ (~227k studies, mất 10-20 phút)\n# Số nguyên = sample ngẫu nhiên để chạy nhanh\nREPORT_SAMPLE_SIZE = 10000\n\n# Kiểm tra nhanh\nfor name, p in [(\"SPLIT_CSV\", SPLIT_CSV),\n (\"META_CSV\", META_CSV),\n (\"CHEXPERT_CSV\", CHEXPERT_CSV),\n (\"CXR_ROOT\", CXR_ROOT),\n (\"VQA_TRAIN\", VQA_TRAIN)]:\n status = \"✓\" if p.exists() else \"✗ NOT FOUND\"\n print(f\" {status} {name}: {p}\")\n\nprint(\"\\nPaths configured.\")"
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": null,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "import pandas as pd\n",
40
+ "import numpy as np\n",
41
+ "import json\n",
42
+ "import re\n",
43
+ "import matplotlib.pyplot as plt\n",
44
+ "import matplotlib.ticker as mticker\n",
45
+ "import seaborn as sns\n",
46
+ "from collections import Counter\n",
47
+ "\n",
48
+ "sns.set_theme(style=\"whitegrid\", palette=\"muted\")\n",
49
+ "plt.rcParams[\"figure.dpi\"] = 120\n",
50
+ "plt.rcParams[\"figure.figsize\"] = (11, 4)\n",
51
+ "\n",
52
+ "CHEXPERT_LABELS = [\n",
53
+ " \"Atelectasis\", \"Cardiomegaly\", \"Consolidation\", \"Edema\",\n",
54
+ " \"Enlarged Cardiomediastinum\", \"Fracture\", \"Lung Lesion\",\n",
55
+ " \"Lung Opacity\", \"No Finding\", \"Pleural Effusion\",\n",
56
+ " \"Pleural Other\", \"Pneumonia\", \"Pneumothorax\", \"Support Devices\"\n",
57
+ "]\n",
58
+ "\n",
59
+ "# Subset folders p10–p19\n",
60
+ "ALL_SUBSETS = [f\"p{i}\" for i in range(10, 20)]\n",
61
+ "\n",
62
+ "print(\"Libraries imported.\")"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "markdown",
67
+ "metadata": {},
68
+ "source": [
69
+ "## 1. Load CSV files"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": null,
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "split_df = pd.read_csv(SPLIT_CSV)\n",
79
+ "meta_df = pd.read_csv(META_CSV)\n",
80
+ "chexpert_df = pd.read_csv(CHEXPERT_CSV)\n",
81
+ "\n",
82
+ "# Tạo cột subset folder (p10, p11, ..., p19)\n",
83
+ "def get_subset(subject_id):\n",
84
+ " return \"p\" + str(subject_id)[:2]\n",
85
+ "\n",
86
+ "for df_ in [split_df, meta_df, chexpert_df]:\n",
87
+ " df_[\"subset\"] = df_[\"subject_id\"].astype(str).str[:2].apply(lambda x: f\"p{x}\")\n",
88
+ "\n",
89
+ "print(f\"split.csv — total images : {len(split_df):,}\")\n",
90
+ "print(f\"metadata — total images : {len(meta_df):,}\")\n",
91
+ "print(f\"chexpert — total studies : {len(chexpert_df):,}\")\n",
92
+ "print(f\"\\nSubsets found in split.csv:\")\n",
93
+ "print(split_df[\"subset\"].value_counts().sort_index().to_string())"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": null,
99
+ "metadata": {},
100
+ "outputs": [],
101
+ "source": [
102
+ "# Merge split + metadata\n",
103
+ "df = split_df.merge(\n",
104
+ " meta_df[[\"dicom_id\", \"ViewPosition\", \"Rows\", \"Columns\"]],\n",
105
+ " on=\"dicom_id\", how=\"left\"\n",
106
+ ")\n",
107
+ "# Giữ lại cột subset từ split_df\n",
108
+ "if \"subset_y\" in df.columns:\n",
109
+ " df = df.drop(columns=[\"subset_y\"]).rename(columns={\"subset_x\": \"subset\"})\n",
110
+ "\n",
111
+ "print(f\"Merged shape: {df.shape}\")\n",
112
+ "df.head(3)"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "markdown",
117
+ "metadata": {},
118
+ "source": [
119
+ "## 2. Tổng quan: số ảnh & report theo split + subset folder"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": null,
125
+ "metadata": {},
126
+ "outputs": [],
127
+ "source": [
128
+ "# Tổng theo split\n",
129
+ "img_per_split = df[\"split\"].value_counts().reindex([\"train\",\"validate\",\"test\"])\n",
130
+ "study_per_split = (\n",
131
+ " df.drop_duplicates(\"study_id\")[\"split\"]\n",
132
+ " .value_counts().reindex([\"train\",\"validate\",\"test\"])\n",
133
+ ")\n",
134
+ "\n",
135
+ "summary_total = pd.DataFrame({\n",
136
+ " \"Images\": img_per_split,\n",
137
+ " \"Studies/Reports\": study_per_split\n",
138
+ "})\n",
139
+ "summary_total.loc[\"TOTAL\"] = summary_total.sum()\n",
140
+ "print(\"=== Overall split summary ===\")\n",
141
+ "print(summary_total.to_string())"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": null,
147
+ "metadata": {},
148
+ "outputs": [],
149
+ "source": [
150
+ "# ── Breakdown theo từng subset folder ────────────────────────────────────────\n",
151
+ "img_subset_split = (\n",
152
+ " df.groupby([\"subset\", \"split\"])[\"dicom_id\"]\n",
153
+ " .count()\n",
154
+ " .unstack(fill_value=0)\n",
155
+ " .reindex(columns=[\"train\",\"validate\",\"test\"], fill_value=0)\n",
156
+ " .reindex(ALL_SUBSETS, fill_value=0)\n",
157
+ ")\n",
158
+ "img_subset_split[\"TOTAL\"] = img_subset_split.sum(axis=1)\n",
159
+ "\n",
160
+ "study_subset_split = (\n",
161
+ " df.drop_duplicates(\"study_id\")\n",
162
+ " .groupby([\"subset\", \"split\"])[\"study_id\"]\n",
163
+ " .count()\n",
164
+ " .unstack(fill_value=0)\n",
165
+ " .reindex(columns=[\"train\",\"validate\",\"test\"], fill_value=0)\n",
166
+ " .reindex(ALL_SUBSETS, fill_value=0)\n",
167
+ ")\n",
168
+ "study_subset_split[\"TOTAL\"] = study_subset_split.sum(axis=1)\n",
169
+ "\n",
170
+ "print(\"=== Images per subset × split ===\")\n",
171
+ "print(img_subset_split.to_string())\n",
172
+ "print(\"\\n=== Studies/Reports per subset × split ===\")\n",
173
+ "print(study_subset_split.to_string())"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": null,
179
+ "metadata": {},
180
+ "outputs": [],
181
+ "source": [
182
+ "fig, axes = plt.subplots(1, 2, figsize=(15, 5))\n",
183
+ "palette = {\"train\": \"#4C72B0\", \"validate\": \"#DD8452\", \"test\": \"#55A868\"}\n",
184
+ "\n",
185
+ "for ax, data, title in zip(\n",
186
+ " axes,\n",
187
+ " [img_subset_split[[\"train\",\"validate\",\"test\"]], study_subset_split[[\"train\",\"validate\",\"test\"]]],\n",
188
+ " [\"Số ảnh theo subset × split\", \"Số study/report theo subset × split\"]\n",
189
+ "):\n",
190
+ " data.plot(kind=\"bar\", ax=ax, color=[palette[c] for c in data.columns], width=0.75)\n",
191
+ " ax.set_title(title, fontsize=12)\n",
192
+ " ax.set_xlabel(\"Subset folder\")\n",
193
+ " ax.set_ylabel(\"Count\")\n",
194
+ " ax.tick_params(axis=\"x\", rotation=0)\n",
195
+ " ax.legend(title=\"Split\")\n",
196
+ "\n",
197
+ "plt.suptitle(\"MIMIC-CXR Full Dataset — Split × Subset\", fontsize=14, y=1.02)\n",
198
+ "plt.tight_layout()\n",
199
+ "plt.show()"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "code",
204
+ "execution_count": null,
205
+ "metadata": {},
206
+ "outputs": [],
207
+ "source": [
208
+ "# Heatmap: tỉ lệ % train/val/test trong mỗi subset\n",
209
+ "img_pct = img_subset_split[[\"train\",\"validate\",\"test\"]].div(\n",
210
+ " img_subset_split[\"TOTAL\"], axis=0\n",
211
+ ") * 100\n",
212
+ "\n",
213
+ "fig, ax = plt.subplots(figsize=(8, 5))\n",
214
+ "sns.heatmap(\n",
215
+ " img_pct.round(1), annot=True, fmt=\".1f\", cmap=\"YlGnBu\",\n",
216
+ " linewidths=0.5, ax=ax, cbar_kws={\"label\": \"%\"}\n",
217
+ ")\n",
218
+ "ax.set_title(\"Tỉ lệ (%) train/val/test trong mỗi subset folder\")\n",
219
+ "ax.set_xlabel(\"Split\")\n",
220
+ "ax.set_ylabel(\"Subset\")\n",
221
+ "plt.tight_layout()\n",
222
+ "plt.show()"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type": "markdown",
227
+ "metadata": {},
228
+ "source": [
229
+ "## 3. Số ảnh mỗi study"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "code",
234
+ "execution_count": null,
235
+ "metadata": {},
236
+ "outputs": [],
237
+ "source": [
238
+ "imgs_per_study = df.groupby(\"study_id\")[\"dicom_id\"].count()\n",
239
+ "count_dist = imgs_per_study.value_counts().sort_index()\n",
240
+ "\n",
241
+ "print(\"Images per study distribution:\")\n",
242
+ "print(count_dist.to_string())\n",
243
+ "print(f\"\\nMax : {imgs_per_study.max()}\")\n",
244
+ "print(f\"Mean: {imgs_per_study.mean():.2f}\")"
245
+ ]
246
+ },
247
+ {
248
+ "cell_type": "code",
249
+ "execution_count": null,
250
+ "metadata": {},
251
+ "outputs": [],
252
+ "source": [
253
+ "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
254
+ "\n",
255
+ "# Full distribution\n",
256
+ "axes[0].bar(count_dist.index.astype(str), count_dist.values,\n",
257
+ " color=sns.color_palette(\"Blues_d\", len(count_dist)))\n",
258
+ "axes[0].set_title(\"Số ảnh mỗi study (toàn bộ)\")\n",
259
+ "axes[0].set_xlabel(\"Số ảnh trong study\")\n",
260
+ "axes[0].set_ylabel(\"Số study\")\n",
261
+ "for x, v in zip(count_dist.index, count_dist.values):\n",
262
+ " axes[0].text(str(x), v * 1.01, f\"{v:,}\", ha=\"center\", va=\"bottom\", fontsize=8)\n",
263
+ "\n",
264
+ "# Per-subset: mean images per study\n",
265
+ "mean_imgs = df.groupby(\"subset\").apply(\n",
266
+ " lambda g: g.groupby(\"study_id\")[\"dicom_id\"].count().mean()\n",
267
+ ").reindex(ALL_SUBSETS)\n",
268
+ "axes[1].bar(mean_imgs.index, mean_imgs.values, color=\"steelblue\")\n",
269
+ "axes[1].set_title(\"Trung bình số ảnh/study theo subset\")\n",
270
+ "axes[1].set_xlabel(\"Subset\")\n",
271
+ "axes[1].set_ylabel(\"Mean images/study\")\n",
272
+ "axes[1].set_ylim(0, mean_imgs.max() * 1.2)\n",
273
+ "for x, v in zip(mean_imgs.index, mean_imgs.values):\n",
274
+ " axes[1].text(x, v * 1.01, f\"{v:.2f}\", ha=\"center\", va=\"bottom\", fontsize=9)\n",
275
+ "\n",
276
+ "plt.suptitle(\"Images per Study Distribution\", fontsize=13)\n",
277
+ "plt.tight_layout()\n",
278
+ "plt.show()"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "markdown",
283
+ "metadata": {},
284
+ "source": [
285
+ "## 4. View Position"
286
+ ]
287
+ },
288
+ {
289
+ "cell_type": "code",
290
+ "execution_count": null,
291
+ "metadata": {},
292
+ "outputs": [],
293
+ "source": [
294
+ "view_counts = df[\"ViewPosition\"].fillna(\"Unknown\").value_counts()\n",
295
+ "print(\"View position counts (total):\")\n",
296
+ "print(view_counts.to_string())"
297
+ ]
298
+ },
299
+ {
300
+ "cell_type": "code",
301
+ "execution_count": null,
302
+ "metadata": {},
303
+ "outputs": [],
304
+ "source": [
305
+ "fig, axes = plt.subplots(1, 2, figsize=(13, 4))\n",
306
+ "\n",
307
+ "bars = axes[0].bar(view_counts.index, view_counts.values,\n",
308
+ " color=sns.color_palette(\"Set2\", len(view_counts)))\n",
309
+ "axes[0].bar_label(bars, fmt=\"%d\")\n",
310
+ "axes[0].set_title(\"Số ảnh theo View Position\")\n",
311
+ "axes[0].set_ylabel(\"Count\")\n",
312
+ "\n",
313
+ "axes[1].pie(view_counts.values, labels=view_counts.index, autopct=\"%1.1f%%\",\n",
314
+ " colors=sns.color_palette(\"Set2\", len(view_counts)))\n",
315
+ "axes[1].set_title(\"Tỉ lệ View Position\")\n",
316
+ "\n",
317
+ "plt.suptitle(\"View Position Distribution — Full Dataset\", fontsize=13)\n",
318
+ "plt.tight_layout()\n",
319
+ "plt.show()"
320
+ ]
321
+ },
322
+ {
323
+ "cell_type": "code",
324
+ "execution_count": null,
325
+ "metadata": {},
326
+ "outputs": [],
327
+ "source": [
328
+ "# View per subset\n",
329
+ "view_subset = (\n",
330
+ " df.fillna({\"ViewPosition\": \"Unknown\"})\n",
331
+ " .groupby([\"subset\", \"ViewPosition\"])[\"dicom_id\"]\n",
332
+ " .count()\n",
333
+ " .unstack(fill_value=0)\n",
334
+ " .reindex(ALL_SUBSETS, fill_value=0)\n",
335
+ ")\n",
336
+ "\n",
337
+ "view_subset.plot(kind=\"bar\", figsize=(14, 4),\n",
338
+ " color=sns.color_palette(\"Set2\", view_subset.shape[1]),\n",
339
+ " width=0.8)\n",
340
+ "plt.title(\"View Position theo subset folder\")\n",
341
+ "plt.xlabel(\"Subset\")\n",
342
+ "plt.ylabel(\"Count\")\n",
343
+ "plt.xticks(rotation=0)\n",
344
+ "plt.legend(title=\"ViewPosition\", bbox_to_anchor=(1.01, 1), loc=\"upper left\")\n",
345
+ "plt.tight_layout()\n",
346
+ "plt.show()"
347
+ ]
348
+ },
349
+ {
350
+ "cell_type": "code",
351
+ "execution_count": null,
352
+ "metadata": {},
353
+ "outputs": [],
354
+ "source": [
355
+ "# View split breakdown\n",
356
+ "view_split = df.groupby([\"split\", \"ViewPosition\"]).size().unstack(fill_value=0)\n",
357
+ "view_split = view_split.reindex([\"train\",\"validate\",\"test\"])\n",
358
+ "view_split.plot(kind=\"bar\", figsize=(10, 4),\n",
359
+ " color=sns.color_palette(\"Set2\", view_split.shape[1]))\n",
360
+ "plt.title(\"View Position theo split\")\n",
361
+ "plt.xlabel(\"Split\")\n",
362
+ "plt.xticks(rotation=0)\n",
363
+ "plt.legend(title=\"ViewPosition\")\n",
364
+ "plt.tight_layout()\n",
365
+ "plt.show()"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "markdown",
370
+ "id": "8845a29f",
371
+ "source": "## 4b. Frontal-Only Sampling Strategy (AP > PA)\n\nChiến lược train: **1 report + 1 ảnh frontal** mỗi study.\n- Chỉ giữ AP hoặc PA; nếu study có cả hai thì **ưu tiên AP**.\n- Study không có ảnh frontal nào → loại khỏi tập train.",
372
+ "metadata": {}
373
+ },
374
+ {
375
+ "cell_type": "code",
376
+ "id": "22a327eb",
377
+ "source": "frontal = df[df[\"ViewPosition\"].isin([\"AP\", \"PA\"])].copy()\n\ndef pick_frontal_view(group):\n ap = group[group[\"ViewPosition\"] == \"AP\"]\n if len(ap) > 0:\n return ap.iloc[[0]]\n return group[group[\"ViewPosition\"] == \"PA\"].iloc[[0]]\n\nfrontal_1img = (\n frontal.groupby(\"study_id\", group_keys=False)\n .apply(pick_frontal_view)\n .reset_index(drop=True)\n)\n\nn_study_total = df[\"study_id\"].nunique()\nn_study_frontal = frontal_1img[\"study_id\"].nunique()\nn_study_no_front = n_study_total - n_study_frontal\n\nprint(\"=== Frontal-Only Sampling (Full Dataset) ===\")\nprint(f\"Tổng số study : {n_study_total:,}\")\nprint(f\"Study có ảnh frontal (AP/PA) : {n_study_frontal:,} ({n_study_frontal/n_study_total*100:.1f}%)\")\nprint(f\"Study bị loại (không có frontal): {n_study_no_front:,} ({n_study_no_front/n_study_total*100:.1f}%)\")\nprint()\nprint(\"Ảnh được chọn theo view:\")\nprint(frontal_1img[\"ViewPosition\"].value_counts().to_string())\nprint()\n\nsplit_frontal = frontal_1img[\"split\"].value_counts().reindex([\"train\", \"validate\", \"test\"])\nsplit_all = df.drop_duplicates(\"study_id\")[\"split\"].value_counts().reindex([\"train\", \"validate\", \"test\"])\ncompare = pd.DataFrame({\n \"All studies\": split_all,\n \"Frontal-only\": split_frontal,\n \"Giảm (%)\": ((split_all - split_frontal) / split_all * 100).round(1)\n})\nprint(\"=== Mẫu train sau khi filter (split) ===\")\nprint(compare.to_string())",
378
+ "metadata": {},
379
+ "execution_count": null,
380
+ "outputs": []
381
+ },
382
+ {
383
+ "cell_type": "code",
384
+ "id": "712ff838",
385
+ "source": "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n\n# 1. All vs Frontal-only\nbars = axes[0].bar([\"All studies\", \"Frontal-only\"],\n [n_study_total, n_study_frontal],\n color=[\"#4C72B0\", \"#55A868\"], width=0.5)\naxes[0].bar_label(bars, fmt=\"%d\")\naxes[0].set_title(\"Study count: All vs Frontal-only\")\naxes[0].set_ylabel(\"Số study\")\n\n# 2. Pie: view được chọn\nvc = frontal_1img[\"ViewPosition\"].value_counts()\naxes[1].pie(vc.values, labels=vc.index, autopct=\"%1.1f%%\",\n colors=[\"#4C72B0\", \"#DD8452\"])\naxes[1].set_title(\"View được chọn (AP ưu tiên)\")\n\n# 3. Per-split comparison\nx = np.arange(3)\nw = 0.35\naxes[2].bar(x - w/2, split_all.values, w, label=\"All\", color=\"#4C72B0\", alpha=0.85)\naxes[2].bar(x + w/2, split_frontal.values, w, label=\"Frontal-only\", color=\"#55A868\", alpha=0.85)\naxes[2].set_xticks(x)\naxes[2].set_xticklabels([\"train\", \"validate\", \"test\"])\naxes[2].set_title(\"Frontal-only vs All (per split)\")\naxes[2].set_ylabel(\"Số study\")\naxes[2].legend()\n\nplt.suptitle(\"Frontal-Only Sampling Strategy — Full Dataset\", fontsize=13)\nplt.tight_layout()\nplt.show()\n\n# 4. Frontal-only per subset\nfrontal_subset = (\n frontal_1img.groupby([\"subset\", \"split\"]).size()\n .unstack(fill_value=0)\n .reindex(columns=[\"train\", \"validate\", \"test\"], fill_value=0)\n .reindex(ALL_SUBSETS, fill_value=0)\n)\nfrontal_subset[\"TOTAL\"] = frontal_subset.sum(axis=1)\nprint(\"Frontal-only samples per subset:\")\nprint(frontal_subset.to_string())\n\nfrontal_subset[[\"train\",\"validate\",\"test\"]].plot(\n kind=\"bar\", figsize=(13, 4),\n color=[\"#4C72B0\", \"#DD8452\", \"#55A868\"], width=0.75\n)\nplt.title(\"Frontal-Only samples theo subset × split\")\nplt.xlabel(\"Subset\")\nplt.ylabel(\"Số study\")\nplt.xticks(rotation=0)\nplt.legend(title=\"Split\")\nplt.tight_layout()\nplt.show()",
386
+ "metadata": {},
387
+ "execution_count": null,
388
+ "outputs": []
389
+ },
390
+ {
391
+ "cell_type": "markdown",
392
+ "metadata": {},
393
+ "source": [
394
+ "## 5. CheXpert Labels — 14 nhãn bệnh lý"
395
+ ]
396
+ },
397
+ {
398
+ "cell_type": "code",
399
+ "execution_count": null,
400
+ "metadata": {},
401
+ "outputs": [],
402
+ "source": [
403
+ "label_cols = [c for c in chexpert_df.columns if c in CHEXPERT_LABELS]\n",
404
+ "\n",
405
+ "positive_counts = (chexpert_df[label_cols] == 1).sum().sort_values(ascending=False)\n",
406
+ "uncertain_counts = (chexpert_df[label_cols] == -1).sum()\n",
407
+ "negative_counts = (chexpert_df[label_cols] == 0).sum()\n",
408
+ "\n",
409
+ "label_summary = pd.DataFrame({\n",
410
+ " \"Positive\": positive_counts,\n",
411
+ " \"Uncertain\": uncertain_counts,\n",
412
+ " \"Negative\": negative_counts,\n",
413
+ " \"Not Mentioned\": chexpert_df[label_cols].isna().sum()\n",
414
+ "})\n",
415
+ "label_summary[\"Positive %\"] = (label_summary[\"Positive\"] / len(chexpert_df) * 100).round(1)\n",
416
+ "print(label_summary.sort_values(\"Positive\", ascending=False).to_string())"
417
+ ]
418
+ },
419
+ {
420
+ "cell_type": "code",
421
+ "execution_count": null,
422
+ "metadata": {},
423
+ "outputs": [],
424
+ "source": [
425
+ "ordered_labels = label_summary.sort_values(\"Positive\", ascending=False).index.tolist()\n",
426
+ "x = np.arange(len(ordered_labels))\n",
427
+ "w = 0.25\n",
428
+ "\n",
429
+ "fig, ax = plt.subplots(figsize=(14, 5))\n",
430
+ "ax.bar(x - w, label_summary.loc[ordered_labels, \"Positive\"], w, label=\"Positive\", color=\"#e74c3c\")\n",
431
+ "ax.bar(x, label_summary.loc[ordered_labels, \"Uncertain\"], w, label=\"Uncertain\", color=\"#f39c12\")\n",
432
+ "ax.bar(x + w, label_summary.loc[ordered_labels, \"Negative\"], w, label=\"Negative\", color=\"#2ecc71\")\n",
433
+ "ax.set_xticks(x)\n",
434
+ "ax.set_xticklabels(ordered_labels, rotation=40, ha=\"right\", fontsize=9)\n",
435
+ "ax.set_ylabel(\"Số study\")\n",
436
+ "ax.set_title(\"CheXpert Labels — Positive / Uncertain / Negative (Full Dataset)\")\n",
437
+ "ax.legend()\n",
438
+ "plt.tight_layout()\n",
439
+ "plt.show()"
440
+ ]
441
+ },
442
+ {
443
+ "cell_type": "code",
444
+ "execution_count": null,
445
+ "metadata": {},
446
+ "outputs": [],
447
+ "source": "ADMIN_HEADERS = {\n 'EXAMINATION', 'INDICATION', 'CLINICAL INDICATION', 'TECHNIQUE',\n 'COMPARISON', 'HISTORY', 'REASON', 'REASON FOR EXAM',\n 'REASON FOR EXAMINATION', 'PROCEDURE', 'FINAL REPORT',\n 'NOTIFICATION', 'RECOMMENDATION', 'ADDENDUM'\n}\n\nSECTION_RE = re.compile(r'^[ \\t]*([A-Z][A-Z ,/()\\-]{1,70}?):\\s*', re.MULTILINE)\n\ndef parse_report(txt_path: Path) -> dict:\n \"\"\"\n Quy luật detect section: mọi header đều VIẾT HOA TOÀN BỘ và kết thúc bằng ':'.\n Fallback: nếu không có FINDINGS tường minh, lấy section descriptive đầu tiên.\n \"\"\"\n try:\n text = txt_path.read_text(encoding=\"utf-8\", errors=\"ignore\")\n except FileNotFoundError:\n return {\"findings\": None, \"impression\": None}\n\n matches = list(SECTION_RE.finditer(text))\n if not matches:\n return {\"findings\": None, \"impression\": None}\n\n sections = []\n for i, m in enumerate(matches):\n header = m.group(1).strip()\n start = m.end()\n end = matches[i + 1].start() if i + 1 < len(matches) else len(text)\n content = text[start:end].strip()\n sections.append((header, content))\n\n findings = impression = None\n for header, content in sections:\n h = header.upper()\n if \"FINDING\" in h and findings is None:\n findings = content or None\n elif \"IMPRESSION\" in h and impression is None:\n impression = content or None\n\n if findings is None:\n for header, content in sections:\n h = header.upper()\n if h not in ADMIN_HEADERS and \"IMPRESSION\" not in h and content:\n findings = content\n break\n\n return {\"findings\": findings, \"impression\": impression}\n\n\nall_studies = (\n df[[\"subject_id\", \"study_id\", \"subset\"]]\n .drop_duplicates(\"study_id\")\n .reset_index(drop=True)\n)\n\nif REPORT_SAMPLE_SIZE is not None:\n parse_studies = all_studies.sample(\n n=min(REPORT_SAMPLE_SIZE, len(all_studies)), random_state=42\n ).reset_index(drop=True)\n print(f\"Sample {len(parse_studies):,} / {len(all_studies):,} studies\")\nelse:\n parse_studies = all_studies\n print(f\"Parsing ALL {len(parse_studies):,} studies...\")\n\nrecords = []\nfor _, row in parse_studies.iterrows():\n sid = str(row[\"subject_id\"])\n stid = str(row[\"study_id\"])\n sub = row[\"subset\"]\n txt_path = CXR_ROOT / \"files\" / sub / f\"p{sid}\" / f\"s{stid}.txt\"\n records.append({\"study_id\": stid, \"subset\": sub, **parse_report(txt_path)})\n\nreport_df = pd.DataFrame(records)\nreport_df[\"findings_len\"] = report_df[\"findings\"].str.split().str.len()\nreport_df[\"impression_len\"] = report_df[\"impression\"].str.split().str.len()\n\ntotal = len(report_df)\nprint(f\"\\nFindings found : {report_df['findings'].notna().sum():,} / {total:,} ({report_df['findings'].notna().mean()*100:.1f}%)\")\nprint(f\"Impression found : {report_df['impression'].notna().sum():,} / {total:,} ({report_df['impression'].notna().mean()*100:.1f}%)\")\nboth = (report_df['findings'].notna() & report_df['impression'].notna()).sum()\nneither = (report_df['findings'].isna() & report_df['impression'].isna()).sum()\nprint(f\"Cả hai : {both:,} / {total:,} ({both/total*100:.1f}%)\")\nprint(f\"Không có cả hai : {neither:,} / {total:,} ({neither/total*100:.1f}%)\")"
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": null,
452
+ "metadata": {},
453
+ "outputs": [],
454
+ "source": [
455
+ "# Số nhãn positive mỗi study\n",
456
+ "labels_per_study = (chexpert_df[label_cols] == 1).sum(axis=1)\n",
457
+ "lps_counts = labels_per_study.value_counts().sort_index()\n",
458
+ "\n",
459
+ "fig, ax = plt.subplots(figsize=(9, 4))\n",
460
+ "ax.bar(lps_counts.index.astype(str), lps_counts.values,\n",
461
+ " color=sns.color_palette(\"Blues_d\", len(lps_counts)))\n",
462
+ "ax.set_xlabel(\"Số nhãn positive\")\n",
463
+ "ax.set_ylabel(\"Số study\")\n",
464
+ "ax.set_title(\"Phân bố số nhãn positive mỗi study (Full Dataset)\")\n",
465
+ "for x_, v in zip(lps_counts.index, lps_counts.values):\n",
466
+ " ax.text(str(x_), v * 1.01, f\"{v:,}\", ha=\"center\", va=\"bottom\", fontsize=8)\n",
467
+ "plt.tight_layout()\n",
468
+ "plt.show()"
469
+ ]
470
+ },
471
+ {
472
+ "cell_type": "markdown",
473
+ "metadata": {},
474
+ "source": "## 6. Phân tích Report — Findings & Impression\n\n> ℹ️ Report parsing chỉ hoạt động với subset **đã tải về**. Các subset chưa có sẽ tự động bị bỏ qua."
475
+ },
476
+ {
477
+ "cell_type": "code",
478
+ "execution_count": null,
479
+ "metadata": {},
480
+ "outputs": [],
481
+ "source": "def parse_report(txt_path: Path) -> dict:\n try:\n text = txt_path.read_text(encoding=\"utf-8\", errors=\"ignore\")\n except FileNotFoundError:\n return {\"findings\": None, \"impression\": None}\n\n text = re.sub(r\"[\\r\\n]+\", \" \", text)\n\n def extract_section(pattern, text):\n m = re.search(pattern, text, re.IGNORECASE)\n if not m:\n return None\n start = m.end()\n nxt = re.search(\n r\"(IMPRESSION|FINDINGS|CONCLUSION|RECOMMENDATION|NOTIFICATION)\",\n text[start:], re.IGNORECASE\n )\n end = start + nxt.start() if nxt else len(text)\n return text[start:end].strip()\n\n return {\n \"findings\": extract_section(r\"FINDINGS\\s*:\", text),\n \"impression\": extract_section(r\"IMPRESSION\\s*:\", text)\n }\n\n\nall_studies = (\n df[[\"subject_id\", \"study_id\", \"subset\"]]\n .drop_duplicates(\"study_id\")\n .reset_index(drop=True)\n)\n\nif REPORT_SAMPLE_SIZE is not None:\n parse_studies = all_studies.sample(\n n=min(REPORT_SAMPLE_SIZE, len(all_studies)), random_state=42\n ).reset_index(drop=True)\n print(f\"Sample {len(parse_studies):,} / {len(all_studies):,} studies\")\nelse:\n parse_studies = all_studies\n print(f\"Parsing ALL {len(parse_studies):,} studies... (có thể mất 10-20 phút)\")\n\nrecords = []\nfor _, row in parse_studies.iterrows():\n sid = str(row[\"subject_id\"])\n stid = str(row[\"study_id\"])\n sub = row[\"subset\"]\n txt_path = CXR_ROOT / \"files\" / sub / f\"p{sid}\" / f\"s{stid}.txt\"\n records.append({\"study_id\": stid, \"subset\": sub, **parse_report(txt_path)})\n\nreport_df = pd.DataFrame(records)\nreport_df[\"findings_len\"] = report_df[\"findings\"].str.split().str.len()\nreport_df[\"impression_len\"] = report_df[\"impression\"].str.split().str.len()\n\nprint(f\"Findings found : {report_df['findings'].notna().sum():,} / {len(report_df):,}\")\nprint(f\"Impression found : {report_df['impression'].notna().sum():,} / {len(report_df):,}\")"
482
+ },
483
+ {
484
+ "cell_type": "code",
485
+ "execution_count": null,
486
+ "metadata": {},
487
+ "outputs": [],
488
+ "source": [
489
+ "print(\"=== Findings word count ===\")\n",
490
+ "print(report_df[\"findings_len\"].describe().round(1).to_string())\n",
491
+ "print(\"\\n=== Impression word count ===\")\n",
492
+ "print(report_df[\"impression_len\"].describe().round(1).to_string())"
493
+ ]
494
+ },
495
+ {
496
+ "cell_type": "code",
497
+ "execution_count": null,
498
+ "metadata": {},
499
+ "outputs": [],
500
+ "source": [
501
+ "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
502
+ "\n",
503
+ "for ax, col, title, color in zip(\n",
504
+ " axes,\n",
505
+ " [\"findings_len\", \"impression_len\"],\n",
506
+ " [\"Findings — độ dài (số từ)\", \"Impression — độ dài (số từ)\"],\n",
507
+ " [\"steelblue\", \"tomato\"]\n",
508
+ "):\n",
509
+ " data = report_df[col].dropna()\n",
510
+ " p99 = data.quantile(0.99)\n",
511
+ " ax.hist(data[data <= p99], bins=50, color=color, edgecolor=\"white\", alpha=0.85)\n",
512
+ " ax.axvline(data.median(), color=\"black\", ls=\"--\", lw=1.3, label=f\"Median={data.median():.0f}\")\n",
513
+ " ax.axvline(data.mean(), color=\"gray\", ls=\":\", lw=1.3, label=f\"Mean={data.mean():.0f}\")\n",
514
+ " ax.set_title(title)\n",
515
+ " ax.set_xlabel(\"Số từ\")\n",
516
+ " ax.set_ylabel(\"Số report\")\n",
517
+ " ax.legend(fontsize=9)\n",
518
+ " ax.text(0.97, 0.95, f\"n={len(data):,}\\n(≤p99={p99:.0f}w)\",\n",
519
+ " transform=ax.transAxes, ha=\"right\", va=\"top\", fontsize=8, color=\"gray\")\n",
520
+ "\n",
521
+ "plt.suptitle(\"Phân bố độ dài report — Full Dataset\", fontsize=13)\n",
522
+ "plt.tight_layout()\n",
523
+ "plt.show()"
524
+ ]
525
+ },
526
+ {
527
+ "cell_type": "code",
528
+ "execution_count": null,
529
+ "metadata": {},
530
+ "outputs": [],
531
+ "source": [
532
+ "# Box plot Findings vs Impression\n",
533
+ "combined = pd.DataFrame({\n",
534
+ " \"word_count\": pd.concat([report_df[\"findings_len\"], report_df[\"impression_len\"]], ignore_index=True),\n",
535
+ " \"section\": [\"Findings\"] * len(report_df) + [\"Impression\"] * len(report_df)\n",
536
+ "}).dropna()\n",
537
+ "\n",
538
+ "fig, ax = plt.subplots(figsize=(7, 4))\n",
539
+ "sns.boxplot(data=combined, x=\"section\", y=\"word_count\",\n",
540
+ " palette=[\"steelblue\", \"tomato\"], showfliers=False, ax=ax)\n",
541
+ "ax.set_title(\"Findings vs Impression — độ dài (no outliers)\")\n",
542
+ "ax.set_ylabel(\"Số từ\")\n",
543
+ "plt.tight_layout()\n",
544
+ "plt.show()"
545
+ ]
546
+ },
547
+ {
548
+ "cell_type": "code",
549
+ "execution_count": null,
550
+ "metadata": {},
551
+ "outputs": [],
552
+ "source": [
553
+ "# Median report length theo subset\n",
554
+ "rep_by_subset = report_df.groupby(\"subset\")[[\"findings_len\",\"impression_len\"]].median().reindex(ALL_SUBSETS)\n",
555
+ "\n",
556
+ "rep_by_subset.plot(kind=\"bar\", figsize=(12, 4),\n",
557
+ " color=[\"steelblue\", \"tomato\"], width=0.7)\n",
558
+ "plt.title(\"Median độ dài Findings & Impression theo subset\")\n",
559
+ "plt.xlabel(\"Subset\")\n",
560
+ "plt.ylabel(\"Median số từ\")\n",
561
+ "plt.xticks(rotation=0)\n",
562
+ "plt.legend([\"Findings\", \"Impression\"])\n",
563
+ "plt.tight_layout()\n",
564
+ "plt.show()"
565
+ ]
566
+ },
567
+ {
568
+ "cell_type": "markdown",
569
+ "metadata": {},
570
+ "source": [
571
+ "## 7. VQA — phân tích câu hỏi & đáp"
572
+ ]
573
+ },
574
+ {
575
+ "cell_type": "code",
576
+ "execution_count": null,
577
+ "metadata": {},
578
+ "outputs": [],
579
+ "source": [
580
+ "vqa_dfs = []\n",
581
+ "for fpath, sname in [(VQA_TRAIN, \"train\"), (VQA_VALID, \"valid\"), (VQA_TEST, \"test\")]:\n",
582
+ " if fpath.exists():\n",
583
+ " with open(fpath, encoding=\"utf-8\") as f:\n",
584
+ " data = json.load(f)\n",
585
+ " tmp = pd.DataFrame(data)\n",
586
+ " tmp[\"split\"] = sname\n",
587
+ " vqa_dfs.append(tmp)\n",
588
+ " else:\n",
589
+ " print(f\"[WARNING] Not found: {fpath}\")\n",
590
+ "\n",
591
+ "vqa_all = pd.concat(vqa_dfs, ignore_index=True)\n",
592
+ "vqa_all[\"subset\"] = \"p\" + vqa_all[\"subject_id\"].astype(str).str[:2]\n",
593
+ "\n",
594
+ "print(f\"VQA total: {len(vqa_all):,}\")\n",
595
+ "print(f\"\\nPer split:\")\n",
596
+ "print(vqa_all[\"split\"].value_counts().to_string())"
597
+ ]
598
+ },
599
+ {
600
+ "cell_type": "code",
601
+ "execution_count": null,
602
+ "metadata": {},
603
+ "outputs": [],
604
+ "source": [
605
+ "# VQA per subset × split\n",
606
+ "vqa_subset_split = (\n",
607
+ " vqa_all.groupby([\"subset\", \"split\"]).size()\n",
608
+ " .unstack(fill_value=0)\n",
609
+ " .reindex(columns=[\"train\",\"valid\",\"test\"], fill_value=0)\n",
610
+ " .reindex(ALL_SUBSETS, fill_value=0)\n",
611
+ ")\n",
612
+ "vqa_subset_split[\"TOTAL\"] = vqa_subset_split.sum(axis=1)\n",
613
+ "print(\"VQA samples per subset × split:\")\n",
614
+ "print(vqa_subset_split.to_string())"
615
+ ]
616
+ },
617
+ {
618
+ "cell_type": "code",
619
+ "execution_count": null,
620
+ "metadata": {},
621
+ "outputs": [],
622
+ "source": [
623
+ "vqa_subset_split[[\"train\",\"valid\",\"test\"]].plot(\n",
624
+ " kind=\"bar\", figsize=(13, 4),\n",
625
+ " color=[palette[\"train\"], palette[\"validate\"], palette[\"test\"]],\n",
626
+ " width=0.75\n",
627
+ ")\n",
628
+ "plt.title(\"VQA samples theo subset × split\")\n",
629
+ "plt.xlabel(\"Subset\")\n",
630
+ "plt.ylabel(\"Count\")\n",
631
+ "plt.xticks(rotation=0)\n",
632
+ "plt.legend(title=\"Split\")\n",
633
+ "plt.tight_layout()\n",
634
+ "plt.show()"
635
+ ]
636
+ },
637
+ {
638
+ "cell_type": "markdown",
639
+ "id": "63f3247e",
640
+ "source": "### VQA × View Position — mẫu hỏi đáp thuộc ảnh view nào",
641
+ "metadata": {}
642
+ },
643
+ {
644
+ "cell_type": "code",
645
+ "id": "d5e6a532",
646
+ "source": "# image_id trong VQA = dicom_id trong metadata\nvqa_view = vqa_all.merge(\n meta_df[[\"dicom_id\", \"ViewPosition\"]],\n left_on=\"image_id\", right_on=\"dicom_id\",\n how=\"left\"\n)\n\nmissing_view_vqa = vqa_view[\"ViewPosition\"].isna().sum()\nvqa_view[\"ViewPosition\"] = vqa_view[\"ViewPosition\"].fillna(\"Unknown\")\n\nview_vqa_counts = vqa_view[\"ViewPosition\"].value_counts()\nprint(\"=== VQA samples theo View Position (Full Dataset) ===\")\nprint(view_vqa_counts.to_string())\nprint(f\"\\nKhông map được ViewPosition: {missing_view_vqa:,} ({missing_view_vqa/len(vqa_view)*100:.1f}%)\")\n\nfig, axes = plt.subplots(1, 3, figsize=(15, 4))\n\n# 1. Bar\nbars = axes[0].bar(view_vqa_counts.index, view_vqa_counts.values,\n color=sns.color_palette(\"Set2\", len(view_vqa_counts)))\naxes[0].bar_label(bars, fmt=\"%d\")\naxes[0].set_title(\"Số mẫu VQA theo View Position\")\naxes[0].set_ylabel(\"Số mẫu\")\n\n# 2. Pie\naxes[1].pie(view_vqa_counts.values, labels=view_vqa_counts.index,\n autopct=\"%1.1f%%\", colors=sns.color_palette(\"Set2\", len(view_vqa_counts)))\naxes[1].set_title(\"Tỉ lệ VQA theo View Position\")\n\n# 3. 
Semantic type × View (stacked bar)\nsem_view = vqa_view.groupby([\"ViewPosition\", \"semantic_type\"]).size().unstack(fill_value=0)\nsem_view.plot(kind=\"bar\", ax=axes[2], color=sns.color_palette(\"Set1\", sem_view.shape[1]),\n width=0.7, stacked=True)\naxes[2].set_title(\"Semantic Type × View Position\")\naxes[2].set_xlabel(\"View Position\")\naxes[2].set_ylabel(\"Số mẫu\")\naxes[2].tick_params(axis=\"x\", rotation=30)\naxes[2].legend(title=\"Semantic Type\", fontsize=8)\n\nplt.suptitle(\"VQA × View Position — Full Dataset\", fontsize=13)\nplt.tight_layout()\nplt.show()\n\n# Content type × View (heatmap)\ncontent_view = (vqa_view.groupby([\"ViewPosition\", \"content_type\"]).size()\n .unstack(fill_value=0))\nprint(\"\\nContent type theo View Position:\")\nprint(content_view.to_string())\n\nfig, ax = plt.subplots(figsize=(11, 4))\nsns.heatmap(content_view, annot=True, fmt=\"d\", cmap=\"YlGnBu\",\n linewidths=0.4, ax=ax)\nax.set_title(\"VQA — Content Type × View Position\")\nplt.tight_layout()\nplt.show()",
647
+ "metadata": {},
648
+ "execution_count": null,
649
+ "outputs": []
650
+ },
651
+ {
652
+ "cell_type": "code",
653
+ "execution_count": null,
654
+ "metadata": {},
655
+ "outputs": [],
656
+ "source": [
657
+ "# Semantic type & Content type\n",
658
+ "sem_counts = vqa_all[\"semantic_type\"].value_counts()\n",
659
+ "con_counts = vqa_all[\"content_type\"].value_counts()\n",
660
+ "\n",
661
+ "fig, axes = plt.subplots(1, 2, figsize=(13, 4))\n",
662
+ "for ax, counts, title in zip(\n",
663
+ " axes,\n",
664
+ " [sem_counts, con_counts],\n",
665
+ " [\"VQA — Semantic Type\", \"VQA — Content Type\"]\n",
666
+ "):\n",
667
+ " bars = ax.bar(counts.index, counts.values,\n",
668
+ " color=sns.color_palette(\"Set2\", len(counts)))\n",
669
+ " ax.bar_label(bars, fmt=\"%d\")\n",
670
+ " ax.set_title(title)\n",
671
+ " ax.set_ylabel(\"Count\")\n",
672
+ " ax.tick_params(axis=\"x\", rotation=30)\n",
673
+ "\n",
674
+ "plt.suptitle(\"VQA Question Analysis — Full Dataset\", fontsize=13)\n",
675
+ "plt.tight_layout()\n",
676
+ "plt.show()"
677
+ ]
678
+ },
679
+ {
680
+ "cell_type": "code",
681
+ "execution_count": null,
682
+ "metadata": {},
683
+ "outputs": [],
684
+ "source": [
685
+ "# Cross-tab semantic × content\n",
686
+ "cross = pd.crosstab(vqa_all[\"semantic_type\"], vqa_all[\"content_type\"])\n",
687
+ "fig, ax = plt.subplots(figsize=(10, 3))\n",
688
+ "sns.heatmap(cross, annot=True, fmt=\"d\", cmap=\"YlOrRd\", ax=ax)\n",
689
+ "ax.set_title(\"VQA — Semantic Type × Content Type\")\n",
690
+ "plt.tight_layout()\n",
691
+ "plt.show()"
692
+ ]
693
+ },
694
+ {
695
+ "cell_type": "code",
696
+ "execution_count": null,
697
+ "metadata": {},
698
+ "outputs": [],
699
+ "source": [
700
+ "# Answer type\n",
701
+ "def classify_answer(ans_list):\n",
702
+ " if not isinstance(ans_list, list) or len(ans_list) == 0:\n",
703
+ " return \"no answer\"\n",
704
+ " a = ans_list[0].strip().lower()\n",
705
+ " return a if a in [\"yes\", \"no\"] else \"open\"\n",
706
+ "\n",
707
+ "vqa_all[\"ans_type\"] = vqa_all[\"answer\"].apply(classify_answer)\n",
708
+ "ans_counts = vqa_all[\"ans_type\"].value_counts()\n",
709
+ "\n",
710
+ "fig, ax = plt.subplots(figsize=(6, 3))\n",
711
+ "bars = ax.bar(ans_counts.index, ans_counts.values,\n",
712
+ " color=sns.color_palette(\"Pastel1\", len(ans_counts)))\n",
713
+ "ax.bar_label(bars, fmt=\"%d\")\n",
714
+ "ax.set_title(\"VQA — Answer Type (Full Dataset)\")\n",
715
+ "ax.set_ylabel(\"Count\")\n",
716
+ "plt.tight_layout()\n",
717
+ "plt.show()"
718
+ ]
719
+ },
720
+ {
721
+ "cell_type": "markdown",
722
+ "metadata": {},
723
+ "source": [
724
+ "## 8. Data Quality & Missing Data"
725
+ ]
726
+ },
727
+ {
728
+ "cell_type": "code",
729
+ "execution_count": null,
730
+ "metadata": {},
731
+ "outputs": [],
732
+ "source": [
733
+ "# Missing ViewPosition\n",
734
+ "missing_view = df[\"ViewPosition\"].isna().sum()\n",
735
+ "print(f\"Ảnh thiếu ViewPosition: {missing_view:,} / {len(df):,} ({missing_view/len(df)*100:.2f}%)\")\n",
736
+ "\n",
737
+ "# Missing view per subset\n",
738
+ "mv_subset = df[df[\"ViewPosition\"].isna()].groupby(\"subset\").size().reindex(ALL_SUBSETS, fill_value=0)\n",
739
+ "print(\"\\nMissing ViewPosition per subset:\")\n",
740
+ "print(mv_subset.to_string())"
741
+ ]
742
+ },
743
+ {
744
+ "cell_type": "code",
745
+ "execution_count": null,
746
+ "metadata": {},
747
+ "outputs": [],
748
+ "source": [
749
+ "# Missing findings/impression (từ sample)\n",
750
+ "no_findings = report_df[\"findings\"].isna().sum()\n",
751
+ "no_impression = report_df[\"impression\"].isna().sum()\n",
752
+ "n = len(report_df)\n",
753
+ "print(f\"Reports thiếu Findings : {no_findings:,}/{n:,} ({no_findings/n*100:.1f}%)\")\n",
754
+ "print(f\"Reports thiếu Impression : {no_impression:,}/{n:,} ({no_impression/n*100:.1f}%)\")\n",
755
+ "print(f\"Reports thiếu CẢ HAI : {(report_df['findings'].isna() & report_df['impression'].isna()).sum():,}/{n:,}\")"
756
+ ]
757
+ },
758
+ {
759
+ "cell_type": "code",
760
+ "execution_count": null,
761
+ "metadata": {},
762
+ "outputs": [],
763
+ "source": [
764
+ "# Bệnh nhân / study / ảnh tổng quan\n",
765
+ "n_subjects = df[\"subject_id\"].nunique()\n",
766
+ "n_studies = df[\"study_id\"].nunique()\n",
767
+ "n_images = df[\"dicom_id\"].nunique()\n",
768
+ "\n",
769
+ "print(f\"Bệnh nhân : {n_subjects:,}\")\n",
770
+ "print(f\"Studies : {n_studies:,}\")\n",
771
+ "print(f\"Ảnh : {n_images:,}\")\n",
772
+ "print(f\"Trung bình study/patient : {n_studies/n_subjects:.2f}\")\n",
773
+ "print(f\"Trung bình ảnh/patient : {n_images/n_subjects:.2f}\")"
774
+ ]
775
+ },
776
+ {
777
+ "cell_type": "code",
778
+ "execution_count": null,
779
+ "metadata": {},
780
+ "outputs": [],
781
+ "source": [
782
+ "# Study per patient distribution\n",
783
+ "spp = df.groupby(\"subject_id\")[\"study_id\"].nunique()\n",
784
+ "print(\"Studies per patient:\")\n",
785
+ "print(spp.describe().round(1).to_string())\n",
786
+ "\n",
787
+ "fig, ax = plt.subplots(figsize=(10, 4))\n",
788
+ "spp_vc = spp.value_counts().sort_index()\n",
789
+ "# clip tails\n",
790
+ "spp_vc_clip = spp_vc[spp_vc.index <= spp.quantile(0.99)]\n",
791
+ "ax.bar(spp_vc_clip.index.astype(str), spp_vc_clip.values, color=\"mediumpurple\")\n",
792
+ "ax.set_xlabel(\"Số study mỗi bệnh nhân\")\n",
793
+ "ax.set_ylabel(\"Số bệnh nhân\")\n",
794
+ "ax.set_title(\"Phân bố số lần khám mỗi bệnh nhân (≤p99)\")\n",
795
+ "ax.xaxis.set_major_locator(mticker.MaxNLocator(integer=True, nbins=20))\n",
796
+ "plt.tight_layout()\n",
797
+ "plt.show()"
798
+ ]
799
+ },
800
+ {
801
+ "cell_type": "code",
802
+ "execution_count": null,
803
+ "metadata": {},
804
+ "outputs": [],
805
+ "source": [
806
+ "# Số bệnh nhân và study per subset\n",
807
+ "patient_subset = df.groupby(\"subset\")[\"subject_id\"].nunique().reindex(ALL_SUBSETS)\n",
808
+ "study_subset = df.groupby(\"subset\")[\"study_id\"].nunique().reindex(ALL_SUBSETS)\n",
809
+ "image_subset = df.groupby(\"subset\")[\"dicom_id\"].nunique().reindex(ALL_SUBSETS)\n",
810
+ "\n",
811
+ "subset_overview = pd.DataFrame({\n",
812
+ " \"Patients\": patient_subset,\n",
813
+ " \"Studies\": study_subset,\n",
814
+ " \"Images\": image_subset\n",
815
+ "})\n",
816
+ "print(subset_overview.to_string())\n",
817
+ "\n",
818
+ "subset_overview.plot(kind=\"bar\", figsize=(13, 4),\n",
819
+ " color=[\"#5e81ac\", \"#88c0d0\", \"#a3be8c\"], width=0.75)\n",
820
+ "plt.title(\"Patients / Studies / Images theo subset\")\n",
821
+ "plt.xlabel(\"Subset\")\n",
822
+ "plt.ylabel(\"Count\")\n",
823
+ "plt.xticks(rotation=0)\n",
824
+ "plt.legend()\n",
825
+ "plt.tight_layout()\n",
826
+ "plt.show()"
827
+ ]
828
+ },
829
+ {
830
+ "cell_type": "markdown",
831
+ "metadata": {},
832
+ "source": [
833
+ "## 9. Summary"
834
+ ]
835
+ },
836
+ {
837
+ "cell_type": "code",
838
+ "execution_count": null,
839
+ "metadata": {},
840
+ "outputs": [],
841
+ "source": [
842
+ "print(\"=\"*60)\n",
843
+ "print(\" SUMMARY — MIMIC-CXR Full Dataset\")\n",
844
+ "print(\"=\"*60)\n",
845
+ "print(f\" Bệnh nhân : {n_subjects:,}\")\n",
846
+ "print(f\" Studies (reports) : {n_studies:,}\")\n",
847
+ "print(f\" Ảnh (dicom/jpg) : {n_images:,}\")\n",
848
+ "print()\n",
849
+ "for sp in [\"train\", \"validate\", \"test\"]:\n",
850
+ " ni = img_per_split.get(sp, 0)\n",
851
+ " ns = study_per_split.get(sp, 0)\n",
852
+ " print(f\" [{sp:>8}] ảnh={ni:>6,} studies={ns:>6,}\")\n",
853
+ "print()\n",
854
+ "print(f\" Frontal (PA+AP) : {view_counts.get('PA',0)+view_counts.get('AP',0):,} ảnh\")\n",
855
+ "print(f\" Lateral (LL) : {view_counts.get('LL',view_counts.get('LATERAL',0)):,} ảnh\")\n",
856
+ "print(f\" VQA total samples : {len(vqa_all):,}\")\n",
857
+ "print(\"=\"*60)"
858
+ ]
859
+ }
860
+ ],
861
+ "metadata": {
862
+ "kernelspec": {
863
+ "display_name": "Python 3",
864
+ "language": "python",
865
+ "name": "python3"
866
+ },
867
+ "language_info": {
868
+ "name": "python",
869
+ "version": "3.10.0"
870
+ }
871
+ },
872
+ "nbformat": 4,
873
+ "nbformat_minor": 5
874
+ }
data/eda_p18.ipynb ADDED
@@ -0,0 +1,797 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# EDA — MIMIC-CXR Subset p18\n",
8
+ "\n",
9
+ "**Datasets used:**\n",
10
+ "- `MIMIC-CXR-JPG` (v2.1.0) — ảnh JPG + CSV metadata\n",
11
+ "- `MIMIC-CXR` (v2.1.0) — report `.txt` (Findings / Impression)\n",
12
+ "- `MIMIC-Ext-MIMIC-CXR-VQA` (v1.0.0) — câu hỏi/đáp VQA\n",
13
+ "\n",
14
+ "**Scope:** chỉ phân tích bệnh nhân có `subject_id` bắt đầu bằng `18` (folder `p18`).\n",
15
+ "\n",
16
+ "> ℹ️ **Không cần tải ảnh JPG** để chạy notebook này — toàn bộ EDA dựa trên CSV, .txt reports và .json VQA."
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "markdown",
21
+ "metadata": {},
22
+ "source": [
23
+ "## 0. Cấu hình đường dẫn"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": null,
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": "from pathlib import Path\n\nDATA_DIR = Path(r\"D:\\USTH\\KLTN\\cxr-vlm-data\")\nCXR_ROOT = DATA_DIR / \"mimic-cxr-reports\" # files/p10…p19/pXXXXXX/sYYYYYY.txt\n\nSPLIT_CSV = DATA_DIR / \"mimic-cxr-2.0.0-split.csv\"\nMETA_CSV = DATA_DIR / \"mimic-cxr-2.0.0-metadata.csv\"\nCHEXPERT_CSV = DATA_DIR / \"mimic-cxr-2.0.0-chexpert.csv\"\n\n_VQA_DIR = (DATA_DIR\n / \"mimic-ext-mimic-cxr-vqa-a-complex-diverse-and-large-scale-visual-question-answering-dataset-for-chest-x-ray-images-1.0.0\"\n / \"MIMIC-Ext-MIMIC-CXR-VQA\"\n / \"dataset\")\nVQA_TRAIN = _VQA_DIR / \"train.json\"\nVQA_VALID = _VQA_DIR / \"valid.json\"\nVQA_TEST = _VQA_DIR / \"test.json\"\n\n# Kiểm tra nhanh\nfor name, p in [(\"SPLIT_CSV\", SPLIT_CSV),\n (\"META_CSV\", META_CSV),\n (\"CHEXPERT_CSV\", CHEXPERT_CSV),\n (\"CXR_ROOT\", CXR_ROOT),\n (\"VQA_TRAIN\", VQA_TRAIN)]:\n status = \"✓\" if p.exists() else \"✗ NOT FOUND\"\n print(f\" {status} {name}: {p}\")\n\nprint(\"\\nPaths configured.\")"
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": null,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "import pandas as pd\n",
40
+ "import numpy as np\n",
41
+ "import json\n",
42
+ "import re\n",
43
+ "import matplotlib.pyplot as plt\n",
44
+ "import matplotlib.ticker as mticker\n",
45
+ "import seaborn as sns\n",
46
+ "from collections import Counter\n",
47
+ "\n",
48
+ "sns.set_theme(style=\"whitegrid\", palette=\"muted\")\n",
49
+ "plt.rcParams[\"figure.dpi\"] = 120\n",
50
+ "plt.rcParams[\"figure.figsize\"] = (10, 4)\n",
51
+ "\n",
52
+ "CHEXPERT_LABELS = [\n",
53
+ " \"Atelectasis\", \"Cardiomegaly\", \"Consolidation\", \"Edema\",\n",
54
+ " \"Enlarged Cardiomediastinum\", \"Fracture\", \"Lung Lesion\",\n",
55
+ " \"Lung Opacity\", \"No Finding\", \"Pleural Effusion\",\n",
56
+ " \"Pleural Other\", \"Pneumonia\", \"Pneumothorax\", \"Support Devices\"\n",
57
+ "]\n",
58
+ "\n",
59
+ "print(\"Libraries imported.\")"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "markdown",
64
+ "metadata": {},
65
+ "source": [
66
+ "## 1. Load & lọc subset p18"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": null,
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "split_df = pd.read_csv(SPLIT_CSV)\n",
76
+ "meta_df = pd.read_csv(META_CSV)\n",
77
+ "chexpert_df = pd.read_csv(CHEXPERT_CSV)\n",
78
+ "\n",
79
+ "# Lọc p18\n",
80
+ "p18_split = split_df[split_df[\"subject_id\"].astype(str).str.startswith(\"18\")].copy()\n",
81
+ "p18_meta = meta_df[meta_df[\"subject_id\"].astype(str).str.startswith(\"18\")].copy()\n",
82
+ "p18_chex = chexpert_df[chexpert_df[\"subject_id\"].astype(str).str.startswith(\"18\")].copy()\n",
83
+ "\n",
84
+ "print(f\"split.csv — p18 images : {len(p18_split):,}\")\n",
85
+ "print(f\"metadata — p18 images : {len(p18_meta):,}\")\n",
86
+ "print(f\"chexpert — p18 studies : {len(p18_chex):,}\")"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "# Merge split + metadata (by dicom_id)\n",
96
+ "df = p18_split.merge(\n",
97
+ " p18_meta[[\"dicom_id\", \"ViewPosition\", \"Rows\", \"Columns\"]],\n",
98
+ " on=\"dicom_id\", how=\"left\"\n",
99
+ ")\n",
100
+ "\n",
101
+ "print(f\"Merged shape: {df.shape}\")\n",
102
+ "df.head(3)"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "markdown",
107
+ "metadata": {},
108
+ "source": [
109
+ "## 2. Tổng quan số lượng ảnh & report theo split"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": null,
115
+ "metadata": {},
116
+ "outputs": [],
117
+ "source": [
118
+ "# Số ảnh theo split\n",
119
+ "img_per_split = df[\"split\"].value_counts().reindex([\"train\", \"validate\", \"test\"])\n",
120
+ "\n",
121
+ "# Số study (≈ report) theo split (mỗi study_id = 1 report)\n",
122
+ "study_per_split = (\n",
123
+ " df.drop_duplicates(\"study_id\")[\"split\"]\n",
124
+ " .value_counts()\n",
125
+ " .reindex([\"train\", \"validate\", \"test\"])\n",
126
+ ")\n",
127
+ "\n",
128
+ "summary = pd.DataFrame({\n",
129
+ " \"Images (dicom_id)\": img_per_split,\n",
130
+ " \"Studies / Reports\": study_per_split\n",
131
+ "})\n",
132
+ "summary.loc[\"TOTAL\"] = summary.sum()\n",
133
+ "print(summary.to_string())"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": null,
139
+ "metadata": {},
140
+ "outputs": [],
141
+ "source": [
142
+ "fig, axes = plt.subplots(1, 2, figsize=(11, 4))\n",
143
+ "for ax, col, title in zip(axes, summary.columns, [\"Số ảnh theo split\", \"Số study/report theo split\"]):\n",
144
+ " vals = summary.loc[[\"train\",\"validate\",\"test\"], col]\n",
145
+ " bars = ax.bar(vals.index, vals.values, color=sns.color_palette(\"muted\", 3))\n",
146
+ " ax.bar_label(bars, fmt=\"%d\")\n",
147
+ " ax.set_title(title)\n",
148
+ " ax.set_ylabel(\"Count\")\n",
149
+ "plt.suptitle(\"p18 subset — images vs reports per split\", fontsize=13, y=1.02)\n",
150
+ "plt.tight_layout()\n",
151
+ "plt.show()"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "markdown",
156
+ "metadata": {},
157
+ "source": [
158
+ "## 3. Số ảnh mỗi study (1 study → bao nhiêu ảnh?)"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": null,
164
+ "metadata": {},
165
+ "outputs": [],
166
+ "source": [
167
+ "imgs_per_study = df.groupby(\"study_id\")[\"dicom_id\"].count()\n",
168
+ "count_dist = imgs_per_study.value_counts().sort_index()\n",
169
+ "\n",
170
+ "print(\"Images per study distribution:\")\n",
171
+ "print(count_dist.to_string())\n",
172
+ "print(f\"\\nMax images in a single study: {imgs_per_study.max()}\")\n",
173
+ "print(f\"Mean images per study : {imgs_per_study.mean():.2f}\")"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": null,
179
+ "metadata": {},
180
+ "outputs": [],
181
+ "source": [
182
+ "fig, ax = plt.subplots(figsize=(8, 4))\n",
183
+ "ax.bar(count_dist.index.astype(str), count_dist.values, color=sns.color_palette(\"Blues_d\", len(count_dist)))\n",
184
+ "ax.set_xlabel(\"Số ảnh trong study\")\n",
185
+ "ax.set_ylabel(\"Số study\")\n",
186
+ "ax.set_title(\"Distribution: số ảnh mỗi study (p18)\")\n",
187
+ "for i, v in zip(count_dist.index, count_dist.values):\n",
188
+ " ax.text(str(i), v + max(count_dist)*0.01, str(v), ha=\"center\", va=\"bottom\", fontsize=9)\n",
189
+ "plt.tight_layout()\n",
190
+ "plt.show()"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "markdown",
195
+ "metadata": {},
196
+ "source": [
197
+ "## 4. Phân bố View Position (AP, PA, Lateral, ...)"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "code",
202
+ "execution_count": null,
203
+ "metadata": {},
204
+ "outputs": [],
205
+ "source": [
206
+ "view_counts = df[\"ViewPosition\"].fillna(\"Unknown\").value_counts()\n",
207
+ "print(\"View position counts:\")\n",
208
+ "print(view_counts.to_string())\n",
209
+ "print(f\"\\nTotal images: {len(df):,}\")"
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "code",
214
+ "execution_count": null,
215
+ "metadata": {},
216
+ "outputs": [],
217
+ "source": [
218
+ "fig, axes = plt.subplots(1, 2, figsize=(12, 4))\n",
219
+ "\n",
220
+ "# Bar chart\n",
221
+ "bars = axes[0].bar(view_counts.index, view_counts.values,\n",
222
+ " color=sns.color_palette(\"Set2\", len(view_counts)))\n",
223
+ "axes[0].bar_label(bars, fmt=\"%d\")\n",
224
+ "axes[0].set_title(\"Số ảnh theo View Position\")\n",
225
+ "axes[0].set_ylabel(\"Count\")\n",
226
+ "\n",
227
+ "# Pie chart\n",
228
+ "axes[1].pie(view_counts.values, labels=view_counts.index, autopct=\"%1.1f%%\",\n",
229
+ " colors=sns.color_palette(\"Set2\", len(view_counts)))\n",
230
+ "axes[1].set_title(\"Tỉ lệ View Position\")\n",
231
+ "\n",
232
+ "plt.suptitle(\"View Position Distribution — p18\", fontsize=13)\n",
233
+ "plt.tight_layout()\n",
234
+ "plt.show()"
235
+ ]
236
+ },
237
+ {
238
+ "cell_type": "code",
239
+ "execution_count": null,
240
+ "metadata": {},
241
+ "outputs": [],
242
+ "source": [
243
+ "# View distribution theo split\n",
244
+ "view_split = df.groupby([\"split\", \"ViewPosition\"]).size().unstack(fill_value=0)\n",
245
+ "view_split = view_split.reindex([\"train\", \"validate\", \"test\"])\n",
246
+ "view_split.plot(kind=\"bar\", figsize=(10, 4), color=sns.color_palette(\"Set2\", view_split.shape[1]))\n",
247
+ "plt.title(\"View Position theo split — p18\")\n",
248
+ "plt.xlabel(\"Split\")\n",
249
+ "plt.ylabel(\"Count\")\n",
250
+ "plt.xticks(rotation=0)\n",
251
+ "plt.legend(title=\"ViewPosition\")\n",
252
+ "plt.tight_layout()\n",
253
+ "plt.show()"
254
+ ]
255
+ },
256
+ {
257
+ "cell_type": "markdown",
258
+ "id": "ae9f3d3c",
259
+ "source": "## 4b. Frontal-Only Sampling Strategy (AP > PA)\n\nChiến lược train: **1 report + 1 ảnh frontal** mỗi study.\n- Chỉ giữ AP hoặc PA; nếu study có cả hai thì **ưu tiên AP**.\n- Study không có ảnh frontal nào → loại khỏi tập train.",
260
+ "metadata": {}
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "id": "d2ce6beb",
265
+ "source": "frontal = df[df[\"ViewPosition\"].isin([\"AP\", \"PA\"])].copy()\n\n# Với mỗi study: chọn AP trước, nếu không có thì chọn PA (lấy 1 ảnh duy nhất)\ndef pick_frontal_view(group):\n ap = group[group[\"ViewPosition\"] == \"AP\"]\n if len(ap) > 0:\n return ap.iloc[[0]]\n return group[group[\"ViewPosition\"] == \"PA\"].iloc[[0]]\n\nfrontal_1img = (\n frontal.groupby(\"study_id\", group_keys=False)\n .apply(pick_frontal_view)\n .reset_index(drop=True)\n)\n\n# Thống kê tổng quan\nn_study_total = df[\"study_id\"].nunique()\nn_study_frontal = frontal_1img[\"study_id\"].nunique()\nn_study_no_front = n_study_total - n_study_frontal\n\nprint(\"=== Frontal-Only Sampling (p18) ===\")\nprint(f\"Tổng số study : {n_study_total:,}\")\nprint(f\"Study có ảnh frontal (AP/PA) : {n_study_frontal:,} ({n_study_frontal/n_study_total*100:.1f}%)\")\nprint(f\"Study bị loại (không có frontal): {n_study_no_front:,} ({n_study_no_front/n_study_total*100:.1f}%)\")\nprint()\nprint(f\"Ảnh được chọn theo view:\")\nprint(frontal_1img[\"ViewPosition\"].value_counts().to_string())\nprint()\nprint(\"=== Mẫu train sau khi filter (split) ===\")\nsplit_frontal = frontal_1img[\"split\"].value_counts().reindex([\"train\", \"validate\", \"test\"])\nsplit_all = df.drop_duplicates(\"study_id\")[\"split\"].value_counts().reindex([\"train\", \"validate\", \"test\"])\ncompare = pd.DataFrame({\n \"All studies\": split_all,\n \"Frontal-only\": split_frontal,\n \"Giảm (%)\": ((split_all - split_frontal) / split_all * 100).round(1)\n})\nprint(compare.to_string())",
266
+ "metadata": {},
267
+ "execution_count": null,
268
+ "outputs": []
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "id": "9d4aaf5c",
273
+ "source": "fig, axes = plt.subplots(1, 3, figsize=(14, 4))\n\n# 1. All vs Frontal-only (study count)\ncats = [\"All studies\", \"Frontal-only\"]\nvals = [n_study_total, n_study_frontal]\nbars = axes[0].bar(cats, vals, color=[\"#4C72B0\", \"#55A868\"], width=0.5)\naxes[0].bar_label(bars, fmt=\"%d\")\naxes[0].set_title(\"Study count: All vs Frontal-only\")\naxes[0].set_ylabel(\"Số study\")\n\n# 2. View breakdown của ảnh được chọn\nvc = frontal_1img[\"ViewPosition\"].value_counts()\naxes[1].pie(vc.values, labels=vc.index, autopct=\"%1.1f%%\",\n colors=[\"#4C72B0\", \"#DD8452\"])\naxes[1].set_title(\"View được chọn (AP ưu tiên)\")\n\n# 3. So sánh train/val/test\nx = np.arange(3)\nw = 0.35\nsplits = [\"train\", \"validate\", \"test\"]\naxes[2].bar(x - w/2, split_all.values, w, label=\"All\", color=\"#4C72B0\", alpha=0.85)\naxes[2].bar(x + w/2, split_frontal.values, w, label=\"Frontal-only\", color=\"#55A868\", alpha=0.85)\naxes[2].set_xticks(x)\naxes[2].set_xticklabels(splits)\naxes[2].set_title(\"Frontal-only vs All (per split)\")\naxes[2].set_ylabel(\"Số study\")\naxes[2].legend()\n\nplt.suptitle(\"Frontal-Only Sampling Strategy — p18\", fontsize=13)\nplt.tight_layout()\nplt.show()",
274
+ "metadata": {},
275
+ "execution_count": null,
276
+ "outputs": []
277
+ },
278
+ {
279
+ "cell_type": "markdown",
280
+ "metadata": {},
281
+ "source": [
282
+ "## 5. CheXpert Labels — 14 nhãn bệnh lý"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": null,
288
+ "metadata": {},
289
+ "outputs": [],
290
+ "source": [
291
+ "# Chỉ lấy cột labels (1 = positive, 0 = negative, -1 = uncertain, NaN = not mentioned)\n",
292
+ "label_cols = [c for c in p18_chex.columns if c in CHEXPERT_LABELS]\n",
293
+ "\n",
294
+ "# Số study có nhãn Positive (=1) mỗi bệnh\n",
295
+ "positive_counts = (p18_chex[label_cols] == 1).sum().sort_values(ascending=False)\n",
296
+ "uncertain_counts = (p18_chex[label_cols] == -1).sum().sort_values(ascending=False)\n",
297
+ "negative_counts = (p18_chex[label_cols] == 0).sum().sort_values(ascending=False)\n",
298
+ "\n",
299
+ "label_summary = pd.DataFrame({\n",
300
+ " \"Positive\": positive_counts,\n",
301
+ " \"Uncertain\": uncertain_counts,\n",
302
+ " \"Negative\": negative_counts,\n",
303
+ " \"Not Mentioned\": p18_chex[label_cols].isna().sum()\n",
304
+ "})\n",
305
+ "label_summary[\"Total Studies\"] = len(p18_chex)\n",
306
+ "label_summary[\"Positive %\"] = (label_summary[\"Positive\"] / len(p18_chex) * 100).round(1)\n",
307
+ "print(label_summary[[\"Positive\",\"Uncertain\",\"Negative\",\"Not Mentioned\",\"Positive %\"]]\n",
308
+ " .sort_values(\"Positive\", ascending=False).to_string())"
309
+ ]
310
+ },
311
+ {
312
+ "cell_type": "code",
313
+ "execution_count": null,
314
+ "metadata": {},
315
+ "outputs": [],
316
+ "source": "# Headers hành chính — không phải findings\nADMIN_HEADERS = {\n 'EXAMINATION', 'INDICATION', 'CLINICAL INDICATION', 'TECHNIQUE',\n 'COMPARISON', 'HISTORY', 'REASON', 'REASON FOR EXAM',\n 'REASON FOR EXAMINATION', 'PROCEDURE', 'FINAL REPORT',\n 'NOTIFICATION', 'RECOMMENDATION', 'ADDENDUM'\n}\n\n# Detect section header: dòng bắt đầu bằng ALL-CAPS (có thể có space/dấu câu) rồi đến \":\"\nSECTION_RE = re.compile(r'^[ \\t]*([A-Z][A-Z ,/()\\-]{1,70}?):\\s*', re.MULTILINE)\n\ndef parse_report(txt_path: Path) -> dict:\n \"\"\"\n Parse report .txt thành dict {'findings': str|None, 'impression': str|None}.\n\n Quy luật detect section: mọi header đều VIẾT HOA TOÀN BỘ và kết thúc bằng ':',\n ví dụ: FINDINGS:, IMPRESSION:, FRONTAL AND LATERAL VIEWS OF THE CHEST:\n → dùng regex bắt pattern đó, không hardcode từng keyword.\n\n Nếu không có section FINDINGS tường minh, fallback lấy section\n descriptive đầu tiên (không phải admin header).\n \"\"\"\n try:\n text = txt_path.read_text(encoding=\"utf-8\", errors=\"ignore\")\n except FileNotFoundError:\n return {\"findings\": None, \"impression\": None}\n\n matches = list(SECTION_RE.finditer(text))\n if not matches:\n return {\"findings\": None, \"impression\": None}\n\n # Tách từng section thành (header, content)\n sections = []\n for i, m in enumerate(matches):\n header = m.group(1).strip()\n start = m.end()\n end = matches[i + 1].start() if i + 1 < len(matches) else len(text)\n content = text[start:end].strip()\n sections.append((header, content))\n\n findings = impression = None\n for header, content in sections:\n h = header.upper()\n if \"FINDING\" in h and findings is None:\n findings = content or None\n elif \"IMPRESSION\" in h and impression is None:\n impression = content or None\n\n # Fallback: không có FINDINGS tường minh → lấy section descriptive đầu tiên\n if findings is None:\n for header, content in sections:\n h = header.upper()\n if h not in ADMIN_HEADERS and \"IMPRESSION\" not in h and content:\n 
findings = content\n break\n\n return {\"findings\": findings, \"impression\": impression}\n\n\n# Lấy danh sách unique studies trong p18\np18_studies = (\n df[[\"subject_id\", \"study_id\"]]\n .drop_duplicates(\"study_id\")\n .reset_index(drop=True)\n)\n\nprint(f\"Số study cần parse: {len(p18_studies):,}\")\nprint(\"Parsing reports...\")\n\nrecords = []\nfor _, row in p18_studies.iterrows():\n sid = str(row[\"subject_id\"])\n stid = str(row[\"study_id\"])\n txt_path = CXR_ROOT / \"files\" / \"p18\" / f\"p{sid}\" / f\"s{stid}.txt\"\n parsed = parse_report(txt_path)\n records.append({\"study_id\": stid, **parsed})\n\nreport_df = pd.DataFrame(records)\nreport_df[\"findings_len\"] = report_df[\"findings\"].str.split().str.len()\nreport_df[\"impression_len\"] = report_df[\"impression\"].str.split().str.len()\n\ntotal = len(report_df)\nprint(f\"\\nFindings found : {report_df['findings'].notna().sum():,} / {total:,} ({report_df['findings'].notna().mean()*100:.1f}%)\")\nprint(f\"Impression found : {report_df['impression'].notna().sum():,} / {total:,} ({report_df['impression'].notna().mean()*100:.1f}%)\")\nboth = (report_df['findings'].notna() & report_df['impression'].notna()).sum()\nneither = (report_df['findings'].isna() & report_df['impression'].isna()).sum()\nprint(f\"Cả hai : {both:,} / {total:,} ({both/total*100:.1f}%)\")\nprint(f\"Không có cả hai : {neither:,} / {total:,} ({neither/total*100:.1f}%)\")"
317
+ },
318
+ {
319
+ "cell_type": "code",
320
+ "execution_count": null,
321
+ "metadata": {},
322
+ "outputs": [],
323
+ "source": [
324
+ "# Số nhãn positive mỗi study (label co-occurrence)\n",
325
+ "labels_per_study = (p18_chex[label_cols] == 1).sum(axis=1)\n",
326
+ "print(\"Số nhãn positive mỗi study:\")\n",
327
+ "print(labels_per_study.value_counts().sort_index().to_string())\n",
328
+ "\n",
329
+ "fig, ax = plt.subplots(figsize=(9, 4))\n",
330
+ "lps_counts = labels_per_study.value_counts().sort_index()\n",
331
+ "ax.bar(lps_counts.index.astype(str), lps_counts.values, color=sns.color_palette(\"Blues_d\", len(lps_counts)))\n",
332
+ "ax.set_xlabel(\"Số nhãn positive\")\n",
333
+ "ax.set_ylabel(\"Số study\")\n",
334
+ "ax.set_title(\"Phân bố số nhãn positive mỗi study (p18)\")\n",
335
+ "plt.tight_layout()\n",
336
+ "plt.show()"
337
+ ]
338
+ },
339
+ {
340
+ "cell_type": "markdown",
341
+ "metadata": {},
342
+ "source": [
343
+ "## 6. Phân tích Report — Findings & Impression"
344
+ ]
345
+ },
346
+ {
347
+ "cell_type": "code",
348
+ "execution_count": null,
349
+ "metadata": {},
350
+ "outputs": [],
351
+ "source": [
352
+ "def parse_report(txt_path: Path) -> dict:\n",
353
+ " \"\"\"Trả về dict với 'findings' và 'impression' (str hoặc None).\"\"\"\n",
354
+ " try:\n",
355
+ " text = txt_path.read_text(encoding=\"utf-8\", errors=\"ignore\")\n",
356
+ " except FileNotFoundError:\n",
357
+ " return {\"findings\": None, \"impression\": None}\n",
358
+ "\n",
359
+ " text = re.sub(r\"[\\r\\n]+\", \" \", text) # flatten newlines\n",
360
+ "\n",
361
+ " def extract_section(pattern, text):\n",
362
+ " m = re.search(pattern, text, re.IGNORECASE)\n",
363
+ " if not m:\n",
364
+ " return None\n",
365
+ " start = m.end()\n",
366
+ " # cắt đến section tiếp theo hoặc hết string\n",
367
+ " next_sec = re.search(\n",
368
+ " r\"(IMPRESSION|FINDINGS|CONCLUSION|RECOMMENDATION|NOTIFICATION)\",\n",
369
+ " text[start:], re.IGNORECASE\n",
370
+ " )\n",
371
+ " end = start + next_sec.start() if next_sec else len(text)\n",
372
+ " return text[start:end].strip()\n",
373
+ "\n",
374
+ " findings = extract_section(r\"FINDINGS\\s*:\", text)\n",
375
+ " impression = extract_section(r\"IMPRESSION\\s*:\", text)\n",
376
+ " return {\"findings\": findings, \"impression\": impression}\n",
377
+ "\n",
378
+ "\n",
379
+ "# Lấy danh sách unique studies trong p18\n",
380
+ "p18_studies = (\n",
381
+ " df[[\"subject_id\", \"study_id\"]]\n",
382
+ " .drop_duplicates(\"study_id\")\n",
383
+ " .reset_index(drop=True)\n",
384
+ ")\n",
385
+ "\n",
386
+ "print(f\"Số study cần parse: {len(p18_studies):,}\")\n",
387
+ "print(\"Parsing reports... (có thể mất vài giây)\")\n",
388
+ "\n",
389
+ "records = []\n",
390
+ "for _, row in p18_studies.iterrows():\n",
391
+ " sid = str(row[\"subject_id\"])\n",
392
+ " stid = str(row[\"study_id\"])\n",
393
+ " txt_path = CXR_ROOT / \"files\" / \"p18\" / f\"p{sid}\" / f\"s{stid}.txt\"\n",
394
+ " parsed = parse_report(txt_path)\n",
395
+ " records.append({\"study_id\": stid, **parsed})\n",
396
+ "\n",
397
+ "report_df = pd.DataFrame(records)\n",
398
+ "report_df[\"findings_len\"] = report_df[\"findings\"].dropna().str.split().str.len()\n",
399
+ "report_df[\"impression_len\"] = report_df[\"impression\"].dropna().str.split().str.len()\n",
400
+ "\n",
401
+ "print(f\"\\nFindings found : {report_df['findings'].notna().sum():,} / {len(report_df):,}\")\n",
402
+ "print(f\"Impression found : {report_df['impression'].notna().sum():,} / {len(report_df):,}\")"
403
+ ]
404
+ },
405
+ {
406
+ "cell_type": "code",
407
+ "execution_count": null,
408
+ "metadata": {},
409
+ "outputs": [],
410
+ "source": [
411
+ "# Descriptive stats\n",
412
+ "print(\"=== Findings word count ===\")\n",
413
+ "print(report_df[\"findings_len\"].describe().round(1).to_string())\n",
414
+ "print(\"\\n=== Impression word count ===\")\n",
415
+ "print(report_df[\"impression_len\"].describe().round(1).to_string())"
416
+ ]
417
+ },
418
+ {
419
+ "cell_type": "code",
420
+ "execution_count": null,
421
+ "metadata": {},
422
+ "outputs": [],
423
+ "source": [
424
+ "fig, axes = plt.subplots(1, 2, figsize=(13, 4))\n",
425
+ "\n",
426
+ "for ax, col, title, color in zip(\n",
427
+ " axes,\n",
428
+ " [\"findings_len\", \"impression_len\"],\n",
429
+ " [\"Findings — phân bố độ dài (số từ)\", \"Impression — phân bố độ dài (số từ)\"],\n",
430
+ " [\"steelblue\", \"tomato\"]\n",
431
+ "):\n",
432
+ " data = report_df[col].dropna()\n",
433
+ " # bỏ các giá trị > p99 (outliers) để biểu đồ dễ nhìn\n",
434
+ " p99 = data.quantile(0.99)\n",
435
+ " data_clipped = data[data <= p99]\n",
436
+ " ax.hist(data_clipped, bins=40, color=color, edgecolor=\"white\", alpha=0.85)\n",
437
+ " ax.axvline(data.median(), color=\"black\", linestyle=\"--\", linewidth=1.2, label=f\"Median={data.median():.0f}\")\n",
438
+ " ax.axvline(data.mean(), color=\"gray\", linestyle=\":\", linewidth=1.2, label=f\"Mean={data.mean():.0f}\")\n",
439
+ " ax.set_title(title)\n",
440
+ " ax.set_xlabel(\"Số từ\")\n",
441
+ " ax.set_ylabel(\"Số report\")\n",
442
+ " ax.legend(fontsize=9)\n",
443
+ " ax.text(0.97, 0.95, f\"n={len(data):,}\\n(hiển thị ≤p99={p99:.0f}w)\",\n",
444
+ " transform=ax.transAxes, ha=\"right\", va=\"top\", fontsize=8, color=\"gray\")\n",
445
+ "\n",
446
+ "plt.suptitle(\"Phân bố độ dài report — p18\", fontsize=13)\n",
447
+ "plt.tight_layout()\n",
448
+ "plt.show()"
449
+ ]
450
+ },
451
+ {
452
+ "cell_type": "code",
453
+ "execution_count": null,
454
+ "metadata": {},
455
+ "outputs": [],
456
+ "source": [
457
+ "# Box plot so sánh Findings vs Impression\n",
458
+ "combined = pd.DataFrame({\n",
459
+ " \"word_count\": pd.concat([report_df[\"findings_len\"], report_df[\"impression_len\"]], ignore_index=True),\n",
460
+ " \"section\": [\"Findings\"] * len(report_df) + [\"Impression\"] * len(report_df)\n",
461
+ "}).dropna()\n",
462
+ "\n",
463
+ "fig, ax = plt.subplots(figsize=(7, 4))\n",
464
+ "sns.boxplot(data=combined, x=\"section\", y=\"word_count\",\n",
465
+ " palette=[\"steelblue\", \"tomato\"], showfliers=False, ax=ax)\n",
466
+ "ax.set_title(\"Findings vs Impression — độ dài (box plot, no outliers)\")\n",
467
+ "ax.set_ylabel(\"Số từ\")\n",
468
+ "plt.tight_layout()\n",
469
+ "plt.show()"
470
+ ]
471
+ },
472
+ {
473
+ "cell_type": "markdown",
474
+ "metadata": {},
475
+ "source": [
476
+ "## 7. VQA — phân tích câu hỏi & đáp"
477
+ ]
478
+ },
479
+ {
480
+ "cell_type": "code",
481
+ "execution_count": null,
482
+ "metadata": {},
483
+ "outputs": [],
484
+ "source": [
485
+ "vqa_dfs = []\n",
486
+ "for fpath, split_name in [(VQA_TRAIN, \"train\"), (VQA_VALID, \"valid\"), (VQA_TEST, \"test\")]:\n",
487
+ " if fpath.exists():\n",
488
+ " with open(fpath, encoding=\"utf-8\") as f:\n",
489
+ " data = json.load(f)\n",
490
+ " tmp = pd.DataFrame(data)\n",
491
+ " tmp[\"split\"] = split_name\n",
492
+ " vqa_dfs.append(tmp)\n",
493
+ " else:\n",
494
+ " print(f\"[WARNING] File not found: {fpath}\")\n",
495
+ "\n",
496
+ "vqa_all = pd.concat(vqa_dfs, ignore_index=True)\n",
497
+ "\n",
498
+ "# Lọc p18\n",
499
+ "vqa_p18 = vqa_all[vqa_all[\"subject_id\"].astype(str).str.startswith(\"18\")].copy()\n",
500
+ "\n",
501
+ "print(f\"VQA total records : {len(vqa_all):,}\")\n",
502
+ "print(f\"VQA p18 records : {len(vqa_p18):,}\")\n",
503
+ "print(f\"\\nColumns: {list(vqa_p18.columns)}\")"
504
+ ]
505
+ },
506
+ {
507
+ "cell_type": "code",
508
+ "execution_count": null,
509
+ "metadata": {},
510
+ "outputs": [],
511
+ "source": [
512
+ "# Số VQA mẫu theo split\n",
513
+ "print(\"VQA p18 per split:\")\n",
514
+ "print(vqa_p18[\"split\"].value_counts().to_string())"
515
+ ]
516
+ },
517
+ {
518
+ "cell_type": "code",
519
+ "execution_count": null,
520
+ "metadata": {},
521
+ "outputs": [],
522
+ "source": [
523
+ "# Semantic type: verify / choose / query\n",
524
+ "sem_counts = vqa_p18[\"semantic_type\"].value_counts()\n",
525
+ "print(\"Semantic type (verify/choose/query):\")\n",
526
+ "print(sem_counts.to_string())\n",
527
+ "\n",
528
+ "# Content type: presence / anatomy / attribute / abnormality / size / plane / gender\n",
529
+ "con_counts = vqa_p18[\"content_type\"].value_counts()\n",
530
+ "print(\"\\nContent type:\")\n",
531
+ "print(con_counts.to_string())"
532
+ ]
533
+ },
534
+ {
535
+ "cell_type": "code",
536
+ "execution_count": null,
537
+ "metadata": {},
538
+ "outputs": [],
539
+ "source": [
540
+ "fig, axes = plt.subplots(1, 2, figsize=(13, 4))\n",
541
+ "\n",
542
+ "# Semantic type\n",
543
+ "bars = axes[0].bar(sem_counts.index, sem_counts.values,\n",
544
+ " color=sns.color_palette(\"Set1\", len(sem_counts)))\n",
545
+ "axes[0].bar_label(bars, fmt=\"%d\")\n",
546
+ "axes[0].set_title(\"VQA — Semantic Type (p18)\")\n",
547
+ "axes[0].set_ylabel(\"Count\")\n",
548
+ "\n",
549
+ "# Content type\n",
550
+ "bars2 = axes[1].bar(con_counts.index, con_counts.values,\n",
551
+ " color=sns.color_palette(\"Set2\", len(con_counts)))\n",
552
+ "axes[1].bar_label(bars2, fmt=\"%d\")\n",
553
+ "axes[1].set_title(\"VQA — Content Type (p18)\")\n",
554
+ "axes[1].set_ylabel(\"Count\")\n",
555
+ "axes[1].tick_params(axis=\"x\", rotation=30)\n",
556
+ "\n",
557
+ "plt.suptitle(\"VQA Question Analysis — p18\", fontsize=13)\n",
558
+ "plt.tight_layout()\n",
559
+ "plt.show()"
560
+ ]
561
+ },
562
+ {
563
+ "cell_type": "markdown",
564
+ "id": "c313b9c3",
565
+ "source": "### VQA × View Position — mẫu hỏi đáp thuộc ảnh view nào",
566
+ "metadata": {}
567
+ },
568
+ {
569
+ "cell_type": "code",
570
+ "id": "0791482f",
571
+ "source": "# image_id trong VQA = dicom_id trong metadata\nvqa_view = vqa_p18.merge(\n p18_meta[[\"dicom_id\", \"ViewPosition\"]],\n left_on=\"image_id\", right_on=\"dicom_id\",\n how=\"left\"\n)\n\nmissing_view_vqa = vqa_view[\"ViewPosition\"].isna().sum()\nvqa_view[\"ViewPosition\"] = vqa_view[\"ViewPosition\"].fillna(\"Unknown\")\n\nview_vqa_counts = vqa_view[\"ViewPosition\"].value_counts()\nprint(\"=== VQA samples theo View Position (p18) ===\")\nprint(view_vqa_counts.to_string())\nprint(f\"\\nKhông map được ViewPosition: {missing_view_vqa:,} ({missing_view_vqa/len(vqa_view)*100:.1f}%)\")",
572
+ "metadata": {},
573
+ "execution_count": null,
574
+ "outputs": []
575
+ },
576
+ {
577
+ "cell_type": "code",
578
+ "id": "049baaef",
579
+ "source": "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n\n# 1. Bar: số mẫu VQA theo view\nbars = axes[0].bar(view_vqa_counts.index, view_vqa_counts.values,\n color=sns.color_palette(\"Set2\", len(view_vqa_counts)))\naxes[0].bar_label(bars, fmt=\"%d\")\naxes[0].set_title(\"Số mẫu VQA theo View Position\")\naxes[0].set_ylabel(\"Số mẫu\")\n\n# 2. Pie\naxes[1].pie(view_vqa_counts.values, labels=view_vqa_counts.index,\n autopct=\"%1.1f%%\", colors=sns.color_palette(\"Set2\", len(view_vqa_counts)))\naxes[1].set_title(\"Tỉ lệ VQA theo View Position\")\n\n# 3. Semantic type × View (stacked bar)\nsem_view = vqa_view.groupby([\"ViewPosition\", \"semantic_type\"]).size().unstack(fill_value=0)\nsem_view.plot(kind=\"bar\", ax=axes[2], color=sns.color_palette(\"Set1\", sem_view.shape[1]),\n width=0.7, stacked=True)\naxes[2].set_title(\"Semantic Type × View Position\")\naxes[2].set_xlabel(\"View Position\")\naxes[2].set_ylabel(\"Số mẫu\")\naxes[2].tick_params(axis=\"x\", rotation=30)\naxes[2].legend(title=\"Semantic Type\", fontsize=8)\n\nplt.suptitle(\"VQA × View Position — p18\", fontsize=13)\nplt.tight_layout()\nplt.show()\n\n# Content type × View\nprint(\"\\nContent type theo View Position:\")\nprint(vqa_view.groupby([\"ViewPosition\", \"content_type\"]).size()\n .unstack(fill_value=0).to_string())",
580
+ "metadata": {},
581
+ "execution_count": null,
582
+ "outputs": []
583
+ },
584
+ {
585
+ "cell_type": "code",
586
+ "execution_count": null,
587
+ "metadata": {},
588
+ "outputs": [],
589
+ "source": [
590
+ "# Cross-tab: semantic_type × content_type\n",
591
+ "cross = pd.crosstab(vqa_p18[\"semantic_type\"], vqa_p18[\"content_type\"])\n",
592
+ "print(\"Cross-tab semantic × content:\")\n",
593
+ "print(cross.to_string())\n",
594
+ "\n",
595
+ "fig, ax = plt.subplots(figsize=(10, 3))\n",
596
+ "sns.heatmap(cross, annot=True, fmt=\"d\", cmap=\"YlOrRd\", ax=ax)\n",
597
+ "ax.set_title(\"VQA — Semantic Type × Content Type (p18)\")\n",
598
+ "plt.tight_layout()\n",
599
+ "plt.show()"
600
+ ]
601
+ },
602
+ {
603
+ "cell_type": "code",
604
+ "execution_count": null,
605
+ "metadata": {},
606
+ "outputs": [],
607
+ "source": [
608
+ "# Phân bố độ dài câu hỏi (số từ)\n",
609
+ "vqa_p18[\"q_len\"] = vqa_p18[\"question\"].str.split().str.len()\n",
610
+ "\n",
611
+ "print(\"Question length stats:\")\n",
612
+ "print(vqa_p18[\"q_len\"].describe().round(1).to_string())\n",
613
+ "\n",
614
+ "fig, ax = plt.subplots(figsize=(9, 3))\n",
615
+ "ax.hist(vqa_p18[\"q_len\"].clip(upper=vqa_p18[\"q_len\"].quantile(0.99)),\n",
616
+ " bins=30, color=\"slateblue\", edgecolor=\"white\")\n",
617
+ "ax.axvline(vqa_p18[\"q_len\"].median(), color=\"black\", linestyle=\"--\",\n",
618
+ " label=f\"Median={vqa_p18['q_len'].median():.0f}\")\n",
619
+ "ax.set_title(\"Phân bố độ dài câu hỏi VQA (p18)\")\n",
620
+ "ax.set_xlabel(\"Số từ\")\n",
621
+ "ax.set_ylabel(\"Count\")\n",
622
+ "ax.legend()\n",
623
+ "plt.tight_layout()\n",
624
+ "plt.show()"
625
+ ]
626
+ },
627
+ {
628
+ "cell_type": "code",
629
+ "execution_count": null,
630
+ "metadata": {},
631
+ "outputs": [],
632
+ "source": [
633
+ "# Phân bố dạng câu trả lời: yes/no vs. khác\n",
634
+ "def classify_answer(ans_list):\n",
635
+ " if not isinstance(ans_list, list) or len(ans_list) == 0:\n",
636
+ " return \"no answer\"\n",
637
+ " a = ans_list[0].strip().lower()\n",
638
+ " if a in [\"yes\", \"no\"]:\n",
639
+ " return a\n",
640
+ " return \"open\"\n",
641
+ "\n",
642
+ "vqa_p18[\"ans_type\"] = vqa_p18[\"answer\"].apply(classify_answer)\n",
643
+ "\n",
644
+ "ans_counts = vqa_p18[\"ans_type\"].value_counts()\n",
645
+ "print(\"Answer type distribution:\")\n",
646
+ "print(ans_counts.to_string())\n",
647
+ "\n",
648
+ "fig, ax = plt.subplots(figsize=(6, 3))\n",
649
+ "bars = ax.bar(ans_counts.index, ans_counts.values,\n",
650
+ " color=sns.color_palette(\"Pastel1\", len(ans_counts)))\n",
651
+ "ax.bar_label(bars, fmt=\"%d\")\n",
652
+ "ax.set_title(\"VQA — Answer Type Distribution (p18)\")\n",
653
+ "ax.set_ylabel(\"Count\")\n",
654
+ "plt.tight_layout()\n",
655
+ "plt.show()"
656
+ ]
657
+ },
658
+ {
659
+ "cell_type": "markdown",
660
+ "metadata": {},
661
+ "source": [
662
+ "## 8. Gợi ý thêm — Missing data & Data Quality"
663
+ ]
664
+ },
665
+ {
666
+ "cell_type": "code",
667
+ "execution_count": null,
668
+ "metadata": {},
669
+ "outputs": [],
670
+ "source": [
671
+ "# 8.1 Tỉ lệ study không có findings / không có impression\n",
672
+ "no_findings = report_df[\"findings\"].isna().sum()\n",
673
+ "no_impression = report_df[\"impression\"].isna().sum()\n",
674
+ "total_studies = len(report_df)\n",
675
+ "\n",
676
+ "print(f\"Studies thiếu Findings : {no_findings:,} / {total_studies:,} ({no_findings/total_studies*100:.1f}%)\")\n",
677
+ "print(f\"Studies thiếu Impression : {no_impression:,} / {total_studies:,} ({no_impression/total_studies*100:.1f}%)\")\n",
678
+ "both_missing = (report_df[\"findings\"].isna() & report_df[\"impression\"].isna()).sum()\n",
679
+ "print(f\"Studies thiếu CẢ HAI : {both_missing:,} / {total_studies:,} ({both_missing/total_studies*100:.1f}%)\")"
680
+ ]
681
+ },
682
+ {
683
+ "cell_type": "code",
684
+ "execution_count": null,
685
+ "metadata": {},
686
+ "outputs": [],
687
+ "source": [
688
+ "# 8.2 Tỉ lệ ảnh thiếu ViewPosition\n",
689
+ "missing_view = df[\"ViewPosition\"].isna().sum()\n",
690
+ "print(f\"Ảnh thiếu ViewPosition: {missing_view:,} / {len(df):,} ({missing_view/len(df)*100:.1f}%)\")"
691
+ ]
692
+ },
693
+ {
694
+ "cell_type": "code",
695
+ "execution_count": null,
696
+ "metadata": {},
697
+ "outputs": [],
698
+ "source": [
699
+ "# 8.3 Số bệnh nhân (subject_id) trong p18\n",
700
+ "n_subjects = df[\"subject_id\"].nunique()\n",
701
+ "n_studies = df[\"study_id\"].nunique()\n",
702
+ "n_images = df[\"dicom_id\"].nunique()\n",
703
+ "\n",
704
+ "print(f\"Bệnh nhân (subject_id) : {n_subjects:,}\")\n",
705
+ "print(f\"Lần khám (study_id) : {n_studies:,}\")\n",
706
+ "print(f\"Ảnh (dicom_id) : {n_images:,}\")\n",
707
+ "print(f\"\\nTrung bình study/bệnh nhân : {n_studies/n_subjects:.2f}\")\n",
708
+ "print(f\"Trung bình ảnh/bệnh nhân : {n_images/n_subjects:.2f}\")"
709
+ ]
710
+ },
711
+ {
712
+ "cell_type": "code",
713
+ "execution_count": null,
714
+ "metadata": {},
715
+ "outputs": [],
716
+ "source": [
717
+ "# 8.4 Study distribution per patient\n",
718
+ "studies_per_patient = df.groupby(\"subject_id\")[\"study_id\"].nunique()\n",
719
+ "print(\"Studies per patient stats:\")\n",
720
+ "print(studies_per_patient.describe().round(1).to_string())\n",
721
+ "\n",
722
+ "fig, ax = plt.subplots(figsize=(9, 3))\n",
723
+ "spp = studies_per_patient.value_counts().sort_index()\n",
724
+ "ax.bar(spp.index.astype(str), spp.values, color=\"mediumpurple\")\n",
725
+ "ax.set_xlabel(\"Số study mỗi bệnh nhân\")\n",
726
+ "ax.set_ylabel(\"Số bệnh nhân\")\n",
727
+ "ax.set_title(\"Phân bố số lần khám mỗi bệnh nhân — p18\")\n",
728
+ "ax.xaxis.set_major_locator(mticker.MaxNLocator(integer=True, nbins=20))\n",
729
+ "plt.tight_layout()\n",
730
+ "plt.show()"
731
+ ]
732
+ },
733
+ {
734
+ "cell_type": "code",
735
+ "execution_count": null,
736
+ "metadata": {},
737
+ "outputs": [],
738
+ "source": [
739
+ "# 8.5 Image resolution distribution (nếu có cột Rows/Columns trong metadata)\n",
740
+ "if \"Rows\" in df.columns and \"Columns\" in df.columns:\n",
741
+ " print(\"Image resolution stats:\")\n",
742
+ " print(df[[\"Rows\", \"Columns\"]].describe().round(0).to_string())\n",
743
+ "\n",
744
+ " res_counts = df.groupby([\"Rows\", \"Columns\"]).size().sort_values(ascending=False).head(15)\n",
745
+ " print(\"\\nTop-15 resolutions:\")\n",
746
+ " print(res_counts.to_string())\nelse:\n",
747
+ " print(\"Cột Rows/Columns không có trong metadata.\")"
748
+ ]
749
+ },
750
+ {
751
+ "cell_type": "markdown",
752
+ "metadata": {},
753
+ "source": [
754
+ "## 9. Tóm tắt (Summary)"
755
+ ]
756
+ },
757
+ {
758
+ "cell_type": "code",
759
+ "execution_count": null,
760
+ "metadata": {},
761
+ "outputs": [],
762
+ "source": [
763
+ "print(\"=\"*55)\n",
764
+ "print(\" SUMMARY — MIMIC-CXR Subset p18\")\n",
765
+ "print(\"=\"*55)\n",
766
+ "print(f\" Bệnh nhân : {n_subjects:,}\")\n",
767
+ "print(f\" Studies (reports) : {n_studies:,}\")\n",
768
+ "print(f\" Ảnh (dicom/jpg) : {n_images:,}\")\n",
769
+ "print()\n",
770
+ "for sp in [\"train\", \"validate\", \"test\"]:\n",
771
+ " ni = img_per_split.get(sp, 0)\n",
772
+ " ns = study_per_split.get(sp, 0)\n",
773
+ " print(f\" [{sp:>8}] ảnh={ni:>5,} studies={ns:>5,}\")\n",
774
+ "print()\n",
775
+ "print(f\" Frontal (PA+AP) : {view_counts.get('PA',0)+view_counts.get('AP',0):,} ảnh\")\n",
776
+ "print(f\" Lateral : {view_counts.get('LL',0)+view_counts.get('LATERAL',0):,} ảnh\")\n",
777
+ "print(f\" Findings available : {report_df['findings'].notna().sum():,}/{total_studies:,}\")\n",
778
+ "print(f\" Impression available : {report_df['impression'].notna().sum():,}/{total_studies:,}\")\n",
779
+ "print(f\" VQA samples (p18) : {len(vqa_p18):,}\")\n",
780
+ "print(\"=\"*55)"
781
+ ]
782
+ }
783
+ ],
784
+ "metadata": {
785
+ "kernelspec": {
786
+ "display_name": "Python 3",
787
+ "language": "python",
788
+ "name": "python3"
789
+ },
790
+ "language_info": {
791
+ "name": "python",
792
+ "version": "3.10.0"
793
+ }
794
+ },
795
+ "nbformat": 4,
796
+ "nbformat_minor": 5
797
+ }
data/eda_reports.ipynb ADDED
@@ -0,0 +1,741 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# EDA — MIMIC-CXR Reports\n",
8
+ "\n",
9
+ "Phân tích chuyên sâu toàn bộ report `.txt` trong MIMIC-CXR:\n",
10
+ "- Thống kê tất cả loại section header thực tế\n",
11
+ "- Tỉ lệ report có/thiếu findings, impression theo subset\n",
12
+ "- Phân phối độ dài findings & impression\n",
13
+ "- Parser cập nhật xử lý đầy đủ alias (CONCLUSION, FINDINGS AND IMPRESSION, v.v.)"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "markdown",
18
+ "metadata": {},
19
+ "source": [
20
+ "## 0. Config"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": null,
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "from pathlib import Path\n",
30
+ "\n",
31
+ "CXR_ROOT = Path(r\"D:\\USTH\\KLTN\\cxr-vlm-data\\mimic-cxr-reports\") # files/p10…p19/\n",
32
+ "SPLIT_CSV = Path(r\"D:\\USTH\\KLTN\\cxr-vlm-data\\mimic-cxr-2.0.0-split.csv\")\n",
33
+ "\n",
34
+ "# None = parse hết ~227k, số nguyên = sample nhanh\n",
35
+ "SAMPLE_SIZE = None\n",
36
+ "\n",
37
+ "for name, p in [(\"CXR_ROOT\", CXR_ROOT), (\"SPLIT_CSV\", SPLIT_CSV)]:\n",
38
+ " print(f\" {'✓' if p.exists() else '✗ NOT FOUND'} {name}: {p}\")"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "import re\n",
48
+ "import pandas as pd\n",
49
+ "import numpy as np\n",
50
+ "import matplotlib.pyplot as plt\n",
51
+ "import matplotlib.ticker as mticker\n",
52
+ "import seaborn as sns\n",
53
+ "from collections import Counter\n",
54
+ "\n",
55
+ "sns.set_theme(style=\"whitegrid\", palette=\"muted\")\n",
56
+ "plt.rcParams[\"figure.dpi\"] = 120\n",
57
+ "\n",
58
+ "ALL_SUBSETS = [f\"p{i}\" for i in range(10, 20)]\n",
59
+ "print(\"Ready.\")"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "markdown",
64
+ "metadata": {},
65
+ "source": [
66
+ "## 1. Parser — ALL-CAPS header detection\n",
67
+ "\n",
68
+ "Quy luật: mọi section header trong MIMIC-CXR đều **VIẾT HOA TOÀN BỘ** và kết thúc bằng `:` \n",
69
+ "→ dùng regex detect tất cả, sau đó phân loại theo nhóm."
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": null,
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "# Regex: dòng bắt đầu bằng chuỗi ALL-CAPS rồi đến \":\"\n",
79
+ "SECTION_RE = re.compile(r'^[ \\t]*([A-Z][A-Z ,/()\\.\\-]{1,70}?):\\s*', re.MULTILINE)\n",
80
+ "\n",
81
+ "# ── Nhóm IMPRESSION (nội dung kết luận) ─────────────────────────────────────\n",
82
+ "IMPRESSION_KEYWORDS = {\n",
83
+ " \"IMPRESSION\",\n",
84
+ " \"CONCLUSION\",\n",
85
+ " \"CONCLUSIONS\",\n",
86
+ " \"FINDINGS AND IMPRESSION\",\n",
87
+ " \"FINDINGS/IMPRESSION\",\n",
88
+ " \"PROVISIONAL FINDINGS IMPRESSION (PFI)\",\n",
89
+ " \"PFI\",\n",
90
+ " \"WET READ\", # quick impression trước khi có final report\n",
91
+ " \"RECOMMENDATION\",\n",
92
+ " \"RECOMMENDATION(S)\",\n",
93
+ " \"RECOMMENDATIONS\",\n",
94
+ "}\n",
95
+ "\n",
96
+ "# ── Nhóm FINDINGS (mô tả hình ảnh) ──────────────────────────────────────────\n",
97
+ "FINDINGS_KEYWORDS = {\n",
98
+ " \"FINDINGS\",\n",
99
+ " \"REPORT\",\n",
100
+ "}\n",
101
+ "\n",
102
+ "# Patterns dạng view description (findings không tường minh)\n",
103
+ "FINDINGS_VIEW_RE = re.compile(\n",
104
+ " r'(VIEW|VIEWS|RADIOGRAPH|RADIOGRAPHS|CHEST|PORTABLE|FRONTAL|LATERAL|PA AND|AP AND|UPRIGHT|SUPINE|SEMI)',\n",
105
+ " re.IGNORECASE\n",
106
+ ")\n",
107
+ "\n",
108
+ "# ── Admin headers (bỏ qua khi fallback) ─────────────────────────────────────\n",
109
+ "ADMIN_KEYWORDS = {\n",
110
+ " \"EXAMINATION\", \"EXAM\", \"INDICATION\", \"INDICATIONS\",\n",
111
+ " \"CLINICAL INDICATION\", \"CLINICAL HISTORY\", \"CLINICAL INFORMATION\",\n",
112
+ " \"TECHNIQUE\", \"COMPARISON\", \"COMPARISONS\", \"COMPARISON EXAM\",\n",
113
+ " \"COMPARISON FILM\", \"COMPARISON STUDY\", \"REFERENCE EXAM\",\n",
114
+ " \"HISTORY\", \"PATIENT HISTORY\",\n",
115
+ " \"REASON\", \"REASON FOR EXAM\", \"REASON FOR EXAMINATION\",\n",
116
+ " \"TYPE OF EXAMINATION\", \"PROCEDURE\",\n",
117
+ " \"NOTIFICATION\", \"NOTIFICATIONS\", \"ADDENDUM\",\n",
118
+ " \"STUDY\", \"DATE\", \"CC\", \"NOTE\", \"COMMENT\", \"COMMENTS\",\n",
119
+ " \"FINAL REPORT\",\n",
120
+ "}\n",
121
+ "\n",
122
+ "\n",
123
+ "def classify_header(h: str) -> str:\n",
124
+ " \"\"\"Phân loại header vào: findings / impression / admin / view_desc / other.\"\"\"\n",
125
+ " h = h.upper().strip()\n",
126
+ " if h in FINDINGS_KEYWORDS or \"FINDING\" in h:\n",
127
+ " return \"findings\"\n",
128
+ " if h in IMPRESSION_KEYWORDS or \"IMPRESSION\" in h or \"CONCLUSION\" in h:\n",
129
+ " return \"impression\"\n",
130
+ " if h in ADMIN_KEYWORDS:\n",
131
+ " return \"admin\"\n",
132
+ " if FINDINGS_VIEW_RE.search(h):\n",
133
+ " return \"view_desc\" # potential findings\n",
134
+ " return \"other\"\n",
135
+ "\n",
136
+ "\n",
137
+ "def parse_report(txt_path: Path) -> dict:\n",
138
+ " \"\"\"\n",
139
+ " Trả về dict:\n",
140
+ " findings : str | None\n",
141
+ " impression : str | None\n",
142
+ " sections : list of (header, category, content)\n",
143
+ " \"\"\"\n",
144
+ " try:\n",
145
+ " text = txt_path.read_text(encoding=\"utf-8\", errors=\"ignore\")\n",
146
+ " except FileNotFoundError:\n",
147
+ " return {\"findings\": None, \"impression\": None, \"sections\": []}\n",
148
+ "\n",
149
+ " matches = list(SECTION_RE.finditer(text))\n",
150
+ " if not matches:\n",
151
+ " return {\"findings\": None, \"impression\": None, \"sections\": []}\n",
152
+ "\n",
153
+ " sections = []\n",
154
+ " for i, m in enumerate(matches):\n",
155
+ " header = m.group(1).strip()\n",
156
+ " start = m.end()\n",
157
+ " end = matches[i + 1].start() if i + 1 < len(matches) else len(text)\n",
158
+ " content = text[start:end].strip()\n",
159
+ " cat = classify_header(header)\n",
160
+ " sections.append((header, cat, content))\n",
161
+ "\n",
162
+ " findings = impression = None\n",
163
+ "\n",
164
+ " # Pass 1: tìm tường minh\n",
165
+ " for header, cat, content in sections:\n",
166
+ " if cat == \"findings\" and findings is None:\n",
167
+ " findings = content or None\n",
168
+ " elif cat == \"impression\" and impression is None:\n",
169
+ " impression = content or None\n",
170
+ "\n",
171
+ " # Pass 2: fallback findings từ view_desc\n",
172
+ " if findings is None:\n",
173
+ " for header, cat, content in sections:\n",
174
+ " if cat == \"view_desc\" and content:\n",
175
+ " findings = content\n",
176
+ " break\n",
177
+ "\n",
178
+ " return {\"findings\": findings, \"impression\": impression, \"sections\": sections}\n",
179
+ "\n",
180
+ "\n",
181
+ "print(\"Parser defined.\")"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "markdown",
186
+ "metadata": {},
187
+ "source": [
188
+ "## 2. Load & parse tất cả reports"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": null,
194
+ "metadata": {},
195
+ "outputs": [],
196
+ "source": [
197
+ "# Lấy danh sách study từ split.csv để biết subset của từng study\n",
198
+ "split_df = pd.read_csv(SPLIT_CSV)\n",
199
+ "split_df[\"subset\"] = \"p\" + split_df[\"subject_id\"].astype(str).str[:2]\n",
200
+ "\n",
201
+ "studies = (\n",
202
+ " split_df[[\"subject_id\", \"study_id\", \"subset\", \"split\"]]\n",
203
+ " .drop_duplicates(\"study_id\")\n",
204
+ " .reset_index(drop=True)\n",
205
+ ")\n",
206
+ "\n",
207
+ "if SAMPLE_SIZE:\n",
208
+ " studies = studies.sample(n=min(SAMPLE_SIZE, len(studies)), random_state=42).reset_index(drop=True)\n",
209
+ " print(f\"Sample: {len(studies):,} studies\")\n",
210
+ "else:\n",
211
+ " print(f\"Total studies to parse: {len(studies):,}\")"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": null,
217
+ "metadata": {},
218
+ "outputs": [],
219
+ "source": [
220
+ "records = []\n",
221
+ "header_counter = Counter()\n",
222
+ "\n",
223
+ "for _, row in studies.iterrows():\n",
224
+ " sid = str(row[\"subject_id\"])\n",
225
+ " stid = str(row[\"study_id\"])\n",
226
+ " subset = row[\"subset\"]\n",
227
+ " split = row[\"split\"]\n",
228
+ " path = CXR_ROOT / \"files\" / subset / f\"p{sid}\" / f\"s{stid}.txt\"\n",
229
+ "\n",
230
+ " result = parse_report(path)\n",
231
+ "\n",
232
+ " for header, cat, _ in result[\"sections\"]:\n",
233
+ " header_counter[header.upper()] += 1\n",
234
+ "\n",
235
+ " records.append({\n",
236
+ " \"study_id\": stid,\n",
237
+ " \"subject_id\": sid,\n",
238
+ " \"subset\": subset,\n",
239
+ " \"split\": split,\n",
240
+ " \"findings\": result[\"findings\"],\n",
241
+ " \"impression\": result[\"impression\"],\n",
242
+ " \"n_sections\": len(result[\"sections\"]),\n",
243
+ " \"section_headers\": \"|\".join(h for h, _, _ in result[\"sections\"]),\n",
244
+ " })\n",
245
+ "\n",
246
+ "df = pd.DataFrame(records)\n",
247
+ "df[\"findings_len\"] = df[\"findings\"].str.split().str.len()\n",
248
+ "df[\"impression_len\"] = df[\"impression\"].str.split().str.len()\n",
249
+ "\n",
250
+ "total = len(df)\n",
251
+ "has_f = df[\"findings\"].notna().sum()\n",
252
+ "has_i = df[\"impression\"].notna().sum()\n",
253
+ "has_both = (df[\"findings\"].notna() & df[\"impression\"].notna()).sum()\n",
254
+ "has_neither = (df[\"findings\"].isna() & df[\"impression\"].isna()).sum()\n",
255
+ "\n",
256
+ "print(f\"Total studies parsed : {total:,}\")\n",
257
+ "print(f\"Has findings : {has_f:,} ({has_f/total*100:.1f}%)\")\n",
258
+ "print(f\"Has impression : {has_i:,} ({has_i/total*100:.1f}%)\")\n",
259
+ "print(f\"Has both : {has_both:,} ({has_both/total*100:.1f}%)\")\n",
260
+ "print(f\"Has neither : {has_neither:,} ({has_neither/total*100:.1f}%)\")"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "markdown",
265
+ "metadata": {},
266
+ "source": [
267
+ "## 3. Thống kê tất cả section headers"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": null,
273
+ "metadata": {},
274
+ "outputs": [],
275
+ "source": [
276
+ "# Bảng đầy đủ tất cả headers + category\n",
277
+ "header_rows = []\n",
278
+ "for h, cnt in header_counter.most_common():\n",
279
+ " header_rows.append({\n",
280
+ " \"header\": h,\n",
281
+ " \"count\": cnt,\n",
282
+ " \"category\": classify_header(h),\n",
283
+ " \"pct\": cnt / total * 100\n",
284
+ " })\n",
285
+ "\n",
286
+ "header_df = pd.DataFrame(header_rows)\n",
287
+ "\n",
288
+ "print(f\"Distinct section headers: {len(header_df)}\")\n",
289
+ "print(\"\\n=== Top 50 headers ===\")\n",
290
+ "print(header_df.head(50).to_string(index=False))"
291
+ ]
292
+ },
293
+ {
294
+ "cell_type": "code",
295
+ "execution_count": null,
296
+ "metadata": {},
297
+ "outputs": [],
298
+ "source": [
299
+ "# Phân bố theo category\n",
300
+ "cat_summary = header_df.groupby(\"category\")[\"count\"].sum().sort_values(ascending=False)\n",
301
+ "print(\"=== Tổng count theo category ===\")\n",
302
+ "print(cat_summary.to_string())\n",
303
+ "\n",
304
+ "fig, axes = plt.subplots(1, 2, figsize=(13, 4))\n",
305
+ "\n",
306
+ "# Bar: category totals\n",
307
+ "colors = {\"findings\": \"#4C72B0\", \"impression\": \"#DD8452\",\n",
308
+ " \"admin\": \"#8c8c8c\", \"view_desc\": \"#55A868\", \"other\": \"#C44E52\"}\n",
309
+ "cat_colors = [colors.get(c, \"gray\") for c in cat_summary.index]\n",
310
+ "bars = axes[0].bar(cat_summary.index, cat_summary.values, color=cat_colors)\n",
311
+ "axes[0].bar_label(bars, fmt=\"%d\")\n",
312
+ "axes[0].set_title(\"Tổng số lần xuất hiện theo category\")\n",
313
+ "axes[0].set_ylabel(\"Count\")\n",
314
+ "axes[0].tick_params(axis=\"x\", rotation=20)\n",
315
+ "\n",
316
+ "# Bar: số header distinct mỗi category\n",
317
+ "cat_distinct = header_df.groupby(\"category\").size().sort_values(ascending=False)\n",
318
+ "bars2 = axes[1].bar(cat_distinct.index, cat_distinct.values,\n",
319
+ " color=[colors.get(c, \"gray\") for c in cat_distinct.index])\n",
320
+ "axes[1].bar_label(bars2, fmt=\"%d\")\n",
321
+ "axes[1].set_title(\"Số header phân biệt mỗi category\")\n",
322
+ "axes[1].set_ylabel(\"Distinct headers\")\n",
323
+ "axes[1].tick_params(axis=\"x\", rotation=20)\n",
324
+ "\n",
325
+ "plt.suptitle(\"Section Header Categories\", fontsize=13)\n",
326
+ "plt.tight_layout()\n",
327
+ "plt.show()"
328
+ ]
329
+ },
330
+ {
331
+ "cell_type": "code",
332
+ "execution_count": null,
333
+ "metadata": {},
334
+ "outputs": [],
335
+ "source": [
336
+ "# Top headers mỗi category\n",
337
+ "for cat in [\"findings\", \"impression\", \"view_desc\", \"other\"]:\n",
338
+ " sub = header_df[header_df[\"category\"] == cat].head(15)\n",
339
+ " print(f\"\\n=== [{cat}] Top headers ===\")\n",
340
+ " print(sub[[\"header\", \"count\", \"pct\"]].to_string(index=False))"
341
+ ]
342
+ },
343
+ {
344
+ "cell_type": "code",
345
+ "execution_count": null,
346
+ "metadata": {},
347
+ "outputs": [],
348
+ "source": [
349
+ "# Top 20 headers — horizontal bar\n",
350
+ "top20 = header_df.head(20).copy()\n",
351
+ "top20_colors = [colors.get(c, \"gray\") for c in top20[\"category\"]]\n",
352
+ "\n",
353
+ "fig, ax = plt.subplots(figsize=(10, 7))\n",
354
+ "bars = ax.barh(top20[\"header\"][::-1], top20[\"count\"][::-1], color=top20_colors[::-1])\n",
355
+ "ax.bar_label(bars, fmt=\"%d\", padding=3, fontsize=8)\n",
356
+ "ax.set_xlabel(\"Count\")\n",
357
+ "ax.set_title(\"Top 20 Section Headers (tô màu theo category)\")\n",
358
+ "\n",
359
+ "from matplotlib.patches import Patch\n",
360
+ "legend_elements = [Patch(facecolor=v, label=k) for k, v in colors.items()]\n",
361
+ "ax.legend(handles=legend_elements, loc=\"lower right\", fontsize=9)\n",
362
+ "plt.tight_layout()\n",
363
+ "plt.show()"
364
+ ]
365
+ },
366
+ {
367
+ "cell_type": "markdown",
368
+ "metadata": {},
369
+ "source": [
370
+ "## 4. Tỉ lệ có/thiếu Findings & Impression"
371
+ ]
372
+ },
373
+ {
374
+ "cell_type": "code",
375
+ "execution_count": null,
376
+ "metadata": {},
377
+ "outputs": [],
378
+ "source": [
379
+ "# Tạo cột status\n",
380
+ "def report_status(row):\n",
381
+ " f = row[\"findings\"] is not None\n",
382
+ " i = row[\"impression\"] is not None\n",
383
+ " if f and i: return \"both\"\n",
384
+ " if f: return \"findings only\"\n",
385
+ " if i: return \"impression only\"\n",
386
+ " return \"neither\"\n",
387
+ "\n",
388
+ "df[\"status\"] = df.apply(report_status, axis=1)\n",
389
+ "\n",
390
+ "status_counts = df[\"status\"].value_counts()\n",
391
+ "print(\"=== Report completeness (full dataset) ===\")\n",
392
+ "for s, c in status_counts.items():\n",
393
+ " print(f\" {s:<20}: {c:>7,} ({c/total*100:.1f}%)\")"
394
+ ]
395
+ },
396
+ {
397
+ "cell_type": "code",
398
+ "execution_count": null,
399
+ "metadata": {},
400
+ "outputs": [],
401
+ "source": [
402
+ "fig, axes = plt.subplots(1, 2, figsize=(12, 4))\n",
403
+ "status_order = [\"both\", \"impression only\", \"findings only\", \"neither\"]\n",
404
+ "status_colors = [\"#55A868\", \"#DD8452\", \"#4C72B0\", \"#C44E52\"]\n",
405
+ "vals = [status_counts.get(s, 0) for s in status_order]\n",
406
+ "\n",
407
+ "bars = axes[0].bar(status_order, vals, color=status_colors)\n",
408
+ "axes[0].bar_label(bars, fmt=\"%d\")\n",
409
+ "axes[0].set_title(\"Report completeness (count)\")\n",
410
+ "axes[0].set_ylabel(\"Số report\")\n",
411
+ "axes[0].tick_params(axis=\"x\", rotation=15)\n",
412
+ "\n",
413
+ "axes[1].pie(vals, labels=status_order, autopct=\"%1.1f%%\", colors=status_colors,\n",
414
+ " startangle=140)\n",
415
+ "axes[1].set_title(\"Report completeness (%)\")\n",
416
+ "\n",
417
+ "plt.suptitle(\"Findings & Impression Availability\", fontsize=13)\n",
418
+ "plt.tight_layout()\n",
419
+ "plt.show()"
420
+ ]
421
+ },
422
+ {
423
+ "cell_type": "markdown",
424
+ "metadata": {},
425
+ "source": [
426
+ "## 5. Breakdown theo Subset"
427
+ ]
428
+ },
429
+ {
430
+ "cell_type": "code",
431
+ "execution_count": null,
432
+ "metadata": {},
433
+ "outputs": [],
434
+ "source": [
435
+ "subset_stats = df.groupby(\"subset\").apply(lambda g: pd.Series({\n",
436
+ " \"total\": len(g),\n",
437
+ " \"has_findings\": g[\"findings\"].notna().sum(),\n",
438
+ " \"has_impression\": g[\"impression\"].notna().sum(),\n",
439
+ " \"has_both\": (g[\"findings\"].notna() & g[\"impression\"].notna()).sum(),\n",
440
+ " \"has_neither\": (g[\"findings\"].isna() & g[\"impression\"].isna()).sum(),\n",
441
+ "})).reindex(ALL_SUBSETS)\n",
442
+ "\n",
443
+ "subset_pct = subset_stats.div(subset_stats[\"total\"], axis=0).mul(100).round(1)\n",
444
+ "\n",
445
+ "print(\"=== Count per subset ===\")\n",
446
+ "print(subset_stats.to_string())\n",
447
+ "print(\"\\n=== % per subset ===\")\n",
448
+ "print(subset_pct[[\"has_findings\",\"has_impression\",\"has_both\",\"has_neither\"]].to_string())"
449
+ ]
450
+ },
451
+ {
452
+ "cell_type": "code",
453
+ "execution_count": null,
454
+ "metadata": {},
455
+ "outputs": [],
456
+ "source": [
457
+ "fig, axes = plt.subplots(1, 2, figsize=(15, 5))\n",
458
+ "\n",
459
+ "# Stacked % bar\n",
460
+ "status_per_subset = (\n",
461
+ " df.groupby([\"subset\", \"status\"]).size()\n",
462
+ " .unstack(fill_value=0)\n",
463
+ " .reindex(ALL_SUBSETS, fill_value=0)\n",
464
+ ")\n",
465
+ "# Tỉ lệ %\n",
466
+ "status_pct_subset = status_per_subset.div(status_per_subset.sum(axis=1), axis=0) * 100\n",
467
+ "status_pct_subset = status_pct_subset.reindex(\n",
468
+ " columns=[c for c in status_order if c in status_pct_subset.columns]\n",
469
+ ")\n",
470
+ "status_pct_subset.plot(\n",
471
+ " kind=\"bar\", stacked=True, ax=axes[0],\n",
472
+ " color=[status_colors[status_order.index(c)] for c in status_pct_subset.columns],\n",
473
+ " width=0.75\n",
474
+ ")\n",
475
+ "axes[0].set_title(\"Report completeness (%) theo subset\")\n",
476
+ "axes[0].set_ylabel(\"%\")\n",
477
+ "axes[0].set_ylim(0, 105)\n",
478
+ "axes[0].tick_params(axis=\"x\", rotation=0)\n",
479
+ "axes[0].legend(loc=\"lower right\", fontsize=8)\n",
480
+ "\n",
481
+ "# Heatmap tỉ lệ % has_both / has_neither\n",
482
+ "heatmap_data = subset_pct[[\"has_findings\",\"has_impression\",\"has_both\",\"has_neither\"]]\n",
483
+ "sns.heatmap(heatmap_data, annot=True, fmt=\".1f\", cmap=\"RdYlGn\",\n",
484
+ " linewidths=0.5, ax=axes[1], vmin=0, vmax=100,\n",
485
+ " cbar_kws={\"label\": \"%\"})\n",
486
+ "axes[1].set_title(\"Tỉ lệ (%) completeness mỗi subset\")\n",
487
+ "\n",
488
+ "plt.suptitle(\"Report Completeness per Subset\", fontsize=13)\n",
489
+ "plt.tight_layout()\n",
490
+ "plt.show()"
491
+ ]
492
+ },
493
+ {
494
+ "cell_type": "code",
495
+ "execution_count": null,
496
+ "metadata": {},
497
+ "outputs": [],
498
+ "source": [
499
+ "# Breakdown theo split (train/validate/test)\n",
500
+ "split_stats = df.groupby(\"split\").apply(lambda g: pd.Series({\n",
501
+ " \"total\": len(g),\n",
502
+ " \"has_findings %\": g[\"findings\"].notna().mean() * 100,\n",
503
+ " \"has_impression %\": g[\"impression\"].notna().mean() * 100,\n",
504
+ " \"has_both %\": (g[\"findings\"].notna() & g[\"impression\"].notna()).mean() * 100,\n",
505
+ " \"has_neither %\": (g[\"findings\"].isna() & g[\"impression\"].isna()).mean() * 100,\n",
506
+ "})).reindex([\"train\", \"validate\", \"test\"])\n",
507
+ "\n",
508
+ "print(\"=== Completeness by split ===\")\n",
509
+ "print(split_stats.round(1).to_string())"
510
+ ]
511
+ },
512
+ {
513
+ "cell_type": "markdown",
514
+ "metadata": {},
515
+ "source": [
516
+ "## 6. Phân phối độ dài Findings & Impression"
517
+ ]
518
+ },
519
+ {
520
+ "cell_type": "code",
521
+ "execution_count": null,
522
+ "metadata": {},
523
+ "outputs": [],
524
+ "source": [
525
+ "print(\"=== Findings word count ===\")\n",
526
+ "print(df[\"findings_len\"].describe().round(1).to_string())\n",
527
+ "print(\"\\n=== Impression word count ===\")\n",
528
+ "print(df[\"impression_len\"].describe().round(1).to_string())"
529
+ ]
530
+ },
531
+ {
532
+ "cell_type": "code",
533
+ "execution_count": null,
534
+ "metadata": {},
535
+ "outputs": [],
536
+ "source": [
537
+ "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
538
+ "for ax, col, title, color in zip(\n",
539
+ " axes,\n",
540
+ " [\"findings_len\", \"impression_len\"],\n",
541
+ " [\"Findings — độ dài (số từ)\", \"Impression — độ dài (số từ)\"],\n",
542
+ " [\"#4C72B0\", \"#DD8452\"]\n",
543
+ "):\n",
544
+ " data = df[col].dropna()\n",
545
+ " p99 = data.quantile(0.99)\n",
546
+ " ax.hist(data[data <= p99], bins=60, color=color, edgecolor=\"white\", alpha=0.85)\n",
547
+ " ax.axvline(data.median(), color=\"black\", ls=\"--\", lw=1.3, label=f\"Median={data.median():.0f}\")\n",
548
+ " ax.axvline(data.mean(), color=\"gray\", ls=\":\", lw=1.3, label=f\"Mean={data.mean():.0f}\")\n",
549
+ " ax.set_title(title)\n",
550
+ " ax.set_xlabel(\"Số từ\")\n",
551
+ " ax.set_ylabel(\"Số report\")\n",
552
+ " ax.legend(fontsize=9)\n",
553
+ " ax.text(0.97, 0.95, f\"n={len(data):,}\\n(≤p99={p99:.0f}w)\",\n",
554
+ " transform=ax.transAxes, ha=\"right\", va=\"top\", fontsize=8, color=\"gray\")\n",
555
+ "\n",
556
+ "plt.suptitle(\"Phân phối độ dài Findings & Impression\", fontsize=13)\n",
557
+ "plt.tight_layout()\n",
558
+ "plt.show()"
559
+ ]
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "execution_count": null,
564
+ "metadata": {},
565
+ "outputs": [],
566
+ "source": [
567
+ "# Boxplot: Findings vs Impression\n",
568
+ "combined = pd.concat([\n",
569
+ " df[[\"findings_len\"]].rename(columns={\"findings_len\": \"words\"}).assign(section=\"Findings\"),\n",
570
+ " df[[\"impression_len\"]].rename(columns={\"impression_len\": \"words\"}).assign(section=\"Impression\"),\n",
571
+ "]).dropna()\n",
572
+ "\n",
573
+ "fig, ax = plt.subplots(figsize=(7, 4))\n",
574
+ "sns.boxplot(data=combined, x=\"section\", y=\"words\",\n",
575
+ " palette=[\"#4C72B0\", \"#DD8452\"], showfliers=False, ax=ax)\n",
576
+ "ax.set_title(\"Findings vs Impression — độ dài (no outliers)\")\n",
577
+ "ax.set_ylabel(\"Số từ\")\n",
578
+ "plt.tight_layout()\n",
579
+ "plt.show()"
580
+ ]
581
+ },
582
+ {
583
+ "cell_type": "code",
584
+ "execution_count": null,
585
+ "metadata": {},
586
+ "outputs": [],
587
+ "source": [
588
+ "# Median length per subset\n",
589
+ "med_subset = df.groupby(\"subset\")[[\"findings_len\",\"impression_len\"]].median().reindex(ALL_SUBSETS)\n",
590
+ "\n",
591
+ "med_subset.plot(kind=\"bar\", figsize=(12, 4),\n",
592
+ " color=[\"#4C72B0\", \"#DD8452\"], width=0.7)\n",
593
+ "plt.title(\"Median độ dài Findings & Impression theo subset\")\n",
594
+ "plt.xlabel(\"Subset\")\n",
595
+ "plt.ylabel(\"Median (số từ)\")\n",
596
+ "plt.xticks(rotation=0)\n",
597
+ "plt.legend([\"Findings\", \"Impression\"])\n",
598
+ "plt.tight_layout()\n",
599
+ "plt.show()"
600
+ ]
601
+ },
602
+ {
603
+ "cell_type": "code",
604
+ "execution_count": null,
605
+ "metadata": {},
606
+ "outputs": [],
607
+ "source": [
608
+ "# Heatmap percentile độ dài theo subset\n",
609
+ "for col, label in [(\"findings_len\", \"Findings\"), (\"impression_len\", \"Impression\")]:\n",
610
+ " pct_data = df.groupby(\"subset\")[col].describe(\n",
611
+ " percentiles=[.25, .5, .75, .95]\n",
612
+ " )[[\"count\", \"mean\", \"25%\", \"50%\", \"75%\", \"95%\", \"max\"]].reindex(ALL_SUBSETS).round(0)\n",
613
+ " print(f\"\\n=== {label} length per subset ===\")\n",
614
+ " print(pct_data.to_string())"
615
+ ]
616
+ },
617
+ {
618
+ "cell_type": "markdown",
619
+ "metadata": {},
620
+ "source": [
621
+ "## 7. Reports \"has neither\" — phân tích thêm"
622
+ ]
623
+ },
624
+ {
625
+ "cell_type": "code",
626
+ "execution_count": null,
627
+ "metadata": {},
628
+ "outputs": [],
629
+ "source": [
630
+ "neither_df = df[df[\"status\"] == \"neither\"].copy()\n",
631
+ "print(f\"Reports không có cả findings lẫn impression: {len(neither_df):,}\")\n",
632
+ "print(f\"\\nPhân bố n_sections của những report này:\")\n",
633
+ "print(neither_df[\"n_sections\"].value_counts().sort_index().head(10).to_string())\n",
634
+ "\n",
635
+ "# Xem top headers trong những report này\n",
636
+ "neither_headers = Counter()\n",
637
+ "for row in neither_df[\"section_headers\"]:\n",
638
+ " if isinstance(row, str):\n",
639
+ " for h in row.split(\"|\"):\n",
640
+ " if h:\n",
641
+ " neither_headers[h] += 1\n",
642
+ "\n",
643
+ "print(\"\\nTop section headers trong reports 'neither':\")\n",
644
+ "for h, c in neither_headers.most_common(15):\n",
645
+ " print(f\" {c:>6,} {h}\")"
646
+ ]
647
+ },
648
+ {
649
+ "cell_type": "code",
650
+ "execution_count": null,
651
+ "metadata": {},
652
+ "outputs": [],
653
+ "source": [
654
+ "# Vài ví dụ report có neither\n",
655
+ "import random\n",
656
+ "random.seed(0)\n",
657
+ "sample_neither = neither_df.sample(min(3, len(neither_df)), random_state=0)\n",
658
+ "for _, row in sample_neither.iterrows():\n",
659
+ " path = CXR_ROOT / \"files\" / row[\"subset\"] / f\"p{row['subject_id']}\" / f\"s{row['study_id']}.txt\"\n",
660
+ " print(f\"\\n{'='*60}\")\n",
661
+ " print(f\"s{row['study_id']}.txt (sections: {row['section_headers']})\")\n",
662
+ " try:\n",
663
+ " print(path.read_text(encoding=\"utf-8\", errors=\"ignore\")[:600])\n",
664
+ " except:\n",
665
+ " print(\"[file not found]\")"
666
+ ]
667
+ },
668
+ {
669
+ "cell_type": "markdown",
670
+ "metadata": {},
671
+ "source": [
672
+ "## 8. Số section mỗi report"
673
+ ]
674
+ },
675
+ {
676
+ "cell_type": "code",
677
+ "execution_count": null,
678
+ "metadata": {},
679
+ "outputs": [],
680
+ "source": [
681
+ "sec_dist = df[\"n_sections\"].value_counts().sort_index()\n",
682
+ "print(\"Phân bố số sections mỗi report:\")\n",
683
+ "print(sec_dist.head(20).to_string())\n",
684
+ "\n",
685
+ "fig, ax = plt.subplots(figsize=(11, 4))\n",
686
+ "sec_clip = sec_dist[sec_dist.index <= df[\"n_sections\"].quantile(0.99)]\n",
687
+ "ax.bar(sec_clip.index.astype(str), sec_clip.values, color=\"steelblue\")\n",
688
+ "ax.set_xlabel(\"Số sections\")\n",
689
+ "ax.set_ylabel(\"Số report\")\n",
690
+ "ax.set_title(\"Phân bố số sections mỗi report (≤p99)\")\n",
691
+ "ax.axvline(str(int(df[\"n_sections\"].median())), color=\"black\", ls=\"--\",\n",
692
+ " label=f\"Median={df['n_sections'].median():.0f}\")\n",
693
+ "ax.legend()\n",
694
+ "plt.tight_layout()\n",
695
+ "plt.show()"
696
+ ]
697
+ },
698
+ {
699
+ "cell_type": "markdown",
700
+ "metadata": {},
701
+ "source": [
702
+ "## 9. Summary"
703
+ ]
704
+ },
705
+ {
706
+ "cell_type": "code",
707
+ "execution_count": null,
708
+ "metadata": {},
709
+ "outputs": [],
710
+ "source": [
711
+ "print(\"=\"*60)\n",
712
+ "print(\" MIMIC-CXR Report EDA Summary\")\n",
713
+ "print(\"=\"*60)\n",
714
+ "print(f\" Total reports parsed : {total:,}\")\n",
715
+ "print(f\" Distinct section headers : {len(header_df)}\")\n",
716
+ "print()\n",
717
+ "print(f\" Has findings : {has_f:,} ({has_f/total*100:.1f}%)\")\n",
718
+ "print(f\" Has impression : {has_i:,} ({has_i/total*100:.1f}%)\")\n",
719
+ "print(f\" Has BOTH (usable) : {has_both:,} ({has_both/total*100:.1f}%)\")\n",
720
+ "print(f\" Has neither : {has_neither:,} ({has_neither/total*100:.1f}%)\")\n",
721
+ "print()\n",
722
+ "print(f\" Findings median length : {df['findings_len'].median():.0f} words\")\n",
723
+ "print(f\" Impression median length : {df['impression_len'].median():.0f} words\")\n",
724
+ "print(\"=\"*60)"
725
+ ]
726
+ }
727
+ ],
728
+ "metadata": {
729
+ "kernelspec": {
730
+ "display_name": "Python 3",
731
+ "language": "python",
732
+ "name": "python3"
733
+ },
734
+ "language_info": {
735
+ "name": "python",
736
+ "version": "3.10.0"
737
+ }
738
+ },
739
+ "nbformat": 4,
740
+ "nbformat_minor": 5
741
+ }
model/cxr_vlm.py CHANGED
@@ -53,10 +53,14 @@ class CXRVisionLanguageModel(nn.Module):
53
  super().__init__()
54
  self.cfg = model_cfg
55
 
56
- # ── 1. Image Encoder (BioViL-T, frozen) ─────────────────────────────
 
 
 
57
  self.image_encoder = BioViLTEncoder(
58
  frozen = model_cfg.image_encoder.frozen,
59
  img_size = model_cfg.image_encoder.img_size,
 
60
  )
61
 
62
  # ── 2. MLP Projection (trained) ──────────────────────────────────────
 
53
  super().__init__()
54
  self.cfg = model_cfg
55
 
56
+ # ── 1. Image Encoder (rad_dino / biovilt / vit, frozen) ─────────────
57
+ # `backend` defaults to "auto" → tries rad_dino → biovilt → vit and
58
+ # uses the first that loads (see model/image_encoder.py docstring).
59
+ _enc_backend = getattr(model_cfg.image_encoder, "backend", "auto")
60
  self.image_encoder = BioViLTEncoder(
61
  frozen = model_cfg.image_encoder.frozen,
62
  img_size = model_cfg.image_encoder.img_size,
63
+ backend = _enc_backend,
64
  )
65
 
66
  # ── 2. MLP Projection (trained) ──────────────────────────────────────
model/image_encoder.py CHANGED
@@ -1,16 +1,24 @@
1
  """
2
  image_encoder.py
3
  ----------------
4
- Vision encoder wrapper.
5
 
6
- Originally this used BioViL-T via Microsoft's `hi-ml-multimodal` package.
7
- That package has not been updated for Python 3.12 and fails to install on
8
- recent Kaggle images, so we fall back to a ViT-B/16 from `timm` (ImageNet
9
- pretrained, 768-dim patch features — same output shape contract as BioViL-T,
10
- so the rest of the model is unchanged).
 
 
 
 
11
 
12
- If you want real BioViL-T weights, install `hi-ml-multimodal` in a Python 3.10
13
- environment and set `backend="biovilt"` in the constructor.
 
 
 
 
14
  """
15
 
16
  import torch
@@ -23,6 +31,12 @@ try:
23
  except ImportError:
24
  TIMM_AVAILABLE = False
25
 
 
 
 
 
 
 
26
  try:
27
  from health_multimodal.image import get_biovil_t_image_encoder
28
  from health_multimodal.image.data.transforms import create_chest_xray_transform_for_inference
@@ -31,12 +45,17 @@ except ImportError:
31
  BIOVIL_AVAILABLE = False
32
 
33
 
 
 
 
34
  class BioViLTEncoder(nn.Module):
35
  """
36
- Image encoder. Name kept for backward compatibility; actual backbone
37
- depends on `backend`:
38
- - "biovilt": microsoft BioViL-T (needs hi-ml-multimodal)
39
- - "vit": timm ViT-B/16 ImageNet pretrained ← default fallback
 
 
40
 
41
  Output contract: (B, num_patches, 768)
42
  """
@@ -44,51 +63,115 @@ class BioViLTEncoder(nn.Module):
44
  PATCH_DIM = 768
45
  IMG_SIZE = 448
46
 
 
 
 
 
 
 
 
47
  def __init__(
48
  self,
49
  frozen: bool = True,
50
  img_size: int = 448,
51
- backend: str = "auto", # "auto" | "biovilt" | "vit"
52
  device: Optional[str] = None,
53
  ):
54
  super().__init__()
55
- self.img_size = img_size
56
- self.frozen = frozen
57
-
58
- if backend == "auto":
59
- backend = "biovilt" if BIOVIL_AVAILABLE else "vit"
60
- self.backend = backend
61
-
62
- if backend == "biovilt":
63
- if not BIOVIL_AVAILABLE:
64
- raise ImportError("hi-ml-multimodal not installed; choose backend='vit'")
65
- self.encoder = get_biovil_t_image_encoder()
66
- elif backend == "vit":
67
- if not TIMM_AVAILABLE:
68
- raise ImportError("timm is required for vit backend. pip install timm")
69
- # ViT-B/16 — 768-dim patch features. img_size overridden to 224 (ViT's native);
70
- # the image transform handles resize, so downstream code doesn't change.
71
- self.encoder = timm.create_model(
72
- "vit_base_patch16_224",
73
- pretrained = True,
74
- num_classes = 0, # drop classifier head
75
- global_pool = "", # keep patch tokens
 
 
 
 
 
 
76
  )
77
- # ViT native input size is 224 — override our own stored img_size
78
- self.img_size = 224
79
- else:
80
- raise ValueError(f"Unknown backend: {backend}")
81
 
82
  if frozen:
83
  self._freeze()
84
 
85
  print(f"[BioViLTEncoder] backend={self.backend} frozen={frozen} img_size={self.img_size}")
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  def _freeze(self):
88
  for p in self.encoder.parameters():
89
  p.requires_grad = False
90
  self.encoder.eval()
91
 
 
 
 
 
92
  def forward(self, images: torch.Tensor) -> torch.Tensor:
93
  """
94
  Args:
@@ -98,38 +181,85 @@ class BioViLTEncoder(nn.Module):
98
  """
99
  ctx = torch.no_grad() if self.frozen else torch.enable_grad()
100
  with ctx:
101
- if self.backend == "biovilt":
 
 
 
 
 
 
 
 
102
  out = self.encoder(images)
103
  feats = out.patch_embedding # (B, 768, H', W')
104
  B, C, H, W = feats.shape
105
  feats = feats.flatten(2).transpose(1, 2) # (B, H'*W', 768)
 
106
  else: # vit
107
  # timm ViT with num_classes=0, global_pool="" returns (B, N+1, 768)
108
  # where token 0 is [CLS]. Drop it.
109
- feats = self.encoder.forward_features(images) # (B, N+1, 768)
110
  if feats.ndim == 3 and feats.shape[1] > 1:
111
- feats = feats[:, 1:, :] # drop CLS
 
112
  return feats
113
 
 
 
 
 
114
  @staticmethod
115
- def get_transform(split: str = "train"):
116
  """
117
- Return an image transform. Uses BioViL-T's transform if available,
118
- otherwise a generic ViT-compatible transform.
 
 
 
 
 
 
119
  """
120
- if BIOVIL_AVAILABLE:
121
- return create_chest_xray_transform_for_inference(
122
- width = BioViLTEncoder.IMG_SIZE,
123
- height = BioViLTEncoder.IMG_SIZE,
124
- )
125
- # Fallback ViT transform — 224×224, ImageNet norm
126
- from torchvision import transforms
127
- return transforms.Compose([
128
- transforms.Resize((224, 224)),
129
- transforms.ToTensor(),
130
- transforms.Normalize(mean=[0.485, 0.456, 0.406],
131
- std=[0.229, 0.224, 0.225]),
132
- ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  @property
135
  def output_dim(self) -> int:
 
1
  """
2
  image_encoder.py
3
  ----------------
4
+ Vision encoder wrapper. Tries multiple backends in this priority order:
5
 
6
+ 1. "rad_dino" — Microsoft RAD-DINO (microsoft/rad-dino, HF Hub).
7
+ Chest-X-ray self-supervised DINOv2. Loaded via the
8
+ `transformers` library, so works on Python 3.12.
9
+ Recommended for CXR.
10
+ 2. "biovilt" — Microsoft BioViL-T via `hi-ml-multimodal`.
11
+ Original choice; package requires Python <3.11 so
12
+ it doesn't install on recent Colab/Kaggle images.
13
+ 3. "vit" — timm ViT-B/16 ImageNet-pretrained.
14
+ Generic fallback (not domain-pretrained).
15
 
16
+ All backends output (B, num_patches, 768) — the MLP Projection layer and
17
+ downstream cross-attention pool that to 32 visual tokens regardless of
18
+ num_patches, so swapping backends does not break anything else.
19
+
20
+ To force a specific backend, set `image_encoder.backend` in
21
+ `configs/model_config.yaml` to one of the names above.
22
  """
23
 
24
  import torch
 
31
  except ImportError:
32
  TIMM_AVAILABLE = False
33
 
34
+ try:
35
+ from transformers import AutoModel, AutoImageProcessor
36
+ HF_TRANSFORMERS_AVAILABLE = True
37
+ except ImportError:
38
+ HF_TRANSFORMERS_AVAILABLE = False
39
+
40
  try:
41
  from health_multimodal.image import get_biovil_t_image_encoder
42
  from health_multimodal.image.data.transforms import create_chest_xray_transform_for_inference
 
45
  BIOVIL_AVAILABLE = False
46
 
47
 
48
+ RAD_DINO_ID = "microsoft/rad-dino"
49
+
50
+
51
  class BioViLTEncoder(nn.Module):
52
  """
53
+ Image encoder. Name kept for backward compatibility with existing
54
+ checkpoints; actual backbone depends on `backend`:
55
+ - "auto": try rad_dino → biovilt → vit, first that loads wins
56
+ - "rad_dino": Microsoft RAD-DINO (HF Hub) — recommended for CXR
57
+ - "biovilt": Microsoft BioViL-T (hi-ml-multimodal)
58
+ - "vit": timm ViT-B/16 ImageNet pretrained
59
 
60
  Output contract: (B, num_patches, 768)
61
  """
 
63
  PATCH_DIM = 768
64
  IMG_SIZE = 448
65
 
66
+ # Native input size per backend (used when caller passes img_size=None)
67
+ _DEFAULT_SIZE = {
68
+ "rad_dino": 518, # RAD-DINO trained at 518×518 (patch 14)
69
+ "biovilt": 448, # BioViL-T trained at 448×448
70
+ "vit": 224, # ViT-B/16 native 224×224 (patch 16)
71
+ }
72
+
73
  def __init__(
74
  self,
75
  frozen: bool = True,
76
  img_size: int = 448,
77
+ backend: str = "auto", # "auto" | "rad_dino" | "biovilt" | "vit"
78
  device: Optional[str] = None,
79
  ):
80
  super().__init__()
81
+ self.frozen = frozen
82
+ # `img_size` may be overridden by the chosen backend's native size if
83
+ # the caller didn't pass anything specific.
84
+ self._requested_img_size = img_size
85
+
86
+ # ── Resolve backend ──────────────────────────────────────────────
87
+ # "auto" tries each candidate in priority order and uses the first
88
+ # one that successfully loads. Per-backend load failures are caught
89
+ # and logged so a missing dependency on one path doesn't kill the
90
+ # run; only if EVERY backend fails do we raise.
91
+ candidates = (
92
+ ("rad_dino", "biovilt", "vit") if backend == "auto" else (backend,)
93
+ )
94
+
95
+ last_error = None
96
+ chosen = None
97
+ for cand in candidates:
98
+ ok, err = self._try_load_backend(cand, img_size)
99
+ if ok:
100
+ chosen = cand
101
+ break
102
+ last_error = err
103
+ print(f"[BioViLTEncoder] backend '{cand}' unavailable: {err}")
104
+
105
+ if chosen is None:
106
+ raise RuntimeError(
107
+ f"No image encoder backend could be loaded. Last error: {last_error}"
108
  )
109
+ self.backend = chosen
 
 
 
110
 
111
  if frozen:
112
  self._freeze()
113
 
114
  print(f"[BioViLTEncoder] backend={self.backend} frozen={frozen} img_size={self.img_size}")
115
 
116
+ # ────────────────────────────────────────────────────────────────────
117
+ # Backend loading
118
+ # ────────────────────────────────────────────────────────────────────
119
+
120
+ def _try_load_backend(self, backend: str, img_size_hint: Optional[int]):
121
+ """
122
+ Try to load `backend`. Returns (success: bool, error_or_None).
123
+ On success, sets self.encoder and self.img_size.
124
+ """
125
+ try:
126
+ if backend == "rad_dino":
127
+ if not HF_TRANSFORMERS_AVAILABLE:
128
+ return False, "transformers not installed"
129
+ # Load via HF transformers — works on any Python version
130
+ # that runs `transformers`. Weights download from HF Hub on
131
+ # first use (~340MB, cached afterwards).
132
+ self.encoder = AutoModel.from_pretrained(RAD_DINO_ID)
133
+ # img_size_hint=448 will be honoured if user set it; otherwise
134
+ # default to 518 (RAD-DINO's native training resolution).
135
+ self.img_size = img_size_hint or self._DEFAULT_SIZE["rad_dino"]
136
+ return True, None
137
+
138
+ elif backend == "biovilt":
139
+ if not BIOVIL_AVAILABLE:
140
+ return False, "hi-ml-multimodal not installed"
141
+ self.encoder = get_biovil_t_image_encoder()
142
+ self.img_size = img_size_hint or self._DEFAULT_SIZE["biovilt"]
143
+ return True, None
144
+
145
+ elif backend == "vit":
146
+ if not TIMM_AVAILABLE:
147
+ return False, "timm not installed"
148
+ self.encoder = timm.create_model(
149
+ "vit_base_patch16_224",
150
+ pretrained = True,
151
+ num_classes = 0, # drop classifier head
152
+ global_pool = "", # keep patch tokens
153
+ )
154
+ # ViT-B/16 is locked to 224 by its position embeddings
155
+ self.img_size = self._DEFAULT_SIZE["vit"]
156
+ return True, None
157
+
158
+ else:
159
+ return False, f"unknown backend name: {backend!r}"
160
+
161
+ except Exception as e:
162
+ # AutoModel.from_pretrained may fail on network / auth / disk.
163
+ # Treat it as "backend unavailable" so auto-fallback can proceed.
164
+ return False, f"{type(e).__name__}: {e}"
165
+
166
  def _freeze(self):
167
  for p in self.encoder.parameters():
168
  p.requires_grad = False
169
  self.encoder.eval()
170
 
171
+ # ────────────────────────────────────────────────────────────────────
172
+ # Forward
173
+ # ────────────────────────────────────────────────────────────────────
174
+
175
  def forward(self, images: torch.Tensor) -> torch.Tensor:
176
  """
177
  Args:
 
181
  """
182
  ctx = torch.no_grad() if self.frozen else torch.enable_grad()
183
  with ctx:
184
+ if self.backend == "rad_dino":
185
+ # HF AutoModel returns BaseModelOutput; last_hidden_state has
186
+ # shape (B, N+1, 768) where token 0 is the CLS — drop it.
187
+ out = self.encoder(pixel_values=images)
188
+ feats = out.last_hidden_state
189
+ if feats.ndim == 3 and feats.shape[1] > 1:
190
+ feats = feats[:, 1:, :]
191
+
192
+ elif self.backend == "biovilt":
193
  out = self.encoder(images)
194
  feats = out.patch_embedding # (B, 768, H', W')
195
  B, C, H, W = feats.shape
196
  feats = feats.flatten(2).transpose(1, 2) # (B, H'*W', 768)
197
+
198
  else: # vit
199
  # timm ViT with num_classes=0, global_pool="" returns (B, N+1, 768)
200
  # where token 0 is [CLS]. Drop it.
201
+ feats = self.encoder.forward_features(images)
202
  if feats.ndim == 3 and feats.shape[1] > 1:
203
+ feats = feats[:, 1:, :]
204
+
205
  return feats
206
 
207
+ # ────────────────────────────────────────────────────────────────────
208
+ # Image transform (preprocessing)
209
+ # ────────────────────────────────────────────────────────────────────
210
+
211
  @staticmethod
212
+ def get_transform(split: str = "train", backend: str = "auto"):
213
  """
214
+ Return an image transform that matches the chosen backend's expected
215
+ normalization and input size.
216
+
217
+ Priority is the same as backend selection (rad_dino → biovilt → vit).
218
+ `backend="auto"` picks whichever transform we can construct; pass an
219
+ explicit backend name to force one.
220
+
221
+ The returned object is callable: `transform(pil_image) -> tensor`.
222
  """
223
+ candidates = (
224
+ ("rad_dino", "biovilt", "vit") if backend == "auto" else (backend,)
225
+ )
226
+
227
+ for cand in candidates:
228
+ try:
229
+ if cand == "rad_dino" and HF_TRANSFORMERS_AVAILABLE:
230
+ # RAD-DINO ships its own preprocessor (correct chest-X-ray
231
+ # specific mean/std, native 518×518 resize, RGB channels).
232
+ proc = AutoImageProcessor.from_pretrained(RAD_DINO_ID)
233
+
234
+ def _rad_dino_transform(pil_img):
235
+ return proc(images=pil_img, return_tensors="pt")["pixel_values"][0]
236
+
237
+ return _rad_dino_transform
238
+
239
+ if cand == "biovilt" and BIOVIL_AVAILABLE:
240
+ return create_chest_xray_transform_for_inference(
241
+ width = BioViLTEncoder._DEFAULT_SIZE["biovilt"],
242
+ height = BioViLTEncoder._DEFAULT_SIZE["biovilt"],
243
+ )
244
+
245
+ if cand == "vit":
246
+ from torchvision import transforms
247
+ size = BioViLTEncoder._DEFAULT_SIZE["vit"]
248
+ return transforms.Compose([
249
+ transforms.Resize((size, size)),
250
+ transforms.ToTensor(),
251
+ transforms.Normalize(mean=[0.485, 0.456, 0.406],
252
+ std=[0.229, 0.224, 0.225]),
253
+ ])
254
+
255
+ except Exception as e:
256
+ print(f"[BioViLTEncoder.get_transform] '{cand}' transform "
257
+ f"unavailable: {type(e).__name__}: {e}")
258
+ continue
259
+
260
+ raise RuntimeError(
261
+ "No image transform could be constructed (rad_dino/biovilt/vit all failed)"
262
+ )
263
 
264
  @property
265
  def output_dim(self) -> int: