convitom commited on
Commit ·
78b85ff
1
Parent(s): 9dadb47
chore: ignore .claude worktrees
Browse files- .claude/worktrees/strange-agnesi-73641a +1 -0
- .gitignore +0 -0
- configs/model_config.yaml +10 -3
- data/eda_full.ipynb +874 -0
- data/eda_p18.ipynb +797 -0
- data/eda_reports.ipynb +741 -0
- model/cxr_vlm.py +5 -1
- model/image_encoder.py +187 -57
.claude/worktrees/strange-agnesi-73641a
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Subproject commit 9dadb472ab6ab5dee7a656bf525b249a605a68ff
|
.gitignore
CHANGED
|
Binary files a/.gitignore and b/.gitignore differ
|
|
|
configs/model_config.yaml
CHANGED
|
@@ -3,11 +3,18 @@
|
|
| 3 |
# ─────────────────────────────────────────────
|
| 4 |
|
| 5 |
# ── Vision Encoder ──────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
image_encoder:
|
| 7 |
-
name: "microsoft/
|
|
|
|
| 8 |
frozen: true # freeze encoder during training
|
| 9 |
-
img_size: 448 # input image resolution
|
| 10 |
-
output_dim: 768 # patch feature dimension
|
| 11 |
|
| 12 |
# ── MLP Projection (Alignment Layer) ────────
|
| 13 |
projection:
|
|
|
|
| 3 |
# ─────────────────────────────────────────────
|
| 4 |
|
| 5 |
# ── Vision Encoder ──────────────────────────
|
| 6 |
+
# `backend` chooses the underlying model. "auto" tries rad_dino → biovilt → vit
|
| 7 |
+
# in priority order and uses the first one that loads.
|
| 8 |
+
# - rad_dino : microsoft/rad-dino, chest-X-ray DINOv2 (HF transformers).
|
| 9 |
+
# Works on Python 3.12, recommended for CXR.
|
| 10 |
+
# - biovilt : Microsoft BioViL-T (needs hi-ml-multimodal, Python <3.11).
|
| 11 |
+
# - vit : timm ViT-B/16 ImageNet — generic fallback if above fail.
|
| 12 |
image_encoder:
|
| 13 |
+
name: "microsoft/rad-dino" # informational; backend below drives loading
|
| 14 |
+
backend: "auto" # "auto" | "rad_dino" | "biovilt" | "vit"
|
| 15 |
frozen: true # freeze encoder during training
|
| 16 |
+
img_size: 448 # input image resolution (RAD-DINO native is 518)
|
| 17 |
+
output_dim: 768 # patch feature dimension (768 for all backends)
|
| 18 |
|
| 19 |
# ── MLP Projection (Alignment Layer) ────────
|
| 20 |
projection:
|
data/eda_full.ipynb
ADDED
|
@@ -0,0 +1,874 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# EDA — MIMIC-CXR Full Dataset\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"**Datasets used:**\n",
|
| 10 |
+
"- `MIMIC-CXR-JPG` (v2.1.0) — ảnh JPG + CSV metadata\n",
|
| 11 |
+
"- `MIMIC-CXR` (v2.1.0) — report `.txt` (Findings / Impression)\n",
|
| 12 |
+
"- `MIMIC-Ext-MIMIC-CXR-VQA` (v1.0.0) — câu hỏi/đáp VQA\n",
|
| 13 |
+
"\n",
|
| 14 |
+
"**Scope:** toàn bộ dataset (tất cả subset p10–p19).\n",
|
| 15 |
+
"\n",
|
| 16 |
+
"> ℹ️ **Không cần tải ảnh JPG** để chạy notebook này — toàn bộ EDA dựa trên CSV, .txt reports và .json VQA."
|
| 17 |
+
]
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"cell_type": "markdown",
|
| 21 |
+
"metadata": {},
|
| 22 |
+
"source": [
|
| 23 |
+
"## 0. Cấu hình đường dẫn"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"cell_type": "code",
|
| 28 |
+
"execution_count": null,
|
| 29 |
+
"metadata": {},
|
| 30 |
+
"outputs": [],
|
| 31 |
+
"source": "from pathlib import Path\n\nDATA_DIR = Path(r\"D:\\USTH\\KLTN\\cxr-vlm-data\")\nCXR_ROOT = DATA_DIR / \"mimic-cxr-reports\" # files/p10…p19/pXXXXXX/sYYYYYY.txt — toàn bộ dataset\n\nSPLIT_CSV = DATA_DIR / \"mimic-cxr-2.0.0-split.csv\"\nMETA_CSV = DATA_DIR / \"mimic-cxr-2.0.0-metadata.csv\"\nCHEXPERT_CSV = DATA_DIR / \"mimic-cxr-2.0.0-chexpert.csv\"\n\n_VQA_DIR = (DATA_DIR\n / \"mimic-ext-mimic-cxr-vqa-a-complex-diverse-and-large-scale-visual-question-answering-dataset-for-chest-x-ray-images-1.0.0\"\n / \"MIMIC-Ext-MIMIC-CXR-VQA\"\n / \"dataset\")\nVQA_TRAIN = _VQA_DIR / \"train.json\"\nVQA_VALID = _VQA_DIR / \"valid.json\"\nVQA_TEST = _VQA_DIR / \"test.json\"\n\n# None = parse hết toàn bộ (~227k studies, mất 10-20 phút)\n# Số nguyên = sample ngẫu nhiên để chạy nhanh\nREPORT_SAMPLE_SIZE = 10000\n\n# Kiểm tra nhanh\nfor name, p in [(\"SPLIT_CSV\", SPLIT_CSV),\n (\"META_CSV\", META_CSV),\n (\"CHEXPERT_CSV\", CHEXPERT_CSV),\n (\"CXR_ROOT\", CXR_ROOT),\n (\"VQA_TRAIN\", VQA_TRAIN)]:\n status = \"✓\" if p.exists() else \"✗ NOT FOUND\"\n print(f\" {status} {name}: {p}\")\n\nprint(\"\\nPaths configured.\")"
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"cell_type": "code",
|
| 35 |
+
"execution_count": null,
|
| 36 |
+
"metadata": {},
|
| 37 |
+
"outputs": [],
|
| 38 |
+
"source": [
|
| 39 |
+
"import pandas as pd\n",
|
| 40 |
+
"import numpy as np\n",
|
| 41 |
+
"import json\n",
|
| 42 |
+
"import re\n",
|
| 43 |
+
"import matplotlib.pyplot as plt\n",
|
| 44 |
+
"import matplotlib.ticker as mticker\n",
|
| 45 |
+
"import seaborn as sns\n",
|
| 46 |
+
"from collections import Counter\n",
|
| 47 |
+
"\n",
|
| 48 |
+
"sns.set_theme(style=\"whitegrid\", palette=\"muted\")\n",
|
| 49 |
+
"plt.rcParams[\"figure.dpi\"] = 120\n",
|
| 50 |
+
"plt.rcParams[\"figure.figsize\"] = (11, 4)\n",
|
| 51 |
+
"\n",
|
| 52 |
+
"CHEXPERT_LABELS = [\n",
|
| 53 |
+
" \"Atelectasis\", \"Cardiomegaly\", \"Consolidation\", \"Edema\",\n",
|
| 54 |
+
" \"Enlarged Cardiomediastinum\", \"Fracture\", \"Lung Lesion\",\n",
|
| 55 |
+
" \"Lung Opacity\", \"No Finding\", \"Pleural Effusion\",\n",
|
| 56 |
+
" \"Pleural Other\", \"Pneumonia\", \"Pneumothorax\", \"Support Devices\"\n",
|
| 57 |
+
"]\n",
|
| 58 |
+
"\n",
|
| 59 |
+
"# Subset folders p10–p19\n",
|
| 60 |
+
"ALL_SUBSETS = [f\"p{i}\" for i in range(10, 20)]\n",
|
| 61 |
+
"\n",
|
| 62 |
+
"print(\"Libraries imported.\")"
|
| 63 |
+
]
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"cell_type": "markdown",
|
| 67 |
+
"metadata": {},
|
| 68 |
+
"source": [
|
| 69 |
+
"## 1. Load CSV files"
|
| 70 |
+
]
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"cell_type": "code",
|
| 74 |
+
"execution_count": null,
|
| 75 |
+
"metadata": {},
|
| 76 |
+
"outputs": [],
|
| 77 |
+
"source": [
|
| 78 |
+
"split_df = pd.read_csv(SPLIT_CSV)\n",
|
| 79 |
+
"meta_df = pd.read_csv(META_CSV)\n",
|
| 80 |
+
"chexpert_df = pd.read_csv(CHEXPERT_CSV)\n",
|
| 81 |
+
"\n",
|
| 82 |
+
"# Tạo cột subset folder (p10, p11, ..., p19)\n",
|
| 83 |
+
"def get_subset(subject_id):\n",
|
| 84 |
+
" return \"p\" + str(subject_id)[:2]\n",
|
| 85 |
+
"\n",
|
| 86 |
+
"for df_ in [split_df, meta_df, chexpert_df]:\n",
|
| 87 |
+
" df_[\"subset\"] = df_[\"subject_id\"].astype(str).str[:2].apply(lambda x: f\"p{x}\")\n",
|
| 88 |
+
"\n",
|
| 89 |
+
"print(f\"split.csv — total images : {len(split_df):,}\")\n",
|
| 90 |
+
"print(f\"metadata — total images : {len(meta_df):,}\")\n",
|
| 91 |
+
"print(f\"chexpert — total studies : {len(chexpert_df):,}\")\n",
|
| 92 |
+
"print(f\"\\nSubsets found in split.csv:\")\n",
|
| 93 |
+
"print(split_df[\"subset\"].value_counts().sort_index().to_string())"
|
| 94 |
+
]
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"cell_type": "code",
|
| 98 |
+
"execution_count": null,
|
| 99 |
+
"metadata": {},
|
| 100 |
+
"outputs": [],
|
| 101 |
+
"source": [
|
| 102 |
+
"# Merge split + metadata\n",
|
| 103 |
+
"df = split_df.merge(\n",
|
| 104 |
+
" meta_df[[\"dicom_id\", \"ViewPosition\", \"Rows\", \"Columns\"]],\n",
|
| 105 |
+
" on=\"dicom_id\", how=\"left\"\n",
|
| 106 |
+
")\n",
|
| 107 |
+
"# Giữ lại cột subset từ split_df\n",
|
| 108 |
+
"if \"subset_y\" in df.columns:\n",
|
| 109 |
+
" df = df.drop(columns=[\"subset_y\"]).rename(columns={\"subset_x\": \"subset\"})\n",
|
| 110 |
+
"\n",
|
| 111 |
+
"print(f\"Merged shape: {df.shape}\")\n",
|
| 112 |
+
"df.head(3)"
|
| 113 |
+
]
|
| 114 |
+
},
|
| 115 |
+
{
|
| 116 |
+
"cell_type": "markdown",
|
| 117 |
+
"metadata": {},
|
| 118 |
+
"source": [
|
| 119 |
+
"## 2. Tổng quan: số ảnh & report theo split + subset folder"
|
| 120 |
+
]
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"cell_type": "code",
|
| 124 |
+
"execution_count": null,
|
| 125 |
+
"metadata": {},
|
| 126 |
+
"outputs": [],
|
| 127 |
+
"source": [
|
| 128 |
+
"# Tổng theo split\n",
|
| 129 |
+
"img_per_split = df[\"split\"].value_counts().reindex([\"train\",\"validate\",\"test\"])\n",
|
| 130 |
+
"study_per_split = (\n",
|
| 131 |
+
" df.drop_duplicates(\"study_id\")[\"split\"]\n",
|
| 132 |
+
" .value_counts().reindex([\"train\",\"validate\",\"test\"])\n",
|
| 133 |
+
")\n",
|
| 134 |
+
"\n",
|
| 135 |
+
"summary_total = pd.DataFrame({\n",
|
| 136 |
+
" \"Images\": img_per_split,\n",
|
| 137 |
+
" \"Studies/Reports\": study_per_split\n",
|
| 138 |
+
"})\n",
|
| 139 |
+
"summary_total.loc[\"TOTAL\"] = summary_total.sum()\n",
|
| 140 |
+
"print(\"=== Overall split summary ===\")\n",
|
| 141 |
+
"print(summary_total.to_string())"
|
| 142 |
+
]
|
| 143 |
+
},
|
| 144 |
+
{
|
| 145 |
+
"cell_type": "code",
|
| 146 |
+
"execution_count": null,
|
| 147 |
+
"metadata": {},
|
| 148 |
+
"outputs": [],
|
| 149 |
+
"source": [
|
| 150 |
+
"# ── Breakdown theo từng subset folder ────────────────────────────────────────\n",
|
| 151 |
+
"img_subset_split = (\n",
|
| 152 |
+
" df.groupby([\"subset\", \"split\"])[\"dicom_id\"]\n",
|
| 153 |
+
" .count()\n",
|
| 154 |
+
" .unstack(fill_value=0)\n",
|
| 155 |
+
" .reindex(columns=[\"train\",\"validate\",\"test\"], fill_value=0)\n",
|
| 156 |
+
" .reindex(ALL_SUBSETS, fill_value=0)\n",
|
| 157 |
+
")\n",
|
| 158 |
+
"img_subset_split[\"TOTAL\"] = img_subset_split.sum(axis=1)\n",
|
| 159 |
+
"\n",
|
| 160 |
+
"study_subset_split = (\n",
|
| 161 |
+
" df.drop_duplicates(\"study_id\")\n",
|
| 162 |
+
" .groupby([\"subset\", \"split\"])[\"study_id\"]\n",
|
| 163 |
+
" .count()\n",
|
| 164 |
+
" .unstack(fill_value=0)\n",
|
| 165 |
+
" .reindex(columns=[\"train\",\"validate\",\"test\"], fill_value=0)\n",
|
| 166 |
+
" .reindex(ALL_SUBSETS, fill_value=0)\n",
|
| 167 |
+
")\n",
|
| 168 |
+
"study_subset_split[\"TOTAL\"] = study_subset_split.sum(axis=1)\n",
|
| 169 |
+
"\n",
|
| 170 |
+
"print(\"=== Images per subset × split ===\")\n",
|
| 171 |
+
"print(img_subset_split.to_string())\n",
|
| 172 |
+
"print(\"\\n=== Studies/Reports per subset × split ===\")\n",
|
| 173 |
+
"print(study_subset_split.to_string())"
|
| 174 |
+
]
|
| 175 |
+
},
|
| 176 |
+
{
|
| 177 |
+
"cell_type": "code",
|
| 178 |
+
"execution_count": null,
|
| 179 |
+
"metadata": {},
|
| 180 |
+
"outputs": [],
|
| 181 |
+
"source": [
|
| 182 |
+
"fig, axes = plt.subplots(1, 2, figsize=(15, 5))\n",
|
| 183 |
+
"palette = {\"train\": \"#4C72B0\", \"validate\": \"#DD8452\", \"test\": \"#55A868\"}\n",
|
| 184 |
+
"\n",
|
| 185 |
+
"for ax, data, title in zip(\n",
|
| 186 |
+
" axes,\n",
|
| 187 |
+
" [img_subset_split[[\"train\",\"validate\",\"test\"]], study_subset_split[[\"train\",\"validate\",\"test\"]]],\n",
|
| 188 |
+
" [\"Số ảnh theo subset × split\", \"Số study/report theo subset × split\"]\n",
|
| 189 |
+
"):\n",
|
| 190 |
+
" data.plot(kind=\"bar\", ax=ax, color=[palette[c] for c in data.columns], width=0.75)\n",
|
| 191 |
+
" ax.set_title(title, fontsize=12)\n",
|
| 192 |
+
" ax.set_xlabel(\"Subset folder\")\n",
|
| 193 |
+
" ax.set_ylabel(\"Count\")\n",
|
| 194 |
+
" ax.tick_params(axis=\"x\", rotation=0)\n",
|
| 195 |
+
" ax.legend(title=\"Split\")\n",
|
| 196 |
+
"\n",
|
| 197 |
+
"plt.suptitle(\"MIMIC-CXR Full Dataset — Split × Subset\", fontsize=14, y=1.02)\n",
|
| 198 |
+
"plt.tight_layout()\n",
|
| 199 |
+
"plt.show()"
|
| 200 |
+
]
|
| 201 |
+
},
|
| 202 |
+
{
|
| 203 |
+
"cell_type": "code",
|
| 204 |
+
"execution_count": null,
|
| 205 |
+
"metadata": {},
|
| 206 |
+
"outputs": [],
|
| 207 |
+
"source": [
|
| 208 |
+
"# Heatmap: tỉ lệ % train/val/test trong mỗi subset\n",
|
| 209 |
+
"img_pct = img_subset_split[[\"train\",\"validate\",\"test\"]].div(\n",
|
| 210 |
+
" img_subset_split[\"TOTAL\"], axis=0\n",
|
| 211 |
+
") * 100\n",
|
| 212 |
+
"\n",
|
| 213 |
+
"fig, ax = plt.subplots(figsize=(8, 5))\n",
|
| 214 |
+
"sns.heatmap(\n",
|
| 215 |
+
" img_pct.round(1), annot=True, fmt=\".1f\", cmap=\"YlGnBu\",\n",
|
| 216 |
+
" linewidths=0.5, ax=ax, cbar_kws={\"label\": \"%\"}\n",
|
| 217 |
+
")\n",
|
| 218 |
+
"ax.set_title(\"Tỉ lệ (%) train/val/test trong mỗi subset folder\")\n",
|
| 219 |
+
"ax.set_xlabel(\"Split\")\n",
|
| 220 |
+
"ax.set_ylabel(\"Subset\")\n",
|
| 221 |
+
"plt.tight_layout()\n",
|
| 222 |
+
"plt.show()"
|
| 223 |
+
]
|
| 224 |
+
},
|
| 225 |
+
{
|
| 226 |
+
"cell_type": "markdown",
|
| 227 |
+
"metadata": {},
|
| 228 |
+
"source": [
|
| 229 |
+
"## 3. Số ảnh mỗi study"
|
| 230 |
+
]
|
| 231 |
+
},
|
| 232 |
+
{
|
| 233 |
+
"cell_type": "code",
|
| 234 |
+
"execution_count": null,
|
| 235 |
+
"metadata": {},
|
| 236 |
+
"outputs": [],
|
| 237 |
+
"source": [
|
| 238 |
+
"imgs_per_study = df.groupby(\"study_id\")[\"dicom_id\"].count()\n",
|
| 239 |
+
"count_dist = imgs_per_study.value_counts().sort_index()\n",
|
| 240 |
+
"\n",
|
| 241 |
+
"print(\"Images per study distribution:\")\n",
|
| 242 |
+
"print(count_dist.to_string())\n",
|
| 243 |
+
"print(f\"\\nMax : {imgs_per_study.max()}\")\n",
|
| 244 |
+
"print(f\"Mean: {imgs_per_study.mean():.2f}\")"
|
| 245 |
+
]
|
| 246 |
+
},
|
| 247 |
+
{
|
| 248 |
+
"cell_type": "code",
|
| 249 |
+
"execution_count": null,
|
| 250 |
+
"metadata": {},
|
| 251 |
+
"outputs": [],
|
| 252 |
+
"source": [
|
| 253 |
+
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
|
| 254 |
+
"\n",
|
| 255 |
+
"# Full distribution\n",
|
| 256 |
+
"axes[0].bar(count_dist.index.astype(str), count_dist.values,\n",
|
| 257 |
+
" color=sns.color_palette(\"Blues_d\", len(count_dist)))\n",
|
| 258 |
+
"axes[0].set_title(\"Số ảnh mỗi study (toàn bộ)\")\n",
|
| 259 |
+
"axes[0].set_xlabel(\"Số ảnh trong study\")\n",
|
| 260 |
+
"axes[0].set_ylabel(\"Số study\")\n",
|
| 261 |
+
"for x, v in zip(count_dist.index, count_dist.values):\n",
|
| 262 |
+
" axes[0].text(str(x), v * 1.01, f\"{v:,}\", ha=\"center\", va=\"bottom\", fontsize=8)\n",
|
| 263 |
+
"\n",
|
| 264 |
+
"# Per-subset: mean images per study\n",
|
| 265 |
+
"mean_imgs = df.groupby(\"subset\").apply(\n",
|
| 266 |
+
" lambda g: g.groupby(\"study_id\")[\"dicom_id\"].count().mean()\n",
|
| 267 |
+
").reindex(ALL_SUBSETS)\n",
|
| 268 |
+
"axes[1].bar(mean_imgs.index, mean_imgs.values, color=\"steelblue\")\n",
|
| 269 |
+
"axes[1].set_title(\"Trung bình số ảnh/study theo subset\")\n",
|
| 270 |
+
"axes[1].set_xlabel(\"Subset\")\n",
|
| 271 |
+
"axes[1].set_ylabel(\"Mean images/study\")\n",
|
| 272 |
+
"axes[1].set_ylim(0, mean_imgs.max() * 1.2)\n",
|
| 273 |
+
"for x, v in zip(mean_imgs.index, mean_imgs.values):\n",
|
| 274 |
+
" axes[1].text(x, v * 1.01, f\"{v:.2f}\", ha=\"center\", va=\"bottom\", fontsize=9)\n",
|
| 275 |
+
"\n",
|
| 276 |
+
"plt.suptitle(\"Images per Study Distribution\", fontsize=13)\n",
|
| 277 |
+
"plt.tight_layout()\n",
|
| 278 |
+
"plt.show()"
|
| 279 |
+
]
|
| 280 |
+
},
|
| 281 |
+
{
|
| 282 |
+
"cell_type": "markdown",
|
| 283 |
+
"metadata": {},
|
| 284 |
+
"source": [
|
| 285 |
+
"## 4. View Position"
|
| 286 |
+
]
|
| 287 |
+
},
|
| 288 |
+
{
|
| 289 |
+
"cell_type": "code",
|
| 290 |
+
"execution_count": null,
|
| 291 |
+
"metadata": {},
|
| 292 |
+
"outputs": [],
|
| 293 |
+
"source": [
|
| 294 |
+
"view_counts = df[\"ViewPosition\"].fillna(\"Unknown\").value_counts()\n",
|
| 295 |
+
"print(\"View position counts (total):\")\n",
|
| 296 |
+
"print(view_counts.to_string())"
|
| 297 |
+
]
|
| 298 |
+
},
|
| 299 |
+
{
|
| 300 |
+
"cell_type": "code",
|
| 301 |
+
"execution_count": null,
|
| 302 |
+
"metadata": {},
|
| 303 |
+
"outputs": [],
|
| 304 |
+
"source": [
|
| 305 |
+
"fig, axes = plt.subplots(1, 2, figsize=(13, 4))\n",
|
| 306 |
+
"\n",
|
| 307 |
+
"bars = axes[0].bar(view_counts.index, view_counts.values,\n",
|
| 308 |
+
" color=sns.color_palette(\"Set2\", len(view_counts)))\n",
|
| 309 |
+
"axes[0].bar_label(bars, fmt=\"%d\")\n",
|
| 310 |
+
"axes[0].set_title(\"Số ảnh theo View Position\")\n",
|
| 311 |
+
"axes[0].set_ylabel(\"Count\")\n",
|
| 312 |
+
"\n",
|
| 313 |
+
"axes[1].pie(view_counts.values, labels=view_counts.index, autopct=\"%1.1f%%\",\n",
|
| 314 |
+
" colors=sns.color_palette(\"Set2\", len(view_counts)))\n",
|
| 315 |
+
"axes[1].set_title(\"Tỉ lệ View Position\")\n",
|
| 316 |
+
"\n",
|
| 317 |
+
"plt.suptitle(\"View Position Distribution — Full Dataset\", fontsize=13)\n",
|
| 318 |
+
"plt.tight_layout()\n",
|
| 319 |
+
"plt.show()"
|
| 320 |
+
]
|
| 321 |
+
},
|
| 322 |
+
{
|
| 323 |
+
"cell_type": "code",
|
| 324 |
+
"execution_count": null,
|
| 325 |
+
"metadata": {},
|
| 326 |
+
"outputs": [],
|
| 327 |
+
"source": [
|
| 328 |
+
"# View per subset\n",
|
| 329 |
+
"view_subset = (\n",
|
| 330 |
+
" df.fillna({\"ViewPosition\": \"Unknown\"})\n",
|
| 331 |
+
" .groupby([\"subset\", \"ViewPosition\"])[\"dicom_id\"]\n",
|
| 332 |
+
" .count()\n",
|
| 333 |
+
" .unstack(fill_value=0)\n",
|
| 334 |
+
" .reindex(ALL_SUBSETS, fill_value=0)\n",
|
| 335 |
+
")\n",
|
| 336 |
+
"\n",
|
| 337 |
+
"view_subset.plot(kind=\"bar\", figsize=(14, 4),\n",
|
| 338 |
+
" color=sns.color_palette(\"Set2\", view_subset.shape[1]),\n",
|
| 339 |
+
" width=0.8)\n",
|
| 340 |
+
"plt.title(\"View Position theo subset folder\")\n",
|
| 341 |
+
"plt.xlabel(\"Subset\")\n",
|
| 342 |
+
"plt.ylabel(\"Count\")\n",
|
| 343 |
+
"plt.xticks(rotation=0)\n",
|
| 344 |
+
"plt.legend(title=\"ViewPosition\", bbox_to_anchor=(1.01, 1), loc=\"upper left\")\n",
|
| 345 |
+
"plt.tight_layout()\n",
|
| 346 |
+
"plt.show()"
|
| 347 |
+
]
|
| 348 |
+
},
|
| 349 |
+
{
|
| 350 |
+
"cell_type": "code",
|
| 351 |
+
"execution_count": null,
|
| 352 |
+
"metadata": {},
|
| 353 |
+
"outputs": [],
|
| 354 |
+
"source": [
|
| 355 |
+
"# View split breakdown\n",
|
| 356 |
+
"view_split = df.groupby([\"split\", \"ViewPosition\"]).size().unstack(fill_value=0)\n",
|
| 357 |
+
"view_split = view_split.reindex([\"train\",\"validate\",\"test\"])\n",
|
| 358 |
+
"view_split.plot(kind=\"bar\", figsize=(10, 4),\n",
|
| 359 |
+
" color=sns.color_palette(\"Set2\", view_split.shape[1]))\n",
|
| 360 |
+
"plt.title(\"View Position theo split\")\n",
|
| 361 |
+
"plt.xlabel(\"Split\")\n",
|
| 362 |
+
"plt.xticks(rotation=0)\n",
|
| 363 |
+
"plt.legend(title=\"ViewPosition\")\n",
|
| 364 |
+
"plt.tight_layout()\n",
|
| 365 |
+
"plt.show()"
|
| 366 |
+
]
|
| 367 |
+
},
|
| 368 |
+
{
|
| 369 |
+
"cell_type": "markdown",
|
| 370 |
+
"id": "8845a29f",
|
| 371 |
+
"source": "## 4b. Frontal-Only Sampling Strategy (AP > PA)\n\nChiến lược train: **1 report + 1 ảnh frontal** mỗi study.\n- Chỉ giữ AP hoặc PA; nếu study có cả hai thì **ưu tiên AP**.\n- Study không có ảnh frontal nào → loại khỏi tập train.",
|
| 372 |
+
"metadata": {}
|
| 373 |
+
},
|
| 374 |
+
{
|
| 375 |
+
"cell_type": "code",
|
| 376 |
+
"id": "22a327eb",
|
| 377 |
+
"source": "frontal = df[df[\"ViewPosition\"].isin([\"AP\", \"PA\"])].copy()\n\ndef pick_frontal_view(group):\n ap = group[group[\"ViewPosition\"] == \"AP\"]\n if len(ap) > 0:\n return ap.iloc[[0]]\n return group[group[\"ViewPosition\"] == \"PA\"].iloc[[0]]\n\nfrontal_1img = (\n frontal.groupby(\"study_id\", group_keys=False)\n .apply(pick_frontal_view)\n .reset_index(drop=True)\n)\n\nn_study_total = df[\"study_id\"].nunique()\nn_study_frontal = frontal_1img[\"study_id\"].nunique()\nn_study_no_front = n_study_total - n_study_frontal\n\nprint(\"=== Frontal-Only Sampling (Full Dataset) ===\")\nprint(f\"Tổng số study : {n_study_total:,}\")\nprint(f\"Study có ảnh frontal (AP/PA) : {n_study_frontal:,} ({n_study_frontal/n_study_total*100:.1f}%)\")\nprint(f\"Study bị loại (không có frontal): {n_study_no_front:,} ({n_study_no_front/n_study_total*100:.1f}%)\")\nprint()\nprint(\"Ảnh được chọn theo view:\")\nprint(frontal_1img[\"ViewPosition\"].value_counts().to_string())\nprint()\n\nsplit_frontal = frontal_1img[\"split\"].value_counts().reindex([\"train\", \"validate\", \"test\"])\nsplit_all = df.drop_duplicates(\"study_id\")[\"split\"].value_counts().reindex([\"train\", \"validate\", \"test\"])\ncompare = pd.DataFrame({\n \"All studies\": split_all,\n \"Frontal-only\": split_frontal,\n \"Giảm (%)\": ((split_all - split_frontal) / split_all * 100).round(1)\n})\nprint(\"=== Mẫu train sau khi filter (split) ===\")\nprint(compare.to_string())",
|
| 378 |
+
"metadata": {},
|
| 379 |
+
"execution_count": null,
|
| 380 |
+
"outputs": []
|
| 381 |
+
},
|
| 382 |
+
{
|
| 383 |
+
"cell_type": "code",
|
| 384 |
+
"id": "712ff838",
|
| 385 |
+
"source": "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n\n# 1. All vs Frontal-only\nbars = axes[0].bar([\"All studies\", \"Frontal-only\"],\n [n_study_total, n_study_frontal],\n color=[\"#4C72B0\", \"#55A868\"], width=0.5)\naxes[0].bar_label(bars, fmt=\"%d\")\naxes[0].set_title(\"Study count: All vs Frontal-only\")\naxes[0].set_ylabel(\"Số study\")\n\n# 2. Pie: view được chọn\nvc = frontal_1img[\"ViewPosition\"].value_counts()\naxes[1].pie(vc.values, labels=vc.index, autopct=\"%1.1f%%\",\n colors=[\"#4C72B0\", \"#DD8452\"])\naxes[1].set_title(\"View được chọn (AP ưu tiên)\")\n\n# 3. Per-split comparison\nx = np.arange(3)\nw = 0.35\naxes[2].bar(x - w/2, split_all.values, w, label=\"All\", color=\"#4C72B0\", alpha=0.85)\naxes[2].bar(x + w/2, split_frontal.values, w, label=\"Frontal-only\", color=\"#55A868\", alpha=0.85)\naxes[2].set_xticks(x)\naxes[2].set_xticklabels([\"train\", \"validate\", \"test\"])\naxes[2].set_title(\"Frontal-only vs All (per split)\")\naxes[2].set_ylabel(\"Số study\")\naxes[2].legend()\n\nplt.suptitle(\"Frontal-Only Sampling Strategy — Full Dataset\", fontsize=13)\nplt.tight_layout()\nplt.show()\n\n# 4. Frontal-only per subset\nfrontal_subset = (\n frontal_1img.groupby([\"subset\", \"split\"]).size()\n .unstack(fill_value=0)\n .reindex(columns=[\"train\", \"validate\", \"test\"], fill_value=0)\n .reindex(ALL_SUBSETS, fill_value=0)\n)\nfrontal_subset[\"TOTAL\"] = frontal_subset.sum(axis=1)\nprint(\"Frontal-only samples per subset:\")\nprint(frontal_subset.to_string())\n\nfrontal_subset[[\"train\",\"validate\",\"test\"]].plot(\n kind=\"bar\", figsize=(13, 4),\n color=[\"#4C72B0\", \"#DD8452\", \"#55A868\"], width=0.75\n)\nplt.title(\"Frontal-Only samples theo subset × split\")\nplt.xlabel(\"Subset\")\nplt.ylabel(\"Số study\")\nplt.xticks(rotation=0)\nplt.legend(title=\"Split\")\nplt.tight_layout()\nplt.show()",
|
| 386 |
+
"metadata": {},
|
| 387 |
+
"execution_count": null,
|
| 388 |
+
"outputs": []
|
| 389 |
+
},
|
| 390 |
+
{
|
| 391 |
+
"cell_type": "markdown",
|
| 392 |
+
"metadata": {},
|
| 393 |
+
"source": [
|
| 394 |
+
"## 5. CheXpert Labels — 14 nhãn bệnh lý"
|
| 395 |
+
]
|
| 396 |
+
},
|
| 397 |
+
{
|
| 398 |
+
"cell_type": "code",
|
| 399 |
+
"execution_count": null,
|
| 400 |
+
"metadata": {},
|
| 401 |
+
"outputs": [],
|
| 402 |
+
"source": [
|
| 403 |
+
"label_cols = [c for c in chexpert_df.columns if c in CHEXPERT_LABELS]\n",
|
| 404 |
+
"\n",
|
| 405 |
+
"positive_counts = (chexpert_df[label_cols] == 1).sum().sort_values(ascending=False)\n",
|
| 406 |
+
"uncertain_counts = (chexpert_df[label_cols] == -1).sum()\n",
|
| 407 |
+
"negative_counts = (chexpert_df[label_cols] == 0).sum()\n",
|
| 408 |
+
"\n",
|
| 409 |
+
"label_summary = pd.DataFrame({\n",
|
| 410 |
+
" \"Positive\": positive_counts,\n",
|
| 411 |
+
" \"Uncertain\": uncertain_counts,\n",
|
| 412 |
+
" \"Negative\": negative_counts,\n",
|
| 413 |
+
" \"Not Mentioned\": chexpert_df[label_cols].isna().sum()\n",
|
| 414 |
+
"})\n",
|
| 415 |
+
"label_summary[\"Positive %\"] = (label_summary[\"Positive\"] / len(chexpert_df) * 100).round(1)\n",
|
| 416 |
+
"print(label_summary.sort_values(\"Positive\", ascending=False).to_string())"
|
| 417 |
+
]
|
| 418 |
+
},
|
| 419 |
+
{
|
| 420 |
+
"cell_type": "code",
|
| 421 |
+
"execution_count": null,
|
| 422 |
+
"metadata": {},
|
| 423 |
+
"outputs": [],
|
| 424 |
+
"source": [
|
| 425 |
+
"ordered_labels = label_summary.sort_values(\"Positive\", ascending=False).index.tolist()\n",
|
| 426 |
+
"x = np.arange(len(ordered_labels))\n",
|
| 427 |
+
"w = 0.25\n",
|
| 428 |
+
"\n",
|
| 429 |
+
"fig, ax = plt.subplots(figsize=(14, 5))\n",
|
| 430 |
+
"ax.bar(x - w, label_summary.loc[ordered_labels, \"Positive\"], w, label=\"Positive\", color=\"#e74c3c\")\n",
|
| 431 |
+
"ax.bar(x, label_summary.loc[ordered_labels, \"Uncertain\"], w, label=\"Uncertain\", color=\"#f39c12\")\n",
|
| 432 |
+
"ax.bar(x + w, label_summary.loc[ordered_labels, \"Negative\"], w, label=\"Negative\", color=\"#2ecc71\")\n",
|
| 433 |
+
"ax.set_xticks(x)\n",
|
| 434 |
+
"ax.set_xticklabels(ordered_labels, rotation=40, ha=\"right\", fontsize=9)\n",
|
| 435 |
+
"ax.set_ylabel(\"Số study\")\n",
|
| 436 |
+
"ax.set_title(\"CheXpert Labels — Positive / Uncertain / Negative (Full Dataset)\")\n",
|
| 437 |
+
"ax.legend()\n",
|
| 438 |
+
"plt.tight_layout()\n",
|
| 439 |
+
"plt.show()"
|
| 440 |
+
]
|
| 441 |
+
},
|
| 442 |
+
{
|
| 443 |
+
"cell_type": "code",
|
| 444 |
+
"execution_count": null,
|
| 445 |
+
"metadata": {},
|
| 446 |
+
"outputs": [],
|
| 447 |
+
"source": "ADMIN_HEADERS = {\n 'EXAMINATION', 'INDICATION', 'CLINICAL INDICATION', 'TECHNIQUE',\n 'COMPARISON', 'HISTORY', 'REASON', 'REASON FOR EXAM',\n 'REASON FOR EXAMINATION', 'PROCEDURE', 'FINAL REPORT',\n 'NOTIFICATION', 'RECOMMENDATION', 'ADDENDUM'\n}\n\nSECTION_RE = re.compile(r'^[ \\t]*([A-Z][A-Z ,/()\\-]{1,70}?):\\s*', re.MULTILINE)\n\ndef parse_report(txt_path: Path) -> dict:\n \"\"\"\n Quy luật detect section: mọi header đều VIẾT HOA TOÀN BỘ và kết thúc bằng ':'.\n Fallback: nếu không có FINDINGS tường minh, lấy section descriptive đầu tiên.\n \"\"\"\n try:\n text = txt_path.read_text(encoding=\"utf-8\", errors=\"ignore\")\n except FileNotFoundError:\n return {\"findings\": None, \"impression\": None}\n\n matches = list(SECTION_RE.finditer(text))\n if not matches:\n return {\"findings\": None, \"impression\": None}\n\n sections = []\n for i, m in enumerate(matches):\n header = m.group(1).strip()\n start = m.end()\n end = matches[i + 1].start() if i + 1 < len(matches) else len(text)\n content = text[start:end].strip()\n sections.append((header, content))\n\n findings = impression = None\n for header, content in sections:\n h = header.upper()\n if \"FINDING\" in h and findings is None:\n findings = content or None\n elif \"IMPRESSION\" in h and impression is None:\n impression = content or None\n\n if findings is None:\n for header, content in sections:\n h = header.upper()\n if h not in ADMIN_HEADERS and \"IMPRESSION\" not in h and content:\n findings = content\n break\n\n return {\"findings\": findings, \"impression\": impression}\n\n\nall_studies = (\n df[[\"subject_id\", \"study_id\", \"subset\"]]\n .drop_duplicates(\"study_id\")\n .reset_index(drop=True)\n)\n\nif REPORT_SAMPLE_SIZE is not None:\n parse_studies = all_studies.sample(\n n=min(REPORT_SAMPLE_SIZE, len(all_studies)), random_state=42\n ).reset_index(drop=True)\n print(f\"Sample {len(parse_studies):,} / {len(all_studies):,} studies\")\nelse:\n parse_studies = all_studies\n print(f\"Parsing 
ALL {len(parse_studies):,} studies...\")\n\nrecords = []\nfor _, row in parse_studies.iterrows():\n sid = str(row[\"subject_id\"])\n stid = str(row[\"study_id\"])\n sub = row[\"subset\"]\n txt_path = CXR_ROOT / \"files\" / sub / f\"p{sid}\" / f\"s{stid}.txt\"\n records.append({\"study_id\": stid, \"subset\": sub, **parse_report(txt_path)})\n\nreport_df = pd.DataFrame(records)\nreport_df[\"findings_len\"] = report_df[\"findings\"].str.split().str.len()\nreport_df[\"impression_len\"] = report_df[\"impression\"].str.split().str.len()\n\ntotal = len(report_df)\nprint(f\"\\nFindings found : {report_df['findings'].notna().sum():,} / {total:,} ({report_df['findings'].notna().mean()*100:.1f}%)\")\nprint(f\"Impression found : {report_df['impression'].notna().sum():,} / {total:,} ({report_df['impression'].notna().mean()*100:.1f}%)\")\nboth = (report_df['findings'].notna() & report_df['impression'].notna()).sum()\nneither = (report_df['findings'].isna() & report_df['impression'].isna()).sum()\nprint(f\"Cả hai : {both:,} / {total:,} ({both/total*100:.1f}%)\")\nprint(f\"Không có cả hai : {neither:,} / {total:,} ({neither/total*100:.1f}%)\")"
|
| 448 |
+
},
|
| 449 |
+
{
|
| 450 |
+
"cell_type": "code",
|
| 451 |
+
"execution_count": null,
|
| 452 |
+
"metadata": {},
|
| 453 |
+
"outputs": [],
|
| 454 |
+
"source": [
|
| 455 |
+
"# Số nhãn positive mỗi study\n",
|
| 456 |
+
"labels_per_study = (chexpert_df[label_cols] == 1).sum(axis=1)\n",
|
| 457 |
+
"lps_counts = labels_per_study.value_counts().sort_index()\n",
|
| 458 |
+
"\n",
|
| 459 |
+
"fig, ax = plt.subplots(figsize=(9, 4))\n",
|
| 460 |
+
"ax.bar(lps_counts.index.astype(str), lps_counts.values,\n",
|
| 461 |
+
" color=sns.color_palette(\"Blues_d\", len(lps_counts)))\n",
|
| 462 |
+
"ax.set_xlabel(\"Số nhãn positive\")\n",
|
| 463 |
+
"ax.set_ylabel(\"Số study\")\n",
|
| 464 |
+
"ax.set_title(\"Phân bố số nhãn positive mỗi study (Full Dataset)\")\n",
|
| 465 |
+
"for x_, v in zip(lps_counts.index, lps_counts.values):\n",
|
| 466 |
+
" ax.text(str(x_), v * 1.01, f\"{v:,}\", ha=\"center\", va=\"bottom\", fontsize=8)\n",
|
| 467 |
+
"plt.tight_layout()\n",
|
| 468 |
+
"plt.show()"
|
| 469 |
+
]
|
| 470 |
+
},
|
| 471 |
+
{
|
| 472 |
+
"cell_type": "markdown",
|
| 473 |
+
"metadata": {},
|
| 474 |
+
"source": "## 6. Phân tích Report — Findings & Impression\n\n> ℹ️ Report parsing chỉ hoạt động với subset **đã tải về**. Các subset chưa có sẽ tự động bị bỏ qua."
|
| 475 |
+
},
|
| 476 |
+
{
|
| 477 |
+
"cell_type": "code",
|
| 478 |
+
"execution_count": null,
|
| 479 |
+
"metadata": {},
|
| 480 |
+
"outputs": [],
|
| 481 |
+
"source": "def parse_report(txt_path: Path) -> dict:\n try:\n text = txt_path.read_text(encoding=\"utf-8\", errors=\"ignore\")\n except FileNotFoundError:\n return {\"findings\": None, \"impression\": None}\n\n text = re.sub(r\"[\\r\\n]+\", \" \", text)\n\n def extract_section(pattern, text):\n m = re.search(pattern, text, re.IGNORECASE)\n if not m:\n return None\n start = m.end()\n nxt = re.search(\n r\"(IMPRESSION|FINDINGS|CONCLUSION|RECOMMENDATION|NOTIFICATION)\",\n text[start:], re.IGNORECASE\n )\n end = start + nxt.start() if nxt else len(text)\n return text[start:end].strip()\n\n return {\n \"findings\": extract_section(r\"FINDINGS\\s*:\", text),\n \"impression\": extract_section(r\"IMPRESSION\\s*:\", text)\n }\n\n\nall_studies = (\n df[[\"subject_id\", \"study_id\", \"subset\"]]\n .drop_duplicates(\"study_id\")\n .reset_index(drop=True)\n)\n\nif REPORT_SAMPLE_SIZE is not None:\n parse_studies = all_studies.sample(\n n=min(REPORT_SAMPLE_SIZE, len(all_studies)), random_state=42\n ).reset_index(drop=True)\n print(f\"Sample {len(parse_studies):,} / {len(all_studies):,} studies\")\nelse:\n parse_studies = all_studies\n print(f\"Parsing ALL {len(parse_studies):,} studies... (có thể mất 10-20 phút)\")\n\nrecords = []\nfor _, row in parse_studies.iterrows():\n sid = str(row[\"subject_id\"])\n stid = str(row[\"study_id\"])\n sub = row[\"subset\"]\n txt_path = CXR_ROOT / \"files\" / sub / f\"p{sid}\" / f\"s{stid}.txt\"\n records.append({\"study_id\": stid, \"subset\": sub, **parse_report(txt_path)})\n\nreport_df = pd.DataFrame(records)\nreport_df[\"findings_len\"] = report_df[\"findings\"].str.split().str.len()\nreport_df[\"impression_len\"] = report_df[\"impression\"].str.split().str.len()\n\nprint(f\"Findings found : {report_df['findings'].notna().sum():,} / {len(report_df):,}\")\nprint(f\"Impression found : {report_df['impression'].notna().sum():,} / {len(report_df):,}\")"
|
| 482 |
+
},
|
| 483 |
+
{
|
| 484 |
+
"cell_type": "code",
|
| 485 |
+
"execution_count": null,
|
| 486 |
+
"metadata": {},
|
| 487 |
+
"outputs": [],
|
| 488 |
+
"source": [
|
| 489 |
+
"print(\"=== Findings word count ===\")\n",
|
| 490 |
+
"print(report_df[\"findings_len\"].describe().round(1).to_string())\n",
|
| 491 |
+
"print(\"\\n=== Impression word count ===\")\n",
|
| 492 |
+
"print(report_df[\"impression_len\"].describe().round(1).to_string())"
|
| 493 |
+
]
|
| 494 |
+
},
|
| 495 |
+
{
|
| 496 |
+
"cell_type": "code",
|
| 497 |
+
"execution_count": null,
|
| 498 |
+
"metadata": {},
|
| 499 |
+
"outputs": [],
|
| 500 |
+
"source": [
|
| 501 |
+
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
|
| 502 |
+
"\n",
|
| 503 |
+
"for ax, col, title, color in zip(\n",
|
| 504 |
+
" axes,\n",
|
| 505 |
+
" [\"findings_len\", \"impression_len\"],\n",
|
| 506 |
+
" [\"Findings — độ dài (số từ)\", \"Impression — độ dài (số từ)\"],\n",
|
| 507 |
+
" [\"steelblue\", \"tomato\"]\n",
|
| 508 |
+
"):\n",
|
| 509 |
+
" data = report_df[col].dropna()\n",
|
| 510 |
+
" p99 = data.quantile(0.99)\n",
|
| 511 |
+
" ax.hist(data[data <= p99], bins=50, color=color, edgecolor=\"white\", alpha=0.85)\n",
|
| 512 |
+
" ax.axvline(data.median(), color=\"black\", ls=\"--\", lw=1.3, label=f\"Median={data.median():.0f}\")\n",
|
| 513 |
+
" ax.axvline(data.mean(), color=\"gray\", ls=\":\", lw=1.3, label=f\"Mean={data.mean():.0f}\")\n",
|
| 514 |
+
" ax.set_title(title)\n",
|
| 515 |
+
" ax.set_xlabel(\"Số từ\")\n",
|
| 516 |
+
" ax.set_ylabel(\"Số report\")\n",
|
| 517 |
+
" ax.legend(fontsize=9)\n",
|
| 518 |
+
" ax.text(0.97, 0.95, f\"n={len(data):,}\\n(≤p99={p99:.0f}w)\",\n",
|
| 519 |
+
" transform=ax.transAxes, ha=\"right\", va=\"top\", fontsize=8, color=\"gray\")\n",
|
| 520 |
+
"\n",
|
| 521 |
+
"plt.suptitle(\"Phân bố độ dài report — Full Dataset\", fontsize=13)\n",
|
| 522 |
+
"plt.tight_layout()\n",
|
| 523 |
+
"plt.show()"
|
| 524 |
+
]
|
| 525 |
+
},
|
| 526 |
+
{
|
| 527 |
+
"cell_type": "code",
|
| 528 |
+
"execution_count": null,
|
| 529 |
+
"metadata": {},
|
| 530 |
+
"outputs": [],
|
| 531 |
+
"source": [
|
| 532 |
+
"# Box plot Findings vs Impression\n",
|
| 533 |
+
"combined = pd.DataFrame({\n",
|
| 534 |
+
" \"word_count\": pd.concat([report_df[\"findings_len\"], report_df[\"impression_len\"]], ignore_index=True),\n",
|
| 535 |
+
" \"section\": [\"Findings\"] * len(report_df) + [\"Impression\"] * len(report_df)\n",
|
| 536 |
+
"}).dropna()\n",
|
| 537 |
+
"\n",
|
| 538 |
+
"fig, ax = plt.subplots(figsize=(7, 4))\n",
|
| 539 |
+
"sns.boxplot(data=combined, x=\"section\", y=\"word_count\",\n",
|
| 540 |
+
" palette=[\"steelblue\", \"tomato\"], showfliers=False, ax=ax)\n",
|
| 541 |
+
"ax.set_title(\"Findings vs Impression — độ dài (no outliers)\")\n",
|
| 542 |
+
"ax.set_ylabel(\"Số từ\")\n",
|
| 543 |
+
"plt.tight_layout()\n",
|
| 544 |
+
"plt.show()"
|
| 545 |
+
]
|
| 546 |
+
},
|
| 547 |
+
{
|
| 548 |
+
"cell_type": "code",
|
| 549 |
+
"execution_count": null,
|
| 550 |
+
"metadata": {},
|
| 551 |
+
"outputs": [],
|
| 552 |
+
"source": [
|
| 553 |
+
"# Median report length theo subset\n",
|
| 554 |
+
"rep_by_subset = report_df.groupby(\"subset\")[[\"findings_len\",\"impression_len\"]].median().reindex(ALL_SUBSETS)\n",
|
| 555 |
+
"\n",
|
| 556 |
+
"rep_by_subset.plot(kind=\"bar\", figsize=(12, 4),\n",
|
| 557 |
+
" color=[\"steelblue\", \"tomato\"], width=0.7)\n",
|
| 558 |
+
"plt.title(\"Median độ dài Findings & Impression theo subset\")\n",
|
| 559 |
+
"plt.xlabel(\"Subset\")\n",
|
| 560 |
+
"plt.ylabel(\"Median số từ\")\n",
|
| 561 |
+
"plt.xticks(rotation=0)\n",
|
| 562 |
+
"plt.legend([\"Findings\", \"Impression\"])\n",
|
| 563 |
+
"plt.tight_layout()\n",
|
| 564 |
+
"plt.show()"
|
| 565 |
+
]
|
| 566 |
+
},
|
| 567 |
+
{
|
| 568 |
+
"cell_type": "markdown",
|
| 569 |
+
"metadata": {},
|
| 570 |
+
"source": [
|
| 571 |
+
"## 7. VQA — phân tích câu hỏi & đáp"
|
| 572 |
+
]
|
| 573 |
+
},
|
| 574 |
+
{
|
| 575 |
+
"cell_type": "code",
|
| 576 |
+
"execution_count": null,
|
| 577 |
+
"metadata": {},
|
| 578 |
+
"outputs": [],
|
| 579 |
+
"source": [
|
| 580 |
+
"vqa_dfs = []\n",
|
| 581 |
+
"for fpath, sname in [(VQA_TRAIN, \"train\"), (VQA_VALID, \"valid\"), (VQA_TEST, \"test\")]:\n",
|
| 582 |
+
" if fpath.exists():\n",
|
| 583 |
+
" with open(fpath, encoding=\"utf-8\") as f:\n",
|
| 584 |
+
" data = json.load(f)\n",
|
| 585 |
+
" tmp = pd.DataFrame(data)\n",
|
| 586 |
+
" tmp[\"split\"] = sname\n",
|
| 587 |
+
" vqa_dfs.append(tmp)\n",
|
| 588 |
+
" else:\n",
|
| 589 |
+
" print(f\"[WARNING] Not found: {fpath}\")\n",
|
| 590 |
+
"\n",
|
| 591 |
+
"vqa_all = pd.concat(vqa_dfs, ignore_index=True)\n",
|
| 592 |
+
"vqa_all[\"subset\"] = \"p\" + vqa_all[\"subject_id\"].astype(str).str[:2]\n",
|
| 593 |
+
"\n",
|
| 594 |
+
"print(f\"VQA total: {len(vqa_all):,}\")\n",
|
| 595 |
+
"print(f\"\\nPer split:\")\n",
|
| 596 |
+
"print(vqa_all[\"split\"].value_counts().to_string())"
|
| 597 |
+
]
|
| 598 |
+
},
|
| 599 |
+
{
|
| 600 |
+
"cell_type": "code",
|
| 601 |
+
"execution_count": null,
|
| 602 |
+
"metadata": {},
|
| 603 |
+
"outputs": [],
|
| 604 |
+
"source": [
|
| 605 |
+
"# VQA per subset × split\n",
|
| 606 |
+
"vqa_subset_split = (\n",
|
| 607 |
+
" vqa_all.groupby([\"subset\", \"split\"]).size()\n",
|
| 608 |
+
" .unstack(fill_value=0)\n",
|
| 609 |
+
" .reindex(columns=[\"train\",\"valid\",\"test\"], fill_value=0)\n",
|
| 610 |
+
" .reindex(ALL_SUBSETS, fill_value=0)\n",
|
| 611 |
+
")\n",
|
| 612 |
+
"vqa_subset_split[\"TOTAL\"] = vqa_subset_split.sum(axis=1)\n",
|
| 613 |
+
"print(\"VQA samples per subset × split:\")\n",
|
| 614 |
+
"print(vqa_subset_split.to_string())"
|
| 615 |
+
]
|
| 616 |
+
},
|
| 617 |
+
{
|
| 618 |
+
"cell_type": "code",
|
| 619 |
+
"execution_count": null,
|
| 620 |
+
"metadata": {},
|
| 621 |
+
"outputs": [],
|
| 622 |
+
"source": [
|
| 623 |
+
"vqa_subset_split[[\"train\",\"valid\",\"test\"]].plot(\n",
|
| 624 |
+
" kind=\"bar\", figsize=(13, 4),\n",
|
| 625 |
+
" color=[palette[\"train\"], palette[\"validate\"], palette[\"test\"]],\n",
|
| 626 |
+
" width=0.75\n",
|
| 627 |
+
")\n",
|
| 628 |
+
"plt.title(\"VQA samples theo subset × split\")\n",
|
| 629 |
+
"plt.xlabel(\"Subset\")\n",
|
| 630 |
+
"plt.ylabel(\"Count\")\n",
|
| 631 |
+
"plt.xticks(rotation=0)\n",
|
| 632 |
+
"plt.legend(title=\"Split\")\n",
|
| 633 |
+
"plt.tight_layout()\n",
|
| 634 |
+
"plt.show()"
|
| 635 |
+
]
|
| 636 |
+
},
|
| 637 |
+
{
|
| 638 |
+
"cell_type": "markdown",
|
| 639 |
+
"id": "63f3247e",
|
| 640 |
+
"source": "### VQA × View Position — mẫu hỏi đáp thuộc ảnh view nào",
|
| 641 |
+
"metadata": {}
|
| 642 |
+
},
|
| 643 |
+
{
|
| 644 |
+
"cell_type": "code",
|
| 645 |
+
"id": "d5e6a532",
|
| 646 |
+
"source": "# image_id trong VQA = dicom_id trong metadata\nvqa_view = vqa_all.merge(\n meta_df[[\"dicom_id\", \"ViewPosition\"]],\n left_on=\"image_id\", right_on=\"dicom_id\",\n how=\"left\"\n)\n\nmissing_view_vqa = vqa_view[\"ViewPosition\"].isna().sum()\nvqa_view[\"ViewPosition\"] = vqa_view[\"ViewPosition\"].fillna(\"Unknown\")\n\nview_vqa_counts = vqa_view[\"ViewPosition\"].value_counts()\nprint(\"=== VQA samples theo View Position (Full Dataset) ===\")\nprint(view_vqa_counts.to_string())\nprint(f\"\\nKhông map được ViewPosition: {missing_view_vqa:,} ({missing_view_vqa/len(vqa_view)*100:.1f}%)\")\n\nfig, axes = plt.subplots(1, 3, figsize=(15, 4))\n\n# 1. Bar\nbars = axes[0].bar(view_vqa_counts.index, view_vqa_counts.values,\n color=sns.color_palette(\"Set2\", len(view_vqa_counts)))\naxes[0].bar_label(bars, fmt=\"%d\")\naxes[0].set_title(\"Số mẫu VQA theo View Position\")\naxes[0].set_ylabel(\"Số mẫu\")\n\n# 2. Pie\naxes[1].pie(view_vqa_counts.values, labels=view_vqa_counts.index,\n autopct=\"%1.1f%%\", colors=sns.color_palette(\"Set2\", len(view_vqa_counts)))\naxes[1].set_title(\"Tỉ lệ VQA theo View Position\")\n\n# 3. 
Semantic type × View (stacked bar)\nsem_view = vqa_view.groupby([\"ViewPosition\", \"semantic_type\"]).size().unstack(fill_value=0)\nsem_view.plot(kind=\"bar\", ax=axes[2], color=sns.color_palette(\"Set1\", sem_view.shape[1]),\n width=0.7, stacked=True)\naxes[2].set_title(\"Semantic Type × View Position\")\naxes[2].set_xlabel(\"View Position\")\naxes[2].set_ylabel(\"Số mẫu\")\naxes[2].tick_params(axis=\"x\", rotation=30)\naxes[2].legend(title=\"Semantic Type\", fontsize=8)\n\nplt.suptitle(\"VQA × View Position — Full Dataset\", fontsize=13)\nplt.tight_layout()\nplt.show()\n\n# Content type × View (heatmap)\ncontent_view = (vqa_view.groupby([\"ViewPosition\", \"content_type\"]).size()\n .unstack(fill_value=0))\nprint(\"\\nContent type theo View Position:\")\nprint(content_view.to_string())\n\nfig, ax = plt.subplots(figsize=(11, 4))\nsns.heatmap(content_view, annot=True, fmt=\"d\", cmap=\"YlGnBu\",\n linewidths=0.4, ax=ax)\nax.set_title(\"VQA — Content Type × View Position\")\nplt.tight_layout()\nplt.show()",
|
| 647 |
+
"metadata": {},
|
| 648 |
+
"execution_count": null,
|
| 649 |
+
"outputs": []
|
| 650 |
+
},
|
| 651 |
+
{
|
| 652 |
+
"cell_type": "code",
|
| 653 |
+
"execution_count": null,
|
| 654 |
+
"metadata": {},
|
| 655 |
+
"outputs": [],
|
| 656 |
+
"source": [
|
| 657 |
+
"# Semantic type & Content type\n",
|
| 658 |
+
"sem_counts = vqa_all[\"semantic_type\"].value_counts()\n",
|
| 659 |
+
"con_counts = vqa_all[\"content_type\"].value_counts()\n",
|
| 660 |
+
"\n",
|
| 661 |
+
"fig, axes = plt.subplots(1, 2, figsize=(13, 4))\n",
|
| 662 |
+
"for ax, counts, title in zip(\n",
|
| 663 |
+
" axes,\n",
|
| 664 |
+
" [sem_counts, con_counts],\n",
|
| 665 |
+
" [\"VQA — Semantic Type\", \"VQA — Content Type\"]\n",
|
| 666 |
+
"):\n",
|
| 667 |
+
" bars = ax.bar(counts.index, counts.values,\n",
|
| 668 |
+
" color=sns.color_palette(\"Set2\", len(counts)))\n",
|
| 669 |
+
" ax.bar_label(bars, fmt=\"%d\")\n",
|
| 670 |
+
" ax.set_title(title)\n",
|
| 671 |
+
" ax.set_ylabel(\"Count\")\n",
|
| 672 |
+
" ax.tick_params(axis=\"x\", rotation=30)\n",
|
| 673 |
+
"\n",
|
| 674 |
+
"plt.suptitle(\"VQA Question Analysis — Full Dataset\", fontsize=13)\n",
|
| 675 |
+
"plt.tight_layout()\n",
|
| 676 |
+
"plt.show()"
|
| 677 |
+
]
|
| 678 |
+
},
|
| 679 |
+
{
|
| 680 |
+
"cell_type": "code",
|
| 681 |
+
"execution_count": null,
|
| 682 |
+
"metadata": {},
|
| 683 |
+
"outputs": [],
|
| 684 |
+
"source": [
|
| 685 |
+
"# Cross-tab semantic × content\n",
|
| 686 |
+
"cross = pd.crosstab(vqa_all[\"semantic_type\"], vqa_all[\"content_type\"])\n",
|
| 687 |
+
"fig, ax = plt.subplots(figsize=(10, 3))\n",
|
| 688 |
+
"sns.heatmap(cross, annot=True, fmt=\"d\", cmap=\"YlOrRd\", ax=ax)\n",
|
| 689 |
+
"ax.set_title(\"VQA — Semantic Type × Content Type\")\n",
|
| 690 |
+
"plt.tight_layout()\n",
|
| 691 |
+
"plt.show()"
|
| 692 |
+
]
|
| 693 |
+
},
|
| 694 |
+
{
|
| 695 |
+
"cell_type": "code",
|
| 696 |
+
"execution_count": null,
|
| 697 |
+
"metadata": {},
|
| 698 |
+
"outputs": [],
|
| 699 |
+
"source": [
|
| 700 |
+
"# Answer type\n",
|
| 701 |
+
"def classify_answer(ans_list):\n",
|
| 702 |
+
" if not isinstance(ans_list, list) or len(ans_list) == 0:\n",
|
| 703 |
+
" return \"no answer\"\n",
|
| 704 |
+
" a = ans_list[0].strip().lower()\n",
|
| 705 |
+
" return a if a in [\"yes\", \"no\"] else \"open\"\n",
|
| 706 |
+
"\n",
|
| 707 |
+
"vqa_all[\"ans_type\"] = vqa_all[\"answer\"].apply(classify_answer)\n",
|
| 708 |
+
"ans_counts = vqa_all[\"ans_type\"].value_counts()\n",
|
| 709 |
+
"\n",
|
| 710 |
+
"fig, ax = plt.subplots(figsize=(6, 3))\n",
|
| 711 |
+
"bars = ax.bar(ans_counts.index, ans_counts.values,\n",
|
| 712 |
+
" color=sns.color_palette(\"Pastel1\", len(ans_counts)))\n",
|
| 713 |
+
"ax.bar_label(bars, fmt=\"%d\")\n",
|
| 714 |
+
"ax.set_title(\"VQA — Answer Type (Full Dataset)\")\n",
|
| 715 |
+
"ax.set_ylabel(\"Count\")\n",
|
| 716 |
+
"plt.tight_layout()\n",
|
| 717 |
+
"plt.show()"
|
| 718 |
+
]
|
| 719 |
+
},
|
| 720 |
+
{
|
| 721 |
+
"cell_type": "markdown",
|
| 722 |
+
"metadata": {},
|
| 723 |
+
"source": [
|
| 724 |
+
"## 8. Data Quality & Missing Data"
|
| 725 |
+
]
|
| 726 |
+
},
|
| 727 |
+
{
|
| 728 |
+
"cell_type": "code",
|
| 729 |
+
"execution_count": null,
|
| 730 |
+
"metadata": {},
|
| 731 |
+
"outputs": [],
|
| 732 |
+
"source": [
|
| 733 |
+
"# Missing ViewPosition\n",
|
| 734 |
+
"missing_view = df[\"ViewPosition\"].isna().sum()\n",
|
| 735 |
+
"print(f\"Ảnh thiếu ViewPosition: {missing_view:,} / {len(df):,} ({missing_view/len(df)*100:.2f}%)\")\n",
|
| 736 |
+
"\n",
|
| 737 |
+
"# Missing view per subset\n",
|
| 738 |
+
"mv_subset = df[df[\"ViewPosition\"].isna()].groupby(\"subset\").size().reindex(ALL_SUBSETS, fill_value=0)\n",
|
| 739 |
+
"print(\"\\nMissing ViewPosition per subset:\")\n",
|
| 740 |
+
"print(mv_subset.to_string())"
|
| 741 |
+
]
|
| 742 |
+
},
|
| 743 |
+
{
|
| 744 |
+
"cell_type": "code",
|
| 745 |
+
"execution_count": null,
|
| 746 |
+
"metadata": {},
|
| 747 |
+
"outputs": [],
|
| 748 |
+
"source": [
|
| 749 |
+
"# Missing findings/impression (từ sample)\n",
|
| 750 |
+
"no_findings = report_df[\"findings\"].isna().sum()\n",
|
| 751 |
+
"no_impression = report_df[\"impression\"].isna().sum()\n",
|
| 752 |
+
"n = len(report_df)\n",
|
| 753 |
+
"print(f\"Reports thiếu Findings : {no_findings:,}/{n:,} ({no_findings/n*100:.1f}%)\")\n",
|
| 754 |
+
"print(f\"Reports thiếu Impression : {no_impression:,}/{n:,} ({no_impression/n*100:.1f}%)\")\n",
|
| 755 |
+
"print(f\"Reports thiếu CẢ HAI : {(report_df['findings'].isna() & report_df['impression'].isna()).sum():,}/{n:,}\")"
|
| 756 |
+
]
|
| 757 |
+
},
|
| 758 |
+
{
|
| 759 |
+
"cell_type": "code",
|
| 760 |
+
"execution_count": null,
|
| 761 |
+
"metadata": {},
|
| 762 |
+
"outputs": [],
|
| 763 |
+
"source": [
|
| 764 |
+
"# Bệnh nhân / study / ảnh tổng quan\n",
|
| 765 |
+
"n_subjects = df[\"subject_id\"].nunique()\n",
|
| 766 |
+
"n_studies = df[\"study_id\"].nunique()\n",
|
| 767 |
+
"n_images = df[\"dicom_id\"].nunique()\n",
|
| 768 |
+
"\n",
|
| 769 |
+
"print(f\"Bệnh nhân : {n_subjects:,}\")\n",
|
| 770 |
+
"print(f\"Studies : {n_studies:,}\")\n",
|
| 771 |
+
"print(f\"Ảnh : {n_images:,}\")\n",
|
| 772 |
+
"print(f\"Trung bình study/patient : {n_studies/n_subjects:.2f}\")\n",
|
| 773 |
+
"print(f\"Trung bình ảnh/patient : {n_images/n_subjects:.2f}\")"
|
| 774 |
+
]
|
| 775 |
+
},
|
| 776 |
+
{
|
| 777 |
+
"cell_type": "code",
|
| 778 |
+
"execution_count": null,
|
| 779 |
+
"metadata": {},
|
| 780 |
+
"outputs": [],
|
| 781 |
+
"source": [
|
| 782 |
+
"# Study per patient distribution\n",
|
| 783 |
+
"spp = df.groupby(\"subject_id\")[\"study_id\"].nunique()\n",
|
| 784 |
+
"print(\"Studies per patient:\")\n",
|
| 785 |
+
"print(spp.describe().round(1).to_string())\n",
|
| 786 |
+
"\n",
|
| 787 |
+
"fig, ax = plt.subplots(figsize=(10, 4))\n",
|
| 788 |
+
"spp_vc = spp.value_counts().sort_index()\n",
|
| 789 |
+
"# clip tails\n",
|
| 790 |
+
"spp_vc_clip = spp_vc[spp_vc.index <= spp.quantile(0.99)]\n",
|
| 791 |
+
"ax.bar(spp_vc_clip.index.astype(str), spp_vc_clip.values, color=\"mediumpurple\")\n",
|
| 792 |
+
"ax.set_xlabel(\"Số study mỗi bệnh nhân\")\n",
|
| 793 |
+
"ax.set_ylabel(\"Số bệnh nhân\")\n",
|
| 794 |
+
"ax.set_title(\"Phân bố số lần khám mỗi bệnh nhân (≤p99)\")\n",
|
| 795 |
+
"ax.xaxis.set_major_locator(mticker.MaxNLocator(integer=True, nbins=20))\n",
|
| 796 |
+
"plt.tight_layout()\n",
|
| 797 |
+
"plt.show()"
|
| 798 |
+
]
|
| 799 |
+
},
|
| 800 |
+
{
|
| 801 |
+
"cell_type": "code",
|
| 802 |
+
"execution_count": null,
|
| 803 |
+
"metadata": {},
|
| 804 |
+
"outputs": [],
|
| 805 |
+
"source": [
|
| 806 |
+
"# Số bệnh nhân và study per subset\n",
|
| 807 |
+
"patient_subset = df.groupby(\"subset\")[\"subject_id\"].nunique().reindex(ALL_SUBSETS)\n",
|
| 808 |
+
"study_subset = df.groupby(\"subset\")[\"study_id\"].nunique().reindex(ALL_SUBSETS)\n",
|
| 809 |
+
"image_subset = df.groupby(\"subset\")[\"dicom_id\"].nunique().reindex(ALL_SUBSETS)\n",
|
| 810 |
+
"\n",
|
| 811 |
+
"subset_overview = pd.DataFrame({\n",
|
| 812 |
+
" \"Patients\": patient_subset,\n",
|
| 813 |
+
" \"Studies\": study_subset,\n",
|
| 814 |
+
" \"Images\": image_subset\n",
|
| 815 |
+
"})\n",
|
| 816 |
+
"print(subset_overview.to_string())\n",
|
| 817 |
+
"\n",
|
| 818 |
+
"subset_overview.plot(kind=\"bar\", figsize=(13, 4),\n",
|
| 819 |
+
" color=[\"#5e81ac\", \"#88c0d0\", \"#a3be8c\"], width=0.75)\n",
|
| 820 |
+
"plt.title(\"Patients / Studies / Images theo subset\")\n",
|
| 821 |
+
"plt.xlabel(\"Subset\")\n",
|
| 822 |
+
"plt.ylabel(\"Count\")\n",
|
| 823 |
+
"plt.xticks(rotation=0)\n",
|
| 824 |
+
"plt.legend()\n",
|
| 825 |
+
"plt.tight_layout()\n",
|
| 826 |
+
"plt.show()"
|
| 827 |
+
]
|
| 828 |
+
},
|
| 829 |
+
{
|
| 830 |
+
"cell_type": "markdown",
|
| 831 |
+
"metadata": {},
|
| 832 |
+
"source": [
|
| 833 |
+
"## 9. Summary"
|
| 834 |
+
]
|
| 835 |
+
},
|
| 836 |
+
{
|
| 837 |
+
"cell_type": "code",
|
| 838 |
+
"execution_count": null,
|
| 839 |
+
"metadata": {},
|
| 840 |
+
"outputs": [],
|
| 841 |
+
"source": [
|
| 842 |
+
"print(\"=\"*60)\n",
|
| 843 |
+
"print(\" SUMMARY — MIMIC-CXR Full Dataset\")\n",
|
| 844 |
+
"print(\"=\"*60)\n",
|
| 845 |
+
"print(f\" Bệnh nhân : {n_subjects:,}\")\n",
|
| 846 |
+
"print(f\" Studies (reports) : {n_studies:,}\")\n",
|
| 847 |
+
"print(f\" Ảnh (dicom/jpg) : {n_images:,}\")\n",
|
| 848 |
+
"print()\n",
|
| 849 |
+
"for sp in [\"train\", \"validate\", \"test\"]:\n",
|
| 850 |
+
" ni = img_per_split.get(sp, 0)\n",
|
| 851 |
+
" ns = study_per_split.get(sp, 0)\n",
|
| 852 |
+
" print(f\" [{sp:>8}] ảnh={ni:>6,} studies={ns:>6,}\")\n",
|
| 853 |
+
"print()\n",
|
| 854 |
+
"print(f\" Frontal (PA+AP) : {view_counts.get('PA',0)+view_counts.get('AP',0):,} ảnh\")\n",
|
| 855 |
+
"print(f\" Lateral (LL) : {view_counts.get('LL',view_counts.get('LATERAL',0)):,} ảnh\")\n",
|
| 856 |
+
"print(f\" VQA total samples : {len(vqa_all):,}\")\n",
|
| 857 |
+
"print(\"=\"*60)"
|
| 858 |
+
]
|
| 859 |
+
}
|
| 860 |
+
],
|
| 861 |
+
"metadata": {
|
| 862 |
+
"kernelspec": {
|
| 863 |
+
"display_name": "Python 3",
|
| 864 |
+
"language": "python",
|
| 865 |
+
"name": "python3"
|
| 866 |
+
},
|
| 867 |
+
"language_info": {
|
| 868 |
+
"name": "python",
|
| 869 |
+
"version": "3.10.0"
|
| 870 |
+
}
|
| 871 |
+
},
|
| 872 |
+
"nbformat": 4,
|
| 873 |
+
"nbformat_minor": 5
|
| 874 |
+
}
|
data/eda_p18.ipynb
ADDED
|
@@ -0,0 +1,797 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# EDA — MIMIC-CXR Subset p18\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"**Datasets used:**\n",
|
| 10 |
+
"- `MIMIC-CXR-JPG` (v2.1.0) — ảnh JPG + CSV metadata\n",
|
| 11 |
+
"- `MIMIC-CXR` (v2.1.0) — report `.txt` (Findings / Impression)\n",
|
| 12 |
+
"- `MIMIC-Ext-MIMIC-CXR-VQA` (v1.0.0) — câu hỏi/đáp VQA\n",
|
| 13 |
+
"\n",
|
| 14 |
+
"**Scope:** chỉ phân tích bệnh nhân có `subject_id` bắt đầu bằng `18` (folder `p18`).\n",
|
| 15 |
+
"\n",
|
| 16 |
+
"> ℹ️ **Không cần tải ảnh JPG** để chạy notebook này — toàn bộ EDA dựa trên CSV, .txt reports và .json VQA."
|
| 17 |
+
]
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"cell_type": "markdown",
|
| 21 |
+
"metadata": {},
|
| 22 |
+
"source": [
|
| 23 |
+
"## 0. Cấu hình đường dẫn"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"cell_type": "code",
|
| 28 |
+
"execution_count": null,
|
| 29 |
+
"metadata": {},
|
| 30 |
+
"outputs": [],
|
| 31 |
+
"source": "from pathlib import Path\n\nDATA_DIR = Path(r\"D:\\USTH\\KLTN\\cxr-vlm-data\")\nCXR_ROOT = DATA_DIR / \"mimic-cxr-reports\" # files/p10…p19/pXXXXXX/sYYYYYY.txt\n\nSPLIT_CSV = DATA_DIR / \"mimic-cxr-2.0.0-split.csv\"\nMETA_CSV = DATA_DIR / \"mimic-cxr-2.0.0-metadata.csv\"\nCHEXPERT_CSV = DATA_DIR / \"mimic-cxr-2.0.0-chexpert.csv\"\n\n_VQA_DIR = (DATA_DIR\n / \"mimic-ext-mimic-cxr-vqa-a-complex-diverse-and-large-scale-visual-question-answering-dataset-for-chest-x-ray-images-1.0.0\"\n / \"MIMIC-Ext-MIMIC-CXR-VQA\"\n / \"dataset\")\nVQA_TRAIN = _VQA_DIR / \"train.json\"\nVQA_VALID = _VQA_DIR / \"valid.json\"\nVQA_TEST = _VQA_DIR / \"test.json\"\n\n# Kiểm tra nhanh\nfor name, p in [(\"SPLIT_CSV\", SPLIT_CSV),\n (\"META_CSV\", META_CSV),\n (\"CHEXPERT_CSV\", CHEXPERT_CSV),\n (\"CXR_ROOT\", CXR_ROOT),\n (\"VQA_TRAIN\", VQA_TRAIN)]:\n status = \"✓\" if p.exists() else \"✗ NOT FOUND\"\n print(f\" {status} {name}: {p}\")\n\nprint(\"\\nPaths configured.\")"
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"cell_type": "code",
|
| 35 |
+
"execution_count": null,
|
| 36 |
+
"metadata": {},
|
| 37 |
+
"outputs": [],
|
| 38 |
+
"source": [
|
| 39 |
+
"import pandas as pd\n",
|
| 40 |
+
"import numpy as np\n",
|
| 41 |
+
"import json\n",
|
| 42 |
+
"import re\n",
|
| 43 |
+
"import matplotlib.pyplot as plt\n",
|
| 44 |
+
"import matplotlib.ticker as mticker\n",
|
| 45 |
+
"import seaborn as sns\n",
|
| 46 |
+
"from collections import Counter\n",
|
| 47 |
+
"\n",
|
| 48 |
+
"sns.set_theme(style=\"whitegrid\", palette=\"muted\")\n",
|
| 49 |
+
"plt.rcParams[\"figure.dpi\"] = 120\n",
|
| 50 |
+
"plt.rcParams[\"figure.figsize\"] = (10, 4)\n",
|
| 51 |
+
"\n",
|
| 52 |
+
"CHEXPERT_LABELS = [\n",
|
| 53 |
+
" \"Atelectasis\", \"Cardiomegaly\", \"Consolidation\", \"Edema\",\n",
|
| 54 |
+
" \"Enlarged Cardiomediastinum\", \"Fracture\", \"Lung Lesion\",\n",
|
| 55 |
+
" \"Lung Opacity\", \"No Finding\", \"Pleural Effusion\",\n",
|
| 56 |
+
" \"Pleural Other\", \"Pneumonia\", \"Pneumothorax\", \"Support Devices\"\n",
|
| 57 |
+
"]\n",
|
| 58 |
+
"\n",
|
| 59 |
+
"print(\"Libraries imported.\")"
|
| 60 |
+
]
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"cell_type": "markdown",
|
| 64 |
+
"metadata": {},
|
| 65 |
+
"source": [
|
| 66 |
+
"## 1. Load & lọc subset p18"
|
| 67 |
+
]
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"cell_type": "code",
|
| 71 |
+
"execution_count": null,
|
| 72 |
+
"metadata": {},
|
| 73 |
+
"outputs": [],
|
| 74 |
+
"source": [
|
| 75 |
+
"split_df = pd.read_csv(SPLIT_CSV)\n",
|
| 76 |
+
"meta_df = pd.read_csv(META_CSV)\n",
|
| 77 |
+
"chexpert_df = pd.read_csv(CHEXPERT_CSV)\n",
|
| 78 |
+
"\n",
|
| 79 |
+
"# Lọc p18\n",
|
| 80 |
+
"p18_split = split_df[split_df[\"subject_id\"].astype(str).str.startswith(\"18\")].copy()\n",
|
| 81 |
+
"p18_meta = meta_df[meta_df[\"subject_id\"].astype(str).str.startswith(\"18\")].copy()\n",
|
| 82 |
+
"p18_chex = chexpert_df[chexpert_df[\"subject_id\"].astype(str).str.startswith(\"18\")].copy()\n",
|
| 83 |
+
"\n",
|
| 84 |
+
"print(f\"split.csv — p18 images : {len(p18_split):,}\")\n",
|
| 85 |
+
"print(f\"metadata — p18 images : {len(p18_meta):,}\")\n",
|
| 86 |
+
"print(f\"chexpert — p18 studies : {len(p18_chex):,}\")"
|
| 87 |
+
]
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"cell_type": "code",
|
| 91 |
+
"execution_count": null,
|
| 92 |
+
"metadata": {},
|
| 93 |
+
"outputs": [],
|
| 94 |
+
"source": [
|
| 95 |
+
"# Merge split + metadata (by dicom_id)\n",
|
| 96 |
+
"df = p18_split.merge(\n",
|
| 97 |
+
" p18_meta[[\"dicom_id\", \"ViewPosition\", \"Rows\", \"Columns\"]],\n",
|
| 98 |
+
" on=\"dicom_id\", how=\"left\"\n",
|
| 99 |
+
")\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"print(f\"Merged shape: {df.shape}\")\n",
|
| 102 |
+
"df.head(3)"
|
| 103 |
+
]
|
| 104 |
+
},
|
| 105 |
+
{
|
| 106 |
+
"cell_type": "markdown",
|
| 107 |
+
"metadata": {},
|
| 108 |
+
"source": [
|
| 109 |
+
"## 2. Tổng quan số lượng ảnh & report theo split"
|
| 110 |
+
]
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"cell_type": "code",
|
| 114 |
+
"execution_count": null,
|
| 115 |
+
"metadata": {},
|
| 116 |
+
"outputs": [],
|
| 117 |
+
"source": [
|
| 118 |
+
"# Số ảnh theo split\n",
|
| 119 |
+
"img_per_split = df[\"split\"].value_counts().reindex([\"train\", \"validate\", \"test\"])\n",
|
| 120 |
+
"\n",
|
| 121 |
+
"# Số study (≈ report) theo split (mỗi study_id = 1 report)\n",
|
| 122 |
+
"study_per_split = (\n",
|
| 123 |
+
" df.drop_duplicates(\"study_id\")[\"split\"]\n",
|
| 124 |
+
" .value_counts()\n",
|
| 125 |
+
" .reindex([\"train\", \"validate\", \"test\"])\n",
|
| 126 |
+
")\n",
|
| 127 |
+
"\n",
|
| 128 |
+
"summary = pd.DataFrame({\n",
|
| 129 |
+
" \"Images (dicom_id)\": img_per_split,\n",
|
| 130 |
+
" \"Studies / Reports\": study_per_split\n",
|
| 131 |
+
"})\n",
|
| 132 |
+
"summary.loc[\"TOTAL\"] = summary.sum()\n",
|
| 133 |
+
"print(summary.to_string())"
|
| 134 |
+
]
|
| 135 |
+
},
|
| 136 |
+
{
|
| 137 |
+
"cell_type": "code",
|
| 138 |
+
"execution_count": null,
|
| 139 |
+
"metadata": {},
|
| 140 |
+
"outputs": [],
|
| 141 |
+
"source": [
|
| 142 |
+
"fig, axes = plt.subplots(1, 2, figsize=(11, 4))\n",
|
| 143 |
+
"for ax, col, title in zip(axes, summary.columns, [\"Số ảnh theo split\", \"Số study/report theo split\"]):\n",
|
| 144 |
+
" vals = summary.loc[[\"train\",\"validate\",\"test\"], col]\n",
|
| 145 |
+
" bars = ax.bar(vals.index, vals.values, color=sns.color_palette(\"muted\", 3))\n",
|
| 146 |
+
" ax.bar_label(bars, fmt=\"%d\")\n",
|
| 147 |
+
" ax.set_title(title)\n",
|
| 148 |
+
" ax.set_ylabel(\"Count\")\n",
|
| 149 |
+
"plt.suptitle(\"p18 subset — images vs reports per split\", fontsize=13, y=1.02)\n",
|
| 150 |
+
"plt.tight_layout()\n",
|
| 151 |
+
"plt.show()"
|
| 152 |
+
]
|
| 153 |
+
},
|
| 154 |
+
{
|
| 155 |
+
"cell_type": "markdown",
|
| 156 |
+
"metadata": {},
|
| 157 |
+
"source": [
|
| 158 |
+
"## 3. Số ảnh mỗi study (1 study → bao nhiêu ảnh?)"
|
| 159 |
+
]
|
| 160 |
+
},
|
| 161 |
+
{
|
| 162 |
+
"cell_type": "code",
|
| 163 |
+
"execution_count": null,
|
| 164 |
+
"metadata": {},
|
| 165 |
+
"outputs": [],
|
| 166 |
+
"source": [
|
| 167 |
+
"imgs_per_study = df.groupby(\"study_id\")[\"dicom_id\"].count()\n",
|
| 168 |
+
"count_dist = imgs_per_study.value_counts().sort_index()\n",
|
| 169 |
+
"\n",
|
| 170 |
+
"print(\"Images per study distribution:\")\n",
|
| 171 |
+
"print(count_dist.to_string())\n",
|
| 172 |
+
"print(f\"\\nMax images in a single study: {imgs_per_study.max()}\")\n",
|
| 173 |
+
"print(f\"Mean images per study : {imgs_per_study.mean():.2f}\")"
|
| 174 |
+
]
|
| 175 |
+
},
|
| 176 |
+
{
|
| 177 |
+
"cell_type": "code",
|
| 178 |
+
"execution_count": null,
|
| 179 |
+
"metadata": {},
|
| 180 |
+
"outputs": [],
|
| 181 |
+
"source": [
|
| 182 |
+
"fig, ax = plt.subplots(figsize=(8, 4))\n",
|
| 183 |
+
"ax.bar(count_dist.index.astype(str), count_dist.values, color=sns.color_palette(\"Blues_d\", len(count_dist)))\n",
|
| 184 |
+
"ax.set_xlabel(\"Số ảnh trong study\")\n",
|
| 185 |
+
"ax.set_ylabel(\"Số study\")\n",
|
| 186 |
+
"ax.set_title(\"Distribution: số ảnh mỗi study (p18)\")\n",
|
| 187 |
+
"for i, v in zip(count_dist.index, count_dist.values):\n",
|
| 188 |
+
" ax.text(str(i), v + max(count_dist)*0.01, str(v), ha=\"center\", va=\"bottom\", fontsize=9)\n",
|
| 189 |
+
"plt.tight_layout()\n",
|
| 190 |
+
"plt.show()"
|
| 191 |
+
]
|
| 192 |
+
},
|
| 193 |
+
{
|
| 194 |
+
"cell_type": "markdown",
|
| 195 |
+
"metadata": {},
|
| 196 |
+
"source": [
|
| 197 |
+
"## 4. Phân bố View Position (AP, PA, Lateral, ...)"
|
| 198 |
+
]
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
"cell_type": "code",
|
| 202 |
+
"execution_count": null,
|
| 203 |
+
"metadata": {},
|
| 204 |
+
"outputs": [],
|
| 205 |
+
"source": [
|
| 206 |
+
"view_counts = df[\"ViewPosition\"].fillna(\"Unknown\").value_counts()\n",
|
| 207 |
+
"print(\"View position counts:\")\n",
|
| 208 |
+
"print(view_counts.to_string())\n",
|
| 209 |
+
"print(f\"\\nTotal images: {len(df):,}\")"
|
| 210 |
+
]
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
"cell_type": "code",
|
| 214 |
+
"execution_count": null,
|
| 215 |
+
"metadata": {},
|
| 216 |
+
"outputs": [],
|
| 217 |
+
"source": [
|
| 218 |
+
"fig, axes = plt.subplots(1, 2, figsize=(12, 4))\n",
|
| 219 |
+
"\n",
|
| 220 |
+
"# Bar chart\n",
|
| 221 |
+
"bars = axes[0].bar(view_counts.index, view_counts.values,\n",
|
| 222 |
+
" color=sns.color_palette(\"Set2\", len(view_counts)))\n",
|
| 223 |
+
"axes[0].bar_label(bars, fmt=\"%d\")\n",
|
| 224 |
+
"axes[0].set_title(\"Số ảnh theo View Position\")\n",
|
| 225 |
+
"axes[0].set_ylabel(\"Count\")\n",
|
| 226 |
+
"\n",
|
| 227 |
+
"# Pie chart\n",
|
| 228 |
+
"axes[1].pie(view_counts.values, labels=view_counts.index, autopct=\"%1.1f%%\",\n",
|
| 229 |
+
" colors=sns.color_palette(\"Set2\", len(view_counts)))\n",
|
| 230 |
+
"axes[1].set_title(\"Tỉ lệ View Position\")\n",
|
| 231 |
+
"\n",
|
| 232 |
+
"plt.suptitle(\"View Position Distribution — p18\", fontsize=13)\n",
|
| 233 |
+
"plt.tight_layout()\n",
|
| 234 |
+
"plt.show()"
|
| 235 |
+
]
|
| 236 |
+
},
|
| 237 |
+
{
|
| 238 |
+
"cell_type": "code",
|
| 239 |
+
"execution_count": null,
|
| 240 |
+
"metadata": {},
|
| 241 |
+
"outputs": [],
|
| 242 |
+
"source": [
|
| 243 |
+
"# View distribution theo split\n",
|
| 244 |
+
"view_split = df.groupby([\"split\", \"ViewPosition\"]).size().unstack(fill_value=0)\n",
|
| 245 |
+
"view_split = view_split.reindex([\"train\", \"validate\", \"test\"])\n",
|
| 246 |
+
"view_split.plot(kind=\"bar\", figsize=(10, 4), color=sns.color_palette(\"Set2\", view_split.shape[1]))\n",
|
| 247 |
+
"plt.title(\"View Position theo split — p18\")\n",
|
| 248 |
+
"plt.xlabel(\"Split\")\n",
|
| 249 |
+
"plt.ylabel(\"Count\")\n",
|
| 250 |
+
"plt.xticks(rotation=0)\n",
|
| 251 |
+
"plt.legend(title=\"ViewPosition\")\n",
|
| 252 |
+
"plt.tight_layout()\n",
|
| 253 |
+
"plt.show()"
|
| 254 |
+
]
|
| 255 |
+
},
|
| 256 |
+
{
|
| 257 |
+
"cell_type": "markdown",
|
| 258 |
+
"id": "ae9f3d3c",
|
| 259 |
+
"source": "## 4b. Frontal-Only Sampling Strategy (AP > PA)\n\nChiến lược train: **1 report + 1 ảnh frontal** mỗi study.\n- Chỉ giữ AP hoặc PA; nếu study có cả hai thì **ưu tiên AP**.\n- Study không có ảnh frontal nào → loại khỏi tập train.",
|
| 260 |
+
"metadata": {}
|
| 261 |
+
},
|
| 262 |
+
{
|
| 263 |
+
"cell_type": "code",
|
| 264 |
+
"id": "d2ce6beb",
|
| 265 |
+
"source": "frontal = df[df[\"ViewPosition\"].isin([\"AP\", \"PA\"])].copy()\n\n# Với mỗi study: chọn AP trước, nếu không có thì chọn PA (lấy 1 ảnh duy nhất)\ndef pick_frontal_view(group):\n ap = group[group[\"ViewPosition\"] == \"AP\"]\n if len(ap) > 0:\n return ap.iloc[[0]]\n return group[group[\"ViewPosition\"] == \"PA\"].iloc[[0]]\n\nfrontal_1img = (\n frontal.groupby(\"study_id\", group_keys=False)\n .apply(pick_frontal_view)\n .reset_index(drop=True)\n)\n\n# Thống kê tổng quan\nn_study_total = df[\"study_id\"].nunique()\nn_study_frontal = frontal_1img[\"study_id\"].nunique()\nn_study_no_front = n_study_total - n_study_frontal\n\nprint(\"=== Frontal-Only Sampling (p18) ===\")\nprint(f\"Tổng số study : {n_study_total:,}\")\nprint(f\"Study có ảnh frontal (AP/PA) : {n_study_frontal:,} ({n_study_frontal/n_study_total*100:.1f}%)\")\nprint(f\"Study bị loại (không có frontal): {n_study_no_front:,} ({n_study_no_front/n_study_total*100:.1f}%)\")\nprint()\nprint(f\"Ảnh được chọn theo view:\")\nprint(frontal_1img[\"ViewPosition\"].value_counts().to_string())\nprint()\nprint(\"=== Mẫu train sau khi filter (split) ===\")\nsplit_frontal = frontal_1img[\"split\"].value_counts().reindex([\"train\", \"validate\", \"test\"])\nsplit_all = df.drop_duplicates(\"study_id\")[\"split\"].value_counts().reindex([\"train\", \"validate\", \"test\"])\ncompare = pd.DataFrame({\n \"All studies\": split_all,\n \"Frontal-only\": split_frontal,\n \"Giảm (%)\": ((split_all - split_frontal) / split_all * 100).round(1)\n})\nprint(compare.to_string())",
|
| 266 |
+
"metadata": {},
|
| 267 |
+
"execution_count": null,
|
| 268 |
+
"outputs": []
|
| 269 |
+
},
|
| 270 |
+
{
|
| 271 |
+
"cell_type": "code",
|
| 272 |
+
"id": "9d4aaf5c",
|
| 273 |
+
"source": "fig, axes = plt.subplots(1, 3, figsize=(14, 4))\n\n# 1. All vs Frontal-only (study count)\ncats = [\"All studies\", \"Frontal-only\"]\nvals = [n_study_total, n_study_frontal]\nbars = axes[0].bar(cats, vals, color=[\"#4C72B0\", \"#55A868\"], width=0.5)\naxes[0].bar_label(bars, fmt=\"%d\")\naxes[0].set_title(\"Study count: All vs Frontal-only\")\naxes[0].set_ylabel(\"Số study\")\n\n# 2. View breakdown của ảnh được chọn\nvc = frontal_1img[\"ViewPosition\"].value_counts()\naxes[1].pie(vc.values, labels=vc.index, autopct=\"%1.1f%%\",\n colors=[\"#4C72B0\", \"#DD8452\"])\naxes[1].set_title(\"View được chọn (AP ưu tiên)\")\n\n# 3. So sánh train/val/test\nx = np.arange(3)\nw = 0.35\nsplits = [\"train\", \"validate\", \"test\"]\naxes[2].bar(x - w/2, split_all.values, w, label=\"All\", color=\"#4C72B0\", alpha=0.85)\naxes[2].bar(x + w/2, split_frontal.values, w, label=\"Frontal-only\", color=\"#55A868\", alpha=0.85)\naxes[2].set_xticks(x)\naxes[2].set_xticklabels(splits)\naxes[2].set_title(\"Frontal-only vs All (per split)\")\naxes[2].set_ylabel(\"Số study\")\naxes[2].legend()\n\nplt.suptitle(\"Frontal-Only Sampling Strategy — p18\", fontsize=13)\nplt.tight_layout()\nplt.show()",
|
| 274 |
+
"metadata": {},
|
| 275 |
+
"execution_count": null,
|
| 276 |
+
"outputs": []
|
| 277 |
+
},
|
| 278 |
+
{
|
| 279 |
+
"cell_type": "markdown",
|
| 280 |
+
"metadata": {},
|
| 281 |
+
"source": [
|
| 282 |
+
"## 5. CheXpert Labels — 14 nhãn bệnh lý"
|
| 283 |
+
]
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"cell_type": "code",
|
| 287 |
+
"execution_count": null,
|
| 288 |
+
"metadata": {},
|
| 289 |
+
"outputs": [],
|
| 290 |
+
"source": [
|
| 291 |
+
"# Chỉ lấy cột labels (1 = positive, 0 = negative, -1 = uncertain, NaN = not mentioned)\n",
|
| 292 |
+
"label_cols = [c for c in p18_chex.columns if c in CHEXPERT_LABELS]\n",
|
| 293 |
+
"\n",
|
| 294 |
+
"# Số study có nhãn Positive (=1) mỗi bệnh\n",
|
| 295 |
+
"positive_counts = (p18_chex[label_cols] == 1).sum().sort_values(ascending=False)\n",
|
| 296 |
+
"uncertain_counts = (p18_chex[label_cols] == -1).sum().sort_values(ascending=False)\n",
|
| 297 |
+
"negative_counts = (p18_chex[label_cols] == 0).sum().sort_values(ascending=False)\n",
|
| 298 |
+
"\n",
|
| 299 |
+
"label_summary = pd.DataFrame({\n",
|
| 300 |
+
" \"Positive\": positive_counts,\n",
|
| 301 |
+
" \"Uncertain\": uncertain_counts,\n",
|
| 302 |
+
" \"Negative\": negative_counts,\n",
|
| 303 |
+
" \"Not Mentioned\": p18_chex[label_cols].isna().sum()\n",
|
| 304 |
+
"})\n",
|
| 305 |
+
"label_summary[\"Total Studies\"] = len(p18_chex)\n",
|
| 306 |
+
"label_summary[\"Positive %\"] = (label_summary[\"Positive\"] / len(p18_chex) * 100).round(1)\n",
|
| 307 |
+
"print(label_summary[[\"Positive\",\"Uncertain\",\"Negative\",\"Not Mentioned\",\"Positive %\"]]\n",
|
| 308 |
+
" .sort_values(\"Positive\", ascending=False).to_string())"
|
| 309 |
+
]
|
| 310 |
+
},
|
| 311 |
+
{
|
| 312 |
+
"cell_type": "code",
|
| 313 |
+
"execution_count": null,
|
| 314 |
+
"metadata": {},
|
| 315 |
+
"outputs": [],
|
| 316 |
+
"source": "# Headers hành chính — không phải findings\nADMIN_HEADERS = {\n 'EXAMINATION', 'INDICATION', 'CLINICAL INDICATION', 'TECHNIQUE',\n 'COMPARISON', 'HISTORY', 'REASON', 'REASON FOR EXAM',\n 'REASON FOR EXAMINATION', 'PROCEDURE', 'FINAL REPORT',\n 'NOTIFICATION', 'RECOMMENDATION', 'ADDENDUM'\n}\n\n# Detect section header: dòng bắt đầu bằng ALL-CAPS (có thể có space/dấu câu) rồi đến \":\"\nSECTION_RE = re.compile(r'^[ \\t]*([A-Z][A-Z ,/()\\-]{1,70}?):\\s*', re.MULTILINE)\n\ndef parse_report(txt_path: Path) -> dict:\n \"\"\"\n Parse report .txt thành dict {'findings': str|None, 'impression': str|None}.\n\n Quy luật detect section: mọi header đều VIẾT HOA TOÀN BỘ và kết thúc bằng ':',\n ví dụ: FINDINGS:, IMPRESSION:, FRONTAL AND LATERAL VIEWS OF THE CHEST:\n → dùng regex bắt pattern đó, không hardcode từng keyword.\n\n Nếu không có section FINDINGS tường minh, fallback lấy section\n descriptive đầu tiên (không phải admin header).\n \"\"\"\n try:\n text = txt_path.read_text(encoding=\"utf-8\", errors=\"ignore\")\n except FileNotFoundError:\n return {\"findings\": None, \"impression\": None}\n\n matches = list(SECTION_RE.finditer(text))\n if not matches:\n return {\"findings\": None, \"impression\": None}\n\n # Tách từng section thành (header, content)\n sections = []\n for i, m in enumerate(matches):\n header = m.group(1).strip()\n start = m.end()\n end = matches[i + 1].start() if i + 1 < len(matches) else len(text)\n content = text[start:end].strip()\n sections.append((header, content))\n\n findings = impression = None\n for header, content in sections:\n h = header.upper()\n if \"FINDING\" in h and findings is None:\n findings = content or None\n elif \"IMPRESSION\" in h and impression is None:\n impression = content or None\n\n # Fallback: không có FINDINGS tường minh → lấy section descriptive đầu tiên\n if findings is None:\n for header, content in sections:\n h = header.upper()\n if h not in ADMIN_HEADERS and \"IMPRESSION\" not in h and content:\n findings 
= content\n break\n\n return {\"findings\": findings, \"impression\": impression}\n\n\n# Lấy danh sách unique studies trong p18\np18_studies = (\n df[[\"subject_id\", \"study_id\"]]\n .drop_duplicates(\"study_id\")\n .reset_index(drop=True)\n)\n\nprint(f\"Số study cần parse: {len(p18_studies):,}\")\nprint(\"Parsing reports...\")\n\nrecords = []\nfor _, row in p18_studies.iterrows():\n sid = str(row[\"subject_id\"])\n stid = str(row[\"study_id\"])\n txt_path = CXR_ROOT / \"files\" / \"p18\" / f\"p{sid}\" / f\"s{stid}.txt\"\n parsed = parse_report(txt_path)\n records.append({\"study_id\": stid, **parsed})\n\nreport_df = pd.DataFrame(records)\nreport_df[\"findings_len\"] = report_df[\"findings\"].str.split().str.len()\nreport_df[\"impression_len\"] = report_df[\"impression\"].str.split().str.len()\n\ntotal = len(report_df)\nprint(f\"\\nFindings found : {report_df['findings'].notna().sum():,} / {total:,} ({report_df['findings'].notna().mean()*100:.1f}%)\")\nprint(f\"Impression found : {report_df['impression'].notna().sum():,} / {total:,} ({report_df['impression'].notna().mean()*100:.1f}%)\")\nboth = (report_df['findings'].notna() & report_df['impression'].notna()).sum()\nneither = (report_df['findings'].isna() & report_df['impression'].isna()).sum()\nprint(f\"Cả hai : {both:,} / {total:,} ({both/total*100:.1f}%)\")\nprint(f\"Không có cả hai : {neither:,} / {total:,} ({neither/total*100:.1f}%)\")"
|
| 317 |
+
},
|
| 318 |
+
{
|
| 319 |
+
"cell_type": "code",
|
| 320 |
+
"execution_count": null,
|
| 321 |
+
"metadata": {},
|
| 322 |
+
"outputs": [],
|
| 323 |
+
"source": [
|
| 324 |
+
"# Số nhãn positive mỗi study (label co-occurrence)\n",
|
| 325 |
+
"labels_per_study = (p18_chex[label_cols] == 1).sum(axis=1)\n",
|
| 326 |
+
"print(\"Số nhãn positive mỗi study:\")\n",
|
| 327 |
+
"print(labels_per_study.value_counts().sort_index().to_string())\n",
|
| 328 |
+
"\n",
|
| 329 |
+
"fig, ax = plt.subplots(figsize=(9, 4))\n",
|
| 330 |
+
"lps_counts = labels_per_study.value_counts().sort_index()\n",
|
| 331 |
+
"ax.bar(lps_counts.index.astype(str), lps_counts.values, color=sns.color_palette(\"Blues_d\", len(lps_counts)))\n",
|
| 332 |
+
"ax.set_xlabel(\"Số nhãn positive\")\n",
|
| 333 |
+
"ax.set_ylabel(\"Số study\")\n",
|
| 334 |
+
"ax.set_title(\"Phân bố số nhãn positive mỗi study (p18)\")\n",
|
| 335 |
+
"plt.tight_layout()\n",
|
| 336 |
+
"plt.show()"
|
| 337 |
+
]
|
| 338 |
+
},
|
| 339 |
+
{
|
| 340 |
+
"cell_type": "markdown",
|
| 341 |
+
"metadata": {},
|
| 342 |
+
"source": [
|
| 343 |
+
"## 6. Phân tích Report — Findings & Impression"
|
| 344 |
+
]
|
| 345 |
+
},
|
| 346 |
+
{
|
| 347 |
+
"cell_type": "code",
|
| 348 |
+
"execution_count": null,
|
| 349 |
+
"metadata": {},
|
| 350 |
+
"outputs": [],
|
| 351 |
+
"source": [
|
| 352 |
+
"def parse_report(txt_path: Path) -> dict:\n",
|
| 353 |
+
" \"\"\"Trả về dict với 'findings' và 'impression' (str hoặc None).\"\"\"\n",
|
| 354 |
+
" try:\n",
|
| 355 |
+
" text = txt_path.read_text(encoding=\"utf-8\", errors=\"ignore\")\n",
|
| 356 |
+
" except FileNotFoundError:\n",
|
| 357 |
+
" return {\"findings\": None, \"impression\": None}\n",
|
| 358 |
+
"\n",
|
| 359 |
+
" text = re.sub(r\"[\\r\\n]+\", \" \", text) # flatten newlines\n",
|
| 360 |
+
"\n",
|
| 361 |
+
" def extract_section(pattern, text):\n",
|
| 362 |
+
" m = re.search(pattern, text, re.IGNORECASE)\n",
|
| 363 |
+
" if not m:\n",
|
| 364 |
+
" return None\n",
|
| 365 |
+
" start = m.end()\n",
|
| 366 |
+
" # cắt đến section tiếp theo hoặc hết string\n",
|
| 367 |
+
" next_sec = re.search(\n",
|
| 368 |
+
" r\"(IMPRESSION|FINDINGS|CONCLUSION|RECOMMENDATION|NOTIFICATION)\",\n",
|
| 369 |
+
" text[start:], re.IGNORECASE\n",
|
| 370 |
+
" )\n",
|
| 371 |
+
" end = start + next_sec.start() if next_sec else len(text)\n",
|
| 372 |
+
" return text[start:end].strip()\n",
|
| 373 |
+
"\n",
|
| 374 |
+
" findings = extract_section(r\"FINDINGS\\s*:\", text)\n",
|
| 375 |
+
" impression = extract_section(r\"IMPRESSION\\s*:\", text)\n",
|
| 376 |
+
" return {\"findings\": findings, \"impression\": impression}\n",
|
| 377 |
+
"\n",
|
| 378 |
+
"\n",
|
| 379 |
+
"# Lấy danh sách unique studies trong p18\n",
|
| 380 |
+
"p18_studies = (\n",
|
| 381 |
+
" df[[\"subject_id\", \"study_id\"]]\n",
|
| 382 |
+
" .drop_duplicates(\"study_id\")\n",
|
| 383 |
+
" .reset_index(drop=True)\n",
|
| 384 |
+
")\n",
|
| 385 |
+
"\n",
|
| 386 |
+
"print(f\"Số study cần parse: {len(p18_studies):,}\")\n",
|
| 387 |
+
"print(\"Parsing reports... (có thể mất vài giây)\")\n",
|
| 388 |
+
"\n",
|
| 389 |
+
"records = []\n",
|
| 390 |
+
"for _, row in p18_studies.iterrows():\n",
|
| 391 |
+
" sid = str(row[\"subject_id\"])\n",
|
| 392 |
+
" stid = str(row[\"study_id\"])\n",
|
| 393 |
+
" txt_path = CXR_ROOT / \"files\" / \"p18\" / f\"p{sid}\" / f\"s{stid}.txt\"\n",
|
| 394 |
+
" parsed = parse_report(txt_path)\n",
|
| 395 |
+
" records.append({\"study_id\": stid, **parsed})\n",
|
| 396 |
+
"\n",
|
| 397 |
+
"report_df = pd.DataFrame(records)\n",
|
| 398 |
+
"report_df[\"findings_len\"] = report_df[\"findings\"].dropna().str.split().str.len()\n",
|
| 399 |
+
"report_df[\"impression_len\"] = report_df[\"impression\"].dropna().str.split().str.len()\n",
|
| 400 |
+
"\n",
|
| 401 |
+
"print(f\"\\nFindings found : {report_df['findings'].notna().sum():,} / {len(report_df):,}\")\n",
|
| 402 |
+
"print(f\"Impression found : {report_df['impression'].notna().sum():,} / {len(report_df):,}\")"
|
| 403 |
+
]
|
| 404 |
+
},
|
| 405 |
+
{
|
| 406 |
+
"cell_type": "code",
|
| 407 |
+
"execution_count": null,
|
| 408 |
+
"metadata": {},
|
| 409 |
+
"outputs": [],
|
| 410 |
+
"source": [
|
| 411 |
+
"# Descriptive stats\n",
|
| 412 |
+
"print(\"=== Findings word count ===\")\n",
|
| 413 |
+
"print(report_df[\"findings_len\"].describe().round(1).to_string())\n",
|
| 414 |
+
"print(\"\\n=== Impression word count ===\")\n",
|
| 415 |
+
"print(report_df[\"impression_len\"].describe().round(1).to_string())"
|
| 416 |
+
]
|
| 417 |
+
},
|
| 418 |
+
{
|
| 419 |
+
"cell_type": "code",
|
| 420 |
+
"execution_count": null,
|
| 421 |
+
"metadata": {},
|
| 422 |
+
"outputs": [],
|
| 423 |
+
"source": [
|
| 424 |
+
"fig, axes = plt.subplots(1, 2, figsize=(13, 4))\n",
|
| 425 |
+
"\n",
|
| 426 |
+
"for ax, col, title, color in zip(\n",
|
| 427 |
+
" axes,\n",
|
| 428 |
+
" [\"findings_len\", \"impression_len\"],\n",
|
| 429 |
+
" [\"Findings — phân bố độ dài (số từ)\", \"Impression — phân bố độ dài (số từ)\"],\n",
|
| 430 |
+
" [\"steelblue\", \"tomato\"]\n",
|
| 431 |
+
"):\n",
|
| 432 |
+
" data = report_df[col].dropna()\n",
|
| 433 |
+
" # clip outliers để biểu đồ dễ nhìn\n",
|
| 434 |
+
" p99 = data.quantile(0.99)\n",
|
| 435 |
+
" data_clipped = data[data <= p99]\n",
|
| 436 |
+
" ax.hist(data_clipped, bins=40, color=color, edgecolor=\"white\", alpha=0.85)\n",
|
| 437 |
+
" ax.axvline(data.median(), color=\"black\", linestyle=\"--\", linewidth=1.2, label=f\"Median={data.median():.0f}\")\n",
|
| 438 |
+
" ax.axvline(data.mean(), color=\"gray\", linestyle=\":\", linewidth=1.2, label=f\"Mean={data.mean():.0f}\")\n",
|
| 439 |
+
" ax.set_title(title)\n",
|
| 440 |
+
" ax.set_xlabel(\"Số từ\")\n",
|
| 441 |
+
" ax.set_ylabel(\"Số report\")\n",
|
| 442 |
+
" ax.legend(fontsize=9)\n",
|
| 443 |
+
" ax.text(0.97, 0.95, f\"n={len(data):,}\\n(hiển thị ≤p99={p99:.0f}w)\",\n",
|
| 444 |
+
" transform=ax.transAxes, ha=\"right\", va=\"top\", fontsize=8, color=\"gray\")\n",
|
| 445 |
+
"\n",
|
| 446 |
+
"plt.suptitle(\"Phân bố độ dài report — p18\", fontsize=13)\n",
|
| 447 |
+
"plt.tight_layout()\n",
|
| 448 |
+
"plt.show()"
|
| 449 |
+
]
|
| 450 |
+
},
|
| 451 |
+
{
|
| 452 |
+
"cell_type": "code",
|
| 453 |
+
"execution_count": null,
|
| 454 |
+
"metadata": {},
|
| 455 |
+
"outputs": [],
|
| 456 |
+
"source": [
|
| 457 |
+
"# Box plot so sánh Findings vs Impression\n",
|
| 458 |
+
"combined = pd.DataFrame({\n",
|
| 459 |
+
" \"word_count\": pd.concat([report_df[\"findings_len\"], report_df[\"impression_len\"]], ignore_index=True),\n",
|
| 460 |
+
" \"section\": [\"Findings\"] * len(report_df) + [\"Impression\"] * len(report_df)\n",
|
| 461 |
+
"}).dropna()\n",
|
| 462 |
+
"\n",
|
| 463 |
+
"fig, ax = plt.subplots(figsize=(7, 4))\n",
|
| 464 |
+
"sns.boxplot(data=combined, x=\"section\", y=\"word_count\",\n",
|
| 465 |
+
" palette=[\"steelblue\", \"tomato\"], showfliers=False, ax=ax)\n",
|
| 466 |
+
"ax.set_title(\"Findings vs Impression — độ dài (box plot, no outliers)\")\n",
|
| 467 |
+
"ax.set_ylabel(\"Số từ\")\n",
|
| 468 |
+
"plt.tight_layout()\n",
|
| 469 |
+
"plt.show()"
|
| 470 |
+
]
|
| 471 |
+
},
|
| 472 |
+
{
|
| 473 |
+
"cell_type": "markdown",
|
| 474 |
+
"metadata": {},
|
| 475 |
+
"source": [
|
| 476 |
+
"## 7. VQA — phân tích câu hỏi & đáp"
|
| 477 |
+
]
|
| 478 |
+
},
|
| 479 |
+
{
|
| 480 |
+
"cell_type": "code",
|
| 481 |
+
"execution_count": null,
|
| 482 |
+
"metadata": {},
|
| 483 |
+
"outputs": [],
|
| 484 |
+
"source": [
|
| 485 |
+
"vqa_dfs = []\n",
|
| 486 |
+
"for fpath, split_name in [(VQA_TRAIN, \"train\"), (VQA_VALID, \"valid\"), (VQA_TEST, \"test\")]:\n",
|
| 487 |
+
" if fpath.exists():\n",
|
| 488 |
+
" with open(fpath, encoding=\"utf-8\") as f:\n",
|
| 489 |
+
" data = json.load(f)\n",
|
| 490 |
+
" tmp = pd.DataFrame(data)\n",
|
| 491 |
+
" tmp[\"split\"] = split_name\n",
|
| 492 |
+
" vqa_dfs.append(tmp)\n",
|
| 493 |
+
" else:\n",
|
| 494 |
+
" print(f\"[WARNING] File not found: {fpath}\")\n",
|
| 495 |
+
"\n",
|
| 496 |
+
"vqa_all = pd.concat(vqa_dfs, ignore_index=True)\n",
|
| 497 |
+
"\n",
|
| 498 |
+
"# Lọc p18\n",
|
| 499 |
+
"vqa_p18 = vqa_all[vqa_all[\"subject_id\"].astype(str).str.startswith(\"18\")].copy()\n",
|
| 500 |
+
"\n",
|
| 501 |
+
"print(f\"VQA total records : {len(vqa_all):,}\")\n",
|
| 502 |
+
"print(f\"VQA p18 records : {len(vqa_p18):,}\")\n",
|
| 503 |
+
"print(f\"\\nColumns: {list(vqa_p18.columns)}\")"
|
| 504 |
+
]
|
| 505 |
+
},
|
| 506 |
+
{
|
| 507 |
+
"cell_type": "code",
|
| 508 |
+
"execution_count": null,
|
| 509 |
+
"metadata": {},
|
| 510 |
+
"outputs": [],
|
| 511 |
+
"source": [
|
| 512 |
+
"# Số VQA mẫu theo split\n",
|
| 513 |
+
"print(\"VQA p18 per split:\")\n",
|
| 514 |
+
"print(vqa_p18[\"split\"].value_counts().to_string())"
|
| 515 |
+
]
|
| 516 |
+
},
|
| 517 |
+
{
|
| 518 |
+
"cell_type": "code",
|
| 519 |
+
"execution_count": null,
|
| 520 |
+
"metadata": {},
|
| 521 |
+
"outputs": [],
|
| 522 |
+
"source": [
|
| 523 |
+
"# Semantic type: verify / choose / query\n",
|
| 524 |
+
"sem_counts = vqa_p18[\"semantic_type\"].value_counts()\n",
|
| 525 |
+
"print(\"Semantic type (verify/choose/query):\")\n",
|
| 526 |
+
"print(sem_counts.to_string())\n",
|
| 527 |
+
"\n",
|
| 528 |
+
"# Content type: presence / anatomy / attribute / abnormality / size / plane / gender\n",
|
| 529 |
+
"con_counts = vqa_p18[\"content_type\"].value_counts()\n",
|
| 530 |
+
"print(\"\\nContent type:\")\n",
|
| 531 |
+
"print(con_counts.to_string())"
|
| 532 |
+
]
|
| 533 |
+
},
|
| 534 |
+
{
|
| 535 |
+
"cell_type": "code",
|
| 536 |
+
"execution_count": null,
|
| 537 |
+
"metadata": {},
|
| 538 |
+
"outputs": [],
|
| 539 |
+
"source": [
|
| 540 |
+
"fig, axes = plt.subplots(1, 2, figsize=(13, 4))\n",
|
| 541 |
+
"\n",
|
| 542 |
+
"# Semantic type\n",
|
| 543 |
+
"bars = axes[0].bar(sem_counts.index, sem_counts.values,\n",
|
| 544 |
+
" color=sns.color_palette(\"Set1\", len(sem_counts)))\n",
|
| 545 |
+
"axes[0].bar_label(bars, fmt=\"%d\")\n",
|
| 546 |
+
"axes[0].set_title(\"VQA — Semantic Type (p18)\")\n",
|
| 547 |
+
"axes[0].set_ylabel(\"Count\")\n",
|
| 548 |
+
"\n",
|
| 549 |
+
"# Content type\n",
|
| 550 |
+
"bars2 = axes[1].bar(con_counts.index, con_counts.values,\n",
|
| 551 |
+
" color=sns.color_palette(\"Set2\", len(con_counts)))\n",
|
| 552 |
+
"axes[1].bar_label(bars2, fmt=\"%d\")\n",
|
| 553 |
+
"axes[1].set_title(\"VQA — Content Type (p18)\")\n",
|
| 554 |
+
"axes[1].set_ylabel(\"Count\")\n",
|
| 555 |
+
"axes[1].tick_params(axis=\"x\", rotation=30)\n",
|
| 556 |
+
"\n",
|
| 557 |
+
"plt.suptitle(\"VQA Question Analysis — p18\", fontsize=13)\n",
|
| 558 |
+
"plt.tight_layout()\n",
|
| 559 |
+
"plt.show()"
|
| 560 |
+
]
|
| 561 |
+
},
|
| 562 |
+
{
|
| 563 |
+
"cell_type": "markdown",
|
| 564 |
+
"id": "c313b9c3",
|
| 565 |
+
"source": "### VQA × View Position — mẫu hỏi đáp thuộc ảnh view nào",
|
| 566 |
+
"metadata": {}
|
| 567 |
+
},
|
| 568 |
+
{
|
| 569 |
+
"cell_type": "code",
|
| 570 |
+
"id": "0791482f",
|
| 571 |
+
"source": "# image_id trong VQA = dicom_id trong metadata\nvqa_view = vqa_p18.merge(\n p18_meta[[\"dicom_id\", \"ViewPosition\"]],\n left_on=\"image_id\", right_on=\"dicom_id\",\n how=\"left\"\n)\n\nmissing_view_vqa = vqa_view[\"ViewPosition\"].isna().sum()\nvqa_view[\"ViewPosition\"] = vqa_view[\"ViewPosition\"].fillna(\"Unknown\")\n\nview_vqa_counts = vqa_view[\"ViewPosition\"].value_counts()\nprint(\"=== VQA samples theo View Position (p18) ===\")\nprint(view_vqa_counts.to_string())\nprint(f\"\\nKhông map được ViewPosition: {missing_view_vqa:,} ({missing_view_vqa/len(vqa_view)*100:.1f}%)\")",
|
| 572 |
+
"metadata": {},
|
| 573 |
+
"execution_count": null,
|
| 574 |
+
"outputs": []
|
| 575 |
+
},
|
| 576 |
+
{
|
| 577 |
+
"cell_type": "code",
|
| 578 |
+
"id": "049baaef",
|
| 579 |
+
"source": "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n\n# 1. Bar: số mẫu VQA theo view\nbars = axes[0].bar(view_vqa_counts.index, view_vqa_counts.values,\n color=sns.color_palette(\"Set2\", len(view_vqa_counts)))\naxes[0].bar_label(bars, fmt=\"%d\")\naxes[0].set_title(\"Số mẫu VQA theo View Position\")\naxes[0].set_ylabel(\"Số mẫu\")\n\n# 2. Pie\naxes[1].pie(view_vqa_counts.values, labels=view_vqa_counts.index,\n autopct=\"%1.1f%%\", colors=sns.color_palette(\"Set2\", len(view_vqa_counts)))\naxes[1].set_title(\"Tỉ lệ VQA theo View Position\")\n\n# 3. Semantic type × View (stacked bar)\nsem_view = vqa_view.groupby([\"ViewPosition\", \"semantic_type\"]).size().unstack(fill_value=0)\nsem_view.plot(kind=\"bar\", ax=axes[2], color=sns.color_palette(\"Set1\", sem_view.shape[1]),\n width=0.7, stacked=True)\naxes[2].set_title(\"Semantic Type × View Position\")\naxes[2].set_xlabel(\"View Position\")\naxes[2].set_ylabel(\"Số mẫu\")\naxes[2].tick_params(axis=\"x\", rotation=30)\naxes[2].legend(title=\"Semantic Type\", fontsize=8)\n\nplt.suptitle(\"VQA × View Position — p18\", fontsize=13)\nplt.tight_layout()\nplt.show()\n\n# Content type × View\nprint(\"\\nContent type theo View Position:\")\nprint(vqa_view.groupby([\"ViewPosition\", \"content_type\"]).size()\n .unstack(fill_value=0).to_string())",
|
| 580 |
+
"metadata": {},
|
| 581 |
+
"execution_count": null,
|
| 582 |
+
"outputs": []
|
| 583 |
+
},
|
| 584 |
+
{
|
| 585 |
+
"cell_type": "code",
|
| 586 |
+
"execution_count": null,
|
| 587 |
+
"metadata": {},
|
| 588 |
+
"outputs": [],
|
| 589 |
+
"source": [
|
| 590 |
+
"# Cross-tab: semantic_type × content_type\n",
|
| 591 |
+
"cross = pd.crosstab(vqa_p18[\"semantic_type\"], vqa_p18[\"content_type\"])\n",
|
| 592 |
+
"print(\"Cross-tab semantic × content:\")\n",
|
| 593 |
+
"print(cross.to_string())\n",
|
| 594 |
+
"\n",
|
| 595 |
+
"fig, ax = plt.subplots(figsize=(10, 3))\n",
|
| 596 |
+
"sns.heatmap(cross, annot=True, fmt=\"d\", cmap=\"YlOrRd\", ax=ax)\n",
|
| 597 |
+
"ax.set_title(\"VQA — Semantic Type × Content Type (p18)\")\n",
|
| 598 |
+
"plt.tight_layout()\n",
|
| 599 |
+
"plt.show()"
|
| 600 |
+
]
|
| 601 |
+
},
|
| 602 |
+
{
|
| 603 |
+
"cell_type": "code",
|
| 604 |
+
"execution_count": null,
|
| 605 |
+
"metadata": {},
|
| 606 |
+
"outputs": [],
|
| 607 |
+
"source": [
|
| 608 |
+
"# Phân bố độ dài câu hỏi (số từ)\n",
|
| 609 |
+
"vqa_p18[\"q_len\"] = vqa_p18[\"question\"].str.split().str.len()\n",
|
| 610 |
+
"\n",
|
| 611 |
+
"print(\"Question length stats:\")\n",
|
| 612 |
+
"print(vqa_p18[\"q_len\"].describe().round(1).to_string())\n",
|
| 613 |
+
"\n",
|
| 614 |
+
"fig, ax = plt.subplots(figsize=(9, 3))\n",
|
| 615 |
+
"ax.hist(vqa_p18[\"q_len\"].clip(upper=vqa_p18[\"q_len\"].quantile(0.99)),\n",
|
| 616 |
+
" bins=30, color=\"slateblue\", edgecolor=\"white\")\n",
|
| 617 |
+
"ax.axvline(vqa_p18[\"q_len\"].median(), color=\"black\", linestyle=\"--\",\n",
|
| 618 |
+
" label=f\"Median={vqa_p18['q_len'].median():.0f}\")\n",
|
| 619 |
+
"ax.set_title(\"Phân bố độ dài câu hỏi VQA (p18)\")\n",
|
| 620 |
+
"ax.set_xlabel(\"Số từ\")\n",
|
| 621 |
+
"ax.set_ylabel(\"Count\")\n",
|
| 622 |
+
"ax.legend()\n",
|
| 623 |
+
"plt.tight_layout()\n",
|
| 624 |
+
"plt.show()"
|
| 625 |
+
]
|
| 626 |
+
},
|
| 627 |
+
{
|
| 628 |
+
"cell_type": "code",
|
| 629 |
+
"execution_count": null,
|
| 630 |
+
"metadata": {},
|
| 631 |
+
"outputs": [],
|
| 632 |
+
"source": [
|
| 633 |
+
"# Phân bố dạng câu trả lời: yes/no vs. khác\n",
|
| 634 |
+
"def classify_answer(ans_list):\n",
|
| 635 |
+
" if not isinstance(ans_list, list) or len(ans_list) == 0:\n",
|
| 636 |
+
" return \"no answer\"\n",
|
| 637 |
+
" a = ans_list[0].strip().lower()\n",
|
| 638 |
+
" if a in [\"yes\", \"no\"]:\n",
|
| 639 |
+
" return a\n",
|
| 640 |
+
" return \"open\"\n",
|
| 641 |
+
"\n",
|
| 642 |
+
"vqa_p18[\"ans_type\"] = vqa_p18[\"answer\"].apply(classify_answer)\n",
|
| 643 |
+
"\n",
|
| 644 |
+
"ans_counts = vqa_p18[\"ans_type\"].value_counts()\n",
|
| 645 |
+
"print(\"Answer type distribution:\")\n",
|
| 646 |
+
"print(ans_counts.to_string())\n",
|
| 647 |
+
"\n",
|
| 648 |
+
"fig, ax = plt.subplots(figsize=(6, 3))\n",
|
| 649 |
+
"bars = ax.bar(ans_counts.index, ans_counts.values,\n",
|
| 650 |
+
" color=sns.color_palette(\"Pastel1\", len(ans_counts)))\n",
|
| 651 |
+
"ax.bar_label(bars, fmt=\"%d\")\n",
|
| 652 |
+
"ax.set_title(\"VQA — Answer Type Distribution (p18)\")\n",
|
| 653 |
+
"ax.set_ylabel(\"Count\")\n",
|
| 654 |
+
"plt.tight_layout()\n",
|
| 655 |
+
"plt.show()"
|
| 656 |
+
]
|
| 657 |
+
},
|
| 658 |
+
{
|
| 659 |
+
"cell_type": "markdown",
|
| 660 |
+
"metadata": {},
|
| 661 |
+
"source": [
|
| 662 |
+
"## 8. Gợi ý thêm — Missing data & Data Quality"
|
| 663 |
+
]
|
| 664 |
+
},
|
| 665 |
+
{
|
| 666 |
+
"cell_type": "code",
|
| 667 |
+
"execution_count": null,
|
| 668 |
+
"metadata": {},
|
| 669 |
+
"outputs": [],
|
| 670 |
+
"source": [
|
| 671 |
+
"# 8.1 Tỉ lệ study không có findings / không có impression\n",
|
| 672 |
+
"no_findings = report_df[\"findings\"].isna().sum()\n",
|
| 673 |
+
"no_impression = report_df[\"impression\"].isna().sum()\n",
|
| 674 |
+
"total_studies = len(report_df)\n",
|
| 675 |
+
"\n",
|
| 676 |
+
"print(f\"Studies thiếu Findings : {no_findings:,} / {total_studies:,} ({no_findings/total_studies*100:.1f}%)\")\n",
|
| 677 |
+
"print(f\"Studies thiếu Impression : {no_impression:,} / {total_studies:,} ({no_impression/total_studies*100:.1f}%)\")\n",
|
| 678 |
+
"both_missing = (report_df[\"findings\"].isna() & report_df[\"impression\"].isna()).sum()\n",
|
| 679 |
+
"print(f\"Studies thiếu CẢ HAI : {both_missing:,} / {total_studies:,} ({both_missing/total_studies*100:.1f}%)\")"
|
| 680 |
+
]
|
| 681 |
+
},
|
| 682 |
+
{
|
| 683 |
+
"cell_type": "code",
|
| 684 |
+
"execution_count": null,
|
| 685 |
+
"metadata": {},
|
| 686 |
+
"outputs": [],
|
| 687 |
+
"source": [
|
| 688 |
+
"# 8.2 Tỉ lệ ảnh thiếu ViewPosition\n",
|
| 689 |
+
"missing_view = df[\"ViewPosition\"].isna().sum()\n",
|
| 690 |
+
"print(f\"Ảnh thiếu ViewPosition: {missing_view:,} / {len(df):,} ({missing_view/len(df)*100:.1f}%)\")"
|
| 691 |
+
]
|
| 692 |
+
},
|
| 693 |
+
{
|
| 694 |
+
"cell_type": "code",
|
| 695 |
+
"execution_count": null,
|
| 696 |
+
"metadata": {},
|
| 697 |
+
"outputs": [],
|
| 698 |
+
"source": [
|
| 699 |
+
"# 8.3 Số bệnh nhân (subject_id) trong p18\n",
|
| 700 |
+
"n_subjects = df[\"subject_id\"].nunique()\n",
|
| 701 |
+
"n_studies = df[\"study_id\"].nunique()\n",
|
| 702 |
+
"n_images = df[\"dicom_id\"].nunique()\n",
|
| 703 |
+
"\n",
|
| 704 |
+
"print(f\"Bệnh nhân (subject_id) : {n_subjects:,}\")\n",
|
| 705 |
+
"print(f\"Lần khám (study_id) : {n_studies:,}\")\n",
|
| 706 |
+
"print(f\"Ảnh (dicom_id) : {n_images:,}\")\n",
|
| 707 |
+
"print(f\"\\nTrung bình study/bệnh nhân : {n_studies/n_subjects:.2f}\")\n",
|
| 708 |
+
"print(f\"Trung bình ảnh/bệnh nhân : {n_images/n_subjects:.2f}\")"
|
| 709 |
+
]
|
| 710 |
+
},
|
| 711 |
+
{
|
| 712 |
+
"cell_type": "code",
|
| 713 |
+
"execution_count": null,
|
| 714 |
+
"metadata": {},
|
| 715 |
+
"outputs": [],
|
| 716 |
+
"source": [
|
| 717 |
+
"# 8.4 Study distribution per patient\n",
|
| 718 |
+
"studies_per_patient = df.groupby(\"subject_id\")[\"study_id\"].nunique()\n",
|
| 719 |
+
"print(\"Studies per patient stats:\")\n",
|
| 720 |
+
"print(studies_per_patient.describe().round(1).to_string())\n",
|
| 721 |
+
"\n",
|
| 722 |
+
"fig, ax = plt.subplots(figsize=(9, 3))\n",
|
| 723 |
+
"spp = studies_per_patient.value_counts().sort_index()\n",
|
| 724 |
+
"ax.bar(spp.index.astype(str), spp.values, color=\"mediumpurple\")\n",
|
| 725 |
+
"ax.set_xlabel(\"Số study mỗi bệnh nhân\")\n",
|
| 726 |
+
"ax.set_ylabel(\"Số bệnh nhân\")\n",
|
| 727 |
+
"ax.set_title(\"Phân bố số lần khám mỗi bệnh nhân — p18\")\n",
|
| 728 |
+
"ax.xaxis.set_major_locator(mticker.MaxNLocator(integer=True, nbins=20))\n",
|
| 729 |
+
"plt.tight_layout()\n",
|
| 730 |
+
"plt.show()"
|
| 731 |
+
]
|
| 732 |
+
},
|
| 733 |
+
{
|
| 734 |
+
"cell_type": "code",
|
| 735 |
+
"execution_count": null,
|
| 736 |
+
"metadata": {},
|
| 737 |
+
"outputs": [],
|
| 738 |
+
"source": [
|
| 739 |
+
"# 8.5 Image resolution distribution (nếu có cột Rows/Columns trong metadata)\n",
|
| 740 |
+
"if \"Rows\" in df.columns and \"Columns\" in df.columns:\n",
|
| 741 |
+
" print(\"Image resolution stats:\")\n",
|
| 742 |
+
" print(df[[\"Rows\", \"Columns\"]].describe().round(0).to_string())\n",
|
| 743 |
+
"\n",
|
| 744 |
+
" res_counts = df.groupby([\"Rows\", \"Columns\"]).size().sort_values(ascending=False).head(15)\n",
|
| 745 |
+
" print(\"\\nTop-15 resolutions:\")\n",
|
| 746 |
+
" print(res_counts.to_string())\nelse:\n",
|
| 747 |
+
" print(\"Cột Rows/Columns không có trong metadata.\")"
|
| 748 |
+
]
|
| 749 |
+
},
|
| 750 |
+
{
|
| 751 |
+
"cell_type": "markdown",
|
| 752 |
+
"metadata": {},
|
| 753 |
+
"source": [
|
| 754 |
+
"## 9. Tóm tắt (Summary)"
|
| 755 |
+
]
|
| 756 |
+
},
|
| 757 |
+
{
|
| 758 |
+
"cell_type": "code",
|
| 759 |
+
"execution_count": null,
|
| 760 |
+
"metadata": {},
|
| 761 |
+
"outputs": [],
|
| 762 |
+
"source": [
|
| 763 |
+
"print(\"=\"*55)\n",
|
| 764 |
+
"print(\" SUMMARY — MIMIC-CXR Subset p18\")\n",
|
| 765 |
+
"print(\"=\"*55)\n",
|
| 766 |
+
"print(f\" Bệnh nhân : {n_subjects:,}\")\n",
|
| 767 |
+
"print(f\" Studies (reports) : {n_studies:,}\")\n",
|
| 768 |
+
"print(f\" Ảnh (dicom/jpg) : {n_images:,}\")\n",
|
| 769 |
+
"print()\n",
|
| 770 |
+
"for sp in [\"train\", \"validate\", \"test\"]:\n",
|
| 771 |
+
" ni = img_per_split.get(sp, 0)\n",
|
| 772 |
+
" ns = study_per_split.get(sp, 0)\n",
|
| 773 |
+
" print(f\" [{sp:>8}] ảnh={ni:>5,} studies={ns:>5,}\")\n",
|
| 774 |
+
"print()\n",
|
| 775 |
+
"print(f\" Frontal (PA+AP) : {view_counts.get('PA',0)+view_counts.get('AP',0):,} ảnh\")\n",
|
| 776 |
+
"print(f\" Lateral : {view_counts.get('LL',0)+view_counts.get('LATERAL',0):,} ảnh\")\n",
|
| 777 |
+
"print(f\" Findings available : {report_df['findings'].notna().sum():,}/{total_studies:,}\")\n",
|
| 778 |
+
"print(f\" Impression available : {report_df['impression'].notna().sum():,}/{total_studies:,}\")\n",
|
| 779 |
+
"print(f\" VQA samples (p18) : {len(vqa_p18):,}\")\n",
|
| 780 |
+
"print(\"=\"*55)"
|
| 781 |
+
]
|
| 782 |
+
}
|
| 783 |
+
],
|
| 784 |
+
"metadata": {
|
| 785 |
+
"kernelspec": {
|
| 786 |
+
"display_name": "Python 3",
|
| 787 |
+
"language": "python",
|
| 788 |
+
"name": "python3"
|
| 789 |
+
},
|
| 790 |
+
"language_info": {
|
| 791 |
+
"name": "python",
|
| 792 |
+
"version": "3.10.0"
|
| 793 |
+
}
|
| 794 |
+
},
|
| 795 |
+
"nbformat": 4,
|
| 796 |
+
"nbformat_minor": 5
|
| 797 |
+
}
|
data/eda_reports.ipynb
ADDED
|
@@ -0,0 +1,741 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# EDA — MIMIC-CXR Reports\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"Phân tích chuyên sâu toàn bộ report `.txt` trong MIMIC-CXR:\n",
|
| 10 |
+
"- Thống kê tất cả loại section header thực tế\n",
|
| 11 |
+
"- Tỉ lệ report có/thiếu findings, impression theo subset\n",
|
| 12 |
+
"- Phân phối độ dài findings & impression\n",
|
| 13 |
+
"- Parser cập nhật xử lý đầy đủ alias (CONCLUSION, FINDINGS AND IMPRESSION, v.v.)"
|
| 14 |
+
]
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"cell_type": "markdown",
|
| 18 |
+
"metadata": {},
|
| 19 |
+
"source": [
|
| 20 |
+
"## 0. Config"
|
| 21 |
+
]
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"cell_type": "code",
|
| 25 |
+
"execution_count": null,
|
| 26 |
+
"metadata": {},
|
| 27 |
+
"outputs": [],
|
| 28 |
+
"source": [
|
| 29 |
+
"from pathlib import Path\n",
|
| 30 |
+
"\n",
|
| 31 |
+
"CXR_ROOT = Path(r\"D:\\USTH\\KLTN\\cxr-vlm-data\\mimic-cxr-reports\") # files/p10…p19/\n",
|
| 32 |
+
"SPLIT_CSV = Path(r\"D:\\USTH\\KLTN\\cxr-vlm-data\\mimic-cxr-2.0.0-split.csv\")\n",
|
| 33 |
+
"\n",
|
| 34 |
+
"# None = parse hết ~227k, số nguyên = sample nhanh\n",
|
| 35 |
+
"SAMPLE_SIZE = None\n",
|
| 36 |
+
"\n",
|
| 37 |
+
"for name, p in [(\"CXR_ROOT\", CXR_ROOT), (\"SPLIT_CSV\", SPLIT_CSV)]:\n",
|
| 38 |
+
" print(f\" {'✓' if p.exists() else '✗ NOT FOUND'} {name}: {p}\")"
|
| 39 |
+
]
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"cell_type": "code",
|
| 43 |
+
"execution_count": null,
|
| 44 |
+
"metadata": {},
|
| 45 |
+
"outputs": [],
|
| 46 |
+
"source": [
|
| 47 |
+
"import re\n",
|
| 48 |
+
"import pandas as pd\n",
|
| 49 |
+
"import numpy as np\n",
|
| 50 |
+
"import matplotlib.pyplot as plt\n",
|
| 51 |
+
"import matplotlib.ticker as mticker\n",
|
| 52 |
+
"import seaborn as sns\n",
|
| 53 |
+
"from collections import Counter\n",
|
| 54 |
+
"\n",
|
| 55 |
+
"sns.set_theme(style=\"whitegrid\", palette=\"muted\")\n",
|
| 56 |
+
"plt.rcParams[\"figure.dpi\"] = 120\n",
|
| 57 |
+
"\n",
|
| 58 |
+
"ALL_SUBSETS = [f\"p{i}\" for i in range(10, 20)]\n",
|
| 59 |
+
"print(\"Ready.\")"
|
| 60 |
+
]
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"cell_type": "markdown",
|
| 64 |
+
"metadata": {},
|
| 65 |
+
"source": [
|
| 66 |
+
"## 1. Parser — ALL-CAPS header detection\n",
|
| 67 |
+
"\n",
|
| 68 |
+
"Quy luật: mọi section header trong MIMIC-CXR đều **VIẾT HOA TOÀN BỘ** và kết thúc bằng `:` \n",
|
| 69 |
+
"→ dùng regex detect tất cả, sau đó phân loại theo nhóm."
|
| 70 |
+
]
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"cell_type": "code",
|
| 74 |
+
"execution_count": null,
|
| 75 |
+
"metadata": {},
|
| 76 |
+
"outputs": [],
|
| 77 |
+
"source": [
|
| 78 |
+
"# Regex: dòng bắt đầu bằng chuỗi ALL-CAPS rồi đến \":\"\n",
|
| 79 |
+
"SECTION_RE = re.compile(r'^[ \\t]*([A-Z][A-Z ,/()\\.\\-]{1,70}?):\\s*', re.MULTILINE)\n",
|
| 80 |
+
"\n",
|
| 81 |
+
"# ── Nhóm IMPRESSION (nội dung kết luận) ─────────────────────────────────────\n",
|
| 82 |
+
"IMPRESSION_KEYWORDS = {\n",
|
| 83 |
+
" \"IMPRESSION\",\n",
|
| 84 |
+
" \"CONCLUSION\",\n",
|
| 85 |
+
" \"CONCLUSIONS\",\n",
|
| 86 |
+
" \"FINDINGS AND IMPRESSION\",\n",
|
| 87 |
+
" \"FINDINGS/IMPRESSION\",\n",
|
| 88 |
+
" \"PROVISIONAL FINDINGS IMPRESSION (PFI)\",\n",
|
| 89 |
+
" \"PFI\",\n",
|
| 90 |
+
" \"WET READ\", # quick impression trước khi có final report\n",
|
| 91 |
+
" \"RECOMMENDATION\",\n",
|
| 92 |
+
" \"RECOMMENDATION(S)\",\n",
|
| 93 |
+
" \"RECOMMENDATIONS\",\n",
|
| 94 |
+
"}\n",
|
| 95 |
+
"\n",
|
| 96 |
+
"# ── Nhóm FINDINGS (mô tả hình ảnh) ──────────────────────────────────────────\n",
|
| 97 |
+
"FINDINGS_KEYWORDS = {\n",
|
| 98 |
+
" \"FINDINGS\",\n",
|
| 99 |
+
" \"REPORT\",\n",
|
| 100 |
+
"}\n",
|
| 101 |
+
"\n",
|
| 102 |
+
"# Patterns dạng view description (findings không tường minh)\n",
|
| 103 |
+
"FINDINGS_VIEW_RE = re.compile(\n",
|
| 104 |
+
" r'(VIEW|VIEWS|RADIOGRAPH|RADIOGRAPHS|CHEST|PORTABLE|FRONTAL|LATERAL|PA AND|AP AND|UPRIGHT|SUPINE|SEMI)',\n",
|
| 105 |
+
" re.IGNORECASE\n",
|
| 106 |
+
")\n",
|
| 107 |
+
"\n",
|
| 108 |
+
"# ── Admin headers (bỏ qua khi fallback) ─────────────────────────────────────\n",
|
| 109 |
+
"ADMIN_KEYWORDS = {\n",
|
| 110 |
+
" \"EXAMINATION\", \"EXAM\", \"INDICATION\", \"INDICATIONS\",\n",
|
| 111 |
+
" \"CLINICAL INDICATION\", \"CLINICAL HISTORY\", \"CLINICAL INFORMATION\",\n",
|
| 112 |
+
" \"TECHNIQUE\", \"COMPARISON\", \"COMPARISONS\", \"COMPARISON EXAM\",\n",
|
| 113 |
+
" \"COMPARISON FILM\", \"COMPARISON STUDY\", \"REFERENCE EXAM\",\n",
|
| 114 |
+
" \"HISTORY\", \"PATIENT HISTORY\",\n",
|
| 115 |
+
" \"REASON\", \"REASON FOR EXAM\", \"REASON FOR EXAMINATION\",\n",
|
| 116 |
+
" \"TYPE OF EXAMINATION\", \"PROCEDURE\",\n",
|
| 117 |
+
" \"NOTIFICATION\", \"NOTIFICATIONS\", \"ADDENDUM\",\n",
|
| 118 |
+
" \"STUDY\", \"DATE\", \"CC\", \"NOTE\", \"COMMENT\", \"COMMENTS\",\n",
|
| 119 |
+
" \"FINAL REPORT\",\n",
|
| 120 |
+
"}\n",
|
| 121 |
+
"\n",
|
| 122 |
+
"\n",
|
| 123 |
+
"def classify_header(h: str) -> str:\n",
|
| 124 |
+
" \"\"\"Phân loại header vào: findings / impression / admin / view_desc / other.\"\"\"\n",
|
| 125 |
+
" h = h.upper().strip()\n",
|
| 126 |
+
" if h in FINDINGS_KEYWORDS or \"FINDING\" in h:\n",
|
| 127 |
+
" return \"findings\"\n",
|
| 128 |
+
" if h in IMPRESSION_KEYWORDS or \"IMPRESSION\" in h or \"CONCLUSION\" in h:\n",
|
| 129 |
+
" return \"impression\"\n",
|
| 130 |
+
" if h in ADMIN_KEYWORDS:\n",
|
| 131 |
+
" return \"admin\"\n",
|
| 132 |
+
" if FINDINGS_VIEW_RE.search(h):\n",
|
| 133 |
+
" return \"view_desc\" # potential findings\n",
|
| 134 |
+
" return \"other\"\n",
|
| 135 |
+
"\n",
|
| 136 |
+
"\n",
|
| 137 |
+
"def parse_report(txt_path: Path) -> dict:\n",
|
| 138 |
+
" \"\"\"\n",
|
| 139 |
+
" Trả về dict:\n",
|
| 140 |
+
" findings : str | None\n",
|
| 141 |
+
" impression : str | None\n",
|
| 142 |
+
" sections : list of (header, category, content)\n",
|
| 143 |
+
" \"\"\"\n",
|
| 144 |
+
" try:\n",
|
| 145 |
+
" text = txt_path.read_text(encoding=\"utf-8\", errors=\"ignore\")\n",
|
| 146 |
+
" except FileNotFoundError:\n",
|
| 147 |
+
" return {\"findings\": None, \"impression\": None, \"sections\": []}\n",
|
| 148 |
+
"\n",
|
| 149 |
+
" matches = list(SECTION_RE.finditer(text))\n",
|
| 150 |
+
" if not matches:\n",
|
| 151 |
+
" return {\"findings\": None, \"impression\": None, \"sections\": []}\n",
|
| 152 |
+
"\n",
|
| 153 |
+
" sections = []\n",
|
| 154 |
+
" for i, m in enumerate(matches):\n",
|
| 155 |
+
" header = m.group(1).strip()\n",
|
| 156 |
+
" start = m.end()\n",
|
| 157 |
+
" end = matches[i + 1].start() if i + 1 < len(matches) else len(text)\n",
|
| 158 |
+
" content = text[start:end].strip()\n",
|
| 159 |
+
" cat = classify_header(header)\n",
|
| 160 |
+
" sections.append((header, cat, content))\n",
|
| 161 |
+
"\n",
|
| 162 |
+
" findings = impression = None\n",
|
| 163 |
+
"\n",
|
| 164 |
+
" # Pass 1: tìm tường minh\n",
|
| 165 |
+
" for header, cat, content in sections:\n",
|
| 166 |
+
" if cat == \"findings\" and findings is None:\n",
|
| 167 |
+
" findings = content or None\n",
|
| 168 |
+
" elif cat == \"impression\" and impression is None:\n",
|
| 169 |
+
" impression = content or None\n",
|
| 170 |
+
"\n",
|
| 171 |
+
" # Pass 2: fallback findings từ view_desc\n",
|
| 172 |
+
" if findings is None:\n",
|
| 173 |
+
" for header, cat, content in sections:\n",
|
| 174 |
+
" if cat == \"view_desc\" and content:\n",
|
| 175 |
+
" findings = content\n",
|
| 176 |
+
" break\n",
|
| 177 |
+
"\n",
|
| 178 |
+
" return {\"findings\": findings, \"impression\": impression, \"sections\": sections}\n",
|
| 179 |
+
"\n",
|
| 180 |
+
"\n",
|
| 181 |
+
"print(\"Parser defined.\")"
|
| 182 |
+
]
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"cell_type": "markdown",
|
| 186 |
+
"metadata": {},
|
| 187 |
+
"source": [
|
| 188 |
+
"## 2. Load & parse tất cả reports"
|
| 189 |
+
]
|
| 190 |
+
},
|
| 191 |
+
{
|
| 192 |
+
"cell_type": "code",
|
| 193 |
+
"execution_count": null,
|
| 194 |
+
"metadata": {},
|
| 195 |
+
"outputs": [],
|
| 196 |
+
"source": [
|
| 197 |
+
"# Lấy danh sách study từ split.csv để biết subset của từng study\n",
|
| 198 |
+
"split_df = pd.read_csv(SPLIT_CSV)\n",
|
| 199 |
+
"split_df[\"subset\"] = \"p\" + split_df[\"subject_id\"].astype(str).str[:2]\n",
|
| 200 |
+
"\n",
|
| 201 |
+
"studies = (\n",
|
| 202 |
+
" split_df[[\"subject_id\", \"study_id\", \"subset\", \"split\"]]\n",
|
| 203 |
+
" .drop_duplicates(\"study_id\")\n",
|
| 204 |
+
" .reset_index(drop=True)\n",
|
| 205 |
+
")\n",
|
| 206 |
+
"\n",
|
| 207 |
+
"if SAMPLE_SIZE:\n",
|
| 208 |
+
" studies = studies.sample(n=min(SAMPLE_SIZE, len(studies)), random_state=42).reset_index(drop=True)\n",
|
| 209 |
+
" print(f\"Sample: {len(studies):,} studies\")\n",
|
| 210 |
+
"else:\n",
|
| 211 |
+
" print(f\"Total studies to parse: {len(studies):,}\")"
|
| 212 |
+
]
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"cell_type": "code",
|
| 216 |
+
"execution_count": null,
|
| 217 |
+
"metadata": {},
|
| 218 |
+
"outputs": [],
|
| 219 |
+
"source": [
|
| 220 |
+
"records = []\n",
|
| 221 |
+
"header_counter = Counter()\n",
|
| 222 |
+
"\n",
|
| 223 |
+
"for _, row in studies.iterrows():\n",
|
| 224 |
+
" sid = str(row[\"subject_id\"])\n",
|
| 225 |
+
" stid = str(row[\"study_id\"])\n",
|
| 226 |
+
" subset = row[\"subset\"]\n",
|
| 227 |
+
" split = row[\"split\"]\n",
|
| 228 |
+
" path = CXR_ROOT / \"files\" / subset / f\"p{sid}\" / f\"s{stid}.txt\"\n",
|
| 229 |
+
"\n",
|
| 230 |
+
" result = parse_report(path)\n",
|
| 231 |
+
"\n",
|
| 232 |
+
" for header, cat, _ in result[\"sections\"]:\n",
|
| 233 |
+
" header_counter[header.upper()] += 1\n",
|
| 234 |
+
"\n",
|
| 235 |
+
" records.append({\n",
|
| 236 |
+
" \"study_id\": stid,\n",
|
| 237 |
+
" \"subject_id\": sid,\n",
|
| 238 |
+
" \"subset\": subset,\n",
|
| 239 |
+
" \"split\": split,\n",
|
| 240 |
+
" \"findings\": result[\"findings\"],\n",
|
| 241 |
+
" \"impression\": result[\"impression\"],\n",
|
| 242 |
+
" \"n_sections\": len(result[\"sections\"]),\n",
|
| 243 |
+
" \"section_headers\": \"|\".join(h for h, _, _ in result[\"sections\"]),\n",
|
| 244 |
+
" })\n",
|
| 245 |
+
"\n",
|
| 246 |
+
"df = pd.DataFrame(records)\n",
|
| 247 |
+
"df[\"findings_len\"] = df[\"findings\"].str.split().str.len()\n",
|
| 248 |
+
"df[\"impression_len\"] = df[\"impression\"].str.split().str.len()\n",
|
| 249 |
+
"\n",
|
| 250 |
+
"total = len(df)\n",
|
| 251 |
+
"has_f = df[\"findings\"].notna().sum()\n",
|
| 252 |
+
"has_i = df[\"impression\"].notna().sum()\n",
|
| 253 |
+
"has_both = (df[\"findings\"].notna() & df[\"impression\"].notna()).sum()\n",
|
| 254 |
+
"has_neither = (df[\"findings\"].isna() & df[\"impression\"].isna()).sum()\n",
|
| 255 |
+
"\n",
|
| 256 |
+
"print(f\"Total studies parsed : {total:,}\")\n",
|
| 257 |
+
"print(f\"Has findings : {has_f:,} ({has_f/total*100:.1f}%)\")\n",
|
| 258 |
+
"print(f\"Has impression : {has_i:,} ({has_i/total*100:.1f}%)\")\n",
|
| 259 |
+
"print(f\"Has both : {has_both:,} ({has_both/total*100:.1f}%)\")\n",
|
| 260 |
+
"print(f\"Has neither : {has_neither:,} ({has_neither/total*100:.1f}%)\")"
|
| 261 |
+
]
|
| 262 |
+
},
|
| 263 |
+
{
|
| 264 |
+
"cell_type": "markdown",
|
| 265 |
+
"metadata": {},
|
| 266 |
+
"source": [
|
| 267 |
+
"## 3. Thống kê tất cả section headers"
|
| 268 |
+
]
|
| 269 |
+
},
|
| 270 |
+
{
|
| 271 |
+
"cell_type": "code",
|
| 272 |
+
"execution_count": null,
|
| 273 |
+
"metadata": {},
|
| 274 |
+
"outputs": [],
|
| 275 |
+
"source": [
|
| 276 |
+
"# Bảng đầy đủ tất cả headers + category\n",
|
| 277 |
+
"header_rows = []\n",
|
| 278 |
+
"for h, cnt in header_counter.most_common():\n",
|
| 279 |
+
" header_rows.append({\n",
|
| 280 |
+
" \"header\": h,\n",
|
| 281 |
+
" \"count\": cnt,\n",
|
| 282 |
+
" \"category\": classify_header(h),\n",
|
| 283 |
+
" \"pct\": cnt / total * 100\n",
|
| 284 |
+
" })\n",
|
| 285 |
+
"\n",
|
| 286 |
+
"header_df = pd.DataFrame(header_rows)\n",
|
| 287 |
+
"\n",
|
| 288 |
+
"print(f\"Distinct section headers: {len(header_df)}\")\n",
|
| 289 |
+
"print(\"\\n=== Top 50 headers ===\")\n",
|
| 290 |
+
"print(header_df.head(50).to_string(index=False))"
|
| 291 |
+
]
|
| 292 |
+
},
|
| 293 |
+
{
|
| 294 |
+
"cell_type": "code",
|
| 295 |
+
"execution_count": null,
|
| 296 |
+
"metadata": {},
|
| 297 |
+
"outputs": [],
|
| 298 |
+
"source": [
|
| 299 |
+
"# Phân bố theo category\n",
|
| 300 |
+
"cat_summary = header_df.groupby(\"category\")[\"count\"].sum().sort_values(ascending=False)\n",
|
| 301 |
+
"print(\"=== Tổng count theo category ===\")\n",
|
| 302 |
+
"print(cat_summary.to_string())\n",
|
| 303 |
+
"\n",
|
| 304 |
+
"fig, axes = plt.subplots(1, 2, figsize=(13, 4))\n",
|
| 305 |
+
"\n",
|
| 306 |
+
"# Bar: category totals\n",
|
| 307 |
+
"colors = {\"findings\": \"#4C72B0\", \"impression\": \"#DD8452\",\n",
|
| 308 |
+
" \"admin\": \"#8c8c8c\", \"view_desc\": \"#55A868\", \"other\": \"#C44E52\"}\n",
|
| 309 |
+
"cat_colors = [colors.get(c, \"gray\") for c in cat_summary.index]\n",
|
| 310 |
+
"bars = axes[0].bar(cat_summary.index, cat_summary.values, color=cat_colors)\n",
|
| 311 |
+
"axes[0].bar_label(bars, fmt=\"%d\")\n",
|
| 312 |
+
"axes[0].set_title(\"Tổng số lần xuất hiện theo category\")\n",
|
| 313 |
+
"axes[0].set_ylabel(\"Count\")\n",
|
| 314 |
+
"axes[0].tick_params(axis=\"x\", rotation=20)\n",
|
| 315 |
+
"\n",
|
| 316 |
+
"# Bar: số header distinct mỗi category\n",
|
| 317 |
+
"cat_distinct = header_df.groupby(\"category\").size().sort_values(ascending=False)\n",
|
| 318 |
+
"bars2 = axes[1].bar(cat_distinct.index, cat_distinct.values,\n",
|
| 319 |
+
" color=[colors.get(c, \"gray\") for c in cat_distinct.index])\n",
|
| 320 |
+
"axes[1].bar_label(bars2, fmt=\"%d\")\n",
|
| 321 |
+
"axes[1].set_title(\"Số header phân biệt mỗi category\")\n",
|
| 322 |
+
"axes[1].set_ylabel(\"Distinct headers\")\n",
|
| 323 |
+
"axes[1].tick_params(axis=\"x\", rotation=20)\n",
|
| 324 |
+
"\n",
|
| 325 |
+
"plt.suptitle(\"Section Header Categories\", fontsize=13)\n",
|
| 326 |
+
"plt.tight_layout()\n",
|
| 327 |
+
"plt.show()"
|
| 328 |
+
]
|
| 329 |
+
},
|
| 330 |
+
{
|
| 331 |
+
"cell_type": "code",
|
| 332 |
+
"execution_count": null,
|
| 333 |
+
"metadata": {},
|
| 334 |
+
"outputs": [],
|
| 335 |
+
"source": [
|
| 336 |
+
"# Top headers mỗi category\n",
|
| 337 |
+
"for cat in [\"findings\", \"impression\", \"view_desc\", \"other\"]:\n",
|
| 338 |
+
" sub = header_df[header_df[\"category\"] == cat].head(15)\n",
|
| 339 |
+
" print(f\"\\n=== [{cat}] Top headers ===\")\n",
|
| 340 |
+
" print(sub[[\"header\", \"count\", \"pct\"]].to_string(index=False))"
|
| 341 |
+
]
|
| 342 |
+
},
|
| 343 |
+
{
|
| 344 |
+
"cell_type": "code",
|
| 345 |
+
"execution_count": null,
|
| 346 |
+
"metadata": {},
|
| 347 |
+
"outputs": [],
|
| 348 |
+
"source": [
|
| 349 |
+
"# Top 20 headers — horizontal bar\n",
|
| 350 |
+
"top20 = header_df.head(20).copy()\n",
|
| 351 |
+
"top20_colors = [colors.get(c, \"gray\") for c in top20[\"category\"]]\n",
|
| 352 |
+
"\n",
|
| 353 |
+
"fig, ax = plt.subplots(figsize=(10, 7))\n",
|
| 354 |
+
"bars = ax.barh(top20[\"header\"][::-1], top20[\"count\"][::-1], color=top20_colors[::-1])\n",
|
| 355 |
+
"ax.bar_label(bars, fmt=\"%d\", padding=3, fontsize=8)\n",
|
| 356 |
+
"ax.set_xlabel(\"Count\")\n",
|
| 357 |
+
"ax.set_title(\"Top 20 Section Headers (tô màu theo category)\")\n",
|
| 358 |
+
"\n",
|
| 359 |
+
"from matplotlib.patches import Patch\n",
|
| 360 |
+
"legend_elements = [Patch(facecolor=v, label=k) for k, v in colors.items()]\n",
|
| 361 |
+
"ax.legend(handles=legend_elements, loc=\"lower right\", fontsize=9)\n",
|
| 362 |
+
"plt.tight_layout()\n",
|
| 363 |
+
"plt.show()"
|
| 364 |
+
]
|
| 365 |
+
},
|
| 366 |
+
{
|
| 367 |
+
"cell_type": "markdown",
|
| 368 |
+
"metadata": {},
|
| 369 |
+
"source": [
|
| 370 |
+
"## 4. Tỉ lệ có/thiếu Findings & Impression"
|
| 371 |
+
]
|
| 372 |
+
},
|
| 373 |
+
{
|
| 374 |
+
"cell_type": "code",
|
| 375 |
+
"execution_count": null,
|
| 376 |
+
"metadata": {},
|
| 377 |
+
"outputs": [],
|
| 378 |
+
"source": [
|
| 379 |
+
"# Tạo cột status\n",
|
| 380 |
+
"def report_status(row):\n",
|
| 381 |
+
" f = row[\"findings\"] is not None\n",
|
| 382 |
+
" i = row[\"impression\"] is not None\n",
|
| 383 |
+
" if f and i: return \"both\"\n",
|
| 384 |
+
" if f: return \"findings only\"\n",
|
| 385 |
+
" if i: return \"impression only\"\n",
|
| 386 |
+
" return \"neither\"\n",
|
| 387 |
+
"\n",
|
| 388 |
+
"df[\"status\"] = df.apply(report_status, axis=1)\n",
|
| 389 |
+
"\n",
|
| 390 |
+
"status_counts = df[\"status\"].value_counts()\n",
|
| 391 |
+
"print(\"=== Report completeness (full dataset) ===\")\n",
|
| 392 |
+
"for s, c in status_counts.items():\n",
|
| 393 |
+
" print(f\" {s:<20}: {c:>7,} ({c/total*100:.1f}%)\")"
|
| 394 |
+
]
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"cell_type": "code",
|
| 398 |
+
"execution_count": null,
|
| 399 |
+
"metadata": {},
|
| 400 |
+
"outputs": [],
|
| 401 |
+
"source": [
|
| 402 |
+
"fig, axes = plt.subplots(1, 2, figsize=(12, 4))\n",
|
| 403 |
+
"status_order = [\"both\", \"impression only\", \"findings only\", \"neither\"]\n",
|
| 404 |
+
"status_colors = [\"#55A868\", \"#DD8452\", \"#4C72B0\", \"#C44E52\"]\n",
|
| 405 |
+
"vals = [status_counts.get(s, 0) for s in status_order]\n",
|
| 406 |
+
"\n",
|
| 407 |
+
"bars = axes[0].bar(status_order, vals, color=status_colors)\n",
|
| 408 |
+
"axes[0].bar_label(bars, fmt=\"%d\")\n",
|
| 409 |
+
"axes[0].set_title(\"Report completeness (count)\")\n",
|
| 410 |
+
"axes[0].set_ylabel(\"Số report\")\n",
|
| 411 |
+
"axes[0].tick_params(axis=\"x\", rotation=15)\n",
|
| 412 |
+
"\n",
|
| 413 |
+
"axes[1].pie(vals, labels=status_order, autopct=\"%1.1f%%\", colors=status_colors,\n",
|
| 414 |
+
" startangle=140)\n",
|
| 415 |
+
"axes[1].set_title(\"Report completeness (%)\")\n",
|
| 416 |
+
"\n",
|
| 417 |
+
"plt.suptitle(\"Findings & Impression Availability\", fontsize=13)\n",
|
| 418 |
+
"plt.tight_layout()\n",
|
| 419 |
+
"plt.show()"
|
| 420 |
+
]
|
| 421 |
+
},
|
| 422 |
+
{
|
| 423 |
+
"cell_type": "markdown",
|
| 424 |
+
"metadata": {},
|
| 425 |
+
"source": [
|
| 426 |
+
"## 5. Breakdown theo Subset"
|
| 427 |
+
]
|
| 428 |
+
},
|
| 429 |
+
{
|
| 430 |
+
"cell_type": "code",
|
| 431 |
+
"execution_count": null,
|
| 432 |
+
"metadata": {},
|
| 433 |
+
"outputs": [],
|
| 434 |
+
"source": [
|
| 435 |
+
"subset_stats = df.groupby(\"subset\").apply(lambda g: pd.Series({\n",
|
| 436 |
+
" \"total\": len(g),\n",
|
| 437 |
+
" \"has_findings\": g[\"findings\"].notna().sum(),\n",
|
| 438 |
+
" \"has_impression\": g[\"impression\"].notna().sum(),\n",
|
| 439 |
+
" \"has_both\": (g[\"findings\"].notna() & g[\"impression\"].notna()).sum(),\n",
|
| 440 |
+
" \"has_neither\": (g[\"findings\"].isna() & g[\"impression\"].isna()).sum(),\n",
|
| 441 |
+
"})).reindex(ALL_SUBSETS)\n",
|
| 442 |
+
"\n",
|
| 443 |
+
"subset_pct = subset_stats.div(subset_stats[\"total\"], axis=0).mul(100).round(1)\n",
|
| 444 |
+
"\n",
|
| 445 |
+
"print(\"=== Count per subset ===\")\n",
|
| 446 |
+
"print(subset_stats.to_string())\n",
|
| 447 |
+
"print(\"\\n=== % per subset ===\")\n",
|
| 448 |
+
"print(subset_pct[[\"has_findings\",\"has_impression\",\"has_both\",\"has_neither\"]].to_string())"
|
| 449 |
+
]
|
| 450 |
+
},
|
| 451 |
+
{
|
| 452 |
+
"cell_type": "code",
|
| 453 |
+
"execution_count": null,
|
| 454 |
+
"metadata": {},
|
| 455 |
+
"outputs": [],
|
| 456 |
+
"source": [
|
| 457 |
+
"fig, axes = plt.subplots(1, 2, figsize=(15, 5))\n",
|
| 458 |
+
"\n",
|
| 459 |
+
"# Stacked % bar\n",
|
| 460 |
+
"status_per_subset = (\n",
|
| 461 |
+
" df.groupby([\"subset\", \"status\"]).size()\n",
|
| 462 |
+
" .unstack(fill_value=0)\n",
|
| 463 |
+
" .reindex(ALL_SUBSETS, fill_value=0)\n",
|
| 464 |
+
")\n",
|
| 465 |
+
"# Tỉ lệ %\n",
|
| 466 |
+
"status_pct_subset = status_per_subset.div(status_per_subset.sum(axis=1), axis=0) * 100\n",
|
| 467 |
+
"status_pct_subset = status_pct_subset.reindex(\n",
|
| 468 |
+
" columns=[c for c in status_order if c in status_pct_subset.columns]\n",
|
| 469 |
+
")\n",
|
| 470 |
+
"status_pct_subset.plot(\n",
|
| 471 |
+
" kind=\"bar\", stacked=True, ax=axes[0],\n",
|
| 472 |
+
" color=[status_colors[status_order.index(c)] for c in status_pct_subset.columns],\n",
|
| 473 |
+
" width=0.75\n",
|
| 474 |
+
")\n",
|
| 475 |
+
"axes[0].set_title(\"Report completeness (%) theo subset\")\n",
|
| 476 |
+
"axes[0].set_ylabel(\"%\")\n",
|
| 477 |
+
"axes[0].set_ylim(0, 105)\n",
|
| 478 |
+
"axes[0].tick_params(axis=\"x\", rotation=0)\n",
|
| 479 |
+
"axes[0].legend(loc=\"lower right\", fontsize=8)\n",
|
| 480 |
+
"\n",
|
| 481 |
+
"# Heatmap tỉ lệ % has_findings / has_impression / has_both / has_neither\n",
|
| 482 |
+
"heatmap_data = subset_pct[[\"has_findings\",\"has_impression\",\"has_both\",\"has_neither\"]]\n",
|
| 483 |
+
"sns.heatmap(heatmap_data, annot=True, fmt=\".1f\", cmap=\"RdYlGn\",\n",
|
| 484 |
+
" linewidths=0.5, ax=axes[1], vmin=0, vmax=100,\n",
|
| 485 |
+
" cbar_kws={\"label\": \"%\"})\n",
|
| 486 |
+
"axes[1].set_title(\"Tỉ lệ (%) completeness mỗi subset\")\n",
|
| 487 |
+
"\n",
|
| 488 |
+
"plt.suptitle(\"Report Completeness per Subset\", fontsize=13)\n",
|
| 489 |
+
"plt.tight_layout()\n",
|
| 490 |
+
"plt.show()"
|
| 491 |
+
]
|
| 492 |
+
},
|
| 493 |
+
{
|
| 494 |
+
"cell_type": "code",
|
| 495 |
+
"execution_count": null,
|
| 496 |
+
"metadata": {},
|
| 497 |
+
"outputs": [],
|
| 498 |
+
"source": [
|
| 499 |
+
"# Breakdown theo split (train/validate/test)\n",
|
| 500 |
+
"split_stats = df.groupby(\"split\").apply(lambda g: pd.Series({\n",
|
| 501 |
+
" \"total\": len(g),\n",
|
| 502 |
+
" \"has_findings %\": g[\"findings\"].notna().mean() * 100,\n",
|
| 503 |
+
" \"has_impression %\": g[\"impression\"].notna().mean() * 100,\n",
|
| 504 |
+
" \"has_both %\": (g[\"findings\"].notna() & g[\"impression\"].notna()).mean() * 100,\n",
|
| 505 |
+
" \"has_neither %\": (g[\"findings\"].isna() & g[\"impression\"].isna()).mean() * 100,\n",
|
| 506 |
+
"})).reindex([\"train\", \"validate\", \"test\"])\n",
|
| 507 |
+
"\n",
|
| 508 |
+
"print(\"=== Completeness by split ===\")\n",
|
| 509 |
+
"print(split_stats.round(1).to_string())"
|
| 510 |
+
]
|
| 511 |
+
},
|
| 512 |
+
{
|
| 513 |
+
"cell_type": "markdown",
|
| 514 |
+
"metadata": {},
|
| 515 |
+
"source": [
|
| 516 |
+
"## 6. Phân phối độ dài Findings & Impression"
|
| 517 |
+
]
|
| 518 |
+
},
|
| 519 |
+
{
|
| 520 |
+
"cell_type": "code",
|
| 521 |
+
"execution_count": null,
|
| 522 |
+
"metadata": {},
|
| 523 |
+
"outputs": [],
|
| 524 |
+
"source": [
|
| 525 |
+
"print(\"=== Findings word count ===\")\n",
|
| 526 |
+
"print(df[\"findings_len\"].describe().round(1).to_string())\n",
|
| 527 |
+
"print(\"\\n=== Impression word count ===\")\n",
|
| 528 |
+
"print(df[\"impression_len\"].describe().round(1).to_string())"
|
| 529 |
+
]
|
| 530 |
+
},
|
| 531 |
+
{
|
| 532 |
+
"cell_type": "code",
|
| 533 |
+
"execution_count": null,
|
| 534 |
+
"metadata": {},
|
| 535 |
+
"outputs": [],
|
| 536 |
+
"source": [
|
| 537 |
+
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
|
| 538 |
+
"for ax, col, title, color in zip(\n",
|
| 539 |
+
" axes,\n",
|
| 540 |
+
" [\"findings_len\", \"impression_len\"],\n",
|
| 541 |
+
" [\"Findings — độ dài (số từ)\", \"Impression — độ dài (số từ)\"],\n",
|
| 542 |
+
" [\"#4C72B0\", \"#DD8452\"]\n",
|
| 543 |
+
"):\n",
|
| 544 |
+
" data = df[col].dropna()\n",
|
| 545 |
+
" p99 = data.quantile(0.99)\n",
|
| 546 |
+
" ax.hist(data[data <= p99], bins=60, color=color, edgecolor=\"white\", alpha=0.85)\n",
|
| 547 |
+
" ax.axvline(data.median(), color=\"black\", ls=\"--\", lw=1.3, label=f\"Median={data.median():.0f}\")\n",
|
| 548 |
+
" ax.axvline(data.mean(), color=\"gray\", ls=\":\", lw=1.3, label=f\"Mean={data.mean():.0f}\")\n",
|
| 549 |
+
" ax.set_title(title)\n",
|
| 550 |
+
" ax.set_xlabel(\"Số từ\")\n",
|
| 551 |
+
" ax.set_ylabel(\"Số report\")\n",
|
| 552 |
+
" ax.legend(fontsize=9)\n",
|
| 553 |
+
" ax.text(0.97, 0.95, f\"n={len(data):,}\\n(≤p99={p99:.0f}w)\",\n",
|
| 554 |
+
" transform=ax.transAxes, ha=\"right\", va=\"top\", fontsize=8, color=\"gray\")\n",
|
| 555 |
+
"\n",
|
| 556 |
+
"plt.suptitle(\"Phân phối độ dài Findings & Impression\", fontsize=13)\n",
|
| 557 |
+
"plt.tight_layout()\n",
|
| 558 |
+
"plt.show()"
|
| 559 |
+
]
|
| 560 |
+
},
|
| 561 |
+
{
|
| 562 |
+
"cell_type": "code",
|
| 563 |
+
"execution_count": null,
|
| 564 |
+
"metadata": {},
|
| 565 |
+
"outputs": [],
|
| 566 |
+
"source": [
|
| 567 |
+
"# Boxplot: Findings vs Impression\n",
|
| 568 |
+
"combined = pd.concat([\n",
|
| 569 |
+
" df[[\"findings_len\"]].rename(columns={\"findings_len\": \"words\"}).assign(section=\"Findings\"),\n",
|
| 570 |
+
" df[[\"impression_len\"]].rename(columns={\"impression_len\": \"words\"}).assign(section=\"Impression\"),\n",
|
| 571 |
+
"]).dropna()\n",
|
| 572 |
+
"\n",
|
| 573 |
+
"fig, ax = plt.subplots(figsize=(7, 4))\n",
|
| 574 |
+
"sns.boxplot(data=combined, x=\"section\", y=\"words\",\n",
|
| 575 |
+
" palette=[\"#4C72B0\", \"#DD8452\"], showfliers=False, ax=ax)\n",
|
| 576 |
+
"ax.set_title(\"Findings vs Impression — độ dài (no outliers)\")\n",
|
| 577 |
+
"ax.set_ylabel(\"Số từ\")\n",
|
| 578 |
+
"plt.tight_layout()\n",
|
| 579 |
+
"plt.show()"
|
| 580 |
+
]
|
| 581 |
+
},
|
| 582 |
+
{
|
| 583 |
+
"cell_type": "code",
|
| 584 |
+
"execution_count": null,
|
| 585 |
+
"metadata": {},
|
| 586 |
+
"outputs": [],
|
| 587 |
+
"source": [
|
| 588 |
+
"# Median length per subset\n",
|
| 589 |
+
"med_subset = df.groupby(\"subset\")[[\"findings_len\",\"impression_len\"]].median().reindex(ALL_SUBSETS)\n",
|
| 590 |
+
"\n",
|
| 591 |
+
"med_subset.plot(kind=\"bar\", figsize=(12, 4),\n",
|
| 592 |
+
" color=[\"#4C72B0\", \"#DD8452\"], width=0.7)\n",
|
| 593 |
+
"plt.title(\"Median độ dài Findings & Impression theo subset\")\n",
|
| 594 |
+
"plt.xlabel(\"Subset\")\n",
|
| 595 |
+
"plt.ylabel(\"Median (số từ)\")\n",
|
| 596 |
+
"plt.xticks(rotation=0)\n",
|
| 597 |
+
"plt.legend([\"Findings\", \"Impression\"])\n",
|
| 598 |
+
"plt.tight_layout()\n",
|
| 599 |
+
"plt.show()"
|
| 600 |
+
]
|
| 601 |
+
},
|
| 602 |
+
{
|
| 603 |
+
"cell_type": "code",
|
| 604 |
+
"execution_count": null,
|
| 605 |
+
"metadata": {},
|
| 606 |
+
"outputs": [],
|
| 607 |
+
"source": [
|
| 608 |
+
"# Heatmap percentile độ dài theo subset\n",
|
| 609 |
+
"for col, label in [(\"findings_len\", \"Findings\"), (\"impression_len\", \"Impression\")]:\n",
|
| 610 |
+
" pct_data = df.groupby(\"subset\")[col].describe(\n",
|
| 611 |
+
" percentiles=[.25, .5, .75, .95]\n",
|
| 612 |
+
" )[[\"count\", \"mean\", \"25%\", \"50%\", \"75%\", \"95%\", \"max\"]].reindex(ALL_SUBSETS).round(0)\n",
|
| 613 |
+
" print(f\"\\n=== {label} length per subset ===\")\n",
|
| 614 |
+
" print(pct_data.to_string())"
|
| 615 |
+
]
|
| 616 |
+
},
|
| 617 |
+
{
|
| 618 |
+
"cell_type": "markdown",
|
| 619 |
+
"metadata": {},
|
| 620 |
+
"source": [
|
| 621 |
+
"## 7. Reports \"has neither\" — phân tích thêm"
|
| 622 |
+
]
|
| 623 |
+
},
|
| 624 |
+
{
|
| 625 |
+
"cell_type": "code",
|
| 626 |
+
"execution_count": null,
|
| 627 |
+
"metadata": {},
|
| 628 |
+
"outputs": [],
|
| 629 |
+
"source": [
|
| 630 |
+
"neither_df = df[df[\"status\"] == \"neither\"].copy()\n",
|
| 631 |
+
"print(f\"Reports không có cả findings lẫn impression: {len(neither_df):,}\")\n",
|
| 632 |
+
"print(f\"\\nPhân bố n_sections của những report này:\")\n",
|
| 633 |
+
"print(neither_df[\"n_sections\"].value_counts().sort_index().head(10).to_string())\n",
|
| 634 |
+
"\n",
|
| 635 |
+
"# Xem top headers trong những report này\n",
|
| 636 |
+
"neither_headers = Counter()\n",
|
| 637 |
+
"for row in neither_df[\"section_headers\"]:\n",
|
| 638 |
+
" if isinstance(row, str):\n",
|
| 639 |
+
" for h in row.split(\"|\"):\n",
|
| 640 |
+
" if h:\n",
|
| 641 |
+
" neither_headers[h] += 1\n",
|
| 642 |
+
"\n",
|
| 643 |
+
"print(\"\\nTop section headers trong reports 'neither':\")\n",
|
| 644 |
+
"for h, c in neither_headers.most_common(15):\n",
|
| 645 |
+
" print(f\" {c:>6,} {h}\")"
|
| 646 |
+
]
|
| 647 |
+
},
|
| 648 |
+
{
|
| 649 |
+
"cell_type": "code",
|
| 650 |
+
"execution_count": null,
|
| 651 |
+
"metadata": {},
|
| 652 |
+
"outputs": [],
|
| 653 |
+
"source": [
|
| 654 |
+
"# Vài ví dụ report có neither\n",
|
| 655 |
+
"import random\n",
|
| 656 |
+
"random.seed(0)\n",
|
| 657 |
+
"sample_neither = neither_df.sample(min(3, len(neither_df)), random_state=0)\n",
|
| 658 |
+
"for _, row in sample_neither.iterrows():\n",
|
| 659 |
+
" path = CXR_ROOT / \"files\" / row[\"subset\"] / f\"p{row['subject_id']}\" / f\"s{row['study_id']}.txt\"\n",
|
| 660 |
+
" print(f\"\\n{'='*60}\")\n",
|
| 661 |
+
" print(f\"s{row['study_id']}.txt (sections: {row['section_headers']})\")\n",
|
| 662 |
+
" try:\n",
|
| 663 |
+
" print(path.read_text(encoding=\"utf-8\", errors=\"ignore\")[:600])\n",
|
| 664 |
+
" except:\n",
|
| 665 |
+
" print(\"[file not found]\")"
|
| 666 |
+
]
|
| 667 |
+
},
|
| 668 |
+
{
|
| 669 |
+
"cell_type": "markdown",
|
| 670 |
+
"metadata": {},
|
| 671 |
+
"source": [
|
| 672 |
+
"## 8. Số section mỗi report"
|
| 673 |
+
]
|
| 674 |
+
},
|
| 675 |
+
{
|
| 676 |
+
"cell_type": "code",
|
| 677 |
+
"execution_count": null,
|
| 678 |
+
"metadata": {},
|
| 679 |
+
"outputs": [],
|
| 680 |
+
"source": [
|
| 681 |
+
"sec_dist = df[\"n_sections\"].value_counts().sort_index()\n",
|
| 682 |
+
"print(\"Phân bố số sections mỗi report:\")\n",
|
| 683 |
+
"print(sec_dist.head(20).to_string())\n",
|
| 684 |
+
"\n",
|
| 685 |
+
"fig, ax = plt.subplots(figsize=(11, 4))\n",
|
| 686 |
+
"sec_clip = sec_dist[sec_dist.index <= df[\"n_sections\"].quantile(0.99)]\n",
|
| 687 |
+
"ax.bar(sec_clip.index.astype(str), sec_clip.values, color=\"steelblue\")\n",
|
| 688 |
+
"ax.set_xlabel(\"Số sections\")\n",
|
| 689 |
+
"ax.set_ylabel(\"Số report\")\n",
|
| 690 |
+
"ax.set_title(\"Phân bố số sections mỗi report (≤p99)\")\n",
|
| 691 |
+
"ax.axvline(str(int(df[\"n_sections\"].median())), color=\"black\", ls=\"--\",\n",
|
| 692 |
+
" label=f\"Median={df['n_sections'].median():.0f}\")\n",
|
| 693 |
+
"ax.legend()\n",
|
| 694 |
+
"plt.tight_layout()\n",
|
| 695 |
+
"plt.show()"
|
| 696 |
+
]
|
| 697 |
+
},
|
| 698 |
+
{
|
| 699 |
+
"cell_type": "markdown",
|
| 700 |
+
"metadata": {},
|
| 701 |
+
"source": [
|
| 702 |
+
"## 9. Summary"
|
| 703 |
+
]
|
| 704 |
+
},
|
| 705 |
+
{
|
| 706 |
+
"cell_type": "code",
|
| 707 |
+
"execution_count": null,
|
| 708 |
+
"metadata": {},
|
| 709 |
+
"outputs": [],
|
| 710 |
+
"source": [
|
| 711 |
+
"print(\"=\"*60)\n",
|
| 712 |
+
"print(\" MIMIC-CXR Report EDA Summary\")\n",
|
| 713 |
+
"print(\"=\"*60)\n",
|
| 714 |
+
"print(f\" Total reports parsed : {total:,}\")\n",
|
| 715 |
+
"print(f\" Distinct section headers : {len(header_df)}\")\n",
|
| 716 |
+
"print()\n",
|
| 717 |
+
"print(f\" Has findings : {has_f:,} ({has_f/total*100:.1f}%)\")\n",
|
| 718 |
+
"print(f\" Has impression : {has_i:,} ({has_i/total*100:.1f}%)\")\n",
|
| 719 |
+
"print(f\" Has BOTH (usable) : {has_both:,} ({has_both/total*100:.1f}%)\")\n",
|
| 720 |
+
"print(f\" Has neither : {has_neither:,} ({has_neither/total*100:.1f}%)\")\n",
|
| 721 |
+
"print()\n",
|
| 722 |
+
"print(f\" Findings median length : {df['findings_len'].median():.0f} words\")\n",
|
| 723 |
+
"print(f\" Impression median length : {df['impression_len'].median():.0f} words\")\n",
|
| 724 |
+
"print(\"=\"*60)"
|
| 725 |
+
]
|
| 726 |
+
}
|
| 727 |
+
],
|
| 728 |
+
"metadata": {
|
| 729 |
+
"kernelspec": {
|
| 730 |
+
"display_name": "Python 3",
|
| 731 |
+
"language": "python",
|
| 732 |
+
"name": "python3"
|
| 733 |
+
},
|
| 734 |
+
"language_info": {
|
| 735 |
+
"name": "python",
|
| 736 |
+
"version": "3.10.0"
|
| 737 |
+
}
|
| 738 |
+
},
|
| 739 |
+
"nbformat": 4,
|
| 740 |
+
"nbformat_minor": 5
|
| 741 |
+
}
|
model/cxr_vlm.py
CHANGED
|
@@ -53,10 +53,14 @@ class CXRVisionLanguageModel(nn.Module):
|
|
| 53 |
super().__init__()
|
| 54 |
self.cfg = model_cfg
|
| 55 |
|
| 56 |
-
# ── 1. Image Encoder (
|
|
|
|
|
|
|
|
|
|
| 57 |
self.image_encoder = BioViLTEncoder(
|
| 58 |
frozen = model_cfg.image_encoder.frozen,
|
| 59 |
img_size = model_cfg.image_encoder.img_size,
|
|
|
|
| 60 |
)
|
| 61 |
|
| 62 |
# ── 2. MLP Projection (trained) ──────────────────────────────────────
|
|
|
|
| 53 |
super().__init__()
|
| 54 |
self.cfg = model_cfg
|
| 55 |
|
| 56 |
+
# ── 1. Image Encoder (rad_dino / biovilt / vit, frozen) ─────────────
|
| 57 |
+
# `backend` defaults to "auto" → tries rad_dino → biovilt → vit and
|
| 58 |
+
# uses the first that loads (see model/image_encoder.py docstring).
|
| 59 |
+
_enc_backend = getattr(model_cfg.image_encoder, "backend", "auto")
|
| 60 |
self.image_encoder = BioViLTEncoder(
|
| 61 |
frozen = model_cfg.image_encoder.frozen,
|
| 62 |
img_size = model_cfg.image_encoder.img_size,
|
| 63 |
+
backend = _enc_backend,
|
| 64 |
)
|
| 65 |
|
| 66 |
# ── 2. MLP Projection (trained) ──────────────────────────────────────
|
model/image_encoder.py
CHANGED
|
@@ -1,16 +1,24 @@
|
|
| 1 |
"""
|
| 2 |
image_encoder.py
|
| 3 |
----------------
|
| 4 |
-
Vision encoder wrapper.
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
"""
|
| 15 |
|
| 16 |
import torch
|
|
@@ -23,6 +31,12 @@ try:
|
|
| 23 |
except ImportError:
|
| 24 |
TIMM_AVAILABLE = False
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
try:
|
| 27 |
from health_multimodal.image import get_biovil_t_image_encoder
|
| 28 |
from health_multimodal.image.data.transforms import create_chest_xray_transform_for_inference
|
|
@@ -31,12 +45,17 @@ except ImportError:
|
|
| 31 |
BIOVIL_AVAILABLE = False
|
| 32 |
|
| 33 |
|
|
|
|
|
|
|
|
|
|
| 34 |
class BioViLTEncoder(nn.Module):
|
| 35 |
"""
|
| 36 |
-
Image encoder. Name kept for backward compatibility
|
| 37 |
-
depends on `backend`:
|
| 38 |
-
- "
|
| 39 |
-
- "
|
|
|
|
|
|
|
| 40 |
|
| 41 |
Output contract: (B, num_patches, 768)
|
| 42 |
"""
|
|
@@ -44,51 +63,115 @@ class BioViLTEncoder(nn.Module):
|
|
| 44 |
PATCH_DIM = 768
|
| 45 |
IMG_SIZE = 448
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
def __init__(
|
| 48 |
self,
|
| 49 |
frozen: bool = True,
|
| 50 |
img_size: int = 448,
|
| 51 |
-
backend: str = "auto", # "auto" | "biovilt" | "vit"
|
| 52 |
device: Optional[str] = None,
|
| 53 |
):
|
| 54 |
super().__init__()
|
| 55 |
-
self.
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
)
|
| 77 |
-
|
| 78 |
-
self.img_size = 224
|
| 79 |
-
else:
|
| 80 |
-
raise ValueError(f"Unknown backend: {backend}")
|
| 81 |
|
| 82 |
if frozen:
|
| 83 |
self._freeze()
|
| 84 |
|
| 85 |
print(f"[BioViLTEncoder] backend={self.backend} frozen={frozen} img_size={self.img_size}")
|
| 86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
def _freeze(self):
|
| 88 |
for p in self.encoder.parameters():
|
| 89 |
p.requires_grad = False
|
| 90 |
self.encoder.eval()
|
| 91 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
def forward(self, images: torch.Tensor) -> torch.Tensor:
|
| 93 |
"""
|
| 94 |
Args:
|
|
@@ -98,38 +181,85 @@ class BioViLTEncoder(nn.Module):
|
|
| 98 |
"""
|
| 99 |
ctx = torch.no_grad() if self.frozen else torch.enable_grad()
|
| 100 |
with ctx:
|
| 101 |
-
if self.backend == "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
out = self.encoder(images)
|
| 103 |
feats = out.patch_embedding # (B, 768, H', W')
|
| 104 |
B, C, H, W = feats.shape
|
| 105 |
feats = feats.flatten(2).transpose(1, 2) # (B, H'*W', 768)
|
|
|
|
| 106 |
else: # vit
|
| 107 |
# timm ViT with num_classes=0, global_pool="" returns (B, N+1, 768)
|
| 108 |
# where token 0 is [CLS]. Drop it.
|
| 109 |
-
feats = self.encoder.forward_features(images)
|
| 110 |
if feats.ndim == 3 and feats.shape[1] > 1:
|
| 111 |
-
feats = feats[:, 1:, :]
|
|
|
|
| 112 |
return feats
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
@staticmethod
|
| 115 |
-
def get_transform(split: str = "train"):
|
| 116 |
"""
|
| 117 |
-
Return an image transform
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
"""
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
@property
|
| 135 |
def output_dim(self) -> int:
|
|
|
|
| 1 |
"""
|
| 2 |
image_encoder.py
|
| 3 |
----------------
|
| 4 |
+
Vision encoder wrapper. Tries multiple backends in this priority order:
|
| 5 |
|
| 6 |
+
1. "rad_dino" — Microsoft RAD-DINO (microsoft/rad-dino, HF Hub).
|
| 7 |
+
Chest-X-ray self-supervised DINOv2. Loaded via the
|
| 8 |
+
`transformers` library, so works on Python 3.12.
|
| 9 |
+
⭐ recommended for CXR.
|
| 10 |
+
2. "biovilt" — Microsoft BioViL-T via `hi-ml-multimodal`.
|
| 11 |
+
Original choice; package requires Python <3.11 so
|
| 12 |
+
it doesn't install on recent Colab/Kaggle images.
|
| 13 |
+
3. "vit" — timm ViT-B/16 ImageNet-pretrained.
|
| 14 |
+
Generic fallback (not domain-pretrained).
|
| 15 |
|
| 16 |
+
All backends output (B, num_patches, 768). The downstream MLP Projection
|
| 17 |
+
layer uses cross-attention pooling to reduce that to 32 visual tokens,
|
| 18 |
+
regardless of num_patches, so swapping backends does not break anything else.
|
| 19 |
+
|
| 20 |
+
To force a specific backend, set `image_encoder.backend` in
|
| 21 |
+
`configs/model_config.yaml` to one of the names above.
|
| 22 |
"""
|
| 23 |
|
| 24 |
import torch
|
|
|
|
| 31 |
except ImportError:
|
| 32 |
TIMM_AVAILABLE = False
|
| 33 |
|
| 34 |
+
try:
|
| 35 |
+
from transformers import AutoModel, AutoImageProcessor
|
| 36 |
+
HF_TRANSFORMERS_AVAILABLE = True
|
| 37 |
+
except ImportError:
|
| 38 |
+
HF_TRANSFORMERS_AVAILABLE = False
|
| 39 |
+
|
| 40 |
try:
|
| 41 |
from health_multimodal.image import get_biovil_t_image_encoder
|
| 42 |
from health_multimodal.image.data.transforms import create_chest_xray_transform_for_inference
|
|
|
|
| 45 |
BIOVIL_AVAILABLE = False
|
| 46 |
|
| 47 |
|
| 48 |
+
RAD_DINO_ID = "microsoft/rad-dino"
|
| 49 |
+
|
| 50 |
+
|
| 51 |
class BioViLTEncoder(nn.Module):
|
| 52 |
"""
|
| 53 |
+
Image encoder. Name kept for backward compatibility with existing
|
| 54 |
+
checkpoints; actual backbone depends on `backend`:
|
| 55 |
+
- "auto": try rad_dino → biovilt → vit, first that loads wins
|
| 56 |
+
- "rad_dino": Microsoft RAD-DINO (HF Hub) ⭐ recommended for CXR
|
| 57 |
+
- "biovilt": Microsoft BioViL-T (hi-ml-multimodal)
|
| 58 |
+
- "vit": timm ViT-B/16 ImageNet pretrained
|
| 59 |
|
| 60 |
Output contract: (B, num_patches, 768)
|
| 61 |
"""
|
|
|
|
| 63 |
PATCH_DIM = 768
|
| 64 |
IMG_SIZE = 448
|
| 65 |
|
| 66 |
+
# Native input size per backend (used when caller passes img_size=None)
|
| 67 |
+
_DEFAULT_SIZE = {
|
| 68 |
+
"rad_dino": 518, # RAD-DINO trained at 518×518 (patch 14)
|
| 69 |
+
"biovilt": 448, # BioViL-T trained at 448×448
|
| 70 |
+
"vit": 224, # ViT-B/16 native 224×224 (patch 16)
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
def __init__(
    self,
    frozen: bool = True,
    img_size: int = 448,
    backend: str = "auto",   # "auto" | "rad_dino" | "biovilt" | "vit"
    device: Optional[str] = None,
):
    """
    Build the encoder by loading the first backend that can be loaded.

    Args:
        frozen:   If True, `_freeze()` is called after loading (gradients
                  disabled, encoder switched to eval mode) and `forward`
                  runs under `torch.no_grad()`.
        img_size: Requested input resolution. It is passed to
                  `_try_load_backend` as a hint; that method decides the
                  final `self.img_size` (the "vit" backend ignores the hint).
        backend:  "auto" tries rad_dino → biovilt → vit in that order and
                  keeps the first one that loads. Any other value is tried
                  as the only candidate.
        device:   Accepted for interface compatibility; this constructor
                  does not use it.

    Raises:
        RuntimeError: if none of the candidate backends could be loaded.
            The message lists the failure reason of every candidate tried,
            not just the last one.
    """
    super().__init__()
    self.frozen = frozen
    # Kept so the originally requested size stays inspectable even when the
    # chosen backend ends up using a different `self.img_size`.
    self._requested_img_size = img_size

    # ── Resolve backend ──────────────────────────────────────────────
    # Per-backend load failures are reported by `_try_load_backend` as
    # (False, reason) rather than raised, so a missing dependency on one
    # path does not abort the fallback chain.
    candidates = (
        ("rad_dino", "biovilt", "vit") if backend == "auto" else (backend,)
    )

    failures = []   # "name: reason" for every candidate that did not load
    chosen = None
    for cand in candidates:
        ok, err = self._try_load_backend(cand, img_size)
        if ok:
            chosen = cand
            break
        failures.append(f"{cand}: {err}")
        print(f"[BioViLTEncoder] backend '{cand}' unavailable: {err}")

    if chosen is None:
        # Report every failure: in "auto" mode the last error alone (usually
        # the generic ViT fallback) hides why the preferred backends failed.
        raise RuntimeError(
            "No image encoder backend could be loaded. Errors: "
            + "; ".join(failures)
        )
    self.backend = chosen

    if frozen:
        self._freeze()

    print(f"[BioViLTEncoder] backend={self.backend} frozen={frozen} img_size={self.img_size}")
|
| 115 |
|
| 116 |
+
# ────────────────────────────────────────────────────────────────────
|
| 117 |
+
# Backend loading
|
| 118 |
+
# ────────────────────────────────────────────────────────────────────
|
| 119 |
+
|
| 120 |
+
def _try_load_backend(self, backend: str, img_size_hint: Optional[int]):
    """
    Attempt to instantiate a single backend.

    Args:
        backend:       One of "rad_dino", "biovilt" or "vit". Any other
                       value is reported as unknown.
        img_size_hint: Preferred input resolution. Used by "rad_dino" and
                       "biovilt" when truthy, otherwise their entry in
                       `_DEFAULT_SIZE` is used. Ignored by "vit".

    Returns:
        `(True, None)` when the backend loaded; `self.encoder` and
        `self.img_size` are then set. Otherwise `(False, reason)` where
        `reason` is a human-readable string. This method does not raise for
        load failures: any exception is converted into the `reason` string
        so the caller can move on to the next candidate.
    """
    try:
        if backend == "rad_dino":
            if HF_TRANSFORMERS_AVAILABLE:
                # from_pretrained fetches the weights from the Hugging Face
                # Hub unless they are already in the local cache.
                model = AutoModel.from_pretrained(RAD_DINO_ID)
                self.encoder = model
                # A truthy hint wins; otherwise use RAD-DINO's default size.
                native = self._DEFAULT_SIZE["rad_dino"]
                self.img_size = img_size_hint if img_size_hint else native
                return True, None
            return False, "transformers not installed"

        if backend == "biovilt":
            if BIOVIL_AVAILABLE:
                model = get_biovil_t_image_encoder()
                self.encoder = model
                native = self._DEFAULT_SIZE["biovilt"]
                self.img_size = img_size_hint if img_size_hint else native
                return True, None
            return False, "hi-ml-multimodal not installed"

        if backend == "vit":
            if TIMM_AVAILABLE:
                vit_kwargs = {
                    "pretrained": True,
                    "num_classes": 0,    # drop classifier head
                    "global_pool": "",   # keep patch tokens
                }
                self.encoder = timm.create_model("vit_base_patch16_224", **vit_kwargs)
                # The model is created without a size override, so the
                # hint is deliberately ignored and 224 is always used.
                self.img_size = self._DEFAULT_SIZE["vit"]
                return True, None
            return False, "timm not installed"

        return False, f"unknown backend name: {backend!r}"

    except Exception as exc:
        # Loading can fail for reasons outside our control (network, auth,
        # disk). Report it as "unavailable" so auto-fallback can continue.
        return False, f"{type(exc).__name__}: {exc}"
|
| 165 |
+
|
| 166 |
def _freeze(self):
    """
    Freeze the wrapped encoder.

    Turns off gradient tracking for every parameter of `self.encoder` and
    switches the encoder to eval mode. Returns None.
    """
    encoder = self.encoder
    for param in encoder.parameters():
        param.requires_grad_(False)
    encoder.eval()
|
| 170 |
|
| 171 |
+
# ────────────────────────────────────────────────────────────────────
|
| 172 |
+
# Forward
|
| 173 |
+
# ────────────────────────────────────────────────────────────────────
|
| 174 |
+
|
| 175 |
def forward(self, images: torch.Tensor) -> torch.Tensor:
|
| 176 |
"""
|
| 177 |
Args:
|
|
|
|
| 181 |
"""
|
| 182 |
ctx = torch.no_grad() if self.frozen else torch.enable_grad()
|
| 183 |
with ctx:
|
| 184 |
+
if self.backend == "rad_dino":
|
| 185 |
+
# HF AutoModel returns BaseModelOutput; last_hidden_state has
|
| 186 |
+
# shape (B, N+1, 768) where token 0 is the CLS — drop it.
|
| 187 |
+
out = self.encoder(pixel_values=images)
|
| 188 |
+
feats = out.last_hidden_state
|
| 189 |
+
if feats.ndim == 3 and feats.shape[1] > 1:
|
| 190 |
+
feats = feats[:, 1:, :]
|
| 191 |
+
|
| 192 |
+
elif self.backend == "biovilt":
|
| 193 |
out = self.encoder(images)
|
| 194 |
feats = out.patch_embedding # (B, 768, H', W')
|
| 195 |
B, C, H, W = feats.shape
|
| 196 |
feats = feats.flatten(2).transpose(1, 2) # (B, H'*W', 768)
|
| 197 |
+
|
| 198 |
else: # vit
|
| 199 |
# timm ViT with num_classes=0, global_pool="" returns (B, N+1, 768)
|
| 200 |
# where token 0 is [CLS]. Drop it.
|
| 201 |
+
feats = self.encoder.forward_features(images)
|
| 202 |
if feats.ndim == 3 and feats.shape[1] > 1:
|
| 203 |
+
feats = feats[:, 1:, :]
|
| 204 |
+
|
| 205 |
return feats
|
| 206 |
|
| 207 |
+
# ────────────────────────────────────────────────────────────────────
|
| 208 |
+
# Image transform (preprocessing)
|
| 209 |
+
# ────────────────────────────────────────────────────────────────────
|
| 210 |
+
|
| 211 |
@staticmethod
def get_transform(split: str = "train", backend: str = "auto"):
    """
    Return a preprocessing callable for the given backend.

    Args:
        split:   Currently unused; the same transform is returned for every
                 split.
        backend: "auto" tries rad_dino → biovilt → vit and returns the first
                 transform that can be constructed. Any other value is tried
                 as the only candidate.

    Returns:
        A callable `transform(pil_image) -> tensor`.

    Raises:
        RuntimeError: if no transform could be constructed. The message
            names every candidate that was tried and why it failed.

    NOTE(review): this is a static method, so "auto" is resolved
    independently of the backend an encoder instance actually loaded. Pass
    the instance's `backend` explicitly to guarantee they match.
    """
    candidates = (
        ("rad_dino", "biovilt", "vit") if backend == "auto" else (backend,)
    )
    failures = []   # "name: reason" for every candidate that did not work

    for cand in candidates:
        try:
            if cand == "rad_dino":
                if not HF_TRANSFORMERS_AVAILABLE:
                    failures.append("rad_dino: transformers not installed")
                    continue
                # Use the preprocessor published with the model so resizing
                # and normalization follow the checkpoint's own config.
                proc = AutoImageProcessor.from_pretrained(RAD_DINO_ID)

                # NOTE: a nested function cannot be pickled; this may matter
                # if the transform is sent to spawned DataLoader workers.
                def _rad_dino_transform(pil_img):
                    return proc(images=pil_img, return_tensors="pt")["pixel_values"][0]

                return _rad_dino_transform

            if cand == "biovilt":
                if not BIOVIL_AVAILABLE:
                    failures.append("biovilt: hi-ml-multimodal not installed")
                    continue
                size = BioViLTEncoder._DEFAULT_SIZE["biovilt"]
                # The helper takes `resize` / `center_crop_size`, not
                # `width` / `height`. With the wrong keywords the call
                # raised TypeError, which was swallowed below, and "auto"
                # silently fell back to the ViT transform.
                return create_chest_xray_transform_for_inference(
                    resize=size,
                    center_crop_size=size,
                )

            if cand == "vit":
                from torchvision import transforms
                size = BioViLTEncoder._DEFAULT_SIZE["vit"]
                return transforms.Compose([
                    transforms.Resize((size, size)),
                    transforms.ToTensor(),
                    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225]),
                ])

            failures.append(f"{cand!r}: unknown backend name")

        except Exception as e:
            # Best-effort: log, remember the reason, try the next candidate.
            reason = f"{type(e).__name__}: {e}"
            failures.append(f"{cand}: {reason}")
            print(f"[BioViLTEncoder.get_transform] '{cand}' transform "
                  f"unavailable: {reason}")
            continue

    raise RuntimeError(
        "No image transform could be constructed (" + "; ".join(failures) + ")"
    )
|
| 263 |
|
| 264 |
@property
|
| 265 |
def output_dim(self) -> int:
|