{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# EDA — MIMIC-CXR Subset p18\n",
    "\n",
    "**Datasets used:**\n",
    "- `MIMIC-CXR-JPG` (v2.1.0) — ảnh JPG + CSV metadata\n",
    "- `MIMIC-CXR` (v2.1.0) — report `.txt` (Findings / Impression)\n",
    "- `MIMIC-Ext-MIMIC-CXR-VQA` (v1.0.0) — câu hỏi/đáp VQA\n",
    "\n",
    "**Scope:** chỉ phân tích bệnh nhân có `subject_id` bắt đầu bằng `18` (folder `p18`).\n",
    "\n",
    "> ℹ️ **Không cần tải ảnh JPG** để chạy notebook này — toàn bộ EDA dựa trên CSV, .txt reports và .json VQA."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 0. Cấu hình đường dẫn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4238924",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "# Root folder of the downloaded dataset files. Set the CXR_DATA_DIR environment\n",
    "# variable to point elsewhere without editing the notebook; the fallback is the\n",
    "# original local path.\n",
    "DATA_DIR = Path(os.environ.get(\"CXR_DATA_DIR\", r\"D:\\USTH\\KLTN\\cxr-vlm-data\"))\n",
    "CXR_ROOT = DATA_DIR / \"mimic-cxr-reports\"  # files/p10…p19/pXXXXXX/sYYYYYY.txt\n",
    "\n",
    "SPLIT_CSV = DATA_DIR / \"mimic-cxr-2.0.0-split.csv\"\n",
    "META_CSV = DATA_DIR / \"mimic-cxr-2.0.0-metadata.csv\"\n",
    "CHEXPERT_CSV = DATA_DIR / \"mimic-cxr-2.0.0-chexpert.csv\"\n",
    "\n",
    "_VQA_DIR = (DATA_DIR\n",
    "            / \"mimic-ext-mimic-cxr-vqa-a-complex-diverse-and-large-scale-visual-question-answering-dataset-for-chest-x-ray-images-1.0.0\"\n",
    "            / \"MIMIC-Ext-MIMIC-CXR-VQA\"\n",
    "            / \"dataset\")\n",
    "VQA_TRAIN = _VQA_DIR / \"train.json\"\n",
    "VQA_VALID = _VQA_DIR / \"valid.json\"\n",
    "VQA_TEST = _VQA_DIR / \"test.json\"\n",
    "\n",
    "# Quick existence check of every input that later cells read.\n",
    "missing_paths = []\n",
    "for name, p in [(\"SPLIT_CSV\", SPLIT_CSV),\n",
    "                (\"META_CSV\", META_CSV),\n",
    "                (\"CHEXPERT_CSV\", CHEXPERT_CSV),\n",
    "                (\"CXR_ROOT\", CXR_ROOT),\n",
    "                (\"VQA_TRAIN\", VQA_TRAIN),\n",
    "                (\"VQA_VALID\", VQA_VALID),\n",
    "                (\"VQA_TEST\", VQA_TEST)]:\n",
    "    found = p.exists()\n",
    "    status = \"✓\" if found else \"✗ NOT FOUND\"\n",
    "    print(f\" {status} {name}: {p}\")\n",
    "    if not found:\n",
    "        missing_paths.append(name)\n",
    "\n",
    "# Only claim success when nothing is missing.\n",
    "if missing_paths:\n",
    "    print(f\"\\nMissing inputs: {', '.join(missing_paths)}\")\n",
    "else:\n",
    "    print(\"\\nPaths configured.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99828a70",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json\n",
    "import re\n",
    "import 
matplotlib.pyplot as plt\n",
    "import matplotlib.ticker as mticker\n",
    "import seaborn as sns\n",
    "from collections import Counter\n",
    "\n",
    "sns.set_theme(style=\"whitegrid\", palette=\"muted\")\n",
    "plt.rcParams[\"figure.dpi\"] = 120\n",
    "plt.rcParams[\"figure.figsize\"] = (10, 4)\n",
    "\n",
    "CHEXPERT_LABELS = [\n",
    "    \"Atelectasis\", \"Cardiomegaly\", \"Consolidation\", \"Edema\",\n",
    "    \"Enlarged Cardiomediastinum\", \"Fracture\", \"Lung Lesion\",\n",
    "    \"Lung Opacity\", \"No Finding\", \"Pleural Effusion\",\n",
    "    \"Pleural Other\", \"Pneumonia\", \"Pneumothorax\", \"Support Devices\"\n",
    "]\n",
    "\n",
    "print(\"Libraries imported.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4674dd4f",
   "metadata": {},
   "source": [
    "## 1. Load & lọc subset p18"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f1d59fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _keep_p18(frame):\n",
    "    \"\"\"Return a copy of the rows whose subject_id, read as text, starts with '18'.\"\"\"\n",
    "    is_p18 = frame[\"subject_id\"].astype(str).str.startswith(\"18\")\n",
    "    return frame[is_p18].copy()\n",
    "\n",
    "\n",
    "split_df = pd.read_csv(SPLIT_CSV)\n",
    "meta_df = pd.read_csv(META_CSV)\n",
    "chexpert_df = pd.read_csv(CHEXPERT_CSV)\n",
    "\n",
    "# Restrict each table to the p18 subset.\n",
    "p18_split = _keep_p18(split_df)\n",
    "p18_meta = _keep_p18(meta_df)\n",
    "p18_chex = _keep_p18(chexpert_df)\n",
    "\n",
    "print(f\"split.csv — p18 images : {len(p18_split):,}\")\n",
    "print(f\"metadata — p18 images : {len(p18_meta):,}\")\n",
    "print(f\"chexpert — p18 studies : {len(p18_chex):,}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6657d6ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Left-join the metadata columns onto the split rows, keyed by dicom_id.\n",
    "meta_cols = [\"dicom_id\", \"ViewPosition\", \"Rows\", \"Columns\"]\n",
    "df = pd.merge(p18_split, p18_meta[meta_cols], on=\"dicom_id\", how=\"left\")\n",
    "\n",
    "print(f\"Merged shape: {df.shape}\")\n",
    "df.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5a7bff47",
   "metadata": {},
   "source": [
    "## 2. 
Tổng quan số lượng ảnh & report theo split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81be327d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row order shared by both count series below.\n",
    "split_order = [\"train\", \"validate\", \"test\"]\n",
    "\n",
    "# Number of images per split.\n",
    "img_per_split = df[\"split\"].value_counts().reindex(split_order)\n",
    "\n",
    "# Number of studies per split (each study_id is treated as one report).\n",
    "one_row_per_study = df.drop_duplicates(\"study_id\")\n",
    "study_per_split = one_row_per_study[\"split\"].value_counts().reindex(split_order)\n",
    "\n",
    "summary = pd.DataFrame({\n",
    "    \"Images (dicom_id)\": img_per_split,\n",
    "    \"Studies / Reports\": study_per_split,\n",
    "})\n",
    "summary.loc[\"TOTAL\"] = summary.sum()\n",
    "print(summary.to_string())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80fa39e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axes = plt.subplots(1, 2, figsize=(11, 4))\n",
    "panel_titles = [\"Số ảnh theo split\", \"Số study/report theo split\"]\n",
    "split_colors = sns.color_palette(\"muted\", 3)\n",
    "\n",
    "# One panel per summary column; the TOTAL row is left out of the bars.\n",
    "for ax, col, title in zip(axes, summary.columns, panel_titles):\n",
    "    vals = summary.loc[[\"train\", \"validate\", \"test\"], col]\n",
    "    bars = ax.bar(vals.index, vals.values, color=split_colors)\n",
    "    ax.bar_label(bars, fmt=\"%d\")\n",
    "    ax.set(title=title, ylabel=\"Count\")\n",
    "\n",
    "plt.suptitle(\"p18 subset — images vs reports per split\", fontsize=13, y=1.02)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4fed2aa0",
   "metadata": {},
   "source": [
    "## 3. 
Số ảnh mỗi study (1 study → bao nhiêu ảnh?)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39b23ccb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Images per study, then how many studies share each image count.\n",
    "imgs_per_study = df.groupby(\"study_id\")[\"dicom_id\"].count()\n",
    "count_dist = imgs_per_study.value_counts().sort_index()\n",
    "\n",
    "max_imgs = imgs_per_study.max()\n",
    "mean_imgs = imgs_per_study.mean()\n",
    "\n",
    "print(\"Images per study distribution:\")\n",
    "print(count_dist.to_string())\n",
    "print(f\"\\nMax images in a single study: {max_imgs}\")\n",
    "print(f\"Mean images per study : {mean_imgs:.2f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8c6560b",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(8, 4))\n",
    "bar_colors = sns.color_palette(\"Blues_d\", len(count_dist))\n",
    "ax.bar(count_dist.index.astype(str), count_dist.values, color=bar_colors)\n",
    "ax.set(\n",
    "    xlabel=\"Số ảnh trong study\",\n",
    "    ylabel=\"Số study\",\n",
    "    title=\"Distribution: số ảnh mỗi study (p18)\",\n",
    ")\n",
    "\n",
    "# Write each count slightly above its bar (gap = 1% of the tallest bar).\n",
    "label_gap = max(count_dist) * 0.01\n",
    "for n_imgs, n_with in count_dist.items():\n",
    "    ax.text(str(n_imgs), n_with + label_gap, str(n_with), ha=\"center\", va=\"bottom\", fontsize=9)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0262e14a",
   "metadata": {},
   "source": [
    "## 4. 
Phân bố View Position (AP, PA, Lateral, ...)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cad06cc2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Images per view; rows without a ViewPosition are counted as \"Unknown\".\n",
    "view_counts = df[\"ViewPosition\"].fillna(\"Unknown\").value_counts()\n",
    "print(\"View position counts:\")\n",
    "print(view_counts.to_string())\n",
    "print(f\"\\nTotal images: {len(df):,}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d86b2102",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axes = plt.subplots(1, 2, figsize=(12, 4))\n",
    "\n",
    "# Bar chart\n",
    "bars = axes[0].bar(view_counts.index, view_counts.values,\n",
    "                   color=sns.color_palette(\"Set2\", len(view_counts)))\n",
    "axes[0].bar_label(bars, fmt=\"%d\")\n",
    "axes[0].set_title(\"Số ảnh theo View Position\")\n",
    "axes[0].set_ylabel(\"Count\")\n",
    "\n",
    "# Pie chart\n",
    "axes[1].pie(view_counts.values, labels=view_counts.index, autopct=\"%1.1f%%\",\n",
    "            colors=sns.color_palette(\"Set2\", len(view_counts)))\n",
    "axes[1].set_title(\"Tỉ lệ View Position\")\n",
    "\n",
    "plt.suptitle(\"View Position Distribution — p18\", fontsize=13)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8f24892",
   "metadata": {},
   "outputs": [],
   "source": [
    "# View distribution per split. Missing ViewPosition is labelled \"Unknown\", as in\n",
    "# view_counts, because groupby drops rows whose key is NaN and those images\n",
    "# would otherwise vanish from this chart.\n",
    "view_split = (\n",
    "    df.assign(ViewPosition=df[\"ViewPosition\"].fillna(\"Unknown\"))\n",
    "    .groupby([\"split\", \"ViewPosition\"])\n",
    "    .size()\n",
    "    .unstack(fill_value=0)\n",
    "    .reindex([\"train\", \"validate\", \"test\"])\n",
    ")\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 4))\n",
    "view_split.plot(kind=\"bar\", ax=ax, color=sns.color_palette(\"Set2\", view_split.shape[1]))\n",
    "ax.set_title(\"View Position theo split — p18\")\n",
    "ax.set_xlabel(\"Split\")\n",
    "ax.set_ylabel(\"Count\")\n",
    "ax.tick_params(axis=\"x\", rotation=0)\n",
    "ax.legend(title=\"ViewPosition\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae9f3d3c",
   "metadata": {},
   "source": [
    "## 4b. 
Frontal-Only Sampling Strategy (AP > PA)\n",
    "\n",
    "Chiến lược train: **1 report + 1 ảnh frontal** mỗi study.\n",
    "- Chỉ giữ AP hoặc PA; nếu study có cả hai thì **ưu tiên AP**.\n",
    "- Study không có ảnh frontal nào → loại khỏi tập train."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2ce6beb",
   "metadata": {},
   "outputs": [],
   "source": [
    "frontal = df[df[\"ViewPosition\"].isin([\"AP\", \"PA\"])].copy()\n",
    "\n",
    "# Exactly one frontal image per study: AP wins over PA, and ties are broken by\n",
    "# the original row order. Done with sort + drop_duplicates instead of\n",
    "# groupby().apply(), whose handling of the grouping column (study_id) is\n",
    "# deprecated in recent pandas and would drop study_id from the result.\n",
    "frontal_1img = (\n",
    "    frontal.assign(\n",
    "        _view_rank=frontal[\"ViewPosition\"].map({\"AP\": 0, \"PA\": 1}),\n",
    "        _row_pos=np.arange(len(frontal)),\n",
    "    )\n",
    "    .sort_values([\"study_id\", \"_view_rank\", \"_row_pos\"])\n",
    "    .drop_duplicates(\"study_id\", keep=\"first\")\n",
    "    .drop(columns=[\"_view_rank\", \"_row_pos\"])\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "# Overall statistics\n",
    "n_study_total = df[\"study_id\"].nunique()\n",
    "n_study_frontal = frontal_1img[\"study_id\"].nunique()\n",
    "n_study_no_front = n_study_total - n_study_frontal\n",
    "\n",
    "print(\"=== Frontal-Only Sampling (p18) ===\")\n",
    "print(f\"Tổng số study : {n_study_total:,}\")\n",
    "print(f\"Study có ảnh frontal (AP/PA) : {n_study_frontal:,} ({n_study_frontal/n_study_total*100:.1f}%)\")\n",
    "print(f\"Study bị loại (không có frontal): {n_study_no_front:,} ({n_study_no_front/n_study_total*100:.1f}%)\")\n",
    "print()\n",
    "print(f\"Ảnh được chọn theo view:\")\n",
    "print(frontal_1img[\"ViewPosition\"].value_counts().to_string())\n",
    "print()\n",
    "print(\"=== Mẫu train sau khi filter (split) ===\")\n",
    "# Study counts per split, before and after the frontal-only filter.\n",
    "split_frontal = frontal_1img[\"split\"].value_counts().reindex([\"train\", \"validate\", \"test\"])\n",
    "split_all = df.drop_duplicates(\"study_id\")[\"split\"].value_counts().reindex([\"train\", \"validate\", \"test\"])\n",
    "compare = pd.DataFrame({\n",
    "    \"All studies\": split_all,\n",
    "    \"Frontal-only\": split_frontal,\n",
    "    \"Giảm (%)\": ((split_all - split_frontal) / split_all * 100).round(1)\n",
    "})\n",
    "print(compare.to_string())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d4aaf5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axes = plt.subplots(1, 3, figsize=(14, 4))\n",
    "\n",
    "# 1. All vs Frontal-only (study count)\n",
    "cats = [\"All studies\", \"Frontal-only\"]\n",
    "vals = [n_study_total, n_study_frontal]\n",
    "bars = axes[0].bar(cats, vals, color=[\"#4C72B0\", \"#55A868\"], width=0.5)\n",
    "axes[0].bar_label(bars, fmt=\"%d\")\n",
    "axes[0].set_title(\"Study count: All vs Frontal-only\")\n",
    "axes[0].set_ylabel(\"Số study\")\n",
    "\n",
    "# 2. View breakdown of the selected images\n",
    "vc = frontal_1img[\"ViewPosition\"].value_counts()\n",
    "axes[1].pie(vc.values, labels=vc.index, autopct=\"%1.1f%%\",\n",
    "            colors=[\"#4C72B0\", \"#DD8452\"])\n",
    "axes[1].set_title(\"View được chọn (AP ưu tiên)\")\n",
    "\n",
    "# 3. Train/validate/test comparison as side-by-side bars\n",
    "x = np.arange(3)\n",
    "w = 0.35\n",
    "splits = [\"train\", \"validate\", \"test\"]\n",
    "axes[2].bar(x - w/2, split_all.values, w, label=\"All\", color=\"#4C72B0\", alpha=0.85)\n",
    "axes[2].bar(x + w/2, split_frontal.values, w, label=\"Frontal-only\", color=\"#55A868\", alpha=0.85)\n",
    "axes[2].set_xticks(x)\n",
    "axes[2].set_xticklabels(splits)\n",
    "axes[2].set_title(\"Frontal-only vs All (per split)\")\n",
    "axes[2].set_ylabel(\"Số study\")\n",
    "axes[2].legend()\n",
    "\n",
    "plt.suptitle(\"Frontal-Only Sampling Strategy — p18\", fontsize=13)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "28847d0b",
   "metadata": {},
   "source": [
    "## 5. 
CheXpert Labels — 14 nhãn bệnh lý"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "410fbdbe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the label columns (1 = positive, 0 = negative, -1 = uncertain, NaN = not mentioned).\n",
    "label_cols = [c for c in p18_chex.columns if c in CHEXPERT_LABELS]\n",
    "label_values = p18_chex[label_cols]\n",
    "\n",
    "# Per-label row counts for each annotation value.\n",
    "positive_counts = label_values.eq(1).sum().sort_values(ascending=False)\n",
    "uncertain_counts = label_values.eq(-1).sum().sort_values(ascending=False)\n",
    "negative_counts = label_values.eq(0).sum().sort_values(ascending=False)\n",
    "\n",
    "label_summary = pd.DataFrame({\n",
    "    \"Positive\": positive_counts,\n",
    "    \"Uncertain\": uncertain_counts,\n",
    "    \"Negative\": negative_counts,\n",
    "    \"Not Mentioned\": label_values.isna().sum()\n",
    "})\n",
    "n_chex_rows = len(p18_chex)\n",
    "label_summary[\"Total Studies\"] = n_chex_rows\n",
    "label_summary[\"Positive %\"] = (label_summary[\"Positive\"] / n_chex_rows * 100).round(1)\n",
    "\n",
    "shown_cols = [\"Positive\", \"Uncertain\", \"Negative\", \"Not Mentioned\", \"Positive %\"]\n",
    "print(label_summary[shown_cols].sort_values(\"Positive\", ascending=False).to_string())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50c9a91d",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(12, 5))\n",
    "x = np.arange(len(label_cols))\n",
    "w = 0.25\n",
    "\n",
    "ordered_labels = label_summary.sort_values(\"Positive\", ascending=False).index.tolist()\n",
    "\n",
    "# Three bars per label: positive on the left, uncertain centred, negative on the right.\n",
    "for shift, column, color in [(-w, \"Positive\", \"#e74c3c\"),\n",
    "                             (0, \"Uncertain\", \"#f39c12\"),\n",
    "                             (w, \"Negative\", \"#2ecc71\")]:\n",
    "    ax.bar(x + shift, label_summary.loc[ordered_labels, column], width=w, label=column, color=color)\n",
    "\n",
    "ax.set_xticks(x)\n",
    "ax.set_xticklabels(ordered_labels, rotation=40, ha=\"right\", fontsize=9)\n",
    "ax.set_ylabel(\"Số study\")\n",
    "ax.set_title(\"CheXpert Labels — 
Positive / Uncertain / Negative (p18)\")\n",
    "ax.legend()\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e1209c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of positive labels per study (label co-occurrence)\n",
    "labels_per_study = (p18_chex[label_cols] == 1).sum(axis=1)\n",
    "print(\"Số nhãn positive mỗi study:\")\n",
    "print(labels_per_study.value_counts().sort_index().to_string())\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(9, 4))\n",
    "lps_counts = labels_per_study.value_counts().sort_index()\n",
    "ax.bar(lps_counts.index.astype(str), lps_counts.values, color=sns.color_palette(\"Blues_d\", len(lps_counts)))\n",
    "ax.set_xlabel(\"Số nhãn positive\")\n",
    "ax.set_ylabel(\"Số study\")\n",
    "ax.set_title(\"Phân bố số nhãn positive mỗi study (p18)\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f0aa1ba8",
   "metadata": {},
   "source": [
    "## 6. Phân tích Report — Findings & Impression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b1e562e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A section runs until the next section header. A header only counts when it\n",
    "# carries its colon, so ordinary prose such as \"no acute findings\" or\n",
    "# \"clinical impression\" does not cut the section short.\n",
    "_NEXT_SECTION_RE = re.compile(\n",
    "    r\"\\b(?:IMPRESSION|FINDINGS|CONCLUSION|RECOMMENDATION|NOTIFICATION)(?:\\(S\\)|S)?\\s*:\",\n",
    "    re.IGNORECASE,\n",
    ")\n",
    "\n",
    "\n",
    "def _extract_section(header_pattern, text):\n",
    "    \"\"\"Return the text between `header_pattern` and the next section header.\n",
    "\n",
    "    Returns None when the header is not found or when nothing but whitespace\n",
    "    follows it before the next header / end of text.\n",
    "    \"\"\"\n",
    "    m = re.search(header_pattern, text, re.IGNORECASE)\n",
    "    if not m:\n",
    "        return None\n",
    "    start = m.end()\n",
    "    # Cut at the next header, or at the end of the string.\n",
    "    next_sec = _NEXT_SECTION_RE.search(text, start)\n",
    "    end = next_sec.start() if next_sec else len(text)\n",
    "    return text[start:end].strip() or None\n",
    "\n",
    "\n",
    "def parse_report(txt_path: Path) -> dict:\n",
    "    \"\"\"Read one report file and return its 'findings' and 'impression' sections.\n",
    "\n",
    "    Args:\n",
    "        txt_path: path of the report .txt file.\n",
    "\n",
    "    Returns:\n",
    "        dict with keys 'findings' and 'impression'; each value is the section\n",
    "        text (str) or None when the file does not exist, the header is absent,\n",
    "        or the section is empty.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        text = txt_path.read_text(encoding=\"utf-8\", errors=\"ignore\")\n",
    "    except FileNotFoundError:\n",
    "        return {\"findings\": None, \"impression\": None}\n",
    "\n",
    "    text = re.sub(r\"[\\r\\n]+\", \" \", text)  # flatten newlines\n",
    "    return {\n",
    "        \"findings\": _extract_section(r\"FINDINGS\\s*:\", text),\n",
    "        \"impression\": _extract_section(r\"IMPRESSION\\s*:\", text),\n",
    "    }\n",
    "\n",
    "\n",
    "# Unique studies in p18\n",
    "p18_studies = (\n",
    "    df[[\"subject_id\", \"study_id\"]]\n",
    "    .drop_duplicates(\"study_id\")\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "print(f\"Số study cần parse: {len(p18_studies):,}\")\n",
    "print(\"Parsing reports... (có thể mất vài giây)\")\n",
    "\n",
    "records = []\n",
    "for row in p18_studies.itertuples(index=False):\n",
    "    sid = str(row.subject_id)\n",
    "    stid = str(row.study_id)\n",
    "    # Folder layout: files/p<first two digits of subject_id>/p<subject_id>/s<study_id>.txt\n",
    "    # (for this subset the first component is always \"p18\").\n",
    "    txt_path = CXR_ROOT / \"files\" / f\"p{sid[:2]}\" / f\"p{sid}\" / f\"s{stid}.txt\"\n",
    "    parsed = parse_report(txt_path)\n",
    "    records.append({\"study_id\": stid, **parsed})\n",
    "\n",
    "report_df = pd.DataFrame(records)\n",
    "# Word counts; rows without the section stay NaN through index alignment.\n",
    "report_df[\"findings_len\"] = report_df[\"findings\"].dropna().str.split().str.len()\n",
    "report_df[\"impression_len\"] = report_df[\"impression\"].dropna().str.split().str.len()\n",
    "\n",
    "print(f\"\\nFindings found : {report_df['findings'].notna().sum():,} / {len(report_df):,}\")\n",
    "print(f\"Impression found : {report_df['impression'].notna().sum():,} / {len(report_df):,}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c49a401a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Descriptive stats\n",
    "print(\"=== Findings word count ===\")\n",
    "print(report_df[\"findings_len\"].describe().round(1).to_string())\n",
    "print(\"\\n=== Impression word count ===\")\n",
    "print(report_df[\"impression_len\"].describe().round(1).to_string())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "942959d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axes = plt.subplots(1, 2, figsize=(13, 4))\n",
    "\n",
    "for ax, col, title, color in zip(\n",
    "    axes,\n",
    "    [\"findings_len\", \"impression_len\"],\n",
    "    [\"Findings — phân bố độ dài (số từ)\", \"Impression — phân bố độ dài (số từ)\"],\n",
    "    [\"steelblue\", \"tomato\"]\n",
    "):\n",
    "    data = report_df[col].dropna()\n",
    "    # clip outliers 
để biểu đồ dễ nhìn\n",
    "    p99 = data.quantile(0.99)\n",
    "    data_clipped = data[data <= p99]\n",
    "    ax.hist(data_clipped, bins=40, color=color, edgecolor=\"white\", alpha=0.85)\n",
    "    ax.axvline(data.median(), color=\"black\", linestyle=\"--\", linewidth=1.2, label=f\"Median={data.median():.0f}\")\n",
    "    ax.axvline(data.mean(), color=\"gray\", linestyle=\":\", linewidth=1.2, label=f\"Mean={data.mean():.0f}\")\n",
    "    ax.set_title(title)\n",
    "    ax.set_xlabel(\"Số từ\")\n",
    "    ax.set_ylabel(\"Số report\")\n",
    "    ax.legend(fontsize=9)\n",
    "    ax.text(0.97, 0.95, f\"n={len(data):,}\\n(hiển thị ≤p99={p99:.0f}w)\",\n",
    "            transform=ax.transAxes, ha=\"right\", va=\"top\", fontsize=8, color=\"gray\")\n",
    "\n",
    "plt.suptitle(\"Phân bố độ dài report — p18\", fontsize=13)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "170b0971",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Box plot comparing Findings vs Impression word counts\n",
    "combined = pd.DataFrame({\n",
    "    \"word_count\": pd.concat([report_df[\"findings_len\"], report_df[\"impression_len\"]], ignore_index=True),\n",
    "    \"section\": [\"Findings\"] * len(report_df) + [\"Impression\"] * len(report_df)\n",
    "}).dropna()\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(7, 4))\n",
    "# Passing `palette` without `hue` is deprecated in recent seaborn, so colour by\n",
    "# the same variable as x; dodge=False keeps one centred box per section.\n",
    "sns.boxplot(data=combined, x=\"section\", y=\"word_count\", hue=\"section\", dodge=False,\n",
    "            palette=[\"steelblue\", \"tomato\"], showfliers=False, ax=ax)\n",
    "# Some seaborn versions add a legend that merely repeats the x tick labels.\n",
    "if ax.get_legend() is not None:\n",
    "    ax.get_legend().remove()\n",
    "ax.set_title(\"Findings vs Impression — độ dài (box plot, no outliers)\")\n",
    "ax.set_ylabel(\"Số từ\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5512e3aa",
   "metadata": {},
   "source": [
    "## 7. 
VQA — phân tích câu hỏi & đáp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7caa394c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load every available VQA split file into one frame, tagging rows with the split name.\n",
    "vqa_dfs = []\n",
    "for fpath, split_name in [(VQA_TRAIN, \"train\"), (VQA_VALID, \"valid\"), (VQA_TEST, \"test\")]:\n",
    "    if fpath.exists():\n",
    "        with open(fpath, encoding=\"utf-8\") as f:\n",
    "            vqa_records = json.load(f)\n",
    "        split_frame = pd.DataFrame(vqa_records)\n",
    "        split_frame[\"split\"] = split_name\n",
    "        vqa_dfs.append(split_frame)\n",
    "    else:\n",
    "        print(f\"[WARNING] File not found: {fpath}\")\n",
    "\n",
    "# pd.concat([]) fails with an unhelpful \"No objects to concatenate\"; say what is wrong instead.\n",
    "if not vqa_dfs:\n",
    "    raise FileNotFoundError(f\"No VQA json file found under {_VQA_DIR}\")\n",
    "\n",
    "vqa_all = pd.concat(vqa_dfs, ignore_index=True)\n",
    "\n",
    "# Keep the p18 subset\n",
    "vqa_p18 = vqa_all[vqa_all[\"subject_id\"].astype(str).str.startswith(\"18\")].copy()\n",
    "\n",
    "print(f\"VQA total records : {len(vqa_all):,}\")\n",
    "print(f\"VQA p18 records : {len(vqa_p18):,}\")\n",
    "print(f\"\\nColumns: {list(vqa_p18.columns)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ddb012a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of VQA samples per split\n",
    "print(\"VQA p18 per split:\")\n",
    "print(vqa_p18[\"split\"].value_counts().to_string())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "86eec60e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Semantic type: verify / choose / query\n",
    "sem_counts = vqa_p18[\"semantic_type\"].value_counts()\n",
    "print(\"Semantic type (verify/choose/query):\")\n",
    "print(sem_counts.to_string())\n",
    "\n",
    "# Content type: presence / anatomy / attribute / abnormality / size / plane / gender\n",
    "con_counts = vqa_p18[\"content_type\"].value_counts()\n",
    "print(\"\\nContent type:\")\n",
    "print(con_counts.to_string())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4567a110",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axes = plt.subplots(1, 2, figsize=(13, 4))\n",
    "\n",
    "# Semantic type\n",
    "bars = axes[0].bar(sem_counts.index, sem_counts.values,\n",
    "                   color=sns.color_palette(\"Set1\", len(sem_counts)))\n",
    "axes[0].bar_label(bars, fmt=\"%d\")\n",
    "axes[0].set_title(\"VQA — 
Semantic Type (p18)\")\n",
    "axes[0].set_ylabel(\"Count\")\n",
    "\n",
    "# Content type\n",
    "bars2 = axes[1].bar(con_counts.index, con_counts.values,\n",
    "                    color=sns.color_palette(\"Set2\", len(con_counts)))\n",
    "axes[1].bar_label(bars2, fmt=\"%d\")\n",
    "axes[1].set_title(\"VQA — Content Type (p18)\")\n",
    "axes[1].set_ylabel(\"Count\")\n",
    "axes[1].tick_params(axis=\"x\", rotation=30)\n",
    "\n",
    "plt.suptitle(\"VQA Question Analysis — p18\", fontsize=13)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f968f772",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cross-tab: semantic_type (rows) × content_type (columns)\n",
    "cross = pd.crosstab(index=vqa_p18[\"semantic_type\"], columns=vqa_p18[\"content_type\"])\n",
    "print(\"Cross-tab semantic × content:\")\n",
    "print(cross.to_string())\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 3))\n",
    "sns.heatmap(cross, annot=True, fmt=\"d\", cmap=\"YlOrRd\", ax=ax)\n",
    "ax.set_title(\"VQA — Semantic Type × Content Type (p18)\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "97179573",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Question length distribution (whitespace-separated word count)\n",
    "vqa_p18[\"q_len\"] = vqa_p18[\"question\"].str.split().str.len()\n",
    "q_len = vqa_p18[\"q_len\"]\n",
    "q_median = q_len.median()\n",
    "q_cap = q_len.quantile(0.99)\n",
    "\n",
    "print(\"Question length stats:\")\n",
    "print(q_len.describe().round(1).to_string())\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(9, 3))\n",
    "# Lengths above the 99th percentile are capped at that value for display.\n",
    "ax.hist(q_len.clip(upper=q_cap), bins=30, color=\"slateblue\", edgecolor=\"white\")\n",
    "ax.axvline(q_median, color=\"black\", linestyle=\"--\", label=f\"Median={q_median:.0f}\")\n",
    "ax.set(title=\"Phân bố độ dài câu hỏi VQA (p18)\", xlabel=\"Số từ\", ylabel=\"Count\")\n",
    "ax.legend()\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ffe116e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Phân bố dạng câu trả 
lời: yes/no vs. khác\n",
    "def classify_answer(ans_list):\n",
    "    \"\"\"Label an answer list as 'yes', 'no', 'open' or 'no answer'.\n",
    "\n",
    "    Only the first element is inspected (stripped, lower-cased). Anything that\n",
    "    is not a non-empty list yields 'no answer'.\n",
    "    \"\"\"\n",
    "    if isinstance(ans_list, list) and len(ans_list) > 0:\n",
    "        first = ans_list[0].strip().lower()\n",
    "        return first if first in (\"yes\", \"no\") else \"open\"\n",
    "    return \"no answer\"\n",
    "\n",
    "\n",
    "vqa_p18[\"ans_type\"] = vqa_p18[\"answer\"].apply(classify_answer)\n",
    "ans_counts = vqa_p18[\"ans_type\"].value_counts()\n",
    "\n",
    "print(\"Answer type distribution:\")\n",
    "print(ans_counts.to_string())\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(6, 3))\n",
    "ans_colors = sns.color_palette(\"Pastel1\", len(ans_counts))\n",
    "bars = ax.bar(ans_counts.index, ans_counts.values, color=ans_colors)\n",
    "ax.bar_label(bars, fmt=\"%d\")\n",
    "ax.set(title=\"VQA — Answer Type Distribution (p18)\", ylabel=\"Count\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "37f8ee29",
   "metadata": {},
   "source": [
    "## 8. Gợi ý thêm — Missing data & Data Quality"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0a10b57",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 8.1 Share of studies without findings / without impression\n",
    "findings_missing = report_df[\"findings\"].isna()\n",
    "impression_missing = report_df[\"impression\"].isna()\n",
    "\n",
    "no_findings = findings_missing.sum()\n",
    "no_impression = impression_missing.sum()\n",
    "both_missing = (findings_missing & impression_missing).sum()\n",
    "total_studies = len(report_df)\n",
    "\n",
    "print(f\"Studies thiếu Findings : {no_findings:,} / {total_studies:,} ({no_findings/total_studies*100:.1f}%)\")\n",
    "print(f\"Studies thiếu Impression : {no_impression:,} / {total_studies:,} ({no_impression/total_studies*100:.1f}%)\")\n",
    "print(f\"Studies thiếu CẢ HAI : {both_missing:,} / {total_studies:,} ({both_missing/total_studies*100:.1f}%)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2fe0d2e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 8.2 Share of images without ViewPosition\n",
    "missing_view = df[\"ViewPosition\"].isna().sum()\n",
    "print(f\"Ảnh thiếu ViewPosition: {missing_view:,} / {len(df):,} 
({missing_view/len(df)*100:.1f}%)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b3a9176",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 8.3 Number of distinct patients, studies and images in p18\n",
    "n_subjects, n_studies, n_images = (\n",
    "    df[key].nunique() for key in (\"subject_id\", \"study_id\", \"dicom_id\")\n",
    ")\n",
    "\n",
    "print(f\"Bệnh nhân (subject_id) : {n_subjects:,}\")\n",
    "print(f\"Lần khám (study_id) : {n_studies:,}\")\n",
    "print(f\"Ảnh (dicom_id) : {n_images:,}\")\n",
    "print(f\"\\nTrung bình study/bệnh nhân : {n_studies/n_subjects:.2f}\")\n",
    "print(f\"Trung bình ảnh/bệnh nhân : {n_images/n_subjects:.2f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea4da928",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 8.4 Study distribution per patient\n",
    "studies_per_patient = df.groupby(\"subject_id\")[\"study_id\"].nunique()\n",
    "spp = studies_per_patient.value_counts().sort_index()\n",
    "\n",
    "print(\"Studies per patient stats:\")\n",
    "print(studies_per_patient.describe().round(1).to_string())\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(9, 3))\n",
    "ax.bar(spp.index.astype(str), spp.values, color=\"mediumpurple\")\n",
    "ax.set(\n",
    "    xlabel=\"Số study mỗi bệnh nhân\",\n",
    "    ylabel=\"Số bệnh nhân\",\n",
    "    title=\"Phân bố số lần khám mỗi bệnh nhân — p18\",\n",
    ")\n",
    "ax.xaxis.set_major_locator(mticker.MaxNLocator(integer=True, nbins=20))\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b990ae5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 8.5 Image resolution distribution (only when Rows/Columns came through the merge)\n",
    "if {\"Rows\", \"Columns\"}.issubset(df.columns):\n",
    "    resolution_cols = [\"Rows\", \"Columns\"]\n",
    "    print(\"Image resolution stats:\")\n",
    "    print(df[resolution_cols].describe().round(0).to_string())\n",
    "\n",
    "    res_counts = df.groupby(resolution_cols).size().sort_values(ascending=False).head(15)\n",
    "    print(\"\\nTop-15 resolutions:\")\n",
    "    print(res_counts.to_string())\n",
    "else:\n",
    "    print(\"Cột 
Rows/Columns không có trong metadata.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a03900eb",
   "metadata": {},
   "source": [
    "## 9. Tóm tắt (Summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8cc6c50",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final recap built from values computed in the earlier sections.\n",
    "rule = \"=\" * 55\n",
    "n_frontal = view_counts.get(\"PA\", 0) + view_counts.get(\"AP\", 0)\n",
    "n_lateral = view_counts.get(\"LL\", 0) + view_counts.get(\"LATERAL\", 0)\n",
    "n_with_findings = report_df[\"findings\"].notna().sum()\n",
    "n_with_impression = report_df[\"impression\"].notna().sum()\n",
    "\n",
    "print(rule)\n",
    "print(\" SUMMARY — MIMIC-CXR Subset p18\")\n",
    "print(rule)\n",
    "print(f\" Bệnh nhân : {n_subjects:,}\")\n",
    "print(f\" Studies (reports) : {n_studies:,}\")\n",
    "print(f\" Ảnh (dicom/jpg) : {n_images:,}\")\n",
    "print()\n",
    "for sp in [\"train\", \"validate\", \"test\"]:\n",
    "    ni = img_per_split.get(sp, 0)\n",
    "    ns = study_per_split.get(sp, 0)\n",
    "    print(f\" [{sp:>8}] ảnh={ni:>5,} studies={ns:>5,}\")\n",
    "print()\n",
    "print(f\" Frontal (PA+AP) : {n_frontal:,} ảnh\")\n",
    "print(f\" Lateral : {n_lateral:,} ảnh\")\n",
    "print(f\" Findings available : {n_with_findings:,}/{total_studies:,}\")\n",
    "print(f\" Impression available : {n_with_impression:,}/{total_studies:,}\")\n",
    "print(f\" VQA samples (p18) : {len(vqa_p18):,}\")\n",
    "print(rule)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}