convitom

initial commit

28b13fc 19 days ago

9.62 kB

	<!DOCTYPE html>
	<html lang="vi">
	<head>
	<meta charset="UTF-8" />
	<!-- Lay the page out at device width on phones/tablets instead of a zoomed-out desktop canvas. -->
	<meta name="viewport" content="width=device-width, initial-scale=1" />
	<title>CXR-VLM Pipeline</title>
	<!-- Mermaid v10 from jsDelivr; provides the global `mermaid` object that the inline script at the end of <body> initializes. -->
	<script src="https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min.js"></script>
	<style>
	  /* ---------- Page shell: dark canvas with light text ---------- */
	  body {
	    margin: 0;
	    padding: 32px;
	    background: #0f1117;
	    color: #e2e8f0;
	    font-family: 'Segoe UI', sans-serif;
	  }

	  /* ---------- Page title and the one-line subtitle below it ---------- */
	  h1 {
	    margin-bottom: 8px;
	    font-size: 1.6rem;
	    letter-spacing: 0.5px;
	    text-align: center;
	    color: #7dd3fc;
	  }
	  .subtitle {
	    margin-bottom: 36px;
	    font-size: 0.9rem;
	    text-align: center;
	    color: #94a3b8;
	  }

	  /* ---------- Numbered heading above each diagram (accent bar on the left) ---------- */
	  .section-title {
	    margin: 40px 0 12px;
	    padding-left: 4px;
	    border-left: 3px solid #38bdf8;
	    font-size: 0.95rem;
	    font-weight: 600;
	    letter-spacing: 1px;
	    text-transform: uppercase;
	    color: #7dd3fc;
	  }

	  /* ---------- Diagram card; scrolls sideways when the diagram is wider than the card ---------- */
	  .mermaid {
	    padding: 24px;
	    overflow-x: auto;
	    border-radius: 12px;
	    background: #1e2433;
	  }

	  /* ---------- Colour legend at the bottom of the page ---------- */
	  .legend {
	    display: flex;
	    flex-wrap: wrap;
	    justify-content: center;
	    gap: 16px;
	    margin-top: 32px;
	  }
	  .legend-item {
	    display: flex;
	    align-items: center;
	    gap: 8px;
	    font-size: 0.82rem;
	    color: #cbd5e1;
	  }
	  /* Swatch square; its fill colour is set inline on each legend entry. */
	  .legend-color {
	    width: 16px;
	    height: 16px;
	    border-radius: 4px;
	  }
	</style>
	</head>
	<body>
	<h1>CXR-VLM — Pipeline Overview</h1>
	<p class="subtitle">Vision-Language Model for Chest X-ray Interpretation  |  Based on RaDialog</p>

	<!-- ═══════════════════════════════════════════════════════════════════ -->
	<!-- Diagram 1: the source datasets, the parsers inside build_instruct_json(), and the per-task records they produce. -->
	<div class="section-title">1. Data Pipeline</div>
	<div class="mermaid">
	flowchart LR
	  %% Source datasets, as named in the node labels.
	  subgraph SRC["📦 Nguồn dữ liệu (PhysioNet)"]
	    direction TB
	    D1["MIMIC-CXR\n(reports/*.txt)"]
	    D2["MIMIC-CXR-JPG\n(files/*/.jpg)"]
	    D3["MIMIC-Ext-CXR-QBA\n(*.json)"]
	  end

	  %% One parsing step per task.
	  subgraph BUILD["🔧 build_instruct_json()"]
	    direction TB
	    P1["Parse findings\ntừ report .txt"]
	    P2["Parse impression\ntừ report .txt"]
	    P3["Parse Q&A pairs\ntừ JSON"]
	  end

	  %% One record shape per task in the unified instruction file.
	  subgraph JSON["📄 Unified Instruction JSON"]
	    direction TB
	    J1["{ task: findings,\n image_path, target }"]
	    J2["{ task: impression,\n image_path, target }"]
	    J3["{ task: vqa,\n image_path, question, target }"]
	  end

	  %% Text sources feed the parsers.
	  D1 --> P1 & P2
	  D3 --> P3
	  %% Dotted edge: the images are only referenced by path in every record.
	  %% Edge labels are delimited by plain pipe characters.
	  D2 -.->|"image_path ref"| J1 & J2 & J3
	  P1 --> J1
	  P2 --> J2
	  P3 --> J3

	  style SRC fill:#1e3a5f,stroke:#38bdf8,color:#e2e8f0
	  style BUILD fill:#1e3a2a,stroke:#4ade80,color:#e2e8f0
	  style JSON fill:#3b2a1e,stroke:#f97316,color:#e2e8f0
	</div>

	<!-- ═══════════════════════════════════════════════════════════════════ -->
	<!--
	  Diagram 2: from one instruction-JSON record to the batch entry returned by the dataset.
	  The literal "<image>" placeholder in the FMT1 label is written with Mermaid entity
	  codes (#lt; and #gt;). A raw <image> inside this div is parsed by the browser as an
	  HTML element before Mermaid reads the diagram text, so the placeholder would be
	  missing from the rendered label.
	-->
	<div class="section-title">2. Dataset & Prompt Construction</div>
	<div class="mermaid">
	flowchart TD
	  JSON2["📄 Instruction JSON\n(image_path, task, target, question?)"]

	  %% Steps shown inside the dataset item getter.
	  subgraph DS["CXRInstructDataset.__getitem__()"]
	    direction LR
	    IMG["_load_image()\n→ tensor C×H×W"]
	    TMPL["build_training_sample(task)\nprompt_templates.py"]
	    TOK["_tokenize(prompt, target)\nlabels: -100 trên prompt tokens"]
	  end

	  %% Instruction text chosen per task.
	  subgraph PROMPTS["10 biến thể prompt mỗi task (random.choice)"]
	    direction TB
	    F["Task: findings\n→ random.choice(FINDINGS_PROMPTS)\ne.g. 'Generate the findings section...'"]
	    I["Task: impression\n→ random.choice(IMPRESSION_PROMPTS)\ne.g. 'Provide a concise clinical impression...'"]
	    V["Task: vqa\n→ câu hỏi trực tiếp từ dataset\ne.g. 'Is there pleural effusion?'"]
	  end

	  %% Conversation template wrapped around the instruction.
	  subgraph FMT["Vicuna v1.1 Format"]
	    direction TB
	    FMT1["SYSTEM: You are a radiologist...\nUSER: #lt;image#gt;\n[Predicted Findings: ...]\n{instruction}\nASSISTANT:"]
	  end

	  OUT["Batch output:\n{ image, input_ids, attention_mask, labels, task }"]

	  JSON2 --> DS
	  DS --> TMPL
	  TMPL --> PROMPTS
	  PROMPTS --> FMT
	  FMT --> TOK
	  IMG --> OUT
	  TOK --> OUT

	  style DS fill:#1e3a5f,stroke:#38bdf8,color:#e2e8f0
	  style PROMPTS fill:#2a1e3b,stroke:#a78bfa,color:#e2e8f0
	  style FMT fill:#3b2a1e,stroke:#f97316,color:#e2e8f0
	</div>

	<!-- ═══════════════════════════════════════════════════════════════════ -->
	<!--
	  Diagram 3: image encoder, projection, optional classifier and the language model,
	  with the forward-pass steps listed inside the LLM box.
	  The "<image>" token name in the L2 label uses Mermaid entity codes (#lt; and #gt;),
	  because a raw <image> inside this div is parsed by the browser as an HTML element
	  before Mermaid reads the diagram text.
	-->
	<div class="section-title">3. Model Architecture & Forward Pass</div>
	<div class="mermaid">
	flowchart TD
	  subgraph INPUT["Input"]
	    IMG3["Ảnh X-quang\n(B, C, 448, 448)"]
	    TEXT["Tokenized Prompt\n(B, seq_len)"]
	  end

	  %% Blue: image encoder, marked frozen.
	  subgraph ENC["🔵 BioViL-T Encoder — FROZEN\nimage_encoder.py"]
	    E1["Pretrained trên CXR + radiology reports\n(Microsoft hi-ml-multimodal)"]
	    E2["patch_features\n(B, num_patches, 768)"]
	    E1 --> E2
	  end

	  %% Green: projection from patch features to LLM-width image tokens, marked trained.
	  subgraph PROJ["🟢 MLP Projection — TRAINED\nprojection.py"]
	    direction TB
	    PR1["Learnable query tokens\n(1, 32, 768)"]
	    PR2["Cross-Attention\nquery ← patches"]
	    PR3["MLP: 768 → 1024 → 4096"]
	    PR4["image_tokens\n(B, 32, 4096)"]
	    PR1 --> PR2 --> PR3 --> PR4
	  end

	  %% Yellow: optional classifier, marked frozen.
	  subgraph CHEX["🟡 CheXpert Classifier — FROZEN (optional)\nchexpert_classifier.py"]
	    C1["Structured labels\ne.g. 'Pleural Effusion: Positive'"]
	  end

	  %% Red: language model; only the LoRA part is marked trained.
	  subgraph LLM["🔴 Vicuna-7B + LoRA — TRAINED (LoRA only)\ncxr_vlm.py"]
	    direction TB
	    L1["embed_tokens(input_ids)\n→ text_embeds (B, seq_len, 4096)"]
	    L2["_inject_image_tokens()\nThay #lt;image#gt; token → 32 visual tokens"]
	    L3["LlamaForCausalLM.forward()\ninputs_embeds + attention_mask + labels"]
	    L4["Cross-Entropy Loss\n(chỉ tính trên target tokens, prompt = -100)"]
	    L1 --> L2 --> L3 --> L4
	  end

	  IMG3 --> ENC --> PROJ
	  TEXT --> LLM
	  PROJ --> L2
	  %% Dotted edge: classifier output is added to the prompt text.
	  %% Edge labels are delimited by plain pipe characters and quoted like the other diagrams.
	  CHEX -.->|"prepend vào prompt text"| TEXT

	  style ENC fill:#1e2e4a,stroke:#60a5fa,color:#e2e8f0
	  style PROJ fill:#1e3a2a,stroke:#4ade80,color:#e2e8f0
	  style CHEX fill:#3b3a1e,stroke:#facc15,color:#e2e8f0
	  style LLM fill:#3a1e1e,stroke:#f87171,color:#e2e8f0
	  style INPUT fill:#1e2433,stroke:#64748b,color:#e2e8f0
	</div>

	<!-- ═══════════════════════════════════════════════════════════════════ -->
	<!-- Diagram 4: which components are frozen or trained in each of the two training stages. -->
	<div class="section-title">4. Training Stages</div>
	<div class="mermaid">
	flowchart LR
	  %% Stage 1: only the projection is marked as trained.
	  subgraph S1["Stage 1 — Alignment\nset_stage1_mode()"]
	    direction TB
	    S1A["🔵 BioViL-T → FROZEN"]
	    S1B["🟢 MLP Projection → TRAIN ✓"]
	    S1C["🔴 Vicuna-7B LoRA → FROZEN"]
	    S1D["Mục tiêu: học căn chỉnh\nvisual ↔ text embedding space"]
	  end

	  %% Stage 2: projection and LoRA adapters are marked as trained.
	  subgraph S2["Stage 2 — Fine-tuning\nset_stage2_mode()"]
	    direction TB
	    S2A["🔵 BioViL-T → FROZEN"]
	    S2B["🟢 MLP Projection → TRAIN ✓"]
	    S2C["🔴 Vicuna-7B LoRA → TRAIN ✓\n(chỉ LoRA adapters, ~0.1% params)"]
	    S2D["Mục tiêu: học 3 tasks\nfindings / impression / VQA"]
	  end

	  %% Edge labels are delimited by plain pipe characters.
	  S1 -->|"Projection converged"| S2

	  style S1 fill:#1e3a2a,stroke:#4ade80,color:#e2e8f0
	  style S2 fill:#3a1e2a,stroke:#f472b6,color:#e2e8f0
	</div>

	<!-- ═══════════════════════════════════════════════════════════════════ -->
	<!-- Diagram 5: generation steps, example outputs per task, and the metrics listed for each task family. -->
	<div class="section-title">5. Inference & Evaluation</div>
	<div class="mermaid">
	flowchart LR
	  %% Generation steps, top to bottom.
	  subgraph INF["generate() — cxr_vlm.py"]
	    direction TB
	    I1["Encode image → 32 visual tokens"]
	    I2["Build prompt theo task"]
	    I3["llm.generate()\ngreedy / beam search"]
	    I4["Split tại 'ASSISTANT:'\n→ response text"]
	    I1 --> I2
	    I2 --> I3
	    I3 --> I4
	  end

	  %% Metrics listed per task family.
	  subgraph EVAL["Evaluation — evaluation/"]
	    direction TB
	    EV1["Findings / Impression:\nBLEU-4, ROUGE-L, BERTScore, ClinicalF1"]
	    EV2["VQA:\nAccuracy, F1"]
	  end

	  %% One example response per task.
	  subgraph OUT2["Output theo task"]
	    direction TB
	    O1["findings: 'The lungs are clear...'"]
	    O2["impression: 'No acute process.'"]
	    O3["vqa: 'Yes, mild pleural effusion.'"]
	  end

	  %% Generated outputs flow on to evaluation.
	  INF --> OUT2 --> EVAL

	  style INF fill:#1e3a5f,stroke:#38bdf8,color:#e2e8f0
	  style EVAL fill:#1e3a2a,stroke:#4ade80,color:#e2e8f0
	  style OUT2 fill:#3b2a1e,stroke:#f97316,color:#e2e8f0
	</div>

	<!-- Colour key: each swatch uses the stroke colour of the matching boxes in the diagrams above. -->
	<div class="legend">
	  <div class="legend-item">
	    <div class="legend-color" style="background:#60a5fa"></div>
	    BioViL-T (frozen)
	  </div>
	  <div class="legend-item">
	    <div class="legend-color" style="background:#4ade80"></div>
	    MLP Projection (trained)
	  </div>
	  <div class="legend-item">
	    <div class="legend-color" style="background:#facc15"></div>
	    CheXpert Classifier (frozen)
	  </div>
	  <div class="legend-item">
	    <div class="legend-color" style="background:#f87171"></div>
	    Vicuna-7B + LoRA (LoRA trained)
	  </div>
	  <div class="legend-item">
	    <div class="legend-color" style="background:#f97316"></div>
	    Data / Output
	  </div>
	</div>

	<script>
	mermaid.initialize({ startOnLoad: true, theme: 'dark', themeVariables: { fontSize: '14px' } });
	</script>
	</body>
	</html>