cxr-vlm-code / pipeline.html
convitom
initial commit
28b13fc
<!DOCTYPE html>
<html lang="vi">
<head>
<meta charset="UTF-8" />
<title>CXR-VLM Pipeline</title>
<script src="https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min.js"></script>
<style>
  /* Page shell: dark background, light text, generous outer padding. */
  body {
    margin: 0;
    padding: 32px;
    color: #e2e8f0;
    background: #0f1117;
    font-family: 'Segoe UI', sans-serif;
  }

  /* Centered page title in the accent blue. */
  h1 {
    margin-bottom: 8px;
    font-size: 1.6rem;
    letter-spacing: 0.5px;
    color: #7dd3fc;
    text-align: center;
  }

  /* Muted one-line description under the title. */
  .subtitle {
    margin-bottom: 36px;
    font-size: 0.9rem;
    color: #94a3b8;
    text-align: center;
  }

  /* Numbered heading above each diagram, marked by a left accent bar. */
  .section-title {
    margin: 40px 0 12px;
    padding-left: 4px;
    border-left: 3px solid #38bdf8;
    font-size: 0.95rem;
    font-weight: 600;
    letter-spacing: 1px;
    text-transform: uppercase;
    color: #7dd3fc;
  }

  /* Card that hosts a Mermaid diagram; scrolls horizontally when the diagram is wider than the page. */
  .mermaid {
    padding: 24px;
    overflow-x: auto;
    border-radius: 12px;
    background: #1e2433;
  }

  /* Color key at the bottom of the page: a centered, wrapping row of items. */
  .legend {
    display: flex;
    flex-wrap: wrap;
    justify-content: center;
    gap: 16px;
    margin-top: 32px;
  }

  /* One legend entry: color swatch followed by its label. */
  .legend-item {
    display: flex;
    align-items: center;
    gap: 8px;
    font-size: 0.82rem;
    color: #cbd5e1;
  }

  /* The swatch itself; its background color is set inline on each element. */
  .legend-color {
    width: 16px;
    height: 16px;
    border-radius: 4px;
  }
</style>
</head>
<body>
<h1>CXR-VLM — Pipeline Overview</h1>
<p class="subtitle">Vision-Language Model for Chest X-ray Interpretation &nbsp;|&nbsp; Based on RaDialog</p>
<!-- ═══════════════════════════════════════════════════════════════════ -->
<!--
  Diagram 1 (left to right): the three source datasets (D1, D2, D3) feed the
  parsing steps grouped under build_instruct_json() (P1, P2, P3), which emit
  three kinds of instruction records (J1 findings, J2 impression, J3 vqa).
  The dashed edge from D2 marks that every record only references an image
  path from MIMIC-CXR-JPG.
  Keep HTML comments outside the mermaid div: its contents are handed to the
  Mermaid parser as diagram text.
-->
<div class="section-title">1. Data Pipeline</div>
<div class="mermaid">
flowchart LR
subgraph SRC["📦 Nguồn dữ liệu (PhysioNet)"]
direction TB
D1["MIMIC-CXR\n(reports/*.txt)"]
D2["MIMIC-CXR-JPG\n(files/**/*.jpg)"]
D3["MIMIC-Ext-CXR-QBA\n(*.json)"]
end
subgraph BUILD["🔧 build_instruct_json()"]
direction TB
P1["Parse findings\ntừ report .txt"]
P2["Parse impression\ntừ report .txt"]
P3["Parse Q&A pairs\ntừ JSON"]
end
subgraph JSON["📄 Unified Instruction JSON"]
direction TB
J1["{ task: findings,\n image_path, target }"]
J2["{ task: impression,\n image_path, target }"]
J3["{ task: vqa,\n image_path, question, target }"]
end
D1 --> P1 & P2
D3 --> P3
D2 -.->|"image_path ref"| J1 & J2 & J3
P1 --> J1
P2 --> J2
P3 --> J3
style SRC fill:#1e3a5f,stroke:#38bdf8,color:#e2e8f0
style BUILD fill:#1e3a2a,stroke:#4ade80,color:#e2e8f0
style JSON fill:#3b2a1e,stroke:#f97316,color:#e2e8f0
</div>
<!-- ═══════════════════════════════════════════════════════════════════ -->
<!--
  Diagram 2 (top to bottom): how one instruction-JSON record becomes a
  training sample. The record enters CXRInstructDataset.__getitem__(), a
  prompt is chosen per task (PROMPTS), wrapped in the Vicuna v1.1 template
  (FMT) and tokenized (TOK); the image tensor (IMG) and the tokenized text
  are combined into the batch output (OUT).

  The literal <image> placeholder in FMT1 is written with Mermaid entity
  codes (#lt; and #gt;). Mermaid entity-decodes the contents of this div
  before parsing, so the HTML entities &lt; and &gt; would reach the label
  as a real tag and the placeholder text would not be displayed.
  Keep HTML comments outside the mermaid div: its contents are handed to the
  Mermaid parser as diagram text.
-->
<div class="section-title">2. Dataset & Prompt Construction</div>
<div class="mermaid">
flowchart TD
  JSON2["📄 Instruction JSON\n(image_path, task, target, question?)"]

  subgraph DS["CXRInstructDataset.__getitem__()"]
    direction LR
    IMG["_load_image()\n→ tensor C×H×W"]
    TMPL["build_training_sample(task)\nprompt_templates.py"]
    TOK["_tokenize(prompt, target)\nlabels: -100 trên prompt tokens"]
  end

  subgraph PROMPTS["10 biến thể prompt mỗi task (random.choice)"]
    direction TB
    F["Task: findings\n→ random.choice(FINDINGS_PROMPTS)\ne.g. 'Generate the findings section...'"]
    I["Task: impression\n→ random.choice(IMPRESSION_PROMPTS)\ne.g. 'Provide a concise clinical impression...'"]
    V["Task: vqa\n→ câu hỏi trực tiếp từ dataset\ne.g. 'Is there pleural effusion?'"]
  end

  subgraph FMT["Vicuna v1.1 Format"]
    direction TB
    FMT1["SYSTEM: You are a radiologist...\nUSER: #lt;image#gt;\n[Predicted Findings: ...]\n{instruction}\nASSISTANT:"]
  end

  OUT["Batch output:\n{ image, input_ids, attention_mask, labels, task }"]

  JSON2 --> DS
  DS --> TMPL
  TMPL --> PROMPTS
  PROMPTS --> FMT
  FMT --> TOK
  IMG --> OUT
  TOK --> OUT

  style DS fill:#1e3a5f,stroke:#38bdf8,color:#e2e8f0
  style PROMPTS fill:#2a1e3b,stroke:#a78bfa,color:#e2e8f0
  style FMT fill:#3b2a1e,stroke:#f97316,color:#e2e8f0
</div>
<!-- ═══════════════════════════════════════════════════════════════════ -->
<!--
  Diagram 3 (top to bottom): model components and the forward pass.
  The image goes through the frozen BioViL-T encoder (ENC) and the trained
  projection (PROJ), producing 32 image tokens of width 4096 that are
  injected at step L2 inside the Vicuna-7B + LoRA block (LLM). The tokenized
  prompt enters LLM directly. The optional frozen CheXpert classifier (CHEX)
  contributes structured labels that are prepended to the prompt text
  (dashed edge).

  The literal <image> token in L2 is written with Mermaid entity codes
  (#lt; and #gt;). Mermaid entity-decodes the contents of this div before
  parsing, so the HTML entities &lt; and &gt; would reach the label as a
  real tag and the token text would not be displayed.
  The CHEX edge label is quoted like the other edge labels on this page
  because it contains non-ASCII text.
  Keep HTML comments outside the mermaid div: its contents are handed to the
  Mermaid parser as diagram text.
-->
<div class="section-title">3. Model Architecture & Forward Pass</div>
<div class="mermaid">
flowchart TD
  subgraph INPUT["Input"]
    IMG3["Ảnh X-quang\n(B, C, 448, 448)"]
    TEXT["Tokenized Prompt\n(B, seq_len)"]
  end

  subgraph ENC["🔵 BioViL-T Encoder — FROZEN\nimage_encoder.py"]
    E1["Pretrained trên CXR + radiology reports\n(Microsoft hi-ml-multimodal)"]
    E2["patch_features\n(B, num_patches, 768)"]
    E1 --> E2
  end

  subgraph PROJ["🟢 MLP Projection — TRAINED\nprojection.py"]
    direction TB
    PR1["Learnable query tokens\n(1, 32, 768)"]
    PR2["Cross-Attention\nquery ← patches"]
    PR3["MLP: 768 → 1024 → 4096"]
    PR4["image_tokens\n(B, 32, 4096)"]
    PR1 --> PR2 --> PR3 --> PR4
  end

  subgraph CHEX["🟡 CheXpert Classifier — FROZEN (optional)\nchexpert_classifier.py"]
    C1["Structured labels\ne.g. 'Pleural Effusion: Positive'"]
  end

  subgraph LLM["🔴 Vicuna-7B + LoRA — TRAINED (LoRA only)\ncxr_vlm.py"]
    direction TB
    L1["embed_tokens(input_ids)\n→ text_embeds (B, seq_len, 4096)"]
    L2["_inject_image_tokens()\nThay #lt;image#gt; token → 32 visual tokens"]
    L3["LlamaForCausalLM.forward()\ninputs_embeds + attention_mask + labels"]
    L4["Cross-Entropy Loss\n(chỉ tính trên target tokens, prompt = -100)"]
    L1 --> L2 --> L3 --> L4
  end

  IMG3 --> ENC --> PROJ
  TEXT --> LLM
  PROJ --> L2
  CHEX -.->|"prepend vào prompt text"| TEXT

  style ENC fill:#1e2e4a,stroke:#60a5fa,color:#e2e8f0
  style PROJ fill:#1e3a2a,stroke:#4ade80,color:#e2e8f0
  style CHEX fill:#3b3a1e,stroke:#facc15,color:#e2e8f0
  style LLM fill:#3a1e1e,stroke:#f87171,color:#e2e8f0
  style INPUT fill:#1e2433,stroke:#64748b,color:#e2e8f0
</div>
<!-- ═══════════════════════════════════════════════════════════════════ -->
<!--
  Diagram 4 (left to right): the two training stages.
  Stage 1 (set_stage1_mode) trains only the MLP projection while BioViL-T and
  the Vicuna-7B LoRA stay frozen. Stage 2 (set_stage2_mode) keeps BioViL-T
  frozen and trains both the projection and the LoRA adapters. The edge
  between the stages is labelled "Projection converged".
  Keep HTML comments outside the mermaid div: its contents are handed to the
  Mermaid parser as diagram text.
-->
<div class="section-title">4. Training Stages</div>
<div class="mermaid">
flowchart LR
subgraph S1["Stage 1 — Alignment\nset_stage1_mode()"]
direction TB
S1A["🔵 BioViL-T → FROZEN"]
S1B["🟢 MLP Projection → TRAIN ✓"]
S1C["🔴 Vicuna-7B LoRA → FROZEN"]
S1D["Mục tiêu: học căn chỉnh\nvisual ↔ text embedding space"]
end
subgraph S2["Stage 2 — Fine-tuning\nset_stage2_mode()"]
direction TB
S2A["🔵 BioViL-T → FROZEN"]
S2B["🟢 MLP Projection → TRAIN ✓"]
S2C["🔴 Vicuna-7B LoRA → TRAIN ✓\n(chỉ LoRA adapters, ~0.1% params)"]
S2D["Mục tiêu: học 3 tasks\nfindings / impression / VQA"]
end
S1 -->|"Projection converged"| S2
style S1 fill:#1e3a2a,stroke:#4ade80,color:#e2e8f0
style S2 fill:#3a1e2a,stroke:#f472b6,color:#e2e8f0
</div>
<!-- ═══════════════════════════════════════════════════════════════════ -->
<!--
  Diagram 5 (left to right): inference and evaluation.
  INF lists the steps of generate() in order (I1 to I4), OUT2 shows one
  example response per task, and EVAL names the metrics applied to
  findings/impression outputs and to VQA outputs.
  Keep HTML comments outside the mermaid div: its contents are handed to the
  Mermaid parser as diagram text.
-->
<div class="section-title">5. Inference & Evaluation</div>
<div class="mermaid">
flowchart LR
subgraph INF["generate() — cxr_vlm.py"]
direction TB
I1["Encode image → 32 visual tokens"]
I2["Build prompt theo task"]
I3["llm.generate()\ngreedy / beam search"]
I4["Split tại 'ASSISTANT:'\n→ response text"]
I1 --> I2 --> I3 --> I4
end
subgraph EVAL["Evaluation — evaluation/"]
direction TB
EV1["Findings / Impression:\nBLEU-4, ROUGE-L, BERTScore, ClinicalF1"]
EV2["VQA:\nAccuracy, F1"]
end
subgraph OUT2["Output theo task"]
direction TB
O1["findings: 'The lungs are clear...'"]
O2["impression: 'No acute process.'"]
O3["vqa: 'Yes, mild pleural effusion.'"]
end
INF --> OUT2
OUT2 --> EVAL
style INF fill:#1e3a5f,stroke:#38bdf8,color:#e2e8f0
style EVAL fill:#1e3a2a,stroke:#4ade80,color:#e2e8f0
style OUT2 fill:#3b2a1e,stroke:#f97316,color:#e2e8f0
</div>
<!--
  Color key. The first four swatches reuse the stroke colors of the
  component subgraphs in diagram 3 (#60a5fa, #4ade80, #facc15, #f87171);
  the orange swatch (#f97316) matches the stroke of the JSON and output
  subgraphs in diagrams 1 and 5. Update both places together when a color
  changes.
-->
<div class="legend">
<div class="legend-item"><div class="legend-color" style="background:#60a5fa"></div>BioViL-T (frozen)</div>
<div class="legend-item"><div class="legend-color" style="background:#4ade80"></div>MLP Projection (trained)</div>
<div class="legend-item"><div class="legend-color" style="background:#facc15"></div>CheXpert Classifier (frozen)</div>
<div class="legend-item"><div class="legend-color" style="background:#f87171"></div>Vicuna-7B + LoRA (LoRA trained)</div>
<div class="legend-item"><div class="legend-color" style="background:#f97316"></div>Data / Output</div>
</div>
<script>
mermaid.initialize({ startOnLoad: true, theme: 'dark', themeVariables: { fontSize: '14px' } });
</script>
</body>
</html>