| <!DOCTYPE html> |
| <html lang="vi"> |
| <head> |
| <meta charset="UTF-8" /> |
| <!-- Lay the page out at device width so it is readable on phones instead of a zoomed-out desktop canvas. --> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <title>CXR-VLM Pipeline</title> |
| <!-- Mermaid 10.x from jsDelivr. Kept as a blocking script on purpose: the inline script at the end of the body calls mermaid.initialize() and needs the global to exist by then. --> |
| <script src="https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min.js"></script> |
| <style> |
|   /* Page shell: dark canvas, light text, generous outer padding. */ |
|   body { |
|     margin: 0; |
|     padding: 32px; |
|     background: #0f1117; |
|     color: #e2e8f0; |
|     font-family: 'Segoe UI', sans-serif; |
|   } |
| |
|   /* Centered page title in the accent blue. */ |
|   h1 { |
|     margin-bottom: 8px; |
|     font-size: 1.6rem; |
|     letter-spacing: 0.5px; |
|     text-align: center; |
|     color: #7dd3fc; |
|   } |
| |
|   /* Muted tagline directly under the title. */ |
|   .subtitle { |
|     margin-bottom: 36px; |
|     font-size: 0.9rem; |
|     text-align: center; |
|     color: #94a3b8; |
|   } |
| |
|   /* Numbered section label: small uppercase text with an accent bar on the left. */ |
|   .section-title { |
|     margin: 40px 0 12px; |
|     padding-left: 4px; |
|     border-left: 3px solid #38bdf8; |
|     font-size: 0.95rem; |
|     font-weight: 600; |
|     letter-spacing: 1px; |
|     text-transform: uppercase; |
|     color: #7dd3fc; |
|   } |
| |
|   /* Card around each diagram; scrolls sideways when the diagram is wider than the card. */ |
|   .mermaid { |
|     padding: 24px; |
|     border-radius: 12px; |
|     background: #1e2433; |
|     overflow-x: auto; |
|   } |
| |
|   /* Colour key: a centered, wrapping row of swatch + label pairs. */ |
|   .legend { |
|     display: flex; |
|     flex-wrap: wrap; |
|     justify-content: center; |
|     gap: 16px; |
|     margin-top: 32px; |
|   } |
|   .legend-item { |
|     display: flex; |
|     align-items: center; |
|     gap: 8px; |
|     font-size: 0.82rem; |
|     color: #cbd5e1; |
|   } |
|   /* The swatch colour itself is supplied per item via an inline background. */ |
|   .legend-color { |
|     width: 16px; |
|     height: 16px; |
|     border-radius: 4px; |
|   } |
| </style> |
| </head> |
| <body> |
| <!-- Page banner. The title and tagline are English while the document language is "vi", so the language is declared on the wrapper for assistive technology. --> |
| <header lang="en"> |
| <h1>CXR-VLM — Pipeline Overview</h1> |
| <p class="subtitle">Vision-Language Model for Chest X-ray Interpretation | Based on RaDialog</p> |
| </header> |
|
|
| |
| <!-- Section 1: how the three PhysioNet sources are parsed into one unified instruction JSON. The text inside the .mermaid container is Mermaid flowchart source, rendered client-side. The label is a real heading so the page has a navigable outline; the .section-title class already fixes its size, weight and margins. --> |
| <h2 class="section-title" lang="en">1. Data Pipeline</h2> |
| <div class="mermaid"> |
| flowchart LR |
| %% Raw inputs |
| subgraph SRC["📦 Nguồn dữ liệu (PhysioNet)"] |
| direction TB |
| D1["MIMIC-CXR\n(reports/*.txt)"] |
| D2["MIMIC-CXR-JPG\n(files/**/*.jpg)"] |
| D3["MIMIC-Ext-CXR-QBA\n(*.json)"] |
| end |
|
| %% Parsing steps |
| subgraph BUILD["🔧 build_instruct_json()"] |
| direction TB |
| P1["Parse findings\ntừ report .txt"] |
| P2["Parse impression\ntừ report .txt"] |
| P3["Parse Q&A pairs\ntừ JSON"] |
| end |
|
| %% One record shape per task |
| subgraph JSON["📄 Unified Instruction JSON"] |
| direction TB |
| J1["{ task: findings,\n image_path, target }"] |
| J2["{ task: impression,\n image_path, target }"] |
| J3["{ task: vqa,\n image_path, question, target }"] |
| end |
|
| %% Solid edges carry parsed text, the dotted edge is the image path reference |
| D1 --> P1 & P2 |
| D3 --> P3 |
| D2 -.->|"image_path ref"| J1 & J2 & J3 |
| P1 --> J1 |
| P2 --> J2 |
| P3 --> J3 |
|
| %% Subgraph colours |
| style SRC fill:#1e3a5f,stroke:#38bdf8,color:#e2e8f0 |
| style BUILD fill:#1e3a2a,stroke:#4ade80,color:#e2e8f0 |
| style JSON fill:#3b2a1e,stroke:#f97316,color:#e2e8f0 |
| </div> |
|
|
| |
| <!-- Section 2: how a dataset item becomes a tokenized training sample. The image placeholder in the prompt-format label is written with Mermaid entity codes (#lt; and #gt;) because a literal angle-bracket tag inside this container is turned into an element by the HTML parser before Mermaid reads the text, so the placeholder would vanish from the label. --> |
| <h2 class="section-title" lang="en">2. Dataset &amp; Prompt Construction</h2> |
| <div class="mermaid"> |
| flowchart TD |
| JSON2["📄 Instruction JSON\n(image_path, task, target, question?)"] |
|
| %% Per-item work done by the dataset class |
| subgraph DS["CXRInstructDataset.__getitem__()"] |
| direction LR |
| IMG["_load_image()\n→ tensor C×H×W"] |
| TMPL["build_training_sample(task)\nprompt_templates.py"] |
| TOK["_tokenize(prompt, target)\nlabels: -100 trên prompt tokens"] |
| end |
|
| %% Instruction text chosen per task |
| subgraph PROMPTS["10 biến thể prompt mỗi task (random.choice)"] |
| direction TB |
| F["Task: findings\n→ random.choice(FINDINGS_PROMPTS)\ne.g. 'Generate the findings section...'"] |
| I["Task: impression\n→ random.choice(IMPRESSION_PROMPTS)\ne.g. 'Provide a concise clinical impression...'"] |
| V["Task: vqa\n→ câu hỏi trực tiếp từ dataset\ne.g. 'Is there pleural effusion?'"] |
| end |
|
| %% Conversation template wrapped around the instruction |
| subgraph FMT["Vicuna v1.1 Format"] |
| direction TB |
| FMT1["SYSTEM: You are a radiologist...\nUSER: #lt;image#gt;\n[Predicted Findings: ...]\n{instruction}\nASSISTANT:"] |
| end |
|
| OUT["Batch output:\n{ image, input_ids, attention_mask, labels, task }"] |
|
| JSON2 --> DS |
| DS --> TMPL |
| TMPL --> PROMPTS |
| PROMPTS --> FMT |
| FMT --> TOK |
| IMG --> OUT |
| TOK --> OUT |
|
| %% Subgraph colours |
| style DS fill:#1e3a5f,stroke:#38bdf8,color:#e2e8f0 |
| style PROMPTS fill:#2a1e3b,stroke:#a78bfa,color:#e2e8f0 |
| style FMT fill:#3b2a1e,stroke:#f97316,color:#e2e8f0 |
| </div> |
|
|
| |
| <!-- Section 3: components of the model and the path of one forward pass. The image placeholder in the token-injection label is written with Mermaid entity codes (#lt; and #gt;) because a literal angle-bracket tag inside this container is turned into an element by the HTML parser before Mermaid reads the text, so the placeholder would vanish from the label. --> |
| <h2 class="section-title" lang="en">3. Model Architecture &amp; Forward Pass</h2> |
| <div class="mermaid"> |
| flowchart TD |
| subgraph INPUT["Input"] |
| IMG3["Ảnh X-quang\n(B, C, 448, 448)"] |
| TEXT["Tokenized Prompt\n(B, seq_len)"] |
| end |
|
| %% Vision encoder |
| subgraph ENC["🔵 BioViL-T Encoder — FROZEN\nimage_encoder.py"] |
| E1["Pretrained trên CXR + radiology reports\n(Microsoft hi-ml-multimodal)"] |
| E2["patch_features\n(B, num_patches, 768)"] |
| E1 --> E2 |
| end |
|
| %% Projection from patch features to language-model width |
| subgraph PROJ["🟢 MLP Projection — TRAINED\nprojection.py"] |
| direction TB |
| PR1["Learnable query tokens\n(1, 32, 768)"] |
| PR2["Cross-Attention\nquery ← patches"] |
| PR3["MLP: 768 → 1024 → 4096"] |
| PR4["image_tokens\n(B, 32, 4096)"] |
| PR1 --> PR2 --> PR3 --> PR4 |
| end |
|
| %% Optional label source whose output is prepended to the prompt text |
| subgraph CHEX["🟡 CheXpert Classifier — FROZEN (optional)\nchexpert_classifier.py"] |
| C1["Structured labels\ne.g. 'Pleural Effusion: Positive'"] |
| end |
|
| %% Language model side |
| subgraph LLM["🔴 Vicuna-7B + LoRA — TRAINED (LoRA only)\ncxr_vlm.py"] |
| direction TB |
| L1["embed_tokens(input_ids)\n→ text_embeds (B, seq_len, 4096)"] |
| L2["_inject_image_tokens()\nThay #lt;image#gt; token → 32 visual tokens"] |
| L3["LlamaForCausalLM.forward()\ninputs_embeds + attention_mask + labels"] |
| L4["Cross-Entropy Loss\n(chỉ tính trên target tokens, prompt = -100)"] |
| L1 --> L2 --> L3 --> L4 |
| end |
|
| IMG3 --> ENC --> PROJ |
| TEXT --> LLM |
| PROJ --> L2 |
| CHEX -.->|prepend vào prompt text| TEXT |
|
| %% Subgraph colours, matching the legend at the bottom of the page |
| style ENC fill:#1e2e4a,stroke:#60a5fa,color:#e2e8f0 |
| style PROJ fill:#1e3a2a,stroke:#4ade80,color:#e2e8f0 |
| style CHEX fill:#3b3a1e,stroke:#facc15,color:#e2e8f0 |
| style LLM fill:#3a1e1e,stroke:#f87171,color:#e2e8f0 |
| style INPUT fill:#1e2433,stroke:#64748b,color:#e2e8f0 |
| </div> |
|
|
| |
| <!-- Section 4: which components are frozen or trained in each of the two training stages. The label is a real heading so the page has a navigable outline; the .section-title class already fixes its size, weight and margins. --> |
| <h2 class="section-title" lang="en">4. Training Stages</h2> |
| <div class="mermaid"> |
| flowchart LR |
| %% Stage 1: only the projection is trained |
| subgraph S1["Stage 1 — Alignment\nset_stage1_mode()"] |
| direction TB |
| S1A["🔵 BioViL-T → FROZEN"] |
| S1B["🟢 MLP Projection → TRAIN ✓"] |
| S1C["🔴 Vicuna-7B LoRA → FROZEN"] |
| S1D["Mục tiêu: học căn chỉnh\nvisual ↔ text embedding space"] |
| end |
|
| %% Stage 2: projection and LoRA adapters are trained |
| subgraph S2["Stage 2 — Fine-tuning\nset_stage2_mode()"] |
| direction TB |
| S2A["🔵 BioViL-T → FROZEN"] |
| S2B["🟢 MLP Projection → TRAIN ✓"] |
| S2C["🔴 Vicuna-7B LoRA → TRAIN ✓\n(chỉ LoRA adapters, ~0.1% params)"] |
| S2D["Mục tiêu: học 3 tasks\nfindings / impression / VQA"] |
| end |
|
| S1 -->|"Projection converged"| S2 |
|
| %% Subgraph colours |
| style S1 fill:#1e3a2a,stroke:#4ade80,color:#e2e8f0 |
| style S2 fill:#3a1e2a,stroke:#f472b6,color:#e2e8f0 |
| </div> |
|
|
| |
| <!-- Section 5: generation steps, example outputs per task, and the metrics applied to them. The label is a real heading so the page has a navigable outline; the .section-title class already fixes its size, weight and margins. --> |
| <h2 class="section-title" lang="en">5. Inference &amp; Evaluation</h2> |
| <div class="mermaid"> |
| flowchart LR |
| %% Generation steps |
| subgraph INF["generate() — cxr_vlm.py"] |
| direction TB |
| I1["Encode image → 32 visual tokens"] |
| I2["Build prompt theo task"] |
| I3["llm.generate()\ngreedy / beam search"] |
| I4["Split tại 'ASSISTANT:'\n→ response text"] |
| I1 --> I2 --> I3 --> I4 |
| end |
|
| %% Metrics per task family |
| subgraph EVAL["Evaluation — evaluation/"] |
| direction TB |
| EV1["Findings / Impression:\nBLEU-4, ROUGE-L, BERTScore, ClinicalF1"] |
| EV2["VQA:\nAccuracy, F1"] |
| end |
|
| %% Example responses |
| subgraph OUT2["Output theo task"] |
| direction TB |
| O1["findings: 'The lungs are clear...'"] |
| O2["impression: 'No acute process.'"] |
| O3["vqa: 'Yes, mild pleural effusion.'"] |
| end |
|
| INF --> OUT2 |
| OUT2 --> EVAL |
|
| %% Subgraph colours |
| style INF fill:#1e3a5f,stroke:#38bdf8,color:#e2e8f0 |
| style EVAL fill:#1e3a2a,stroke:#4ade80,color:#e2e8f0 |
| style OUT2 fill:#3b2a1e,stroke:#f97316,color:#e2e8f0 |
| </div> |
|
|
| <!-- Colour key: each swatch repeats a border colour used for the matching component or data box in the diagrams above. --> |
| <div class="legend"> |
|   <div class="legend-item"> |
|     <div class="legend-color" style="background:#60a5fa"></div>BioViL-T (frozen)</div> |
|   <div class="legend-item"> |
|     <div class="legend-color" style="background:#4ade80"></div>MLP Projection (trained)</div> |
|   <div class="legend-item"> |
|     <div class="legend-color" style="background:#facc15"></div>CheXpert Classifier (frozen)</div> |
|   <div class="legend-item"> |
|     <div class="legend-color" style="background:#f87171"></div>Vicuna-7B + LoRA (LoRA trained)</div> |
|   <div class="legend-item"> |
|     <div class="legend-color" style="background:#f97316"></div>Data / Output</div> |
| </div> |
|
|
| <script> |
|   // Configure Mermaid once: startOnLoad is enabled so the .mermaid containers are rendered on page load, |
|   // the dark theme is selected, and the theme's base font size is set to 14px. |
|   mermaid.initialize({ |
|     startOnLoad: true, |
|     theme: 'dark', |
|     themeVariables: { |
|       fontSize: '14px' |
|     } |
|   }); |
| </script> |
| </body> |
| </html> |
|
|