File size: 6,096 Bytes
0710b5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""
pipeline.py
============
Master Orchestrator — Task 2, Iteration 3

This script chains all five steps together in the correct order,
printing a clear progress banner at each stage so you can see exactly
what is happening and inspect intermediate results.

Step-by-step flow
------------------
  STEP 1 → Load BLIP model (with fine-tuned weights if available).
  STEP 2 → Encode image through ViT  →  encoder_hidden_states.
  STEP 3 → Greedy decode token-by-token with Attention Flow heatmaps
            (multi-layer GradCAM rollout, bicubic upscaling).
  STEP 4 → Build 2×5 overlay grid image  →  attention_grid_v3.png.
  STEP 5 → Grade alignment with OWL-ViT + IoU  →  iou_chart_v3.png.

Designed to be deployment-friendly:
  • Every step is a clean function import from its own module.
  • Intermediate artefacts (heatmaps, tokens) can be inspected between steps.
  • Outputs are saved to the same directory as this script.

Usage:
    export PYTHONPATH=.
    venv/bin/python task/task_02/pipeline.py
"""

import os
import sys
import requests
from PIL import Image

# ── path bootstrap ────────────────────────────────────────────────────────────
_THIS_DIR       = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT   = os.path.dirname(os.path.dirname(_THIS_DIR))
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

# ── step imports ─────────────────────────────────────────────────────────────
from task.task_02.step1_load_model  import load_model
from task.task_02.step2_encode_image import encode_image
from task.task_02.step3_gradcam_flow import generate_with_flow
from task.task_02.step4_visualize    import save_attention_grid
from task.task_02.step5_iou_grade    import load_detector, grade_alignment, plot_iou_chart


# ── Output paths ─────────────────────────────────────────────────────────────
OUT_GRID  = os.path.join(_THIS_DIR, "attention_grid_v3.png")
OUT_CHART = os.path.join(_THIS_DIR, "iou_chart_v3.png")

# ── Test images ───────────────────────────────────────────────────────────────
TEST_URLS = [
    "http://images.cocodataset.org/val2017/000000039769.jpg",  # cats on couch
    "http://images.cocodataset.org/val2017/000000000139.jpg",  # dining room
]


def _load_image(url: str, timeout: float = 30.0) -> Image.Image:
    """Download an image from ``url`` and return it as a PIL RGB image.

    Args:
        url: HTTP(S) address of the image to fetch.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot stall the pipeline indefinitely.

    Returns:
        The fully decoded image, converted to RGB mode.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    print(f"\n📥 Downloading test image: {url}")
    # The context manager releases the connection even if decoding fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly on 4xx/5xx instead of handing an error page to PIL.
        response.raise_for_status()
        # The raw stream skips gzip/deflate decoding unless explicitly enabled.
        response.raw.decode_content = True
        # convert() forces a full read, so closing the stream afterwards is safe.
        return Image.open(response.raw).convert("RGB")


def _banner(step: int, title: str) -> None:
    """Print a progress banner announcing a pipeline step.

    The banner is a blank line, a 60-character ``=`` rule, the line
    ``  STEP <step> — <title>``, and a closing rule.

    Args:
        step: Step number shown in the banner.
        title: Short human-readable description of the step.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    # The separator is a real em dash (U+2014), not its mis-decoded byte sequence.
    print(f"  STEP {step} \u2014 {title}")
    print(rule)


# ── Main pipeline ─────────────────────────────────────────────────────────────
def run_pipeline() -> None:
    """Run the five pipeline steps on every URL in ``TEST_URLS``.

    The captioning model and the detector used for grading are loaded once and
    reused for all images. Each image is downloaded, encoded, captioned with
    one heatmap per generated token, and graded. The attention grid is rendered
    for the first image only; the IoU results of all images are combined into a
    single chart.

    Side effects:
        Prints progress to stdout, writes ``OUT_GRID`` for the first image and
        writes ``OUT_CHART`` when at least one IoU result was produced. The
        closing summary reports only the artefacts that were actually written.
    """
    # ── STEP 1: Load model ───────────────────────────────────────────────────
    _banner(1, "Load BLIP Model")
    model, processor, device = load_model(use_finetuned=True)

    # ── Load OWL-ViT grader (do once, reuse for all images) ─────────────────
    detector = load_detector(device)

    # Aggregate IoU results across images for the final chart
    all_iou_results = []
    grid_written = False

    # Track position rather than comparing URL strings, so a URL that appears
    # twice in TEST_URLS does not trigger a second grid render.
    for index, img_url in enumerate(TEST_URLS):
        raw_image = _load_image(img_url)

        # ── STEP 2: Encode image ─────────────────────────────────────────────
        _banner(2, "Encode Image through ViT")
        image_224, enc_hidden, enc_mask = encode_image(model, processor, device, raw_image)

        # ── STEP 3: Generate caption + Attention Flow heatmaps ───────────────
        _banner(3, "Greedy Decode with Attention Flow")
        tokens, heatmaps = generate_with_flow(
            model, processor, device, enc_hidden, enc_mask
        )

        # ── INSPECT intermediate results ─────────────────────────────────────
        heatmap_shape = heatmaps[0].shape if heatmaps else 'N/A'
        peak_values = [f"{h.max():.3f}" for h in heatmaps[:5]]
        print(f"\n  📝 Tokens   : {tokens}")
        print(f"  🗺  Heatmaps : {len(heatmaps)} maps, each shape {heatmap_shape}")
        print(f"     Peak values: {peak_values} …")

        # ── STEP 4: Visualize (only for the first image to save space) ───────
        if index == 0:
            _banner(4, "Build Attention Grid Visualization")
            save_attention_grid(image_224, tokens, heatmaps, out_path=OUT_GRID)
            grid_written = True

        # ── STEP 5: Grade alignment ──────────────────────────────────────────
        _banner(5, "Grade Attention Alignment (IoU)")
        results = grade_alignment(raw_image, tokens, heatmaps, detector)
        all_iou_results.extend(results)

    # ── Save IoU chart (all images combined) ─────────────────────────────────
    chart_written = bool(all_iou_results)
    if chart_written:
        print(f"\n📈 Saving IoU chart for {len(all_iou_results)} data points …")
        plot_iou_chart(all_iou_results, out_path=OUT_CHART)

    # Only advertise a path when the corresponding file was really produced.
    grid_report = OUT_GRID if grid_written else "(not generated)"
    chart_report = OUT_CHART if chart_written else "(not generated)"

    print("\n" + "="*60)
    print("  ✅  PIPELINE COMPLETE")
    print(f"     Attention grid  → {grid_report}")
    print(f"     IoU chart       → {chart_report}")
    print("="*60 + "\n")


if __name__ == "__main__":
    run_pipeline()