"""
pipeline.py
============
Master Orchestrator — Task 2, Iteration 3
This script chains all five steps together in the correct order,
printing a clear progress banner at each stage so you can see exactly
what is happening and inspect intermediate results.
Step-by-step flow
------------------
STEP 1 — Load BLIP model (with fine-tuned weights if available).
STEP 2 — Encode image through ViT → encoder_hidden_states.
STEP 3 — Greedy decode token-by-token with Attention Flow heatmaps
         (multi-layer GradCAM rollout, bicubic upscaling).
STEP 4 — Build 2×5 overlay grid image → attention_grid_v3.png.
STEP 5 — Grade alignment with OWL-ViT + IoU → iou_chart_v3.png.
Designed to be deployment-friendly:
• Every step is a clean function import from its own module.
• Intermediate artefacts (heatmaps, tokens) can be inspected between steps.
• Outputs are saved to the same directory as this script.
Usage:
export PYTHONPATH=.
venv/bin/python task/task_02/pipeline.py
"""
import io
import os
import sys

import requests
from PIL import Image
# ── path bootstrap ──────────────────────────────────────────────────────────
# Make the project root importable before the ``task.task_02.*`` imports that
# follow: the root is taken to be two directory levels above the directory
# holding this script, and it is pushed to the front of sys.path only when it
# is not already listed there.
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.dirname(os.path.dirname(_THIS_DIR))
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
# ── step imports ────────────────────────────────────────────────────────────
from task.task_02.step1_load_model import load_model
from task.task_02.step2_encode_image import encode_image
from task.task_02.step3_gradcam_flow import generate_with_flow
from task.task_02.step4_visualize import save_attention_grid
from task.task_02.step5_iou_grade import load_detector, grade_alignment, plot_iou_chart
# ── Output paths ────────────────────────────────────────────────────────────
# Both artefacts are written into _THIS_DIR, the directory containing this
# script.
OUT_GRID = os.path.join(_THIS_DIR, "attention_grid_v3.png")
OUT_CHART = os.path.join(_THIS_DIR, "iou_chart_v3.png")
# ── Test images ─────────────────────────────────────────────────────────────
# Every test image lives under the same COCO val2017 location, so the URLs
# are assembled from one shared base plus the file name.
_COCO_VAL2017_BASE = "http://images.cocodataset.org/val2017"
TEST_URLS = [
    f"{_COCO_VAL2017_BASE}/{image_name}"
    for image_name in (
        "000000039769.jpg",  # cats on couch
        "000000000139.jpg",  # dining room
    )
]
def _load_image(url: str, timeout: float = 30.0) -> Image.Image:
    """Download the image at *url* and return it as an RGB PIL image.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` would wait indefinitely on a stalled
            connection.

    Returns:
        The downloaded image converted to RGB mode.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On timeouts and connection failures.
    """
    print(f"\nπ₯ Downloading test image: {url}")
    # Read the whole body instead of handing the raw socket stream to PIL:
    # the connection is released as soon as the content has been consumed.
    response = requests.get(url, timeout=timeout)
    # Fail here on 4xx/5xx rather than letting PIL try to decode an error page.
    response.raise_for_status()
    return Image.open(io.BytesIO(response.content)).convert("RGB")
def _banner(step: int, title: str):
    """Print a step heading framed by two 60-character ``=`` rules.

    The output starts with a blank line, followed by a rule, the
    ``STEP <step>`` heading carrying *title*, and a closing rule.
    """
    rule = "=" * 60
    heading = f" STEP {step} β {title}"
    # One print call; the separator supplies the line breaks between parts.
    print("\n" + rule, heading, rule, sep="\n")
# ── Main pipeline ───────────────────────────────────────────────────────────
def run_pipeline():
    """Run all five steps end to end over every URL in ``TEST_URLS``.

    The model and the detector are each loaded once and shared by all
    images. Every image is downloaded, encoded, decoded into tokens with
    one heatmap per token, and graded. The attention grid is rendered for
    the first image only; the IoU results of all images are pooled into a
    single chart, which is written only when at least one result exists.

    The closing summary lists only the artefacts that were actually
    written during this run.
    """
    # ── STEP 1: Load model ──────────────────────────────────────────────────
    _banner(1, "Load BLIP Model")
    model, processor, device = load_model(use_finetuned=True)

    # Load the grader once and reuse it for all images.
    detector = load_detector(device)

    # IoU results of every image, pooled for the final chart.
    all_iou_results = []
    grid_written = False
    chart_written = False

    for index, img_url in enumerate(TEST_URLS):
        raw_image = _load_image(img_url)

        # ── STEP 2: Encode image ────────────────────────────────────────────
        _banner(2, "Encode Image through ViT")
        image_224, enc_hidden, enc_mask = encode_image(
            model, processor, device, raw_image
        )

        # ── STEP 3: Generate caption + Attention Flow heatmaps ──────────────
        _banner(3, "Greedy Decode with Attention Flow")
        tokens, heatmaps = generate_with_flow(
            model, processor, device, enc_hidden, enc_mask
        )

        # ── Inspect intermediate results ────────────────────────────────────
        # Guard the shape lookup: an empty heatmap list has no first element.
        shape = heatmaps[0].shape if heatmaps else 'N/A'
        print(f"\n π Tokens : {tokens}")
        print(f" πΊ Heatmaps : {len(heatmaps)} maps, each shape {shape}")
        print(f" Peak values: {[f'{h.max():.3f}' for h in heatmaps[:5]]} β¦")

        # ── STEP 4: Visualize (first image only, to save space) ─────────────
        # Decide by position, so a repeated URL cannot trigger a second render.
        if index == 0:
            _banner(4, "Build Attention Grid Visualization")
            save_attention_grid(image_224, tokens, heatmaps, out_path=OUT_GRID)
            grid_written = True

        # ── STEP 5: Grade alignment ─────────────────────────────────────────
        _banner(5, "Grade Attention Alignment (IoU)")
        results = grade_alignment(raw_image, tokens, heatmaps, detector)
        all_iou_results.extend(results)

    # ── Save IoU chart (all images combined) ────────────────────────────────
    if all_iou_results:
        print(f"\nπ Saving IoU chart for {len(all_iou_results)} data points β¦")
        plot_iou_chart(all_iou_results, out_path=OUT_CHART)
        chart_written = True

    print("\n" + "="*60)
    print(" β PIPELINE COMPLETE")
    # Report a path only when the corresponding file was produced.
    if grid_written:
        print(f" Attention grid β {OUT_GRID}")
    if chart_written:
        print(f" IoU chart β {OUT_CHART}")
    print("="*60 + "\n")
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    run_pipeline()
|