Spaces:

griddev
/

project_02_DS

Sleeping

File size: 6,096 Bytes

0710b5c

"""
pipeline.py
============
Master Orchestrator — Task 2, Iteration 3

This script chains all five steps together in the correct order,
printing a clear progress banner at each stage so you can see exactly
what is happening and inspect intermediate results.

Step-by-step flow
------------------
  STEP 1 → Load BLIP model (with fine-tuned weights if available).
  STEP 2 → Encode image through ViT  →  encoder_hidden_states.
  STEP 3 → Greedy decode token-by-token with Attention Flow heatmaps
            (multi-layer GradCAM rollout, bicubic upscaling).
  STEP 4 → Build 2×5 overlay grid image  →  attention_grid_v3.png
            (first test image only).
  STEP 5 → Grade alignment with OWL-ViT + IoU  →  iou_chart_v3.png
            (one chart combining the results of all test images).

Designed to be deployment-friendly:
  • Every step is a clean function import from its own module.
  • Intermediate artefacts (heatmaps, tokens) can be inspected between steps.
  • Outputs are saved to the same directory as this script.

Usage:
    export PYTHONPATH=.
    venv/bin/python task/task_02/pipeline.py
"""

import os
import sys
import requests
from PIL import Image

# ── path bootstrap ────────────────────────────────────────────────────────────
# Make the `task.task_02.*` imports below resolvable when this file is run
# directly as a script. This must execute before those imports.
# Absolute directory that contains this script.
_THIS_DIR       = os.path.dirname(os.path.abspath(__file__))
# Two directory levels above this script; with the script located at
# task/task_02/pipeline.py (see Usage) this is the directory holding `task/`.
_PROJECT_ROOT   = os.path.dirname(os.path.dirname(_THIS_DIR))
# Prepend only once so repeated imports of this module do not grow sys.path.
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

# ── step imports ─────────────────────────────────────────────────────────────
from task.task_02.step1_load_model  import load_model
from task.task_02.step2_encode_image import encode_image
from task.task_02.step3_gradcam_flow import generate_with_flow
from task.task_02.step4_visualize    import save_attention_grid
from task.task_02.step5_iou_grade    import load_detector, grade_alignment, plot_iou_chart


# ── Output paths ─────────────────────────────────────────────────────────────
# Both artefacts are written next to this script (inside _THIS_DIR).
# Destination passed to save_attention_grid() in STEP 4.
OUT_GRID  = os.path.join(_THIS_DIR, "attention_grid_v3.png")
# Destination passed to plot_iou_chart() after all images have been graded.
OUT_CHART = os.path.join(_THIS_DIR, "iou_chart_v3.png")

# ── Test images ───────────────────────────────────────────────────────────────
# Images processed by run_pipeline(), in order. The first entry is the only
# one for which the attention grid is rendered.
TEST_URLS = [
    "http://images.cocodataset.org/val2017/000000039769.jpg",  # cats on couch
    "http://images.cocodataset.org/val2017/000000000139.jpg",  # dining room
]


def _load_image(url: str, timeout: float = 30.0) -> Image.Image:
    """Download an image from ``url`` and return it as a PIL RGB image.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds ``requests`` may wait on the server (connect and
            between received bytes) before giving up. Without a timeout a
            stalled server would hang the whole pipeline.

    Returns:
        The downloaded image, converted to RGB mode.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    # Function-scope import keeps this helper self-contained.
    from io import BytesIO

    print(f"\n📥 Downloading test image: {url}")
    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of handing an error page to PIL.
    response.raise_for_status()
    # `response.content` is fully read and content-decoded, so the connection
    # is released and PIL receives the actual image bytes.
    return Image.open(BytesIO(response.content)).convert("RGB")


def _banner(step: int, title: str):
    """Print a blank line followed by a three-line progress banner.

    The banner is a 60-character ``=`` rule, a ``STEP <step> — <title>``
    line, and a second identical rule.

    Args:
        step: Step number shown in the banner.
        title: Short description of the step.
    """
    rule = "=" * 60
    heading = f"  STEP {step} — {title}"
    # The leading empty string yields the blank line before the first rule.
    print("", rule, heading, rule, sep="\n")


# ── Main pipeline ─────────────────────────────────────────────────────────────
def run_pipeline():
    """Run every pipeline step over each URL in ``TEST_URLS``.

    The BLIP model and the detector are loaded once. Each test image is then
    downloaded, encoded, decoded into tokens with per-token heatmaps, and
    graded. The attention grid is rendered for the first image only; the IoU
    results of all images are pooled into a single chart.

    Side effects:
        Prints progress banners and intermediate results to stdout, and
        hands ``OUT_GRID`` / ``OUT_CHART`` to the step functions as output
        paths. The closing summary lists an output path only if the
        corresponding step was actually invoked.
    """
    # ── STEP 1: Load model ───────────────────────────────────────────────────
    _banner(1, "Load BLIP Model")
    model, processor, device = load_model(use_finetuned=True)

    # ── Load OWL-ViT grader (do once, reuse for all images) ─────────────────
    detector = load_detector(device)

    # Aggregate IoU results across images for the final chart
    all_iou_results = []

    # Track which artefacts were requested so the summary does not point at
    # files that were never produced (e.g. empty TEST_URLS or no IoU results).
    grid_saved = False
    chart_saved = False

    # Use the position rather than comparing URL strings, so a URL repeated
    # later in TEST_URLS does not trigger a second grid render.
    for img_index, img_url in enumerate(TEST_URLS):
        raw_image = _load_image(img_url)

        # ── STEP 2: Encode image ─────────────────────────────────────────────
        _banner(2, "Encode Image through ViT")
        image_224, enc_hidden, enc_mask = encode_image(model, processor, device, raw_image)

        # ── STEP 3: Generate caption + Attention Flow heatmaps ───────────────
        _banner(3, "Greedy Decode with Attention Flow")
        tokens, heatmaps = generate_with_flow(
            model, processor, device, enc_hidden, enc_mask
        )

        # ── INSPECT intermediate results ─────────────────────────────────────
        print(f"\n  📝 Tokens   : {tokens}")
        print(f"  🗺  Heatmaps : {len(heatmaps)} maps, each shape {heatmaps[0].shape if heatmaps else 'N/A'}")
        print(f"     Peak values: {[f'{h.max():.3f}' for h in heatmaps[:5]]} …")

        # ── STEP 4: Visualize (only for the first image to save space) ───────
        if img_index == 0:
            _banner(4, "Build Attention Grid Visualization")
            save_attention_grid(image_224, tokens, heatmaps, out_path=OUT_GRID)
            grid_saved = True

        # ── STEP 5: Grade alignment ──────────────────────────────────────────
        _banner(5, "Grade Attention Alignment (IoU)")
        results = grade_alignment(raw_image, tokens, heatmaps, detector)
        all_iou_results.extend(results)

    # ── Save IoU chart (all images combined) ─────────────────────────────────
    if all_iou_results:
        print(f"\n📈 Saving IoU chart for {len(all_iou_results)} data points …")
        plot_iou_chart(all_iou_results, out_path=OUT_CHART)
        chart_saved = True

    grid_note = OUT_GRID if grid_saved else "(not generated)"
    chart_note = OUT_CHART if chart_saved else "(not generated)"

    print("\n" + "="*60)
    print("  ✅  PIPELINE COMPLETE")
    print(f"     Attention grid  → {grid_note}")
    print(f"     IoU chart       → {chart_note}")
    print("="*60 + "\n")


if __name__ == "__main__":
    run_pipeline()