griddev committed on
Commit
ce25d0a
·
verified ·
1 Parent(s): 7c69cda

Deploy Streamlit Space app

Browse files
Files changed (3) hide show
  1. app.py +235 -3
  2. models/attention_flow.py +328 -0
  3. requirements.txt +2 -0
app.py CHANGED
@@ -14,6 +14,7 @@ Features:
14
  import os
15
  import warnings
16
  import torch
 
17
  import streamlit as st
18
  from PIL import Image
19
  from models.blip_tuner import generate_with_mask
@@ -476,6 +477,42 @@ def load_toxicity_filter():
476
  return tok, mdl
477
 
478
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
479
  # ─────────────────────────────────────────────────────────────────────────────
480
  # Toxicity Check
481
  # ─────────────────────────────────────────────────────────────────────────────
@@ -744,8 +781,8 @@ def render_caption_card(model_name, caption, weight_src, num_beams, length_penal
744
  # Tabs
745
  # ─────────────────────────────────────────────────────────────────────────────
746
 
747
- tab_caption, tab_compare, tab_results = st.tabs([
748
- "🖼️ Caption", "🔀 Compare All Models", "📊 Experiment Results"
749
  ])
750
 
751
 
@@ -961,7 +998,202 @@ with tab_compare:
961
 
962
 
963
  # ═══════════════════════════════════════════════════════════════════════════
964
- # Tab 3 — Experiment Results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
965
  # ═══════════════════════════════════════════════════════════════════════════
966
 
967
  with tab_results:
 
14
  import os
15
  import warnings
16
  import torch
17
+ import numpy as np
18
  import streamlit as st
19
  from PIL import Image
20
  from models.blip_tuner import generate_with_mask
 
477
  return tok, mdl
478
 
479
 
480
@st.cache_resource(show_spinner=False)
def load_blip_attention_model(weight_source="base"):
    """Load a BLIP captioning model for attention analysis, cached per weight source.

    Args:
        weight_source: "base" for the pretrained checkpoint, or a fine-tuned
            tag (e.g. "best" / "latest") resolved against the project output dir.

    Returns:
        (processor, model, device) with the model in eval mode on the device.
    """
    # Lazy import so transformers is only loaded when this feature is used.
    from transformers import BlipForConditionalGeneration, BlipProcessor
    device = get_device()
    processor = BlipProcessor.from_pretrained(
        "Salesforce/blip-image-captioning-base", use_fast=True
    )
    model = BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-base"
    )

    if weight_source != "base":
        output_root, _, _ = _resolve_weight_paths(
            need_outputs=True, need_shakespeare=False
        )
        ckpt = _ckpt_path(output_root, "blip", weight_source)
        # Overlay fine-tuned weights only when the checkpoint dir exists and is
        # non-empty; strict=False tolerates missing/extra keys in the state dict.
        if os.path.isdir(ckpt) and os.listdir(ckpt):
            loaded = BlipForConditionalGeneration.from_pretrained(ckpt)
            model.load_state_dict(loaded.state_dict(), strict=False)
            del loaded  # release the temporary model copy immediately

    # NOTE(review): checkpointing and KV caching are disabled here, presumably
    # so the per-token attention/gradient hooks downstream see full attention
    # on every forward pass — confirm against models/attention_flow.py usage.
    try:
        model.gradient_checkpointing_disable()
    except Exception:
        pass
    model.config.use_cache = False
    model.to(device).eval()
    return processor, model, device
508
+
509
+
510
@st.cache_resource(show_spinner=False)
def load_alignment_detector():
    """Return a cached OWL-ViT zero-shot detector used for IoU grounding checks."""
    # Lazy import keeps the heavy detection stack off the critical startup path.
    from models.attention_flow import load_owlvit_detector
    return load_owlvit_detector(get_device())
514
+
515
+
516
  # ─────────────────────────────────────────────────────────────────────────────
517
  # Toxicity Check
518
  # ─────────────────────────────────────────────────────────────────────────────
 
781
  # Tabs
782
  # ─────────────────────────────────────────────────────────────────────────────
783
 
784
# Four top-level views: single-image captioning, model comparison,
# attention analysis, and experiment results.
tab_caption, tab_compare, tab_attention, tab_results = st.tabs([
    "🖼️ Caption", "🔀 Compare All Models", "🧠 Attention Explorer", "📊 Experiment Results"
])
787
 
788
 
 
998
 
999
 
1000
  # ═══════════════════════════════════════════════════════════════════════════
1001
+ # Tab 3 — Attention Explorer (Task 2)
1002
+ # ═══════════════════════════════════════════════════════════════════════════
1003
+
1004
with tab_attention:
    # ── Header ───────────────────────────────────────────────────────────
    st.markdown("### 🧠 BLIP Attention Explorer")
    st.caption(
        "Step-by-step cross-attention analysis with rollout across decoder layers, "
        "2x5 heatmap grid, IoU grounding score, and caption-length summary."
    )

    # Left column: image upload/preview. Right column: analysis controls.
    attn_col_left, attn_col_right = st.columns([1, 1], gap="large")
    with attn_col_left:
        attn_file = st.file_uploader(
            "Upload an image for attention analysis",
            type=["jpg", "jpeg", "png", "webp"],
            key="attention_uploader",
        )
        if attn_file:
            attn_image = Image.open(attn_file).convert("RGB")
            st.image(attn_image, caption="Attention Input Image", use_column_width=True)

    with attn_col_right:
        # Offer fine-tuned weight choices only when those checkpoints exist.
        _ensure_model_outputs_available("blip")
        attn_weight_options = {"Base (Pretrained)": "base"}
        if _has_finetuned("blip", "best"):
            attn_weight_options["Fine-tuned (Best)"] = "best"
        if _has_finetuned("blip", "latest"):
            attn_weight_options["Fine-tuned (Latest)"] = "latest"
        attn_weight_choice = st.selectbox(
            "BLIP Weight Source",
            list(attn_weight_options.keys()),
            index=0,
            key="attn_weight_choice",
        )
        attn_weight_source = attn_weight_options[attn_weight_choice]

        # Trace either the model's own greedy caption or a user-supplied phrase.
        token_mode = st.radio(
            "Token Source",
            ["Generated Caption", "Custom Text Prompt"],
            horizontal=True,
            key="attn_token_mode",
        )
        custom_text = ""
        if token_mode == "Custom Text Prompt":
            custom_text = st.text_input(
                "Enter custom text/words for heatmap tracing",
                value="a dog playing with a ball",
                key="attn_custom_text",
            )

        max_attn_steps = st.slider(
            "Caption Steps to Analyze",
            min_value=3,
            max_value=12,
            value=9,
            key="attn_steps",
        )
        run_iou = st.toggle(
            "Compute IoU Alignment with OWL-ViT (slower)",
            value=True,
            key="attn_iou_toggle",
        )

    # Button stays disabled until an image is uploaded (and, in custom mode,
    # the text prompt is non-empty).
    run_attention_btn = st.button(
        "Run Step-by-Step Attention Analysis",
        disabled=(attn_file is None or (token_mode == "Custom Text Prompt" and not custom_text.strip())),
        key="attn_run_btn",
    )

    if run_attention_btn and attn_file:
        # Imported lazily so the attention stack only loads on demand.
        from models.attention_flow import (
            build_attention_grid_figure,
            decode_custom_text_with_flow,
            decode_generated_caption_with_flow,
            encode_image_for_flow,
            grade_alignment_with_detector,
            summarize_caption_alignment,
        )

        attn_image = Image.open(attn_file).convert("RGB")
        iou_results = []

        with st.status("Running attention pipeline...", expanded=True) as status:
            st.write("Step 1/5: Loading BLIP model and selected weights")
            attn_processor, attn_model, attn_device = load_blip_attention_model(attn_weight_source)

            st.write("Step 2/5: Encoding image through ViT")
            image_224, enc_hidden, enc_mask = encode_image_for_flow(
                attn_model, attn_processor, attn_device, attn_image
            )

            st.write("Step 3/5: Extracting rollout heatmaps token-by-token")
            if token_mode == "Custom Text Prompt":
                tokens, heatmaps = decode_custom_text_with_flow(
                    attn_model,
                    attn_processor,
                    attn_device,
                    enc_hidden,
                    enc_mask,
                    custom_text,
                    max_tokens=max_attn_steps,
                )
            else:
                tokens, heatmaps = decode_generated_caption_with_flow(
                    attn_model,
                    attn_processor,
                    attn_device,
                    enc_hidden,
                    enc_mask,
                    max_tokens=max_attn_steps,
                )

            st.write("Step 4/5: Building 2x5 attention grid")
            fig_grid = build_attention_grid_figure(image_224, tokens, heatmaps, n_rows=2, n_cols=5)

            if run_iou:
                st.write("Step 5/5: Computing IoU alignment using OWL-ViT detections")
                detector = load_alignment_detector()
                iou_results = grade_alignment_with_detector(attn_image, tokens, heatmaps, detector)
            else:
                st.write("Step 5/5: IoU grading skipped by user")

            status.update(label="Attention pipeline complete", state="complete", expanded=False)

        # ── Results: heatmap grid and decoded tokens ─────────────────────
        st.pyplot(fig_grid, use_container_width=True)
        caption_tokens = " ".join(tokens) if tokens else "[No tokens generated]"
        st.markdown(f"**Decoded tokens:** `{caption_tokens}`")

        summary = summarize_caption_alignment(iou_results, len(tokens))
        st.markdown(
            f"**Caption length:** `{summary['caption_length']}` | "
            f"**Mean alignment IoU:** `{summary['mean_alignment_iou']:.4f}`"
        )

        if run_iou:
            st.markdown("#### Word-level Alignment (IoU)")
            if iou_results:
                table_rows = [
                    {
                        "word": item["word"],
                        "position": item["position"],
                        "iou": round(item["iou"], 4),
                        "det_score": round(item["det_score"], 4),
                        "box": [int(x) for x in item["box"]],
                    }
                    for item in iou_results
                ]
                st.dataframe(table_rows, use_container_width=True)

                # Heuristic thresholds for what counts as grounded.
                strong = [item["word"] for item in iou_results if item["iou"] >= 0.30]
                weak = [item["word"] for item in iou_results if item["iou"] < 0.10]
                if strong:
                    st.success("Strongly grounded words: " + ", ".join(strong))
                if weak:
                    st.warning("Weakly grounded words: " + ", ".join(weak))
            else:
                st.info("No detectable object-word matches found for IoU grading on this run.")

        # Accumulate one summary row per run; session-scoped only (not persisted).
        if "alignment_history" not in st.session_state:
            st.session_state["alignment_history"] = []
        st.session_state["alignment_history"].append(
            {
                "caption_length": int(summary["caption_length"]),
                "mean_alignment_iou": float(summary["mean_alignment_iou"]),
                "mode": token_mode,
                "weights": attn_weight_source,
            }
        )

        st.markdown("#### Caption Length -> Mean Alignment IoU")
        history = st.session_state["alignment_history"]
        if history:
            # Plotting is best-effort: any matplotlib failure must not break the tab.
            try:
                import matplotlib.pyplot as plt

                x_vals = [item["caption_length"] for item in history]
                y_vals = [item["mean_alignment_iou"] for item in history]
                fig_summary, ax_summary = plt.subplots(figsize=(6, 3.2))
                ax_summary.scatter(x_vals, y_vals, color="#58a6ff", alpha=0.85)
                if len(x_vals) > 1:
                    # Least-squares linear trend over the run history.
                    z = np.polyfit(x_vals, y_vals, 1)
                    trend = np.poly1d(z)
                    xs = sorted(x_vals)
                    ax_summary.plot(xs, [trend(v) for v in xs], linestyle="--", color="#ff7b72")
                ax_summary.set_xlabel("Caption length")
                ax_summary.set_ylabel("Mean IoU")
                ax_summary.set_title("Alignment Trend")
                ax_summary.grid(alpha=0.35, linestyle="--")
                st.pyplot(fig_summary, use_container_width=True)
            except Exception:
                pass
            st.dataframe(history[-20:], use_container_width=True)
1193
+
1194
+
1195
+ # ═══════════════════════════════════════════════════════════════════════════
1196
+ # Tab 4 — Experiment Results
1197
  # ═══════════════════════════════════════════════════════════════════════════
1198
 
1199
  with tab_results:
models/attention_flow.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import List, Tuple
3
+
4
+ import cv2
5
+ import matplotlib
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from PIL import Image
10
+
11
matplotlib.use("Agg")  # headless backend: figures are rendered off-screen for the web app
import matplotlib.pyplot as plt


# Words skipped during IoU grading: function words, punctuation, and the
# WordPiece continuation marker — none are groundable as detectable objects.
STOP_WORDS = {
    "a", "an", "the", "and", "or", "but", "is", "are", "was", "were",
    "in", "on", "at", "to", "for", "with", "by", "it", "this", "that",
    "there", "here", "of", "up", "out", ".", ",", "!", "##",
}
20
+
21
+
22
class FlowExtractor:
    """Capture cross-attention probabilities and their gradients from each
    decoder layer of a BLIP text decoder via forward hooks.

    Each entry of ``self.layers`` is a dict holding the most recent forward
    attention tensor (``"fwd"``) and its backward gradient (``"grad"``) for
    one ``crossattention`` sub-module.
    """

    def __init__(self, model):
        # Assumes a BlipForConditionalGeneration-style model exposing
        # model.text_decoder.bert.encoder.layer — TODO confirm for variants.
        self.model = model
        self._hooks = []   # forward-hook handles, detached in remove()
        self.layers = []   # one {"fwd": ..., "grad": ...} holder per cross-attn layer

        for layer in model.text_decoder.bert.encoder.layer:
            if hasattr(layer, "crossattention"):
                holder = {"fwd": None, "grad": None}
                self.layers.append(holder)

                def _make_hook(h):
                    # Factory binds the holder so each layer writes its own slot
                    # (avoids the late-binding-closure pitfall in the loop).
                    def _fwd(module, inputs, outputs):
                        # outputs[1] holds the attention probabilities when the
                        # module is run with output_attentions=True.
                        if len(outputs) > 1 and outputs[1] is not None:
                            h["fwd"] = outputs[1]
                            if h["fwd"].requires_grad:
                                # Tensor hook stores the incoming gradient during backward.
                                h["fwd"].register_hook(
                                    lambda g, _h=h: _h.update({"grad": g.detach()})
                                )
                    return _fwd

                target = layer.crossattention.self
                self._hooks.append(target.register_forward_hook(_make_hook(holder)))

    def clear(self):
        """Drop captured tensors before the next forward/backward pass."""
        for holder in self.layers:
            holder["fwd"] = None
            holder["grad"] = None

    def remove(self):
        """Detach all forward hooks from the model."""
        for hook in self._hooks:
            hook.remove()
        self._hooks = []
55
+
56
+
57
def encode_image_for_flow(model, processor, device, image_pil: Image.Image):
    """Resize the image to 224x224 and run it through the BLIP vision tower.

    Returns:
        A tuple ``(resized_image, encoder_hidden, encoder_mask)``: the resized
        PIL image, the frozen vision encoder hidden states, and an all-ones
        attention mask matching those states.
    """
    resized = image_pil.resize((224, 224), Image.LANCZOS)
    batch = processor(images=resized, return_tensors="pt").to(device)
    # The vision tower is frozen for this analysis: no gradients flow into it.
    with torch.no_grad():
        features = model.vision_model(pixel_values=batch["pixel_values"])
    hidden = features[0].detach().requires_grad_(False)
    mask = torch.ones(hidden.size()[:-1], dtype=torch.long, device=device)
    return resized, hidden, mask
65
+
66
+
67
+ def _single_layer_gradcam(holder, token_idx: int = -1) -> torch.Tensor:
68
+ attn = holder["fwd"][:, :, token_idx, :]
69
+ grad = holder["grad"][:, :, token_idx, :]
70
+ cam = (attn * grad).mean(dim=1).squeeze()
71
+ return torch.clamp(cam, min=0.0)
72
+
73
+
74
+ def _normalize1d(tensor: torch.Tensor) -> torch.Tensor:
75
+ denom = tensor.sum()
76
+ if denom > 0:
77
+ return tensor / denom
78
+ return tensor
79
+
80
+
81
def compute_attention_flow(
    extractor: FlowExtractor,
    num_image_tokens: int = 197,
    residual_weight: float = 0.05,
    out_resolution: int = 224,
) -> np.ndarray:
    """Fuse per-layer Grad-CAM vectors into one spatial heatmap via rollout.

    Args:
        extractor: holds the captured attention/gradient pairs per layer.
        num_image_tokens: length of the image-token axis (197 = 14x14 ViT
            patches + CLS — presumably matches the BLIP-base vision tower;
            TODO confirm for other backbones).
        residual_weight: weight of the uniform term mixed in at each rollout
            step to keep mass from collapsing to zero.
        out_resolution: side length of the square output heatmap.

    Returns:
        float32 array of shape (out_resolution, out_resolution), min-max
        normalized to [0, 1]; all zeros when no layer captured gradients.
    """
    # Only layers where both the forward pass and backward pass fired are usable.
    valid_cams = []
    for holder in extractor.layers:
        if holder["fwd"] is None or holder["grad"] is None:
            continue
        valid_cams.append(_single_layer_gradcam(holder).detach())

    if not valid_cams:
        return np.zeros((out_resolution, out_resolution), dtype=np.float32)

    # Rollout: multiply normalized per-layer maps, mixing in a uniform
    # distribution each step as a residual/smoothing term.
    uniform = torch.ones(num_image_tokens, device=valid_cams[0].device) / num_image_tokens
    rollout = _normalize1d(valid_cams[0])
    for cam in valid_cams[1:]:
        rollout = _normalize1d(rollout) * _normalize1d(cam) + residual_weight * uniform
        rollout = torch.clamp(rollout, min=0.0)

    # Drop the CLS token, reshape the patch axis into a square grid, and
    # upsample to image resolution.
    spatial = rollout[1:]
    grid_size = int(math.sqrt(spatial.numel()))
    hm_tensor = spatial.detach().cpu().reshape(1, 1, grid_size, grid_size).float()
    hm_up = F.interpolate(
        hm_tensor,
        size=(out_resolution, out_resolution),
        mode="bicubic",
        align_corners=False,
    ).squeeze()
    # Min-max normalize to [0, 1]; degenerate (constant) maps become all zeros.
    hm_np = hm_up.numpy()
    lo, hi = hm_np.min(), hm_np.max()
    if hi > lo:
        hm_np = (hm_np - lo) / (hi - lo)
    else:
        hm_np = np.zeros_like(hm_np)
    return hm_np.astype(np.float32)
118
+
119
+
120
def decode_generated_caption_with_flow(
    model,
    processor,
    device,
    encoder_hidden,
    encoder_mask,
    max_tokens: int = 20,
) -> Tuple[List[str], List[np.ndarray]]:
    """Greedy-decode a caption token by token, capturing a rollout heatmap per step.

    Each step runs a full forward over the prefix with attentions enabled,
    backpropagates the chosen token's logit to populate the gradient hooks,
    then snapshots the attention-flow heatmap.

    Returns:
        (tokens, heatmaps): decoded token strings and one heatmap per token,
        in generation order. Stops early at the SEP (end) token.
    """
    extractor = FlowExtractor(model)
    # Seed the decoder with the BOS token.
    input_ids = torch.LongTensor([[model.config.text_config.bos_token_id]]).to(device)
    tokens, heatmaps = [], []

    for _ in range(max_tokens):
        # Reset gradients and captured tensors before each step.
        model.zero_grad()
        extractor.clear()
        outputs = model.text_decoder(
            input_ids=input_ids,
            encoder_hidden_states=encoder_hidden,
            encoder_attention_mask=encoder_mask,
            output_attentions=True,
            return_dict=True,
        )
        # Greedy choice over the last position's vocabulary distribution.
        logits = outputs.logits[:, -1, :]
        next_token = torch.argmax(logits, dim=-1)
        if next_token.item() == model.config.text_config.sep_token_id:
            break

        # Backward on the selected logit fills the gradient holders in the hooks.
        logits[0, next_token.item()].backward(retain_graph=False)
        heatmaps.append(compute_attention_flow(extractor))
        tokens.append(processor.tokenizer.decode([next_token.item()]).strip())
        input_ids = torch.cat([input_ids, next_token.reshape(1, 1)], dim=-1)

    extractor.remove()
    return tokens, heatmaps
154
+
155
+
156
def decode_custom_text_with_flow(
    model,
    processor,
    device,
    encoder_hidden,
    encoder_mask,
    text: str,
    max_tokens: int = 20,
) -> Tuple[List[str], List[np.ndarray]]:
    """Teacher-force *text* through the decoder, capturing a heatmap per token.

    Unlike the generated-caption variant, the next token is not argmax-chosen:
    each token of the user's text is forced in turn and its logit is
    backpropagated, so the heatmaps show where the model "looks" for that word.

    Returns:
        (tokens, heatmaps) aligned lists, truncated to ``max_tokens`` tokens.
    """
    extractor = FlowExtractor(model)
    # Tokenize without special tokens; cap the number of traced tokens.
    token_ids = processor.tokenizer(
        text,
        add_special_tokens=False,
        return_attention_mask=False,
    )["input_ids"][:max_tokens]

    input_ids = torch.LongTensor([[model.config.text_config.bos_token_id]]).to(device)
    tokens, heatmaps = [], []

    for target_token_id in token_ids:
        # Reset gradients and captured tensors before each forced step.
        model.zero_grad()
        extractor.clear()
        outputs = model.text_decoder(
            input_ids=input_ids,
            encoder_hidden_states=encoder_hidden,
            encoder_attention_mask=encoder_mask,
            output_attentions=True,
            return_dict=True,
        )
        # Backward on the forced token's logit populates the gradient hooks.
        logits = outputs.logits[:, -1, :]
        score = logits[0, target_token_id]
        score.backward(retain_graph=False)

        heatmaps.append(compute_attention_flow(extractor))
        tokens.append(processor.tokenizer.decode([target_token_id]).strip())
        next_tensor = torch.LongTensor([[target_token_id]]).to(device)
        input_ids = torch.cat([input_ids, next_tensor], dim=-1)

    extractor.remove()
    return tokens, heatmaps
196
+
197
+
198
def overlay_heatmap_on_image(
    image_pil: Image.Image,
    heatmap_np: np.ndarray,
    alpha: float = 0.5,
    hot_threshold: float = 0.1,
) -> Image.Image:
    """Blend an INFERNO-colored heatmap onto the image.

    Only pixels whose heatmap value exceeds ``hot_threshold`` are tinted;
    the rest of the image is left untouched.
    """
    height, width = heatmap_np.shape
    base = np.array(image_pil.resize((width, height), Image.LANCZOS))
    # OpenCV colormaps produce BGR; convert to RGB for PIL.
    heat_rgb = cv2.cvtColor(
        cv2.applyColorMap(np.uint8(255.0 * heatmap_np), cv2.COLORMAP_INFERNO),
        cv2.COLOR_BGR2RGB,
    )
    hot = (heatmap_np > hot_threshold).astype(np.float32)[..., None]
    composite = base * (1 - hot * alpha) + heat_rgb * (hot * alpha)
    return Image.fromarray(composite.astype(np.uint8))
212
+
213
+
214
def build_attention_grid_figure(
    image_pil: Image.Image,
    tokens: List[str],
    heatmaps: List[np.ndarray],
    n_rows: int = 2,
    n_cols: int = 5,
):
    """Compose a matplotlib grid: the original image in panel 0, followed by
    one heatmap overlay per decoded token.

    Args:
        image_pil: image to display and to blend the heatmaps onto.
        tokens, heatmaps: aligned per-token results; only the first
            ``n_rows * n_cols - 1`` entries fit in the grid.
        n_rows, n_cols: grid shape (default 2x5).

    Returns:
        The matplotlib Figure (caller is responsible for closing it).
    """
    n_panels = n_rows * n_cols
    n_words = min(n_panels - 1, len(tokens))
    # squeeze=False guarantees axes is a 2-D ndarray even for a 1x1 grid,
    # where plt.subplots would otherwise return a bare Axes and .flatten()
    # would raise AttributeError.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(n_cols * 3.2, n_rows * 3.2), squeeze=False
    )
    axes = axes.ravel()

    axes[0].imshow(image_pil)
    axes[0].set_title("Original", fontsize=11, fontweight="bold")
    axes[0].axis("off")

    for index in range(n_words):
        overlay = overlay_heatmap_on_image(image_pil, heatmaps[index])
        axes[index + 1].imshow(overlay)
        axes[index + 1].set_title(f"'{tokens[index]}'", fontsize=10, fontweight="bold")
        axes[index + 1].axis("off")

    # Hide any unused trailing panels.
    for index in range(n_words + 1, n_panels):
        axes[index].axis("off")

    caption_preview = " ".join(tokens[:12])
    fig.suptitle(
        f"Cross-Attention Flow (2x5)\nCaption Tokens: {caption_preview}",
        fontsize=12,
        fontweight="bold",
        y=1.02,
    )
    plt.tight_layout()
    return fig
248
+
249
+
250
def load_owlvit_detector(device):
    """Create a zero-shot object-detection pipeline backed by OWL-ViT."""
    from transformers import pipeline

    # Hugging Face pipelines take a CUDA ordinal, or -1 for CPU.
    target = 0 if str(device).startswith("cuda") else -1
    return pipeline(
        model="google/owlvit-base-patch32",
        task="zero-shot-object-detection",
        device=target,
    )
258
+
259
+
260
def binarize_heatmap(heatmap_np: np.ndarray, target_hw: tuple) -> np.ndarray:
    """Resize a [0, 1] heatmap to ``(H, W) = target_hw`` and Otsu-threshold it
    into a boolean foreground mask."""
    height, width = target_hw[0], target_hw[1]
    scaled = cv2.resize(heatmap_np, (width, height))
    as_bytes = np.uint8(255.0 * scaled)
    # Otsu picks the threshold automatically; the 0 here is ignored.
    _, thresholded = cv2.threshold(as_bytes, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return thresholded > 0
265
+
266
+
267
def calculate_iou(mask: np.ndarray, box: list, img_shape: tuple) -> float:
    """Intersection-over-union between a boolean mask and an axis-aligned box.

    The box is clipped to the ``img_shape = (H, W)`` canvas before rasterizing.
    Returns 0.0 when both regions are empty.
    """
    height, width = img_shape[0], img_shape[1]
    x0, y0, x1, y1 = (int(v) for v in box)
    # Clip the box into the image bounds.
    x0, y0 = max(x0, 0), max(y0, 0)
    x1, y1 = min(x1, width), min(y1, height)
    box_mask = np.zeros(img_shape, dtype=bool)
    box_mask[y0:y1, x0:x1] = True
    intersection = np.logical_and(mask, box_mask).sum()
    union = np.logical_or(mask, box_mask).sum()
    if union == 0:
        return 0.0
    return float(intersection) / union
278
+
279
+
280
def grade_alignment_with_detector(
    image_pil: Image.Image,
    tokens: List[str],
    heatmaps: List[np.ndarray],
    detector,
    min_detection_score: float = 0.05,
) -> List[dict]:
    """Score how well each content word's heatmap overlaps its detection box.

    Args:
        image_pil: original input image (detector runs at full resolution).
        tokens: decoded token strings, aligned with ``heatmaps``.
        heatmaps: per-token rollout heatmaps in [0, 1].
        detector: zero-shot detection pipeline; called as
            ``detector(image, candidate_labels=[word])`` and expected to yield
            dicts with ``"score"`` and ``"box"`` (xmin/ymin/xmax/ymax) —
            the Hugging Face zero-shot-object-detection output schema.
        min_detection_score: detections below this confidence are ignored.

    Returns:
        One dict per gradable word: word, 1-based position, IoU, detection
        score, and the winning box. Words that are short, stop words,
        non-alphabetic, or undetected are skipped entirely.
    """
    results = []
    img_shape = (image_pil.height, image_pil.width)
    for idx, (word, hm) in enumerate(zip(tokens, heatmaps)):
        # Strip WordPiece continuation markers; skip ungroundable tokens.
        clean_word = word.replace("##", "").lower()
        if len(clean_word) < 3 or clean_word in STOP_WORDS or not clean_word.isalpha():
            continue

        detections = detector(image_pil, candidate_labels=[clean_word])
        # Keep the highest-confidence box above the score floor.
        best_box, best_score = None, 0.0
        for detection in detections:
            if detection["score"] > best_score and detection["score"] >= min_detection_score:
                best_score = detection["score"]
                best_box = [
                    detection["box"]["xmin"],
                    detection["box"]["ymin"],
                    detection["box"]["xmax"],
                    detection["box"]["ymax"],
                ]
        if best_box is None:
            continue

        # Binarize the heatmap at image resolution, then IoU against the box.
        mask = binarize_heatmap(hm, img_shape)
        iou = calculate_iou(mask, best_box, img_shape)
        results.append(
            {
                "word": clean_word,
                "position": idx + 1,
                "iou": float(iou),
                "det_score": float(best_score),
                "box": best_box,
            }
        )

    return results
321
+
322
+
323
def summarize_caption_alignment(results: List[dict], caption_length: int) -> dict:
    """Collapse per-word IoU records into a single summary row.

    An empty ``results`` list yields a mean IoU of 0.0.
    """
    mean_iou = 0.0
    if results:
        mean_iou = float(np.mean([entry["iou"] for entry in results]))
    return {"caption_length": caption_length, "mean_alignment_iou": mean_iou}
328
+
requirements.txt CHANGED
@@ -12,3 +12,5 @@ tqdm
12
  accelerate
13
  sentencepiece
14
  pycocoevalcap
 
 
 
12
  accelerate
13
  sentencepiece
14
  pycocoevalcap
15
+ matplotlib
16
+ opencv-python-headless