Update app.py

app.py CHANGED
@@ -1,407 +1,429 @@
-#
-#
-#
-#
-#
-#
-#
-#
-#
-# - CPU friendly, uses only Gradio v5-safe features
 # ==========================================================

 import math
 import warnings
-from typing import Dict,

 import gradio as gr
-import torch
 import numpy as np
-
-from
 from sklearn.decomposition import PCA
-import

 warnings.filterwarnings("ignore")

-DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 MODEL_NAME = "google/vit-base-patch16-224"

-
-
-
-
-# ---------------------- MODEL LOADING ----------------------
-
-
-def load_vit():
-    """Load ViT + image processor once into global cache."""
-    global VIT_MODEL, VIT_PROCESSOR
-    if VIT_MODEL is not None and VIT_PROCESSOR is not None:
-        return VIT_MODEL, VIT_PROCESSOR

-    processor = AutoImageProcessor.from_pretrained(MODEL_NAME)
-    model = ViTForImageClassification.from_pretrained(MODEL_NAME)
...
-    model.eval()
...
-    """
-    img = pil_img.convert("RGB").resize((224, 224))
     draw = ImageDraw.Draw(img)
     w, h = img.size
     for x in range(0, w, patch_size):
-        draw.line((x, 0, x, h), fill=(0,
     for y in range(0, h, patch_size):
-        draw.line((0, y, w, y), fill=(0,
     return img


-def make_attention_overlay(
-    base_img: Image.Image, heatmap_grid: np.ndarray
-) -> Image.Image:
     """
...
     """
-    g =
-    if
-        g =
     else:
-        g
...
-    heat_img = Image.fromarray((g * 255).astype("uint8"), mode="L")
-    heat_img = heat_img.resize((224, 224), Image.BILINEAR)
-    heat = np.array(heat_img).astype(np.float32) / 255.0  # 0..1
-
-    # simple blue->red colormap overlay
     r = heat
-
     b = 1.0 - heat
-    cam = np.stack([r,

-    base_np = np.array(
-
-    blended = (1 - alpha) * base_np + alpha * cam
     blended = np.clip(blended * 255.0, 0, 255).astype("uint8")
     return Image.fromarray(blended)


...
     """
...
     """
...
     return fig


-#
-def analyze_vit(img: Optional[Image.Image], simple: bool):
-    """
-    Main function called by gradio button.
-    Returns:
-    - patch_grid_image
-    - attention_overlay (default: last layer, head 0)
-    - PCA figure
-    - predictions table
-    - explanation markdown
-    - state dict (for attention slider updates)
-    """
     if img is None:
         return (
-            None,
-            None,
-            None,
-            [],
-            "⬆️ Please upload an image (e.g., a dog, a car, a object).",
-            {},
         )

-    #
     img_resized = img.convert("RGB").resize((224, 224))
-    inputs = processor(images=img_resized, return_tensors="pt")
-    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
     with torch.no_grad():
-        outputs =

-    #
...
-    preds_table = [
-        [id2label[int(i)], float(probs[int(i)])] for i in topk_idx
-    ]

-    # 3) Patch embeddings from last hidden state
-    # hidden_states[-1]: (batch, seq_len, hidden)
-    hidden_last = outputs.hidden_states[-1][0].cpu().numpy()  # (seq, hidden)
-    # seq layout: [CLS] + patches
-    patch_emb = hidden_last[1:, :]  # (N_patches, hidden)
-    pca_fig = make_pca_plot(patch_emb)

-    # 4) Attention -> CLS to patches grid per layer/head
-    attentions = outputs.attentions  # list of (batch, heads, seq, seq)
-    num_layers = len(attentions)
-    num_heads = attentions[0].shape[1] if num_layers > 0 else 0

-    # ViT-base: 14x14 = 196 patches
-    seq_len = attentions[0].shape[-1]  # 1 + N_patches
     n_patches = seq_len - 1
     grid_size = int(math.sqrt(n_patches))
     if grid_size * grid_size != n_patches:
-        # fallback:
         grid_size = int(round(math.sqrt(n_patches)))

...
-    )
-    for l, att in enumerate(attentions):
-        a = att[0].cpu().numpy()  # (heads, seq, seq)
-        # CLS token index = 0, patches = 1..N
-        cls_vec = a[:, 0, 1:]  # (heads, N_patches)
-        # if shapes mismatch, pad/truncate
-        if cls_vec.shape[1] != grid_size * grid_size:
-            tmp = np.zeros((num_heads, grid_size * grid_size), dtype=np.float32)
-            n_min = min(tmp.shape[1], cls_vec.shape[1])
-            tmp[:, :n_min] = cls_vec[:, :n_min]
-            cls_vec = tmp
-        cls_grid = cls_vec.reshape(num_heads, grid_size, grid_size)
-        cls_to_patch[l] = cls_grid

-    # default attention overlay: last layer, head 0
-    default_layer = num_layers - 1
     default_head = 0
...
-    simple: bool, num_layers: int, num_heads: int, grid_size: int
-) -> str:
     if simple:
-### 🧒 How
...
-   look at other patches using **self-attention** (with {num_heads} attention heads).
-5. **Understand the whole image** – After many layers, ViT builds a global understanding of the scene
-   and predicts what’s in the picture (top-5 shown on the right).

-The heatmap shows **where the special [CLS] token is looking** in the last layer.
 """
     else:
-### 🔬
...
-- The ViT encoder has **{num_layers} transformer layers** with **{num_heads} attention heads** each.
-  In every layer, **self-attention** mixes information across all patches, enabling long-range dependencies
-  and global context.

-- The [CLS] token aggregates information across patches and is passed through a classification head to produce
-  logits over ImageNet-1k classes (we show the top-5).

-- The attention heatmap we display is:
-  - From **[CLS] → patch tokens**
-  - For a selected `(layer, head)`
-  - Reshaped into a `{grid_size}×{grid_size}` grid and upsampled to image resolution for overlay.

-- The PCA plot shows the **final-layer patch embeddings** projected to 2D, giving an intuition of how
-  ViT places patches in a semantic space.

-Use the sliders to explore different layers and heads and see how the attention focus changes.
 """

...
-):
     """
...
     """
-    if not state
         return None

-    cls_to_patch = state["cls_to_patch"]
     base_img = state["base_image"]
...
-    l = max(0, min(int(layer_idx),
-    h = max(0, min(int(head_idx),
...
     return overlay


...
-- Cuts it into patches (tokens)
-- Attends to different regions via self-attention
-- Embeds patches into a high-dimensional space
-- Predicts what’s in the image
...
-""

     with gr.Row():
         with gr.Column(scale=1):
-            img_in = gr.Image(
...
-            )
-            simple_ck = gr.Checkbox(
-                label="Simple explanation (for everyone)",
-                value=True,
-            )
-            run_btn = gr.Button("Run ViT Analysis", variant="primary")

-            gr.Markdown(
-                "Try images like: animals, objects, scenes. This uses `google/vit-base-patch16-224` (ImageNet-1k)."
-            )

-        with gr.Column(scale=1):
-            preds_df = gr.Dataframe(
-                headers=["Label", "Probability"],
-                datatype=["str", "number"],
-                interactive=False,
-                label="Top-5 predictions",
-            )
-            explanation_md = gr.Markdown(label="Explanation")

...
-            )
-            attn_img = gr.Image(
-                label="Attention heatmap (CLS → patches)",
-                interactive=False,
-            )

...
-            head_slider = gr.Slider(
-                minimum=0,
-                maximum=11,  # 12 attention heads
-                step=1,
-                value=0,
-                label="Head index",
-            )

-            gr.Markdown("## 🌌 Patch embeddings in 2D (PCA)")

-            pca_plot = gr.Plot(label="Patches in embedding space (last layer)")

     state = gr.State()

-    # main
     run_btn.click(
-        fn=
-        inputs=[img_in,
-        outputs=[
     )

-    #
     layer_slider.change(
-        fn=
-        inputs=[state, layer_slider, head_slider],
-        outputs=[
     )
     head_slider.change(
-        fn=
-        inputs=[state, layer_slider, head_slider],
-        outputs=[
     )

 demo.launch()
+# ViT Visualizer — Full Interpretability Suite (A + B + C)
+# Model: google/vit-base-patch16-224
+# Gradio 5 compatible, CPU-friendly
+# Features:
+#   - Patch grid (16x16)
+#   - Patch attention (per layer / per head / query token)
+#   - Attention rollout (layer aggregated)
+#   - PCA of patch embeddings across selected layers
+#   - Top-5 predictions & simple/technical explanations
 # ==========================================================

 import math
 import warnings
+from typing import Any, Dict, List, Optional, Tuple

 import gradio as gr
 import numpy as np
+import torch
+from PIL import Image, ImageDraw, ImageFont
+from transformers import AutoImageProcessor, ViTModel, ViTForImageClassification
 from sklearn.decomposition import PCA
+import plotly.express as px
+import plotly.graph_objects as go

 warnings.filterwarnings("ignore")

 MODEL_NAME = "google/vit-base-patch16-224"
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

+# global caches
+VIT_BASE = None    # ViTModel (encoder with hidden states & attentions)
+VIT_CLF = None     # ViTForImageClassification (classification head)
+PROCESSOR = None

+# ------------------ model loader with SDPA fix ------------------
+def load_models():
+    global VIT_BASE, VIT_CLF, PROCESSOR
+    if VIT_BASE is not None and VIT_CLF is not None and PROCESSOR is not None:
+        return VIT_BASE, VIT_CLF, PROCESSOR

+    PROCESSOR = AutoImageProcessor.from_pretrained(MODEL_NAME)

+    # base ViT (encoder) - we need hidden_states & attentions.
+    # Request the eager attention backend at load time: assigning
+    # config.attn_implementation after from_pretrained() does not rebuild
+    # the already-constructed SDPA attention modules.
+    base = ViTModel.from_pretrained(
+        MODEL_NAME,
+        output_hidden_states=True,
+        output_attentions=True,
+        attn_implementation="eager",
+    )
+    base.to(DEVICE)
+    base.eval()

+    # classifier head for top-k labels
+    clf = ViTForImageClassification.from_pretrained(MODEL_NAME)
+    clf.to(DEVICE)
+    clf.eval()

+    VIT_BASE = base
+    VIT_CLF = clf
+    return base, clf, PROCESSOR
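
A quick smoke test for the loader, as an editorial sketch rather than part of the commit; the shapes assume ViT-base (12 layers, 12 heads, 197 tokens at 224×224):

base, clf, processor = load_models()
probe = Image.new("RGB", (224, 224), "gray")
out = base(**processor(images=probe, return_tensors="pt").to(DEVICE))
assert len(out.attentions) == 12
assert tuple(out.attentions[0].shape) == (1, 12, 197, 197)  # (batch, heads, seq, seq)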

+# ------------------ helpers: patch grid & overlay ------------------
+def make_patch_grid_image(pil: Image.Image, patch_size: int = 16, target_size: int = 224) -> Image.Image:
+    img = pil.convert("RGB").resize((target_size, target_size))
     draw = ImageDraw.Draw(img)
     w, h = img.size
     for x in range(0, w, patch_size):
+        draw.line((x, 0, x, h), fill=(0, 200, 0), width=1)
     for y in range(0, h, patch_size):
+        draw.line((0, y, w, y), fill=(0, 200, 0), width=1)
     return img
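
For scale: with 224×224 inputs and 16-pixel patches the grid is 224/16 = 14 cells per side, i.e. 14×14 = 196 patches, which is why ViT-base sequences have 197 tokens (196 patches plus CLS). A tiny check, outside the commit:

grid_img = make_patch_grid_image(Image.new("RGB", (640, 480)), patch_size=16)
assert grid_img.size == (224, 224)  # resized before the 14x14 grid is drawn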

+def make_attention_overlay(base_img: Image.Image, heat_grid: np.ndarray, cmap_alpha: float = 0.45) -> Image.Image:
     """
+    heat_grid: (G, G) values in any scale (we will normalize)
+    overlay on base_img (resized to 224x224)
     """
+    img = base_img.convert("RGB").resize((224, 224))
+    g = np.array(heat_grid, dtype=np.float32)
+    # normalize 0..1
+    if np.any(g):
+        g = g - g.min()
+        if g.max() > 0:
+            g = g / g.max()
     else:
+        g = np.zeros_like(g, dtype=np.float32)

+    # upsample
+    heat_img = Image.fromarray((g * 255).astype("uint8"), mode="L").resize((224, 224), Image.BILINEAR)
+    heat = np.array(heat_img).astype(np.float32) / 255.0

+    # simple colormap blue->red
     r = heat
+    gch = np.zeros_like(heat)
     b = 1.0 - heat
+    cam = np.stack([r, gch, b], axis=-1)

+    base_np = np.array(img).astype(np.float32) / 255.0
+    blended = (1 - cmap_alpha) * base_np + cmap_alpha * cam
     blended = np.clip(blended * 255.0, 0, 255).astype("uint8")
     return Image.fromarray(blended)
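
A usage sketch for the overlay helper (not in the commit): any non-negative grid works since values are rescaled to 0..1 before blending; the random 14×14 grid is purely illustrative.

rng = np.random.default_rng(0)
overlay_demo = make_attention_overlay(Image.new("RGB", (224, 224), "white"), rng.random((14, 14)))
assert overlay_demo.size == (224, 224)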

+# ------------------ attention rollout (Abnar & Zuidema) ------------------
+def compute_attention_rollout(all_attentions: List[torch.Tensor]) -> np.ndarray:
     """
+    all_attentions: list length L of tensors (batch, heads, seq, seq)
+    We'll average heads per layer -> (seq, seq) and compute rollout:
+        R = prod_l A_l_hat, where A_l_hat = A_l + I, rows normalized
+    Returns rollout matrix (seq, seq)
     """
+    # convert to np arrays averaged over heads
+    avg_mats = []
+    for a in all_attentions:
+        # a: (batch=1, heads, seq, seq)
+        mat = a[0].mean(dim=0).detach().cpu().numpy()  # (seq, seq)
+        avg_mats.append(mat)

+    seq = avg_mats[0].shape[0]
+    # add identity & normalize rows
+    aug = []
+    for A in avg_mats:
+        A_hat = A + np.eye(seq)
+        A_hat = A_hat / A_hat.sum(axis=-1, keepdims=True)
+        aug.append(A_hat)

+    # multiply (matrix product) in order
+    R = aug[0]
+    for A in aug[1:]:
+        R = A @ R
+    return R  # (seq, seq)
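
Each augmented matrix A_hat is row-stochastic, so the rollout product is row-stochastic too; a quick property check on toy attention maps (an editorial sketch, not in the commit):

toy = [torch.softmax(torch.randn(1, 12, 197, 197), dim=-1) for _ in range(12)]
R_toy = compute_attention_rollout(toy)
assert np.allclose(R_toy.sum(axis=-1), 1.0, atol=1e-5)  # rows still sum to 1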

+# ------------------ PCA projection for multiple layers ------------------
+def layers_pca_plot(hidden_states: List[torch.Tensor], layers: List[int]) -> Any:
+    """
+    hidden_states: list of tensors (batch, seq, hidden)
+    layers: list of indices within hidden_states to project
+    We'll remove the CLS token and do PCA for each chosen layer, then
+    plot patches from each layer with different colors on a single plot.
+    """
+    pts_all = []
+    layer_labels = []
+    for li in layers:
+        hs = hidden_states[li][0].detach().cpu().numpy()  # (seq, hidden)
+        patches = hs[1:, :]  # remove CLS -> (N_patches, hidden)
+        # PCA to 2D
+        pca = PCA(n_components=2)
+        pts = pca.fit_transform(patches)
+        pts_all.append(pts)
+        layer_labels.append(np.array([li] * pts.shape[0]))

+    # combine
+    coords = np.vstack(pts_all)
+    labels = np.concatenate(layer_labels)
+    df = {"x": coords[:, 0], "y": coords[:, 1], "layer": labels.astype(str)}
+    fig = px.scatter(df, x="x", y="y", color="layer", title="Patch embeddings across layers (PCA)")
+    fig.update_traces(marker=dict(size=6))
+    fig.update_layout(height=480)
     return fig
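
One caveat: the PCA above is fit independently per layer, so positions are comparable within a layer but not across layers. A shared-basis variant is a small change; layers_pca_plot_shared is a hypothetical name, not part of the commit:

def layers_pca_plot_shared(hidden_states, layers):
    # one PCA over all selected layers -> coordinates share a basis
    feats = np.vstack([hidden_states[li][0].detach().cpu().numpy()[1:, :] for li in layers])
    coords = PCA(n_components=2).fit_transform(feats)
    labels = np.concatenate([np.full(hidden_states[li].shape[1] - 1, str(li)) for li in layers])
    return px.scatter(x=coords[:, 0], y=coords[:, 1], color=labels, title="Patch embeddings (shared PCA basis)")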

+# ------------------ core analyzer ------------------
+def analyze_vit_full(img: Optional[Image.Image], simple: bool):
     if img is None:
         return (
+            None, None, None, None, "", "", {}
         )

+    base, clf, processor = load_models()

+    # preprocess to device
     img_resized = img.convert("RGB").resize((224, 224))
+    inputs = processor(images=img_resized, return_tensors="pt").to(DEVICE)

+    # forward pass through base model
     with torch.no_grad():
+        outputs = base(**inputs)

+    # outputs.attentions: list of L tensors (batch=1, heads, seq, seq)
+    attentions = outputs.attentions
+    hidden_states = outputs.hidden_states  # list of length L+1 (embeddings + one per layer)

+    L = len(attentions)
+    seq_len = attentions[0].shape[-1]
     n_patches = seq_len - 1
     grid_size = int(math.sqrt(n_patches))
     if grid_size * grid_size != n_patches:
+        # fallback: closest integer grid
         grid_size = int(round(math.sqrt(n_patches)))

+    # default selections
+    default_layer = L - 1
     default_head = 0
+    # default query token = 0 (CLS)
+    default_query = 0

+    # Build patch grid image
+    patch_grid = make_patch_grid_image(img.copy(), patch_size=16, target_size=224)

+    # Default patch-attention overlay: last layer, head 0, CLS query
+    att_np = attentions[default_layer][0].cpu().numpy()  # (heads, seq, seq)
+    cls_to_patches = att_np[default_head, 0, 1:]  # (n_patches,)
+    cls_grid = cls_to_patches.reshape(grid_size, grid_size)
+    attn_overlay = make_attention_overlay(img, cls_grid)

+    # Compute rollout
+    rollout_mat = compute_attention_rollout(attentions)  # (seq, seq)
+    rollout_cls = rollout_mat[0, 1:]
+    rollout_grid = rollout_cls.reshape(grid_size, grid_size)
+    rollout_overlay = make_attention_overlay(img, rollout_grid, cmap_alpha=0.5)

+    # PCA multi-layer: representative hidden states (embeddings, quarter, half,
+    # three-quarters, final); hidden_states has L+1 entries, so index L is the final layer
+    layers_to_show = sorted({0, L // 4, L // 2, 3 * L // 4, L})
+    pca_fig = layers_pca_plot(hidden_states, layers_to_show)

+    # Classification top-5
+    with torch.no_grad():
+        logits = clf(**inputs).logits[0].cpu().numpy()
+    probs = np.exp(logits - logits.max())
+    probs = probs / probs.sum()
+    top5 = probs.argsort()[-5:][::-1]
+    labels = clf.config.id2label
+    preds_text = "\n".join(f"{labels[int(i)]} — {probs[i]*100:.2f}%" for i in top5)

+    # Explanation
     if simple:
+        explain_md = f"""
+### 🧒 How ViT Sees the Image (Simple)
+1. The image is cut into {grid_size}×{grid_size} = {grid_size*grid_size} patches (16×16 pixels each).
+2. Each patch becomes a token. The model learns what each patch "means".
+3. Attention tells each token which other patches matter to it.
+4. Rollout aggregates attention across layers to show the final "focus".
+5. PCA shows how patch features evolve across layers (from raw to object-aware).
 """
     else:
+        explain_md = f"""
+### 🔬 Technical Explanation
+- Model: {MODEL_NAME}
+- Transformer layers: {L}, patch grid: {grid_size}×{grid_size}
+- We extract per-head token attentions and hidden states for PCA.
+- Patch attention visualization maps token attention back to the image grid.
+- Attention rollout uses Abnar & Zuidema's method to accumulate attention paths across layers.
 """

+    # state needed for interactive updates (layer/head/query); tensors moved to CPU
+    state = {
+        "attentions": [a[0].cpu() for a in attentions],  # (heads, seq, seq) per layer
+        "hidden_states": [h.cpu() for h in hidden_states],
+        "grid_size": grid_size,
+        "num_layers": L,
+        "num_heads": attentions[0].shape[1],
+        "base_image": img,  # original image; resized to 224 when overlaying
+    }

+    return (
+        patch_grid,
+        attn_overlay,
+        rollout_overlay,
+        pca_fig,
+        preds_text,
+        explain_md,
+        state,
+    )
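
A headless usage sketch for the analyzer (the filename is illustrative, not from the commit):

example = Image.open("example.jpg")  # any RGB photo
_, _, _, _, preds, _, st = analyze_vit_full(example, simple=True)
print(preds)  # top-5 ImageNet-1k labels with probabilities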

+# ------------------ update functions for sliders / choices ------------------
+def update_layer_head_query(state: Dict[str, Any], layer_idx: int, head_idx: int, query_token: int, mode: str):
     """
+    mode:
+      - "patch_attention": attention of query_token -> patches at (layer, head)
+      - "rollout": ignored (we will return the rollout overlay)
     """
+    if not state:
         return None

     base_img = state["base_image"]
+    grid = state["grid_size"]
+    L = state["num_layers"]
+    H = state["num_heads"]

+    l = max(0, min(int(layer_idx), L - 1))
+    h = max(0, min(int(head_idx), H - 1))
+    q = max(0, min(int(query_token), grid * grid))  # q in 0..n_patches (0 == CLS)

+    # attention for layer l: stored in state as a CPU tensor (heads, seq, seq)
+    att_tensor = state["attentions"][l]
+    if att_tensor.ndim == 4:  # defensive: a (1, heads, seq, seq) tensor was stored
+        att_tensor = att_tensor[0]
+    att_np = att_tensor.numpy()  # (heads, seq, seq)

+    # query q -> keys: q == 0 is CLS; key positions 1..seq-1 are the patches
+    seq = att_np.shape[-1]
+    if q >= seq:
+        q = 0

+    # attention vector for head h: att[h, q, 1:]
+    vec = att_np[h, q, 1:]
+    # if vec is shorter/longer than grid^2, pad or trim
+    if vec.shape[0] != grid * grid:
+        tmp = np.zeros(grid * grid, dtype=np.float32)
+        nmin = min(vec.shape[0], tmp.shape[0])
+        tmp[:nmin] = vec[:nmin]
+        vec = tmp

+    grid_map = vec.reshape(grid, grid)
+    overlay = make_attention_overlay(base_img, grid_map)
     return overlay
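
Query-token indexing, for reference: token 0 is CLS and tokens 1..196 scan patches left to right, top to bottom, so a patch query maps back to its grid cell like this (sketch, not in the commit):

q = 30                        # some patch token
row, col = divmod(q - 1, 14)  # 14x14 ViT-base grid -> row 2, col 1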

+def get_rollout_overlay(state: Dict[str, Any]):
+    if not state:
+        return None
+    attentions = state["attentions"]
+    # stored as (heads, seq, seq) tensors; compute_attention_rollout expects (1, heads, seq, seq)
+    mats = [a.unsqueeze(0) if a.ndim == 3 else a for a in attentions]
+    R = compute_attention_rollout(mats)  # (seq, seq)
+    grid = state["grid_size"]
+    rollout_cls = R[0, 1:]
+    if rollout_cls.shape[0] != grid * grid:
+        tmp = np.zeros(grid * grid, dtype=np.float32)
+        nmin = min(rollout_cls.shape[0], tmp.shape[0])
+        tmp[:nmin] = rollout_cls[:nmin]
+        rollout_cls = tmp
+    rollout_grid = rollout_cls.reshape(grid, grid)
+    return make_attention_overlay(state["base_image"], rollout_grid, cmap_alpha=0.55)


+def update_pca_layers(state: Dict[str, Any], selected_layers: List[int]):
+    if not state:
+        return None
+    # hidden_states stored as a list of CPU tensors (batch, seq, hidden)
+    hs = state["hidden_states"]
+    # clamp layer indices into range
+    layers = [max(0, min(int(l), len(hs) - 1)) for l in selected_layers]
+    fig = layers_pca_plot(hs, layers)
+    return fig

+# ------------------ GRADIO UI ------------------
+with gr.Blocks(title="ViT Full Interpretability (A+B+C)") as demo:
+    gr.Markdown(
+        "# 🔍 ViT Visualizer — Patch Attention, Rollout & Layer PCA\n"
+        "Model: **google/vit-base-patch16-224** — explore patches, heads, layers, rollout and feature evolution."
+    )

     with gr.Row():
         with gr.Column(scale=1):
+            img_in = gr.Image(label="Upload image (object/scene)", type="pil")
+            simple = gr.Checkbox(label="Simple explanation (kid-friendly)", value=True)
+            run_btn = gr.Button("Analyze ViT (full)")

+            gr.Markdown(
+                "**Patch Attention Controls**\n"
+                "Select layer, head and query token (0 = CLS, 1.. = patches left→right, top→bottom)."
+            )
+            layer_slider = gr.Slider(minimum=0, maximum=11, step=1, value=11, label="Layer")
+            head_slider = gr.Slider(minimum=0, maximum=11, step=1, value=0, label="Head")
+            query_slider = gr.Slider(minimum=0, maximum=196, step=1, value=0, label="Query token (0=CLS)")

+            gr.Markdown("**Attention Rollout & PCA**")
+            rollout_btn = gr.Button("Refresh Rollout Overlay")
+            # PCA layer selection: comma-separated hidden-state indices (0 = embeddings, 12 = final layer)
+            pca_layers_txt = gr.Textbox(
+                label="PCA layers (comma separated hidden-state indices 0..12, e.g. 0,3,6,12)",
+                value="0,3,6,9,12",
+            )

+        with gr.Column(scale=1):
+            gr.Markdown("### Outputs")
+            patch_grid_out = gr.Image(label="Patch grid (16×16)")
+            attn_overlay_out = gr.Image(label="Patch Attention Overlay (layer/head/query)")
+            rollout_overlay_out = gr.Image(label="Attention Rollout Overlay (aggregated)")
+            pca_out = gr.Plot(label="PCA: patch embeddings across selected layers")
+            preds_out = gr.Textbox(label="Top-5 predictions", lines=6)
+            explanation_out = gr.Markdown(label="Explanation")

     state = gr.State()

+    # main analysis
     run_btn.click(
+        fn=analyze_vit_full,
+        inputs=[img_in, simple],
+        outputs=[patch_grid_out, attn_overlay_out, rollout_overlay_out, pca_out, preds_out, explanation_out, state],
     )

+    # update attention overlay (layer/head/query)
     layer_slider.change(
+        fn=update_layer_head_query,
+        inputs=[state, layer_slider, head_slider, query_slider, gr.State("patch_attention")],
+        outputs=[attn_overlay_out],
     )
     head_slider.change(
+        fn=update_layer_head_query,
+        inputs=[state, layer_slider, head_slider, query_slider, gr.State("patch_attention")],
+        outputs=[attn_overlay_out],
+    )
+    query_slider.change(
+        fn=update_layer_head_query,
+        inputs=[state, layer_slider, head_slider, query_slider, gr.State("patch_attention")],
+        outputs=[attn_overlay_out],
+    )

+    # rollout refresh
+    rollout_btn.click(
+        fn=get_rollout_overlay,
+        inputs=[state],
+        outputs=[rollout_overlay_out],
+    )

+    # PCA layers (parse the text input)
+    def parse_and_update_pca(state_obj, txt):
+        if not state_obj:
+            return None
+        try:
+            parts = [int(p.strip()) for p in txt.split(",") if p.strip() != ""]
+        except ValueError:
+            parts = [0, max(0, state_obj["num_layers"] - 1)]
+        return update_pca_layers(state_obj, parts)

+    pca_layers_txt.submit(
+        fn=parse_and_update_pca,
+        inputs=[state, pca_layers_txt],
+        outputs=[pca_out],
     )

 demo.launch()
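
For completeness, a plausible requirements set for running this Space, inferred from the imports above rather than taken from the commit (the version pin is an assumption):

# requirements.txt (inferred, not part of the commit)
gradio>=5.0
torch
transformers
scikit-learn
plotly
pillow
numpy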