PraneshJs committed on
Commit e4963e3 · verified · 1 Parent(s): 50cbd61

Update app.py

Files changed (1):
  1. app.py +348 -270

app.py CHANGED
@@ -1,12 +1,16 @@
-# ViT Visualizer — Full Interpretability Suite (A + B + C)
 # Model: google/vit-base-patch16-224
-# Gradio 5 compatible, CPU-friendly
 # Features:
-# - Patch grid (16x16)
-# - Patch attention (per layer / per head / query token)
-# - Attention rollout (layer aggregated)
-# - PCA of patch embeddings across layers
-# - Top-5 predictions & simple/technical explanations
 # ==========================================================

 import math
@@ -16,14 +20,15 @@ from typing import Any, Dict, List, Optional, Tuple
 import gradio as gr
 import numpy as np
 import torch
-from PIL import Image, ImageDraw
 from transformers import (
     AutoImageProcessor,
     ViTModel,
     ViTForImageClassification,
     AutoConfig,
 )
-from sklearn.decomposition import PCA
 import plotly.express as px

 warnings.filterwarnings("ignore")
@@ -31,214 +36,261 @@ warnings.filterwarnings("ignore")
 MODEL_NAME = "google/vit-base-patch16-224"
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

-# global caches
-VIT_BASE = None  # ViTModel (encoder with hidden states & attentions)
-VIT_CLF = None  # ViTForImageClassification (classification head)
 PROCESSOR = None


-# ------------------ model loader with SDPA -> eager fix ------------------
 def load_models():
-    """
-    Load processor + ViT base + classification head.
-    Important: create config first, set attn_implementation='eager'
-    before enabling output_attentions/output_hidden_states, then load models.
-    """
-    global VIT_BASE, VIT_CLF, PROCESSOR
-    if VIT_BASE is not None and VIT_CLF is not None and PROCESSOR is not None:
-        return VIT_BASE, VIT_CLF, PROCESSOR

     PROCESSOR = AutoImageProcessor.from_pretrained(MODEL_NAME)

-    # load config, modify before creating model
-    cfg = AutoConfig = None
-    try:
-        cfg = AutoConfig.from_pretrained(MODEL_NAME)
-    except Exception:
-        # fallback: load a default config and set minimal fields
-        from transformers import ViTConfig
-        cfg = ViTConfig.from_pretrained(MODEL_NAME)
-
-    # FORCE eager attention backend so we can extract attentions
-    # (must set attn_implementation before enabling output_attentions)
-    cfg.attn_implementation = "eager"
     cfg.output_attentions = True
     cfg.output_hidden_states = True

-    # now load the base encoder with the modified config
-    base = ViTModel.from_pretrained(MODEL_NAME, config=cfg)
-    base.to(DEVICE)
-    base.eval()

-    # load classifier separately (we can use default config for classifier)
-    clf = ViTForImageClassification.from_pretrained(MODEL_NAME)
-    clf.to(DEVICE)
-    clf.eval()

-    VIT_BASE = base
-    VIT_CLF = clf
-    return base, clf, PROCESSOR
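Why the eager fix matters: PyTorch's SDPA attention kernel never materializes the full attention matrix, so `output_attentions=True` yields no usable maps under it (depending on the transformers version it warns, falls back, or errors). A minimal standalone check, as a sketch only, assuming transformers >= 4.36 (which accepts `attn_implementation` directly in `from_pretrained`; older versions need the config route used in this diff):

```python
# Sanity-check that eager attention actually yields attention maps.
import torch
from transformers import ViTModel

model = ViTModel.from_pretrained(
    "google/vit-base-patch16-224",
    attn_implementation="eager",  # SDPA would not return attention weights
).eval()

pixel_values = torch.randn(1, 3, 224, 224)  # stand-in for a processed image
with torch.no_grad():
    out = model(pixel_values, output_attentions=True, output_hidden_states=True)

assert len(out.attentions) == 12                     # one map per layer (ViT-Base)
assert out.attentions[0].shape == (1, 12, 197, 197)  # (batch, heads, CLS+196, CLS+196)
```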
 

-# ------------------ helpers: patch grid & overlay ------------------
-def make_patch_grid_image(pil: Image.Image, patch_size: int = 16, target_size: int = 224) -> Image.Image:
-    img = pil.convert("RGB").resize((target_size, target_size))
     draw = ImageDraw.Draw(img)
     w, h = img.size
     for x in range(0, w, patch_size):
-        draw.line((x, 0, x, h), fill=(0, 200, 0), width=1)
     for y in range(0, h, patch_size):
-        draw.line((0, y, w, y), fill=(0, 200, 0), width=1)
     return img


-def make_attention_overlay(base_img: Image.Image, heat_grid: np.ndarray, cmap_alpha: float = 0.45) -> Image.Image:
     """
-    heat_grid: (G, G) values (any scale) -> normalized then overlaid on base_img (resized to 224x224)
     """
-    img = base_img.convert("RGB").resize((224, 224))
     g = np.array(heat_grid, dtype=np.float32)
-    # normalize 0..1
     if np.any(g):
         g = g - g.min()
         if g.max() > 0:
             g = g / g.max()
         else:
-            g = np.zeros_like(g, dtype=np.float32)
-
-    # upsample to image resolution
     heat_img = Image.fromarray((g * 255).astype("uint8"), mode="L").resize((224, 224), Image.BILINEAR)
     heat = np.array(heat_img).astype(np.float32) / 255.0
-
-    # simple blue->red colormap
-    r = heat
-    gch = np.zeros_like(heat)
-    b = 1.0 - heat
-    cam = np.stack([r, gch, b], axis=-1)
-
-    base_np = np.array(img).astype(np.float32) / 255.0
-    blended = (1 - cmap_alpha) * base_np + cmap_alpha * cam
-    blended = np.clip(blended * 255.0, 0, 255).astype("uint8")
-    return Image.fromarray(blended)


-# ------------------ attention rollout (Abnar & Zuidema) ------------------
 def compute_attention_rollout(all_attentions: List[torch.Tensor]) -> np.ndarray:
-    """
-    all_attentions: list length L of tensors (batch, heads, seq, seq)
-    We'll average heads per layer -> (seq, seq) and compute rollout:
-    R = prod_l (A_l_hat) where A_l_hat = A_l + I; rows normalized
-    Returns rollout matrix (seq, seq)
-    """
     avg_mats = []
     for a in all_attentions:
-        # a: (batch=1, heads, seq, seq)
         mat = a[0].mean(dim=0).detach().cpu().numpy()  # (seq, seq)
         avg_mats.append(mat)
-
     seq = avg_mats[0].shape[0]
     aug = []
     for A in avg_mats:
         A_hat = A + np.eye(seq)
-        # normalize rows (sum over last dim)
         row_sums = A_hat.sum(axis=-1, keepdims=True)
-        # avoid division by zero
         row_sums[row_sums == 0] = 1.0
         A_hat = A_hat / row_sums
         aug.append(A_hat)
-
     R = aug[0]
     for A in aug[1:]:
         R = A @ R
     return R  # (seq, seq)
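For intuition, the rollout recurrence above can be checked on a toy example: with `A_hat = row_normalize(A + I)`, the product `R = A_hat_L @ ... @ A_hat_1` keeps rows stochastic, which is why `R[0, 1:]` reads as a distribution of CLS attention over the patches. A self-contained numpy sketch:

```python
# Toy check of attention rollout (Abnar & Zuidema, 2020) on two fake layers.
import numpy as np

rng = np.random.default_rng(0)
layers = [rng.random((4, 4)) for _ in range(2)]          # head-averaged attention
layers = [a / a.sum(-1, keepdims=True) for a in layers]  # make rows stochastic

R = np.eye(4)
for A in layers:
    A_hat = A + np.eye(4)              # add the residual connection
    A_hat /= A_hat.sum(-1, keepdims=True)
    R = A_hat @ R                      # later layers multiply on the left

assert np.allclose(R.sum(-1), 1.0)     # rows remain probability distributions
```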


-# ------------------ PCA projection for multiple layers ------------------
-def layers_pca_plot(hidden_states: List[torch.Tensor], layers: List[int]) -> Any:
-    """
-    hidden_states: list of tensors (batch, seq, hidden)
-    layers: list of indices within hidden_states to project
-    We'll remove CLS token and do PCA for each chosen layer;
-    plot patches from each layer with different colors on single plot.
-    """
     pts_all = []
-    layer_labels = []
     for li in layers:
-        hs = hidden_states[li][0].detach().cpu().numpy()  # (seq, hidden)
-        patches = hs[1:, :]  # remove CLS -> (N_patches, hidden)
         pca = PCA(n_components=2)
         pts = pca.fit_transform(patches)
         pts_all.append(pts)
-        layer_labels.append(np.array([li] * pts.shape[0]))
-
     coords = np.vstack(pts_all)
-    labels = np.concatenate(layer_labels)
-    df = {"x": coords[:, 0], "y": coords[:, 1], "layer": labels.astype(str)}
     fig = px.scatter(df, x="x", y="y", color="layer", title="Patch embeddings across layers (PCA)")
     fig.update_traces(marker=dict(size=6))
     fig.update_layout(height=480)
     return fig
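The per-layer projection step in isolation, with stand-in data shaped like a ViT-Base layer (197 tokens x 768 dims); note that each layer gets its own PCA fit, so coordinates are comparable within a layer but not across layers:

```python
# One layer's patch tokens projected to 2-D, as layers_pca_plot does per layer.
import numpy as np
from sklearn.decomposition import PCA

hidden = np.random.randn(197, 768).astype(np.float32)  # stand-in: CLS + 196 patches
patches = hidden[1:]                                   # drop CLS -> (196, 768)
pts = PCA(n_components=2).fit_transform(patches)       # (196, 2)
print(pts.shape)  # each patch becomes one point, colored by layer in the app
```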


-# ------------------ core analyzer ------------------
-def analyze_vit_full(img: Optional[Image.Image], simple: bool):
     if img is None:
-        return (None, None, None, None, None, "", {})

     base, clf, processor = load_models()

-    # preprocess to device
-    img_resized = img.convert("RGB").resize((224, 224))
-    inputs = processor(images=img_resized, return_tensors="pt").to(DEVICE)

-    # forward pass through base model
     with torch.no_grad():
         outputs = base(**inputs)

-    # outputs.attentions: list L tensors (batch=1, heads, seq, seq)
-    attentions = outputs.attentions
     hidden_states = outputs.hidden_states

-    L = len(attentions)
     seq_len = attentions[0].shape[-1]
     n_patches = seq_len - 1
-    grid_size = int(math.sqrt(n_patches))
-    if grid_size * grid_size != n_patches:
-        grid_size = int(round(math.sqrt(n_patches)))
-
-    # Build patch grid image
-    patch_grid = make_patch_grid_image(img.copy(), patch_size=16, target_size=224)
-
-    # default overlay: last layer, head 0, CLS query
-    last_layer = L - 1
-    head0 = 0
-    # attentions[last_layer]: shape (batch=1, heads, seq, seq)
-    att_np = attentions[last_layer][0].cpu().numpy()  # (heads, seq, seq)
-    cls_to_patches = att_np[head0, 0, 1:]  # (n_patches,)
-    if cls_to_patches.shape[0] != grid_size * grid_size:
-        tmp = np.zeros(grid_size * grid_size, dtype=np.float32)
-        nmin = min(cls_to_patches.shape[0], tmp.shape[0])
-        tmp[:nmin] = cls_to_patches[:nmin]
-        cls_to_patches = tmp
-    cls_grid = cls_to_patches.reshape(grid_size, grid_size)
-    attn_overlay = make_attention_overlay(img, cls_grid)

-    # Compute rollout overlay (CLS)
-    rollout_mat = compute_attention_rollout(attentions)  # (seq, seq)
-    rollout_cls = rollout_mat[0, 1:]
     if rollout_cls.shape[0] != grid_size * grid_size:
-        tmp = np.zeros(grid_size * grid_size, dtype=np.float32)
-        nmin = min(rollout_cls.shape[0], tmp.shape[0])
         tmp[:nmin] = rollout_cls[:nmin]
         rollout_cls = tmp
     rollout_grid = rollout_cls.reshape(grid_size, grid_size)
-    rollout_overlay = make_attention_overlay(img, rollout_grid, cmap_alpha=0.55)
-
-    # PCA multi-layer: choose representative layers
-    layers_to_show = sorted(list({0, max(0, L // 4), max(0, L // 2), max(0, 3 * L // 4), L - 1}))
-    pca_fig = layers_pca_plot(hidden_states, layers_to_show)

-    # Classification top-5
     with torch.no_grad():
         logits = clf(**inputs).logits[0].cpu().numpy()
         probs = np.exp(logits - logits.max())
@@ -247,175 +299,201 @@ def analyze_vit_full(img: Optional[Image.Image], simple: bool):
     labels = clf.config.id2label
     preds_text = "\n".join([f"{labels[i]} — {probs[i]*100:.2f}%" for i in top5])
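The prediction block relies on the exp-normalize trick: subtracting the max logit before `exp` prevents overflow without changing the softmax result. (The `probs /= probs.sum()` and `top5` lines are unchanged context elided between the two hunks.) A self-contained sketch of the same pattern:

```python
# Numerically stable softmax + top-5, mirroring the pattern in the diff.
import numpy as np

logits = np.array([2.0, 1.0, 0.1, 5.0, 3.0, -1.0])
probs = np.exp(logits - logits.max())  # shift by the max: exp() cannot overflow
probs /= probs.sum()                   # normalize to probabilities

top5 = probs.argsort()[-5:][::-1]      # indices of the 5 largest, descending
print(top5, probs[top5])
```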

-    # Explanation
-    if simple:
-        explain_md = f"""
-        ### 🧒 How ViT Sees the Image (Simple)
-        1. Image is cut into {grid_size}×{grid_size} = {grid_size*grid_size} patches (16×16).
-        2. Each patch becomes a token. The model learns what each patch "means".
-        3. Attention tells each token which other patches matter to it.
-        4. Rollout aggregates attention across layers to show the final "focus".
-        5. PCA shows how patch features evolve across layers (from raw to object-aware).
-        """
-    else:
-        explain_md = f"""
-        ### 🔬 Technical Explanation
-        - Model: {MODEL_NAME}
-        - Transformer layers: {L}, patch grid: {grid_size}×{grid_size}
-        - We extract token attentions (heads) and hidden states for PCA.
-        - Patch attention visualization maps token attention back to the image grid.
-        - Attention rollout uses Abnar & Zuidema's method to accumulate attention paths across layers.
-        """

     state = {
-        "attentions": [a.cpu() for a in attentions],  # move to CPU for interactive updates
         "hidden_states": [h.cpu() for h in hidden_states],
         "grid_size": grid_size,
-        "num_layers": L,
         "num_heads": attentions[0].shape[1],
         "base_image": img,
     }

-    return patch_grid, attn_overlay, rollout_overlay, pca_fig, preds_text, explain_md, state


-# ------------------ update functions for sliders / choices ------------------
-def update_layer_head_query(state: Dict[str, Any], layer_idx: int, head_idx: int, query_token: int, mode: str):
-    if not state:
-        return None

-    base_img = state["base_image"]
-    grid = state["grid_size"]
-    L = state["num_layers"]
-    H = state["num_heads"]

-    l = max(0, min(int(layer_idx), L - 1))
-    h = max(0, min(int(head_idx), H - 1))
-    q = max(0, min(int(query_token), grid * grid))  # q in 0..n_patches (0==CLS)

-    att_tensor = state["attentions"][l]  # shape (heads, seq, seq) or (1,heads,seq,seq)
     if att_tensor.ndim == 4:
         att_tensor = att_tensor[0]
     att_np = att_tensor.numpy()  # (heads, seq, seq)
-
-    seq = att_np.shape[-1]
-    if q >= seq:
-        q = 0
-
-    vec = att_np[h, q, 1:]
     if vec.shape[0] != grid * grid:
-        tmp = np.zeros(grid * grid, dtype=np.float32)
         nmin = min(vec.shape[0], tmp.shape[0])
         tmp[:nmin] = vec[:nmin]
         vec = tmp
-
     grid_map = vec.reshape(grid, grid)
-    overlay = make_attention_overlay(base_img, grid_map)
-    return overlay
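The query-token slider below uses the convention the UI text describes: token 0 is CLS, tokens 1..196 scan the grid row by row. A small sketch of that mapping (the helper names are illustrative, not part of the file):

```python
# Token index <-> patch (row, col) for the 14x14 grid of vit-base-patch16-224.
GRID = 14  # 224 / 16

def token_to_rowcol(q: int) -> tuple:
    assert 1 <= q <= GRID * GRID, "token 0 is CLS; patches start at 1"
    p = q - 1
    return p // GRID, p % GRID

def rowcol_to_token(row: int, col: int) -> int:
    return 1 + row * GRID + col

print(token_to_rowcol(1))       # (0, 0): top-left patch
print(rowcol_to_token(13, 13))  # 196: bottom-right patch
```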


-def get_rollout_overlay(state: Dict[str, Any]):
     if not state:
         return None
-    attentions = state["attentions"]
-    mats = [a.unsqueeze(0) if a.ndim == 3 else a for a in attentions]
-    R = compute_attention_rollout(mats)  # (seq, seq)
     grid = state["grid_size"]
     rollout_cls = R[0, 1:]
     if rollout_cls.shape[0] != grid * grid:
-        tmp = np.zeros(grid * grid, dtype=np.float32)
-        nmin = min(rollout_cls.shape[0], tmp.shape[0])
         tmp[:nmin] = rollout_cls[:nmin]
         rollout_cls = tmp
     rollout_grid = rollout_cls.reshape(grid, grid)
-    return make_attention_overlay(state["base_image"], rollout_grid, cmap_alpha=0.55)


-def update_pca_layers(state: Dict[str, Any], selected_layers: List[int]):
     if not state:
         return None
-    hs = state["hidden_states"]
-    layers = [max(0, min(int(l), len(hs) - 1)) for l in selected_layers]
-    fig = layers_pca_plot(hs, layers)
-    return fig
-
-
-# ------------------ GRADIO UI ------------------
-with gr.Blocks(title="ViT Full Interpretability (A+B+C)") as demo:
-    gr.Markdown("# 🔍 ViT Visualizer — Patch Attention, Rollout & Layer PCA\n"
-                "Model: **google/vit-base-patch16-224** — explore patches, heads, layers, rollout and feature evolution.")
-
-    with gr.Row():
-        with gr.Column(scale=1):
-            img_in = gr.Image(label="Upload image (object/scene)", type="pil")
-            simple = gr.Checkbox(label="Simple explanation (kid-friendly)", value=True)
-            run_btn = gr.Button("Analyze ViT (full)")
-
-            gr.Markdown("**Patch Attention Controls**\nSelect layer, head and query token (0 = CLS, 1.. = patches left→right top→bottom).")
-            layer_slider = gr.Slider(minimum=0, maximum=11, step=1, value=11, label="Layer")
-            head_slider = gr.Slider(minimum=0, maximum=11, step=1, value=0, label="Head")
-            query_slider = gr.Slider(minimum=0, maximum=196, step=1, value=0, label="Query token (0=CLS)")
-
-            gr.Markdown("**Attention Rollout & PCA**")
-            rollout_btn = gr.Button("Refresh Rollout Overlay")
-            pca_layers_txt = gr.Textbox(label="PCA layers (comma separated indices, e.g. 0,3,6,11)", value="0,3,6,11")
-
-        with gr.Column(scale=1):
-            gr.Markdown("### Outputs")
-            patch_grid_out = gr.Image(label="Patch grid (16×16)")
-            attn_overlay_out = gr.Image(label="Patch Attention Overlay (layer/head/query)")
-            rollout_overlay_out = gr.Image(label="Attention Rollout Overlay (aggregated)")
-            pca_out = gr.Plot(label="PCA: patch embeddings across selected layers")
-            preds_out = gr.Textbox(label="Top-5 predictions", lines=6)
-            explanation_out = gr.Markdown(label="Explanation")
-
-    state = gr.State()
-
-    # main analysis
-    run_btn.click(
-        fn=analyze_vit_full,
-        inputs=[img_in, simple],
-        outputs=[patch_grid_out, attn_overlay_out, rollout_overlay_out, pca_out, preds_out, explanation_out, state],
-    )
-
-    # update attention overlay (layer/head/query)
-    layer_slider.change(
-        fn=update_layer_head_query,
-        inputs=[state, layer_slider, head_slider, query_slider, gr.State("patch_attention")],
-        outputs=[attn_overlay_out],
-    )
-    head_slider.change(
-        fn=update_layer_head_query,
-        inputs=[state, layer_slider, head_slider, query_slider, gr.State("patch_attention")],
-        outputs=[attn_overlay_out],
-    )
-    query_slider.change(
-        fn=update_layer_head_query,
-        inputs=[state, layer_slider, head_slider, query_slider, gr.State("patch_attention")],
-        outputs=[attn_overlay_out],
-    )
-
-    # rollout refresh
-    rollout_btn.click(
-        fn=get_rollout_overlay,
-        inputs=[state],
-        outputs=[rollout_overlay_out],
-    )
-
-    # PCA layers (parse input text)
-    def parse_and_update_pca(state_obj, txt):
-        if not state_obj:
-            return None
-        try:
-            parts = [int(p.strip()) for p in txt.split(",") if p.strip() != ""]
-        except:
-            parts = [0, max(0, state_obj["num_layers"] - 1)]
-        return update_pca_layers(state_obj, parts)
-
-    pca_layers_txt.submit(
-        fn=parse_and_update_pca,
-        inputs=[state, pca_layers_txt],
-        outputs=[pca_out],
-    )

 demo.launch()
+# ==========================================================
+# ViT Visualizer — Simple (comic-style) + Advanced Mode
 # Model: google/vit-base-patch16-224
+# Gradio 5 compatible; CPU-friendly
+#
 # Features:
+#   - Simple mode (4-step, non-technical, kid-friendly)
+#       Step1: Patch grid
+#       Step2: Patch clustering (colored blocks)
+#       Step3: Patch-to-patch arrows (simplified attention)
+#       Step4: Focus map (rollout) + Top-5 predictions
+#   - Advanced mode (attention maps per layer/head, rollout, PCA)
+#   - SDPA -> eager fix for attention extraction
 # ==========================================================

 import math

 import gradio as gr
 import numpy as np
 import torch
+from PIL import Image, ImageDraw, ImageFont
+from sklearn.cluster import KMeans
+from sklearn.decomposition import PCA
 from transformers import (
     AutoImageProcessor,
     ViTModel,
     ViTForImageClassification,
     AutoConfig,
 )
 import plotly.express as px

 warnings.filterwarnings("ignore")
 
 MODEL_NAME = "google/vit-base-patch16-224"
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

+# Globals
+BASE_MODEL = None
+CLF_MODEL = None
 PROCESSOR = None


+# ------------------- model loader with SDPA -> eager fix -------------------
 def load_models():
+    global BASE_MODEL, CLF_MODEL, PROCESSOR
+    if BASE_MODEL is not None and CLF_MODEL is not None and PROCESSOR is not None:
+        return BASE_MODEL, CLF_MODEL, PROCESSOR

     PROCESSOR = AutoImageProcessor.from_pretrained(MODEL_NAME)

+    # load config first, set attn_implementation BEFORE enabling attentions
+    cfg = AutoConfig.from_pretrained(MODEL_NAME)
+    cfg.attn_implementation = "eager"  # << must set this first
     cfg.output_attentions = True
     cfg.output_hidden_states = True

+    # load base encoder with modified config (we'll extract hidden states & attentions)
+    BASE_MODEL = ViTModel.from_pretrained(MODEL_NAME, config=cfg)
+    BASE_MODEL.to(DEVICE).eval()
+
+    # classifier head (for top-5 predictions)
+    CLF_MODEL = ViTForImageClassification.from_pretrained(MODEL_NAME)
+    CLF_MODEL.to(DEVICE).eval()
+
+    return BASE_MODEL, CLF_MODEL, PROCESSOR
+ return BASE_MODEL, CLF_MODEL, PROCESSOR
68
 
 
 
 
 
69
 
70
+ # ------------------- utility: patch grid positions -------------------
71
+ def patch_grid_info(image_size: int = 224, patch_size: int = 16):
72
+ grid_size = image_size // patch_size
73
+ positions = []
74
+ for i in range(grid_size):
75
+ for j in range(grid_size):
76
+ # center coordinates of patch (x,y)
77
+ cx = int((j + 0.5) * patch_size)
78
+ cy = int((i + 0.5) * patch_size)
79
+ positions.append((cx, cy))
80
+ return grid_size, positions
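A quick usage check of this helper; the values follow directly from the 224/16 defaults:

```python
# patch_grid_info() geometry for a 224x224 input with 16x16 patches.
grid_size, positions = patch_grid_info(224, 16)
assert grid_size == 14
assert len(positions) == 196        # 14 x 14 patch centers
assert positions[0] == (8, 8)       # center of the top-left patch
assert positions[-1] == (216, 216)  # center of the bottom-right patch
```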


+# ------------------- visual helpers -------------------
+def draw_patch_grid(img: Image.Image, patch_size: int = 16, outline=(0, 180, 0)) -> Image.Image:
+    img = img.convert("RGB").resize((224, 224))
     draw = ImageDraw.Draw(img)
     w, h = img.size
     for x in range(0, w, patch_size):
+        draw.line([(x, 0), (x, h)], fill=outline, width=1)
     for y in range(0, h, patch_size):
+        draw.line([(0, y), (w, y)], fill=outline, width=1)
     return img


+def draw_cluster_blocks(img: Image.Image, labels: np.ndarray, n_clusters: int = 4, patch_size: int = 16):
     """
+    labels: (n_patches,) cluster labels assigned to each patch index (left→right, top→bottom)
+    """
+    img = img.convert("RGB").resize((224, 224))
+    draw = ImageDraw.Draw(img, "RGBA")
+    grid_size, positions = patch_grid_info()
+    colors = [
+        (255, 99, 71, 140),
+        (60, 179, 113, 140),
+        (65, 105, 225, 140),
+        (255, 215, 0, 140),
+        (199, 21, 133, 140),
+        (0, 206, 209, 140),
+    ]
+    for idx, lab in enumerate(labels):
+        i = idx // grid_size
+        j = idx % grid_size
+        x0 = j * patch_size
+        y0 = i * patch_size
+        x1 = x0 + patch_size
+        y1 = y0 + patch_size
+        col = colors[int(lab) % len(colors)]
+        draw.rectangle([x0, y0, x1, y1], fill=col)
+    return img
+
+
+def draw_attention_arrows(img: Image.Image, att_matrix: np.ndarray, top_k: int = 3, query_idx: Optional[int] = None):
+    """
+    att_matrix: (n_patches, n_patches) attention from query->keys (already preprocessed)
+    If query_idx is None -> use CLS (not plotted as patch), else 0..n_patches-1
+    We'll draw arrows from query patch centers to top-k key patch centers.
+    """
+    img = img.convert("RGB").resize((224, 224))
+    draw = ImageDraw.Draw(img, "RGBA")
+    grid_size, positions = patch_grid_info()
+    # pick a query: if None, choose center patch
+    if query_idx is None:
+        query_idx = (grid_size * grid_size) // 2
+    qpos = positions[query_idx]
+    # find top_k keys attended by this query
+    vec = att_matrix[query_idx]  # length n_patches
+    top_idx = vec.argsort()[-top_k:][::-1]
+    for t in top_idx:
+        kpos = positions[t]
+        # draw line + arrowhead
+        draw.line([qpos, kpos], fill=(255, 0, 0, 200), width=3)
+        # arrowhead: small triangle
+        dx = kpos[0] - qpos[0]
+        dy = kpos[1] - qpos[1]
+        ang = math.atan2(dy, dx)
+        # size proportional
+        ah = 8
+        p1 = (kpos[0] - ah * math.cos(ang - 0.3), kpos[1] - ah * math.sin(ang - 0.3))
+        p2 = (kpos[0] - ah * math.cos(ang + 0.3), kpos[1] - ah * math.sin(ang + 0.3))
+        draw.polygon([kpos, p1, p2], fill=(255, 0, 0, 200))
+    # highlight query patch with blue circle
+    r = 10
+    draw.ellipse([qpos[0] - r, qpos[1] - r, qpos[0] + r, qpos[1] + r], outline=(0, 0, 255, 220), width=2)
+    return img
+ return img
155
+
156
+
157
+ def make_focus_overlay(img: Image.Image, heat_grid: np.ndarray, alpha: float = 0.6):
158
  """
159
+ heat_grid: (G,G) float map
160
+ overlay colored transparency on image where heat is high
161
+ """
162
+ img = img.convert("RGB").resize((224, 224))
163
  g = np.array(heat_grid, dtype=np.float32)
 
164
  if np.any(g):
165
  g = g - g.min()
166
  if g.max() > 0:
167
  g = g / g.max()
168
  else:
169
+ g = np.zeros_like(g)
 
 
170
  heat_img = Image.fromarray((g * 255).astype("uint8"), mode="L").resize((224, 224), Image.BILINEAR)
171
  heat = np.array(heat_img).astype(np.float32) / 255.0
172
+ draw = ImageDraw.Draw(img, "RGBA")
173
+ # color mapping simple: yellow -> red
174
+ H, W = heat.shape
175
+ for y in range(H):
176
+ for x in range(W):
177
+ v = heat[y, x]
178
+ if v > 0.05:
179
+ # map to color
180
+ r = int(255 * v)
181
+ gcol = int(200 * (1 - v))
182
+ draw.point((x, y), fill=(r, gcol, 40, int(255 * alpha * v)))
183
+ return img
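The per-pixel `draw.point` loop above is easy to read but slow (up to 50k calls per image). A vectorized equivalent is possible; a sketch (not part of the commit), taking the already-upsampled 224x224 heat array in [0, 1]:

```python
# Vectorized version of the yellow->red blend above (same math, done in numpy).
import numpy as np
from PIL import Image

def make_focus_overlay_fast(img: Image.Image, heat: np.ndarray, alpha: float = 0.6) -> Image.Image:
    base = np.asarray(img.convert("RGB").resize((224, 224)), dtype=np.float32)
    v = heat[..., None]                                   # (224, 224, 1), values in [0, 1]
    color = np.concatenate([255 * v, 200 * (1 - v), np.full_like(v, 40.0)], axis=-1)
    a = alpha * v * (v > 0.05)                            # per-pixel blend weight
    out = (1 - a) * base + a * color                      # alpha-composite onto the photo
    return Image.fromarray(out.astype(np.uint8))
```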


+# ------------------- attention rollout (Abnar & Zuidema) -------------------
 def compute_attention_rollout(all_attentions: List[torch.Tensor]) -> np.ndarray:
     avg_mats = []
     for a in all_attentions:
         mat = a[0].mean(dim=0).detach().cpu().numpy()  # (seq, seq)
         avg_mats.append(mat)
     seq = avg_mats[0].shape[0]
     aug = []
     for A in avg_mats:
         A_hat = A + np.eye(seq)
         row_sums = A_hat.sum(axis=-1, keepdims=True)
         row_sums[row_sums == 0] = 1.0
         A_hat = A_hat / row_sums
         aug.append(A_hat)
     R = aug[0]
     for A in aug[1:]:
         R = A @ R
     return R  # (seq, seq)


+# ------------------- PCA helper for advanced -------------------
+def pca_plot_from_hidden(hidden_states: List[torch.Tensor], layers: List[int]):
     pts_all = []
+    labels = []
     for li in layers:
+        hs = hidden_states[li][0].detach().cpu().numpy()
+        patches = hs[1:, :]
         pca = PCA(n_components=2)
         pts = pca.fit_transform(patches)
         pts_all.append(pts)
+        labels.append(np.array([li] * pts.shape[0]))
     coords = np.vstack(pts_all)
+    layer_labels = np.concatenate(labels)
+    df = {"x": coords[:, 0], "y": coords[:, 1], "layer": layer_labels.astype(str)}
     fig = px.scatter(df, x="x", y="y", color="layer", title="Patch embeddings across layers (PCA)")
     fig.update_traces(marker=dict(size=6))
     fig.update_layout(height=480)
     return fig


+# ------------------- main analyzer (both modes) -------------------
+def analyze_all(img: Optional[Image.Image], mode_simple: bool):
     if img is None:
+        # return one placeholder per output (11 values, matching the return below)
+        return None, None, None, None, "", None, None, None, "", None, None

     base, clf, processor = load_models()

+    # preprocess
+    img224 = img.convert("RGB").resize((224, 224))
+    inputs = processor(images=img224, return_tensors="pt").to(DEVICE)

+    # forward through base model to get attentions & hidden states
     with torch.no_grad():
         outputs = base(**inputs)

+    attentions = outputs.attentions  # list L of (1, heads, seq, seq)
     hidden_states = outputs.hidden_states

+    # build grid & info
+    grid_size, positions = patch_grid_info()
     seq_len = attentions[0].shape[-1]
     n_patches = seq_len - 1

+    # Step1: patch grid image
+    patch_grid_img = draw_patch_grid(img224.copy())
+
+    # Step2: cluster patches using last hidden layer embeddings
+    last_hidden = hidden_states[-1][0].detach().cpu().numpy()  # (seq, hidden)
+    patch_embeddings = last_hidden[1:, :]  # remove CLS
+    # KMeans small number clusters (4)
+    n_clusters = 4
+    try:
+        kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(patch_embeddings)
+        cluster_labels = kmeans.labels_
+    except Exception:
+        # fallback uniform
+        cluster_labels = np.zeros(n_patches, dtype=int)
+
+    cluster_img = draw_cluster_blocks(img224.copy(), cluster_labels, n_clusters=n_clusters)
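Step 2 in isolation: KMeans over the 196 patch embeddings. A sketch with stand-in data (`n_init` set explicitly, since its default changed across scikit-learn versions):

```python
# Cluster patch embeddings into 4 groups, one label per 16x16 patch.
import numpy as np
from sklearn.cluster import KMeans

patch_embeddings = np.random.randn(196, 768).astype(np.float32)  # stand-in for last_hidden[1:]
labels = KMeans(n_clusters=4, random_state=0, n_init=10).fit(patch_embeddings).labels_
print(labels.shape)  # (196,) -> reshaped onto the 14x14 grid for coloring
```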
+
+    # Step3: simplified arrows using average last-layer attention across heads
+    last_att = attentions[-1][0].mean(dim=0).cpu().numpy()  # (seq, seq) averaged heads
+    # We want patch->patch attention (exclude CLS index in mapping)
+    # Map token indices 1.. to patch indices 0..
+    # Make an (n_patches, n_patches) matrix where row q corresponds to query patch q
+    if last_att.shape[0] >= n_patches + 1:
+        patch_to_patch = last_att[1:, 1:]  # (n_patches, n_patches)
+    else:
+        # fallback zeros
+        patch_to_patch = np.zeros((n_patches, n_patches))
+    # draw arrows for a central query
+    arrow_img = draw_attention_arrows(img224.copy(), patch_to_patch, top_k=4, query_idx=(n_patches // 2))
+
+    # Step4: rollout focus map (CLS rollout)
+    rollout = compute_attention_rollout(attentions)  # (seq, seq)
+    # take CLS row -> keys 1.. = patches
+    rollout_cls = rollout[0, 1:]
     if rollout_cls.shape[0] != grid_size * grid_size:
+        tmp = np.zeros(grid_size * grid_size, dtype=float)
+        nmin = min(len(rollout_cls), tmp.shape[0])
         tmp[:nmin] = rollout_cls[:nmin]
         rollout_cls = tmp
     rollout_grid = rollout_cls.reshape(grid_size, grid_size)
+    focus_img = make_focus_overlay(img224.copy(), rollout_grid, alpha=0.6)

+    # Top-5 predictions from classifier head
     with torch.no_grad():
         logits = clf(**inputs).logits[0].cpu().numpy()
         probs = np.exp(logits - logits.max())
     labels = clf.config.id2label
     preds_text = "\n".join([f"{labels[i]} — {probs[i]*100:.2f}%" for i in top5])

+    # Advanced outputs: PCA fig and default attention overlay (last layer head 0 CLS->patch)
+    pca_fig = pca_plot_from_hidden(hidden_states, [0, max(0, len(hidden_states) // 2), len(hidden_states) - 1])

+    # Attention overlay for advanced default (last layer head0 CLS->patch)
+    att_np = attentions[-1][0].cpu().numpy()  # (heads, seq, seq)
+    # average heads for simplicity
+    cls_to_patches = att_np.mean(axis=0)[0, 1:]
+    if cls_to_patches.shape[0] != grid_size * grid_size:
+        tmp = np.zeros(grid_size * grid_size, dtype=float)
+        nmin = min(len(cls_to_patches), tmp.shape[0])
+        tmp[:nmin] = cls_to_patches[:nmin]
+        cls_to_patches = tmp
+    cls_grid = cls_to_patches.reshape(grid_size, grid_size)
+    # create overlay
+    from PIL import Image  # ensure imported
+    focus_overlay_default = make_focus_overlay(img224.copy(), cls_grid, alpha=0.5)
+
+    # make state for interactive advanced controls (move to CPU to save GPU mem)
     state = {
+        "attentions": [a.cpu() for a in attentions],
         "hidden_states": [h.cpu() for h in hidden_states],
         "grid_size": grid_size,
+        "num_layers": len(attentions),
         "num_heads": attentions[0].shape[1],
         "base_image": img,
     }

+    # Return values:
+    #   Simple view: patch_grid_img, cluster_img, arrow_img, focus_img, preds_text, simple_explain
+    #   Advanced view: focus_overlay_default, pca_fig, preds_text, advanced_explain, state
+    simple_explain = """
+**How ViT Sees — Simple Steps**
+
+1) **Chop** — The image is chopped into small square tiles (patches) like LEGO pieces.
+2) **Understand** — Each piece gets a code that describes colors/edges. Pieces that look similar are grouped.
+3) **Talk** — Pieces tell each other what they see (we draw arrows to show that).
+4) **Focus & Guess** — The model merges clues and focuses on important areas, then guesses what the image shows.
+"""
+
+    advanced_explain = """
+**Advanced View:** Explore attention per layer/head, the PCA of patch embeddings, and the model's internal focus.
+Use sliders to change layer/head and see how ViT's attention evolves.
+"""
+
+    return (
+        patch_grid_img,
+        cluster_img,
+        arrow_img,
+        focus_img,
+        preds_text,
+        simple_explain,
+        focus_overlay_default,
+        pca_fig,
+        preds_text,
+        advanced_explain,
+        state,
+    )


+# ------------------- interactive advanced helpers -------------------
+def advanced_update_attention(state: Dict[str, Any], layer_idx: int, head_idx: int):
+    if not state:
+        return None
+    l = max(0, min(int(layer_idx), state["num_layers"] - 1))
+    h = max(0, min(int(head_idx), state["num_heads"] - 1))
+    att_tensor = state["attentions"][l]  # (1, heads, seq, seq) or (heads, seq, seq)
     if att_tensor.ndim == 4:
         att_tensor = att_tensor[0]
     att_np = att_tensor.numpy()  # (heads, seq, seq)
+    # take CLS->patches for selected head
+    vec = att_np[h, 0, 1:]
+    grid = state["grid_size"]
     if vec.shape[0] != grid * grid:
+        tmp = np.zeros(grid * grid, dtype=float)
         nmin = min(vec.shape[0], tmp.shape[0])
         tmp[:nmin] = vec[:nmin]
         vec = tmp
     grid_map = vec.reshape(grid, grid)
+    return make_focus_overlay(state["base_image"].convert("RGB"), grid_map, alpha=0.55)


+def advanced_update_rollout(state: Dict[str, Any]):
     if not state:
         return None
+    mats = [a.unsqueeze(0) if a.ndim == 3 else a for a in state["attentions"]]
+    R = compute_attention_rollout(mats)
     grid = state["grid_size"]
     rollout_cls = R[0, 1:]
     if rollout_cls.shape[0] != grid * grid:
+        tmp = np.zeros(grid * grid, dtype=float)
+        nmin = min(len(rollout_cls), tmp.shape[0])
         tmp[:nmin] = rollout_cls[:nmin]
         rollout_cls = tmp
     rollout_grid = rollout_cls.reshape(grid, grid)
+    return make_focus_overlay(state["base_image"].convert("RGB"), rollout_grid, alpha=0.6)


+def advanced_update_pca(state: Dict[str, Any], txt: str):
     if not state:
         return None
+    try:
+        layers = [int(x.strip()) for x in txt.split(",") if x.strip() != ""]
+    except Exception:
+        layers = [0, max(0, state["num_layers"] - 1)]
+    return pca_plot_from_hidden(state["hidden_states"], layers)
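One caveat: unlike the old `update_pca_layers`, this version does not clamp the parsed indices, so an out-of-range entry such as `99` would raise inside `pca_plot_from_hidden`. A more defensive parser (hypothetical helper, not in the commit):

```python
# Clamp each requested layer index into range instead of failing on bad input.
def parse_layer_indices(txt: str, n_states: int) -> list:
    try:
        raw = [int(x) for x in txt.split(",") if x.strip()]
    except ValueError:
        raw = [0, n_states - 1]
    return sorted({max(0, min(i, n_states - 1)) for i in raw}) or [0]
```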
+
+
+# ------------------- GRADIO UI -------------------
+with gr.Blocks(title="ViT Visualizer — Simple + Advanced") as demo:
+    gr.Markdown("# 👀 How Vision Transformers (ViT) See Images\n"
+                "Simple mode (story-style) + Advanced mode (inspect internals). Model: **google/vit-base-patch16-224**")
+
+    with gr.Tabs():
+        with gr.TabItem("Simple (for everyone)"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    img_input = gr.Image(label="Upload an image (photo / object)", type="pil")
+                    run_btn = gr.Button("🔎 Explain simply")
+                    gr.Markdown("Tip: use clear images of objects, animals, scenes for best examples.")
+                with gr.Column(scale=1):
+                    pass
+
+            gr.Markdown("### Step 1 — Chopped into patches")
+            step1 = gr.Image(label="Patch Grid (ViT chops image into 16×16 patches)")
+
+            gr.Markdown("### Step 2 — The model groups similar patches")
+            step2 = gr.Image(label="Clustered patches (colored blocks)")
+
+            gr.Markdown("### Step 3 — Patches talk to each other (simplified)")
+            step3 = gr.Image(label="Patch-to-Patch arrows")
+
+            gr.Markdown("### Step 4 — Model focus map and guess")
+            with gr.Row():
+                step4 = gr.Image(label="Focus map (where model looked most)")
+                preds_simple = gr.Textbox(label="Model guesses (Top-5)", lines=4)
+
+            explanation_simple = gr.Markdown()
+
+            run_btn.click(
+                fn=analyze_all,
+                inputs=[img_input, gr.State(True)],
+                outputs=[step1, step2, step3, step4, preds_simple, explanation_simple,
+                         gr.State(), gr.Plot(), gr.Textbox(), gr.Markdown(), gr.State()],
+            )
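Note that the five trailing outputs here are freshly constructed components acting as sinks for the advanced-mode values. A thin wrapper would avoid that (hypothetical alternative, not in the commit):

```python
# Let the simple tab list only the six components it actually displays.
def analyze_simple(img):
    out = analyze_all(img, True)
    return out[:6]  # patch grid, clusters, arrows, focus map, preds, explanation

# run_btn.click(fn=analyze_simple, inputs=[img_input],
#               outputs=[step1, step2, step3, step4, preds_simple, explanation_simple])
```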
+
+        with gr.TabItem("Advanced (inspect internals)"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    img_adv = gr.Image(label="Upload image for advanced view", type="pil")
+                    run_adv = gr.Button("Analyze (advanced)")
+                    gr.Markdown("Use the sliders to explore attention per layer and head.")
+                    layer_slider = gr.Slider(0, 11, value=11, step=1, label="Layer (0=shallow)")
+                    head_slider = gr.Slider(0, 11, value=0, step=1, label="Head index")
+                    rollout_btn = gr.Button("Refresh Rollout Overlay")
+                    pca_txt = gr.Textbox(label="PCA layers (comma separated)", value="0,6,11")
+                    pca_btn = gr.Button("Update PCA")
+                with gr.Column(scale=1):
+                    adv_attn = gr.Image(label="Attention overlay (layer/head CLS->patch)")
+                    adv_rollout = gr.Image(label="Attention rollout overlay (aggregated)")
+                    adv_pca = gr.Plot(label="PCA of patch embeddings")
+                    adv_preds = gr.Textbox(label="Top-5 predictions", lines=5)
+                    adv_explain = gr.Markdown()
+
+            state_box = gr.State()
+
+            # run advanced analysis
+            run_adv.click(
+                fn=analyze_all,
+                inputs=[img_adv, gr.State(False)],
+                outputs=[gr.Image(), gr.Image(), gr.Image(), gr.Image(), adv_preds, gr.Markdown(),
+                         adv_attn, adv_pca, adv_preds, adv_explain, state_box],
+            )
+
+            # update attention overlay with sliders
+            layer_slider.change(
+                fn=advanced_update_attention,
+                inputs=[state_box, layer_slider, head_slider],
+                outputs=[adv_attn],
+            )
+            head_slider.change(
+                fn=advanced_update_attention,
+                inputs=[state_box, layer_slider, head_slider],
+                outputs=[adv_attn],
+            )
+
+            rollout_btn.click(
+                fn=advanced_update_rollout,
+                inputs=[state_box],
+                outputs=[adv_rollout],
+            )
+
+            pca_btn.click(
+                fn=advanced_update_pca,
+                inputs=[state_box, pca_txt],
+                outputs=[adv_pca],
+            )

 demo.launch()