PraneshJs committed
Commit 4d08ba2 · verified · 1 Parent(s): 0deccad

Create app.py

Files changed (1)
  1. app.py +407 -0
app.py ADDED
@@ -0,0 +1,407 @@
# ==========================================================
# Vision Transformer (ViT) Visualizer — HF Space, CPU, Gradio 5
# - Model: google/vit-base-patch16-224
# - Shows:
#     * Original + patch grid (tokens)
#     * Attention heatmap overlay (CLS -> patches)
#     * PCA of patch embeddings
#     * Top-5 predictions
#     * Simple vs technical explanation
# - CPU friendly, uses only Gradio v5-safe features
# ==========================================================

import math
import warnings
from typing import Dict, Any, Optional, List, Tuple

import gradio as gr
import torch
import numpy as np
from PIL import Image, ImageDraw
from transformers import AutoImageProcessor, ViTForImageClassification
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

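# Assumed Space dependencies (a guess, not pinned or shipped in this commit):
# a requirements.txt along the lines of gradio, torch, transformers,
# scikit-learn, matplotlib, Pillow, numpy.
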
warnings.filterwarnings("ignore")

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_NAME = "google/vit-base-patch16-224"

VIT_MODEL = None
VIT_PROCESSOR = None


# ---------------------- MODEL LOADING ----------------------


def load_vit():
    """Load ViT + image processor once into global cache."""
    global VIT_MODEL, VIT_PROCESSOR
    if VIT_MODEL is not None and VIT_PROCESSOR is not None:
        return VIT_MODEL, VIT_PROCESSOR

    processor = AutoImageProcessor.from_pretrained(MODEL_NAME)
    model = ViTForImageClassification.from_pretrained(MODEL_NAME)

    # ensure we get attentions + hidden states
    model.config.output_attentions = True
    model.config.output_hidden_states = True

    model.to(DEVICE)
    model.eval()

    VIT_MODEL = model
    VIT_PROCESSOR = processor
    return model, processor

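# For orientation (facts about the default google/vit-base-patch16-224
# checkpoint, not code from this commit): 12 encoder layers, 12 heads,
# hidden size 768; 224/16 = 14, so 14*14 = 196 patch tokens plus [CLS]
# gives a sequence length of 197. With the flags set in load_vit(), a
# forward pass returns outputs.attentions as 12 tensors of shape
# (batch, 12, 197, 197) and outputs.hidden_states as 13 tensors
# (embedding output + 12 layers).
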

# ---------------------- VISUAL HELPERS ----------------------


def make_patch_grid_image(pil_img: Image.Image, patch_size: int = 16) -> Image.Image:
    """
    Resize to 224x224 and draw the patch grid (ViT cuts the image into
    16x16-pixel patches, i.e. a 14x14 grid at 224x224).
    """
    img = pil_img.convert("RGB").resize((224, 224))
    draw = ImageDraw.Draw(img)
    w, h = img.size
    for x in range(0, w, patch_size):
        draw.line((x, 0, x, h), fill=(0, 255, 0), width=1)
    for y in range(0, h, patch_size):
        draw.line((0, y, w, y), fill=(0, 255, 0), width=1)
    return img


def make_attention_overlay(
    base_img: Image.Image, heatmap_grid: np.ndarray
) -> Image.Image:
    """
    Overlay a CLS->patch attention heatmap on top of the 224x224 image.
    heatmap_grid: (G, G) attention values.
    """
    base = base_img.convert("RGB").resize((224, 224))
    g = heatmap_grid.astype(np.float32)

    if not np.any(g):
        g = np.zeros_like(g, dtype=np.float32)
    else:
        g -= g.min()
        maxv = g.max()
        if maxv > 0:
            g /= maxv

    # upscale to image size
    heat_img = Image.fromarray((g * 255).astype("uint8"), mode="L")
    heat_img = heat_img.resize((224, 224), Image.BILINEAR)
    heat = np.array(heat_img).astype(np.float32) / 255.0  # 0..1

    # simple blue->red colormap overlay
    r = heat
    g_c = np.zeros_like(heat)
    b = 1.0 - heat
    cam = np.stack([r, g_c, b], axis=-1)  # H,W,3

    base_np = np.array(base).astype(np.float32) / 255.0
    alpha = 0.45
    blended = (1 - alpha) * base_np + alpha * cam
    blended = np.clip(blended * 255.0, 0, 255).astype("uint8")
    return Image.fromarray(blended)

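# The blend above is a hand-rolled blue->red ramp. A perceptual matplotlib
# colormap would be a drop-in alternative (hedged sketch, not what the app does):
#   cam = plt.get_cmap("viridis")(heat)[..., :3]   # (224, 224, 3) RGB in 0..1
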

def make_pca_plot(patch_embeddings: np.ndarray):
    """
    patch_embeddings: (N_patches, hidden_dim)
    Returns a Matplotlib figure showing patches in 2D PCA space.
    """
    if patch_embeddings.shape[0] < 2:
        return None

    pca = PCA(n_components=2)
    comps = pca.fit_transform(patch_embeddings)  # (N, 2)

    fig, ax = plt.subplots(figsize=(4, 4))
    ax.scatter(comps[:, 0], comps[:, 1], s=20, alpha=0.8)
    ax.set_title("Patches in 2D (PCA of embeddings)")
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    return fig

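# Colouring the scatter points by patch index would make the spatial layout
# visible in the PCA. Hedged sketch (the variable `n` is hypothetical, it would
# be patch_embeddings.shape[0]; this line is not part of the app):
#   ax.scatter(comps[:, 0], comps[:, 1], c=np.arange(n), cmap="viridis", s=20)
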

# ---------------------- CORE ANALYSIS ----------------------


def analyze_vit(img: Optional[Image.Image], simple: bool):
    """
    Main function called by the Gradio button.
    Returns:
        - patch_grid_image
        - attention_overlay (default: last layer, head 0)
        - PCA figure
        - predictions table
        - explanation markdown
        - state dict (for attention slider updates)
    """
    if img is None:
        return (
            None,
            None,
            None,
            [],
            "⬆️ Please upload an image (e.g., a dog, a car, an object).",
            {},
        )

    model, processor = load_vit()

    # 1) Preprocess
    img_resized = img.convert("RGB").resize((224, 224))
    patch_grid_img = make_patch_grid_image(img_resized)

    inputs = processor(images=img_resized, return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # 2) Predictions (top-5) via a numerically stable softmax
    logits = outputs.logits[0].cpu().numpy()
    probs = np.exp(logits - logits.max())
    probs = probs / probs.sum()
    topk_idx = probs.argsort()[-5:][::-1]
    id2label = model.config.id2label
    preds_table = [
        [id2label[int(i)], float(probs[int(i)])] for i in topk_idx
    ]

    # 3) Patch embeddings from the last hidden state
    #    hidden_states[-1]: (batch, seq_len, hidden)
    hidden_last = outputs.hidden_states[-1][0].cpu().numpy()  # (seq, hidden)
    # seq layout: [CLS] + patches
    patch_emb = hidden_last[1:, :]  # (N_patches, hidden)
    pca_fig = make_pca_plot(patch_emb)

    # 4) Attention -> CLS-to-patch grid per layer/head
    attentions = outputs.attentions  # tuple of (batch, heads, seq, seq)
    num_layers = len(attentions)
    num_heads = attentions[0].shape[1] if num_layers > 0 else 0

    # ViT-base: 14x14 = 196 patches
    seq_len = attentions[0].shape[-1]  # 1 + N_patches
    n_patches = seq_len - 1
    grid_size = int(math.sqrt(n_patches))
    if grid_size * grid_size != n_patches:
        # fallback: approximate
        grid_size = int(round(math.sqrt(n_patches)))

    cls_to_patch = np.zeros(
        (num_layers, num_heads, grid_size, grid_size), dtype=np.float32
    )

    for l, att in enumerate(attentions):
        a = att[0].cpu().numpy()  # (heads, seq, seq)
        # CLS token index = 0, patches = 1..N
        cls_vec = a[:, 0, 1:]  # (heads, N_patches)
        # if shapes mismatch, pad/truncate
        if cls_vec.shape[1] != grid_size * grid_size:
            tmp = np.zeros((num_heads, grid_size * grid_size), dtype=np.float32)
            n_min = min(tmp.shape[1], cls_vec.shape[1])
            tmp[:, :n_min] = cls_vec[:, :n_min]
            cls_vec = tmp
        cls_grid = cls_vec.reshape(num_heads, grid_size, grid_size)
        cls_to_patch[l] = cls_grid

    # default attention overlay: last layer, head 0
    default_layer = num_layers - 1
    default_head = 0
    att_grid_default = cls_to_patch[default_layer, default_head]
    att_overlay = make_attention_overlay(img_resized, att_grid_default)

    # 5) Explanation
    explanation = build_explanation(simple, num_layers, num_heads, grid_size)

    # 6) State for slider updates
    state = {
        "cls_to_patch": cls_to_patch,
        "grid_size": grid_size,
        "num_layers": num_layers,
        "num_heads": num_heads,
        # we also keep a copy of the 224x224 base image in memory
        "base_image": img_resized,
    }

    return patch_grid_img, att_overlay, pca_fig, preds_table, explanation, state

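# Note: the overlays above use raw CLS->patch attention from one layer/head.
# A common alternative is "attention rollout" (Abnar & Zuidema, 2020), which
# accumulates attention across layers. Hedged sketch, not used by this app;
# `attentions`, `seq_len` and `grid_size` refer to the locals in analyze_vit():
#
#   rollout = np.eye(seq_len)
#   for att in attentions:
#       a = att[0].mean(0).cpu().numpy()        # average heads -> (seq, seq)
#       a = a + np.eye(seq_len)                 # account for residual connections
#       a = a / a.sum(axis=-1, keepdims=True)   # renormalise rows
#       rollout = a @ rollout
#   cls_rollout = rollout[0, 1:].reshape(grid_size, grid_size)
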

def build_explanation(
    simple: bool, num_layers: int, num_heads: int, grid_size: int
) -> str:
    if simple:
        return f"""
### 🧒 How a Vision Transformer (ViT) “sees” this image

1. **Cut into patches** – The image is sliced into **{grid_size}×{grid_size} = {grid_size * grid_size}** small squares.
2. **Turn patches into tokens** – Each patch becomes a little vector (like a word in a sentence).
3. **Add position info** – The model remembers where each patch came from (top-left, bottom-right, etc.).
4. **Look around with attention** – In each of the **{num_layers} layers**, the model lets every patch
   look at other patches using **self-attention** (with {num_heads} attention heads).
5. **Understand the whole image** – After many layers, ViT builds a global understanding of the scene
   and predicts what’s in the picture (top-5 shown on the right).

The heatmap shows **where the special [CLS] token is looking** in the last layer.
"""
    else:
        return f"""
### 🔬 Vision Transformer internals (technical view)

- The image is resized to 224×224 and split into **{grid_size}×{grid_size} = {grid_size * grid_size}** patches.
- Each patch is linearly projected into an embedding and combined with a positional embedding,
  forming a sequence of tokens: `[CLS] + P₁ + P₂ + … + Pₙ`.

- The ViT encoder has **{num_layers} transformer layers** with **{num_heads} attention heads** each.
  In every layer, **self-attention** mixes information across all patches, enabling long-range dependencies
  and global context.

- The [CLS] token aggregates information across patches and is passed through a classification head to produce
  logits over ImageNet-1k classes (we show the top-5).

- The attention heatmap we display is:
  - from **[CLS] → patch tokens**,
  - for a selected `(layer, head)`,
  - reshaped into a `{grid_size}×{grid_size}` grid and upsampled to image resolution for the overlay.

- The PCA plot shows the **final-layer patch embeddings** projected to 2D, giving an intuition of how
  ViT places patches in a semantic space.

Use the sliders to explore different layers and heads and see how the attention focus changes.
"""


# ---------------------- ATTENTION SLIDER UPDATE ----------------------


def update_attention_view(
    state: Dict[str, Any], layer_idx: int, head_idx: int
):
    """
    Called when the user moves the layer/head sliders.
    Returns a new attention overlay image.
    """
    if not state or "cls_to_patch" not in state:
        return None

    cls_to_patch = state["cls_to_patch"]
    base_img = state["base_image"]
    num_layers = state["num_layers"]
    num_heads = state["num_heads"]

    # clamp indices safely
    l = max(0, min(int(layer_idx), num_layers - 1))
    h = max(0, min(int(head_idx), num_heads - 1))

    grid = cls_to_patch[l, h]
    overlay = make_attention_overlay(base_img, grid)
    return overlay

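# A head-averaged view is sometimes easier to read than a single head.
# Hedged sketch of a variant (not wired into the UI):
#   grid = state["cls_to_patch"][l].mean(axis=0)   # average over heads -> (G, G)
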

# ---------------------- BUILD UI ----------------------


with gr.Blocks(title="Vision Transformer (ViT) Visualizer") as demo:
    gr.Markdown(
        """
# 🧠 Vision Transformer (ViT) — How It Sees the World

Upload an image and explore how a Vision Transformer:

- Cuts it into patches (tokens)
- Attends to different regions via self-attention
- Embeds patches into a high-dimensional space
- Predicts what’s in the image

Toggle the **simple / technical** explanation and move the sliders to change
which layer/head's attention you’re seeing.
"""
    )

    with gr.Row():
        with gr.Column(scale=1):
            img_in = gr.Image(
                label="Upload image",
                type="pil",
            )
            simple_ck = gr.Checkbox(
                label="Simple explanation (for everyone)",
                value=True,
            )
            run_btn = gr.Button("Run ViT Analysis", variant="primary")

            gr.Markdown(
                "Try images of animals, objects, or scenes. This uses `google/vit-base-patch16-224` (ImageNet-1k)."
            )

        with gr.Column(scale=1):
            preds_df = gr.Dataframe(
                headers=["Label", "Probability"],
                datatype=["str", "number"],
                interactive=False,
                label="Top-5 predictions",
            )
            explanation_md = gr.Markdown(label="Explanation")

    gr.Markdown("## 🧩 Tokens & Attention")

    with gr.Row():
        patch_img = gr.Image(
            label="Patches (16×16) — how ViT tokenizes the image",
            interactive=False,
        )
        attn_img = gr.Image(
            label="Attention heatmap (CLS → patches)",
            interactive=False,
        )

    with gr.Row():
        layer_slider = gr.Slider(
            minimum=0,
            maximum=11,  # ViT-base has 12 layers (0-11)
            step=1,
            value=11,
            label="Layer (0 = shallow, 11 = deepest)",
        )
        head_slider = gr.Slider(
            minimum=0,
            maximum=11,  # 12 attention heads (0-11)
            step=1,
            value=0,
            label="Head index",
        )

    gr.Markdown("## 🌌 Patch embeddings in 2D (PCA)")

    pca_plot = gr.Plot(label="Patches in embedding space (last layer)")

    state = gr.State()

    # main button: run the full analysis
    run_btn.click(
        fn=analyze_vit,
        inputs=[img_in, simple_ck],
        outputs=[patch_img, attn_img, pca_plot, preds_df, explanation_md, state],
    )

    # when the sliders change: update only the attention overlay
    layer_slider.change(
        fn=update_attention_view,
        inputs=[state, layer_slider, head_slider],
        outputs=[attn_img],
    )
    head_slider.change(
        fn=update_attention_view,
        inputs=[state, layer_slider, head_slider],
        outputs=[attn_img],
    )

demo.launch()
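# On a shared CPU Space, enabling the request queue is a common tweak
# (a suggestion, not part of this file as committed): demo.queue().launch()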