Update app.py
app.py (CHANGED)
@@ -1,30 +1,34 @@
+# app.py — DINOv3 two‑image patch similarity (click on Image 1 → show similarities on both images)
+# Runs on CPU or CUDA. No external image URLs.
+
 import os
+from typing import Tuple
+
 import numpy as np
 from PIL import Image, ImageDraw
 
-
 import torch
 import torch.nn.functional as F
 import torchvision.transforms.functional as TF
-from transformers import AutoModel
-
+from transformers import AutoModel  # trust_remote_code=True
 
 import gradio as gr
 
-
-#
+# ============================
+# Config
+# ============================
 DEFAULT_MODEL_ID = "facebook/dinov3-vits16plus-pretrain-lvd1689m"
 ALT_MODEL_ID = "facebook/dinov3-vith16plus-pretrain-lvd1689m"
 AVAILABLE_MODELS = [DEFAULT_MODEL_ID, ALT_MODEL_ID]
 
-
 PATCH_SIZE = 16
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
-IMAGENET_STD
+IMAGENET_STD = (0.229, 0.224, 0.225)
+# Many DINOv3 HF ports expose 1 [CLS] + 4 registers at the front
 N_SPECIAL_TOKENS = 5
 
-#
+# Robust colormap import (Matplotlib new/old)
 try:
     from matplotlib import colormaps as _mpl_colormaps
     def _get_cmap(name: str):
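Note: the try/except around the colormap lookup spans this hunk and the next. Reassembled, and with the new-style body (not shown in the diff) filled in under the assumption that the Matplotlib >= 3.5 registry is indexed by name, the whole block plausibly reads:

try:
    from matplotlib import colormaps as _mpl_colormaps  # Matplotlib >= 3.5
    def _get_cmap(name: str):
        return _mpl_colormaps[name]  # assumed body, elided by the diff
except Exception:
    from matplotlib import cm as _cm  # assumed legacy import; only _cm.get_cmap is visible
    def _get_cmap(name: str):
        return _cm.get_cmap(name)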
@@ -34,9 +38,9 @@ except Exception:
     def _get_cmap(name: str):
         return _cm.get_cmap(name)
 
-#
+# ============================
 # Model loading / cache
-#
+# ============================
 _model_cache = {}
 _current_model_id = None
 model = None
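Note: get_model's body falls between hunks and is never shown. Given _model_cache, DEVICE, and the AutoModel import above, a minimal sketch of the implied cache logic (an assumption, not the author's exact code):

def get_model(model_id: str):
    # Load each backbone once and reuse it across model switches
    if model_id not in _model_cache:
        m = AutoModel.from_pretrained(model_id, trust_remote_code=True)
        _model_cache[model_id] = m.to(DEVICE).eval()
    return _model_cache[model_id]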
@@ -60,12 +64,12 @@ def get_model(model_id: str):
 model = get_model(DEFAULT_MODEL_ID)
 _current_model_id = DEFAULT_MODEL_ID
 
-#
+# ============================
 # Helpers
-#
+# ============================
 
 def resize_to_grid(img: Image.Image, long_side: int, patch: int = PATCH_SIZE) -> torch.Tensor:
-    """Resize so max(h,w)=long_side with aspect kept; then pad
+    """Resize so max(h,w)=long_side with aspect kept; then pad to multiples of patch.
     Return CHW float tensor in [0,1]."""
     w, h = img.size
     scale = long_side / max(h, w)
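Note: the remainder of resize_to_grid lies outside the hunk. A sketch of a continuation consistent with its docstring, assuming bicubic resampling and zero padding on the bottom/right (both choices are guesses):

    new_w, new_h = max(patch, round(w * scale)), max(patch, round(h * scale))
    img = img.resize((new_w, new_h), Image.BICUBIC)
    t = TF.to_tensor(img)                    # CHW float tensor in [0, 1]
    pad_w = (patch - new_w % patch) % patch  # pad right to a multiple of patch
    pad_h = (patch - new_h % patch) % patch  # pad bottom likewise
    return F.pad(t, (0, pad_w, 0, pad_h))    # pad order: (left, right, top, bottom)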
@@ -89,7 +93,7 @@ def blend(base: Image.Image, heat: Image.Image, alpha: float = 0.55) -> Image.Image:
     out = Image.alpha_composite(base, heat)
     return out.convert("RGB")
 
-def draw_crosshair(img: Image.Image, x: int, y: int, radius: int = None) -> Image.Image:
+def draw_crosshair(img: Image.Image, x: int, y: int, radius: int | None = None) -> Image.Image:
     r = radius if radius is not None else max(2, PATCH_SIZE // 2)
     out = img.copy()
     draw = ImageDraw.Draw(out)
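Note: colorize is called on the upsampled similarity maps later in the file, but its definition is never shown. Because blend() alpha-composites its output, it must return an RGBA image; a typical min-max implementation (hypothetical) would be:

def colorize(sim: np.ndarray, cmap_name: str) -> Image.Image:
    # Scale similarities into [0, 1], then map through the chosen colormap
    s = (sim - sim.min()) / (sim.max() - sim.min() + 1e-8)
    rgba = (_get_cmap(cmap_name)(s) * 255).astype(np.uint8)  # (H, W, 4)
    return Image.fromarray(rgba, mode="RGBA")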
@@ -97,12 +101,11 @@ def draw_crosshair(img: Image.Image, x: int, y: int, radius: int = None) -> Image.Image:
     draw.line([(x, y - r), (x, y + r)], fill="red", width=3)
     return out
 
-#
+# ============================
 # Feature extraction
-#
+# ============================
 @torch.inference_mode()
 def extract_image_features(image_pil: Image.Image, target_long_side: int, mdl=None):
-    global model
     mdl = mdl or model
     t = resize_to_grid(image_pil, target_long_side, PATCH_SIZE)
     t_norm = TF.normalize(t, IMAGENET_MEAN, IMAGENET_STD).unsqueeze(0).to(DEVICE)
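Note: the forward pass between t_norm and the return (new lines 112-118) is elided. A sketch of what it plausibly computes, assuming a Hugging Face DINOv3 port whose last_hidden_state starts with N_SPECIAL_TOKENS prefix tokens and whose patch features are L2-normalized so the later matmuls act as cosine similarity:

    out = mdl(pixel_values=t_norm)
    tokens = out.last_hidden_state[0]                   # (N_special + Hp*Wp, D)
    X = F.normalize(tokens[N_SPECIAL_TOKENS:], dim=-1)  # unit-norm patch features
    _, H, W = t.shape
    Hp, Wp = H // PATCH_SIZE, W // PATCH_SIZE
    img_resized = TF.to_pil_image(t)                    # preview at model resolution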
@@ -116,16 +119,9 @@ def extract_image_features(image_pil: Image.Image, target_long_side: int, mdl=None):
 
     return {"X": X, "Hp": Hp, "Wp": Wp, "img": img_resized}
 
-#
+# ============================
 # Similarity utilities
-#
-
-def index_from_xy(x_pix: int, y_pix: int, Wp: int) -> int:
-    col = int(np.clip(x_pix // PATCH_SIZE, 0, Wp - 1))
-    row = int(np.clip(y_pix // PATCH_SIZE, 0, (x_pix*0 + y_pix) // PATCH_SIZE))  # placeholder row calc replaced below
-    return row * Wp + col
-
-# Corrected row/col computation helper
+# ============================
 
 def row_col_from_xy(x_pix: int, y_pix: int, Hp: int, Wp: int):
     col = int(np.clip(x_pix // PATCH_SIZE, 0, Wp - 1))
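Note: the deleted index_from_xy clamped its row against a meaningless bound (its own comment admits the placeholder), which row_col_from_xy replaces with a clamp against Hp - 1. A worked example of the mapping both images rely on:

PATCH = 16
x, y, Wp = 100, 50, 768 // PATCH   # example click on a 768-px-wide resized image
col, row = x // PATCH, y // PATCH  # -> 6, 3
idx = row * Wp + col               # -> 3 * 48 + 6 = 150, row-major token index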
@@ -134,19 +130,9 @@ def row_col_from_xy(x_pix: int, y_pix: int, Hp: int, Wp: int):
 
 @torch.inference_mode()
 def similarity_map(X: torch.Tensor, Hp: int, Wp: int, q_vec: torch.Tensor,
-                   img_h: int, img_w: int
+                   img_h: int, img_w: int):
     sims = torch.matmul(X, q_vec)  # (Hp*Wp)
     sim_map = sims.view(Hp, Wp)
-
-    if exclude_radius_patches > 0:
-        rr, cc = torch.meshgrid(
-            torch.arange(Hp, device=sims.device),
-            torch.arange(Wp, device=sims.device),
-            indexing="ij",
-        )
-        # We'll mask later at the click location per-image if needed
-        mask_template = (rr * 0)  # kept for API parity
-
     sim_up = F.interpolate(
         sim_map.unsqueeze(0).unsqueeze(0),
         size=(img_h, img_w),
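Note: the mode argument of this F.interpolate call sits between hunks (bilinear is the usual choice for smooth heatmaps). A standalone shape check of the same pipeline with made-up sizes:

import torch
import torch.nn.functional as F

Hp, Wp, D = 4, 6, 8
X = F.normalize(torch.randn(Hp * Wp, D), dim=-1)  # unit-norm rows -> cosine sims
q = X[0]
sim_map = (X @ q).view(Hp, Wp)                    # similarities on the patch grid
sim_up = F.interpolate(sim_map[None, None], size=(64, 96),
                       mode="bilinear", align_corners=False)[0, 0]
assert sim_up.shape == (64, 96)                   # one similarity value per pixel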
@@ -155,11 +141,11 @@ def similarity_map(X: torch.Tensor, Hp: int, Wp: int, q_vec: torch.Tensor,
     ).squeeze().detach().cpu().numpy()
     return sim_map, sim_up
 
-#
+# ============================
 # Core: click on image 1 → heatmaps on image 1 and image 2
-#
+# ============================
 
-def click_two_image_similarity(state1: dict, state2: dict, click_xy:
+def click_two_image_similarity(state1: dict, state2: dict, click_xy: Tuple[int, int],
                                exclude_radius_patches: int, alpha: float, cmap_name: str):
     if not state1 or not state2:
         return (None,)*6
@@ -170,13 +156,13 @@ def click_two_image_similarity(state1: dict, state2: dict, click_xy: Tuple[int, int],
     img1_w, img1_h = img1.size
     img2_w, img2_h = img2.size
 
-    #
+    # Query vector from clicked patch on image 1
     col = int(np.clip(click_xy[0] // PATCH_SIZE, 0, Wp1 - 1))
     row = int(np.clip(click_xy[1] // PATCH_SIZE, 0, Hp1 - 1))
     idx = row * Wp1 + col
     q = X1[idx]  # (D,)
 
-    # Similarity on image 1
+    # Similarity on image 1 (+ small exclusion mask around click if requested)
     sims1 = torch.matmul(X1, q)
     sim_map1 = sims1.view(Hp1, Wp1)
     if exclude_radius_patches > 0:
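Note: the body of the exclude_radius_patches branch is elided by the diff. A plausible sketch that suppresses patches within a Chebyshev radius of the clicked patch; the author's actual masking may differ:

        rr, cc = torch.meshgrid(torch.arange(Hp1), torch.arange(Wp1), indexing="ij")
        near = torch.maximum((rr - row).abs(), (cc - col).abs()) <= exclude_radius_patches
        sim_map1 = sim_map1.masked_fill(near, sim_map1.min().item())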
@@ -199,7 +185,7 @@ def click_two_image_similarity(state1: dict, state2: dict, click_xy: Tuple[int, int],
     overlay1 = blend(img1, heat1, alpha)
     marked1 = draw_crosshair(img1, int(click_xy[0]), int(click_xy[1]), radius=PATCH_SIZE // 2)
 
-# Similarity on image 2
+    # Similarity on image 2
     sims2 = torch.matmul(X2, q)
     sim_map2 = sims2.view(Hp2, Wp2)
     sim2_up = F.interpolate(
@@ -212,16 +198,14 @@ def click_two_image_similarity(state1: dict, state2: dict, click_xy: Tuple[int, int],
     heat2 = colorize(sim2_up, cmap_name)
     overlay2 = blend(img2, heat2, alpha)
 
-    return marked1, heat1, overlay1, heat2, overlay2, sim2_up.max()
+    return marked1, heat1, overlay1, heat2, overlay2, float(sim2_up.max())
 
-#
+# ============================
 # Gradio UI
-#
-
-
+# ============================
 with gr.Blocks(theme=gr.themes.Soft(), title="DINOv3 Two‑Image Patch Similarity") as demo:
     gr.Markdown("# DINOv3 Two‑Image Patch Similarity")
-    gr.Markdown("Upload two images
+    gr.Markdown("Upload two images and press **Process both**. Then click on **Image 1** to see similar regions on **both** images.")
 
     state1 = gr.State()
     state2 = gr.State()
@@ -229,17 +213,52 @@ with gr.Blocks(theme=gr.themes.Soft(), title="DINOv3 Two‑Image Patch Similarity") as demo:
     with gr.Row():
         with gr.Column():
             model_choice = gr.Dropdown(choices=AVAILABLE_MODELS, value=DEFAULT_MODEL_ID, label="Backbone")
-            target_long_side = gr.Slider(224, 1024, value=768, step=16, label="Resolution")
+            target_long_side = gr.Slider(224, 1024, value=768, step=16, label="Resolution (long side)")
             alpha = gr.Slider(0.0, 1.0, value=0.55, step=0.05, label="Overlay opacity")
             cmap = gr.Dropdown(["viridis", "magma", "plasma", "inferno", "turbo", "cividis"], value="viridis", label="Colormap")
-            exclude_r = gr.Slider(0, 10, value=0, step=1, label="Exclude radius")
+            exclude_r = gr.Slider(0, 10, value=0, step=1, label="Exclude radius (patches) for Image 1")
             start_btn = gr.Button("▶️ Process both", variant="primary")
 
         with gr.Column():
-            img1 = gr.Image(label="Image 1 (clickable)", type="pil", value=None)
-            img2 = gr.Image(label="Image 2", type="pil", value=None)
-
-    # (rest of app: outputs, event wiring, functions, unchanged)
+            img1 = gr.Image(label="Image 1 (clickable)", type="pil", sources=["upload", "clipboard"], value=None)
+            img2 = gr.Image(label="Image 2", type="pil", sources=["upload", "clipboard"], value=None)
 
-
-
+    with gr.Row():
+        with gr.Column():
+            marked1 = gr.Image(label="Image 1 — click marker / preview", interactive=False)
+            heat1 = gr.Image(label="Image 1 — similarity heatmap", interactive=False)
+            overlay1 = gr.Image(label="Image 1 — overlay", interactive=False)
+        with gr.Column():
+            heat2 = gr.Image(label="Image 2 — similarity heatmap", interactive=False)
+            overlay2 = gr.Image(label="Image 2 — overlay", interactive=False)
+            score2 = gr.Number(label="Image 2 — max similarity score", precision=6)
+
+    # Utilities
+    def _ensure_model(model_id: str):
+        global model, _current_model_id
+        if model_id != _current_model_id:
+            model = get_model(model_id)
+            _current_model_id = model_id
+
+    # Process button → extract features for both images and store in state
+    def _run_both(im1: Image.Image, im2: Image.Image, long_side: int, model_id: str, progress=gr.Progress(track_tqdm=False)):
+        if im1 is None or im2 is None:
+            raise gr.Error("Please provide both images before processing.")
+        _ensure_model(model_id)
+        progress(0, desc="Extracting features for Image 1…")
+        st1 = extract_image_features(im1, int(long_side), mdl=model)
+        progress(0.5, desc="Extracting features for Image 2…")
+        st2 = extract_image_features(im2, int(long_side), mdl=model)
+        progress(1, desc="Done")
+        # Show quick previews to confirm processing
+        return st1["img"], st2["img"], st1, st2
+
+    start_btn.click(
+        _run_both,
+        inputs=[img1, img2, target_long_side, model_choice],
+        outputs=[marked1, overlay2, state1, state2],
+    )
+
+    # Clicking on Image 1 → compute similarities on both images
+    def _on_click(st1, st2, a: float, m: str, excl: int, evt: gr.SelectData):
+        if not st1 or not st2 or evt is None:
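Note: the diff is cut off inside _on_click, so the click wiring is not shown. A hypothetical continuation in the style of the surrounding code (gr.SelectData.index does carry the click's pixel coordinates for a gr.Image):

    def _on_click(st1, st2, a: float, m: str, excl: int, evt: gr.SelectData):
        if not st1 or not st2 or evt is None:
            return (None,) * 6
        x, y = evt.index  # click position on the resized Image 1
        return click_two_image_similarity(st1, st2, (x, y), int(excl), float(a), m)

    img1.select(
        _on_click,
        inputs=[state1, state2, alpha, cmap, exclude_r],
        outputs=[marked1, heat1, overlay1, heat2, overlay2, score2],
    )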
|