Update app.py
app.py CHANGED
@@ -1,31 +1,216 @@
-# The error comes from trying to set a remote image URL (`value=...`) in `gr.Image`, which Gradio tries to download and cache.
-# In Spaces with restricted networking, this fails with 404. Fix: use `value=None` or a local placeholder.
-
-import os
-import numpy as np
-from PIL import Image, ImageDraw
-
-import torch
 import torch.nn.functional as F
 import torchvision.transforms.functional as TF
-from transformers import AutoModel
 
 import gradio as gr
 
 # --- config
 DEFAULT_MODEL_ID = "facebook/dinov3-vits16plus-pretrain-lvd1689m"
 ALT_MODEL_ID = "facebook/dinov3-vith16plus-pretrain-lvd1689m"
 AVAILABLE_MODELS = [DEFAULT_MODEL_ID, ALT_MODEL_ID]
 
 PATCH_SIZE = 16
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
-IMAGENET_STD
 N_SPECIAL_TOKENS = 5
 
-#
-
 
 with gr.Blocks(theme=gr.themes.Soft(), title="DINOv3 Two‑Image Patch Similarity") as demo:
     gr.Markdown("# DINOv3 Two‑Image Patch Similarity")
     gr.Markdown("Upload two images, process, then click on image 1 to see similarities on both.")
@@ -49,4 +234,4 @@ with gr.Blocks(theme=gr.themes.Soft(), title="DINOv3 Two‑Image Patch Similarity") as demo:
     # (rest of app: outputs, event wiring, functions, unchanged)
 
 if __name__ == "__main__":
-    demo.launch()
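
The removed header comments describe why a remote default image breaks on Spaces. A minimal sketch of the fix they suggest follows; the component name and URL are illustrative only, not taken from this Space:

    # Fails on Spaces with restricted networking: Gradio tries to download
    # and cache the remote URL at startup, which can surface as a 404.
    # img1_in = gr.Image(value="https://example.com/sample.jpg", type="pil", label="Image 1")

    # Safer: start empty (or point at a file bundled with the Space).
    img1_in = gr.Image(value=None, type="pil", label="Image 1")
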
+import os
+import numpy as np
+from PIL import Image, ImageDraw
+
+import torch
 import torch.nn.functional as F
 import torchvision.transforms.functional as TF
+from transformers import AutoModel  # trust_remote_code=True
+
 
 import gradio as gr
 
+
 # --- config
 DEFAULT_MODEL_ID = "facebook/dinov3-vits16plus-pretrain-lvd1689m"
 ALT_MODEL_ID = "facebook/dinov3-vith16plus-pretrain-lvd1689m"
 AVAILABLE_MODELS = [DEFAULT_MODEL_ID, ALT_MODEL_ID]
 
+
 PATCH_SIZE = 16
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
 N_SPECIAL_TOKENS = 5
 
+# --- robust colormap import (matplotlib new/old)
+try:
+    from matplotlib import colormaps as _mpl_colormaps
+    def _get_cmap(name: str):
+        return _mpl_colormaps[name]
+except Exception:
+    import matplotlib.cm as _cm
+    def _get_cmap(name: str):
+        return _cm.get_cmap(name)
+
+# ----------------------------
+# Model loading / cache
+# ----------------------------
+_model_cache = {}
+_current_model_id = None
+model = None
+
+def load_model_from_hub(model_id: str):
+    print(f"Loading model '{model_id}' from HF Hub…")
+    token = os.environ.get("HF_TOKEN")
+    mdl = AutoModel.from_pretrained(model_id, token=token, trust_remote_code=True)
+    mdl.to(DEVICE).eval()
+    print(f"✅ Loaded '{model_id}' on {DEVICE}")
+    return mdl
+
+def get_model(model_id: str):
+    if model_id in _model_cache:
+        return _model_cache[model_id]
+    mdl = load_model_from_hub(model_id)
+    _model_cache[model_id] = mdl
+    return mdl
+
+# Load default at startup
+model = get_model(DEFAULT_MODEL_ID)
+_current_model_id = DEFAULT_MODEL_ID
+
+# ----------------------------
+# Helpers
+# ----------------------------
+
+def resize_to_grid(img: Image.Image, long_side: int, patch: int = PATCH_SIZE) -> torch.Tensor:
+    """Resize so max(h, w) = long_side keeping aspect ratio, then round each side
+    up to a multiple of `patch` (via resize, not padding). Return CHW float tensor in [0, 1]."""
+    w, h = img.size
+    scale = long_side / max(h, w)
+    new_h = max(patch, int(round(h * scale)))
+    new_w = max(patch, int(round(w * scale)))
+    new_h = ((new_h + patch - 1) // patch) * patch
+    new_w = ((new_w + patch - 1) // patch) * patch
+    return TF.to_tensor(TF.resize(img.convert("RGB"), (new_h, new_w)))
+
+def colorize(sim_map_up: np.ndarray, cmap_name: str = "viridis") -> Image.Image:
+    x = sim_map_up.astype(np.float32)
+    x = (x - x.min()) / (x.max() - x.min() + 1e-6)
+    rgb = (_get_cmap(cmap_name)(x)[..., :3] * 255).astype(np.uint8)
+    return Image.fromarray(rgb)
+
+def blend(base: Image.Image, heat: Image.Image, alpha: float = 0.55) -> Image.Image:
+    base = base.convert("RGBA")
+    heat = heat.convert("RGBA")
+    a = Image.new("L", heat.size, int(255 * alpha))
+    heat.putalpha(a)
+    out = Image.alpha_composite(base, heat)
+    return out.convert("RGB")
+
+def draw_crosshair(img: Image.Image, x: int, y: int, radius: int = None) -> Image.Image:
+    r = radius if radius is not None else max(2, PATCH_SIZE // 2)
+    out = img.copy()
+    draw = ImageDraw.Draw(out)
+    draw.line([(x - r, y), (x + r, y)], fill="red", width=3)
+    draw.line([(x, y - r), (x, y + r)], fill="red", width=3)
+    return out
+
+# ----------------------------
+# Feature extraction
+# ----------------------------
+@torch.inference_mode()
+def extract_image_features(image_pil: Image.Image, target_long_side: int, mdl=None):
+    global model
+    mdl = mdl or model
+    t = resize_to_grid(image_pil, target_long_side, PATCH_SIZE)
+    t_norm = TF.normalize(t, IMAGENET_MEAN, IMAGENET_STD).unsqueeze(0).to(DEVICE)
+    _, _, H, W = t_norm.shape
+    Hp, Wp = H // PATCH_SIZE, W // PATCH_SIZE
+
+    outputs = mdl(t_norm)
+    patch_emb = outputs.last_hidden_state.squeeze(0)[N_SPECIAL_TOKENS:, :]  # skip special tokens
+    X = F.normalize(patch_emb, p=2, dim=-1)  # (Hp*Wp, D), L2 norm for cosine
+    img_resized = TF.to_pil_image(t)
+
+    return {"X": X, "Hp": Hp, "Wp": Wp, "img": img_resized}
+
+# ----------------------------
+# Similarity utilities
+# ----------------------------
+
+def index_from_xy(x_pix: int, y_pix: int, Wp: int) -> int:
+    # Flat patch index for a clicked pixel. The row is not clamped here because Hp
+    # is not available; use row_col_from_xy() below when the grid height is known.
+    col = int(np.clip(x_pix // PATCH_SIZE, 0, Wp - 1))
+    row = max(0, y_pix // PATCH_SIZE)
+    return row * Wp + col
+
+# Row/col helper with both axes clamped to the patch grid
+
+def row_col_from_xy(x_pix: int, y_pix: int, Hp: int, Wp: int):
+    col = int(np.clip(x_pix // PATCH_SIZE, 0, Wp - 1))
+    row = int(np.clip(y_pix // PATCH_SIZE, 0, Hp - 1))
+    return row, col
+
+@torch.inference_mode()
+def similarity_map(X: torch.Tensor, Hp: int, Wp: int, q_vec: torch.Tensor,
+                   img_h: int, img_w: int, exclude_radius_patches: int = 1):
+    sims = torch.matmul(X, q_vec)  # (Hp*Wp,)
+    sim_map = sims.view(Hp, Wp)
+
+    if exclude_radius_patches > 0:
+        rr, cc = torch.meshgrid(
+            torch.arange(Hp, device=sims.device),
+            torch.arange(Wp, device=sims.device),
+            indexing="ij",
+        )
+        # We'll mask later at the click location per-image if needed
+        mask_template = (rr * 0)  # kept for API parity
+
+    sim_up = F.interpolate(
+        sim_map.unsqueeze(0).unsqueeze(0),
+        size=(img_h, img_w),
+        mode="bicubic",
+        align_corners=False,
+    ).squeeze().detach().cpu().numpy()
+    return sim_map, sim_up
+
+# ----------------------------
+# Core: click on image 1 → heatmaps on image 1 and image 2
+# ----------------------------
+
+def click_two_image_similarity(state1: dict, state2: dict, click_xy: tuple[int, int],
+                               exclude_radius_patches: int, alpha: float, cmap_name: str):
+    if not state1 or not state2:
+        return (None,) * 6
+
+    X1, Hp1, Wp1, img1 = state1["X"], state1["Hp"], state1["Wp"], state1["img"]
+    X2, Hp2, Wp2, img2 = state2["X"], state2["Hp"], state2["Wp"], state2["img"]
+
+    img1_w, img1_h = img1.size
+    img2_w, img2_h = img2.size
+
+    # Build query vector from clicked patch on image 1
+    col = int(np.clip(click_xy[0] // PATCH_SIZE, 0, Wp1 - 1))
+    row = int(np.clip(click_xy[1] // PATCH_SIZE, 0, Hp1 - 1))
+    idx = row * Wp1 + col
+    q = X1[idx]  # (D,)
+
+    # Similarity on image 1
+    sims1 = torch.matmul(X1, q)
+    sim_map1 = sims1.view(Hp1, Wp1)
+    if exclude_radius_patches > 0:
+        rr, cc = torch.meshgrid(
+            torch.arange(Hp1, device=sims1.device),
+            torch.arange(Wp1, device=sims1.device),
+            indexing="ij",
+        )
+        mask1 = (torch.abs(rr - row) <= exclude_radius_patches) & (torch.abs(cc - col) <= exclude_radius_patches)
+        # Fill with the cosine lower bound rather than -inf so the bicubic upsample
+        # and the min/max normalisation in colorize() stay finite.
+        sim_map1 = sim_map1.masked_fill(mask1, -1.0)
+
+    sim1_up = F.interpolate(
+        sim_map1.unsqueeze(0).unsqueeze(0),
+        size=(img1_h, img1_w),
+        mode="bicubic",
+        align_corners=False,
+    ).squeeze().detach().cpu().numpy()
+
+    heat1 = colorize(sim1_up, cmap_name)
+    overlay1 = blend(img1, heat1, alpha)
+    marked1 = draw_crosshair(img1, int(click_xy[0]), int(click_xy[1]), radius=PATCH_SIZE // 2)
+
+    # Similarity on image 2 (no exclusion mask, since click is on image 1)
+    sims2 = torch.matmul(X2, q)
+    sim_map2 = sims2.view(Hp2, Wp2)
+    sim2_up = F.interpolate(
+        sim_map2.unsqueeze(0).unsqueeze(0),
+        size=(img2_h, img2_w),
+        mode="bicubic",
+        align_corners=False,
+    ).squeeze().detach().cpu().numpy()
+
+    heat2 = colorize(sim2_up, cmap_name)
+    overlay2 = blend(img2, heat2, alpha)
+
+    return marked1, heat1, overlay1, heat2, overlay2, sim2_up.max().item()
 
+# ----------------------------
+# Gradio UI
+# ----------------------------
 with gr.Blocks(theme=gr.themes.Soft(), title="DINOv3 Two‑Image Patch Similarity") as demo:
     gr.Markdown("# DINOv3 Two‑Image Patch Similarity")
     gr.Markdown("Upload two images, process, then click on image 1 to see similarities on both.")

     # (rest of app: outputs, event wiring, functions, unchanged)
 
 if __name__ == "__main__":
+    demo.launch()
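
The second hunk notes that the outputs, event wiring, and remaining functions are unchanged and not shown. For orientation, a plausible wiring of the click handler is sketched below; every component and state name is hypothetical, and only `click_two_image_similarity` plus the standard Gradio `.select` / `gr.SelectData` API come from the code above:

    # Hypothetical wiring sketch (not the Space's actual code).
    state1 = gr.State()  # filled by a "Process" step that calls extract_image_features(...)
    state2 = gr.State()

    def on_click(s1, s2, excl, alpha, cmap, evt: gr.SelectData):
        # evt.index holds the (x, y) pixel coordinates of the click on image 1
        return click_two_image_similarity(s1, s2, tuple(evt.index), excl, alpha, cmap)

    img1_display.select(
        on_click,
        inputs=[state1, state2, excl_slider, alpha_slider, cmap_dropdown],
        outputs=[img1_marked, heat1_out, overlay1_out, heat2_out, overlay2_out, max_sim_out],
    )

The `gr.SelectData` parameter does not need to be listed in `inputs`; Gradio injects it automatically when the handler declares it.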