Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
jichao Claude Opus 4.6 committed on
Commit ·
48207c2
1
Parent(s): 7064790
add multi_fps_k32 output for late interaction re-ranking
Browse filesCo-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- .gitattributes +1 -3
- app.py +104 -0
.gitattributes
CHANGED
|
@@ -32,6 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 32 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
-
|
| 37 |
-
.claude/
|
|
|
|
| 32 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
app.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
|
|
|
| 3 |
import timm
|
| 4 |
from torchvision import transforms
|
| 5 |
from PIL import Image
|
| 6 |
import numpy as np
|
| 7 |
import os
|
|
|
|
| 8 |
|
| 9 |
# --- Model Configuration ---
|
| 10 |
DEFAULT_MODEL_NAME = "dino-vits-mae-100epoch-1217-1220-e50"
|
|
@@ -118,6 +120,92 @@ def get_preprocess(model_name: str):
|
|
| 118 |
])
|
| 119 |
return transforms.Compose(transforms_list)
|
| 120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
# --- Embedding Function ---
|
| 122 |
def get_embedding(image_pil: Image.Image, model_name: str, embedding_method: str = 'cls') -> dict:
|
| 123 |
"""Preprocesses an image, extracts embedding using the specified method for the
|
|
@@ -128,6 +216,7 @@ def get_embedding(image_pil: Image.Image, model_name: str, embedding_method: str
|
|
| 128 |
"model_name": model_name,
|
| 129 |
"embedding_method": embedding_method,
|
| 130 |
"data": None,
|
|
|
|
| 131 |
"message": "Error: Please upload an image."
|
| 132 |
}
|
| 133 |
if model_name not in MODEL_CONFIGS:
|
|
@@ -135,6 +224,7 @@ def get_embedding(image_pil: Image.Image, model_name: str, embedding_method: str
|
|
| 135 |
"model_name": model_name,
|
| 136 |
"embedding_method": embedding_method,
|
| 137 |
"data": None,
|
|
|
|
| 138 |
"message": f"Error: Unknown model name '{model_name}'."
|
| 139 |
}
|
| 140 |
|
|
@@ -151,6 +241,7 @@ def get_embedding(image_pil: Image.Image, model_name: str, embedding_method: str
|
|
| 151 |
"model_name": model_name,
|
| 152 |
"embedding_method": embedding_method,
|
| 153 |
"data": None,
|
|
|
|
| 154 |
"message": error_msg
|
| 155 |
}
|
| 156 |
|
|
@@ -208,12 +299,23 @@ def get_embedding(image_pil: Image.Image, model_name: str, embedding_method: str
|
|
| 208 |
"model_name": model_name,
|
| 209 |
"embedding_method": embedding_method,
|
| 210 |
"data": None,
|
|
|
|
| 211 |
"message": f"Error: Unexpected feature output shape from model '{model_name}'. Check logs."
|
| 212 |
}
|
| 213 |
|
| 214 |
|
| 215 |
normalized_embedding = torch.nn.functional.normalize(embedding, p=2, dim=1)
|
| 216 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
embedding_list = normalized_embedding.squeeze().cpu().numpy().tolist()
|
| 218 |
if not isinstance(embedding_list, list):
|
| 219 |
embedding_list = [embedding_list] # Ensure it's always a list
|
|
@@ -222,6 +324,7 @@ def get_embedding(image_pil: Image.Image, model_name: str, embedding_method: str
|
|
| 222 |
"model_name": model_name,
|
| 223 |
"embedding_method": embedding_method,
|
| 224 |
"data": embedding_list,
|
|
|
|
| 225 |
"message": "Success"
|
| 226 |
}
|
| 227 |
|
|
@@ -234,6 +337,7 @@ def get_embedding(image_pil: Image.Image, model_name: str, embedding_method: str
|
|
| 234 |
"model_name": model_name,
|
| 235 |
"embedding_method": embedding_method,
|
| 236 |
"data": None,
|
|
|
|
| 237 |
"message": error_msg
|
| 238 |
}
|
| 239 |
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
import timm
|
| 5 |
from torchvision import transforms
|
| 6 |
from PIL import Image
|
| 7 |
import numpy as np
|
| 8 |
import os
|
| 9 |
+
from typing import Tuple
|
| 10 |
|
| 11 |
# --- Model Configuration ---
|
| 12 |
DEFAULT_MODEL_NAME = "dino-vits-mae-100epoch-1217-1220-e50"
|
|
|
|
| 120 |
])
|
| 121 |
return transforms.Compose(transforms_list)
|
| 122 |
|
| 123 |
+
# --- Multi-token FPS Aggregation ---
|
| 124 |
+
|
| 125 |
+
def select_seeds_fps(patch_tokens: torch.Tensor, k: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
    """Pick ``k`` seed tokens per image via farthest-point sampling.

    Sampling happens in cosine-distance space: the first seed is the token
    with the largest L2 norm, and each subsequent seed is the token whose
    minimum cosine distance to every already-chosen seed is largest.

    Args:
        patch_tokens: (N, P, D) patch token embeddings.
        k: number of seeds to select (assumed <= P; caller clamps).
        device: device used for the auxiliary index tensors.

    Returns:
        Tuple ``(seed_indices, seed_tokens)`` with shapes (N, K) and (N, K, D).
    """
    batch, _, _ = patch_tokens.shape

    # Pairwise cosine distance between all tokens of each image.
    unit = F.normalize(patch_tokens, dim=-1)
    pairwise_dist = 1.0 - torch.bmm(unit, unit.transpose(1, 2))  # (N, P, P)

    rows = torch.arange(batch, device=device)

    # Seed 0: the most salient token, i.e. the one with the largest norm.
    first = patch_tokens.norm(dim=-1).argmax(dim=-1)  # (N,)
    chosen = [first]
    nearest_dist = pairwise_dist[rows, first]  # (N, P): dist to nearest chosen seed

    # Greedily add the token farthest from everything chosen so far.
    for _ in range(k - 1):
        far = nearest_dist.argmax(dim=-1)  # (N,)
        chosen.append(far)
        nearest_dist = torch.minimum(nearest_dist, pairwise_dist[rows, far])

    seed_indices = torch.stack(chosen, dim=1)  # (N, K)
    gather_rows = rows.unsqueeze(1).expand(-1, k)
    seed_tokens = patch_tokens[gather_rows, seed_indices]  # (N, K, D)
    return seed_indices, seed_tokens
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def assign_hard_top1(
    patch_tokens: torch.Tensor,
    seed_tokens: torch.Tensor,
    seed_indices: torch.Tensor,
    device: torch.device,
) -> torch.Tensor:
    """Assign each non-seed token to its single nearest seed (binary weights).

    Args:
        patch_tokens: (N, P, D) patch token embeddings.
        seed_tokens: (N, K, D) tokens previously selected as seeds.
        seed_indices: (N, K) indices of the seeds within the P patches.
        device: device on which the index/weight tensors are created.

    Returns:
        W: (N, P, K) one-hot assignment matrix. Rows belonging to seed
        tokens are zeroed so a seed is never aggregated into itself (or
        into another seed).
    """
    N, num_patches, D = patch_tokens.shape
    K = seed_tokens.shape[1]

    # Cosine similarity between every patch token and every seed.
    p_norm = F.normalize(patch_tokens, dim=-1)
    s_norm = F.normalize(seed_tokens, dim=-1)
    cos_sim = torch.bmm(p_norm, s_norm.transpose(1, 2))  # (N, P, K)

    nearest = cos_sim.argmax(dim=-1)  # (N, P): index of closest seed per token

    # One-hot weights: W[n, p, nearest[n, p]] = 1.
    W = torch.zeros(N, num_patches, K, device=device)
    n_idx = torch.arange(N, device=device).unsqueeze(1).expand(-1, num_patches)
    p_idx = torch.arange(num_patches, device=device).unsqueeze(0).expand(N, -1)
    W[n_idx, p_idx, nearest] = 1.0

    # Zero the rows of the seed tokens themselves — all K seeds at once via
    # broadcasting advanced indexing (replaces the original per-seed Python
    # loop with one device-side op; behavior is identical).
    batch_arange = torch.arange(N, device=device).unsqueeze(1)  # (N, 1)
    W[batch_arange, seed_indices] = 0.0  # (N, 1) x (N, K) -> rows [n, seed, :]

    return W
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def aggregate_tokens(
    patch_tokens: torch.Tensor,
    seed_tokens: torch.Tensor,
    W: torch.Tensor,
) -> torch.Tensor:
    """Fold assigned tokens into their seeds and L2-normalize the result.

    Each output token is ``seed + mean(assigned non-seed tokens)``. A seed
    with no assigned tokens passes through unchanged: its weighted sum is
    zero and the epsilon clamp only keeps the division finite.

    Args:
        patch_tokens: (N, P, D) patch token embeddings.
        seed_tokens: (N, K, D) selected seed tokens.
        W: (N, P, K) binary assignment weights.

    Returns:
        (N, K, D) L2-normalized aggregated tokens.
    """
    assigned_sum = torch.einsum('nik,nid->nkd', W, patch_tokens)  # (N, K, D)
    counts = W.sum(dim=1).unsqueeze(-1).clamp(min=1e-8)           # (N, K, 1)
    merged = seed_tokens + assigned_sum / counts
    return F.normalize(merged, dim=-1)
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def compute_multi_fps(patch_tokens: torch.Tensor, k: int = 32) -> torch.Tensor:
    """Run the full FPS pipeline: seed selection -> assignment -> aggregation.

    Args:
        patch_tokens: (N, P, D) patch token embeddings.
        k: number of aggregated output tokens (default 32).

    Returns:
        (N, K, D) L2-normalized aggregated tokens.
    """
    dev = patch_tokens.device
    idx, seeds = select_seeds_fps(patch_tokens, k, dev)
    weights = assign_hard_top1(patch_tokens, seeds, idx, dev)
    return aggregate_tokens(patch_tokens, seeds, weights)
|
| 207 |
+
|
| 208 |
+
|
| 209 |
# --- Embedding Function ---
|
| 210 |
def get_embedding(image_pil: Image.Image, model_name: str, embedding_method: str = 'cls') -> dict:
|
| 211 |
"""Preprocesses an image, extracts embedding using the specified method for the
|
|
|
|
| 216 |
"model_name": model_name,
|
| 217 |
"embedding_method": embedding_method,
|
| 218 |
"data": None,
|
| 219 |
+
"multi_fps_k32": None,
|
| 220 |
"message": "Error: Please upload an image."
|
| 221 |
}
|
| 222 |
if model_name not in MODEL_CONFIGS:
|
|
|
|
| 224 |
"model_name": model_name,
|
| 225 |
"embedding_method": embedding_method,
|
| 226 |
"data": None,
|
| 227 |
+
"multi_fps_k32": None,
|
| 228 |
"message": f"Error: Unknown model name '{model_name}'."
|
| 229 |
}
|
| 230 |
|
|
|
|
| 241 |
"model_name": model_name,
|
| 242 |
"embedding_method": embedding_method,
|
| 243 |
"data": None,
|
| 244 |
+
"multi_fps_k32": None,
|
| 245 |
"message": error_msg
|
| 246 |
}
|
| 247 |
|
|
|
|
| 299 |
"model_name": model_name,
|
| 300 |
"embedding_method": embedding_method,
|
| 301 |
"data": None,
|
| 302 |
+
"multi_fps_k32": None,
|
| 303 |
"message": f"Error: Unexpected feature output shape from model '{model_name}'. Check logs."
|
| 304 |
}
|
| 305 |
|
| 306 |
|
| 307 |
normalized_embedding = torch.nn.functional.normalize(embedding, p=2, dim=1)
|
| 308 |
|
| 309 |
+
# Compute multi-token FPS aggregation (32 tokens)
|
| 310 |
+
multi_fps_data = None
|
| 311 |
+
if len(features.shape) == 3 and features.shape[1] > 1:
|
| 312 |
+
patch_tokens = features[:, 1:] # (B, num_patches, D)
|
| 313 |
+
num_patches = patch_tokens.shape[1]
|
| 314 |
+
k = min(32, num_patches)
|
| 315 |
+
if k > 0:
|
| 316 |
+
agg_tokens = compute_multi_fps(patch_tokens, k=k) # (B, K, D)
|
| 317 |
+
multi_fps_data = agg_tokens.squeeze(0).cpu().numpy().tolist()
|
| 318 |
+
|
| 319 |
embedding_list = normalized_embedding.squeeze().cpu().numpy().tolist()
|
| 320 |
if not isinstance(embedding_list, list):
|
| 321 |
embedding_list = [embedding_list] # Ensure it's always a list
|
|
|
|
| 324 |
"model_name": model_name,
|
| 325 |
"embedding_method": embedding_method,
|
| 326 |
"data": embedding_list,
|
| 327 |
+
"multi_fps_k32": multi_fps_data,
|
| 328 |
"message": "Success"
|
| 329 |
}
|
| 330 |
|
|
|
|
| 337 |
"model_name": model_name,
|
| 338 |
"embedding_method": embedding_method,
|
| 339 |
"data": None,
|
| 340 |
+
"multi_fps_k32": None,
|
| 341 |
"message": error_msg
|
| 342 |
}
|
| 343 |
|