Spaces:
Sleeping
Sleeping
Upload 11 files
Browse files- app.py +102 -0
- bovw-codebook.pkl +3 -0
- bovw_embedding.py +32 -0
- clip_embedding.py +16 -0
- dino_embedding.py +21 -0
- efficientnet_embedding.py +21 -0
- histogram_embedding.py +21 -0
- idf.npy +3 -0
- requirements.txt +0 -0
- resnet_embedding.py +23 -0
- vit_embedding.py +18 -0
app.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
from PIL import Image
|
| 5 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
+
|
| 7 |
+
from clip_embedding import Clip
|
| 8 |
+
from efficientnet_embedding import EfficientNet
|
| 9 |
+
from vit_embedding import Vit
|
| 10 |
+
from resnet_embedding import Resnet
|
| 11 |
+
from dino_embedding import Dino
|
| 12 |
+
from histogram_embedding import cosine, get_embedding
|
| 13 |
+
from bovw_embedding import Bovw
|
| 14 |
+
|
| 15 |
+
# One shared instance of each embedding backend. Constructors load model
# weights once at startup so per-request work is inference only.
resnet = Resnet()
vit = Vit()
efficientnet = EfficientNet()
bovw = Bovw()
dino = Dino()
clip = Clip()
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def get_image_embedding(image: Image.Image, name):
    """Compute an embedding vector for *image* with the backend named *name*.

    Torch-based backends return tensors, so their results are moved to CPU
    and converted to numpy arrays; the Histogram and BOVW backends already
    produce numpy arrays. Any unrecognised name falls back to CLIP.
    """
    if name == "ResNet":
        return resnet.get_embedding(image).cpu().numpy()
    if name == "VIT":
        return vit.get_embedding(image).cpu().numpy()
    if name == "EfficientNet":
        return efficientnet.get_embedding(image).cpu().numpy()
    if name == "Histogram":
        return get_embedding(image)
    if name == "BOVW":
        return bovw.get_embedding(image)
    if name == "DINO":
        return dino.get_embedding(image).cpu().numpy()
    # Default backend (covers "CLIP" and anything unexpected).
    return clip.get_embedding(image).cpu().numpy()
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def compare_images(main_img, compare_imgs, name):
    """Score every image in *compare_imgs* against *main_img*.

    Returns a list of (image, percentage) tuples sorted by descending
    similarity. Backends that emit flat 1-D vectors (Histogram, ResNet,
    BOVW) are compared with the local `cosine` helper; the 2-D (1, dim)
    outputs of the other backends go through sklearn's cosine_similarity.

    Fix: the original duplicated the whole embed-and-score loop in each
    branch; the loop is now written once and only the scorer differs.
    """
    flat_vectors = name in ("Histogram", "ResNet", "BOVW")
    main_emb = get_image_embedding(main_img, name)

    results = []
    for img in compare_imgs:
        emb = get_image_embedding(img, name)
        if flat_vectors:
            score = cosine(main_emb, emb)
        else:
            score = cosine_similarity(main_emb, emb)[0][0]
        results.append((img, round(score * 100, 2)))

    results.sort(key=lambda x: x[1], reverse=True)

    return results
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
model_list = ["CLIP", "VIT", "EfficientNet", "ResNet", "DINO", "Histogram", "BOVW"]


with gr.Blocks() as demo:
    # NOTE(review): this Tab is instantiated but never entered with `with`,
    # so it does not actually group the components below — confirm whether a
    # tabbed layout was intended.
    gr.Tab("Image Embedding")
    gr.Markdown("# Image Similarity Finder")
    gr.Markdown(
        "Upload a main image and compare it to others. Results show similarity percentages using embeddings.")

    with gr.Row():
        with gr.Column():
            main_image = gr.Image(type="pil", label="Main Image")
            compare_images_input = gr.File(file_count="multiple", file_types=["image"], label="Comparison Images")
            modelName = gr.Dropdown(model_list, label="Model", value=model_list[0])
            submit_btn = gr.Button("Compare")

        with gr.Column():
            gallery = gr.Gallery(label="Similarity Results")
            similarity_text = gr.Textbox(label="Similarity Scores")

    def process_comparison(main_img, compare_files, name):
        """Open the uploaded files, rank them against the main image, and
        build the gallery list plus a text summary of the scores."""
        candidates = [Image.open(file.name) for file in compare_files]
        ranked = compare_images(main_img, candidates, name)

        gallery_images = [img for img, _ in ranked]
        score_lines = [
            f"Image: {os.path.basename(img.filename)} -> Similarity: {pct:.2f}%"
            for img, pct in ranked
        ]
        return gallery_images, "\n".join(score_lines)

    submit_btn.click(
        fn=process_comparison,
        inputs=[main_image, compare_images_input, modelName],
        outputs=[gallery, similarity_text]
    )

demo.launch()
|
bovw-codebook.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a62404bfa83e913f1b009be97230b0dc3ae0e54f0ee1a4b06f1ae79a8e35672e
|
| 3 |
+
size 92383
|
bovw_embedding.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import cv2
|
| 3 |
+
from PIL import Image
|
| 4 |
+
import joblib
|
| 5 |
+
from scipy.cluster.vq import vq
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Bovw:
    """Bag-of-visual-words embedding: SIFT descriptors quantised against a
    pre-trained codebook and weighted by inverse document frequency."""

    def __init__(self):
        # The codebook pickle stores (k, centroids); idf.npy holds the
        # per-visual-word IDF weights.
        self.k, self.codebook = joblib.load("bovw-codebook.pkl")
        self.idf = np.load("idf.npy")
        self.sift = cv2.SIFT_create()

    def get_embedding(self, pil_image: Image.Image) -> np.ndarray:
        """Return the k-dimensional tf-idf weighted visual-word vector."""
        rgb = np.array(pil_image.convert("RGB"))
        rgb = cv2.resize(rgb, (224, 224))
        gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY).astype('uint8')

        _, descriptors = self.sift.detectAndCompute(gray, None)

        # No detectable features -> zero vector so callers still get a
        # correctly shaped result.
        if descriptors is None or len(descriptors) == 0:
            return np.zeros(self.k)

        words, _ = vq(descriptors, self.codebook)

        # Occurrence count of each visual word over the k codebook entries.
        counts = np.bincount(words, minlength=self.k)

        return counts * self.idf
|
clip_embedding.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from transformers import CLIPProcessor, CLIPModel
|
| 3 |
+
|
| 4 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class Clip:
    """CLIP image encoder exposing get_embedding() like the other backends."""

    def __init__(self):
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
        # Inference only — switch to eval mode explicitly, matching the
        # other backends (Vit/Dino/EfficientNet/Resnet all call eval()).
        self.model.eval()
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    def get_embedding(self, img):
        """Return the CLIP image-feature tensor (2-D, batch-first) for *img*."""
        inputs = self.processor(images=img, return_tensors="pt").to(device)
        with torch.no_grad():
            embeddings = self.model.get_image_features(**inputs)
        return embeddings
|
dino_embedding.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from torchvision import transforms
|
| 3 |
+
|
| 4 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class Dino:
    """DINO ViT-B/16 self-supervised backbone used as an image embedder."""

    def __init__(self):
        self.model = torch.hub.load('facebookresearch/dino:main', 'dino_vitb16').to(device)
        self.model.eval()
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
        ])

    def get_embedding(self, image):
        """Return the DINO embedding tensor for a PIL *image*.

        Fix: the input batch is now moved to the model's device; it
        previously stayed on CPU, which crashed whenever the model had been
        placed on CUDA.
        """
        img_tensor = self.transform(image).unsqueeze(0)
        # Move the batch to wherever the model's weights live (no-op on CPU).
        param = next(self.model.parameters(), None)
        if param is not None:
            img_tensor = img_tensor.to(param.device)
        with torch.no_grad():
            embedding = self.model(img_tensor)
        return embedding
|
efficientnet_embedding.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from torchvision import models, transforms
|
| 2 |
+
import torch
|
| 3 |
+
|
| 4 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class EfficientNet:
    """EfficientNet-B0 feature extractor (classifier head replaced by Identity)."""

    def __init__(self):
        self.model = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.DEFAULT).to(device)
        # Drop the classification head so the forward pass yields pooled features.
        self.model.classifier = torch.nn.Identity()
        self.model.eval()
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ])

    def get_embedding(self, image):
        """Return the EfficientNet feature tensor for a PIL *image*.

        Fix: the input batch is now moved to the model's device; it
        previously stayed on CPU, which crashed whenever the model had been
        placed on CUDA.
        """
        img_tensor = self.transform(image).unsqueeze(0)
        # Move the batch to wherever the model's weights live (no-op on CPU).
        param = next(self.model.parameters(), None)
        if param is not None:
            img_tensor = img_tensor.to(param.device)
        with torch.no_grad():
            embedding = self.model(img_tensor)
        return embedding
|
histogram_embedding.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def cosine(a, b):
    """Cosine similarity between two 1-D vectors.

    Fix: guards against a zero-norm operand (the BOVW backend returns an
    all-zero vector when no SIFT features are found), which previously
    produced a division-by-zero / NaN. Such pairs now score 0.0.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def get_embedding(img, bins=32):
    """Return a flattened per-channel colour histogram (3 * bins values)."""
    bgr = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
    bgr = cv2.resize(bgr, (224, 224))

    # One `bins`-bucket histogram per channel, in BGR channel order 0, 1, 2.
    channel_hists = [cv2.calcHist([bgr], [channel], None, [bins], [0, 256])
                     for channel in range(3)]

    return np.concatenate(channel_hists, axis=0).flatten()
|
idf.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5acb3dab7e07a75564a93afd5ccf808d88ef864518c8da5e4b13dd1798e7642a
|
| 3 |
+
size 1728
|
requirements.txt
ADDED
|
Binary file (2.54 kB). View file
|
|
|
resnet_embedding.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torchvision
|
| 3 |
+
import torchvision.models as models
|
| 4 |
+
import torchvision.transforms as transforms
|
| 5 |
+
|
| 6 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class Resnet:
    """ResNet-50 feature extractor: final FC layer removed, pooled output."""

    def __init__(self):
        self.model = models.resnet50(weights=torchvision.models.ResNet50_Weights.DEFAULT).to(device)
        # Strip the classification layer; forward now ends at the global
        # average-pool stage.
        self.model = torch.nn.Sequential(*list(self.model.children())[:-1])
        self.model.eval()
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ])

    def get_embedding(self, image):
        """Return the squeezed ResNet feature tensor for a PIL *image*.

        Fix: the input batch is now moved to the model's device; it
        previously stayed on CPU, which crashed whenever the model had been
        placed on CUDA.
        """
        img_tensor = self.transform(image).unsqueeze(0)
        # Move the batch to wherever the model's weights live (no-op on CPU).
        param = next(self.model.parameters(), None)
        if param is not None:
            img_tensor = img_tensor.to(param.device)
        with torch.no_grad():
            embedding = self.model(img_tensor).squeeze()
        return embedding
|
vit_embedding.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import ViTImageProcessor, ViTModel
|
| 2 |
+
import torch
|
| 3 |
+
|
| 4 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class Vit:
    """ViT-B/16 encoder; the first-token slice of the last hidden state is
    used as the image embedding."""

    def __init__(self):
        self.model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k").to(device)
        self.processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
        self.model.eval()

    def get_embedding(self, image):
        """Return the first-token embedding tensor for *image*."""
        model_inputs = self.processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            hidden = self.model(**model_inputs).last_hidden_state
        # Token position 0 is the summary token used as the embedding.
        return hidden[:, 0, :]
|