File size: 5,273 Bytes
9517f59 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
from transformers import CLIPModel, CLIPProcessor
from PIL import Image
import torch
import numpy as np
import matplotlib.pyplot as plt
import cv2
# NOTE(review): this statement only binds a module-level Python name; it does
# not set a process environment variable. If the intent was to configure an
# external library through the environment, that would have to go through
# os.environ before that library is imported -- confirm intent.
TF_ENABLE_ONEDNN_OPTS=0
# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Checkpoint identifier shared by the model and its processor below.
model_name = "openai/clip-vit-base-patch32"
# Loaded once at import time and moved to the selected device.
model = CLIPModel.from_pretrained(model_name).to(device)
# Processor loaded from the same checkpoint; the pipeline uses it to turn the
# text prompts and image patches into model inputs.
processor = CLIPProcessor.from_pretrained(model_name)
# This function extracts patches from an image and returns them along with their coordinates.
def image_patch(img, patch_size=(100, 100), stride=2):
    """Slide a fixed-size window across ``img`` and collect every crop.

    Args:
        img: Image-like object exposing ``size`` as ``(width, height)`` and a
            ``crop((left, upper, right, lower))`` method.
        patch_size: ``(width, height)`` of each window, in pixels.
        stride: Step in pixels between consecutive windows, used on both axes.

    Returns:
        A list of ``(patch, (x, y))`` tuples in row-major order (all windows
        of the top row first), where ``(x, y)`` is the window's top-left
        corner. The list is empty when the image is smaller than the window
        along either axis.
    """
    width, height = img.size
    print(f"Image dimensions: width={width}, height={height}")

    patch_w, patch_h = patch_size[0], patch_size[1]
    # Only offsets where the whole window still fits inside the image.
    tops = range(0, height - patch_h + 1, stride)
    lefts = range(0, width - patch_w + 1, stride)

    return [
        (img.crop((left, top, left + patch_w, top + patch_h)), (left, top))
        for top in tops
        for left in lefts
    ]
def bounding_box(img, heatmap):
    """Outline the hottest connected region of ``heatmap`` on a copy of ``img``.

    The heatmap is min-max rescaled to 8-bit (0..255) and thresholded at 200.
    Among the external contours of the resulting mask, the one with the
    largest area is enclosed in a 2-pixel rectangle of colour ``(255, 0, 0)``
    (red if the array is RGB-ordered).

    Args:
        img: Image convertible with ``np.array``; it is not modified.
        heatmap: 2-D score array; presumably the same height/width as ``img``
            so that contour coordinates line up -- verify against caller.

    Returns:
        ``(annotated, found)``: the array copy of ``img`` (with the rectangle
        drawn when a region was found) and a bool telling whether any region
        passed the threshold.
    """
    canvas = np.array(img).copy()

    scaled = cv2.normalize(heatmap, None, 0, 255, cv2.NORM_MINMAX, cv2.CV_8U)
    _retval, mask = cv2.threshold(scaled, 200, 255, cv2.THRESH_BINARY)
    regions, _hierarchy = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    # Nothing survived the threshold: hand back the untouched copy.
    if not regions:
        return canvas, False

    biggest = max(regions, key=cv2.contourArea)
    left, top, box_w, box_h = cv2.boundingRect(biggest)
    cv2.rectangle(canvas, (left, top), (left + box_w, top + box_h), (255, 0, 0), 2)
    return canvas, True
# def main():
# print("Starting the object detection process...")
# img_path = r"C:\Users\sahas\OneDrive\Desktop\GenMatch\Photo of a dog.jpg"
# score_patches = []
# prompt = ["a photo of a human", "a close up of a dog's face"]
# try:
# # Open the image
# img = Image.open(img_path)
# print(f"Image opened successfully: {img_path}")
# # Extract patches from the image
# patches = image_patch(img)
# print(f"Extracted {len(patches)} patches from the image.")
# # Process all patches with the CLIP model to get the probabilities
# patch_batch = [p for p, (x, y) in patches]
# input = processor(text=prompt, images=patch_batch, return_tensors="pt", padding=True)
# input = {k: v.to(device) for k, v in input.items()}
# with torch.no_grad():
# output = model(**input)
# logits = output.logits_per_image
# prob = logits.softmax(dim=1)
# for i, (patch, (x, y)) in enumerate(patches):
# score = prob[i][0].item()
# score_patches.append((patch, (x, y), score))
# # Create heatmap based on scores
# img_w, img_h = img.size
# pat_w, pat_h = patches[0][0].size
# heatmap = np.zeros((img_h, img_w))
# for _, (x, y), score in score_patches:
# heatmap[y:y + pat_h, x:x + pat_w] += score
# fig, ax = plt.subplots()
# ax.imshow(img)
# ax.imshow(heatmap, cmap='viridis', alpha=0.6)
# ax.axis('off')
# plt.show()
# print("Generating images with bounding box")
# box_img, _ = bounding_box(img, heatmap)
# plt.imshow(box_img)
# plt.axis('off')
# plt.show()
# except FileNotFoundError:
# print(f"Error opening image: {img_path}")
# return
# if __name__ == "__main__":
# main()
def run_detection_pipeline(input_image, text_prompt):
    """Find and highlight the region of ``input_image`` matching ``text_prompt``.

    The image is cut into overlapping patches with ``image_patch``. Each patch
    is scored by the CLIP model as the softmax probability of ``text_prompt``
    against the fixed contrast prompt "a photo of a blank background". The
    scores are accumulated into a per-pixel heatmap, and ``bounding_box``
    draws a rectangle around its hottest region.

    Args:
        input_image: PIL image to search (``size`` is ``(width, height)``).
        text_prompt: Natural-language description of the object to locate.

    Returns:
        ``(message, image)``. ``image`` is the annotated PIL image when the
        heatmap was built, or the unchanged ``input_image`` when no patches
        could be extracted or the best patch score is below the confidence
        threshold (0.20).
    """
    print("Starting the object detection process...")
    prompts = [text_prompt, "a photo of a blank background"]

    patches = image_patch(input_image)
    print(f"Extracted {len(patches)} patches from the image.")
    # Guard before touching the model: an empty batch cannot be scored, and
    # every exit path must return the same (message, image) shape.
    if not patches:
        print("Warning: No patches were extracted from the image.")
        return "No patches could be extracted from the image.", input_image

    # Score in fixed-size chunks so memory use does not grow with the number
    # of patches. The softmax is taken per patch over the two prompts, so
    # chunking does not change the scores.
    batch_size = 64
    scores = []
    with torch.no_grad():
        for start in range(0, len(patches), batch_size):
            chunk = [patch for patch, _ in patches[start:start + batch_size]]
            inputs = processor(
                text=prompts, images=chunk, return_tensors="pt", padding=True
            )
            inputs = {key: value.to(device) for key, value in inputs.items()}
            probs = model(**inputs).logits_per_image.softmax(dim=1)
            # Column 0 is the probability assigned to ``text_prompt``.
            scores.extend(probs[:, 0].tolist())

    confidence_threshold = 0.20
    max_score = max(scores)
    print(f"Max confidence score: {max_score:.4f}")
    if max_score < confidence_threshold:
        msg = f"Could not find '{text_prompt}' with enough confidence."
        return msg, input_image

    # PIL reports size as (width, height); the heatmap is indexed [row, col],
    # i.e. (height, width).
    img_w, img_h = input_image.size
    pat_w, pat_h = patches[0][0].size
    heatmap = np.zeros((img_h, img_w))
    for (_, (x, y)), score in zip(patches, scores):
        heatmap[y:y + pat_h, x:x + pat_w] += score

    print("Generating image with bounding box...")
    box_img, found = bounding_box(input_image, heatmap)
    if found:
        msg = "Object detected and highlighted."
    else:
        msg = "No object detected matching the prompt."
    return msg, Image.fromarray(box_img)