AdarshDRC committed on
Commit
c96096b
·
verified ·
1 Parent(s): 0c1e873

Update src/models.py

Browse files
Files changed (1) hide show
  1. src/models.py +144 -48
src/models.py CHANGED
@@ -1,58 +1,154 @@
1
  # src/models.py
2
  import torch
 
 
3
  from PIL import Image
4
- from transformers import AutoProcessor, AutoModel
5
  from ultralytics import YOLO
 
 
 
 
 
 
 
 
 
 
6
 
7
class AIModelManager:
    """Loads the vision models once at startup and exposes helpers to
    embed images/text with SigLIP and to crop detected objects with YOLO."""

    def __init__(self):
        # Load SigLIP (joint Vision & Text encoder). use_fast=False keeps the
        # slow (reference) processor implementation.
        self.processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224", use_fast=False)
        self.model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        self.model.eval()  # Set to evaluation mode (disables dropout etc.)

        # Load YOLOv8 nano for speed (BUG FIX: the old comment said "YOLOv11",
        # but the weights loaded here are 'yolov8n.pt' — the comment was wrong).
        self.yolo = YOLO('yolov8n.pt')  # Will auto-download the tiny weights

    def encode_image(self, image: Image.Image):
        """Converts a PIL Image into a flat 1-D numpy vector via SigLIP."""
        inputs = self.processor(images=image, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model.get_image_features(**inputs)

        # Hugging Face quirk: depending on the library version, the call above
        # may return an output object or a raw tensor — unwrap defensively.
        if hasattr(outputs, 'image_embeds'):
            image_features = outputs.image_embeds
        elif hasattr(outputs, 'pooler_output'):
            image_features = outputs.pooler_output
        else:
            image_features = outputs

        return image_features.flatten().numpy()

    def encode_text(self, text: str):
        """Converts a text string into a flat 1-D numpy vector via SigLIP."""
        # SigLIP was trained with fixed-length padding, hence padding="max_length".
        inputs = self.processor(text=text, return_tensors="pt", padding="max_length")
        with torch.no_grad():
            outputs = self.model.get_text_features(**inputs)

        # Hugging Face quirk: extract the raw tensor from the output object.
        if hasattr(outputs, 'text_embeds'):
            text_features = outputs.text_embeds
        elif hasattr(outputs, 'pooler_output'):
            text_features = outputs.pooler_output
        else:
            text_features = outputs

        return text_features.flatten().numpy()

    def get_crops_from_image(self, image: Image.Image):
        """Uses YOLO to find objects and returns a list of cropped PIL Images."""
        results = self.yolo(image, conf=0.5)  # Only keep confident detections
        crops = []
        for result in results:
            for box in result.boxes.xyxy:  # Bounding box coordinates (x1, y1, x2, y2)
                x1, y1, x2, y2 = map(int, box.tolist())
                cropped_img = image.crop((x1, y1, x2, y2))
                crops.append(cropped_img)
        return crops
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # src/models.py
2
  import torch
3
+ import cv2
4
+ import numpy as np
5
  from PIL import Image
6
+ from transformers import AutoProcessor, AutoModel, AutoImageProcessor
7
  from ultralytics import YOLO
8
+ import torch.nn.functional as F
9
+ from deepface import DeepFace
10
+
11
+ # YOLO class index for "person" — we must exclude these from the object lane
12
+ # when faces have already been found, to avoid polluting the object index with humans.
13
+ YOLO_PERSON_CLASS_ID = 0
14
+
15
+ # Minimum face bounding box area (pixels²) to avoid indexing tiny/background faces
16
+ # e.g. a face on a TV screen in the background, or a crowd member 50px wide
17
+ MIN_FACE_AREA = 3000 # roughly 55x55 pixels minimum
18
 
19
class AIModelManager:
    """Loads all vision models once (SigLIP + DINOv2 + YOLO segmentation) and
    extracts per-image vectors along two lanes: a face lane (DeepFace/GhostFaceNet)
    and an object lane (SigLIP ⊕ DINOv2 fused embeddings of YOLO crops)."""

    def __init__(self):
        # Prefer CUDA, then Apple MPS, then CPU.
        self.device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
        print(f"Loading models onto: {self.device.upper()}...")

        # SigLIP: joint vision/text encoder used for semantic similarity.
        self.siglip_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224", use_fast=False)
        self.siglip_model = AutoModel.from_pretrained("google/siglip-base-patch16-224").to(self.device)
        self.siglip_model.eval()

        # DINOv2: self-supervised visual features, complements SigLIP.
        self.dinov2_processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
        self.dinov2_model = AutoModel.from_pretrained('facebook/dinov2-base').to(self.device)
        self.dinov2_model.eval()

        # YOLO11 nano segmentation model (provides per-instance masks).
        self.yolo = YOLO('yolo11n-seg.pt')

    def _embed_object_crop(self, crop_pil):
        """Runs SigLIP + DINOv2 on a single crop and returns the fused 1536-D vector.

        Each branch is L2-normalized independently, concatenated, then the
        fused vector is normalized again so cosine similarity weighs both
        branches equally.
        """
        with torch.no_grad():
            siglip_inputs = self.siglip_processor(images=crop_pil, return_tensors="pt").to(self.device)
            siglip_out = self.siglip_model.get_image_features(**siglip_inputs)
            # Version quirk: may return an output object or a raw tensor.
            if hasattr(siglip_out, 'image_embeds'):
                siglip_out = siglip_out.image_embeds
            elif isinstance(siglip_out, tuple):
                siglip_out = siglip_out[0]
            siglip_vec = F.normalize(siglip_out, p=2, dim=1).cpu()

            dinov2_inputs = self.dinov2_processor(images=crop_pil, return_tensors="pt").to(self.device)
            dinov2_out = self.dinov2_model(**dinov2_inputs)
            # CLS token of the last hidden state is DINOv2's global descriptor.
            dinov2_vec = dinov2_out.last_hidden_state[:, 0, :]
            dinov2_vec = F.normalize(dinov2_vec, p=2, dim=1).cpu()

            object_vec = torch.cat((siglip_vec, dinov2_vec), dim=1)
            object_vec = F.normalize(object_vec, p=2, dim=1)

        return object_vec.flatten().numpy()

    def process_image(self, image_path: str, is_query=False):
        """
        Master function: Extracts EVERY face and EVERY non-human object from an image.

        Key design decisions:
        - Face lane runs first and tags every face with its bounding box area.
        - Only faces above MIN_FACE_AREA are indexed (filters background/tiny faces).
        - For queries, ALL detected faces are used (not just the first one).
        - Object lane SKIPS any YOLO detection whose class is 'person', so humans
          never pollute the object index when faces were already found.
        - If NO faces are found at all, humans caught by YOLO DO go into the object
          lane (as a fallback for silhouettes, backs-of-head, full body shots etc.)

        Returns:
            list[dict]: each entry is {"type": "face"|"object", "vector": np.ndarray}.
        """
        extracted_vectors = []
        original_img_pil = Image.open(image_path).convert('RGB')
        img_np = np.array(original_img_pil)  # RGB channel order (PIL convention)
        img_h, img_w = img_np.shape[:2]

        faces_were_found = False  # Track whether Lane 1 found anything usable

        # ==========================================
        # LANE 1: THE FACE LANE
        # ==========================================
        try:
            # BUG FIX: DeepFace interprets numpy input as BGR (OpenCV convention),
            # but img_np came from PIL and is RGB. Convert before detection so
            # RetinaFace/GhostFaceNet see correct channel order.
            face_objs = DeepFace.represent(
                img_path=cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR),
                model_name="GhostFaceNet",
                detector_backend="retinaface",
                enforce_detection=True,  # raises ValueError when no face is found
                align=True
            )

            for index, face in enumerate(face_objs):
                # Filter out tiny/background faces. Missing area info defaults
                # to the full frame so such faces are deliberately kept.
                facial_area = face.get("facial_area", {})
                fw = facial_area.get("w", img_w)
                fh = facial_area.get("h", img_h)
                face_area_px = fw * fh

                if face_area_px < MIN_FACE_AREA:
                    print(f"🟡 FACE {index+1} SKIPPED: Too small ({fw}x{fh}px = {face_area_px}px²) — likely background noise.")
                    continue

                face_vec = torch.tensor([face["embedding"]])
                face_vec = F.normalize(face_vec, p=2, dim=1)

                extracted_vectors.append({
                    "type": "face",
                    "vector": face_vec.flatten().numpy()
                })
                faces_were_found = True
                print(f"🟢 FACE {index+1} EXTRACTED: {fw}x{fh}px — Added to Face Index.")

            # For queries, do NOT break — search with ALL faces. The calling
            # code in main.py already loops over all returned vectors, so
            # returning multiple face vectors means we search for every person
            # in a group photo query simultaneously.
            # (is_query flag is kept as parameter for future use / logging only)

        except ValueError:
            print("🟠 NO FACES DETECTED -> Falling back to Object Lane for any humans.")

        # ==========================================
        # LANE 2: THE OBJECT LANE
        # ==========================================
        yolo_results = self.yolo(image_path, conf=0.5)

        # Always include the full image as one crop for global context
        crops = [original_img_pil]

        for r in yolo_results:
            if r.masks is None:
                continue
            for seg_idx, mask_xy in enumerate(r.masks.xy):
                # Skip 'person' class detections when faces were found: this
                # prevents human body crops from polluting the object index.
                # If no faces were found (back-of-head, silhouette, etc.), we
                # DO allow person-class detections through as a fallback.
                detected_class_id = int(r.boxes.cls[seg_idx].item())
                if faces_were_found and detected_class_id == YOLO_PERSON_CLASS_ID:
                    # BUG FIX: was a placeholder-less f-string; plain string is equivalent.
                    print("🔵 PERSON crop SKIPPED (faces already in Face Lane) — avoiding object index pollution.")
                    continue

                polygon = np.array(mask_xy, dtype=np.int32)
                if len(polygon) < 3:
                    continue  # degenerate mask — cannot form a polygon
                x, y, w, h = cv2.boundingRect(polygon)
                if w < 30 or h < 30:
                    continue  # too small to embed meaningfully

                cropped_img = original_img_pil.crop((x, y, x + w, y + h))
                crops.append(cropped_img)

        for crop in crops:
            vec = self._embed_object_crop(crop)
            extracted_vectors.append({
                "type": "object",
                "vector": vec
            })

        return extracted_vectors