AdarshDRC committed on
Commit
5d013dc
·
verified ·
1 Parent(s): 8f0f0e4

Update src/models.py

Browse files
Files changed (1) hide show
  1. src/models.py +41 -18
src/models.py CHANGED
@@ -44,7 +44,7 @@ class AIModelManager:
44
  )
45
  print(f"Loading models onto: {self.device.upper()}...")
46
 
47
- self.siglip_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224", use_fast=True)
48
  self.siglip_model = AutoModel.from_pretrained("google/siglip-base-patch16-224").to(self.device).eval()
49
 
50
  self.dinov2_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
@@ -57,7 +57,7 @@ class AIModelManager:
57
  # FIX 2: Removed torch.compile() because HF Spaces do not have the g++ compiler installed by default.
58
  # This fixes the "InvalidCxxCompiler" Search crash.
59
 
60
- self.yolo = YOLO("yolo11n.pt")
61
 
62
  self._cache = {}
63
  self._cache_maxsize = 256
@@ -128,27 +128,50 @@ class AIModelManager:
128
  except Exception as e:
129
  print(f"🟠 Face lane error: {e} — falling back to object lane")
130
 
131
- crops = [small_pil]
 
 
 
 
 
132
  yolo_results = self.yolo(image_path, conf=0.5, verbose=False)
133
 
134
  for r in yolo_results:
135
- if r.boxes is None:
136
- continue
137
- for box in r.boxes:
138
- cls_id = int(box.cls.item())
139
- if faces_found and cls_id == YOLO_PERSON_CLASS_ID:
140
- continue
141
- x1, y1, x2, y2 = box.xyxy[0].tolist()
142
- w, h = x2 - x1, y2 - y1
143
- if w < 30 or h < 30:
144
- continue
145
- crop = small_pil.crop((x1, y1, x2, y2))
146
- crops.append(crop)
147
- if len(crops) >= MAX_CROPS + 1:
148
- break
149
- if len(crops) >= MAX_CROPS + 1:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  break
151
 
 
 
 
 
152
  print(f"🧠 Embedding {len(crops)} crop(s) in one batch …")
153
  vecs = self._embed_crops_batch(crops)
154
  for vec in vecs:
 
44
  )
45
  print(f"Loading models onto: {self.device.upper()}...")
46
 
47
+ self.siglip_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224", use_fast=False)
48
  self.siglip_model = AutoModel.from_pretrained("google/siglip-base-patch16-224").to(self.device).eval()
49
 
50
  self.dinov2_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
 
57
  # FIX 2: Removed torch.compile() because HF Spaces do not have the g++ compiler installed by default.
58
  # This fixes the "InvalidCxxCompiler" Search crash.
59
 
60
+ self.yolo = YOLO("yolo11n-seg.pt") # seg model → pixel masks → accurate crops
61
 
62
  self._cache = {}
63
  self._cache_maxsize = 256
 
128
  except Exception as e:
129
  print(f"🟠 Face lane error: {e} — falling back to object lane")
130
 
131
+ # Full-res PIL for crops. YOLO returns coordinates in full-res pixel space.
132
+ # We crop from original_pil then resize each crop before embedding.
133
+ # BUG FIX: old optimised code cropped from small_pil (512px) using
134
+ # full-res YOLO coordinates → completely wrong crop regions.
135
+ crops_pil = [original_pil] # full-image always included for global context
136
+
137
  yolo_results = self.yolo(image_path, conf=0.5, verbose=False)
138
 
139
  for r in yolo_results:
140
+ # Use segmentation masks when available (yolo11n-seg.pt)
141
+ if r.masks is not None:
142
+ for seg_idx, mask_xy in enumerate(r.masks.xy):
143
+ cls_id = int(r.boxes.cls[seg_idx].item())
144
+ if faces_found and cls_id == YOLO_PERSON_CLASS_ID:
145
+ print("🔵 PERSON crop skipped — face lane already active")
146
+ continue
147
+ polygon = np.array(mask_xy, dtype=np.int32)
148
+ if len(polygon) < 3:
149
+ continue
150
+ x, y, w, h = cv2.boundingRect(polygon)
151
+ if w < 30 or h < 30:
152
+ continue
153
+ crop = original_pil.crop((x, y, x + w, y + h))
154
+ crops_pil.append(crop)
155
+ if len(crops_pil) >= MAX_CROPS + 1:
156
+ break
157
+ elif r.boxes is not None:
158
+ # Fallback: plain bounding boxes (shouldn't happen with seg model)
159
+ for box in r.boxes:
160
+ cls_id = int(box.cls.item())
161
+ if faces_found and cls_id == YOLO_PERSON_CLASS_ID:
162
+ continue
163
+ x1, y1, x2, y2 = box.xyxy[0].tolist()
164
+ if (x2 - x1) < 30 or (y2 - y1) < 30:
165
+ continue
166
+ crop = original_pil.crop((x1, y1, x2, y2))
167
+ crops_pil.append(crop)
168
+ if len(crops_pil) >= MAX_CROPS + 1:
169
  break
170
 
171
+ # Resize each crop to MAX_IMAGE_SIZE before batched embedding
172
+ # (models expect ~224px anyway; no quality loss, big speed gain)
173
+ crops = [_resize_pil(c, MAX_IMAGE_SIZE) for c in crops_pil]
174
+
175
  print(f"🧠 Embedding {len(crops)} crop(s) in one batch …")
176
  vecs = self._embed_crops_batch(crops)
177
  for vec in vecs: