# src/models.py
import torch
import cv2
import numpy as np
from PIL import Image
from transformers import AutoProcessor, AutoModel, AutoImageProcessor
from ultralytics import YOLO
import torch.nn.functional as F
from deepface import DeepFace

# YOLO/COCO class index for "person" — detections with this class are excluded
# from the object lane whenever the face lane already found faces, so human
# body crops never pollute the object index.
YOLO_PERSON_CLASS_ID = 0

# Minimum face bounding-box area (pixels²) required to index a face.
# Filters out tiny/background faces — e.g. a face on a TV screen in the
# background, or a crowd member ~50px wide.
MIN_FACE_AREA = 3000  # roughly 55x55 pixels minimum

class AIModelManager:
    """Owns the vision models and exposes a two-lane feature-extraction pipeline.

    Lane 1 (faces): DeepFace/GhostFaceNet with a RetinaFace detector.
    Lane 2 (objects): YOLO segmentation crops embedded with SigLIP + DINOv2
    and fused into a single 1536-D vector.
    """

    def __init__(self):
        # Device preference: CUDA, then Apple MPS, then CPU fallback.
        self.device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
        print(f"Loading models onto: {self.device.upper()}...")

        # SigLIP image tower — semantic embeddings via get_image_features().
        self.siglip_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224", use_fast=False)
        self.siglip_model = AutoModel.from_pretrained("google/siglip-base-patch16-224").to(self.device)
        self.siglip_model.eval()

        # DINOv2 — self-supervised visual features; the CLS token is used below.
        self.dinov2_processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
        self.dinov2_model = AutoModel.from_pretrained('facebook/dinov2-base').to(self.device)
        self.dinov2_model.eval()

        # YOLO segmentation model used by the object lane.
        self.yolo = YOLO('yolo11n-seg.pt')

    def _embed_object_crop(self, crop_pil):
        """Runs SigLIP + DINOv2 on a single crop and returns the fused 1536-D vector.

        Each model's output is L2-normalized before concatenation, and the
        concatenated vector is re-normalized so the fused embedding is unit
        length (each half is weighted 1/sqrt(2)).
        """
        with torch.no_grad():
            siglip_inputs = self.siglip_processor(images=crop_pil, return_tensors="pt").to(self.device)
            siglip_out = self.siglip_model.get_image_features(**siglip_inputs)
            # Defensive unwrapping: depending on the transformers version,
            # get_image_features may return a tensor, an output object, or a tuple.
            if hasattr(siglip_out, 'image_embeds'):
                siglip_out = siglip_out.image_embeds
            elif isinstance(siglip_out, tuple):
                siglip_out = siglip_out[0]
            siglip_vec = F.normalize(siglip_out, p=2, dim=1).cpu()

            dinov2_inputs = self.dinov2_processor(images=crop_pil, return_tensors="pt").to(self.device)
            dinov2_out = self.dinov2_model(**dinov2_inputs)
            # CLS token ([:, 0, :]) serves as the global image descriptor.
            dinov2_vec = dinov2_out.last_hidden_state[:, 0, :]
            dinov2_vec = F.normalize(dinov2_vec, p=2, dim=1).cpu()

            object_vec = torch.cat((siglip_vec, dinov2_vec), dim=1)
            object_vec = F.normalize(object_vec, p=2, dim=1)

        return object_vec.flatten().numpy()

    def _extract_face_vectors(self, img_np, img_w, img_h):
        """Lane 1: detect and embed every sufficiently large face in the image.

        Returns a list of {"type": "face", "vector": ndarray} dicts; an empty
        list means no usable face was found (caller uses that as the signal to
        let person-class YOLO detections into the object lane).
        """
        vectors = []
        try:
            face_objs = DeepFace.represent(
                img_path=img_np,
                model_name="GhostFaceNet",
                detector_backend="retinaface",
                enforce_detection=True,
                align=True
            )
        except ValueError:
            # enforce_detection=True raises ValueError when no face is found.
            # Kept narrow around the represent() call so ValueErrors from the
            # post-processing loop below are not silently swallowed.
            print("🟠 NO FACES DETECTED -> Falling back to Object Lane for any humans.")
            return vectors

        for index, face in enumerate(face_objs):
            # Filter out tiny/background faces. Missing facial_area fields
            # default to the full image size, which always passes the filter.
            facial_area = face.get("facial_area", {})
            fw = facial_area.get("w", img_w)
            fh = facial_area.get("h", img_h)
            face_area_px = fw * fh

            if face_area_px < MIN_FACE_AREA:
                print(f"🟡 FACE {index+1} SKIPPED: Too small ({fw}x{fh}px = {face_area_px}px²) — likely background noise.")
                continue

            # L2-normalize the embedding so cosine similarity == dot product.
            face_vec = torch.tensor([face["embedding"]])
            face_vec = F.normalize(face_vec, p=2, dim=1)

            vectors.append({
                "type": "face",
                "vector": face_vec.flatten().numpy()
            })
            print(f"🟢 FACE {index+1} EXTRACTED: {fw}x{fh}px — Added to Face Index.")

        return vectors

    def _collect_object_crops(self, image_path, original_img_pil, faces_were_found):
        """Lane 2: run YOLO segmentation and return the PIL crops to embed.

        The full image is always the first crop (global context). Person-class
        detections are skipped when faces_were_found is True, so human body
        crops don't pollute the object index; when no faces were found they
        are allowed through as a fallback (silhouettes, backs of heads, etc.).
        """
        yolo_results = self.yolo(image_path, conf=0.5)

        crops = [original_img_pil]

        for r in yolo_results:
            if r.masks is None:
                continue
            for seg_idx, mask_xy in enumerate(r.masks.xy):
                # masks and boxes are index-aligned within one result object.
                detected_class_id = int(r.boxes.cls[seg_idx].item())
                if faces_were_found and detected_class_id == YOLO_PERSON_CLASS_ID:
                    print("🔵 PERSON crop SKIPPED (faces already in Face Lane) — avoiding object index pollution.")
                    continue

                polygon = np.array(mask_xy, dtype=np.int32)
                if len(polygon) < 3:
                    # Degenerate mask — cannot form a bounding rect.
                    continue
                x, y, w, h = cv2.boundingRect(polygon)
                if w < 30 or h < 30:
                    # Too small to produce a meaningful embedding.
                    continue

                crops.append(original_img_pil.crop((x, y, x + w, y + h)))

        return crops

    def process_image(self, image_path: str, is_query=False):
        """
        Master function: Extracts EVERY face and EVERY non-human object from an image.

        Key design decisions:
        - Face lane runs first and tags every face with its bounding box area.
        - Only faces above MIN_FACE_AREA are indexed (filters background/tiny faces).
        - For queries, ALL detected faces are used (not just the first one) —
          the calling code loops over all returned vectors, so a group-photo
          query searches for every person simultaneously.
          (is_query is kept as a parameter for future use / logging only.)
        - Object lane SKIPS any YOLO detection whose class is 'person', so humans
          never pollute the object index when faces were already found.
        - If NO faces are found at all, humans caught by YOLO DO go into the object
          lane (as a fallback for silhouettes, backs-of-head, full body shots etc.)

        Returns a list of {"type": "face"|"object", "vector": np.ndarray} dicts.
        """
        # Context manager closes the underlying file handle; .convert() returns
        # a new, fully-loaded RGB image so it stays valid after the file closes.
        with Image.open(image_path) as img_file:
            original_img_pil = img_file.convert('RGB')
        img_np = np.array(original_img_pil)
        img_h, img_w = img_np.shape[:2]

        # ==========================================
        # LANE 1: THE FACE LANE
        # ==========================================
        extracted_vectors = self._extract_face_vectors(img_np, img_w, img_h)
        faces_were_found = bool(extracted_vectors)

        # ==========================================
        # LANE 2: THE OBJECT LANE
        # ==========================================
        crops = self._collect_object_crops(image_path, original_img_pil, faces_were_found)
        for crop in crops:
            extracted_vectors.append({
                "type": "object",
                "vector": self._embed_object_crop(crop)
            })

        return extracted_vectors