|
|
"""SigLIP Image Embedding Client - Local model for image embeddings. |
|
|
|
|
|
Uses open_clip with ViT-B-16-SigLIP model for generating 768-dim image embeddings. |
|
|
Model is loaded once at startup and reused for all requests. |
|
|
""" |
|
|
|
|
|
import io |
|
|
from typing import Optional |
|
|
import numpy as np |
|
|
from PIL import Image |
|
|
|
|
|
|
|
|
class SigLIPClient:
    """
    Local SigLIP model client for image embeddings.

    Implemented as a process-wide singleton so the expensive model load
    happens at most once per process.

    Model: ViT-B-16-SigLIP (pretrained on WebLI)
    Output: 768-dimensional normalized embedding vector
    """

    # Singleton bookkeeping shared across every construction attempt.
    _instance: Optional["SigLIPClient"] = None
    _initialized: bool = False

    def __new__(cls):
        """Singleton pattern - create the one instance lazily, then reuse it."""
        existing = cls._instance
        if existing is None:
            existing = super().__new__(cls)
            cls._instance = existing
        return existing

    def __init__(self):
        """Load the SigLIP model on the first call; later calls are no-ops."""
        if SigLIPClient._initialized:
            return

        # Placeholders so the attributes exist even while loading is in flight.
        self.model = None
        self.preprocess = None
        self.device = None
        self._load_model()
        SigLIPClient._initialized = True

    def _load_model(self):
        """Import torch/open_clip and load SigLIP weights onto the best device."""
        try:
            import torch
            import open_clip

            print("🔄 Loading SigLIP model (ViT-B-16-SigLIP)...")

            # create_model_and_transforms returns (model, train_tf, eval_tf);
            # only the eval-time preprocessing transform is kept here.
            self.model, _, self.preprocess = open_clip.create_model_and_transforms(
                "ViT-B-16-SigLIP", pretrained="webli"
            )
            self.model.eval()

            # Prefer GPU when one is visible, otherwise fall back to CPU.
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            self.model.to(self.device)

            print(f"✅ SigLIP model loaded on {self.device}")

        except ImportError as e:
            print(f"⚠️ SigLIP dependencies not installed: {e}")
            print(" Install with: pip install torch open_clip_torch pillow")
            raise
        except Exception as e:
            print(f"❌ Failed to load SigLIP model: {e}")
            raise

    def embed_image(self, image: Image.Image) -> np.ndarray:
        """
        Generate embedding for a PIL Image.

        Args:
            image: PIL Image object

        Returns:
            Normalized 768-dim embedding vector
        """
        import torch

        # SigLIP preprocessing expects 3-channel RGB input.
        rgb = image if image.mode == 'RGB' else image.convert('RGB')

        # Add a batch dimension and move the tensor onto the model's device.
        batch = self.preprocess(rgb).unsqueeze(0).to(self.device)

        with torch.no_grad():
            features = self.model.encode_image(batch)
            # L2-normalize so cosine similarity reduces to a dot product.
            features = features / features.norm(dim=-1, keepdim=True)

        # Strip the batch dimension and hand back a host-side numpy vector.
        return features.cpu().numpy()[0]

    def embed_image_bytes(self, image_bytes: bytes) -> np.ndarray:
        """
        Generate embedding from raw image bytes.

        Args:
            image_bytes: Raw image bytes (JPEG, PNG, etc.)

        Returns:
            Normalized 768-dim embedding vector
        """
        buffer = io.BytesIO(image_bytes)
        decoded = Image.open(buffer).convert('RGB')
        return self.embed_image(decoded)

    def embed_image_url(self, image_url: str) -> Optional[np.ndarray]:
        """
        Download and embed image from URL.

        Args:
            image_url: URL to image

        Returns:
            Embedding vector or None if failed
        """
        import httpx

        # Best-effort by design: any download or decode failure is reported
        # and converted into a None return rather than propagating.
        try:
            resp = httpx.get(image_url, timeout=30.0)
            resp.raise_for_status()
            return self.embed_image_bytes(resp.content)
        except Exception as e:
            print(f"⚠️ Failed to embed image from URL: {e}")
            return None

    @property
    def is_loaded(self) -> bool:
        """Check if model is loaded."""
        return self.model is not None
|
|
|
|
|
|
|
|
|
|
|
# Process-wide cached instance backing get_siglip_client(); None until first use.
_siglip_client: Optional[SigLIPClient] = None
|
|
|
|
|
|
|
|
def get_siglip_client() -> SigLIPClient:
    """Return the module-level SigLIP client, constructing it on first call."""
    global _siglip_client
    if _siglip_client is not None:
        return _siglip_client
    _siglip_client = SigLIPClient()
    return _siglip_client
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|