DEM-SigLIP-2: Local Scale Equivariance for SigLIP-2

Fine-tuned SigLIP-2 model with Deep Equilibrium Models (DEM) for improved local scale equivariance and feature robustness.

Usage

import os
import sys
from io import BytesIO

import numpy as np
import requests
import torch
from huggingface_hub import snapshot_download
from PIL import Image
from torchvision import transforms

# Helper function to load the model from HuggingFace Hub
def load_dem_siglip2(repo_name="ashiq24/siglip2-base-lse", device='cpu'):
    """Load the DEM-SigLIP-2 model from the HuggingFace Hub.

    Args:
        repo_name: Hub repository id that contains both the checkpoint
            (``model_best.safetensors``) and the custom modeling code.
        device: Device string forwarded to the repo's loader
            (default ``'cpu'``; previously hard-coded).

    Returns:
        The model object produced by ``load_dem_siglip2_model``.
    """
    # Download all model files (checkpoint + custom modeling code).
    cache_dir = snapshot_download(repo_id=repo_name)

    # The repo ships its own modeling module; make it importable.
    if cache_dir not in sys.path:
        sys.path.insert(0, cache_dir)

    # Imported lazily on purpose: the module only exists on disk after
    # the snapshot download above has completed.
    from modeling_siglip2_dem import load_dem_siglip2_model

    # Build the checkpoint path portably instead of f-string concatenation.
    checkpoint_path = os.path.join(cache_dir, "model_best.safetensors")
    return load_dem_siglip2_model(checkpoint_path=checkpoint_path, device=device)

# Load model
model = load_dem_siglip2("ashiq24/siglip2-base-lse")
model.eval()

# Move to GPU if available.
# Fix: the original hard-coded device = 'cpu' while the comment promised
# GPU placement; actually check CUDA availability.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# Prepare image preprocessing.
# NOTE(review): these are the standard ImageNet statistics; stock SigLIP
# checkpoints normalize with mean/std = 0.5 — confirm which statistics the
# fine-tune was trained with before changing them.
transform = transforms.Compose([
    transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load and preprocess image
# Option 1: Load image from a URL
image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
# Fix: add a timeout (avoids hanging forever) and fail fast on HTTP errors
# instead of handing a non-image error body to PIL.
response = requests.get(image_url, timeout=30)
response.raise_for_status()
image = Image.open(BytesIO(response.content)).convert("RGB")

# Option 2: Load a local image (uncomment and replace 'your_image.jpg' with your image path)
# image = Image.open('your_image.jpg').convert("RGB")

# Batch dimension of 1, moved to the same device as the model.
pixel_values = transform(image).unsqueeze(0).to(device)

# Run inference without building autograd graphs.
with torch.no_grad():
    embeddings = model(pixel_values)  # shape: (1, 768)

print(f"Embedding shape: {embeddings.shape}")

📚 Citation

@inproceedings{rahman2025local,
  title={Local Scale Equivariance with Latent Deep Equilibrium Canonicalizer},
  author={Rahman, Md Ashiqur and Yang, Chiao-An and Cheng, Michael N and Hao, Lim Jun and Jiang, Jeremiah and Lim, Teck-Yian and Yeh, Raymond A},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={10527--10537},
  year={2025}
}
Downloads last month
23
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Dataset used to train ashiq24/siglip2-base-lse