DEM-SigLIP-2: Local Scale Equivariance for SigLIP-2
A SigLIP-2 model fine-tuned with Deep Equilibrium Models (DEM) for improved local scale equivariance and feature robustness.
Usage
import torch
import sys
from PIL import Image
from torchvision import transforms
from huggingface_hub import snapshot_download
import numpy as np
import requests
from io import BytesIO
# Helper function to load the model from HuggingFace Hub
def load_dem_siglip2(repo_name="ashiq24/siglip2-base-lse", device="cpu"):
    """Load the DEM-SigLIP-2 model from the HuggingFace Hub.

    Downloads a snapshot of ``repo_name`` (checkpoint + code), makes the
    snapshot directory importable, and builds the model from the
    ``model_best.safetensors`` file inside it.

    Args:
        repo_name: Hub repository id to download.
        device: Device forwarded to the repository's loader. Defaults to
            ``"cpu"``, the value this helper previously hard-coded.

    Returns:
        The object returned by ``load_dem_siglip2_model`` for the checkpoint.
    """
    import os

    # Download all model files (checkpoint + code)
    cache_dir = snapshot_download(repo_id=repo_name)

    # NOTE: the snapshot directory is placed first on sys.path so that the
    # repository's modeling_siglip2_dem module can be imported. This executes
    # code downloaded from the Hub, so only use repositories you trust.
    if cache_dir not in sys.path:
        sys.path.insert(0, cache_dir)

    # Imported here because the module only becomes importable once the
    # snapshot directory is on sys.path.
    from modeling_siglip2_dem import load_dem_siglip2_model

    # Build the checkpoint path with os.path.join instead of string formatting.
    checkpoint_path = os.path.join(cache_dir, "model_best.safetensors")
    return load_dem_siglip2_model(checkpoint_path=checkpoint_path, device=device)
# Load model
model = load_dem_siglip2("ashiq24/siglip2-base-lse")
model.eval()  # switch to evaluation mode for inference

# Move to GPU if available (falls back to CPU otherwise)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
# Prepare image with SigLIP preprocessing
# NOTE(review): the mean/std below look like the usual ImageNet statistics;
# confirm they match the preprocessing this checkpoint was trained with.
preprocessing_steps = [
    # Bicubic resize to 224, then crop the central 224x224 region.
    transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    # Convert the PIL image to a tensor, then normalize each channel.
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(preprocessing_steps)
# Load and preprocess image
# Option 1: Load image from a URL
image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
# Use a timeout so a stalled connection cannot hang the script, and fail
# early on HTTP errors instead of handing an error page to PIL.
response = requests.get(image_url, timeout=30)
response.raise_for_status()
image = Image.open(BytesIO(response.content)).convert("RGB")
# Option 2: Load a local image (uncomment and replace 'your_image.jpg' with your image path)
# image = Image.open('your_image.jpg').convert("RGB")

# Add a leading batch dimension and move the tensor to the model's device.
pixel_values = transform(image).unsqueeze(0).to(device)

# Run inference (gradients are not needed, so disable autograd tracking)
with torch.no_grad():
    embeddings = model(pixel_values)  # shape: (1, 768)
print(f"Embedding shape: {embeddings.shape}")
Citation
@inproceedings{rahman2025local,
title={Local Scale Equivariance with Latent Deep Equilibrium Canonicalizer},
author={Rahman, Md Ashiqur and Yang, Chiao-An and Cheng, Michael N and Hao, Lim Jun and Jiang, Jeremiah and Lim, Teck-Yian and Yeh, Raymond A},
booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
pages={10527--10537},
year={2025}
}
- Downloads last month
- 23
Inference Providers NEW
This model isn't deployed by any Inference Provider. Ask for provider support