|
|
# Zero-shot image/text matching with SigLIP2.
#
# Downloads one COCO validation image, scores it against a small set of
# candidate captions, and prints the probability for the first caption.

from io import BytesIO

import httpx
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModel

# Single source of truth so the model and processor always come from the
# same checkpoint.
CHECKPOINT = "google/siglip2-base-patch16-224"

model = AutoModel.from_pretrained(CHECKPOINT)
processor = AutoProcessor.from_pretrained(CHECKPOINT)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"

# Fetch the image bytes. Redirects are followed and HTTP errors are raised
# here, so a failed download is reported as an HTTP problem instead of an
# opaque "cannot identify image file" error from PIL.
with httpx.stream("GET", url, follow_redirects=True) as response:
    response.raise_for_status()
    image = Image.open(BytesIO(response.read()))

texts = ["a photo of 2 cats", "a photo of 2 dogs"]

# The SigLIP2 documentation states that text must be padded with
# padding="max_length" and max_length=64, matching how the model was trained.
inputs = processor(
    text=texts,
    images=image,
    padding="max_length",
    max_length=64,
    return_tensors="pt",
)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    outputs = model(**inputs)

logits_per_image = outputs.logits_per_image

# A sigmoid (not a softmax) is applied to the logits, so each image/text
# pair gets its own independent probability.
probs = torch.sigmoid(logits_per_image)

print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
|
|
|