""" SigLIP 2 Text & Image Encoder -- HuggingFace Space Encodes text or image queries to 768-dim vectors for the Epstein photo search. Model: google/siglip2-base-patch16-224 """ import gradio as gr import torch import torch.nn.functional as F from PIL import Image from transformers import AutoModel, AutoTokenizer, AutoProcessor MODEL_NAME = "google/siglip2-base-patch16-224" print(f"Loading {MODEL_NAME}...") model = AutoModel.from_pretrained(MODEL_NAME).eval() tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) processor = AutoProcessor.from_pretrained(MODEL_NAME) print(f"Model loaded. Text hidden size: {model.config.text_config.hidden_size}") def encode(text: str) -> list: inputs = tokenizer([text], return_tensors="pt", padding="max_length", max_length=64, truncation=True) with torch.no_grad(): feats = model.text_model(**inputs).pooler_output feats = F.normalize(feats, dim=-1) return feats[0].tolist() def encode_image(image) -> list: if image is None: raise gr.Error("No image provided") # Gradio 6.x base64 shortcut returns RGBA — SigLIP needs RGB if isinstance(image, Image.Image): image = image.convert("RGB") elif isinstance(image, str): image = Image.open(image).convert("RGB") else: raise gr.Error(f"Unexpected image type: {type(image)}") inputs = processor(images=[image], return_tensors="pt") with torch.no_grad(): feats = model.get_image_features(pixel_values=inputs["pixel_values"]) if not isinstance(feats, torch.Tensor): feats = feats.pooler_output feats = F.normalize(feats, dim=-1) return feats[0].tolist() with gr.Blocks(title="SigLIP 2 Encoder") as demo: gr.Markdown("# SigLIP 2 Encoder\nEncodes text or images to 768-dim normalized vectors using google/siglip2-base-patch16-224") with gr.Tab("Text"): text_input = gr.Textbox(label="Text") text_output = gr.JSON(label="Embedding (768-dim)") text_btn = gr.Button("Encode Text") text_btn.click(fn=encode, inputs=text_input, outputs=text_output, api_name="encode") with gr.Tab("Image"): image_input = gr.Image(type="pil", label="Image") image_output = gr.JSON(label="Embedding (768-dim)") image_btn = gr.Button("Encode Image") image_btn.click(fn=encode_image, inputs=image_input, outputs=image_output, api_name="encode_image") demo.launch()