import numpy as np
import open_clip
import torch
from PIL import Image


class FashionEncoder:
    """Encodes images, text, or fused (image + text modifier) queries."""

    MODEL_NAME = "hf-hub:Marqo/marqo-fashionSigLIP"

    def __init__(self):
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
            self.MODEL_NAME
        )
        self.tokenizer = open_clip.get_tokenizer(self.MODEL_NAME)
        self.model.eval()

    def encode_image(self, image):
        """Encode a single PIL image -> normalised 768-dim vector."""
        x = self.preprocess(image).unsqueeze(0)
        # no_grad: eval() alone doesn't disable autograd, and .numpy()
        # fails on a tensor that still requires grad.
        with torch.no_grad():
            v = self.model.encode_image(x).squeeze().numpy()
        return v / np.linalg.norm(v)

    def encode_images(self, images):
        """Encode a batch of PIL images -> (N, 768) normalised array."""
        tensors = torch.stack([self.preprocess(img) for img in images])
        with torch.no_grad():
            vecs = self.model.encode_image(tensors).numpy()
        norms = np.linalg.norm(vecs, axis=1, keepdims=True)
        return vecs / norms

    def encode_text(self, text):
        """Encode a text string -> normalised 768-dim vector."""
        tok = self.tokenizer([text])
        with torch.no_grad():
            v = self.model.encode_text(tok).squeeze().numpy()
        return v / np.linalg.norm(v)

    def encode_multimodal(self, image=None, text=None, alpha=0.7):
        """
        Fuse image + text modifier into one query vector via weighted sum.

        alpha = image weight; (1 - alpha) = text weight. If only one
        modality is supplied, it receives the full weight.
        """
        parts = []
        if image is not None:
            parts.append((alpha, self.encode_image(image)))
        if text:
            # Text carries the full weight when no image is supplied.
            weight = (1.0 - alpha) if image is not None else 1.0
            parts.append((weight, self.encode_text(text)))
        if not parts:
            raise ValueError("At least one of image or text must be provided.")
        fused = sum(w * v for w, v in parts)
        # Re-normalise so downstream cosine similarity is a plain dot product.
        return fused / np.linalg.norm(fused)
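

# A minimal usage sketch, not part of the class above: the image paths
# below are hypothetical placeholders for local JPEGs. Because every
# encoder output is unit-normalised, cosine similarity between a query
# and the catalog reduces to a dot product.
if __name__ == "__main__":
    encoder = FashionEncoder()

    # Hypothetical catalog; swap in real image paths.
    paths = ["red_dress.jpg", "blue_jeans.jpg"]
    catalog = [Image.open(p).convert("RGB") for p in paths]
    catalog_vecs = encoder.encode_images(catalog)  # (2, 768), rows unit-norm

    # Fused "this item, but in blue" query: the image anchors the look,
    # the text nudges the attribute, and alpha balances the two.
    query = encoder.encode_multimodal(
        image=Image.open("red_dress.jpg").convert("RGB"),
        text="in blue",
        alpha=0.7,
    )

    scores = catalog_vecs @ query  # cosine similarities, one per catalog item
    best = int(np.argmax(scores))
    print(f"Best match: {paths[best]} (score {scores[best]:.3f})")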