Zero-Shot Image Classification
OpenCLIP
ONNX
Safetensors
Transformers.js
Transformers
English
clip
e-commerce
fashion
multimodal retrieval
custom_code
Instructions to use Findle/marqo-fashionCLIP with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- OpenCLIP
How to use Findle/marqo-fashionCLIP with OpenCLIP:
```python
import open_clip

model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms('hf-hub:Findle/marqo-fashionCLIP')
tokenizer = open_clip.get_tokenizer('hf-hub:Findle/marqo-fashionCLIP')
```
- Transformers.js
How to use Findle/marqo-fashionCLIP with Transformers.js:
```js
// npm i @huggingface/transformers
import { pipeline } from '@huggingface/transformers';

// Allocate pipeline
const pipe = await pipeline('zero-shot-image-classification', 'Findle/marqo-fashionCLIP');
```
- Transformers
How to use Findle/marqo-fashionCLIP with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("zero-shot-image-classification", model="Findle/marqo-fashionCLIP", trust_remote_code=True)
pipe(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/parrots.png",
    candidate_labels=["animals", "humans", "landscape"],
)

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("Findle/marqo-fashionCLIP", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| from typing import Dict, Any | |
| from PIL import Image | |
| import open_clip | |
| import torch | |
| import base64 | |
| import io | |
| import os | |
| import requests | |
class EndpointHandler:
    """Endpoint handler that embeds one image or one text query with OpenCLIP.

    The handler loads a ``ViT-B-16`` OpenCLIP model from local weight files
    and, when called, returns a single embedding as a plain Python list.
    The ``inputs`` string is treated as an image when it is an http(s) URL,
    a ``data:image/...`` URI, or base64 data whose decoded header carries a
    known image signature; every other string is embedded as text.
    """

    # Weight files looked up inside the model directory, in order of preference.
    _WEIGHT_FILES = ("open_clip_model.safetensors", "open_clip_pytorch_model.bin")
    _URL_PREFIXES = ("http://", "https://")
    _DATA_URI_PREFIX = "data:image/"
    # Leading bytes of PNG, JPEG and GIF files (WebP is checked separately
    # because its marker is split around a length field).
    _IMAGE_SIGNATURES = (
        b"\x89PNG\r\n\x1a\n",
        b"\xff\xd8\xff",
        b"GIF87a",
        b"GIF89a",
    )
    # 16 base64 characters decode to 12 bytes, enough for every signature above.
    _BASE64_PROBE_CHARS = 16

    def __init__(self, path: str = ""):
        """Load the model, preprocessing transform and tokenizer.

        Args:
            path: Directory containing the OpenCLIP weight file. An empty
                string means the current working directory.

        Raises:
            RuntimeError: If none of the expected weight files exists in
                ``path``.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # os.path.join keeps an empty ``path`` relative; plain "{path}/file"
        # formatting would silently point at the filesystem root instead.
        candidates = (os.path.join(path, name) for name in self._WEIGHT_FILES)
        pretrained = next((c for c in candidates if os.path.exists(c)), None)
        if pretrained is None:
            raise RuntimeError(f"No open_clip weights found in {path}")

        # The middle return value (training transform) is not needed here.
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
            "ViT-B-16",
            pretrained=pretrained,
        )
        self.tokenizer = open_clip.get_tokenizer("ViT-B-16")
        self.model = self.model.to(self.device)
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> list:
        """Embed the value stored under ``data["inputs"]``.

        Args:
            data: Request payload; ``data["inputs"]`` must be a non-empty
                string holding an image URL, a base64 image (optionally as a
                ``data:image/...`` URI), or free text.

        Returns:
            The embedding of the image or text as a list.

        Raises:
            ValueError: If ``inputs`` is missing, empty, or not a string.
        """
        inputs = data.get("inputs")
        if not inputs:
            raise ValueError("'inputs' is required — pass an image URL, base64 string, or text")
        if not isinstance(inputs, str):
            # Fail with a clear message instead of an AttributeError further down.
            raise ValueError(
                f"'inputs' must be a string, got {type(inputs).__name__}"
            )
        if self._is_image(inputs):
            return self._embed_image(inputs)
        return self._embed_text(inputs)

    def _is_image(self, source: str) -> bool:
        """Return True when ``source`` refers to an image rather than text."""
        if source.startswith(self._URL_PREFIXES):
            return True
        if source.startswith(self._DATA_URI_PREFIX):
            return True
        return self._looks_like_base64_image(source)

    @classmethod
    def _looks_like_base64_image(cls, source: str) -> bool:
        """Return True when ``source`` is base64 that starts with an image signature.

        Only the first few characters are decoded, so ordinary text queries
        (which contain spaces, are not valid base64, or decode to bytes with
        no image signature) are rejected cheaply.
        """
        probe = source.lstrip()[: cls._BASE64_PROBE_CHARS]
        try:
            header = base64.b64decode(probe, validate=True)
        except ValueError:
            # binascii.Error (bad alphabet/padding) and the non-ASCII error
            # are both ValueError subclasses: not base64, so not an image.
            return False
        if header.startswith(cls._IMAGE_SIGNATURES):
            return True
        # WebP: "RIFF" <4-byte size> "WEBP".
        return header[:4] == b"RIFF" and header[8:12] == b"WEBP"

    def _embed_image(self, source: str) -> list:
        """Load the image behind ``source`` and return its embedding as a list."""
        image = self._load_image(source)
        # unsqueeze(0) adds the leading batch dimension for a single image.
        pixel_values = self.preprocess(image).unsqueeze(0).to(self.device)
        with torch.no_grad():
            # normalize=True asks open_clip for normalized features.
            features = self.model.encode_image(pixel_values, normalize=True)
        return features[0].tolist()

    def _embed_text(self, text: str) -> list:
        """Tokenize ``text`` and return its embedding as a list."""
        tokens = self.tokenizer([text]).to(self.device)
        with torch.no_grad():
            features = self.model.encode_text(tokens, normalize=True)
        return features[0].tolist()

    def _load_image(self, source: str) -> "Image.Image":
        """Fetch or decode ``source`` into an RGB PIL image.

        Args:
            source: An http(s) URL, a ``data:image/...;base64,`` URI, or a
                bare base64 string.

        Raises:
            ValueError: If base64 data cannot be decoded into an image.
            requests.HTTPError: If a URL download returns an error status
                (raised by ``raise_for_status``).
        """
        if source.startswith(self._URL_PREFIXES):
            # NOTE(review): the URL is caller-supplied and fetched as-is;
            # consider restricting hosts / response size if this endpoint is
            # exposed to untrusted clients.
            response = requests.get(source, timeout=10)
            response.raise_for_status()
            return Image.open(io.BytesIO(response.content)).convert("RGB")

        payload = source
        if source.startswith(self._DATA_URI_PREFIX):
            # Keep only the data that follows the "data:image/...;base64," header.
            payload = source.partition(",")[2]
        try:
            image_bytes = base64.b64decode(payload)
            return Image.open(io.BytesIO(image_bytes)).convert("RGB")
        except Exception as e:
            raise ValueError(f"Could not load image from input: {e}") from e