Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import torch | |
| from transformers import CLIPProcessor, CLIPModel | |
@st.cache_resource
def load_clip_model():
    """Load the pretrained CLIP model and its matching processor.

    The function is wrapped in ``st.cache_resource`` so the model and
    processor are created once and then reused, instead of being rebuilt
    every time Streamlit re-executes the script.

    Returns:
        A ``(model, processor)`` tuple holding the ``CLIPModel`` and the
        ``CLIPProcessor`` for the ``openai/clip-vit-base-patch32`` checkpoint.
    """
    # Single source of truth so the model and processor can never drift apart.
    checkpoint = "openai/clip-vit-base-patch32"
    print("Downloading and caching CLIP model...")
    model = CLIPModel.from_pretrained(checkpoint)
    processor = CLIPProcessor.from_pretrained(checkpoint)
    return model, processor
# Module-level handles to the CLIP model and processor; the embedding helpers
# in this file read these globals directly.
(clip_model, clip_processor) = load_clip_model()
def embed_image(pil_image):
    """Return the L2-normalised CLIP embedding of an image as a NumPy array.

    The input is preprocessed by the module-level ``clip_processor``, encoded
    with ``clip_model.vision_model`` and projected through
    ``clip_model.visual_projection``.

    Args:
        pil_image: Value forwarded to ``clip_processor(images=...)``
            (presumably a PIL image, going by the name; confirm with callers).

    Returns:
        The projected features scaled to unit L2 norm along the last axis,
        with all size-1 dimensions squeezed out, as a NumPy array.
    """
    pixel_batch = clip_processor(images=pil_image, return_tensors="pt")
    # Inference only: keep autograd off so the result can be converted to NumPy.
    with torch.no_grad():
        vision_out = clip_model.vision_model(**pixel_batch)
        # Element 1 of the vision model output is used as the pooled vector.
        projected = clip_model.visual_projection(vision_out[1])
        l2_norm = projected.norm(p=2, dim=-1, keepdim=True)
        unit_features = projected / l2_norm
    return unit_features.squeeze().numpy()
def embed_text(text):
    """Return the L2-normalised CLIP embedding of a text input as a NumPy array.

    The input is tokenised by the module-level ``clip_processor`` (padded and
    truncated to at most 77 tokens), encoded with ``clip_model.text_model``
    and projected through ``clip_model.text_projection``.

    Args:
        text: Value forwarded to ``clip_processor(text=...)``.

    Returns:
        The projected features scaled to unit L2 norm along the last axis,
        with all size-1 dimensions squeezed out, as a NumPy array.
    """
    # NOTE(review): 77 is presumably CLIP's text context length; confirm.
    tokenizer_options = {
        "return_tensors": "pt",
        "padding": True,
        "truncation": True,
        "max_length": 77,
    }
    token_batch = clip_processor(text=text, **tokenizer_options)
    # Inference only: keep autograd off so the result can be converted to NumPy.
    with torch.no_grad():
        text_out = clip_model.text_model(**token_batch)
        # Element 1 of the text model output is used as the pooled vector.
        projected = clip_model.text_projection(text_out[1])
        l2_norm = projected.norm(p=2, dim=-1, keepdim=True)
        unit_features = projected / l2_norm
    return unit_features.squeeze().numpy()