import streamlit as st
import torch
from transformers import CLIPProcessor, CLIPModel

# Hugging Face Hub id of the checkpoint used for both the model and processor.
_CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"


@st.cache_resource
def load_clip_model():
    """Load the pretrained CLIP model together with its processor.

    Decorated with ``st.cache_resource`` so the (model, processor) pair is
    created once and reused by Streamlit instead of being reloaded on every
    script rerun.

    Returns:
        A ``(model, processor)`` tuple: the ``CLIPModel`` and the matching
        ``CLIPProcessor``, both loaded from the same checkpoint.
    """
    print("Downloading and caching CLIP model...")
    model = CLIPModel.from_pretrained(_CLIP_CHECKPOINT)
    processor = CLIPProcessor.from_pretrained(_CLIP_CHECKPOINT)
    return model, processor


# Loaded at import time so the embedding helpers below can share one model
# and one processor; the cached loader makes repeated imports/reruns cheap.
clip_model, clip_processor = load_clip_model()


def _unit_normalize_to_numpy(features):
    """L2-normalise *features* along the last axis and return a NumPy array.

    All size-1 dimensions are squeezed away before conversion, so a batch
    containing a single item comes back without its batch axis.
    """
    unit_features = features / features.norm(p=2, dim=-1, keepdim=True)
    return unit_features.squeeze().numpy()


def embed_image(pil_image):
    """Embed an image with CLIP's vision tower and visual projection.

    Args:
        pil_image: Image input forwarded to the CLIP processor as ``images``
            (presumably a ``PIL.Image`` or a list of them -- confirm against
            callers).

    Returns:
        A NumPy array holding the L2-normalised projected image embedding,
        with size-1 dimensions removed.
    """
    inputs = clip_processor(images=pil_image, return_tensors="pt")

    # Inference only: skip autograd bookkeeping so the result can be
    # converted to NumPy directly.
    with torch.no_grad():
        vision_outputs = clip_model.vision_model(**inputs)
        # Named access instead of positional ``outputs[1]``.
        pooled_output = vision_outputs.pooler_output
        image_features = clip_model.visual_projection(pooled_output)
        return _unit_normalize_to_numpy(image_features)


def embed_text(text):
    """Embed text with CLIP's text tower and text projection.

    Args:
        text: Text input forwarded to the CLIP processor as ``text``
            (presumably a string or a list of strings -- confirm against
            callers). Inputs are padded and truncated to at most 77 tokens.

    Returns:
        A NumPy array holding the L2-normalised projected text embedding,
        with size-1 dimensions removed.
    """
    inputs = clip_processor(
        text=text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=77,
    )

    # Inference only: skip autograd bookkeeping so the result can be
    # converted to NumPy directly.
    with torch.no_grad():
        text_outputs = clip_model.text_model(**inputs)
        # Named access instead of positional ``outputs[1]``.
        pooled_output = text_outputs.pooler_output
        text_features = clip_model.text_projection(pooled_output)
        return _unit_normalize_to_numpy(text_features)