"""Build a Pinecone image-search index from Unsplash photos using CLIP embeddings.

Embeds each photo with openai/clip-vit-base-patch32 (512-d vectors) and
upserts the vectors into a serverless Pinecone index named "image-search".
"""

import os
import sys
import time

# Make the project's src/ directory importable when running this file directly.
src_directory = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..", "src"))
sys.path.append(src_directory)

from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from transformers import AutoProcessor, CLIPModel

from backend import dataset
from utils import logger as logger_module

# NOTE: the original rebound the imported module name; keep the public
# module-level name `logger` but avoid shadowing the import itself.
logger = logger_module.get_logger()

# Loaded once at import time; CLIP ViT-B/32 produces 512-dim image features,
# matching the index dimension below.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")


def create_index():
    """Return a handle to the "image-search" Pinecone index, creating it if needed.

    Reads PINECONE_API_KEY from the environment (after loading .env). When the
    index must be created, polls once per second until Pinecone reports it ready.

    Returns:
        pinecone.Index: handle to the (possibly newly created) index.
    """
    load_dotenv()
    api_key = os.environ.get("PINECONE_API_KEY")
    pc = Pinecone(api_key=api_key)

    index_name = "image-search"
    dimension = 512  # must match CLIP ViT-B/32 feature size
    metric = "cosine"

    if not pc.has_index(index_name):
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
        # Wait for the serverless index to finish provisioning.
        while not pc.describe_index(index_name).status.get("ready", False):
            time.sleep(1)

    return pc.Index(index_name)


def add_data_to_database(data_frame):
    """Embed every photo in *data_frame* with CLIP and upsert it into Pinecone.

    Args:
        data_frame: pandas DataFrame with at least the columns
            "photo_image_url" and "photo_id".
    """
    unsplash_index = create_index()

    for _, data in data_frame.iterrows():
        logger.info("Adding embedding")
        url = data["photo_image_url"]
        photo_id = data["photo_id"]  # renamed from `id` to avoid shadowing the builtin
        img = dataset.get_image_from_url(url)

        inputs = processor(images=img, return_tensors="pt")
        image_features = model.get_image_features(**inputs)
        # Flatten the (1, 512) tensor into a plain list of floats for Pinecone.
        embeddings = image_features.detach().cpu().numpy().flatten().tolist()

        unsplash_index.upsert(
            vectors=[
                {
                    "id": photo_id,
                    "values": embeddings,
                    "metadata": {"url": url, "photo_id": photo_id},
                }
            ],
            namespace="image-search-dataset",
        )
        logger.info("Successfully added image to Pinecone index.")


# Example driver (left disabled, as in the original):
# df = dataset.get_df(3200, 3500)
# add_data_to_database(df)