# Clip_Search/src/database/pinecone_index.py
import os
import sys
import time

# Make the project's src/ directory importable regardless of the working directory.
src_directory = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..", "src"))
sys.path.append(src_directory)

from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from transformers import AutoProcessor, CLIPModel

from backend import dataset
from utils import logger

logger = logger.get_logger()

# Load the CLIP checkpoint once at import time; its image and text encoders
# both project into the same 512-dimensional embedding space.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
def create_index():
    """Create the 'image-search' Pinecone index if needed and return a handle to it."""
    load_dotenv()
    api_key = os.environ.get("PINECONE_API_KEY")
    pc = Pinecone(api_key=api_key)

    index_name = "image-search"
    dimension = 512  # CLIP ViT-B/32 embedding size
    metric = "cosine"

    if not pc.has_index(index_name):
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )
        # Poll until the serverless index is ready to accept requests.
        while not pc.describe_index(index_name).status.get("ready", False):
            time.sleep(1)

    return pc.Index(index_name)
def add_data_to_database(data_frame):
    """Embed each photo in the DataFrame with CLIP and upsert it into Pinecone."""
    unsplash_index = create_index()
    for _, data in data_frame.iterrows():
        url = data["photo_image_url"]
        photo_id = data["photo_id"]
        logger.info("Adding embedding for photo %s", photo_id)

        # Fetch the image and run it through the CLIP image encoder.
        img = dataset.get_image_from_url(url)
        inputs = processor(images=img, return_tensors="pt")
        image_features = model.get_image_features(**inputs)
        embeddings = image_features.detach().cpu().numpy().flatten().tolist()

        unsplash_index.upsert(
            vectors=[{
                "id": photo_id,
                "values": embeddings,
                "metadata": {
                    "url": url,
                    "photo_id": photo_id
                }
            }],
            namespace="image-search-dataset"
        )
        logger.info("Successfully added image to Pinecone index.")
# Example backfill (uncomment to embed one slice of the dataset):
# df = dataset.get_df(3200, 3500)
# add_data_to_database(df)
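# --- Hedged usage sketch -------------------------------------------------------
# A guarded entry point mirroring the commented-out example above; the
# (3200, 3500) range comes from that example, and search_images() is the
# illustrative helper sketched earlier, not part of the original module.
if __name__ == "__main__":
    df = dataset.get_df(3200, 3500)
    add_data_to_database(df)
    for photo_id, url, score in search_images("a dog playing on the beach"):
        logger.info("match %s (score %.3f): %s", photo_id, score, url)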