File size: 2,252 Bytes
632e4ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a2e6ad
632e4ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a2e6ad
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import sys
# Append the "src" directory found two levels above this file's directory to
# the module search path.
# NOTE(review): presumably this is what makes the `backend` and `utils`
# imports below resolvable — confirm against the repository layout.
src_directory = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..", "src"))
sys.path.append(src_directory)
from pinecone import Pinecone, ServerlessSpec

import time
from transformers import AutoProcessor ,CLIPModel
from backend import dataset
from utils import logger
from dotenv import load_dotenv

# Rebind the name `logger` from the imported `utils.logger` module to the
# object returned by its `get_logger()`. After this line the module itself is
# no longer reachable under the name `logger` in this file.
logger = logger.get_logger()

# Load the CLIP checkpoint and its matching processor once, at import time,
# so the functions in this module share a single instance of each.
# Both are loaded from the same "openai/clip-vit-base-patch32" identifier.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")


def create_index():
    """Return a handle to the "image-search" Pinecone index, creating it if needed.

    Environment variables are loaded via ``load_dotenv()`` and the client is
    built from ``PINECONE_API_KEY``. When the index does not exist yet, it is
    created as a serverless index (AWS, us-east-1) with 512 dimensions and the
    cosine metric, and this function blocks, polling once per second, until
    the index reports itself ready. There is no upper bound on that wait.

    NOTE(review): 512 presumably matches the embedding width of the CLIP
    checkpoint loaded in this module — confirm before changing either.

    Returns:
        The object returned by ``Pinecone.Index("image-search")``.
    """
    load_dotenv()
    client = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

    name = "image-search"

    # Fast path: the index already exists, just hand back a handle to it.
    if client.has_index(name):
        return client.Index(name)

    client.create_index(
        name=name,
        dimension=512,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

    # Creation is asynchronous on the service side; wait until the index
    # status reports "ready" before returning a handle to it.
    while not client.describe_index(name).status.get("ready", False):
        time.sleep(1)

    return client.Index(name)
    
def add_data_to_database(data_frame):
    """Embed every image referenced in *data_frame* and upsert it into Pinecone.

    For each row, the image at ``photo_image_url`` is fetched through
    ``dataset.get_image_from_url``, encoded with the module-level CLIP
    ``processor`` / ``model`` pair, and written to the index returned by
    ``create_index()`` under the ``"image-search-dataset"`` namespace. One
    upsert request is issued per row.

    Args:
        data_frame: Object whose ``iterrows()`` yields ``(label, row)`` pairs
            (presumably a pandas ``DataFrame``). Each row must provide the
            keys ``"photo_image_url"`` and ``"photo_id"``.

    Returns:
        None. The function is executed for its side effects (index writes
        and log messages).
    """
    unsplash_index = create_index()
    for _, row in data_frame.iterrows():
        logger.info("Adding embedding")

        # Read both row fields up front so a missing column fails before any
        # network fetch or model inference is attempted.
        url = row["photo_image_url"]
        photo_id = row["photo_id"]

        img = dataset.get_image_from_url(url)
        inputs = processor(images=img, return_tensors="pt")
        image_features = model.get_image_features(**inputs)

        # Detach from the autograd graph, move to host memory and flatten to
        # a plain Python list, which is what the upsert payload carries.
        embedding = image_features.detach().cpu().numpy().flatten().tolist()

        unsplash_index.upsert(
            vectors=[
                {
                    "id": photo_id,
                    "values": embedding,
                    "metadata": {
                        "url": url,
                        "photo_id": photo_id,
                    },
                }
            ],
            namespace="image-search-dataset",
        )
        logger.info("Successfully added image to Pinecone index.")

# Example usage (uncomment to run the ingestion manually):
# df = dataset.get_df(3200,3500)
# add_data_to_database(df)