akshansh36 committed on
Commit
b9500c0
·
verified ·
1 Parent(s): 70124e2

Create helper/create_embeddings.py

Browse files
Files changed (1) hide show
  1. helper/create_embeddings.py +93 -0
helper/create_embeddings.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pinecone
2
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
3
+ import os
4
+ import uuid
5
+ from pymongo import MongoClient
6
+ from dotenv import load_dotenv
7
# Load environment variables via python-dotenv before any of them are read.
load_dotenv()

# --- Configuration read from the environment -------------------------------
FLASH_API = os.getenv("FLASH_API")
PINECONE_API = os.getenv("PINECONE_API")
PINECONE_INDEX = os.getenv("PINECONE_INDEX")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

# --- Embedding model --------------------------------------------------------
# Google Generative AI embedding client, authenticated with FLASH_API.
google_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=FLASH_API,
)

# --- Pinecone client --------------------------------------------------------
pc = pinecone.Pinecone(api_key=PINECONE_API)

# --- MongoDB handles --------------------------------------------------------
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

# Handle to the Pinecone index named by PINECONE_INDEX.
# NOTE(review): the index is presumably expected to exist already (it is not
# created here) -- confirm against the Pinecone dashboard.
index = pc.Index(PINECONE_INDEX)
31
+
32
+
33
def create_embedding(object_url, tags, categories) -> bool:
    """Embed a MongoDB document's description and store it in Pinecone.

    Looks up the document whose ``object_url`` field equals *object_url*,
    embeds its ``description`` with ``google_embeddings``, upserts the vector
    (with descriptive metadata) into ``index`` and records the outcome on the
    MongoDB document.

    Args:
        object_url: Value matched against the ``object_url`` field of the
            MongoDB collection.
        tags: Iterable of strings; stored comma-joined in the vector metadata.
        categories: Iterable of strings; stored comma-joined in the vector
            metadata.

    Returns:
        True when the vector was upserted and the document was updated with
        its ``pinecone_id``; False on any failure (including when no document
        matches *object_url*).
    """
    # Bound before the try so the error handler can tell whether a document
    # was actually loaded; otherwise a failed lookup would crash the handler.
    document = None
    try:
        document = collection.find_one({'object_url': object_url})
        if document is None:
            raise LookupError(f"No document found for object_url: {object_url}")

        content = document.get("description")
        file_type = document.get("type")
        # ObjectId is converted to a string so it can be stored in metadata.
        mongo_id = str(document.get('_id'))

        # Generate the embedding
        embedding = google_embeddings.embed_query(content)

        # Generate a unique ID for Pinecone
        pinecone_id = str(uuid.uuid4())

        # Prepare the vector with metadata
        vectors = [{
            'id': pinecone_id,
            'values': embedding,
            'metadata': {
                'description': content,
                "url": object_url,
                "filetype": file_type,
                "mongo_id": mongo_id,  # Store MongoDB ID in metadata
                "tags": ','.join(tags),
                "categories": ','.join(categories),
            },
        }]

        # Upsert the vector to Pinecone
        index.upsert(vectors)
        print(f"Inserted: {object_url} in Pinecone with MongoDB ID in metadata")

        # Update MongoDB document with Pinecone ID and success status
        collection.update_one(
            {"_id": document["_id"]},
            {"$set": {
                "pinecone_id": pinecone_id,
                "successfully_embedding_created": True,
            }},
        )

        return True
    except Exception as e:
        print(f"Error occurred: {e}")

        # Only flag the failure when there is a document to flag; if the
        # lookup itself failed there is nothing to update.
        if document is not None:
            try:
                collection.update_one(
                    {"_id": document["_id"]},
                    {"$set": {
                        "successfully_embedding_created": False,
                    }},
                )
            except Exception as update_error:
                # Recording the failure is best-effort: the caller still
                # gets False rather than an exception from the handler.
                print(f"Error occurred: {update_error}")

        return False
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+