Spaces:
Sleeping
Sleeping
Create helper/create_embeddings.py
Browse files- helper/create_embeddings.py +93 -0
helper/create_embeddings.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pinecone
|
| 2 |
+
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
| 3 |
+
import os
|
| 4 |
+
import uuid
|
| 5 |
+
from pymongo import MongoClient
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
# Load variables from a .env file (if one is present) into the environment
# before any of the settings below are read.
load_dotenv()

# --- Settings, all taken from environment variables -------------------------
FLASH_API = os.environ.get("FLASH_API")
PINECONE_API = os.environ.get("PINECONE_API")
PINECONE_INDEX = os.environ.get("PINECONE_INDEX")
MONGO_URI = os.environ.get("MONGO_URI")
DB_NAME = os.environ.get("DB_NAME")
COLLECTION_NAME = os.environ.get("COLLECTION_NAME")

# --- Shared clients ---------------------------------------------------------
# Embedding model used to turn document descriptions into vectors.
google_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=FLASH_API,
)

# Pinecone client, authenticated with the API key read above.
pc = pinecone.Pinecone(api_key=PINECONE_API)

# MongoDB collection holding the source documents.
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

# Handle to the Pinecone index named by PINECONE_INDEX; the index is not
# created here, so it must already exist.
index = pc.Index(PINECONE_INDEX)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def create_embedding(object_url, tags, categories):
    """Embed a stored document's description and upsert it into Pinecone.

    Looks up the MongoDB document whose ``object_url`` field equals
    *object_url*, embeds its ``description`` with ``google_embeddings``,
    upserts the vector (with descriptive metadata) into the Pinecone
    ``index`` and records the outcome on the MongoDB document via the
    ``pinecone_id`` and ``successfully_embedding_created`` fields.

    Args:
        object_url: Value of the ``object_url`` field identifying the document.
        tags: Iterable of strings; stored comma-joined in the vector metadata.
        categories: Iterable of strings; stored comma-joined in the metadata.

    Returns:
        True if the vector was upserted and the document updated, False if
        any step failed. Failures are printed, not raised.
    """
    # Bound before the try block so the error handler can always tell whether
    # a document was actually loaded.
    document = None
    try:
        document = collection.find_one({'object_url': object_url})
        if document is None:
            raise LookupError(f"No document found for object_url: {object_url}")

        content = document.get("description")
        if content is None:
            raise ValueError(f"Document for {object_url} has no description to embed")
        file_type = document.get("type")
        # ObjectId is converted to a string so it can be stored as metadata.
        mongo_id = str(document.get('_id'))

        # Generate the embedding
        embedding = google_embeddings.embed_query(content)

        # Generate a unique ID for Pinecone
        pinecone_id = str(uuid.uuid4())

        # Prepare the vector with metadata
        vectors = [{
            'id': pinecone_id,
            'values': embedding,
            'metadata': {
                'description': content,
                "url": object_url,
                "filetype": file_type,
                "mongo_id": mongo_id,  # Store MongoDB ID in metadata
                "tags": ','.join(tags),
                "categories": ','.join(categories),
            },
        }]

        # Upsert the vector to Pinecone
        index.upsert(vectors)
        print(f"Inserted: {object_url} in Pinecone with MongoDB ID in metadata")

        # Update MongoDB document with Pinecone ID and success status
        collection.update_one(
            {"_id": document["_id"]},
            {"$set": {
                "pinecone_id": pinecone_id,
                "successfully_embedding_created": True,
            }},
        )

        return True
    except Exception as e:
        print(f"Error occurred: {e}")

        # Only record the failure when a document was actually loaded; if the
        # lookup itself failed or matched nothing there is nothing to update.
        if document is not None:
            try:
                collection.update_one(
                    {"_id": document["_id"]},
                    {"$set": {
                        "successfully_embedding_created": False,
                    }},
                )
            except Exception as update_error:
                # Best effort: a failing status write must not mask the
                # original error or break the boolean return contract.
                print(f"Could not record failure status: {update_error}")

        return False
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
|