# adf-chatbot2/src/search/vector_search.py
# Yannick Lemin
# fixed tests
# e9c64b8
import os
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_core.documents import Document
from src.database.connection import DatabaseConnection
from src.utils.constants import Constants
from src.utils.logging import get_logger
class VectorSearch:
    """
    Class to handle vector search functionality using MongoDB Atlas Vector Search and OpenAI embeddings.

    An instance holds a database connection and a vector store. Release them
    with ``close()`` or by using the instance as a context manager::

        with VectorSearch() as vs:
            hits = vs.search("red running shoes", limit=3)
    """

    def __init__(self):
        """
        Initialize the vector search with OpenAI embeddings and MongoDB Atlas Vector Search.

        Raises:
            ValueError: If OPENAI_API_KEY or MONGODB_URI is not set.
            Exception: Any error raised while creating the database connection,
                the embeddings client or the vector store is logged and re-raised.
        """
        self.logger = get_logger()
        self.logger.info("Initializing vector search")

        # Defined up front so close() is safe even if initialization stops part-way.
        self.db_connection = None
        self.collection = None
        self.vector_search = None

        load_dotenv(override=True)

        self.openai_api_key = os.getenv("OPENAI_API_KEY")
        if not self.openai_api_key:
            self.logger.error("OPENAI_API_KEY environment variable is not set")
            raise ValueError("OPENAI_API_KEY environment variable is not set")

        # Fail fast: a missing URI would otherwise be handed to the driver as
        # None instead of being reported as a configuration error.
        mongodb_uri = os.getenv("MONGODB_URI")
        if not mongodb_uri:
            self.logger.error("MONGODB_URI environment variable is not set")
            raise ValueError("MONGODB_URI environment variable is not set")

        try:
            self.db_connection = DatabaseConnection()
            self.collection = self.db_connection.get_collection()

            self.logger.info(f"Initializing OpenAI embeddings with model: {Constants.EMBEDDING_MODEL}")
            self.embeddings = OpenAIEmbeddings(
                model=Constants.EMBEDDING_MODEL,
                openai_api_key=self.openai_api_key,
            )

            self.logger.info(f"Initializing MongoDB Atlas Vector Search with index: {Constants.ATLAS_VECTOR_SEARCH_INDEX_NAME}")
            self.vector_search = MongoDBAtlasVectorSearch.from_connection_string(
                connection_string=mongodb_uri,
                namespace=f"{Constants.DB_NAME}.{Constants.COLLECTION_NAME}",
                embedding=self.embeddings,
                index_name=Constants.ATLAS_VECTOR_SEARCH_INDEX_NAME,
                text_key="text",
            )
            self.logger.info("Vector search initialized successfully")
        except Exception as e:
            self.logger.error(f"Failed to initialize vector search: {str(e)}")
            # No instance reaches the caller on failure, so release whatever
            # was already opened here; close() never raises.
            self.close()
            raise

    def __enter__(self):
        """Return the instance itself so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release resources on leaving the ``with`` block; exceptions are not suppressed."""
        self.close()
        return False

    def search(self, query: str, limit: int = 5) -> list:
        """
        Search for documents similar to the query.

        Args:
            query (str): The search query.
            limit (int, optional): Maximum number of results to return. Defaults to 5.

        Returns:
            list: List of result dictionaries as produced by ``_format_results``.
                Empty when the query is empty/blank or ``limit`` is below 1.

        Raises:
            Exception: Any error from the vector store or from result
                formatting is logged and re-raised.
        """
        self.logger.info(f"Performing vector search with query: '{query}' (limit: {limit})")
        try:
            # Nothing meaningful can be embedded or returned in these cases,
            # so skip the embedding and database round-trips entirely.
            if not query or not query.strip() or limit < 1:
                self.logger.info("Empty query or non-positive limit; returning no results")
                return []

            results = self.vector_search.similarity_search(query, k=limit)
            self.logger.info(f"Vector search returned {len(results)} results")
            return self._format_results(results)
        except Exception as e:
            self.logger.error(f"Error during vector search: {str(e)}")
            raise

    def _format_results(self, results) -> list:
        """
        Format the search results into a standardized format.

        Args:
            results (list): List of Document objects from the vector search.

        Returns:
            list: One dictionary per document with the keys ``id``, ``title``,
                ``text``, ``price``, ``thumbnail`` and ``product_page_url``.
                Missing metadata fields default to an empty string; a missing
                or empty thumbnail is replaced by the placeholder image URL.

        Raises:
            Exception: Any error while reading a document is logged and re-raised.
        """
        self.logger.debug(f"Formatting {len(results)} search results")
        try:
            formatted_results = []
            for doc in results:
                # Tolerate documents that carry no metadata at all.
                metadata = doc.metadata or {}
                thumbnail = metadata.get("thumbnail") or Constants.PLACEHOLDER_IMAGE_URL
                formatted_results.append({
                    "id": metadata.get("id", ""),
                    "title": metadata.get("title", ""),
                    "text": doc.page_content,
                    "price": metadata.get("price", ""),
                    "thumbnail": thumbnail,
                    "product_page_url": metadata.get("product_page_url", ""),
                })
            self.logger.debug("Results formatted successfully")
            return formatted_results
        except Exception as e:
            self.logger.error(f"Error formatting search results: {str(e)}")
            raise

    def close(self):
        """
        Close the vector store (when it supports it) and the database connection.

        This is best-effort: errors are logged and never raised, and it is
        safe to call on a partially initialized instance.
        """
        self.logger.info("Closing vector search resources")
        failed = False

        # The vector store was built from a connection string and so holds a
        # client of its own; release it when the installed version offers close().
        close_store = getattr(getattr(self, "vector_search", None), "close", None)
        if callable(close_store):
            try:
                close_store()
            except Exception as e:
                failed = True
                self.logger.error(f"Error closing vector search resources: {str(e)}")

        connection = getattr(self, "db_connection", None)
        if connection is not None:
            try:
                connection.close_connection()
            except Exception as e:
                failed = True
                self.logger.error(f"Error closing vector search resources: {str(e)}")

        if not failed:
            self.logger.info("Vector search resources closed successfully")