Spaces:
Sleeping
Sleeping
| import os | |
| from dotenv import load_dotenv | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_mongodb import MongoDBAtlasVectorSearch | |
| from langchain_core.documents import Document | |
| from src.database.connection import DatabaseConnection | |
| from src.utils.constants import Constants | |
| from src.utils.logging import get_logger | |
class VectorSearch:
    """
    Class to handle vector search functionality using MongoDB Atlas Vector Search and OpenAI embeddings.

    Instances can be used as context managers; leaving the ``with`` block
    calls :meth:`close`.

    Attributes:
        logger: Logger obtained from ``get_logger()``.
        openai_api_key (str): Value of the ``OPENAI_API_KEY`` environment variable.
        db_connection: ``DatabaseConnection`` released by :meth:`close`.
        collection: Result of ``db_connection.get_collection()``; stored on the
            instance but not used by the methods of this class.
        embeddings: ``OpenAIEmbeddings`` built with ``Constants.EMBEDDING_MODEL``.
        vector_search: ``MongoDBAtlasVectorSearch`` store that answers queries.
    """

    def __init__(self):
        """
        Initialize the vector search with OpenAI embeddings and MongoDB Atlas Vector Search.

        Raises:
            ValueError: If ``OPENAI_API_KEY`` is not set.
            Exception: Any error raised while creating the database
                connection, the embeddings or the vector store is logged and
                re-raised unchanged.
        """
        self.logger = get_logger()
        self.logger.info("Initializing vector search")

        # override=True lets values from the .env file replace variables that
        # are already present in the process environment.
        load_dotenv(override=True)

        self.openai_api_key = os.getenv("OPENAI_API_KEY")
        if not self.openai_api_key:
            self.logger.error("OPENAI_API_KEY environment variable is not set")
            raise ValueError("OPENAI_API_KEY environment variable is not set")

        # Pre-set so the failure path below can tell whether a connection was
        # actually opened before something went wrong.
        self.db_connection = None
        try:
            self.db_connection = DatabaseConnection()
            self.collection = self.db_connection.get_collection()

            self.logger.info(f"Initializing OpenAI embeddings with model: {Constants.EMBEDDING_MODEL}")
            self.embeddings = OpenAIEmbeddings(
                model=Constants.EMBEDDING_MODEL,
                openai_api_key=self.openai_api_key,
            )

            self.logger.info(f"Initializing MongoDB Atlas Vector Search with index: {Constants.ATLAS_VECTOR_SEARCH_INDEX_NAME}")
            # NOTE(review): MONGODB_URI is passed through without a check, and
            # from_connection_string appears to open its own client, separate
            # from self.db_connection - confirm whether that client also needs
            # to be closed in close().
            self.vector_search = MongoDBAtlasVectorSearch.from_connection_string(
                connection_string=os.getenv("MONGODB_URI"),
                namespace=f"{Constants.DB_NAME}.{Constants.COLLECTION_NAME}",
                embedding=self.embeddings,
                index_name=Constants.ATLAS_VECTOR_SEARCH_INDEX_NAME,
                text_key="text",
            )
            self.logger.info("Vector search initialized successfully")
        except Exception as e:
            self.logger.error(f"Failed to initialize vector search: {e}")
            # The caller never receives an instance when construction fails,
            # so nobody else could release a connection opened above.
            if self.db_connection is not None:
                self.close()
            raise

    def __enter__(self):
        """Return this instance so it can be bound by a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close resources on leaving a ``with`` block; never suppresses exceptions."""
        self.close()
        return False

    def search(self, query, limit=5):
        """
        Search for documents similar to the query.

        Args:
            query (str): The search query.
            limit (int, optional): Maximum number of results to return. Defaults to 5.

        Returns:
            list: List of dictionaries as produced by ``_format_results``.

        Raises:
            Exception: Any error from the similarity search or from result
                formatting is logged and re-raised unchanged.
        """
        self.logger.info(f"Performing vector search with query: '{query}' (limit: {limit})")
        try:
            results = self.vector_search.similarity_search(query, k=limit)
            self.logger.info(f"Vector search returned {len(results)} results")
            return self._format_results(results)
        except Exception as e:
            self.logger.error(f"Error during vector search: {e}")
            raise

    def _format_results(self, results):
        """
        Format the search results into a standardized format.

        Args:
            results (list): List of Document objects from the vector search.

        Returns:
            list: One dictionary per document with the keys ``id``, ``title``,
            ``text``, ``price``, ``thumbnail`` and ``product_page_url``.
            Missing metadata fields become ``""``; a missing or empty
            thumbnail becomes ``Constants.PLACEHOLDER_IMAGE_URL``.

        Raises:
            Exception: Any error while reading a document is logged and
                re-raised unchanged.
        """
        self.logger.debug(f"Formatting {len(results)} search results")
        try:
            formatted_results = []
            for doc in results:
                metadata = doc.metadata

                # Only None and the empty string fall back to the placeholder.
                thumbnail = metadata.get("thumbnail")
                if thumbnail == "" or thumbnail is None:
                    thumbnail = Constants.PLACEHOLDER_IMAGE_URL

                formatted_results.append({
                    "id": metadata.get("id", ""),
                    "title": metadata.get("title", ""),
                    "text": doc.page_content,
                    "price": metadata.get("price", ""),
                    "thumbnail": thumbnail,
                    "product_page_url": metadata.get("product_page_url", ""),
                })
            self.logger.debug("Results formatted successfully")
            return formatted_results
        except Exception as e:
            self.logger.error(f"Error formatting search results: {e}")
            raise

    def close(self):
        """
        Close the database connection.

        Best-effort: a failure while closing is logged and not raised. When no
        connection was opened there is nothing to close and the call succeeds.
        """
        self.logger.info("Closing vector search resources")
        try:
            connection = getattr(self, "db_connection", None)
            if connection is not None:
                connection.close_connection()
            self.logger.info("Vector search resources closed successfully")
        except Exception as e:
            self.logger.error(f"Error closing vector search resources: {e}")