Spaces:

Clocksp
/

Insurance-AI

Running

App Files Community

Clocksp committed on Jan 28

Commit

d0f35dc

verified ·

1 Parent(s): e1d7c8a

Update src/utils/vector_store.py

Browse files

Files changed (1) hide show

src/utils/vector_store.py +306 -305

src/utils/vector_store.py CHANGED Viewed

@@ -1,305 +1,306 @@
-from typing import List, Optional, Dict, Any
-from langchain_classic.schema import Document
-from langchain_google_genai import GoogleGenerativeAIEmbeddings
-from langchain_qdrant import QdrantVectorStore
-from qdrant_client import QdrantClient
-from qdrant_client.models import Distance, VectorParams, PointStruct
-from config import Config
-import uuid
-class VectorStoreManager:
-    """Manages Qdrant vector store operations for insurance documents"""
-    def __init__(self):
-        """Initialize Qdrant client and embeddings"""
-        # Validate configuration
-        Config.validate_config()
-        # Get configuration
-        self.qdrant_config = Config.get_qdrant_config()
-        self.retrieval_config = Config.get_retrieval_config()
-        # Initialize Qdrant client
-        self.client = QdrantClient(
-            url=self.qdrant_config["url"],
-            api_key=self.qdrant_config["api_key"],
-        )
-        # Initialize embeddings
-        self.embeddings = GoogleGenerativeAIEmbeddings(
-            model=Config.EMBEDDING_MODEL,
-            google_api_key=Config.GEMINI_API_KEY
-        )
-        self.collection_name = self.qdrant_config["collection_name"]
-        print("Vector store manager initialized")
-    def create_collection(self, recreate: bool = False) -> bool:
-        """
-        Create a new collection in Qdrant
-        Args:
-            recreate: If True, delete existing collection and create new one
-        Returns:
-            Boolean indicating success
-        """
-        try:
-            # Check if collection exists
-            collections = self.client.get_collections().collections
-            collection_exists = any(c.name == self.collection_name for c in collections)
-            if collection_exists:
-                if recreate:
-                    print(f"⚠ Deleting existing collection: {self.collection_name}")
-                    self.client.delete_collection(self.collection_name)
-                else:
-                    print(f" Collection '{self.collection_name}' already exists")
-                    return True
-            # Create new collection
-            self.client.create_collection(
-                collection_name=self.collection_name,
-                vectors_config=VectorParams(
-                    size=self.qdrant_config["vector_size"],
-                    distance=Distance.COSINE
-                )
-            )
-            print(f" Created collection: {self.collection_name}")
-            return True
-        except Exception as e:
-            print(f" Error creating collection: {str(e)}")
-            raise
-    def add_documents(self, documents: List[Document], batch_size: int = 100) -> List[str]:
-        """
-        Add documents to Qdrant vector store
-        Args:
-            documents: List of Document objects to add
-            batch_size: Number of documents to process in each batch
-        Returns:
-            List of document IDs
-        """
-        try:
-            print(f"Adding {len(documents)} documents to vector store...")
-            # Ensure collection exists
-            self.create_collection(recreate=False)
-            # Initialize vector store
-            vector_store = QdrantVectorStore(
-                client=self.client,
-                collection_name=self.collection_name,
-                embedding=self.embeddings
-            )
-            # Add documents in batches
-            all_ids = []
-            for i in range(0, len(documents), batch_size):
-                batch = documents[i:i + batch_size]
-                # Generate unique IDs for this batch
-                batch_ids = [str(uuid.uuid4()) for _ in batch]
-                # Add to vector store
-                vector_store.add_documents(documents=batch, ids=batch_ids)
-                all_ids.extend(batch_ids)
-                print(f"   Processed batch {i//batch_size + 1}/{(len(documents)-1)//batch_size + 1}")
-            print(f" Successfully added {len(documents)} documents")
-            return all_ids
-        except Exception as e:
-            print(f" Error adding documents: {str(e)}")
-            raise
-    def similarity_search(
-        self,
-        query: str,
-        k: Optional[int] = None,
-        filter_dict: Optional[Dict[str, Any]] = None
-    ) -> List[Document]:
-        """
-        Search for similar documents using semantic similarity
-        Args:
-            query: Search query string
-            k: Number of results to return (default from config)
-            filter_dict: Optional metadata filters (e.g., {"section_type": "exclusions"})
-        Returns:
-            List of most similar Documents
-        """
-        try:
-            if k is None:
-                k = self.retrieval_config["top_k"]
-            # Initialize vector store for querying
-            vector_store = QdrantVectorStore(
-                client=self.client,
-                collection_name=self.collection_name,
-                embedding=self.embeddings
-            )
-            if filter_dict:
-                # Get more results than needed
-                results = vector_store.similarity_search(query=query, k=k*3)
-                # Filter by metadata
-                filtered_results = []
-                for doc in results:
-                    match = True
-                    for key, value in filter_dict.items():
-                        if doc.metadata.get(key) != value:
-                            match = False
-                            break
-                    if match:
-                        filtered_results.append(doc)
-                    # Stop when we have enough results
-                    if len(filtered_results) >= k:
-                        break
-                return filtered_results[:k]
-            else:
-                results = vector_store.similarity_search(query=query, k=k)
-                return results
-        except Exception as e:
-            print(f" Error during similarity search: {str(e)}")
-            raise
-    def similarity_search_with_score(
-        self,
-        query: str,
-        k: Optional[int] = None,
-        score_threshold: Optional[float] = None
-    ) -> List[tuple[Document, float]]:
-        """
-        Search with similarity scores
-        Args:
-            query: Search query string
-            k: Number of results to return
-            score_threshold: Minimum similarity score (default from config)
-        Returns:
-            List of (Document, score) tuples
-        """
-        try:
-            if k is None:
-                k = self.retrieval_config["top_k"]
-            if score_threshold is None:
-                score_threshold = self.retrieval_config["similarity_threshold"]
-            # Initialize vector store
-            vector_store = QdrantVectorStore(
-                client=self.client,
-                collection_name=self.collection_name,
-                embedding=self.embeddings
-            )
-            # Search with scores
-            results = vector_store.similarity_search_with_score(query=query, k=k)
-            # Filter by score threshold
-            filtered_results = [
-                (doc, score) for doc, score in results
-                if score >= score_threshold
-            ]
-            print(f" Found {len(filtered_results)} results above threshold {score_threshold}")
-            return filtered_results
-        except Exception as e:
-            print(f" Error during similarity search with score: {str(e)}")
-            raise
-    def search_by_section_type(
-        self,
-        query: str,
-        section_type: str,
-        k: Optional[int] = None
-    ) -> List[Document]:
-        """
-        Search within a specific section type (e.g., 'exclusions', 'addons')
-        Args:
-            query: Search query string
-            section_type: Type of section to search in
-            k: Number of results to return
-        Returns:
-            List of Documents from specified section type
-        """
-        filter_dict = {"section_type": section_type}
-        return self.similarity_search(query=query, k=k, filter_dict=filter_dict)
-    def get_collection_info(self) -> Dict:
-        """
-        Get information about the current collection
-        Returns:
-            Dictionary with collection statistics
-        """
-        try:
-            collection_info = self.client.get_collection(self.collection_name)
-            return {
-                "name": self.collection_name,
-                "vectors_count": collection_info.vectors_count,
-                "points_count": collection_info.points_count,
-                "status": collection_info.status,
-            }
-        except Exception as e:
-            print(f" Error getting collection info: {str(e)}")
-            return {}
-    def delete_collection(self) -> bool:
-        """
-        Delete the current collection
-        Returns:
-            Boolean indicating success
-        """
-        try:
-            self.client.delete_collection(self.collection_name)
-            print(f" Deleted collection: {self.collection_name}")
-            return True
-        except Exception as e:
-            print(f" Error deleting collection: {str(e)}")
-            return False
-    def get_retriever(self, **kwargs):
-        """
-        Get a LangChain retriever object for use in chains
-        Args:
-            **kwargs: Additional arguments for retriever configuration
-        Returns:
-            VectorStoreRetriever object
-        """
-        vector_store = QdrantVectorStore(
-            client=self.client,
-            collection_name=self.collection_name,
-            embedding=self.embeddings
-        )
-        # Set default search kwargs
-        search_kwargs = {
-            "k": self.retrieval_config["top_k"]
-        }
-        search_kwargs.update(kwargs)
-        return vector_store.as_retriever(search_kwargs=search_kwargs)

+from typing import List, Optional, Dict, Any
+from langchain_classic.schema import Document
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+from langchain_qdrant import QdrantVectorStore
+from qdrant_client import QdrantClient
+from qdrant_client.models import Distance, VectorParams, PointStruct
+from config import Config
+import uuid
+class VectorStoreManager:
+    """Manages Qdrant vector store operations for insurance documents"""
+    def __init__(self):
+        """Initialize Qdrant client and embeddings"""
+        # Validate configuration
+        Config.validate_config()
+        # Get configuration
+        self.qdrant_config = Config.get_qdrant_config()
+        self.retrieval_config = Config.get_retrieval_config()
+        # Initialize Qdrant client
+        self.client = QdrantClient(
+            url=self.qdrant_config["url"],
+            api_key=self.qdrant_config["api_key"],
+        )
+        # Initialize embeddings
+        self.embeddings = GoogleGenerativeAIEmbeddings(
+            model=Config.EMBEDDING_MODEL,
+            output_dimensionality=Config.EMBEDDING_DIMENSION,
+            google_api_key=Config.GEMINI_API_KEY
+        )
+        self.collection_name = self.qdrant_config["collection_name"]
+        print("Vector store manager initialized")
+    def create_collection(self, recreate: bool = False) -> bool:
+        """
+        Create a new collection in Qdrant
+        Args:
+            recreate: If True, delete existing collection and create new one
+        Returns:
+            Boolean indicating success
+        """
+        try:
+            # Check if collection exists
+            collections = self.client.get_collections().collections
+            collection_exists = any(c.name == self.collection_name for c in collections)
+            if collection_exists:
+                if recreate:
+                    print(f"⚠ Deleting existing collection: {self.collection_name}")
+                    self.client.delete_collection(self.collection_name)
+                else:
+                    print(f" Collection '{self.collection_name}' already exists")
+                    return True
+            # Create new collection
+            self.client.create_collection(
+                collection_name=self.collection_name,
+                vectors_config=VectorParams(
+                    size=self.qdrant_config["vector_size"],
+                    distance=Distance.COSINE
+                )
+            )
+            print(f" Created collection: {self.collection_name}")
+            return True
+        except Exception as e:
+            print(f" Error creating collection: {str(e)}")
+            raise
+    def add_documents(self, documents: List[Document], batch_size: int = 100) -> List[str]:
+        """
+        Add documents to Qdrant vector store
+        Args:
+            documents: List of Document objects to add
+            batch_size: Number of documents to process in each batch
+        Returns:
+            List of document IDs
+        """
+        try:
+            print(f"Adding {len(documents)} documents to vector store...")
+            # Ensure collection exists
+            self.create_collection(recreate=False)
+            # Initialize vector store
+            vector_store = QdrantVectorStore(
+                client=self.client,
+                collection_name=self.collection_name,
+                embedding=self.embeddings
+            )
+            # Add documents in batches
+            all_ids = []
+            for i in range(0, len(documents), batch_size):
+                batch = documents[i:i + batch_size]
+                # Generate unique IDs for this batch
+                batch_ids = [str(uuid.uuid4()) for _ in batch]
+                # Add to vector store
+                vector_store.add_documents(documents=batch, ids=batch_ids)
+                all_ids.extend(batch_ids)
+                print(f"   Processed batch {i//batch_size + 1}/{(len(documents)-1)//batch_size + 1}")
+            print(f" Successfully added {len(documents)} documents")
+            return all_ids
+        except Exception as e:
+            print(f" Error adding documents: {str(e)}")
+            raise
+    def similarity_search(
+        self,
+        query: str,
+        k: Optional[int] = None,
+        filter_dict: Optional[Dict[str, Any]] = None
+    ) -> List[Document]:
+        """
+        Search for similar documents using semantic similarity
+        Args:
+            query: Search query string
+            k: Number of results to return (default from config)
+            filter_dict: Optional metadata filters (e.g., {"section_type": "exclusions"})
+        Returns:
+            List of most similar Documents
+        """
+        try:
+            if k is None:
+                k = self.retrieval_config["top_k"]
+            # Initialize vector store for querying
+            vector_store = QdrantVectorStore(
+                client=self.client,
+                collection_name=self.collection_name,
+                embedding=self.embeddings
+            )
+            if filter_dict:
+                # Get more results than needed
+                results = vector_store.similarity_search(query=query, k=k*3)
+                # Filter by metadata
+                filtered_results = []
+                for doc in results:
+                    match = True
+                    for key, value in filter_dict.items():
+                        if doc.metadata.get(key) != value:
+                            match = False
+                            break
+                    if match:
+                        filtered_results.append(doc)
+                    # Stop when we have enough results
+                    if len(filtered_results) >= k:
+                        break
+                return filtered_results[:k]
+            else:
+                results = vector_store.similarity_search(query=query, k=k)
+                return results
+        except Exception as e:
+            print(f" Error during similarity search: {str(e)}")
+            raise
+    def similarity_search_with_score(
+        self,
+        query: str,
+        k: Optional[int] = None,
+        score_threshold: Optional[float] = None
+    ) -> List[tuple[Document, float]]:
+        """
+        Search with similarity scores
+        Args:
+            query: Search query string
+            k: Number of results to return
+            score_threshold: Minimum similarity score (default from config)
+        Returns:
+            List of (Document, score) tuples
+        """
+        try:
+            if k is None:
+                k = self.retrieval_config["top_k"]
+            if score_threshold is None:
+                score_threshold = self.retrieval_config["similarity_threshold"]
+            # Initialize vector store
+            vector_store = QdrantVectorStore(
+                client=self.client,
+                collection_name=self.collection_name,
+                embedding=self.embeddings
+            )
+            # Search with scores
+            results = vector_store.similarity_search_with_score(query=query, k=k)
+            # Filter by score threshold
+            filtered_results = [
+                (doc, score) for doc, score in results
+                if score >= score_threshold
+            ]
+            print(f" Found {len(filtered_results)} results above threshold {score_threshold}")
+            return filtered_results
+        except Exception as e:
+            print(f" Error during similarity search with score: {str(e)}")
+            raise
+    def search_by_section_type(
+        self,
+        query: str,
+        section_type: str,
+        k: Optional[int] = None
+    ) -> List[Document]:
+        """
+        Search within a specific section type (e.g., 'exclusions', 'addons')
+        Args:
+            query: Search query string
+            section_type: Type of section to search in
+            k: Number of results to return
+        Returns:
+            List of Documents from specified section type
+        """
+        filter_dict = {"section_type": section_type}
+        return self.similarity_search(query=query, k=k, filter_dict=filter_dict)
+    def get_collection_info(self) -> Dict:
+        """
+        Get information about the current collection
+        Returns:
+            Dictionary with collection statistics
+        """
+        try:
+            collection_info = self.client.get_collection(self.collection_name)
+            return {
+                "name": self.collection_name,
+                "vectors_count": collection_info.vectors_count,
+                "points_count": collection_info.points_count,
+                "status": collection_info.status,
+            }
+        except Exception as e:
+            print(f" Error getting collection info: {str(e)}")
+            return {}
+    def delete_collection(self) -> bool:
+        """
+        Delete the current collection
+        Returns:
+            Boolean indicating success
+        """
+        try:
+            self.client.delete_collection(self.collection_name)
+            print(f" Deleted collection: {self.collection_name}")
+            return True
+        except Exception as e:
+            print(f" Error deleting collection: {str(e)}")
+            return False
+    def get_retriever(self, **kwargs):
+        """
+        Get a LangChain retriever object for use in chains
+        Args:
+            **kwargs: Additional arguments for retriever configuration
+        Returns:
+            VectorStoreRetriever object
+        """
+        vector_store = QdrantVectorStore(
+            client=self.client,
+            collection_name=self.collection_name,
+            embedding=self.embeddings
+        )
+        # Set default search kwargs
+        search_kwargs = {
+            "k": self.retrieval_config["top_k"]
+        }
+        search_kwargs.update(kwargs)
+        return vector_store.as_retriever(search_kwargs=search_kwargs)