Spaces:
Build error
Build error
Update utils/vector_store.py
Browse files- utils/vector_store.py +77 -62
utils/vector_store.py
CHANGED
|
@@ -1,90 +1,105 @@
|
|
| 1 |
-
from sentence_transformers import SentenceTransformer
|
| 2 |
-
import faiss
|
| 3 |
-
import numpy as np
|
| 4 |
-
from typing import List, Dict
|
| 5 |
import os
|
| 6 |
import pickle
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
class VectorStore:
|
| 10 |
-
def __init__(self, storage_path: str = "data/vector_store"
|
| 11 |
"""
|
| 12 |
-
Initialize
|
| 13 |
|
| 14 |
Args:
|
| 15 |
-
storage_path (str): Path to store
|
| 16 |
-
dimension (int): Dimension of the embeddings (depends on the embedding model used).
|
| 17 |
"""
|
| 18 |
self.storage_path = storage_path
|
| 19 |
-
os.makedirs(
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
|
|
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
-
def
|
| 30 |
-
"""
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
with open(metadata_path, "rb") as f:
|
| 37 |
-
self.metadata = pickle.load(f)
|
| 38 |
-
except Exception as e:
|
| 39 |
-
print(f"Failed to load vector store: {e}")
|
| 40 |
|
| 41 |
-
def
|
| 42 |
"""
|
| 43 |
-
Add
|
| 44 |
|
| 45 |
Args:
|
| 46 |
-
|
| 47 |
-
|
|
|
|
| 48 |
"""
|
| 49 |
-
|
| 50 |
-
self.
|
| 51 |
-
self.
|
| 52 |
-
self._save_vector_store()
|
| 53 |
|
| 54 |
-
def similarity_search(self, query: str,
|
| 55 |
"""
|
| 56 |
-
Perform a similarity search for the
|
| 57 |
|
| 58 |
Args:
|
| 59 |
-
query (str):
|
| 60 |
-
|
| 61 |
|
| 62 |
Returns:
|
| 63 |
-
List[Dict]:
|
| 64 |
"""
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
]
|
| 71 |
-
|
| 72 |
-
def _save_vector_store(self):
|
| 73 |
-
"""Save the FAISS index and metadata to persistent storage."""
|
| 74 |
-
try:
|
| 75 |
-
index_path = os.path.join(self.storage_path, "faiss.index")
|
| 76 |
-
metadata_path = os.path.join(self.storage_path, "metadata.pkl")
|
| 77 |
-
faiss.write_index(self.index, index_path)
|
| 78 |
-
with open(metadata_path, "wb") as f:
|
| 79 |
-
pickle.dump(self.metadata, f)
|
| 80 |
-
except Exception as e:
|
| 81 |
-
print(f"Failed to save vector store: {e}")
|
| 82 |
|
| 83 |
-
def
|
| 84 |
"""
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
"""
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import pickle
|
| 3 |
+
from typing import List, Dict, Any
|
| 4 |
+
from sentence_transformers import SentenceTransformer, util
|
| 5 |
|
| 6 |
|
| 7 |
class VectorStore:
    """In-memory vector store with pickle persistence.

    Documents are embedded with a SentenceTransformer model and kept as a
    list of dicts ({doc_id, vector, text, metadata}); the list is persisted
    to <storage_path>/vectors.pkl after every addition.
    """

    def __init__(self, storage_path: str = "data/vector_store"):
        """
        Initialize VectorStore.

        Args:
            storage_path (str): Path to store vectorized documents.
        """
        self.storage_path = storage_path
        os.makedirs(storage_path, exist_ok=True)
        # Small general-purpose embedding model; downloaded on first use.
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.vectors: List[Dict] = self._load_vectors()

    def _load_vectors(self) -> List[Dict]:
        """
        Load stored vectors from the file system.

        Returns:
            List[Dict]: List of stored vectorized documents, or an empty
            list when no store exists yet or the file is unreadable.
        """
        vector_file = os.path.join(self.storage_path, "vectors.pkl")
        if os.path.exists(vector_file):
            # SECURITY NOTE: pickle is only safe on files this application
            # wrote itself; never point storage_path at untrusted data.
            try:
                with open(vector_file, "rb") as f:
                    return pickle.load(f)
            except (pickle.UnpicklingError, EOFError, OSError) as e:
                # A corrupt or unreadable store should not prevent startup;
                # report it and fall through to an empty store.
                print(f"Failed to load vector store: {e}")
        return []

    def _save_vectors(self):
        """
        Save the current vectors to the file system.
        """
        vector_file = os.path.join(self.storage_path, "vectors.pkl")
        with open(vector_file, "wb") as f:
            pickle.dump(self.vectors, f)

    def add_document(self, doc_id: str, text: str, metadata: Dict[str, Any]):
        """
        Add a new document to the vector store.

        Args:
            doc_id (str): Unique document identifier. NOTE(review):
                uniqueness is not enforced here; adding the same id twice
                stores both entries.
            text (str): Full text of the document.
            metadata (Dict[str, Any]): Metadata associated with the document.
        """
        vector = self.model.encode(text, convert_to_tensor=True)
        self.vectors.append(
            {"doc_id": doc_id, "vector": vector, "text": text, "metadata": metadata}
        )
        # Persist immediately so a crash cannot lose the new document.
        self._save_vectors()

    def similarity_search(self, query: str, top_k: int = 5) -> List[Dict]:
        """
        Perform a similarity search for the query against stored vectors.

        Args:
            query (str): Query string to search for.
            top_k (int): Number of top results to return.

        Returns:
            List[Dict]: The most similar documents (doc_id, text, metadata,
            score), sorted by descending cosine similarity; empty when the
            store holds no documents.
        """
        query_vector = self.model.encode(query, convert_to_tensor=True)
        results = []
        for doc in self.vectors:
            # Cosine similarity between the query and the stored embedding.
            score = util.pytorch_cos_sim(query_vector, doc["vector"]).item()
            results.append(
                {
                    "doc_id": doc["doc_id"],
                    "text": doc["text"],
                    "metadata": doc["metadata"],
                    "score": score,
                }
            )
        results.sort(key=lambda r: r["score"], reverse=True)
        return results[:top_k]

    def chat_with_context(self, query: str, context: str) -> str:
        """
        Generate a response to the query using the provided context.

        Args:
            query (str): Query string from the user.
            context (str): Context string from relevant documents. Currently
                unused by this placeholder implementation; kept in the
                signature for the eventual LLM prompt.

        Returns:
            str: Generated response, or an error message on failure.
        """
        # Placeholder for LLM API integration: answer by quoting the most
        # relevant stored document instead of calling a model. The unused
        # prompt-building code from the previous revision has been removed.
        try:
            results = self.similarity_search(query, top_k=3)
            if not results:
                # Previously an IndexError on results[0] was masked by the
                # broad except; report the empty store explicitly instead.
                return f"No documents available to answer '{query}'."
            return (
                f"Based on the context:\n\n"
                f"{results[0]['text'][:500]}...\n\n"
                f"Response: The query '{query}' relates to the provided context."
            )
        except Exception as e:
            return f"Error generating response: {str(e)}"
|