Legal_AI_Agent

Build error

App Files Files Community

cryogenic22 committed on Dec 10, 2024

Commit

d2ccb82

verified ·

1 Parent(s): 3ab44aa

Update utils/vector_store.py

Browse files

Files changed (1) hide show

utils/vector_store.py +191 -81

utils/vector_store.py CHANGED Viewed

@@ -2,106 +2,216 @@ import os
 import pickle
 from typing import List, Dict, Any
 from sentence_transformers import SentenceTransformer, util
 class VectorStore:
-    def __init__(self, storage_path: str = "data/vector_store"):
-        """
-        Initialize VectorStore.
-        Args:
-            storage_path (str): Path to store vectorized documents.
-        """
         self.storage_path = storage_path
         os.makedirs(storage_path, exist_ok=True)
-        self.model = SentenceTransformer('all-MiniLM-L6-v2')
         self.vectors = self._load_vectors()
     def _load_vectors(self) -> List[Dict]:
-        """
-        Load stored vectors from the file system.
-        Returns:
-            List[Dict]: List of stored vectorized documents.
-        """
         vector_file = os.path.join(self.storage_path, "vectors.pkl")
-        if os.path.exists(vector_file):
-            with open(vector_file, "rb") as f:
-                return pickle.load(f)
-        return []
     def _save_vectors(self):
-        """
-        Save the current vectors to the file system.
-        """
         vector_file = os.path.join(self.storage_path, "vectors.pkl")
-        with open(vector_file, "wb") as f:
-            pickle.dump(self.vectors, f)
     def add_document(self, doc_id: str, text: str, metadata: Dict[str, Any]):
-        """
-        Add a new document to the vector store.
-        Args:
-            doc_id (str): Unique document identifier.
-            text (str): Full text of the document.
-            metadata (Dict[str, Any]): Metadata associated with the document.
-        """
-        vector = self.model.encode(text, convert_to_tensor=True)
-        self.vectors.append({"doc_id": doc_id, "vector": vector, "text": text, "metadata": metadata})
         self._save_vectors()
-    def similarity_search(self, query: str, top_k: int = 5) -> List[Dict]:
-        """
-        Perform a similarity search for the query against stored vectors.
-        Args:
-            query (str): Query string to search for.
-            top_k (int): Number of top results to return.
-        Returns:
-            List[Dict]: List of the most similar documents.
-        """
         query_vector = self.model.encode(query, convert_to_tensor=True)
         results = []
         for doc in self.vectors:
-            similarity_score = util.pytorch_cos_sim(query_vector, doc["vector"]).item()
-            results.append({"doc_id": doc["doc_id"], "text": doc["text"], "metadata": doc["metadata"], "score": similarity_score})
-        results = sorted(results, key=lambda x: x["score"], reverse=True)
-        return results[:top_k]
-    def chat_with_context(self, query: str, context: str) -> str:
-        """
-        Generate a response to the query using the provided context.
-        Args:
-            query (str): Query string from the user.
-            context (str): Context string from relevant documents.
-        Returns:
-            str: Generated response.
-        """
-        # Combine query and context for the final prompt
-        combined_input = f"""
-        Context:
-        {context}
-        Question:
-        {query}
-        Please provide a detailed and accurate response.
-        """
-        try:
-            results = self.similarity_search(query, top_k=3)
-            if not results:
-                return "No relevant context found. Please upload more documents or refine your query."
-            # Use the top result for a response simulation or pass to an LLM
-            return (
-                f"Based on the context:\n\n"
-                f"{results[0]['text'][:500]}...\n\n"
-                f"Response: The query '{query}' relates to the provided context."
             )
-        except Exception as e:
-            return f"Error generating response: {str(e)}"

 import pickle
 from typing import List, Dict, Any
 from sentence_transformers import SentenceTransformer, util
+import numpy as np
+from datetime import datetime
 class VectorStore:
     """Pickle-backed store of sentence embeddings for chunked documents.

     Each document is split into overlapping chunks; every chunk is encoded
     with a SentenceTransformer model and kept in ``self.vectors`` as a dict
     with the keys ``doc_id``, ``vector``, ``text`` and ``metadata``. The
     list is persisted to ``<storage_path>/vectors.pkl``.
     """

     def __init__(self, storage_path: str = "data/vector_store", model_name: str = 'all-MiniLM-L6-v2'):
         """Create the storage directory, load the model and any saved vectors.

         Args:
             storage_path: Directory that holds ``vectors.pkl``.
             model_name: Name passed to ``SentenceTransformer``.
         """
         self.storage_path = storage_path
         os.makedirs(storage_path, exist_ok=True)
         self.model = SentenceTransformer(model_name)
         # Both limits are counted in whitespace-separated words, not tokens.
         self.chunk_size = 512
         self.chunk_overlap = 50
         self.vectors = self._load_vectors()

     def _load_vectors(self) -> List[Dict]:
         """Load the persisted vectors.

         Returns:
             The stored list, or an empty list when the file is missing,
             unreadable, or does not contain a list.
         """
         vector_file = os.path.join(self.storage_path, "vectors.pkl")
         if not os.path.exists(vector_file):
             # First run: nothing stored yet. Always hand back a list so that
             # callers can append to / iterate over self.vectors.
             return []
         try:
             # SECURITY: pickle.load executes arbitrary code embedded in the
             # file; only point storage_path at trusted locations.
             with open(vector_file, "rb") as f:
                 vectors = pickle.load(f)
         except Exception as e:
             # Best effort: a corrupt store should not prevent start-up.
             print(f"Error loading vectors: {e}")
             return []
         return vectors if isinstance(vectors, list) else []

     def _save_vectors(self):
         """Persist ``self.vectors`` atomically.

         The data is written to a temporary sibling file first and then
         renamed over ``vectors.pkl``, so an interrupted write never leaves
         the store without a readable file. Failures are reported and
         swallowed; the previous file stays untouched.
         """
         vector_file = os.path.join(self.storage_path, "vectors.pkl")
         temp_file = vector_file + ".tmp"
         try:
             with open(temp_file, "wb") as f:
                 pickle.dump(self.vectors, f)
             # Same-directory rename: replaces the old file in one step.
             os.replace(temp_file, vector_file)
         except Exception as e:
             print(f"Error saving vectors: {e}")
             try:
                 os.remove(temp_file)
             except OSError:
                 pass  # Temp file was never created or is already gone.

     def add_document(self, doc_id: str, text: str, metadata: Dict[str, Any]):
         """Chunk, embed and store a document, then persist the store.

         Args:
             doc_id: Identifier of the source document. Each stored chunk
                 gets the id ``"<doc_id>_chunk_<n>"``; the original id is
                 kept in the chunk metadata under ``doc_id``.
             text: Full document text.
             metadata: Caller metadata copied into every chunk's metadata.
                 ``None`` is treated as an empty dict.
         """
         chunks = self._create_chunks(text)
         base_metadata = {
             **(metadata or {}),
             "added_at": datetime.now().isoformat(),
             "doc_id": doc_id,
             "total_chunks": len(chunks),
         }
         for chunk_idx, chunk in enumerate(chunks):
             chunk_metadata = {
                 **base_metadata,
                 "chunk_idx": chunk_idx,
                 "chunk_text": chunk[:200],  # Short preview of the chunk.
             }
             vector = self.model.encode(chunk, convert_to_tensor=True)
             self.vectors.append({
                 "doc_id": f"{doc_id}_chunk_{chunk_idx}",
                 "vector": vector,
                 "text": chunk,
                 "metadata": chunk_metadata,
             })
         self._save_vectors()

     def _create_chunks(self, text: str) -> List[str]:
         """Split text into sentence-aligned chunks with a bounded overlap.

         Sentences are detected at ``.``, ``!`` or ``?`` followed by
         whitespace and keep their punctuation. A chunk never exceeds
         ``self.chunk_size`` words; a sentence longer than that is cut into
         word windows. Each new chunk starts with the trailing sentences of
         the previous one, up to ``self.chunk_overlap`` words.

         Args:
             text: Text to split.

         Returns:
             The chunks in document order; empty for blank input.
         """
         import re  # Local import: the module header is not edited here.

         sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
         chunks = []
         current_chunk = []
         current_size = 0
         for sentence in sentences:
             words = sentence.split()
             # Hard-split sentences that alone would overflow a chunk.
             pieces = [
                 ' '.join(words[start:start + self.chunk_size])
                 for start in range(0, len(words), self.chunk_size)
             ]
             for piece in pieces:
                 piece_size = len(piece.split())
                 if current_chunk and current_size + piece_size > self.chunk_size:
                     chunks.append(' '.join(current_chunk))
                     # Carry over only as much context as still leaves room
                     # for the incoming piece.
                     current_chunk, current_size = self._overlap_tail(
                         current_chunk, self.chunk_size - piece_size
                     )
                 current_chunk.append(piece)
                 current_size += piece_size
         if current_chunk:
             chunks.append(' '.join(current_chunk))
         return chunks

     def _overlap_tail(self, sentences: List[str], budget: int):
         """Return the trailing sentences that fit the overlap word budget.

         Args:
             sentences: Sentences of the chunk that was just closed.
             budget: Maximum number of words the overlap may use in addition
                 to the ``self.chunk_overlap`` limit.

         Returns:
             A ``(sentences, word_count)`` tuple, in original order.
         """
         limit = min(self.chunk_overlap, budget)
         tail = []
         size = 0
         for sentence in reversed(sentences):
             sentence_size = len(sentence.split())
             if size + sentence_size > limit:
                 break
             tail.append(sentence)
             size += sentence_size
         tail.reverse()
         return tail, size

     def similarity_search(
         self,
         query: str,
         k: int = 5,
         threshold: float = 0.5,
         filter_criteria: Dict[str, List] = None,
         top_k: int = None
     ) -> List[Dict]:
         """Return the stored chunks most similar to the query.

         Args:
             query: Query text.
             k: Maximum number of results.
             threshold: Minimum cosine similarity for a chunk to be kept.
             filter_criteria: Optional mapping of metadata key (dot notation
                 for nested keys) to the collection of accepted values; a
                 chunk must satisfy every entry.
             top_k: Alias for ``k`` accepted for callers written against the
                 earlier signature; overrides ``k`` when given.

         Returns:
             Up to ``k`` chunk dicts, each extended with ``score`` and
             ``final_score``, ordered by ``final_score`` descending.
         """
         if top_k is not None:
             k = top_k
         if k <= 0 or not self.vectors:
             return []  # Nothing to rank; skip encoding the query.

         query_vector = self.model.encode(query, convert_to_tensor=True)
         results = []
         for doc in self.vectors:
             if filter_criteria and not self._matches_filters(doc, filter_criteria):
                 continue
             similarity = util.pytorch_cos_sim(query_vector, doc["vector"]).item()
             if similarity >= threshold:
                 # Shallow copy so ranking fields never leak into the store.
                 results.append({**doc, "score": similarity})
         results.sort(key=lambda x: x["score"], reverse=True)
         # Re-rank a wider candidate pool, then cut to the requested size.
         return self._rerank_results(results[:k * 2], query)[:k]

     def _matches_filters(self, doc: Dict, filter_criteria: Dict[str, List]) -> bool:
         """Return True when the chunk metadata satisfies every filter."""
         metadata = doc.get("metadata") or {}
         return all(
             self._get_nested_dict_value(metadata, key) in values
             for key, values in filter_criteria.items()
         )

     def _rerank_results(self, results: List[Dict], query: str) -> List[Dict]:
         """Add ``final_score`` to each result and sort by it, descending.

         The final score is 0.6 * similarity + 0.2 * position score (earlier
         chunks score higher) + 0.2 * metadata relevance.
         """
         for result in results:
             metadata = result.get("metadata") or {}
             chunk_idx = metadata.get("chunk_idx") or 0
             # Missing or zero totals fall back to 1 to avoid dividing by zero.
             total_chunks = metadata.get("total_chunks") or 1
             position_score = max(0.0, min(1.0, 1 - (chunk_idx / total_chunks)))
             metadata_score = self._calculate_metadata_relevance(metadata, query)
             result["final_score"] = (
                 result["score"] * 0.6 +
                 position_score * 0.2 +
                 metadata_score * 0.2
             )
         return sorted(results, key=lambda x: x["final_score"], reverse=True)

     def _calculate_metadata_relevance(self, metadata: Dict, query: str) -> float:
         """Score how well string metadata values match the query.

         A value contained in the query adds 0.2; a value that contains the
         whole query adds 0.1. Blank values and a blank query never match.

         Returns:
             The accumulated score, capped at 1.0.
         """
         query_lower = query.strip().lower()
         if not query_lower:
             return 0.0
         relevance_score = 0.0
         for value in metadata.values():
             if not isinstance(value, str):
                 continue
             value_lower = value.strip().lower()
             if not value_lower:
                 continue  # "" is a substring of everything; ignore it.
             if value_lower in query_lower:
                 relevance_score += 0.2
             elif query_lower in value_lower:
                 relevance_score += 0.1
         return min(1.0, relevance_score)

     def _get_nested_dict_value(self, d: Dict, key_path: str):
         """Look up ``key_path`` (dot-separated) in a nested dict.

         Returns:
             The value found, or ``None`` when a key is missing or an
             intermediate value is not a dict.
         """
         value = d
         for key in key_path.split('.'):
             if not isinstance(value, dict):
                 return None
             value = value.get(key)
         return value

     @staticmethod
     def _source_doc_id(doc: Dict):
         """Return the id of the document a stored record belongs to.

         Chunked records keep it in ``metadata["doc_id"]``; records that only
         carry a top-level ``doc_id`` fall back to that value.
         """
         metadata = doc.get("metadata") or {}
         return metadata.get("doc_id", doc.get("doc_id"))

     def get_document_embeddings(self, doc_id: str) -> List[Dict]:
         """Return every stored record that belongs to ``doc_id``."""
         return [doc for doc in self.vectors if self._source_doc_id(doc) == doc_id]

     def delete_document(self, doc_id: str):
         """Remove every record that belongs to ``doc_id`` and persist."""
         self.vectors = [doc for doc in self.vectors
                         if self._source_doc_id(doc) != doc_id]
         self._save_vectors()

     def update_metadata(self, doc_id: str, metadata_updates: Dict):
         """Merge ``metadata_updates`` into every record of ``doc_id`` and persist."""
         for doc in self.vectors:
             if self._source_doc_id(doc) == doc_id:
                 doc.setdefault("metadata", {}).update(metadata_updates)
         self._save_vectors()