Legal_AI_Agent

Build error

App Files Files Community

cryogenic22 committed on Dec 10, 2024

Commit

ae7cccb

verified ·

1 Parent(s): 896a693

Update utils/vector_store.py

Browse files

Files changed (1) hide show

utils/vector_store.py +82 -60

utils/vector_store.py CHANGED Viewed

@@ -1,68 +1,90 @@
-import streamlit as st
-from utils.document_processor import DocumentProcessor
-from components.template_generator import render_template_generator
-from components.document_viewer import DocumentViewer
-from typing import Tuple, List, Dict
-# Streamlit page script: a sidebar radio selects one of three tabs
-# (manage documents, generate templates, search documents).
-# Initialize components
-# NOTE(review): VectorStore is instantiated here but none of the import lines
-# visible above bring that name into scope - confirm where it is meant to come from.
-vector_store = VectorStore()
-doc_processor = DocumentProcessor()
-# Page configuration
-st.set_page_config(
-    page_title="Legal AI Assistant",
-    page_icon="⚖️",
-    layout="wide",
-    initial_sidebar_state="expanded"
-)
-# Sidebar navigation
-# The selected label is compared verbatim (emoji included) by the branches below.
-tab = st.sidebar.radio(
-    "Navigation",
-    ["📁 Manage Documents", "📝 Generate Templates", "🔍 Search Documents"]
-)
-# Tab 1: Manage Documents
-if tab == "📁 Manage Documents":
-    st.title("📁 Manage Documents")
-    uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])
-    if uploaded_file:
-        with st.spinner("Processing document..."):
-            # process_document is unpacked as a (text, chunks) pair; `text` is not
-            # used again in this script. Each chunk is read via its "text" and
-            # "chunk_id" keys below.
-            text, chunks = doc_processor.process_document(uploaded_file)
-            st.success("Document processed successfully!")
-            # Add to vector store
-            # One metadata dict per chunk: the chunk text, its chunk_id and the
-            # name of the uploaded file.
-            vector_store.add_texts(
-                texts=[chunk["text"] for chunk in chunks],
-                metadatas=[{"text": chunk["text"], "chunk_id": chunk["chunk_id"], "filename": uploaded_file.name} for chunk in chunks]
-            )
-            st.success("Document added to vector store!")
-    # List processed documents
-    # NOTE(review): metadata presumably holds one entry per chunk (see the
-    # add_texts call above), so this prints one line per chunk, not per file.
-    st.subheader("Processed Documents")
-    processed_docs = vector_store.metadata
-    if processed_docs:
-        for idx, doc in enumerate(processed_docs):
-            # 'filename' falls back to 'Unknown', but 'chunk_id' is indexed
-            # directly and raises KeyError if an entry lacks it.
-            st.markdown(f"{idx+1}. **{doc.get('filename', 'Unknown')}** - Chunk ID: {doc['chunk_id']}")
-    else:
-        st.info("No documents uploaded yet.")
-# Tab 2: Generate Templates
-elif tab == "📝 Generate Templates":
-    st.title("📝 Generate Templates")
-    render_template_generator()
-# Tab 3: Search Documents
-elif tab == "🔍 Search Documents":
-    st.title("🔍 Search Documents")
-    query = st.text_input("Enter your query:")
-    if query:
-        with st.spinner("Searching for relevant chunks..."):
-            results = vector_store.similarity_search(query)
-            if results:
-                st.success("Found relevant chunks:")
-                for result in results:
-                    # Shows the first 200 characters of each hit; the value labelled
-                    # "Relevance" is the result's 'distance' field, two decimals.
-                    st.markdown(f"- **Chunk:** {result['text'][:200]}... (Relevance: {result['distance']:.2f})")
-            else:
-                st.warning("No relevant chunks found.")

+from sentence_transformers import SentenceTransformer
+import faiss
+import numpy as np
+from typing import List, Dict
+import os
+import pickle
+class VectorStore:
+    """
+    FAISS-backed store of sentence embeddings with one metadata dict per vector.
+    self.metadata[i] describes the vector at position i of self.index; the
+    methods below keep the two in step and persist both under storage_path
+    after every mutation.
+    """
+    def __init__(self, storage_path: str = "data/vector_store", dimension: int = 384):
+        """
+        Initialize the VectorStore.
+        Args:
+            storage_path (str): Path to store the FAISS index and metadata.
+            dimension (int): Dimension of the embeddings (depends on the embedding model used).
+        """
+        self.storage_path = storage_path
+        os.makedirs(self.storage_path, exist_ok=True)
+        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")  # Pre-trained model
+        self.dimension = dimension
+        # Start from an empty index and metadata; _load_vector_store replaces
+        # both if a previously saved store exists.
+        self.index = faiss.IndexFlatL2(self.dimension)
+        self.metadata = []
+        self._load_vector_store()
+    def _load_vector_store(self):
+        """
+        Load the FAISS index and metadata from persistent storage.
+        Best-effort: if either file is missing or unreadable, the current
+        (empty) index and metadata are kept and the error is printed.
+        """
+        index_path = os.path.join(self.storage_path, "faiss.index")
+        metadata_path = os.path.join(self.storage_path, "metadata.pkl")
+        if not (os.path.exists(index_path) and os.path.exists(metadata_path)):
+            return
+        try:
+            loaded_index = faiss.read_index(index_path)
+            # NOTE(security): pickle.load can execute arbitrary code from a
+            # crafted file; storage_path must only contain trusted data.
+            with open(metadata_path, "rb") as f:
+                loaded_metadata = pickle.load(f)
+        except Exception as e:
+            print(f"Failed to load vector store: {e}")
+            return
+        # Assign only after BOTH loads succeeded, so a half-failed load cannot
+        # leave a populated index paired with empty metadata.
+        self.index = loaded_index
+        self.metadata = loaded_metadata
+    def add_texts(self, texts: List[str], metadatas: List[Dict] = None):
+        """
+        Add texts and their metadata to the vector store.
+        Args:
+            texts (List[str]): List of text chunks to be added.
+            metadatas (List[Dict]): List of metadata dictionaries corresponding to the text chunks.
+                When omitted or empty, each text gets its own empty dict.
+        Raises:
+            ValueError: If metadatas is given but its length differs from texts.
+        """
+        if not texts:
+            # Nothing to embed; avoid handing an empty array to the index.
+            return
+        if metadatas:
+            metadatas = list(metadatas)
+            if len(metadatas) != len(texts):
+                # A mismatch would silently shift every later metadata lookup.
+                raise ValueError(
+                    f"Got {len(texts)} texts but {len(metadatas)} metadatas; lengths must match."
+                )
+        else:
+            # One distinct dict per text; [{}] * n would alias a single dict.
+            metadatas = [{} for _ in texts]
+        embeddings = self.embedder.encode(texts, show_progress_bar=True)
+        self.index.add(np.asarray(embeddings, dtype="float32"))
+        self.metadata.extend(metadatas)
+        self._save_vector_store()
+    def similarity_search(self, query: str, k: int = 5) -> List[Dict]:
+        """
+        Perform a similarity search for the given query.
+        Args:
+            query (str): The query text.
+            k (int): Number of closest matches to retrieve.
+        Returns:
+            List[Dict]: A list of dictionaries with keys "text" (the stored chunk
+            text, or "" if its metadata has none) and "distance" (L2 distance
+            reported by the index, as a float). Empty if the store is empty or k <= 0.
+        """
+        if k <= 0 or self.index.ntotal == 0:
+            return []
+        query_embedding = np.asarray(self.embedder.encode([query]), dtype="float32")
+        # Never ask for more neighbours than there are stored vectors.
+        distances, indices = self.index.search(query_embedding, min(k, self.index.ntotal))
+        results = []
+        for distance, idx in zip(distances[0], indices[0]):
+            # FAISS reports a missing neighbour as -1; without the lower bound
+            # Python would read that as "last metadata entry".
+            if 0 <= idx < len(self.metadata):
+                results.append(
+                    {"text": self.metadata[idx].get("text", ""), "distance": float(distance)}
+                )
+        return results
+    def _save_vector_store(self):
+        """
+        Save the FAISS index and metadata to persistent storage.
+        Best-effort: a failure is printed, not raised.
+        """
+        try:
+            index_path = os.path.join(self.storage_path, "faiss.index")
+            metadata_path = os.path.join(self.storage_path, "metadata.pkl")
+            faiss.write_index(self.index, index_path)
+            with open(metadata_path, "wb") as f:
+                pickle.dump(self.metadata, f)
+        except Exception as e:
+            print(f"Failed to save vector store: {e}")
+    def reset_store(self):
+        """
+        Reset the vector store by clearing the FAISS index and metadata.
+        This is useful for starting fresh. The emptied store is saved
+        immediately, overwriting the persisted files.
+        """
+        self.index = faiss.IndexFlatL2(self.dimension)
+        self.metadata = []
+        self._save_vector_store()