cryogenic22 committed on
Commit
ae7cccb
·
verified ·
1 Parent(s): 896a693

Update utils/vector_store.py

Browse files
Files changed (1) hide show
  1. utils/vector_store.py +82 -60
utils/vector_store.py CHANGED
@@ -1,68 +1,90 @@
1
- import streamlit as st
2
- from utils.document_processor import DocumentProcessor
3
- from components.template_generator import render_template_generator
4
- from components.document_viewer import DocumentViewer
5
- from typing import Tuple, List, Dict
 
6
 
7
- # Initialize components
8
- vector_store = VectorStore()
9
- doc_processor = DocumentProcessor()
10
 
11
- # Page configuration
12
- st.set_page_config(
13
- page_title="Legal AI Assistant",
14
- page_icon="βš–οΈ",
15
- layout="wide",
16
- initial_sidebar_state="expanded"
17
- )
18
 
19
- # Sidebar navigation
20
- tab = st.sidebar.radio(
21
- "Navigation",
22
- ["πŸ“ Manage Documents", "πŸ“ Generate Templates", "πŸ” Search Documents"]
23
- )
 
24
 
25
- # Tab 1: Manage Documents
26
- if tab == "πŸ“ Manage Documents":
27
- st.title("πŸ“ Manage Documents")
28
- uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])
29
-
30
- if uploaded_file:
31
- with st.spinner("Processing document..."):
32
- text, chunks = doc_processor.process_document(uploaded_file)
33
- st.success("Document processed successfully!")
34
 
35
- # Add to vector store
36
- vector_store.add_texts(
37
- texts=[chunk["text"] for chunk in chunks],
38
- metadatas=[{"text": chunk["text"], "chunk_id": chunk["chunk_id"], "filename": uploaded_file.name} for chunk in chunks]
39
- )
40
- st.success("Document added to vector store!")
41
 
42
- # List processed documents
43
- st.subheader("Processed Documents")
44
- processed_docs = vector_store.metadata
45
- if processed_docs:
46
- for idx, doc in enumerate(processed_docs):
47
- st.markdown(f"{idx+1}. **{doc.get('filename', 'Unknown')}** - Chunk ID: {doc['chunk_id']}")
48
- else:
49
- st.info("No documents uploaded yet.")
 
 
 
50
 
51
- # Tab 2: Generate Templates
52
- elif tab == "πŸ“ Generate Templates":
53
- st.title("πŸ“ Generate Templates")
54
- render_template_generator()
55
 
56
- # Tab 3: Search Documents
57
- elif tab == "πŸ” Search Documents":
58
- st.title("πŸ” Search Documents")
59
- query = st.text_input("Enter your query:")
60
- if query:
61
- with st.spinner("Searching for relevant chunks..."):
62
- results = vector_store.similarity_search(query)
63
- if results:
64
- st.success("Found relevant chunks:")
65
- for result in results:
66
- st.markdown(f"- **Chunk:** {result['text'][:200]}... (Relevance: {result['distance']:.2f})")
67
- else:
68
- st.warning("No relevant chunks found.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ import faiss
3
+ import numpy as np
4
+ from typing import List, Dict
5
+ import os
6
+ import pickle
7
 
 
 
 
8
 
9
class VectorStore:
    """FAISS-backed vector store.

    Embeds texts with a sentence-transformers model, keeps the vectors in a
    flat L2 index, and persists both the index and the per-vector metadata
    to disk so the store survives restarts.
    """

    def __init__(self, storage_path: str = "data/vector_store", dimension: int = 384):
        """
        Initialize the VectorStore.

        Args:
            storage_path (str): Directory holding the FAISS index and metadata files.
            dimension (int): Dimension of the embeddings (must match the
                embedding model; all-MiniLM-L6-v2 emits 384-dim vectors).
        """
        self.storage_path = storage_path
        os.makedirs(self.storage_path, exist_ok=True)

        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")  # Pre-trained model
        self.dimension = dimension

        # Start from an empty index/metadata, then overwrite from disk
        # if a previously saved store exists.
        self.index = faiss.IndexFlatL2(self.dimension)
        self.metadata: List[Dict] = []
        self._load_vector_store()

    def _load_vector_store(self):
        """Load the FAISS index and metadata from persistent storage, if present."""
        try:
            index_path = os.path.join(self.storage_path, "faiss.index")
            metadata_path = os.path.join(self.storage_path, "metadata.pkl")
            if os.path.exists(index_path) and os.path.exists(metadata_path):
                self.index = faiss.read_index(index_path)
                # NOTE(review): pickle.load is only safe on trusted, locally
                # written files — never point storage_path at untrusted data.
                with open(metadata_path, "rb") as f:
                    self.metadata = pickle.load(f)
        except Exception as e:
            # Best-effort load: a corrupt store falls back to the fresh empty index.
            print(f"Failed to load vector store: {e}")

    def add_texts(self, texts: List[str], metadatas: List[Dict] = None):
        """
        Embed texts and add them (with their metadata) to the vector store.

        Args:
            texts (List[str]): Text chunks to be added.
            metadatas (List[Dict]): Metadata dicts parallel to ``texts``;
                defaults to empty dicts when omitted.
        """
        if not texts:
            return  # nothing to embed; avoids adding a zero-row array to FAISS
        embeddings = self.embedder.encode(texts, show_progress_bar=True)
        self.index.add(np.array(embeddings).astype("float32"))
        self.metadata.extend(metadatas if metadatas else [{}] * len(texts))
        self._save_vector_store()

    def similarity_search(self, query: str, k: int = 5) -> List[Dict]:
        """
        Perform a similarity search for the given query.

        Args:
            query (str): The query text.
            k (int): Number of closest matches to retrieve.

        Returns:
            List[Dict]: Dicts with the matched chunk "text" and its L2
            "distance" (smaller means more similar).
        """
        query_embedding = self.embedder.encode([query]).astype("float32")
        distances, indices = self.index.search(query_embedding, k)
        # BUGFIX: FAISS pads `indices` with -1 when fewer than k vectors
        # exist; the old `i < len(...)` guard let -1 through and wrongly
        # returned metadata[-1]. Also use .get("text") because entries added
        # without explicit metadata are {} and direct indexing raised KeyError.
        return [
            {"text": self.metadata[i].get("text", ""), "distance": float(distances[0][j])}
            for j, i in enumerate(indices[0])
            if 0 <= i < len(self.metadata)
        ]

    def _save_vector_store(self):
        """Save the FAISS index and metadata to persistent storage."""
        try:
            index_path = os.path.join(self.storage_path, "faiss.index")
            metadata_path = os.path.join(self.storage_path, "metadata.pkl")
            faiss.write_index(self.index, index_path)
            with open(metadata_path, "wb") as f:
                pickle.dump(self.metadata, f)
        except Exception as e:
            # Persistence is best-effort; the in-memory store stays usable.
            print(f"Failed to save vector store: {e}")

    def reset_store(self):
        """
        Reset the vector store by clearing the FAISS index and metadata.
        This is useful for starting fresh.
        """
        self.index = faiss.IndexFlatL2(self.dimension)
        self.metadata = []
        self._save_vector_store()