|
|
from langchain_community.vectorstores import FAISS
|
|
|
from langchain_huggingface import HuggingFaceEmbeddings
|
|
|
from typing import List, Optional
|
|
|
from src.config.settings import settings
|
|
|
from src.agenticRAG.components.embeddings import EmbeddingFactory
|
|
|
import os
|
|
|
from typing import Dict, Any, List, Optional
|
|
|
from pathlib import Path
|
|
|
from src.agenticRAG.components.document_parsing import DocumentChunker
|
|
|
|
|
|
|
|
|
class VectorStoreManager:
    """Manage a FAISS vector store: load it, query it, extend it and persist it.

    Instance attributes (set in ``__init__``):
        embeddings: whatever ``EmbeddingFactory.get_embeddings()`` returns.
        vectorstore: the active FAISS store, or ``None`` until one is loaded
            from disk or created by ``add_documents``.
    """

    def __init__(self):
        self.embeddings = EmbeddingFactory.get_embeddings()
        self.vectorstore = None

    def load_vectorstore(self, path: Optional[str] = None) -> bool:
        """Load a vector store from disk into ``self.vectorstore``.

        Args:
            path: Location of the saved store. Falls back to
                ``settings.VECTORSTORE_PATH`` when omitted or empty.

        Returns:
            bool: True when a store was loaded; False when the path does not
            exist or loading raised (the error is printed, not re-raised).
        """
        try:
            path = path or settings.VECTORSTORE_PATH
            if not os.path.exists(path):
                return False
            # NOTE(security): allow_dangerous_deserialization opts in to
            # pickle-based loading (see the LangChain FAISS docs). Only point
            # this at stores written by this application.
            self.vectorstore = FAISS.load_local(
                path, self.embeddings, allow_dangerous_deserialization=True
            )
            return True
        except Exception as e:
            print(f"Error loading vectorstore: {e}")
            return False

    def search_documents(self, query: str, k: int = 3) -> List[str]:
        """Return the text of the ``k`` documents most similar to ``query``.

        Args:
            query: Free-text query passed to ``similarity_search``.
            k: Maximum number of hits requested.

        Returns:
            List[str]: ``page_content`` of each hit; an empty list when no
            store is loaded or the search raised (the error is printed).
        """
        if not self.vectorstore:
            return []

        try:
            docs = self.vectorstore.similarity_search(query, k=k)
            return [doc.page_content for doc in docs]
        except Exception as e:
            print(f"Error searching documents: {e}")
            return []

    def add_documents(self, texts: List[str], metadatas: Optional[List[dict]] = None):
        """Add texts to the store, creating the store on first use.

        Args:
            texts: Text chunks to embed and index. An empty list is a no-op.
            metadatas: Optional per-text metadata, forwarded unchanged.
        """
        # Nothing to index: return early instead of asking FAISS to build an
        # index from zero embeddings when no store exists yet.
        if not texts:
            return

        if not self.vectorstore:
            self.vectorstore = FAISS.from_texts(texts, self.embeddings, metadatas=metadatas)
        else:
            self.vectorstore.add_texts(texts, metadatas=metadatas)

    def save_vectorstore(self, path: Optional[str] = None):
        """Persist the current store to disk; does nothing when no store exists.

        Args:
            path: Destination. Falls back to ``settings.VECTORSTORE_PATH``
                when omitted or empty.
        """
        if self.vectorstore:
            path = path or settings.VECTORSTORE_PATH
            self.vectorstore.save_local(path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def store_documents_in_vectorstore(
    file_paths: List[str],
    vectorstore_manager: Optional[VectorStoreManager] = None,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    save_path: Optional[str] = None,
    include_metadata: bool = True
) -> Dict[str, Any]:
    """
    Process documents and store them in vector store

    Each file is chunked with ``DocumentChunker.process_file`` and its chunks
    are added to the manager's store. The store is saved once at the end, and
    only if at least one chunk was added.

    Args:
        file_paths (List[str]): List of file paths to process
        vectorstore_manager (VectorStoreManager, optional): Existing manager
            instance; a new one is created when omitted. If the manager
            already holds a store in memory it is extended as-is; otherwise an
            existing store is first loaded from ``save_path``.
        chunk_size (int): Size of each chunk
        chunk_overlap (int): Overlap between chunks
        save_path (str, optional): Path to load from / save the vector store to
        include_metadata (bool): Whether to attach source, file name,
            extension and chunk index to every chunk

    Returns:
        Dict[str, Any]: Processing results with the keys ``total_files``,
        ``processed_files``, ``failed_files`` (files that raised or yielded
        no chunks), ``total_chunks`` and ``chunks_by_file``. An ``error`` key
        is added only when an exception escaped the per-file handling
        (for example while saving).
    """
    if vectorstore_manager is None:
        vectorstore_manager = VectorStoreManager()

    chunker = DocumentChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    # load_vectorstore replaces manager.vectorstore. Only hydrate from disk
    # when the manager is empty, so documents a caller has added in memory
    # but not yet saved are not silently discarded.
    if getattr(vectorstore_manager, "vectorstore", None) is None:
        vectorstore_manager.load_vectorstore(save_path)

    results = {
        "total_files": len(file_paths),
        "processed_files": 0,
        "failed_files": [],
        "total_chunks": 0,
        "chunks_by_file": {}
    }

    try:
        for file_path in file_paths:
            # One bad file must not abort the batch: record it and move on.
            try:
                print(f"Processing file: {file_path}")

                chunks = chunker.process_file(file_path)

                if not chunks:
                    print(f"No chunks extracted from {file_path}")
                    results["failed_files"].append(file_path)
                    continue

                chunk_count = len(chunks)

                metadatas = None
                if include_metadata:
                    source = Path(file_path)
                    metadatas = [
                        {
                            "source": file_path,
                            "file_name": source.name,
                            "file_extension": source.suffix,
                            "chunk_index": index
                        }
                        for index in range(chunk_count)
                    ]

                vectorstore_manager.add_documents(chunks, metadatas)

                results["processed_files"] += 1
                results["total_chunks"] += chunk_count
                results["chunks_by_file"][file_path] = chunk_count

                print(f"Successfully processed {file_path}: {chunk_count} chunks")

            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
                results["failed_files"].append(file_path)

        # Persist once, after the whole batch, and only if something was added.
        if results["total_chunks"] > 0:
            vectorstore_manager.save_vectorstore(save_path)
            print(f"Vector store saved with {results['total_chunks']} total chunks")

        return results

    except Exception as e:
        print(f"Error in store_documents_in_vectorstore: {e}")
        results["error"] = str(e)
        return results
|
|
|
|
|
|
|
|
|
def store_single_document_in_vectorstore(
    file_path: str,
    vectorstore_manager: Optional[VectorStoreManager] = None,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    save_path: Optional[str] = None
) -> bool:
    """
    Store one document by delegating to ``store_documents_in_vectorstore``

    Args:
        file_path (str): Path to the file to process
        vectorstore_manager (VectorStoreManager, optional): Existing manager instance
        chunk_size (int): Size of each chunk
        chunk_overlap (int): Overlap between chunks
        save_path (str, optional): Path to save the vector store

    Returns:
        bool: True when the batch result reports at least one processed file
    """
    # Wrap the single path in a one-element batch and forward every option.
    forwarded = dict(
        file_paths=[file_path],
        vectorstore_manager=vectorstore_manager,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        save_path=save_path,
    )
    summary = store_documents_in_vectorstore(**forwarded)

    processed_count = summary["processed_files"]
    return processed_count > 0
|
|
|
|
|
|
|
|
|
def batch_store_documents(
    directory_path: str,
    file_extensions: Optional[List[str]] = None,
    vectorstore_manager: Optional["VectorStoreManager"] = None,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    save_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Process and store all documents from a directory

    Only the directory's immediate entries are considered (no recursion).
    Each file is processed once, even if it matches several extensions.

    Args:
        directory_path (str): Path to directory containing documents
        file_extensions (List[str], optional): File extensions to process;
            defaults to ``[".pdf", ".docx", ".txt", ".md"]`` when omitted
        vectorstore_manager (VectorStoreManager, optional): Existing manager instance
        chunk_size (int): Size of each chunk
        chunk_overlap (int): Overlap between chunks
        save_path (str, optional): Path to save the vector store

    Returns:
        Dict[str, Any]: Processing results, in the same shape as
        ``store_documents_in_vectorstore`` returns (including
        ``chunks_by_file``), also when no file matched
    """
    # Build the default per call instead of sharing one mutable list object
    # across every invocation.
    if file_extensions is None:
        file_extensions = [".pdf", ".docx", ".txt", ".md"]

    directory = Path(directory_path)

    # Dict keys give order-preserving de-duplication, so overlapping
    # extensions do not index the same file twice. Sorting makes the
    # processing order deterministic; glob order is otherwise unspecified.
    matched = {}
    for extension in file_extensions:
        for candidate in sorted(directory.glob(f"*{extension}")):
            # Skip directories whose names happen to end with the extension.
            if candidate.is_file():
                matched[str(candidate)] = None
    file_paths = list(matched)

    if not file_paths:
        print(f"No files found in {directory_path} with extensions {file_extensions}")
        return {
            "total_files": 0,
            "processed_files": 0,
            "failed_files": [],
            "total_chunks": 0,
            "chunks_by_file": {},
        }

    print(f"Found {len(file_paths)} files to process")

    return store_documents_in_vectorstore(
        file_paths=file_paths,
        vectorstore_manager=vectorstore_manager,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        save_path=save_path
    )
|
|
|
|
|
|
|
|
|
|
|
|
def main(
    file_path: str = "/home/ubuntu/OMANI-Therapist-Voice-ChatBot/KnowledgebaseFile/SuicideGuard_An_NLP-Based_Chrome_Extension_for_Detecting_Suicidal_Thoughts_in_Bengali.pdf",
    query: str = "suicide prevention techniques",
    k: int = 3,
):
    """Example usage of the vector store functions

    Stores one document through a fresh ``VectorStoreManager`` and then runs a
    similarity search against that same manager, printing the outcome.

    Args:
        file_path (str): Document to ingest. The default is the sample path
            this example was written with; pass another path to run it on a
            different machine.
        query (str): Text to search for after ingestion.
        k (int): Number of search results to request.
    """
    vs_manager = VectorStoreManager()

    print("=== Storing Single Document ===")
    success = store_single_document_in_vectorstore(
        file_path=file_path,
        vectorstore_manager=vs_manager,
        chunk_size=1000,
        chunk_overlap=150
    )
    print(f"Single document processing: {'Success' if success else 'Failed'}")

    print("\n=== Searching Vector Store ===")
    search_results = vs_manager.search_documents(query, k=k)

    print(f"Search results for '{query}':")
    for rank, result in enumerate(search_results, start=1):
        # Show only the first 200 characters of each hit to keep output short.
        print(f" Result {rank}: {result[:200]}...")
|
|
|
|
|
|
|
|
|
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()