Spaces:
Sleeping
Sleeping
| from langchain_community.vectorstores import FAISS | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from typing import List, Optional | |
| from src.config.settings import settings | |
| from src.agenticRAG.components.embeddings import EmbeddingFactory | |
| import os | |
| from typing import Dict, Any, List, Optional | |
| from pathlib import Path | |
| from src.agenticRAG.components.document_parsing import DocumentChunker | |
| class VectorStoreManager: | |
| """Manager for vector store operations""" | |
| def __init__(self): | |
| self.embeddings = EmbeddingFactory.get_embeddings() | |
| self.vectorstore = None | |
| def load_vectorstore(self, path: Optional[str] = None) -> bool: | |
| """Load vector store from path""" | |
| try: | |
| path = path or settings.VECTORSTORE_PATH | |
| if os.path.exists(path): | |
| self.vectorstore = FAISS.load_local(path, self.embeddings, allow_dangerous_deserialization=True) | |
| return True | |
| return False | |
| except Exception as e: | |
| print(f"Error loading vectorstore: {e}") | |
| return False | |
| def search_documents(self, query: str, k: int = 3) -> List[str]: | |
| """Search for similar documents""" | |
| if not self.vectorstore: | |
| return [] | |
| try: | |
| docs = self.vectorstore.similarity_search(query, k=k) | |
| return [doc.page_content for doc in docs] | |
| except Exception as e: | |
| print(f"Error searching documents: {e}") | |
| return [] | |
| def add_documents(self, texts: List[str], metadatas: Optional[List[dict]] = None): | |
| """Add documents to vector store""" | |
| if not self.vectorstore: | |
| self.vectorstore = FAISS.from_texts(texts, self.embeddings, metadatas=metadatas) | |
| else: | |
| self.vectorstore.add_texts(texts, metadatas=metadatas) | |
| def save_vectorstore(self, path: Optional[str] = None): | |
| """Save vector store to path""" | |
| if self.vectorstore: | |
| path = path or settings.VECTORSTORE_PATH | |
| self.vectorstore.save_local(path) | |
| def store_documents_in_vectorstore( | |
| file_paths: List[str], | |
| vectorstore_manager: Optional[VectorStoreManager] = None, | |
| chunk_size: int = 1000, | |
| chunk_overlap: int = 200, | |
| save_path: Optional[str] = None, | |
| include_metadata: bool = True | |
| ) -> Dict[str, Any]: | |
| """ | |
| Process documents and store them in vector store | |
| Args: | |
| file_paths (List[str]): List of file paths to process | |
| vectorstore_manager (VectorStoreManager, optional): Existing manager instance | |
| chunk_size (int): Size of each chunk | |
| chunk_overlap (int): Overlap between chunks | |
| save_path (str, optional): Path to save the vector store | |
| include_metadata (bool): Whether to include file metadata | |
| Returns: | |
| Dict[str, Any]: Processing results with statistics | |
| """ | |
| # Initialize components | |
| if vectorstore_manager is None: | |
| vectorstore_manager = VectorStoreManager() | |
| chunker = DocumentChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap) | |
| # Load existing vectorstore if available | |
| vectorstore_manager.load_vectorstore(save_path) | |
| # Track processing statistics | |
| results = { | |
| "total_files": len(file_paths), | |
| "processed_files": 0, | |
| "failed_files": [], | |
| "total_chunks": 0, | |
| "chunks_by_file": {} | |
| } | |
| try: | |
| for file_path in file_paths: | |
| try: | |
| print(f"Processing file: {file_path}") | |
| # Process file into chunks | |
| chunks = chunker.process_file(file_path) | |
| if chunks: | |
| # Prepare metadata if requested | |
| metadatas = None | |
| if include_metadata: | |
| file_name = Path(file_path).name | |
| file_extension = Path(file_path).suffix | |
| metadatas = [ | |
| { | |
| "source": file_path, | |
| "file_name": file_name, | |
| "file_extension": file_extension, | |
| "chunk_index": i | |
| } | |
| for i in range(len(chunks)) | |
| ] | |
| # Add documents to vector store | |
| vectorstore_manager.add_documents(chunks, metadatas) | |
| # Update statistics | |
| results["processed_files"] += 1 | |
| results["total_chunks"] += len(chunks) | |
| results["chunks_by_file"][file_path] = len(chunks) | |
| print(f"Successfully processed {file_path}: {len(chunks)} chunks") | |
| else: | |
| print(f"No chunks extracted from {file_path}") | |
| results["failed_files"].append(file_path) | |
| except Exception as e: | |
| print(f"Error processing file {file_path}: {e}") | |
| results["failed_files"].append(file_path) | |
| # Save the vector store | |
| if results["total_chunks"] > 0: | |
| vectorstore_manager.save_vectorstore(save_path) | |
| print(f"Vector store saved with {results['total_chunks']} total chunks") | |
| return results | |
| except Exception as e: | |
| print(f"Error in store_documents_in_vectorstore: {e}") | |
| results["error"] = str(e) | |
| return results | |
| def store_single_document_in_vectorstore( | |
| file_path: str, | |
| vectorstore_manager: Optional[VectorStoreManager] = None, | |
| chunk_size: int = 1000, | |
| chunk_overlap: int = 200, | |
| save_path: Optional[str] = None | |
| ) -> bool: | |
| """ | |
| Process and store a single document in vector store | |
| Args: | |
| file_path (str): Path to the file to process | |
| vectorstore_manager (VectorStoreManager, optional): Existing manager instance | |
| chunk_size (int): Size of each chunk | |
| chunk_overlap (int): Overlap between chunks | |
| save_path (str, optional): Path to save the vector store | |
| Returns: | |
| bool: Success status | |
| """ | |
| results = store_documents_in_vectorstore( | |
| file_paths=[file_path], | |
| vectorstore_manager=vectorstore_manager, | |
| chunk_size=chunk_size, | |
| chunk_overlap=chunk_overlap, | |
| save_path=save_path | |
| ) | |
| return results["processed_files"] > 0 | |
| def batch_store_documents( | |
| directory_path: str, | |
| file_extensions: List[str] = [".pdf", ".docx", ".txt", ".md"], | |
| vectorstore_manager: Optional[VectorStoreManager] = None, | |
| chunk_size: int = 1000, | |
| chunk_overlap: int = 200, | |
| save_path: Optional[str] = None | |
| ) -> Dict[str, Any]: | |
| """ | |
| Process and store all documents from a directory | |
| Args: | |
| directory_path (str): Path to directory containing documents | |
| file_extensions (List[str]): List of file extensions to process | |
| vectorstore_manager (VectorStoreManager, optional): Existing manager instance | |
| chunk_size (int): Size of each chunk | |
| chunk_overlap (int): Overlap between chunks | |
| save_path (str, optional): Path to save the vector store | |
| Returns: | |
| Dict[str, Any]: Processing results | |
| """ | |
| # Find all files with specified extensions | |
| directory = Path(directory_path) | |
| file_paths = [] | |
| for extension in file_extensions: | |
| file_paths.extend(directory.glob(f"*{extension}")) | |
| # Convert to string paths | |
| file_paths = [str(path) for path in file_paths] | |
| if not file_paths: | |
| print(f"No files found in {directory_path} with extensions {file_extensions}") | |
| return {"total_files": 0, "processed_files": 0, "failed_files": [], "total_chunks": 0} | |
| print(f"Found {len(file_paths)} files to process") | |
| return store_documents_in_vectorstore( | |
| file_paths=file_paths, | |
| vectorstore_manager=vectorstore_manager, | |
| chunk_size=chunk_size, | |
| chunk_overlap=chunk_overlap, | |
| save_path=save_path | |
| ) | |
| # Example usage | |
| def main(): | |
| """Example usage of the vector store functions""" | |
| # Initialize vector store manager | |
| vs_manager = VectorStoreManager() | |
| # Example 1: Store a single document | |
| print("=== Storing Single Document ===") | |
| file_path = "/home/ubuntu/OMANI-Therapist-Voice-ChatBot/KnowledgebaseFile/SuicideGuard_An_NLP-Based_Chrome_Extension_for_Detecting_Suicidal_Thoughts_in_Bengali.pdf" | |
| success = store_single_document_in_vectorstore( | |
| file_path=file_path, | |
| vectorstore_manager=vs_manager, | |
| chunk_size=1000, | |
| chunk_overlap=150 | |
| ) | |
| print(f"Single document processing: {'Success' if success else 'Failed'}") | |
| # # Example 2: Store multiple documents | |
| # print("\n=== Storing Multiple Documents ===") | |
| # file_paths = [ | |
| # "document1.pdf", | |
| # "document2.docx", | |
| # "document3.txt" | |
| # ] | |
| # results = store_documents_in_vectorstore( | |
| # file_paths=file_paths, | |
| # vectorstore_manager=vs_manager, | |
| # chunk_size=1000, | |
| # chunk_overlap=200 | |
| # ) | |
| # print(f"Processing Results:") | |
| # print(f" Total files: {results['total_files']}") | |
| # print(f" Processed files: {results['processed_files']}") | |
| # print(f" Failed files: {results['failed_files']}") | |
| # print(f" Total chunks: {results['total_chunks']}") | |
| # # Example 3: Batch process directory | |
| # print("\n=== Batch Processing Directory ===") | |
| # directory_path = "/home/ubuntu/OMANI-Therapist-Voice-ChatBot/KnowledgebaseFile/" | |
| # batch_results = batch_store_documents( | |
| # directory_path=directory_path, | |
| # file_extensions=[".pdf", ".docx", ".txt", ".md"], | |
| # vectorstore_manager=vs_manager | |
| # ) | |
| # print(f"Batch Processing Results:") | |
| # print(f" Total files: {batch_results['total_files']}") | |
| # print(f" Processed files: {batch_results['processed_files']}") | |
| # print(f" Total chunks: {batch_results['total_chunks']}") | |
| # Example 4: Search the vector store | |
| print("\n=== Searching Vector Store ===") | |
| query = "suicide prevention techniques" | |
| search_results = vs_manager.search_documents(query, k=3) | |
| print(f"Search results for '{query}':") | |
| for i, result in enumerate(search_results): | |
| print(f" Result {i+1}: {result[:200]}...") | |
| if __name__ == "__main__": | |
| main() |