Spaces:
Sleeping
Sleeping
| """ | |
| Vector store implementation for RAG capabilities. | |
| """ | |
| from typing import List, Dict, Any, Optional | |
| import json | |
| import os | |
| from pathlib import Path | |
| from langchain.vectorstores import FAISS | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.document_loaders import DirectoryLoader | |
class VectorStore:
    """
    Manages vector storage for RAG capabilities.

    Wraps a FAISS index that is persisted in the ``vector_db`` directory
    (relative to the current working directory) and exposes document loading
    and similarity search. When no index is available, ``search`` degrades to
    a case-insensitive substring match over caller-supplied documents.
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the vector store.

        Picks an embedding backend, ensures the ``vector_db`` directory
        exists, and leaves ``self.vector_store`` unset (``None``) until
        ``load_documents`` is called.

        Args:
            api_key: Optional OpenAI API key. Only used when the HuggingFace
                embeddings cannot be imported.

        Raises:
            ValueError: If the HuggingFace embeddings are unavailable and no
                API key was supplied.
        """
        self.api_key = api_key

        # Use free sentence-transformers embeddings (no API key needed).
        # The import is local so that a missing optional dependency surfaces
        # here as ImportError and can be handled by the fallback below.
        try:
            from langchain.embeddings import HuggingFaceEmbeddings
            self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
            print("✅ Using free HuggingFace embeddings")
        except ImportError:
            # Fallback to OpenAI if HuggingFace not available
            if api_key:
                from langchain.embeddings import OpenAIEmbeddings
                self.embeddings = OpenAIEmbeddings(api_key=api_key)
                print("✅ Using OpenAI embeddings")
            else:
                raise ValueError("HuggingFace not available and no OpenAI API key provided")

        self.vector_store_path = Path("vector_db")
        self.vector_store_path.mkdir(exist_ok=True)
        self.vector_store = None

    def _index_file(self) -> Path:
        """Return the path of the persisted FAISS index file."""
        return self.vector_store_path / "index.faiss"

    def load_documents(self, directory: Optional[str] = None) -> None:
        """
        Load documents from a directory and create embeddings.

        If no directory is provided, the directory does not exist, it holds
        no documents, or any step fails, a minimal default vector store is
        created instead. Errors are printed, never raised from the main path.

        Args:
            directory: Optional path to directory containing documents
        """
        try:
            # If no directory provided, create a minimal vector store
            if directory is None:
                self._create_minimal_vector_store()
                return

            # Check if directory exists
            if not os.path.exists(directory):
                print(f"Warning: Document directory {directory} not found. Creating minimal vector store.")
                self._create_minimal_vector_store()
                return

            # Try to load documents
            loader = DirectoryLoader(directory)
            documents = loader.load()
            if not documents:
                print("Warning: No documents found in directory. Creating minimal vector store.")
                self._create_minimal_vector_store()
                return

            # Split into overlapping chunks before embedding
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200,
            )
            texts = text_splitter.split_documents(documents)

            # Create or update vector store
            if self._index_file().exists():
                # NOTE(review): newer LangChain releases require
                # allow_dangerous_deserialization=True here because the
                # persisted docstore is pickled -- confirm against the
                # installed version before upgrading.
                self.vector_store = FAISS.load_local(
                    str(self.vector_store_path),
                    self.embeddings
                )
                # NOTE(review): chunks are appended without de-duplication,
                # so re-loading the same directory grows the index each time.
                self.vector_store.add_documents(texts)
            else:
                self.vector_store = FAISS.from_documents(
                    texts,
                    self.embeddings
                )

            # Persist so the index (including newly added chunks) survives restarts
            self.vector_store.save_local(str(self.vector_store_path))
        except Exception as e:
            # Deliberate best-effort: never let indexing errors reach the caller
            print(f"Error loading documents: {str(e)}")
            self._create_minimal_vector_store()

    def _create_minimal_vector_store(self) -> None:
        """
        Create a minimal vector store with default content.

        Reuses the index already persisted on disk when there is one;
        otherwise builds and saves a small index of placeholder texts.
        """
        try:
            default_texts = [
                "This is a default document. The vector store was initialized with minimal content.",
                "You can add your own documents to the vector store by placing them in the vector_db/documents directory.",
                "The application will automatically load and index any text files found in that directory."
            ]
            if self._index_file().exists():
                # NOTE(review): see load_documents -- newer LangChain releases
                # may need allow_dangerous_deserialization=True here.
                self.vector_store = FAISS.load_local(
                    str(self.vector_store_path),
                    self.embeddings
                )
            else:
                self.vector_store = FAISS.from_texts(
                    default_texts,
                    self.embeddings
                )
                # Only a freshly built index needs saving; a loaded one is
                # already on disk.
                self.vector_store.save_local(str(self.vector_store_path))
        except Exception as e:
            print(f"Error creating minimal vector store: {str(e)}")
            # Last resort: an in-memory index holding a single placeholder
            # document (not an empty index). This is not persisted and may
            # itself raise if the embedding backend is the thing that failed.
            self.vector_store = FAISS.from_texts(
                ["Default document"],
                self.embeddings
            )

    @staticmethod
    def _keyword_search(query: str, k: int, documents: Optional[List[str]]) -> List[Dict[str, Any]]:
        """
        Case-insensitive substring search used when vector search is unavailable.

        Args:
            query: Text to look for
            k: Maximum number of matches to return; values <= 0 yield no results
            documents: Candidate documents, or None

        Returns:
            Up to k matches in input order, each with a fixed score of 1.0
        """
        if not documents or k <= 0:
            return []
        needle = query.lower()
        matches: List[Dict[str, Any]] = []
        for doc in documents:
            if needle in doc.lower():
                matches.append({"content": doc, "score": 1.0, "metadata": {}})
                # Stop early instead of collecting everything and slicing
                if len(matches) >= k:
                    break
        return matches

    def search(self, query: str, k: int = 4, documents: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """
        Search for relevant documents based on query.

        Uses the vector store when one is loaded. If it is not loaded, or the
        vector search raises, falls back to a simple substring match over
        ``documents``.

        Args:
            query: Search query
            k: Number of results to return; values <= 0 return an empty list
            documents: Optional list of documents to search through (fallback)

        Returns:
            List of dicts with "content", "metadata" and "score" keys. For
            vector results the score is whatever the store reports; fallback
            matches always score 1.0.
        """
        # A non-positive k can never yield results. Guarding here also avoids
        # the negative-slice behavior (`[:k]`) of the old fallback, which
        # returned "all but the last |k|" matches for k < 0.
        if k <= 0:
            return []

        # If vector store is not available, fall back to simple text search
        if not self.vector_store:
            return self._keyword_search(query, k, documents)

        try:
            results = self.vector_store.similarity_search_with_score(query, k=k)
            return [
                {
                    "content": doc.page_content,
                    "metadata": getattr(doc, 'metadata', {}),
                    # Scores may be non-builtin numeric types; coerce when possible
                    "score": float(score) if hasattr(score, '__float__') else 0.0
                }
                for doc, score in results
            ]
        except Exception as e:
            print(f"Error in vector store search: {str(e)}")
            # Fall back to simple text search if available
            return self._keyword_search(query, k, documents)