Spaces:
Sleeping
Sleeping
File size: 6,718 Bytes
7644eac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
"""
Vector store implementation for RAG capabilities.
"""
from typing import List, Dict, Any, Optional
import json
import os
from pathlib import Path
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
class VectorStore:
    """
    Manages a persistent FAISS vector store for RAG capabilities.

    Embeddings are produced by a local HuggingFace sentence-transformers
    model when available (free, no API key), falling back to OpenAI
    embeddings when an API key is supplied. The index is persisted under
    the ``vector_db/`` directory. All failures during indexing are handled
    best-effort: the store degrades to a minimal default index rather than
    raising at startup.
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the vector store and select an embedding backend.

        Args:
            api_key: Optional OpenAI API key, used only when the free
                HuggingFace embeddings are not installed.

        Raises:
            ValueError: If HuggingFace embeddings are unavailable and no
                OpenAI API key was provided.
        """
        self.api_key = api_key
        # Prefer free sentence-transformers embeddings (no API key needed).
        try:
            from langchain.embeddings import HuggingFaceEmbeddings
            self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
            print("✅ Using free HuggingFace embeddings")
        except ImportError:
            # Fallback to OpenAI if HuggingFace not available
            if api_key:
                from langchain.embeddings import OpenAIEmbeddings
                self.embeddings = OpenAIEmbeddings(api_key=api_key)
                print("✅ Using OpenAI embeddings")
            else:
                raise ValueError("HuggingFace not available and no OpenAI API key provided")
        self.vector_store_path = Path("vector_db")
        # parents=True makes creation robust if the path is ever nested.
        self.vector_store_path.mkdir(parents=True, exist_ok=True)
        # Populated lazily by load_documents() / _create_minimal_vector_store().
        self.vector_store = None

    def load_documents(self, directory: Optional[str] = None) -> None:
        """
        Load documents from a directory and create embeddings.

        If no directory is provided (or it is missing/empty), a minimal
        default vector store is created instead. Any indexing error is
        caught and also falls back to the minimal store.

        Args:
            directory: Optional path to a directory containing documents.
        """
        try:
            # No directory provided: build the minimal default store.
            if directory is None:
                self._create_minimal_vector_store()
                return
            # Missing directory: warn and fall back.
            if not os.path.exists(directory):
                print(f"Warning: Document directory {directory} not found. Creating minimal vector store.")
                self._create_minimal_vector_store()
                return
            # Load raw documents from disk.
            loader = DirectoryLoader(directory)
            documents = loader.load()
            if not documents:
                print("Warning: No documents found in directory. Creating minimal vector store.")
                self._create_minimal_vector_store()
                return
            # Split into overlapping chunks suitable for embedding.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200,
            )
            texts = text_splitter.split_documents(documents)
            # Update an existing on-disk index, or build a fresh one.
            if os.path.exists(self.vector_store_path / "index.faiss"):
                # NOTE(review): newer langchain releases require
                # allow_dangerous_deserialization=True for load_local —
                # confirm against the pinned langchain version.
                self.vector_store = FAISS.load_local(
                    str(self.vector_store_path),
                    self.embeddings
                )
                self.vector_store.add_documents(texts)
            else:
                self.vector_store = FAISS.from_documents(
                    texts,
                    self.embeddings
                )
            self.vector_store.save_local(str(self.vector_store_path))
        except Exception as e:
            # Best-effort: indexing failures must not break application startup.
            print(f"Error loading documents: {str(e)}")
            self._create_minimal_vector_store()

    def _create_minimal_vector_store(self) -> None:
        """Create (or reload) a minimal vector store with default content."""
        try:
            default_texts = [
                "This is a default document. The vector store was initialized with minimal content.",
                "You can add your own documents to the vector store by placing them in the vector_db/documents directory.",
                "The application will automatically load and index any text files found in that directory."
            ]
            # Reuse an existing persisted index if one is present; otherwise
            # seed a new index from the default texts.
            if os.path.exists(self.vector_store_path / "index.faiss"):
                self.vector_store = FAISS.load_local(
                    str(self.vector_store_path),
                    self.embeddings
                )
            else:
                self.vector_store = FAISS.from_texts(
                    default_texts,
                    self.embeddings
                )
            self.vector_store.save_local(str(self.vector_store_path))
        except Exception as e:
            print(f"Error creating minimal vector store: {str(e)}")
            # Create an in-memory FAISS index as a last resort (not persisted).
            self.vector_store = FAISS.from_texts(
                ["Default document"],
                self.embeddings
            )

    def _keyword_fallback(self, query: str, documents: List[str], k: int) -> List[Dict[str, Any]]:
        """
        Case-insensitive substring search over raw strings (no embeddings).

        Used whenever the FAISS index is unavailable or errors out. Every
        match gets a fixed score of 1.0 and empty metadata, mirroring the
        shape of vector-search results.
        """
        needle = query.lower()
        return [
            {"content": doc, "score": 1.0, "metadata": {}}
            for doc in documents
            if needle in doc.lower()
        ][:k]

    def search(self, query: str, k: int = 4, documents: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """
        Search for relevant documents based on query.

        Args:
            query: Search query.
            k: Maximum number of results to return.
            documents: Optional list of raw document strings used as a
                keyword-search fallback when vector search is unavailable.

        Returns:
            List of dicts with "content", "score", and "metadata" keys.
        """
        # No vector store: fall back to simple text search (or nothing).
        if not self.vector_store:
            if not documents:
                return []
            return self._keyword_fallback(query, documents, k)
        try:
            results = self.vector_store.similarity_search_with_score(query, k=k)
            formatted_results = []
            for doc, score in results:
                formatted_results.append({
                    "content": doc.page_content,
                    # metadata may be absent on some document types — default to {}.
                    "metadata": getattr(doc, 'metadata', {}),
                    "score": float(score) if hasattr(score, '__float__') else 0.0
                })
            return formatted_results
        except Exception as e:
            print(f"Error in vector store search: {str(e)}")
            # Fall back to simple text search if raw documents were supplied.
            if documents:
                return self._keyword_fallback(query, documents, k)
            return []
|