File size: 5,090 Bytes
4339a4c
ea88b9e
65387c4
ea88b9e
65387c4
 
 
 
ea88b9e
 
4339a4c
 
 
65387c4
 
 
 
4339a4c
 
65387c4
 
 
 
4339a4c
 
 
 
 
 
 
 
 
65387c4
 
 
 
 
 
 
4339a4c
65387c4
 
 
 
 
 
 
 
4339a4c
65387c4
4339a4c
65387c4
 
 
 
 
 
 
 
 
 
4339a4c
65387c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea88b9e
 
 
65387c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea88b9e
65387c4
 
ea88b9e
 
 
65387c4
 
 
 
 
 
ea88b9e
65387c4
ea88b9e
65387c4
 
 
 
 
 
 
 
 
ea88b9e
65387c4
 
 
 
 
 
 
 
 
 
 
ea88b9e
 
 
65387c4
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142

# utils/vector_store.py
import faiss
import numpy as np
from typing import List, Dict, Optional
import pickle
import os
from pathlib import Path

class VectorStore:
    """FAISS-backed vector store with simple pickle persistence.

    Vectors live in a flat L2 FAISS index; the chunk texts and their
    metadata are kept in parallel Python lists and persisted alongside
    the index so positions stay aligned across restarts.
    """

    def __init__(self, persist_directory: str = "/data/faiss"):
        """Initialize the store and load any previously persisted state.

        Args:
            persist_directory: Directory holding ``faiss.index`` and
                ``documents.pkl``. Defaults to the absolute path used on
                HF Spaces persistent storage.
        """
        self.persist_directory = persist_directory
        self.index = None          # faiss.Index; created lazily on first add
        self.documents: List[str] = []
        self.metadata: List[Dict] = []

        # Ensure directories exist before any read/write attempt.
        self._create_data_directories()

        # Try to load existing index and data.
        self._load_or_create_index()

    def _create_data_directories(self):
        """Create the persistence and upload directories if missing."""
        # Main data directory (parent of persist_directory on HF Spaces).
        Path("/data").mkdir(parents=True, exist_ok=True)
        # FAISS-specific directory.
        Path(self.persist_directory).mkdir(parents=True, exist_ok=True)
        # Uploads directory (used elsewhere in the app).
        Path("/data/uploads").mkdir(parents=True, exist_ok=True)

    def _load_or_create_index(self):
        """Load a persisted index + documents, or start empty.

        Any load failure (corrupt file, schema change, ...) falls back to
        an empty store rather than crashing at startup.
        """
        index_path = os.path.join(self.persist_directory, "faiss.index")
        data_path = os.path.join(self.persist_directory, "documents.pkl")

        try:
            if os.path.exists(index_path) and os.path.exists(data_path):
                print(f"Loading existing index from {index_path}")
                self.index = faiss.read_index(index_path)

                # NOTE(security): pickle.load executes arbitrary code if the
                # file is attacker-controlled; acceptable here only because
                # /data is app-private storage.
                with open(data_path, 'rb') as f:
                    data = pickle.load(f)
                    self.documents = data['documents']
                    self.metadata = data['metadata']
                print(f"Loaded {len(self.documents)} documents from existing index")
            else:
                print("No existing index found, creating new one")
                # Index is created lazily when the first vectors arrive,
                # because the embedding dimension is unknown until then.
                self.index = None
                self.documents = []
                self.metadata = []
        except Exception as e:
            print(f"Error loading index: {e}")
            self.index = None
            self.documents = []
            self.metadata = []

    def _save_index(self):
        """Persist the FAISS index and the parallel document/metadata lists.

        Best-effort: failures are logged, not raised, so a full disk does
        not take down the request that triggered the save.
        """
        if self.index is not None:
            index_path = os.path.join(self.persist_directory, "faiss.index")
            data_path = os.path.join(self.persist_directory, "documents.pkl")

            try:
                faiss.write_index(self.index, index_path)

                with open(data_path, 'wb') as f:
                    pickle.dump({
                        'documents': self.documents,
                        'metadata': self.metadata
                    }, f)
            except Exception as e:
                print(f"Error saving index: {e}")

    def add_documents(self, chunks: List[Dict], metadata: Optional[Dict] = None):
        """Add embedded chunks to the store and persist.

        Args:
            chunks: Dicts with an ``"embeddings"`` vector and a ``"text"``
                string each. All vectors must share one dimension.
            metadata: Optional extra metadata merged into every chunk's
                per-chunk metadata record.
        """
        if not chunks:
            return

        vectors = np.array([chunk["embeddings"] for chunk in chunks])

        # Lazily create the index now that the dimension is known.
        if self.index is None:
            dimension = vectors.shape[1]
            self.index = faiss.IndexFlatL2(dimension)

        # FAISS requires float32.
        self.index.add(vectors.astype(np.float32))

        # Keep documents/metadata lists aligned with index positions.
        for chunk in chunks:
            chunk_metadata = {
                "chunk_id": len(self.documents),
                "text_length": len(chunk["text"])
            }
            if metadata:
                chunk_metadata.update(metadata)

            self.documents.append(chunk["text"])
            self.metadata.append(chunk_metadata)

        self._save_index()

    def search(self, query_vector: np.ndarray, n_results: int = 5) -> List[Dict]:
        """Return up to ``n_results`` nearest documents to ``query_vector``.

        Args:
            query_vector: 1-D or (1, dim) query embedding.
            n_results: Maximum number of hits to return.

        Returns:
            Dicts with ``text``, ``metadata`` and L2 ``distance``, nearest
            first. Empty list when the store holds no vectors.
        """
        if self.index is None or self.index.ntotal == 0:
            return []

        # FAISS expects a 2-D (n_queries, dim) array.
        if len(query_vector.shape) == 1:
            query_vector = query_vector.reshape(1, -1)

        distances, indices = self.index.search(query_vector.astype(np.float32), n_results)

        results = []
        for dist, idx in zip(distances[0], indices[0]):
            # BUGFIX: FAISS pads missing hits with idx == -1; the old check
            # (idx < len) let -1 through and negative indexing silently
            # returned the *last* document. Require a valid range instead.
            if 0 <= idx < len(self.documents):
                results.append({
                    "text": self.documents[idx],
                    "metadata": self.metadata[idx],
                    "distance": float(dist)
                })

        return results

    def get_all_documents(self) -> List[Dict]:
        """Return every stored document paired with its metadata."""
        return [
            {"text": doc, "metadata": meta}
            for doc, meta in zip(self.documents, self.metadata)
        ]