Spaces:
Sleeping
Sleeping
# utils/vector_store.py
import os
import pickle
from pathlib import Path
from typing import Dict, List, Optional

import faiss
import numpy as np
class VectorStore:
    """FAISS-backed vector store with on-disk persistence.

    Keeps three parallel artifacts in ``persist_directory``:
    a FAISS ``IndexFlatL2`` (``faiss.index``), and the document texts plus
    per-chunk metadata pickled together (``documents.pkl``).  ``documents[i]``
    and ``metadata[i]`` correspond to row ``i`` of the FAISS index.
    """

    def __init__(self, persist_directory: str = "/data/faiss"):
        """Open (or lazily create) the store.

        Args:
            persist_directory: Where the index and pickle live.  The default
                is an absolute path because HF Spaces persists ``/data``;
                parameterized so local runs and tests can point elsewhere.
        """
        self.persist_directory = persist_directory
        # Created lazily on the first add_documents() call, because the
        # embedding dimension is unknown until we see vectors.
        self.index = None
        self.documents: List[str] = []
        self.metadata: List[Dict] = []
        # Ensure directories exist before any load/save touches disk.
        self._create_data_directories()
        # Try to load existing index and data.
        self._load_or_create_index()

    def _create_data_directories(self):
        """Create the persistence directory tree (idempotent)."""
        persist_path = Path(self.persist_directory)
        # parents=True also creates the data root (/data by default).
        persist_path.mkdir(parents=True, exist_ok=True)
        # Keep an "uploads" sibling next to the FAISS directory
        # (/data/uploads under the default layout).
        (persist_path.parent / "uploads").mkdir(parents=True, exist_ok=True)

    def _index_paths(self):
        """Return ``(index_path, data_path)`` for the persisted artifacts."""
        return (
            os.path.join(self.persist_directory, "faiss.index"),
            os.path.join(self.persist_directory, "documents.pkl"),
        )

    def _load_or_create_index(self):
        """Load a persisted index if both artifacts exist, else start empty.

        Any load failure degrades (with a logged message) to an empty store
        rather than crashing the app at startup.
        """
        index_path, data_path = self._index_paths()
        try:
            if os.path.exists(index_path) and os.path.exists(data_path):
                print(f"Loading existing index from {index_path}")
                self.index = faiss.read_index(index_path)
                # NOTE: unpickling is only acceptable because this file is
                # written exclusively by _save_index() below, never by users.
                with open(data_path, 'rb') as f:
                    data = pickle.load(f)
                self.documents = data['documents']
                self.metadata = data['metadata']
                print(f"Loaded {len(self.documents)} documents from existing index")
            else:
                print("No existing index found, creating new one")
                self.index = None  # created when the first vectors are added
                self.documents = []
                self.metadata = []
        except Exception as e:
            print(f"Error loading index: {e}")
            self.index = None
            self.documents = []
            self.metadata = []

    def _save_index(self):
        """Persist the FAISS index plus documents/metadata to disk.

        No-op while the index is still unset; save errors are logged, not
        raised, so a read-only disk never breaks in-memory operation.
        """
        if self.index is not None:
            index_path, data_path = self._index_paths()
            try:
                faiss.write_index(self.index, index_path)
                with open(data_path, 'wb') as f:
                    pickle.dump({
                        'documents': self.documents,
                        'metadata': self.metadata,
                    }, f)
            except Exception as e:
                print(f"Error saving index: {e}")

    def add_documents(self, chunks: List[Dict], metadata: Optional[Dict] = None):
        """Add document chunks to the store and persist it.

        Args:
            chunks: Each dict must carry ``"embeddings"`` (vector) and
                ``"text"`` (the chunk's content).
            metadata: Optional extra fields merged into every chunk's
                metadata record (overriding the generated keys on collision).
        """
        if not chunks:
            return
        vectors = np.array([chunk["embeddings"] for chunk in chunks])
        # First insertion fixes the index dimensionality.
        if self.index is None:
            self.index = faiss.IndexFlatL2(vectors.shape[1])
        # FAISS requires float32.
        self.index.add(vectors.astype(np.float32))
        for chunk in chunks:
            chunk_metadata = {
                "chunk_id": len(self.documents),  # == this chunk's FAISS row id
                "text_length": len(chunk["text"]),
            }
            if metadata:
                chunk_metadata.update(metadata)
            self.documents.append(chunk["text"])
            self.metadata.append(chunk_metadata)
        self._save_index()

    def search(self, query_vector: np.ndarray, n_results: int = 5) -> List[Dict]:
        """Return up to ``n_results`` nearest documents to ``query_vector``.

        Each result dict has ``"text"``, ``"metadata"`` and ``"distance"``
        (squared L2 — smaller is closer).  Empty list when the store is empty.
        """
        if self.index is None or self.index.ntotal == 0:
            return []
        # FAISS expects a 2-D (n_queries, dim) batch.
        if len(query_vector.shape) == 1:
            query_vector = query_vector.reshape(1, -1)
        distances, indices = self.index.search(query_vector.astype(np.float32), n_results)
        results = []
        for dist, idx in zip(distances[0], indices[0]):
            # FAISS pads with -1 when fewer than n_results vectors exist; a
            # bare `idx < len(...)` would accept -1 and wrongly return the
            # LAST document, so the lower bound is essential.
            if 0 <= idx < len(self.documents):
                results.append({
                    "text": self.documents[idx],
                    "metadata": self.metadata[idx],
                    "distance": float(dist),
                })
        return results

    def get_all_documents(self) -> List[Dict]:
        """Return every stored document as ``{"text": ..., "metadata": ...}``."""
        return [
            {"text": doc, "metadata": meta}
            for doc, meta in zip(self.documents, self.metadata)
        ]