Spaces:
Sleeping
Sleeping
# utils/vector_store.py
import os
import pickle
from pathlib import Path
from typing import Dict, List, Optional

import faiss
import numpy as np
class VectorStore:
    """FAISS-backed vector store with on-disk persistence.

    Keeps three parallel artifacts in ``persist_directory``:
    a FAISS ``IndexFlatL2`` (``faiss.index``), and the document texts plus
    per-chunk metadata pickled together (``documents.pkl``).  ``documents[i]``
    and ``metadata[i]`` correspond to row ``i`` of the FAISS index.
    """

    def __init__(self, persist_directory: str = "/data/faiss"):
        """Open (or lazily create) the store.

        Args:
            persist_directory: Where the index and pickle live.  The default
                is an absolute path because HF Spaces persists ``/data``;
                parameterized so local runs and tests can point elsewhere.
        """
        self.persist_directory = persist_directory
        # Created lazily on the first add_documents() call, because the
        # embedding dimension is unknown until we see vectors.
        self.index = None
        self.documents: List[str] = []
        self.metadata: List[Dict] = []
        # Ensure directories exist before any load/save touches disk.
        self._create_data_directories()
        # Try to load existing index and data.
        self._load_or_create_index()

    def _create_data_directories(self):
        """Create the persistence directory tree (idempotent)."""
        persist_path = Path(self.persist_directory)
        # parents=True also creates the data root (/data by default).
        persist_path.mkdir(parents=True, exist_ok=True)
        # Keep an "uploads" sibling next to the FAISS directory
        # (/data/uploads under the default layout).
        (persist_path.parent / "uploads").mkdir(parents=True, exist_ok=True)

    def _index_paths(self):
        """Return ``(index_path, data_path)`` for the persisted artifacts."""
        return (
            os.path.join(self.persist_directory, "faiss.index"),
            os.path.join(self.persist_directory, "documents.pkl"),
        )

    def _load_or_create_index(self):
        """Load a persisted index if both artifacts exist, else start empty.

        Any load failure degrades (with a logged message) to an empty store
        rather than crashing the app at startup.
        """
        index_path, data_path = self._index_paths()
        try:
            if os.path.exists(index_path) and os.path.exists(data_path):
                print(f"Loading existing index from {index_path}")
                self.index = faiss.read_index(index_path)
                # NOTE: unpickling is only acceptable because this file is
                # written exclusively by _save_index() below, never by users.
                with open(data_path, 'rb') as f:
                    data = pickle.load(f)
                self.documents = data['documents']
                self.metadata = data['metadata']
                print(f"Loaded {len(self.documents)} documents from existing index")
            else:
                print("No existing index found, creating new one")
                self.index = None  # created when the first vectors are added
                self.documents = []
                self.metadata = []
        except Exception as e:
            print(f"Error loading index: {e}")
            self.index = None
            self.documents = []
            self.metadata = []

    def _save_index(self):
        """Persist the FAISS index plus documents/metadata to disk.

        No-op while the index is still unset; save errors are logged, not
        raised, so a read-only disk never breaks in-memory operation.
        """
        if self.index is not None:
            index_path, data_path = self._index_paths()
            try:
                faiss.write_index(self.index, index_path)
                with open(data_path, 'wb') as f:
                    pickle.dump({
                        'documents': self.documents,
                        'metadata': self.metadata,
                    }, f)
            except Exception as e:
                print(f"Error saving index: {e}")

    def add_documents(self, chunks: List[Dict], metadata: Optional[Dict] = None):
        """Add document chunks to the store and persist it.

        Args:
            chunks: Each dict must carry ``"embeddings"`` (vector) and
                ``"text"`` (the chunk's content).
            metadata: Optional extra fields merged into every chunk's
                metadata record (overriding the generated keys on collision).
        """
        if not chunks:
            return
        vectors = np.array([chunk["embeddings"] for chunk in chunks])
        # First insertion fixes the index dimensionality.
        if self.index is None:
            self.index = faiss.IndexFlatL2(vectors.shape[1])
        # FAISS requires float32.
        self.index.add(vectors.astype(np.float32))
        for chunk in chunks:
            chunk_metadata = {
                "chunk_id": len(self.documents),  # == this chunk's FAISS row id
                "text_length": len(chunk["text"]),
            }
            if metadata:
                chunk_metadata.update(metadata)
            self.documents.append(chunk["text"])
            self.metadata.append(chunk_metadata)
        self._save_index()

    def search(self, query_vector: np.ndarray, n_results: int = 5) -> List[Dict]:
        """Return up to ``n_results`` nearest documents to ``query_vector``.

        Each result dict has ``"text"``, ``"metadata"`` and ``"distance"``
        (squared L2 — smaller is closer).  Empty list when the store is empty.
        """
        if self.index is None or self.index.ntotal == 0:
            return []
        # FAISS expects a 2-D (n_queries, dim) batch.
        if len(query_vector.shape) == 1:
            query_vector = query_vector.reshape(1, -1)
        distances, indices = self.index.search(query_vector.astype(np.float32), n_results)
        results = []
        for dist, idx in zip(distances[0], indices[0]):
            # FAISS pads with -1 when fewer than n_results vectors exist; a
            # bare `idx < len(...)` would accept -1 and wrongly return the
            # LAST document, so the lower bound is essential.
            if 0 <= idx < len(self.documents):
                results.append({
                    "text": self.documents[idx],
                    "metadata": self.metadata[idx],
                    "distance": float(dist),
                })
        return results

    def get_all_documents(self) -> List[Dict]:
        """Return every stored document as ``{"text": ..., "metadata": ...}``."""
        return [
            {"text": doc, "metadata": meta}
            for doc, meta in zip(self.documents, self.metadata)
        ]