Spaces:
Sleeping
Sleeping
File size: 5,090 Bytes
4339a4c ea88b9e 65387c4 ea88b9e 65387c4 ea88b9e 4339a4c 65387c4 4339a4c 65387c4 4339a4c 65387c4 4339a4c 65387c4 4339a4c 65387c4 4339a4c 65387c4 4339a4c 65387c4 ea88b9e 65387c4 ea88b9e 65387c4 ea88b9e 65387c4 ea88b9e 65387c4 ea88b9e 65387c4 ea88b9e 65387c4 ea88b9e 65387c4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
# utils/vector_store.py
import faiss
import numpy as np
from typing import List, Dict, Optional
import pickle
import os
from pathlib import Path
class VectorStore:
    """FAISS-backed vector store with on-disk persistence.

    Vectors live in a flat L2 index; the raw chunk texts and their
    per-chunk metadata are kept in parallel Python lists and pickled
    alongside the index so the store survives process restarts.
    Paths are absolute under /data, the persistent mount on HF Spaces.
    """

    def __init__(self):
        # Absolute path: HF Spaces persists /data across restarts.
        self.persist_directory = "/data/faiss"
        self.index = None     # faiss index; created lazily on first add
        self.documents = []   # chunk texts, parallel to index rows
        self.metadata = []    # per-chunk metadata dicts, parallel to documents
        # Ensure directories exist before any load/save attempt.
        self._create_data_directories()
        # Try to load existing index and data.
        self._load_or_create_index()

    def _create_data_directories(self):
        """Create the persistent data directories if they do not exist."""
        # Create main data directory
        Path("/data").mkdir(parents=True, exist_ok=True)
        # Create FAISS specific directory
        Path(self.persist_directory).mkdir(parents=True, exist_ok=True)
        # Create uploads directory
        Path("/data/uploads").mkdir(parents=True, exist_ok=True)

    def _load_or_create_index(self):
        """Load a previously saved index and documents, or start empty.

        Any load failure is logged and treated as "no index yet" so a
        corrupt or missing file never prevents startup.
        """
        index_path = os.path.join(self.persist_directory, "faiss.index")
        data_path = os.path.join(self.persist_directory, "documents.pkl")
        try:
            if os.path.exists(index_path) and os.path.exists(data_path):
                print(f"Loading existing index from {index_path}")
                self.index = faiss.read_index(index_path)
                # NOTE(security): pickle is only acceptable here because the
                # file is written by this same process in _save_index; never
                # point this at untrusted data.
                with open(data_path, 'rb') as f:
                    data = pickle.load(f)
                self.documents = data['documents']
                self.metadata = data['metadata']
                print(f"Loaded {len(self.documents)} documents from existing index")
            else:
                print("No existing index found, creating new one")
                # Index is created when the first vectors are added, since
                # the embedding dimension is unknown until then.
                self.index = None
                self.documents = []
                self.metadata = []
        except Exception as e:
            # Best-effort recovery: fall back to an empty store.
            print(f"Error loading index: {e}")
            self.index = None
            self.documents = []
            self.metadata = []

    def _save_index(self):
        """Persist the FAISS index and document/metadata lists to disk."""
        if self.index is None:
            return
        index_path = os.path.join(self.persist_directory, "faiss.index")
        data_path = os.path.join(self.persist_directory, "documents.pkl")
        try:
            # Save FAISS index
            faiss.write_index(self.index, index_path)
            # Save documents and metadata together so they stay in sync
            with open(data_path, 'wb') as f:
                pickle.dump({
                    'documents': self.documents,
                    'metadata': self.metadata
                }, f)
        except Exception as e:
            # Best-effort: an unsaved index is recoverable, a crash is not.
            print(f"Error saving index: {e}")

    def add_documents(self, chunks: List[Dict], metadata: Optional[Dict] = None):
        """Add embedded document chunks to the store and persist.

        Args:
            chunks: dicts each holding an "embeddings" vector and a
                "text" string.
            metadata: optional dict merged into every chunk's metadata
                (e.g. source filename).
        """
        if not chunks:
            return
        vectors = np.array([chunk["embeddings"] for chunk in chunks])
        # Lazily create the index now that the dimensionality is known.
        if self.index is None:
            dimension = vectors.shape[1]
            self.index = faiss.IndexFlatL2(dimension)
        # FAISS requires float32 input.
        self.index.add(vectors.astype(np.float32))
        # Store texts and metadata in the same order as the index rows.
        for chunk in chunks:
            chunk_metadata = {
                "chunk_id": len(self.documents),
                "text_length": len(chunk["text"])
            }
            if metadata:
                chunk_metadata.update(metadata)
            self.documents.append(chunk["text"])
            self.metadata.append(chunk_metadata)
        # Save updated index
        self._save_index()

    def search(self, query_vector: np.ndarray, n_results: int = 5) -> List[Dict]:
        """Return up to ``n_results`` nearest documents to ``query_vector``.

        Returns:
            List of {"text", "metadata", "distance"} dicts, nearest first;
            empty list when the store holds no vectors.
        """
        if self.index is None or self.index.ntotal == 0:
            return []
        # FAISS expects a 2-D (n_queries, dim) float32 array.
        if len(query_vector.shape) == 1:
            query_vector = query_vector.reshape(1, -1)
        distances, indices = self.index.search(query_vector.astype(np.float32), n_results)
        results = []
        for dist, idx in zip(distances[0], indices[0]):
            # BUGFIX: FAISS pads `indices` with -1 when fewer than
            # n_results vectors exist; the old `idx < len(...)` check let
            # -1 through and wrongly returned documents[-1]. Require a
            # valid non-negative index.
            if 0 <= idx < len(self.documents):
                results.append({
                    "text": self.documents[idx],
                    "metadata": self.metadata[idx],
                    "distance": float(dist)
                })
        return results

    def get_all_documents(self) -> List[Dict]:
        """Return every stored document paired with its metadata."""
        return [
            {"text": doc, "metadata": meta}
            for doc, meta in zip(self.documents, self.metadata)
        ]