pdf_chat_assistant / src /utils /vector_store.py
Seif-aber
implemented pdf chat assistant with gemini and RAG
edac567
"""Lightweight on-disk key → embedding store (pickle-based)."""
from __future__ import annotations
from typing import List, Dict, Optional, Any
import pickle
import os
class VectorStore:
    """Persist simple embedding entries (vector + metadata) to a pickle file.

    Entries live in ``self.embeddings``, a dict mapping a string key to
    ``{"vector": [...], "metadata": {...}}``. Every mutating method writes
    the whole dict back to ``storage_path``.
    """

    def __init__(self, storage_path: str) -> None:
        """
        Initialize the vector store.

        Creates the parent directory of ``storage_path`` when it has one,
        then loads any previously persisted entries.

        Args:
            storage_path: Path to pickle file used for persistence.
        """
        self.storage_path = storage_path
        self.embeddings: Dict[str, Dict[str, Any]] = {}
        # A bare filename has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError, so only create the directory when one is named.
        parent_dir = os.path.dirname(storage_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        self.load_embeddings()

    def load_embeddings(self) -> None:
        """Load embeddings from disk if file exists.

        Best-effort: on a read/unpickle error, or when the file does not
        contain a dict, the problem is printed and the store is reset to
        empty. When the file does not exist the in-memory entries are left
        as they are.
        """
        if not os.path.exists(self.storage_path):
            return
        try:
            with open(self.storage_path, "rb") as f:
                # SECURITY: pickle.load can execute arbitrary code. Only
                # point storage_path at files this application wrote itself.
                loaded = pickle.load(f)
        except Exception as e:
            print(f"[VectorStore] Error loading embeddings: {e}")
            self.embeddings = {}
            return
        if not isinstance(loaded, dict):
            # Every accessor relies on dict methods; a foreign payload would
            # otherwise fail much later with a confusing AttributeError.
            print(
                "[VectorStore] Error loading embeddings: "
                f"expected dict, got {type(loaded).__name__}"
            )
            self.embeddings = {}
            return
        self.embeddings = loaded

    def save_embeddings(self) -> None:
        """Persist current embeddings to disk.

        The data is written to a sibling temporary file and then moved over
        ``storage_path`` with ``os.replace``, so a failed write does not
        truncate the previously saved store. Errors are printed, not raised.
        """
        tmp_path = f"{self.storage_path}.tmp"
        try:
            with open(tmp_path, "wb") as f:
                pickle.dump(self.embeddings, f)
            os.replace(tmp_path, self.storage_path)
        except Exception as e:
            print(f"[VectorStore] Error saving embeddings: {e}")
            # Do not leave a partial temp file behind; it may not exist if
            # the open itself failed, hence the tolerated OSError.
            try:
                os.remove(tmp_path)
            except OSError:
                pass

    def add_embedding(self, key: str, vector: List[float], metadata: Optional[Dict] = None) -> None:
        """
        Add or overwrite an embedding entry and persist the store.

        Args:
            key: Unique identifier (e.g. 'pdf1_chunk_0')
            vector: Embedding vector as list of floats
            metadata: Optional metadata dictionary; a falsy value is stored
                as an empty dict.
        """
        self.embeddings[key] = {"vector": vector, "metadata": metadata or {}}
        self.save_embeddings()

    def get_embedding_data(self, key: str) -> Optional[Dict]:
        """
        Retrieve full embedding entry.

        Args:
            key: Embedding key

        Returns:
            Dict with 'vector' and 'metadata' or None.
        """
        return self.embeddings.get(key)

    def get_embedding_vector(self, key: str) -> Optional[List[float]]:
        """
        Retrieve only the vector.

        Args:
            key: Embedding key

        Returns:
            Vector list or None.
        """
        entry = self.embeddings.get(key)
        return entry["vector"] if entry else None

    def get_all_embeddings(self) -> List[str]:
        """
        List all embedding keys.

        Returns:
            List of keys.
        """
        return list(self.embeddings)

    def clear_embeddings(self) -> None:
        """Remove all embeddings and persist the empty store."""
        self.embeddings = {}
        self.save_embeddings()

    def remove_embeddings_by_prefix(self, prefix: str) -> None:
        """
        Remove embeddings whose keys start with prefix, then persist.

        Args:
            prefix: Key prefix filter.
        """
        # Collect first: deleting while iterating the dict would raise.
        to_remove = [k for k in self.embeddings if k.startswith(prefix)]
        for k in to_remove:
            del self.embeddings[k]
        self.save_embeddings()