# NotebookLM / persistence / vector_store.py
# internomega-terrablue
# source selection
# 690fe5e
"""Pinecone vector store operations."""
import os
import logging
logger = logging.getLogger(__name__)
INDEX_NAME = "notebooklm"
UPSERT_BATCH_SIZE = 100
class VectorStore:
"""Pinecone client for upserting, deleting, and querying vectors."""
def __init__(self):
self._index = None
def _get_index(self):
"""Lazy-initialize the Pinecone index connection."""
if self._index is not None:
return self._index
from pinecone import Pinecone
api_key = os.environ.get("Pinecone_API")
if not api_key:
raise RuntimeError(
"Pinecone_API not found in environment. "
"Add it as a Secret in your HF Space settings."
)
pc = Pinecone(api_key=api_key)
self._index = pc.Index(INDEX_NAME)
logger.info("Connected to Pinecone index: %s", INDEX_NAME)
return self._index
def upsert(self, records: list[dict], namespace: str) -> int:
"""
Upsert embedding records into Pinecone in batches.
Args:
records: List of {"id": str, "values": list[float], "metadata": dict}
namespace: Pinecone namespace (notebook_id)
Returns:
Number of vectors upserted
"""
index = self._get_index()
total = 0
for i in range(0, len(records), UPSERT_BATCH_SIZE):
batch = records[i : i + UPSERT_BATCH_SIZE]
index.upsert(vectors=batch, namespace=namespace)
total += len(batch)
logger.info("Upserted %d vectors to namespace '%s'", total, namespace)
return total
def delete_by_source(self, source_id: str, namespace: str) -> None:
"""Delete all vectors belonging to a specific source."""
try:
index = self._get_index()
index.delete(
filter={"source_id": {"$eq": source_id}},
namespace=namespace,
)
logger.info("Deleted vectors for source '%s' from namespace '%s'", source_id, namespace)
except Exception as e:
logger.error("Failed to delete vectors from Pinecone: %s", e)
def delete_namespace(self, namespace: str) -> None:
"""Delete all vectors in a namespace (when a notebook is deleted)."""
try:
index = self._get_index()
index.delete(delete_all=True, namespace=namespace)
logger.info("Deleted entire namespace '%s'", namespace)
except Exception as e:
logger.error("Failed to delete namespace from Pinecone: %s", e)
def query(self, query_vector: list[float], namespace: str, top_k: int = 5, filter: dict | None = None) -> list[dict]:
"""
Query Pinecone for the most similar chunks.
Returns list of {"text", "source_id", "source_filename", "chunk_index", "score"}.
"""
try:
index = self._get_index()
results = index.query(
vector=query_vector,
namespace=namespace,
top_k=top_k,
include_metadata=True,
filter=filter,
)
matches = []
for match in results.get("matches", []):
meta = match.get("metadata", {})
matches.append({
"text": meta.get("text", ""),
"source_id": meta.get("source_id", ""),
"source_filename": meta.get("source_filename", ""),
"chunk_index": meta.get("chunk_index", 0),
"score": match.get("score", 0.0),
})
return matches
except Exception as e:
logger.error("Pinecone query failed: %s", e)
return []