"""Pinecone vector store operations.""" import os import logging logger = logging.getLogger(__name__) INDEX_NAME = "notebooklm" UPSERT_BATCH_SIZE = 100 class VectorStore: """Pinecone client for upserting, deleting, and querying vectors.""" def __init__(self): self._index = None def _get_index(self): """Lazy-initialize the Pinecone index connection.""" if self._index is not None: return self._index from pinecone import Pinecone api_key = os.environ.get("Pinecone_API") if not api_key: raise RuntimeError( "Pinecone_API not found in environment. " "Add it as a Secret in your HF Space settings." ) pc = Pinecone(api_key=api_key) self._index = pc.Index(INDEX_NAME) logger.info("Connected to Pinecone index: %s", INDEX_NAME) return self._index def upsert(self, records: list[dict], namespace: str) -> int: """ Upsert embedding records into Pinecone in batches. Args: records: List of {"id": str, "values": list[float], "metadata": dict} namespace: Pinecone namespace (notebook_id) Returns: Number of vectors upserted """ index = self._get_index() total = 0 for i in range(0, len(records), UPSERT_BATCH_SIZE): batch = records[i : i + UPSERT_BATCH_SIZE] index.upsert(vectors=batch, namespace=namespace) total += len(batch) logger.info("Upserted %d vectors to namespace '%s'", total, namespace) return total def delete_by_source(self, source_id: str, namespace: str) -> None: """Delete all vectors belonging to a specific source.""" try: index = self._get_index() index.delete( filter={"source_id": {"$eq": source_id}}, namespace=namespace, ) logger.info("Deleted vectors for source '%s' from namespace '%s'", source_id, namespace) except Exception as e: logger.error("Failed to delete vectors from Pinecone: %s", e) def delete_namespace(self, namespace: str) -> None: """Delete all vectors in a namespace (when a notebook is deleted).""" try: index = self._get_index() index.delete(delete_all=True, namespace=namespace) logger.info("Deleted entire namespace '%s'", namespace) except Exception as e: logger.error("Failed to delete namespace from Pinecone: %s", e) def query(self, query_vector: list[float], namespace: str, top_k: int = 5, filter: dict | None = None) -> list[dict]: """ Query Pinecone for the most similar chunks. Returns list of {"text", "source_id", "source_filename", "chunk_index", "score"}. """ try: index = self._get_index() results = index.query( vector=query_vector, namespace=namespace, top_k=top_k, include_metadata=True, filter=filter, ) matches = [] for match in results.get("matches", []): meta = match.get("metadata", {}) matches.append({ "text": meta.get("text", ""), "source_id": meta.get("source_id", ""), "source_filename": meta.get("source_filename", ""), "chunk_index": meta.get("chunk_index", 0), "score": match.get("score", 0.0), }) return matches except Exception as e: logger.error("Pinecone query failed: %s", e) return []