Spaces:
Running
Running
| """Pinecone vector store operations.""" | |
| import os | |
| import logging | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Name of the Pinecone index that VectorStore connects to.
INDEX_NAME = "notebooklm"

# Maximum number of records sent to Pinecone in a single upsert call.
UPSERT_BATCH_SIZE = 100
| class VectorStore: | |
| """Pinecone client for upserting, deleting, and querying vectors.""" | |
| def __init__(self): | |
| self._index = None | |
| def _get_index(self): | |
| """Lazy-initialize the Pinecone index connection.""" | |
| if self._index is not None: | |
| return self._index | |
| from pinecone import Pinecone | |
| api_key = os.environ.get("Pinecone_API") | |
| if not api_key: | |
| raise RuntimeError( | |
| "Pinecone_API not found in environment. " | |
| "Add it as a Secret in your HF Space settings." | |
| ) | |
| pc = Pinecone(api_key=api_key) | |
| self._index = pc.Index(INDEX_NAME) | |
| logger.info("Connected to Pinecone index: %s", INDEX_NAME) | |
| return self._index | |
| def upsert(self, records: list[dict], namespace: str) -> int: | |
| """ | |
| Upsert embedding records into Pinecone in batches. | |
| Args: | |
| records: List of {"id": str, "values": list[float], "metadata": dict} | |
| namespace: Pinecone namespace (notebook_id) | |
| Returns: | |
| Number of vectors upserted | |
| """ | |
| index = self._get_index() | |
| total = 0 | |
| for i in range(0, len(records), UPSERT_BATCH_SIZE): | |
| batch = records[i : i + UPSERT_BATCH_SIZE] | |
| index.upsert(vectors=batch, namespace=namespace) | |
| total += len(batch) | |
| logger.info("Upserted %d vectors to namespace '%s'", total, namespace) | |
| return total | |
| def delete_by_source(self, source_id: str, namespace: str) -> None: | |
| """Delete all vectors belonging to a specific source.""" | |
| try: | |
| index = self._get_index() | |
| index.delete( | |
| filter={"source_id": {"$eq": source_id}}, | |
| namespace=namespace, | |
| ) | |
| logger.info("Deleted vectors for source '%s' from namespace '%s'", source_id, namespace) | |
| except Exception as e: | |
| logger.error("Failed to delete vectors from Pinecone: %s", e) | |
| def delete_namespace(self, namespace: str) -> None: | |
| """Delete all vectors in a namespace (when a notebook is deleted).""" | |
| try: | |
| index = self._get_index() | |
| index.delete(delete_all=True, namespace=namespace) | |
| logger.info("Deleted entire namespace '%s'", namespace) | |
| except Exception as e: | |
| logger.error("Failed to delete namespace from Pinecone: %s", e) | |
| def query(self, query_vector: list[float], namespace: str, top_k: int = 5, filter: dict | None = None) -> list[dict]: | |
| """ | |
| Query Pinecone for the most similar chunks. | |
| Returns list of {"text", "source_id", "source_filename", "chunk_index", "score"}. | |
| """ | |
| try: | |
| index = self._get_index() | |
| results = index.query( | |
| vector=query_vector, | |
| namespace=namespace, | |
| top_k=top_k, | |
| include_metadata=True, | |
| filter=filter, | |
| ) | |
| matches = [] | |
| for match in results.get("matches", []): | |
| meta = match.get("metadata", {}) | |
| matches.append({ | |
| "text": meta.get("text", ""), | |
| "source_id": meta.get("source_id", ""), | |
| "source_filename": meta.get("source_filename", ""), | |
| "chunk_index": meta.get("chunk_index", 0), | |
| "score": match.get("score", 0.0), | |
| }) | |
| return matches | |
| except Exception as e: | |
| logger.error("Pinecone query failed: %s", e) | |
| return [] | |