File size: 3,768 Bytes
9f911b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a14fec
9f911b3
 
1a14fec
9f911b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
690fe5e
9f911b3
 
 
 
 
 
 
 
 
 
 
 
690fe5e
9f911b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""Pinecone vector store operations."""

import os
import logging

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Name of the Pinecone index that VectorStore connects to.
INDEX_NAME = "notebooklm"
# Maximum number of records sent in one upsert call; VectorStore.upsert
# slices larger inputs into batches of this size.
UPSERT_BATCH_SIZE = 100


class VectorStore:
    """Pinecone client for upserting, deleting, and querying vectors."""

    def __init__(self):
        self._index = None

    def _get_index(self):
        """Lazy-initialize the Pinecone index connection."""
        if self._index is not None:
            return self._index

        from pinecone import Pinecone

        api_key = os.environ.get("Pinecone_API")
        if not api_key:
            raise RuntimeError(
                "Pinecone_API not found in environment. "
                "Add it as a Secret in your HF Space settings."
            )

        pc = Pinecone(api_key=api_key)
        self._index = pc.Index(INDEX_NAME)
        logger.info("Connected to Pinecone index: %s", INDEX_NAME)
        return self._index

    def upsert(self, records: list[dict], namespace: str) -> int:
        """
        Upsert embedding records into Pinecone in batches.

        Args:
            records: List of {"id": str, "values": list[float], "metadata": dict}
            namespace: Pinecone namespace (notebook_id)

        Returns:
            Number of vectors upserted
        """
        index = self._get_index()
        total = 0
        for i in range(0, len(records), UPSERT_BATCH_SIZE):
            batch = records[i : i + UPSERT_BATCH_SIZE]
            index.upsert(vectors=batch, namespace=namespace)
            total += len(batch)
        logger.info("Upserted %d vectors to namespace '%s'", total, namespace)
        return total

    def delete_by_source(self, source_id: str, namespace: str) -> None:
        """Delete all vectors belonging to a specific source."""
        try:
            index = self._get_index()
            index.delete(
                filter={"source_id": {"$eq": source_id}},
                namespace=namespace,
            )
            logger.info("Deleted vectors for source '%s' from namespace '%s'", source_id, namespace)
        except Exception as e:
            logger.error("Failed to delete vectors from Pinecone: %s", e)

    def delete_namespace(self, namespace: str) -> None:
        """Delete all vectors in a namespace (when a notebook is deleted)."""
        try:
            index = self._get_index()
            index.delete(delete_all=True, namespace=namespace)
            logger.info("Deleted entire namespace '%s'", namespace)
        except Exception as e:
            logger.error("Failed to delete namespace from Pinecone: %s", e)

    def query(self, query_vector: list[float], namespace: str, top_k: int = 5, filter: dict | None = None) -> list[dict]:
        """
        Query Pinecone for the most similar chunks.

        Returns list of {"text", "source_id", "source_filename", "chunk_index", "score"}.
        """
        try:
            index = self._get_index()
            results = index.query(
                vector=query_vector,
                namespace=namespace,
                top_k=top_k,
                include_metadata=True,
                filter=filter,
            )

            matches = []
            for match in results.get("matches", []):
                meta = match.get("metadata", {})
                matches.append({
                    "text": meta.get("text", ""),
                    "source_id": meta.get("source_id", ""),
                    "source_filename": meta.get("source_filename", ""),
                    "chunk_index": meta.get("chunk_index", 0),
                    "score": match.get("score", 0.0),
                })
            return matches

        except Exception as e:
            logger.error("Pinecone query failed: %s", e)
            return []