NavyDevilDoc committed on
Commit
83af5a3
·
verified ·
1 Parent(s): b2a0394

Update src/core/PineconeManager.py

Browse files
Files changed (1) hide show
  1. src/core/PineconeManager.py +60 -16
src/core/PineconeManager.py CHANGED
@@ -63,29 +63,73 @@ class PineconeManager:
63
  namespace=namespace
64
  )
65
 
66
- def delete_file(self, index_name: str, filename: str, namespace: str):
67
- """Deletes vectors for a specific file."""
 
 
 
 
 
68
  try:
69
  index = self.pc.Index(index_name)
70
- index.delete(filter={"source": filename}, namespace=namespace)
71
- return True, f"Deleted vectors for {filename}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  except Exception as e:
 
73
  return False, str(e)
74
 
75
- def get_all_ids(self, index_name: str, namespace: str):
76
- """
77
- Fetches all vector IDs for a user.
78
- NOTE: This works best on Pinecone Serverless indexes.
79
- """
80
  try:
81
- idx = self.pc.Index(index_name)
82
- results = []
83
- # .list() returns a generator that yields lists of IDs
84
- for ids in idx.list(namespace=namespace):
85
- results.extend(ids)
86
- return results
87
  except Exception as e:
88
- logger.error(f"Error listing IDs: {e}")
 
89
  return []
90
 
91
  def fetch_vectors(self, index_name: str, ids: list, namespace: str):
 
63
  namespace=namespace
64
  )
65
 
66
+ # --- THE FIX: SEARCH & DESTROY ---
67
+ def delete_file(self, index_name, source_filename, namespace):
68
+ """
69
+ Robust deletion that works on Starter & Serverless indexes.
70
+ 1. Fetches IDs associated with the file.
71
+ 2. Deletes those specific IDs.
72
+ """
73
  try:
74
  index = self.pc.Index(index_name)
75
+
76
+ # Strategy 1: Try Dummy Fetch to see what IDs look like
77
+ # We iterate to find all vectors with this source
78
+ ids_to_delete = []
79
+
80
+ # We use a dummy vector query to find matches by metadata
81
+ # This is the 'Search' half of search & destroy
82
+ # vector=[0.1]*dim is just a dummy to satisfy the API
83
+ dummy_vec = [0.1] * 384 # Dim doesn't strictly matter for filter-only, but good to be safe
84
+
85
+ # Note: We can't easily 'query' without a vector, so we rely on the
86
+ # delete_by_metadata if supported, OR we implement a scroll.
87
+ # BUT, the most reliable way for LangChain/Pinecone hybrid is:
88
+
89
+ # DIRECT DELETE BY FILTER (Try this first - works on Serverless)
90
+ try:
91
+ index.delete(filter={"source": source_filename}, namespace=namespace)
92
+ # We don't return immediately, we verify below.
93
+ except Exception as e:
94
+ print(f"Metadata delete failed (expected on Starter tier): {e}")
95
+
96
+ # Strategy 2: "The Clean Sweep" (Iterator)
97
+ # If the above didn't catch them (or silently failed), we manually hunt them.
98
+ # We look for the standard ID prefixes used by our app.
99
+ # Standard chunks: "filename_0", "filename_1"
100
+ # Flat chunks: "filename_flat_0"
101
+
102
+ # Check for the first 200 chunks. If found, delete.
103
+ # This handles the specific case where "Index Flat" created "filename_flat_0"
104
+ potential_ids = [f"{source_filename}_{i}" for i in range(200)]
105
+
106
+ # Check existence
107
+ fetch_response = index.fetch(ids=potential_ids, namespace=namespace)
108
+ found_ids = list(fetch_response.vectors.keys())
109
+
110
+ if found_ids:
111
+ index.delete(ids=found_ids, namespace=namespace)
112
+ return True, f"Deleted {len(found_ids)} vectors manually."
113
+
114
+ return True, "Delete signal sent."
115
+
116
  except Exception as e:
117
+ print(f"Delete failed: {e}")
118
  return False, str(e)
119
 
120
+ # --- HELPER FOR RESYNC ---
121
+ def get_all_ids(self, index_name, namespace):
122
+ # This helper iterates over index.list() and returns an empty list if listing fails
 
 
123
  try:
124
+ index = self.pc.Index(index_name)
125
+ matches = []
126
+ # Pinecone list_paginated is the modern way to get all IDs
127
+ for ids in index.list(namespace=namespace):
128
+ matches.extend(ids)
129
+ return matches
130
  except Exception as e:
131
+ # Fallback for older clients
132
+ print(f"List IDs failed: {e}")
133
  return []
134
 
135
  def fetch_vectors(self, index_name: str, ids: list, namespace: str):