NavyDevilDoc committed on
Commit
9e30b0a
·
verified ·
1 Parent(s): 0be55f7

Update src/rag_engine.py

Browse files

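Updated to automatically replace an existing file's entries in the database on re-ingest: old vectors for the same source are deleted before the new ones are added, so updating a file no longer creates duplicates or needs a separate dedup step.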

Files changed (1) hide show
  1. src/rag_engine.py +41 -24
src/rag_engine.py CHANGED
@@ -166,12 +166,22 @@ def save_uploaded_file(uploaded_file, username: str = "default") -> str:
166
  logger.error(f"Error saving file: {e}")
167
  return None
168
 
169
- def process_and_add_text(text: str, source_name: str, username: str, embed_model_name: str, index_name: str) -> Tuple[bool, str]:
170
- """Ingests raw text (Flattener) -> Saves Backup to Disk -> Uploads to Pinecone."""
 
 
 
171
  if not PINECONE_KEY or not index_name: return False, "Pinecone Configuration Missing."
172
 
173
  try:
174
- # 1. SAVE PHYSICAL BACKUP (For Quiz Engine)
 
 
 
 
 
 
 
175
  user_docs_dir = os.path.join(UPLOAD_DIR, username)
176
  os.makedirs(user_docs_dir, exist_ok=True)
177
  backup_path = os.path.join(user_docs_dir, source_name)
@@ -179,29 +189,29 @@ def process_and_add_text(text: str, source_name: str, username: str, embed_model
179
  with open(backup_path, "w", encoding='utf-8') as f:
180
  f.write(text)
181
 
182
- # 2. UPLOAD TO PINECONE
183
- pm = PineconeManager(PINECONE_KEY)
184
- emb_fn = get_embedding_func(embed_model_name)
185
 
186
- # Create Document
187
  doc = Document(
188
  page_content=text,
189
  metadata={"source": source_name, "strategy": "flattened", "file_type": "generated"}
190
  )
191
 
192
- # Add to VectorStore (Namespace = Username)
193
  vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
194
- vstore.add_documents([doc])
 
195
 
196
- return True, f"Indexed and backed up: {source_name}"
197
  except Exception as e:
198
  logger.error(f"Error indexing text: {e}")
199
  return False, str(e)
200
 
201
- def ingest_file(file_path: str, username: str, index_name: str, embed_model_name: str, strategy: str = "paragraph") -> Tuple[bool, str]:
202
- """Chunks File -> Scans Acronyms -> Uploads to Pinecone."""
203
- if not PINECONE_KEY or not index_name:
204
- return False, "Pinecone Configuration Missing."
 
 
205
 
206
  try:
207
  # 1. Chunking
@@ -213,25 +223,32 @@ def ingest_file(file_path: str, username: str, index_name: str, embed_model_name
213
  for doc in docs:
214
  acronym_mgr.scan_text_for_acronyms(doc.page_content)
215
 
216
- # 3. Pinecone Safety Check (Dynamic)
217
  pm = PineconeManager(PINECONE_KEY)
218
- emb_fn = get_embedding_func(embed_model_name)
219
 
220
- # DYNAMIC CHECK: Generate a test embedding to see true dimension
221
- # This allows you to swap models in CONFIGURATION later without breaking code
222
- test_vec = emb_fn.embed_query("this is a test")
223
  model_dim = len(test_vec)
224
 
225
  if not pm.check_dimension_compatibility(index_name, model_dim):
226
- return False, f"Dimension Mismatch! Index '{index_name}' expects {model_dim}d vectors (based on current model), but found incompatible dimensions."
227
 
228
- # 4. Upload
229
- emb_fn = get_embedding_func(embed_model_name)
 
 
 
 
230
  vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
231
- custom_ids = [f"{doc.metadata.get('source', 'doc')}_{i}" for i, doc in enumerate(docs)]
 
 
 
 
232
  vstore.add_documents(docs, ids=custom_ids)
233
 
234
- return True, f"Successfully indexed {len(docs)} chunks."
235
 
236
  except Exception as e:
237
  logger.error(f"Ingestion failed: {e}")
 
166
  logger.error(f"Error saving file: {e}")
167
  return None
168
 
169
+ def process_and_add_text(text: str, source_name: str, username: str, index_name: str) -> Tuple[bool, str]:
170
+ """
171
+ Ingests raw text.
172
+ UPGRADE: Performs 'Clean Replace' - deletes old version of this source before adding new.
173
+ """
174
  if not PINECONE_KEY or not index_name: return False, "Pinecone Configuration Missing."
175
 
176
  try:
177
+ pm = PineconeManager(PINECONE_KEY)
178
+
179
+ # 1. PRE-EMPTIVE DELETE (The Fix)
180
+ # We wipe any existing vectors with this source name to prevent duplicates.
181
+ # This effectively makes this an "Update/Replace" operation.
182
+ pm.delete_file(index_name, source_name, namespace=username)
183
+
184
+ # 2. SAVE PHYSICAL BACKUP (For Quiz Engine)
185
  user_docs_dir = os.path.join(UPLOAD_DIR, username)
186
  os.makedirs(user_docs_dir, exist_ok=True)
187
  backup_path = os.path.join(user_docs_dir, source_name)
 
189
  with open(backup_path, "w", encoding='utf-8') as f:
190
  f.write(text)
191
 
192
+ # 3. UPLOAD TO PINECONE
193
+ emb_fn = get_embedding_func() # Uses default or last active model logic internally
 
194
 
 
195
  doc = Document(
196
  page_content=text,
197
  metadata={"source": source_name, "strategy": "flattened", "file_type": "generated"}
198
  )
199
 
 
200
  vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
201
+ # Custom ID isn't strictly necessary for single-doc flattened text, but good for consistency
202
+ vstore.add_documents([doc], ids=[f"{source_name}_0"])
203
 
204
+ return True, f"Updated: {source_name}"
205
  except Exception as e:
206
  logger.error(f"Error indexing text: {e}")
207
  return False, str(e)
208
 
209
+ def ingest_file(file_path: str, username: str, index_name: str, embed_model_name: str = None, strategy: str = "paragraph") -> Tuple[bool, str]:
210
+ """
211
+ Chunks and uploads file.
212
+ UPGRADE: Performs 'Clean Replace' - deletes old chunks before uploading new ones.
213
+ """
214
+ if not PINECONE_KEY or not index_name: return False, "Pinecone Configuration Missing."
215
 
216
  try:
217
  # 1. Chunking
 
223
  for doc in docs:
224
  acronym_mgr.scan_text_for_acronyms(doc.page_content)
225
 
226
+ # 3. Pinecone Manager
227
  pm = PineconeManager(PINECONE_KEY)
 
228
 
229
+ # 4. SAFETY CHECK (Dimensions)
230
+ emb_fn = get_embedding_func(embed_model_name)
231
+ test_vec = emb_fn.embed_query("test")
232
  model_dim = len(test_vec)
233
 
234
  if not pm.check_dimension_compatibility(index_name, model_dim):
235
+ return False, f"Dimension Mismatch! Index '{index_name}' expects {model_dim}d vectors."
236
 
237
+ # 5. PRE-EMPTIVE DELETE (The Fix)
238
+ # Wipe the slate clean for this specific filename
239
+ filename = os.path.basename(file_path)
240
+ pm.delete_file(index_name, filename, namespace=username)
241
+
242
+ # 6. UPLOAD NEW CHUNKS
243
  vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
244
+
245
+ # Generate readable IDs: "filename_0", "filename_1"
246
+ # This helps with the 'Frankenstein' sorting fix we added earlier
247
+ custom_ids = [f"{doc.metadata.get('source', filename)}_{i}" for i, doc in enumerate(docs)]
248
+
249
  vstore.add_documents(docs, ids=custom_ids)
250
 
251
+ return True, f"Successfully updated {filename} ({len(docs)} chunks)."
252
 
253
  except Exception as e:
254
  logger.error(f"Ingestion failed: {e}")