NavyDevilDoc committed on
Commit
633b400
·
verified ·
1 Parent(s): 7dbcae3

Update src/rag_engine.py

Browse files
Files changed (1) hide show
  1. src/rag_engine.py +22 -22
src/rag_engine.py CHANGED
@@ -317,7 +317,7 @@ def reset_knowledge_base(username: str) -> Tuple[bool, str]:
317
  def rebuild_cache_from_pinecone(username: str, index_name: str) -> Tuple[bool, str]:
318
  """
319
  Downloads text from Pinecone and reconstructs local source files.
320
- Crucial for Quiz Mode after a container restart.
321
  """
322
  if not PINECONE_KEY or not index_name:
323
  return False, "Pinecone config missing."
@@ -325,53 +325,53 @@ def rebuild_cache_from_pinecone(username: str, index_name: str) -> Tuple[bool, s
325
  try:
326
  pm = PineconeManager(PINECONE_KEY)
327
 
328
- # 1. Get all Vector IDs for this user
329
  ids = pm.get_all_ids(index_name, username)
330
- if not ids:
331
- return False, "No data found in Pinecone for this user."
332
 
333
- # 2. Fetch content (Batching by 100 for safety)
334
- # Pinecone fetch limit is often 1000, but we play safe.
335
  batch_size = 100
336
- reconstructed_files = {} # { "filename.txt": ["chunk1", "chunk2"] }
337
 
338
  for i in range(0, len(ids), batch_size):
339
  batch_ids = ids[i : i + batch_size]
340
  response = pm.fetch_vectors(index_name, batch_ids, username)
341
-
342
  vectors = response.vectors
343
 
344
  for vec_id, vec_data in vectors.items():
345
- # vec_data is also an object. Access .metadata via attribute.
346
- meta = vec_data.metadata
347
- if meta is None: meta = {}
348
-
349
- source = meta.get('source', 'unknown_restored.txt')
350
-
351
- # Retrieve text (handle potential key variations)
352
  text = meta.get('text') or meta.get('page_content') or ''
353
 
 
 
 
 
 
 
 
354
  if source not in reconstructed_files:
355
  reconstructed_files[source] = []
356
- reconstructed_files[source].append(text)
357
 
358
- # 3. Write to Disk
359
  user_dir = os.path.join(UPLOAD_DIR, username)
360
  os.makedirs(user_dir, exist_ok=True)
361
 
362
  count = 0
363
  for filename, chunks in reconstructed_files.items():
364
- # Join chunks. Since we don't track order perfectly in UUIDs,
365
- # we just join them. For the Quizzer's sliding window, this is usually fine.
366
- # (If you used the readable ID update from previous turn, they might sort better).
367
- full_text = "\n\n".join(chunks)
 
368
 
369
  file_path = os.path.join(user_dir, filename)
370
  with open(file_path, "w", encoding="utf-8") as f:
371
  f.write(full_text)
372
  count += 1
373
 
374
- return True, f"Restored {count} files from Pinecone!"
375
 
376
  except Exception as e:
377
  logger.error(f"Cache rebuild failed: {e}")
 
def rebuild_cache_from_pinecone(username: str, index_name: str) -> Tuple[bool, str]:
    """
    Download chunk text from Pinecone and reconstruct the user's local source files.

    Chunks are sorted numerically by the trailing index in their vector ID
    (e.g. "doc.txt_0", "doc.txt_1", ...) so reconstructed files keep their
    original chunk order instead of becoming 'Frankenstein' files.

    Args:
        username: Namespace/owner whose vectors are fetched and whose upload
            directory receives the restored files.
        index_name: Name of the Pinecone index to read from.

    Returns:
        (success, message) tuple. On failure the message describes the cause;
        this function never raises to the caller.
    """
    if not PINECONE_KEY or not index_name:
        return False, "Pinecone config missing."

    try:
        pm = PineconeManager(PINECONE_KEY)

        # 1. Get all Vector IDs for this user.
        ids = pm.get_all_ids(index_name, username)
        if not ids:
            return False, "No data found in Pinecone."

        # 2. Fetch content in batches (fetch APIs cap the IDs per request;
        #    100 is a conservative batch size).
        batch_size = 100
        reconstructed_files = {}  # { "filename.txt": [(chunk_index, text), ...] }

        for i in range(0, len(ids), batch_size):
            batch_ids = ids[i : i + batch_size]
            response = pm.fetch_vectors(index_name, batch_ids, username)
            vectors = response.vectors

            for vec_id, vec_data in vectors.items():
                # vec_data is an object; metadata is attribute access and may be None.
                meta = vec_data.metadata or {}
                source = meta.get('source', 'unknown.txt')
                # Handle key variations between ingestion versions.
                text = meta.get('text') or meta.get('page_content') or ''

                # Extract chunk index from the ID (e.g. "doc.txt_12" -> 12).
                # IDs without a numeric suffix (e.g. legacy UUIDs) fall back to 0.
                try:
                    chunk_index = int(vec_id.rsplit('_', 1)[-1])
                except ValueError:
                    chunk_index = 0

                reconstructed_files.setdefault(source, []).append((chunk_index, text))

        # 3. Write to disk, sorted by chunk index.
        user_dir = os.path.join(UPLOAD_DIR, username)
        os.makedirs(user_dir, exist_ok=True)

        count = 0
        for filename, chunks in reconstructed_files.items():
            # Sort by index so the sliding-window quizzer sees coherent text.
            chunks.sort(key=lambda pair: pair[0])
            full_text = "\n\n".join(chunk_text for _, chunk_text in chunks)

            # SECURITY: 'filename' originates from Pinecone metadata (external
            # data). basename() strips any path components so a malicious
            # 'source' like "../../etc/x" cannot escape the user directory.
            safe_name = os.path.basename(filename) or "unknown.txt"
            file_path = os.path.join(user_dir, safe_name)
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(full_text)
            count += 1

        return True, f"Restored {count} files (Sorted) from Pinecone!"

    except Exception as e:
        logger.error(f"Cache rebuild failed: {e}")
        # FIX: without this return the function fell off the except branch and
        # returned None, breaking the declared Tuple[bool, str] contract.
        return False, f"Cache rebuild failed: {e}"