NavyDevilDoc committed on
Commit
1c2fd03
·
verified ·
1 Parent(s): 5576211

Update src/rag_engine.py

Browse files
Files changed (1) hide show
  1. src/rag_engine.py +7 -2
src/rag_engine.py CHANGED
@@ -341,12 +341,16 @@ def rebuild_cache_from_pinecone(username: str, index_name: str) -> Tuple[bool, s
341
  for vec_id, vec_data in vectors.items():
342
  meta = vec_data.metadata or {}
343
  source = meta.get('source', 'unknown.txt')
 
344
  text = meta.get('text') or meta.get('page_content') or ''
345
 
346
  # EXTRACT CHUNK INDEX FROM ID (e.g., "doc.txt_12" -> 12)
347
  try:
348
- # Assumes ID format "filename_index"
349
- chunk_index = int(vec_id.rsplit('_', 1)[-1])
 
 
 
350
  except ValueError:
351
  chunk_index = 0 # Fallback
352
 
@@ -361,6 +365,7 @@ def rebuild_cache_from_pinecone(username: str, index_name: str) -> Tuple[bool, s
361
  count = 0
362
  for filename, chunks in reconstructed_files.items():
363
  # SORT BY INDEX (The Fix)
 
364
  chunks.sort(key=lambda x: x[0])
365
 
366
  # Join text only
 
341
  for vec_id, vec_data in vectors.items():
342
  meta = vec_data.metadata or {}
343
  source = meta.get('source', 'unknown.txt')
344
+ # Try to get text from 'text' (langchain default) or 'page_content' (our backup)
345
  text = meta.get('text') or meta.get('page_content') or ''
346
 
347
  # EXTRACT CHUNK INDEX FROM ID (e.g., "doc.txt_12" -> 12)
348
  try:
349
+ # Assumes ID format "filename_index" from our new ingestion logic
350
+ if "_" in vec_id:
351
+ chunk_index = int(vec_id.rsplit('_', 1)[-1])
352
+ else:
353
+ chunk_index = 0
354
  except ValueError:
355
  chunk_index = 0 # Fallback
356
 
 
365
  count = 0
366
  for filename, chunks in reconstructed_files.items():
367
  # SORT BY INDEX (The Fix)
368
+ # This ensures Paragraph 1 comes before Paragraph 2
369
  chunks.sort(key=lambda x: x[0])
370
 
371
  # Join text only