Spaces: Sleeping
Update src/rag_engine.py
Browse files - src/rag_engine.py (+7 -2)
src/rag_engine.py
CHANGED
|
@@ -341,12 +341,16 @@ def rebuild_cache_from_pinecone(username: str, index_name: str) -> Tuple[bool, s
|
|
| 341 |
for vec_id, vec_data in vectors.items():
|
| 342 |
meta = vec_data.metadata or {}
|
| 343 |
source = meta.get('source', 'unknown.txt')
|
|
|
|
| 344 |
text = meta.get('text') or meta.get('page_content') or ''
|
| 345 |
|
| 346 |
# EXTRACT CHUNK INDEX FROM ID (e.g., "doc.txt_12" -> 12)
|
| 347 |
try:
|
| 348 |
-
# Assumes ID format "filename_index"
|
| 349 |
-
|
|
|
|
|
|
|
|
|
|
| 350 |
except ValueError:
|
| 351 |
chunk_index = 0 # Fallback
|
| 352 |
|
|
@@ -361,6 +365,7 @@ def rebuild_cache_from_pinecone(username: str, index_name: str) -> Tuple[bool, s
|
|
| 361 |
count = 0
|
| 362 |
for filename, chunks in reconstructed_files.items():
|
| 363 |
# SORT BY INDEX (The Fix)
|
|
|
|
| 364 |
chunks.sort(key=lambda x: x[0])
|
| 365 |
|
| 366 |
# Join text only
|
|
|
|
| 341 |
for vec_id, vec_data in vectors.items():
|
| 342 |
meta = vec_data.metadata or {}
|
| 343 |
source = meta.get('source', 'unknown.txt')
|
| 344 |
+
# Try to get text from 'text' (langchain default) or 'page_content' (our backup)
|
| 345 |
text = meta.get('text') or meta.get('page_content') or ''
|
| 346 |
|
| 347 |
# EXTRACT CHUNK INDEX FROM ID (e.g., "doc.txt_12" -> 12)
|
| 348 |
try:
|
| 349 |
+
# Assumes ID format "filename_index" from our new ingestion logic
|
| 350 |
+
if "_" in vec_id:
|
| 351 |
+
chunk_index = int(vec_id.rsplit('_', 1)[-1])
|
| 352 |
+
else:
|
| 353 |
+
chunk_index = 0
|
| 354 |
except ValueError:
|
| 355 |
chunk_index = 0 # Fallback
|
| 356 |
|
|
|
|
| 365 |
count = 0
|
| 366 |
for filename, chunks in reconstructed_files.items():
|
| 367 |
# SORT BY INDEX (The Fix)
|
| 368 |
+
# This ensures Paragraph 1 comes before Paragraph 2
|
| 369 |
chunks.sort(key=lambda x: x[0])
|
| 370 |
|
| 371 |
# Join text only
|