Spaces:

NavyDevilDoc
/

AI_Toolkit

Sleeping

App Files Community

NavyDevilDoc committed on Dec 23, 2025

Commit

633b400

verified ·

1 Parent(s): 7dbcae3

Update src/rag_engine.py

Browse files

Files changed (1) hide show

src/rag_engine.py +22 -22

src/rag_engine.py CHANGED Viewed

@@ -317,7 +317,7 @@ def reset_knowledge_base(username: str) -> Tuple[bool, str]:
 def rebuild_cache_from_pinecone(username: str, index_name: str) -> Tuple[bool, str]:
     """
     Downloads text from Pinecone and reconstructs local source files.
-    Crucial for Quiz Mode after a container restart.
     """
     if not PINECONE_KEY or not index_name:
         return False, "Pinecone config missing."
@@ -325,53 +325,53 @@ def rebuild_cache_from_pinecone(username: str, index_name: str) -> Tuple[bool, s
     try:
         pm = PineconeManager(PINECONE_KEY)
-        # 1. Get all Vector IDs for this user
         ids = pm.get_all_ids(index_name, username)
-        if not ids:
-            return False, "No data found in Pinecone for this user."
-        # 2. Fetch content (Batching by 100 for safety)
-        # Pinecone fetch limit is often 1000, but we play safe.
         batch_size = 100
-        reconstructed_files = {} # { "filename.txt": ["chunk1", "chunk2"] }
         for i in range(0, len(ids), batch_size):
             batch_ids = ids[i : i + batch_size]
             response = pm.fetch_vectors(index_name, batch_ids, username)
             vectors = response.vectors
             for vec_id, vec_data in vectors.items():
-                # vec_data is also an object. Access .metadata via attribute.
-                meta = vec_data.metadata
-                if meta is None: meta = {}
-                source = meta.get('source', 'unknown_restored.txt')
-                # Retrieve text (handle potential key variations)
                 text = meta.get('text') or meta.get('page_content') or ''
                 if source not in reconstructed_files:
                     reconstructed_files[source] = []
-                reconstructed_files[source].append(text)
-        # 3. Write to Disk
         user_dir = os.path.join(UPLOAD_DIR, username)
         os.makedirs(user_dir, exist_ok=True)
         count = 0
         for filename, chunks in reconstructed_files.items():
-            # Join chunks. Since we don't track order perfectly in UUIDs,
-            # we just join them. For the Quizzer's sliding window, this is usually fine.
-            # (If you used the readable ID update from previous turn, they might sort better).
-            full_text = "\n\n".join(chunks)
             file_path = os.path.join(user_dir, filename)
             with open(file_path, "w", encoding="utf-8") as f:
                 f.write(full_text)
             count += 1
-        return True, f"Restored {count} files from Pinecone!"
     except Exception as e:
         logger.error(f"Cache rebuild failed: {e}")

 def rebuild_cache_from_pinecone(username: str, index_name: str) -> Tuple[bool, str]:
     """
     Downloads text from Pinecone and reconstructs local source files.
+    FIX: Sorts chunks numerically (_0, _1, _2) to prevent 'Frankenstein' files.
     """
     if not PINECONE_KEY or not index_name:
         return False, "Pinecone config missing."
     try:
         pm = PineconeManager(PINECONE_KEY)
+        # 1. Get all Vector IDs
         ids = pm.get_all_ids(index_name, username)
+        if not ids: return False, "No data found in Pinecone."
+        # 2. Fetch content
         batch_size = 100
+        reconstructed_files = {} # { "filename.txt": [ (index, text), (index, text) ] }
         for i in range(0, len(ids), batch_size):
             batch_ids = ids[i : i + batch_size]
             response = pm.fetch_vectors(index_name, batch_ids, username)
             vectors = response.vectors
             for vec_id, vec_data in vectors.items():
+                meta = vec_data.metadata or {}
+                source = meta.get('source', 'unknown.txt')
                 text = meta.get('text') or meta.get('page_content') or ''
+                # EXTRACT CHUNK INDEX FROM ID (e.g., "doc.txt_12" -> 12)
+                try:
+                    # Assumes ID format "filename_index"
+                    chunk_index = int(vec_id.rsplit('_', 1)[-1])
+                except ValueError:
+                    chunk_index = 0 # Fallback
                 if source not in reconstructed_files:
                     reconstructed_files[source] = []
+                reconstructed_files[source].append((chunk_index, text))
+        # 3. Write to Disk (Sorted)
         user_dir = os.path.join(UPLOAD_DIR, username)
         os.makedirs(user_dir, exist_ok=True)
         count = 0
         for filename, chunks in reconstructed_files.items():
+            # SORT BY INDEX (The Fix)
+            chunks.sort(key=lambda x: x[0])
+            # Join text only
+            full_text = "\n\n".join([c[1] for c in chunks])
             file_path = os.path.join(user_dir, filename)
             with open(file_path, "w", encoding="utf-8") as f:
                 f.write(full_text)
             count += 1
+        return True, f"Restored {count} files (Sorted) from Pinecone!"
     except Exception as e:
         logger.error(f"Cache rebuild failed: {e}")