Spaces:

NavyDevilDoc
/

AI_Toolkit

Sleeping

App Files Files Community

NavyDevilDoc committed on Dec 24, 2025

Commit

9e30b0a

verified ·

1 Parent(s): 0be55f7

Update src/rag_engine.py

Browse files

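Updated to automatically replace an existing file's entries in the database on re-upload: the old vectors for that source are deleted before the new ones are added, so updating a file no longer creates duplicates.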

Files changed (1) hide show

src/rag_engine.py +41 -24

src/rag_engine.py CHANGED Viewed

@@ -166,12 +166,22 @@ def save_uploaded_file(uploaded_file, username: str = "default") -> str:
         logger.error(f"Error saving file: {e}")
         return None
-def process_and_add_text(text: str, source_name: str, username: str, embed_model_name: str, index_name: str) -> Tuple[bool, str]:
-    """Ingests raw text (Flattener) -> Saves Backup to Disk -> Uploads to Pinecone."""
     if not PINECONE_KEY or not index_name: return False, "Pinecone Configuration Missing."
     try:
-        # 1. SAVE PHYSICAL BACKUP (For Quiz Engine)
         user_docs_dir = os.path.join(UPLOAD_DIR, username)
         os.makedirs(user_docs_dir, exist_ok=True)
         backup_path = os.path.join(user_docs_dir, source_name)
@@ -179,29 +189,29 @@ def process_and_add_text(text: str, source_name: str, username: str, embed_model
         with open(backup_path, "w", encoding='utf-8') as f:
             f.write(text)
-        # 2. UPLOAD TO PINECONE
-        pm = PineconeManager(PINECONE_KEY)
-        emb_fn = get_embedding_func(embed_model_name)
-        # Create Document
         doc = Document(
             page_content=text,
             metadata={"source": source_name, "strategy": "flattened", "file_type": "generated"}
         )
-        # Add to VectorStore (Namespace = Username)
         vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
-        vstore.add_documents([doc])
-        return True, f"Indexed and backed up: {source_name}"
     except Exception as e:
         logger.error(f"Error indexing text: {e}")
         return False, str(e)
-def ingest_file(file_path: str, username: str, index_name: str, embed_model_name: str, strategy: str = "paragraph") -> Tuple[bool, str]:
-    """Chunks File -> Scans Acronyms -> Uploads to Pinecone."""
-    if not PINECONE_KEY or not index_name:
-        return False, "Pinecone Configuration Missing."
     try:
         # 1. Chunking
@@ -213,25 +223,32 @@ def ingest_file(file_path: str, username: str, index_name: str, embed_model_name
         for doc in docs:
             acronym_mgr.scan_text_for_acronyms(doc.page_content)
-        # 3. Pinecone Safety Check (Dynamic)
         pm = PineconeManager(PINECONE_KEY)
-        emb_fn = get_embedding_func(embed_model_name)
-        # DYNAMIC CHECK: Generate a test embedding to see true dimension
-        # This allows you to swap models in CONFIGURATION later without breaking code
-        test_vec = emb_fn.embed_query("this is a test")
         model_dim = len(test_vec)
         if not pm.check_dimension_compatibility(index_name, model_dim):
-            return False, f"Dimension Mismatch! Index '{index_name}' expects {model_dim}d vectors (based on current model), but found incompatible dimensions."
-        # 4. Upload
-        emb_fn = get_embedding_func(embed_model_name)
         vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
-        custom_ids = [f"{doc.metadata.get('source', 'doc')}_{i}" for i, doc in enumerate(docs)]
         vstore.add_documents(docs, ids=custom_ids)
-        return True, f"Successfully indexed {len(docs)} chunks."
     except Exception as e:
         logger.error(f"Ingestion failed: {e}")

         logger.error(f"Error saving file: {e}")
         return None
+def process_and_add_text(text: str, source_name: str, username: str, index_name: str) -> Tuple[bool, str]:
+    """
+    Ingests raw text.
+    UPGRADE: Performs 'Clean Replace' - deletes old version of this source before adding new.
+    """
     if not PINECONE_KEY or not index_name: return False, "Pinecone Configuration Missing."
     try:
+        pm = PineconeManager(PINECONE_KEY)
+        # 1. PRE-EMPTIVE DELETE (The Fix)
+        # We wipe any existing vectors with this source name to prevent duplicates.
+        # This effectively makes this an "Update/Replace" operation.
+        pm.delete_file(index_name, source_name, namespace=username)
+        # 2. SAVE PHYSICAL BACKUP (For Quiz Engine)
         user_docs_dir = os.path.join(UPLOAD_DIR, username)
         os.makedirs(user_docs_dir, exist_ok=True)
         backup_path = os.path.join(user_docs_dir, source_name)
         with open(backup_path, "w", encoding='utf-8') as f:
             f.write(text)
+        # 3. UPLOAD TO PINECONE
+        emb_fn = get_embedding_func() # Uses default or last active model logic internally
         doc = Document(
             page_content=text,
             metadata={"source": source_name, "strategy": "flattened", "file_type": "generated"}
         )
         vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
+        # Custom ID isn't strictly necessary for single-doc flattened text, but good for consistency
+        vstore.add_documents([doc], ids=[f"{source_name}_0"])
+        return True, f"Updated: {source_name}"
     except Exception as e:
         logger.error(f"Error indexing text: {e}")
         return False, str(e)
+def ingest_file(file_path: str, username: str, index_name: str, embed_model_name: str = None, strategy: str = "paragraph") -> Tuple[bool, str]:
+    """
+    Chunks and uploads file.
+    UPGRADE: Performs 'Clean Replace' - deletes old chunks before uploading new ones.
+    """
+    if not PINECONE_KEY or not index_name: return False, "Pinecone Configuration Missing."
     try:
         # 1. Chunking
         for doc in docs:
             acronym_mgr.scan_text_for_acronyms(doc.page_content)
+        # 3. Pinecone Manager
         pm = PineconeManager(PINECONE_KEY)
+        # 4. SAFETY CHECK (Dimensions)
+        emb_fn = get_embedding_func(embed_model_name)
+        test_vec = emb_fn.embed_query("test")
         model_dim = len(test_vec)
         if not pm.check_dimension_compatibility(index_name, model_dim):
+            return False, f"Dimension Mismatch! Index '{index_name}' expects {model_dim}d vectors."
+        # 5. PRE-EMPTIVE DELETE (The Fix)
+        # Wipe the slate clean for this specific filename
+        filename = os.path.basename(file_path)
+        pm.delete_file(index_name, filename, namespace=username)
+        # 6. UPLOAD NEW CHUNKS
         vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
+        # Generate readable IDs: "filename_0", "filename_1"
+        # This helps with the 'Frankenstein' sorting fix we added earlier
+        custom_ids = [f"{doc.metadata.get('source', filename)}_{i}" for i, doc in enumerate(docs)]
         vstore.add_documents(docs, ids=custom_ids)
+        return True, f"Successfully updated {filename} ({len(docs)} chunks)."
     except Exception as e:
         logger.error(f"Ingestion failed: {e}")