Dheeraj-13 committed on
Commit
d3a38ee
·
1 Parent(s): 8367f0a

Feature: Additive Ingestion - Allow adding new docs without wiping old ones. Added Clear button.

Browse files
Files changed (2) hide show
  1. apps/web/app.py +45 -15
  2. services/rag/index.py +15 -16
apps/web/app.py CHANGED
@@ -91,45 +91,69 @@ def chat_fn(message, history, backend):
91
  return final_response
92
 
93
 
94
- def admin_ingest(files, use_sample):
95
- # 1. Clean Data & Temp Dirs (Fresh Start)
96
- temp_in = "temp_ingest"
97
- dirs_to_clean = [temp_in, PROCESSED_DIR, INDEX_DIR]
98
-
99
- for d in dirs_to_clean:
100
  if os.path.exists(d):
101
  shutil.rmtree(d)
102
  os.makedirs(d)
103
 
104
- status = "Starting ingestion...\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  # Handle Source Selection
 
 
107
  if use_sample:
108
  # Copy from samples dir
109
  sample_file = os.path.join(SAMPLES_DIR, "sports_legends.txt")
110
  if os.path.exists(sample_file):
111
  shutil.copy(sample_file, temp_in)
112
- status += f"Loaded sample data: {sample_file}\n"
 
113
  else:
114
  return "Error: Sample data not found on server."
115
- elif files:
 
116
  # Copy uploaded files
117
  for file in files:
118
  shutil.copy(file.name, temp_in)
119
- status += f"Loaded {len(files)} uploaded files.\n"
120
- else:
121
- return "No files selected and 'Use Sample' not checked."
 
 
122
 
123
  yield status
124
 
125
  # Run Ingest
126
  try:
 
127
  ingest(temp_in, PROCESSED_DIR)
128
- status += "Ingestion complete.\nBuilding Index...\n"
129
  yield status
130
 
 
131
  build_index(PROCESSED_DIR, INDEX_DIR)
132
- status += "Index built successfully.\nReloading services...\n"
133
  yield status
134
 
135
  # FORCE RELOAD: Clear singletons
@@ -137,7 +161,7 @@ def admin_ingest(files, use_sample):
137
  services.rag.retrieve._shared_retriever = None
138
 
139
  init_services()
140
- status += "Services reloaded. Index updated successfully."
141
  except Exception as e:
142
  print(f"Ingestion Failed: {e}") # Print to server logs
143
  import traceback
@@ -172,6 +196,7 @@ with gr.Blocks(title="RAG Knowledge Assistant", theme=gr.themes.Soft()) as demo:
172
  )
173
 
174
  ingest_btn = gr.Button("Process Documents", variant="primary", size="sm")
 
175
 
176
  # Status Log - Visible by default
177
  with gr.Accordion("System Logs", open=True):
@@ -190,6 +215,11 @@ with gr.Blocks(title="RAG Knowledge Assistant", theme=gr.themes.Soft()) as demo:
190
  outputs=[status_box]
191
  )
192
 
 
 
 
 
 
193
  with gr.Group():
194
  backend_radio = gr.Radio(
195
  choices=["openai", "gemini", "local"],
 
91
  return final_response
92
 
93
 
94
+
95
+ def clear_knowledge_base():
96
+ # Helper to wipe data
97
+ for d in [PROCESSED_DIR, INDEX_DIR]:
 
 
98
  if os.path.exists(d):
99
  shutil.rmtree(d)
100
  os.makedirs(d)
101
 
102
+ # Reset helper
103
+ import services.rag.retrieve
104
+ services.rag.retrieve._shared_retriever = None
105
+ init_services()
106
+
107
+ return "Knowledge Base Cleared. System is empty."
108
+
109
+ def admin_ingest(files, use_sample):
110
+ # 1. Clean Temp Input ONLY (Keep Processed/Index for additive)
111
+ temp_in = "temp_ingest"
112
+ if os.path.exists(temp_in):
113
+ shutil.rmtree(temp_in)
114
+ os.makedirs(temp_in)
115
+
116
+ # Ensure processed/index dirs exist
117
+ os.makedirs(PROCESSED_DIR, exist_ok=True)
118
+ os.makedirs(INDEX_DIR, exist_ok=True)
119
+
120
+ status = "Starting processing...\n"
121
 
122
  # Handle Source Selection
123
+ files_found = False
124
+
125
  if use_sample:
126
  # Copy from samples dir
127
  sample_file = os.path.join(SAMPLES_DIR, "sports_legends.txt")
128
  if os.path.exists(sample_file):
129
  shutil.copy(sample_file, temp_in)
130
+ status += f"Loaded: Sports Legends Dataset\n"
131
+ files_found = True
132
  else:
133
  return "Error: Sample data not found on server."
134
+
135
+ if files:
136
  # Copy uploaded files
137
  for file in files:
138
  shutil.copy(file.name, temp_in)
139
+ status += f"Loaded: {len(files)} new files.\n"
140
+ files_found = True
141
+
142
+ if not files_found:
143
+ return "No new files selected. Select files or sample data."
144
 
145
  yield status
146
 
147
  # Run Ingest
148
  try:
149
+ # Ingest new files to PROCESSED_DIR (Additive)
150
  ingest(temp_in, PROCESSED_DIR)
151
+ status += "Processing new files complete.\nRebuilding Index...\n"
152
  yield status
153
 
154
+ # Build Index (scans ALL files in PROCESSED_DIR)
155
  build_index(PROCESSED_DIR, INDEX_DIR)
156
+ status += "Index rebuilt with all documents.\nReloading services...\n"
157
  yield status
158
 
159
  # FORCE RELOAD: Clear singletons
 
161
  services.rag.retrieve._shared_retriever = None
162
 
163
  init_services()
164
+ status += "Services reloaded. Knowledge Base Updated successfully!"
165
  except Exception as e:
166
  print(f"Ingestion Failed: {e}") # Print to server logs
167
  import traceback
 
196
  )
197
 
198
  ingest_btn = gr.Button("Process Documents", variant="primary", size="sm")
199
+ clear_btn = gr.Button("Clear Knowledge Base", variant="stop", size="sm")
200
 
201
  # Status Log - Visible by default
202
  with gr.Accordion("System Logs", open=True):
 
215
  outputs=[status_box]
216
  )
217
 
218
+ clear_btn.click(
219
+ clear_knowledge_base,
220
+ outputs=[status_box]
221
+ )
222
+
223
  with gr.Group():
224
  backend_radio = gr.Radio(
225
  choices=["openai", "gemini", "local"],
services/rag/index.py CHANGED
@@ -9,23 +9,22 @@ from .embed import get_embedder
9
 
10
  def load_processed_data(processed_dir: str) -> List[Dict]:
11
  chunks = []
12
- # Read manifest if exists, or just iterate JSONs
13
- manifest_path = os.path.join(processed_dir, "manifest.json")
14
- if os.path.exists(manifest_path):
15
- with open(manifest_path, 'r') as f:
16
- manifest = json.load(f)
17
- for entry in manifest:
18
- with open(entry['path'], 'r') as f:
19
- doc_data = json.load(f)
20
- chunks.extend(doc_data['chunks'])
21
- else:
22
- # Fallback to glob
23
- import glob
24
- for f_path in glob.glob(os.path.join(processed_dir, "*.json")):
25
- if f_path.endswith("manifest.json"): continue
26
- with open(f_path, 'r') as f:
27
  doc_data = json.load(f)
28
- chunks.extend(doc_data['chunks'])
 
 
 
 
29
  return chunks
30
 
31
  def build_index(processed_dir: str, output_dir: str):
 
9
 
10
  def load_processed_data(processed_dir: str) -> List[Dict]:
11
  chunks = []
12
+ # Always glob for all JSONs to support additive ingestion
13
+ import glob
14
+ json_files = glob.glob(os.path.join(processed_dir, "*.json"))
15
+
16
+ print(f"Found {len(json_files)} existing documents to index.")
17
+
18
+ for f_path in json_files:
19
+ if f_path.endswith("manifest.json"): continue
20
+ try:
21
+ with open(f_path, 'r') as f:
 
 
 
 
 
22
  doc_data = json.load(f)
23
+ if 'chunks' in doc_data:
24
+ chunks.extend(doc_data['chunks'])
25
+ except Exception as e:
26
+ print(f"Error loading {f_path}: {e}")
27
+
28
  return chunks
29
 
30
  def build_index(processed_dir: str, output_dir: str):