Spaces:

NavyDevilDoc
/

Semantic_Search

Sleeping

App Files Files Community

NavyDevilDoc committed on Dec 29, 2025

Commit

74f60fc

verified ·

1 Parent(s): bd85152

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -105

app.py CHANGED Viewed

@@ -12,64 +12,105 @@ from src.llm_client import ask_llm
 # --- CONFIGURATION ---
 DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
 HF_TOKEN = os.environ.get("HF_TOKEN")
-DB_FILE = "navy_docs.db"
 INDEX_FILE = "navy_index.faiss"
 META_FILE = "navy_metadata.pkl"
 st.set_page_config(page_title="Navy Policy Architect", layout="wide", page_icon="⚓")
-# --- CLOUD SYNC MANAGER ---
 # Pre-commit version of the sync helper. Both methods take no arguments and
 # always operate on the fixed DB_FILE, INDEX_FILE and META_FILE names.
 class SyncManager:
     """Handles downloading/uploading the Database & Index to Hugging Face"""
     # pull_data: returns None when HF_TOKEN is unset, True once the download
     # checks complete, and False when any exception is raised (it is printed).
     # Files are only downloaded when they are missing locally; META_FILE is
     # fetched under the same check as INDEX_FILE.
     @staticmethod
-    def pull_data():
-        if not HF_TOKEN: return
         try:
-            # Download SQLite DB
-            if not os.path.exists(DB_FILE):
-                hf_hub_download(repo_id=DATASET_REPO_ID, filename=DB_FILE, local_dir=".", token=HF_TOKEN)
-            # Download FAISS Index
             if not os.path.exists(INDEX_FILE):
-                hf_hub_download(repo_id=DATASET_REPO_ID, filename=INDEX_FILE, local_dir=".", token=HF_TOKEN)
-                hf_hub_download(repo_id=DATASET_REPO_ID, filename=META_FILE, local_dir=".", token=HF_TOKEN)
             return True
         except Exception as e:
-            # It's okay if files don't exist yet (first run)
-            print(f"Sync Note: {e}")
             return False
     # push_data: uploads the three files to the dataset repo and shows a toast
     # on success; any exception is reported with st.error. Returns None.
     @staticmethod
-    def push_data():
         if not HF_TOKEN: return
         api = HfApi(token=HF_TOKEN)
         try:
-            # Upload SQLite DB
-            api.upload_file(path_or_fileobj=DB_FILE, path_in_repo=DB_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
             # Upload FAISS Index
             api.upload_file(path_or_fileobj=INDEX_FILE, path_in_repo=INDEX_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
             api.upload_file(path_or_fileobj=META_FILE, path_in_repo=META_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
             st.toast("Cloud Sync Complete!", icon="☁️")
         except Exception as e:
-            st.error(f"Sync Error: {e}")
-# --- INITIALIZATION ---
-if 'db' not in st.session_state:
-    with st.spinner("Connecting to Secure Cloud Storage..."):
-        SyncManager.pull_data()
-        st.session_state.db = DatabaseManager(DB_FILE)
-        st.session_state.search_engine = SearchEngine()
-# --- SIDEBAR: UPLOAD & MANAGE ---
 with st.sidebar:
     st.header("🗄️ Knowledge Base")
-    # 1. Initialize the Uploader Key
     if "uploader_key" not in st.session_state:
         st.session_state.uploader_key = 0
-    # 2. Upload Section
     uploaded_files = st.file_uploader(
-        "Upload Policy Documents",
         accept_multiple_files=True,
         type=['pdf', 'docx', 'txt', 'csv', 'xlsx'],
         key=f"uploader_{st.session_state.uploader_key}"
@@ -79,179 +120,122 @@ with st.sidebar:
         progress_bar = st.progress(0)
         status = st.empty()
-        # Get current files to check for duplicates
         existing_files = st.session_state.db.get_all_filenames()
         for i, f in enumerate(uploaded_files):
             status.text(f"Processing: {f.name}...")
-            # --- DUPLICATION CHECK ---
             if f.name in existing_files:
-                st.toast(f"♻️ Updating existing file: {f.name}")
                 st.session_state.db.delete_document(f.name)
-            # A. Parse File
             text, filename, method = process_file(f)
             if "Error" in method:
                 st.error(f"Failed {filename}: {method}")
                 continue
-            # B. Chunk & ID
             chunks, doc_id = chunk_text(text, filename)
-            # --- NEW STEP: Generate Abstract ---
-            # We skip this for tiny files to save time
             abstract = "No summary generated."
             if len(text) > 500:
                 with st.spinner(f"Writing abstract for {filename}..."):
-                    # We utilize our flexible LLM client
-                    # Note: We send only the first 30k chars to keep it fast
                     abstract = ask_llm(
                         query="Generate Abstract",
                         context=text[:30000],
                         mode="Abstract Generator",
                         model_provider="Gemini"
                     )
-            # -----------------------------------
-            # C. Save to SQLite (Now includes Abstract)
             st.session_state.db.add_document(doc_id, filename, text, abstract=abstract)
-            # D. Add to Vector Index
             st.session_state.search_engine.add_features(chunks)
-            progress_bar.progress((i + 1) / len(uploaded_files))
         status.text("Syncing to Cloud...")
-        SyncManager.push_data()
-        st.success(f"Successfully processed {len(uploaded_files)} documents!")
         time.sleep(1)
         st.session_state.uploader_key += 1
         st.rerun()
     st.divider()
-    # 3. Document Library (Better Visibility)
-    st.subheader("Manage Files")
-    # We fetch the latest list
     all_files = st.session_state.db.get_all_filenames()
     if all_files:
-        st.caption(f"📚 **Library: {len(all_files)} Documents**")
-        # VISUAL LIST: A clean, scrollable list of what you have
         with st.expander("View File List", expanded=False):
             for f in all_files:
                 st.text(f"• {f}")
-        # DELETION MENU
-        file_to_del = st.selectbox("Select file to delete:", [""] + all_files)
         if file_to_del and st.button("🗑️ Delete Selected"):
             deleted_id = st.session_state.db.delete_document(file_to_del)
             st.toast(f"Removed {file_to_del}")
-            SyncManager.push_data()
             time.sleep(1)
             st.rerun()
-        # DANGER ZONE: Nuke Everything
-        if st.button("⚠️ Reset Database (Delete All)", type="primary"):
             for f in all_files:
                 st.session_state.db.delete_document(f)
-            # We also wipe the index explicitly to be safe
             st.session_state.search_engine.reset_index()
-            SyncManager.push_data()
-            st.success("Database wiped clean.")
             time.sleep(1)
             st.rerun()
     else:
         st.info("Library is empty.")
-    # 2. Management Section
-    st.subheader("Manage Files")
-    all_files = st.session_state.db.get_all_filenames()
-    if all_files:
-        st.caption(f"Total Documents: {len(all_files)}")
-        file_to_del = st.selectbox("Delete File:", [""] + all_files)
-        if file_to_del and st.button("🗑️ Remove Document"):
-            deleted_id = st.session_state.db.delete_document(file_to_del)
-            st.toast(f"Removed {file_to_del} from Database.")
-            SyncManager.push_data()
-            time.sleep(1)
-            st.rerun()
 # --- MAIN UI: SEARCH ---
 st.title("⚓ Navy Policy Architect")
-st.markdown("Search across PDF, Word, and Excel files. Generate AI summaries based on official policy.")
-query = st.text_input("Enter your query (e.g., 'What are the requirements for O-5 promotion?')", placeholder="Search...")
 if query:
-    # 1. SEARCH (Vector Search -> Returns relevant chunks)
     results = st.session_state.search_engine.search(query, top_k=5)
     if not results:
         st.info("No matching documents found.")
     else:
-        # 2. SYNTHESIS (The "Parent Retrieval" Magic)
         top_match = results[0]
-        # We grab the FULL TEXT from SQLite using the doc_id found in the chunk
         full_doc_text = st.session_state.db.get_doc_text(top_match['doc_id'])
-        # --- AI SUMMARY SECTION ---
         with st.container():
             st.markdown("### 🤖 Intelligence Hub")
-            st.caption(f"Analyzing primary source: {top_match['source']}")
-            # LAYOUT: Two columns for controls
             col1, col2 = st.columns(2)
             with col1:
-                # The "Deep Dive" Selector
                 analysis_mode = st.selectbox(
                     "Select Analysis Type:",
                     ["Executive Summary", "Action Plan", "Risk Assessment", "Socratic Review", "Instructor Mode"]
                 )
             with col2:
-                # The "Brain" Selector
                 model_choice = st.selectbox(
                     "Select Model:",
                     ["Gemini (Cloud - Smartest)", "Granite (Private Space)"]
                 )
-                # Map the UI text to the backend key
                 provider = "Gemini" if "Gemini" in model_choice else "Granite"
             if st.button("✨ Generate Assessment"):
-                with st.spinner(f"Consulting {provider} via {analysis_mode}..."):
-                    # Call the client with the new parameters
                     response = ask_llm(query, full_doc_text, mode=analysis_mode, model_provider=provider)
                     st.markdown("---")
                     st.markdown(response)
                     st.markdown("---")
-                    # Feature: Source Verification
-                    with st.expander("🔍 View Source Data used for this summary"):
-                        st.text(full_doc_text[:2000] + "...")
-        # --- SEARCH RESULTS SECTION (Rich View) ---
-        with st.expander("📚 Reference Documents (Click to view)", expanded=True):
-            if not results:
-                st.info("No matching documents found.")
             for res in results:
                 score = res['score']
                 color = "#09ab3b" if score > 2 else "#ffbd45" if score > 0 else "#ff4b4b"
-                # RETRIEVE THE ABSTRACT FROM DB
                 doc_abstract = st.session_state.db.get_doc_abstract(res['doc_id'])
-                # FIX: We moved the HTML string flush to the left to prevent
-                # Markdown from interpreting it as a code block.
                 html_content = f"""
 <div style="
     border-left: 5px solid {color};
@@ -265,11 +249,9 @@ if query:
         <h4 style="margin:0; color: #0e1117;">📄 {res['source']}</h4>
         <span style="font-size: 0.8em; color: #555; background: #ddd; padding: 2px 8px; border-radius: 4px;">Relevance: {score:.2f}</span>
     </div>
     <div style="background: #e3e6ea; padding: 10px; border-radius: 5px; margin-bottom: 10px;">
         <p style="margin: 0; font-size: 0.9em; color: #333;"><strong>🤖 Abstract:</strong> {doc_abstract}</p>
     </div>
     <p style="margin: 0; font-style: italic; font-size: 0.85em; color: #555;">
         "Matching Chunk: ...{res['snippet']}..."
     </p>

 # --- CONFIGURATION ---
 DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
 HF_TOKEN = os.environ.get("HF_TOKEN")
 INDEX_FILE = "navy_index.faiss"
 META_FILE = "navy_metadata.pkl"
 st.set_page_config(page_title="Navy Policy Architect", layout="wide", page_icon="⚓")
+# --- CLOUD SYNC MANAGER (UPGRADED) ---
 # Static helpers that list, download and upload files in the DATASET_REPO_ID
 # dataset repository: a caller-chosen SQLite database file plus the shared
 # INDEX_FILE / META_FILE pair. Every method returns early when HF_TOKEN is unset.
 class SyncManager:
     """Handles downloading/uploading the Database & Index to Hugging Face"""
     # get_remote_dbs: returns the repo file names that end in ".db". Returns an
     # empty list when HF_TOKEN is unset or when listing raises (the error is
     # printed, not shown in the UI).
     @staticmethod
+    def get_remote_dbs():
+        """Scans the Hugging Face Repo for available .db files"""
+        if not HF_TOKEN: return []
         try:
+            api = HfApi(token=HF_TOKEN)
+            files = api.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
+            # Filter for .db files (excluding potential system files)
+            dbs = [f for f in files if f.endswith(".db")]
+            return dbs
+        except Exception as e:
+            print(f"Error listing DBs: {e}")
+            return []
     # pull_data(db_filename): downloads db_filename into the current directory
     # only when it is missing locally, then tries to fetch INDEX_FILE and
     # META_FILE only when INDEX_FILE is missing locally. Returns True when the
     # database step did not raise, False when HF_TOKEN is unset or the database
     # download raised (after reporting it with st.error).
     # NOTE(review): META_FILE is never fetched on its own; its download is gated
     # on INDEX_FILE being absent.
     # NOTE(review): presumably a db_filename that is not yet in the repo (for
     # example a database just created in the sidebar) makes hf_hub_download
     # raise, which would surface as "Sync Error (Pull)" - confirm.
+    @staticmethod
+    def pull_data(db_filename):
+        if not HF_TOKEN: return False
+        try:
+            # Download Specific SQLite DB
+            if not os.path.exists(db_filename):
+                hf_hub_download(repo_id=DATASET_REPO_ID, filename=db_filename, local_dir=".", token=HF_TOKEN)
+            # Download FAISS Index (Note: We use one shared index for simplicity in this demo,
+            # but ideally you'd have 'navy_index_eng.faiss', etc.)
             if not os.path.exists(INDEX_FILE):
                 # NOTE(review): the bare except below silences every failure of the
                 # two index downloads, including KeyboardInterrupt/SystemExit;
                 # consider narrowing it to the hub's "not found" error.
+                try:
+                    hf_hub_download(repo_id=DATASET_REPO_ID, filename=INDEX_FILE, local_dir=".", token=HF_TOKEN)
+                    hf_hub_download(repo_id=DATASET_REPO_ID, filename=META_FILE, local_dir=".", token=HF_TOKEN)
+                except:
+                    pass # It's okay if index doesn't exist yet
             return True
         except Exception as e:
+            st.error(f"Sync Error (Pull): {e}")
             return False
     # push_data(db_filename): uploads db_filename, INDEX_FILE and META_FILE to
     # the dataset repo under their local names and shows a toast when all three
     # calls complete. Returns None. All uploads share one try block, so the
     # first failure skips the remaining uploads and is reported with st.error.
     @staticmethod
+    def push_data(db_filename):
         if not HF_TOKEN: return
         api = HfApi(token=HF_TOKEN)
         try:
+            # Upload Specific SQLite DB
+            api.upload_file(path_or_fileobj=db_filename, path_in_repo=db_filename, repo_id=DATASET_REPO_ID, repo_type="dataset")
             # Upload FAISS Index
             api.upload_file(path_or_fileobj=INDEX_FILE, path_in_repo=INDEX_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
             api.upload_file(path_or_fileobj=META_FILE, path_in_repo=META_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
             st.toast("Cloud Sync Complete!", icon="☁️")
         except Exception as e:
+            st.error(f"Sync Error (Push): {e}")
+# --- SIDEBAR: KNOWLEDGE BASE SELECTOR ---
 with st.sidebar:
     # Sidebar flow: choose (or register) a database name, load it into
     # st.session_state when the selection changes, ingest uploaded files into
     # it, and offer per-file and whole-database deletion.
     st.header("🗄️ Knowledge Base")
+    # 1. Database Selector
+    # We fetch available DBs from the cloud to populate the dropdown
+    if "available_dbs" not in st.session_state:
+        st.session_state.available_dbs = SyncManager.get_remote_dbs()
+        if not st.session_state.available_dbs:
+            st.session_state.available_dbs = ["navy_docs.db"] # Default if empty
+    selected_db = st.selectbox("Select Database:", st.session_state.available_dbs)
+    # 2. Create New Database Option
     # NOTE(review): the name is appended as typed: an empty input becomes ".db"
     # and duplicates are not rejected. Appending only extends the option list;
     # nothing visible here changes the selectbox value, so the user presumably
     # still has to pick the new entry - confirm.
+    with st.expander("➕ Create New Database"):
+        new_db_name = st.text_input("Name (e.g., 'Medical.db')")
+        if st.button("Create"):
+            if not new_db_name.endswith(".db"):
+                new_db_name += ".db"
+            st.session_state.available_dbs.append(new_db_name)
+            # Force reload to switch to this new DB
+            st.rerun()
+    # --- INITIALIZATION (Dynamic based on selection) ---
+    # If the DB has changed or isn't loaded, load it now
     # The boolean returned by SyncManager.pull_data is not checked; the
     # DatabaseManager and SearchEngine are created either way.
+    if 'current_db_name' not in st.session_state or st.session_state.current_db_name != selected_db:
+        with st.spinner(f"Loading {selected_db} from Cloud..."):
+            SyncManager.pull_data(selected_db)
+            st.session_state.db = DatabaseManager(selected_db)
+            st.session_state.search_engine = SearchEngine() # This resets the search engine for the new DB
+            st.session_state.current_db_name = selected_db
+            st.rerun() # Refresh to ensure everything is synced
+    st.divider()
+    # 3. Upload Section
     # uploader_key is part of the file_uploader widget key; it is incremented
     # after a successful ingest so the next run builds the widget under a new key.
     if "uploader_key" not in st.session_state:
         st.session_state.uploader_key = 0
     uploaded_files = st.file_uploader(
+        f"Upload to {selected_db}",
         accept_multiple_files=True,
         type=['pdf', 'docx', 'txt', 'csv', 'xlsx'],
         key=f"uploader_{st.session_state.uploader_key}"
     # NOTE(review): this view does not show the end of the st.file_uploader(...)
     # call or the statement that guards the processing code below - confirm
     # against the full app.py.
         progress_bar = st.progress(0)
         status = st.empty()
         existing_files = st.session_state.db.get_all_filenames()
         for i, f in enumerate(uploaded_files):
             status.text(f"Processing: {f.name}...")
             # A file whose name is already stored is deleted from the database
             # first, then re-added below.
             # NOTE(review): only db.delete_document is called here; no matching
             # removal from search_engine is visible - confirm stale chunks are
             # handled elsewhere.
             if f.name in existing_files:
+                st.toast(f"♻️ Updating: {f.name}")
                 st.session_state.db.delete_document(f.name)
             text, filename, method = process_file(f)
             # process_file reports failure through the text of `method`; such
             # files are reported and skipped.
             if "Error" in method:
                 st.error(f"Failed {filename}: {method}")
                 continue
             chunks, doc_id = chunk_text(text, filename)
+            # Generate Abstract
             # Only texts longer than 500 characters get an LLM abstract, built
             # from the first 30000 characters; shorter texts keep the placeholder.
             abstract = "No summary generated."
             if len(text) > 500:
                 with st.spinner(f"Writing abstract for {filename}..."):
                     abstract = ask_llm(
                         query="Generate Abstract",
                         context=text[:30000],
                         mode="Abstract Generator",
                         model_provider="Gemini"
                     )
             st.session_state.db.add_document(doc_id, filename, text, abstract=abstract)
             st.session_state.search_engine.add_features(chunks)
+            progress_bar.progress((i + 1) / len(uploaded_files))
         status.text("Syncing to Cloud...")
+        # Push SPECIFICALLY the active database
+        SyncManager.push_data(selected_db)
         # The success message counts every uploaded file, including any that
         # were skipped by the "Error" check above.
+        st.success(f"Ingested {len(uploaded_files)} docs into {selected_db}!")
         time.sleep(1)
         st.session_state.uploader_key += 1
         st.rerun()
     st.divider()
+    # 4. Document Library
+    st.subheader(f"Files in {selected_db}")
     all_files = st.session_state.db.get_all_filenames()
     if all_files:
         with st.expander("View File List", expanded=False):
             for f in all_files:
                 st.text(f"• {f}")
         # The leading "" option means no file is selected for deletion by default.
+        file_to_del = st.selectbox("Delete File:", [""] + all_files)
         if file_to_del and st.button("🗑️ Delete Selected"):
             # deleted_id is assigned but not used afterwards in this view.
             deleted_id = st.session_state.db.delete_document(file_to_del)
             st.toast(f"Removed {file_to_del}")
+            SyncManager.push_data(selected_db)
             time.sleep(1)
             st.rerun()
         # Deletes every listed document, resets the search index, then pushes
         # the result to the remote repo; there is no confirmation step here.
+        if st.button("⚠️ Nuke Database", type="primary"):
             for f in all_files:
                 st.session_state.db.delete_document(f)
             st.session_state.search_engine.reset_index()
+            SyncManager.push_data(selected_db)
+            st.success("Database wiped.")
             time.sleep(1)
             st.rerun()
     else:
         st.info("Library is empty.")
 # --- MAIN UI: SEARCH ---
 # Main page: run the query against the session's search engine, offer an LLM
 # assessment of the top-ranked document, and list every hit with its abstract.
 st.title("⚓ Navy Policy Architect")
+st.caption(f"Connected to Knowledge Base: {st.session_state.current_db_name}")
+query = st.text_input("Enter your query...", placeholder="Search...")
 if query:
     results = st.session_state.search_engine.search(query, top_k=5)
     if not results:
         st.info("No matching documents found.")
     else:
         # Only the first result is used as LLM context: its doc_id is used to
         # fetch that document's stored text from the database.
         top_match = results[0]
         full_doc_text = st.session_state.db.get_doc_text(top_match['doc_id'])
         with st.container():
             st.markdown("### 🤖 Intelligence Hub")
             col1, col2 = st.columns(2)
             with col1:
                 analysis_mode = st.selectbox(
                     "Select Analysis Type:",
                     ["Executive Summary", "Action Plan", "Risk Assessment", "Socratic Review", "Instructor Mode"]
                 )
             with col2:
                 model_choice = st.selectbox(
                     "Select Model:",
                     ["Gemini (Cloud - Smartest)", "Granite (Private Space)"]
                 )
                 # Map the display label to the provider key passed to ask_llm.
                 provider = "Gemini" if "Gemini" in model_choice else "Granite"
             if st.button("✨ Generate Assessment"):
+                with st.spinner(f"Consulting {provider}..."):
                     response = ask_llm(query, full_doc_text, mode=analysis_mode, model_provider=provider)
                     st.markdown("---")
                     st.markdown(response)
                     st.markdown("---")
+        # --- SEARCH RESULTS SECTION (FIXED HTML) ---
+        with st.expander("📚 Reference Documents", expanded=True):
             for res in results:
                 # Accent color by score band: above 2, above 0, and everything else.
                 score = res['score']
                 color = "#09ab3b" if score > 2 else "#ffbd45" if score > 0 else "#ff4b4b"
                 doc_abstract = st.session_state.db.get_doc_abstract(res['doc_id'])
+                # IMPORTANT: Left-aligned HTML string to prevent Code Block rendering
                 html_content = f"""
 <div style="
     border-left: 5px solid {color};
         <h4 style="margin:0; color: #0e1117;">📄 {res['source']}</h4>
         <span style="font-size: 0.8em; color: #555; background: #ddd; padding: 2px 8px; border-radius: 4px;">Relevance: {score:.2f}</span>
     </div>
     <div style="background: #e3e6ea; padding: 10px; border-radius: 5px; margin-bottom: 10px;">
         <p style="margin: 0; font-size: 0.9em; color: #333;"><strong>🤖 Abstract:</strong> {doc_abstract}</p>
     </div>
     <p style="margin: 0; font-style: italic; font-size: 0.85em; color: #555;">
         "Matching Chunk: ...{res['snippet']}..."
     </p>