Spaces:

NavyDevilDoc
/

Semantic_Search

Sleeping

File size: 11,708 Bytes

ef513a5
0b474cc
79adaa2
39f39ce
79adaa2
 
 
 
 
7b68202
ef513a5
 
f09334e
39f313e
0b474cc
 
 
79adaa2
39f313e
13a7929
79adaa2
 
74f60fc
39f313e
74f60fc
 
39f313e
74f60fc
13a7929
74f60fc
 
 
 
 
 
 
 
9ea268c
 
 
74f60fc
13a7929
9ea268c
 
 
 
 
13a7929
 
9ea268c
 
 
 
13a7929
 
 
 
 
 
 
 
 
 
 
 
 
 
9ea268c
 
74f60fc
39f313e
79adaa2
9ea268c
 
39f313e
 
74f60fc
f09334e
39f313e
 
13a7929
74f60fc
f09334e
 
79adaa2
73ca4a0
74f60fc
13a7929
 
74f60fc
ef513a5
79adaa2
 
74f60fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ea268c
56e4e5f
 
9ea268c
56e4e5f
9ea268c
 
 
 
 
 
56e4e5f
 
 
 
 
 
 
 
 
 
9ea268c
 
 
56e4e5f
 
 
9ea268c
74f60fc
 
bfaaaee
 
 
 
74f60fc
bfaaaee
 
 
 
79adaa2
 
73ca4a0
79adaa2
73ca4a0
bfaaaee
 
73ca4a0
79adaa2
c6eeec6
bfaaaee
74f60fc
bfaaaee
 
79adaa2
c6eeec6
79adaa2
 
73ca4a0
79adaa2
 
e546bbb
74f60fc
a14f7cc
 
 
 
 
 
 
 
 
 
 
79adaa2
74f60fc
73ca4a0
79adaa2
74f60fc
 
bfaaaee
74f60fc
bfaaaee
 
79adaa2
f09334e
39f39ce
79adaa2
74f60fc
 
bfaaaee
 
 
 
 
 
 
74f60fc
bfaaaee
 
 
74f60fc
bfaaaee
 
 
74f60fc
bfaaaee
 
 
74f60fc
 
bfaaaee
 
 
 
39f39ce
79adaa2
 
74f60fc
39f39ce
74f60fc
39f313e
 
79adaa2
 
 
 
 
d71c08c
79adaa2
73ca4a0
d71c08c
b62f4f4
 
 
 
 
1a8ac84
b62f4f4
 
 
 
 
 
 
 
79adaa2
74f60fc
b62f4f4
5f4804b
79adaa2
5f4804b
79adaa2
74f60fc
 
a859b2e
 
 
a14f7cc
a859b2e
74f60fc
bd85152

import html
import os
import time

import streamlit as st
from huggingface_hub import HfApi, hf_hub_download

# --- IMPORT OUR NEW MODULES ---
from src.database import DatabaseManager
from src.search import SearchEngine
from src.parsers import process_file, chunk_text
from src.llm_client import ask_llm

# --- CONFIGURATION ---
# Hugging Face dataset repository used for cloud sync.
DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
# Access token taken from the environment; None when the variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
# File names of the index and its metadata that are synced with each database.
INDEX_FILE, META_FILE = "navy_index.faiss", "navy_metadata.pkl"

st.set_page_config(
    layout="wide",
    page_icon="⚓",
    page_title="Navy Policy Architect",
)

# --- CLOUD SYNC MANAGER (FIXED) ---
class SyncManager:
    """Download and upload the database and index files to Hugging Face.

    All methods are static. They operate on the dataset repository named by
    ``DATASET_REPO_ID`` and authenticate with ``HF_TOKEN``.
    """

    @staticmethod
    def _download(filename, **kwargs):
        """Fetch one file from the dataset repo into the current directory.

        Extra keyword arguments are forwarded to ``hf_hub_download``.
        """
        return hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename=filename,
            local_dir=".",
            token=HF_TOKEN,
            repo_type="dataset",  # the repo is a dataset, not a model
            **kwargs,
        )

    @staticmethod
    def get_remote_dbs():
        """Return the names of the ``.db`` files present in the dataset repo.

        Returns:
            A list of file names. The list is empty when ``HF_TOKEN`` is not
            set or when listing the repository fails, so callers can fall
            back to a default.
        """
        if not HF_TOKEN:
            return []
        try:
            api = HfApi(token=HF_TOKEN)
            files = api.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
            return [f for f in files if f.endswith(".db")]
        except Exception:
            return []

    @staticmethod
    def pull_data(db_filename):
        """Download ``db_filename`` plus the index files into the working directory.

        Args:
            db_filename: Name of the database file inside the dataset repo.

        Returns:
            ``True`` when the database file was downloaded,
            ``False`` when ``HF_TOKEN`` is missing (an error is shown in the UI),
            or the exception text (``str``) when the database download failed.
            Callers must therefore compare with ``is True`` rather than rely
            on truthiness.
        """
        if not HF_TOKEN:
            st.error("HF_TOKEN missing.")
            return False

        try:
            SyncManager._download(db_filename, force_download=False)
        except Exception as e:
            # Hand the message back so the caller can decide how to display it.
            return str(e)

        # The index and metadata are best effort: a database without them is
        # still usable. Only ordinary errors are ignored; KeyboardInterrupt
        # and SystemExit are allowed to propagate.
        try:
            SyncManager._download(INDEX_FILE)
            SyncManager._download(META_FILE)
        except Exception:
            pass

        return True

    @staticmethod
    def push_data(db_filename):
        """Upload ``db_filename`` and the index files to the dataset repo.

        Does nothing when ``HF_TOKEN`` is missing. Upload failures are
        reported in the UI with ``st.error`` instead of being raised.
        """
        if not HF_TOKEN:
            return
        api = HfApi(token=HF_TOKEN)
        try:
            # Each file keeps its local name as its path inside the repo.
            for path in (db_filename, INDEX_FILE, META_FILE):
                api.upload_file(
                    path_or_fileobj=path,
                    path_in_repo=path,
                    repo_id=DATASET_REPO_ID,
                    repo_type="dataset",
                )
            st.toast("Cloud Sync Complete!", icon="☁️")
        except Exception as e:
            st.error(f"Sync Error (Push): {e}")

            
# --- SIDEBAR: KNOWLEDGE BASE SELECTOR ---
with st.sidebar:
    st.header("🗄️ Knowledge Base")

    # 1. Database Selector
    # The dropdown is filled once per session from the cloud listing; a
    # default entry is used when nothing could be listed.
    if "available_dbs" not in st.session_state:
        st.session_state.available_dbs = SyncManager.get_remote_dbs()
        if not st.session_state.available_dbs:
            st.session_state.available_dbs = ["navy_docs.db"]  # Default if empty

    selected_db = st.selectbox("Select Database:", st.session_state.available_dbs)

    # 2. Create New Database Option
    with st.expander("➕ Create New Database"):
        new_db_name = st.text_input("Name (e.g., 'Medical.db')")
        if st.button("Create"):
            new_db_name = new_db_name.strip()
            if not new_db_name:
                # An empty name would otherwise register a database called ".db".
                st.warning("Please enter a database name.")
            else:
                if not new_db_name.endswith(".db"):
                    new_db_name += ".db"
                # Keep the selector free of duplicate entries.
                if new_db_name not in st.session_state.available_dbs:
                    st.session_state.available_dbs.append(new_db_name)
                # Rerun so the selector is rebuilt with the new name available.
                st.rerun()

    # --- INITIALIZATION (Dynamic based on selection) ---
    # If the DB has changed or isn't loaded, load it now
    if 'current_db_name' not in st.session_state or st.session_state.current_db_name != selected_db:

        # Placeholder that holds the sync status message
        msg_container = st.empty()

        with st.spinner(f"Syncing {selected_db}..."):

            # 1. Attempt the Pull
            result = SyncManager.pull_data(selected_db)

            # 2. Check the Result (pull_data returns True, False or an error string)
            if result is True:
                msg_container.success(f"Loaded {selected_db} from Cloud.")
            else:
                # Not fatal: warn and continue with whatever is available locally.
                msg_container.warning(f"Could not find {selected_db} in cloud. Creating new local database.")
                if isinstance(result, str):
                    # Show the underlying error instead of discarding it.
                    st.caption(f"Sync detail: {result}")

            # 3. Initialize the Database Manager (Either with the downloaded file or a new blank one)
            try:
                st.session_state.db = DatabaseManager(selected_db)
                st.session_state.search_engine = SearchEngine()
                st.session_state.current_db_name = selected_db
                # No rerun here on purpose, so the message above stays visible
            except Exception as e:
                st.error(f"Failed to initialize database: {e}")
                st.stop()

    # 3. Upload Section
    # Changing the uploader's key gives a fresh, empty widget after ingestion.
    if "uploader_key" not in st.session_state:
        st.session_state.uploader_key = 0

    uploaded_files = st.file_uploader(
        f"Upload to {selected_db}",
        accept_multiple_files=True,
        type=['pdf', 'docx', 'txt', 'csv', 'xlsx'],
        key=f"uploader_{st.session_state.uploader_key}"
    )

    if uploaded_files and st.button("Ingest Documents"):
        progress_bar = st.progress(0)
        status = st.empty()

        existing_files = set(st.session_state.db.get_all_filenames())
        total = len(uploaded_files)
        ingested = 0

        for i, f in enumerate(uploaded_files):
            status.text(f"Processing: {f.name}...")

            # Parse first: if parsing fails, the stored version of the
            # document must not have been deleted already.
            text, filename, method = process_file(f)

            if "Error" in method:
                st.error(f"Failed {filename}: {method}")
                progress_bar.progress((i + 1) / total)
                continue

            if f.name in existing_files:
                st.toast(f"♻️ Updating: {f.name}")
                st.session_state.db.delete_document(f.name)

            chunks, doc_id = chunk_text(text, filename)

            # Generate Abstract (only for texts longer than 500 characters)
            abstract = "No summary generated."
            if len(text) > 500:
                with st.spinner(f"Writing abstract for {filename}..."):
                    abstract = ask_llm(
                        query="Generate Abstract",
                        context=text[:30000],
                        mode="Abstract Generator",
                        model_provider="Gemini"
                    )

            st.session_state.db.add_document(doc_id, filename, text, abstract=abstract)
            st.session_state.search_engine.add_features(chunks)
            ingested += 1
            progress_bar.progress((i + 1) / total)

        if ingested:
            status.text("Syncing to Cloud...")
            # Push SPECIFICALLY the active database
            SyncManager.push_data(selected_db)

            # Report only the documents that were actually stored.
            st.success(f"Ingested {ingested} docs into {selected_db}!")
            time.sleep(1)
            st.session_state.uploader_key += 1
            st.rerun()
        else:
            # Nothing changed: skip the push and the rerun so the error
            # messages above stay on screen.
            status.empty()
            st.warning("No documents were ingested.")

    st.divider()

    # 4. Document Library
    st.subheader(f"Files in {selected_db}")
    all_files = st.session_state.db.get_all_filenames()

    if all_files:
        with st.expander("View File List", expanded=False):
            for f in all_files:
                st.text(f"• {f}")

        file_to_del = st.selectbox("Delete File:", [""] + all_files)
        if file_to_del and st.button("🗑️ Delete Selected"):
            # NOTE(review): only the database row is removed here; the search
            # index is not touched, so chunks of the deleted file may still be
            # returned by search. Confirm against SearchEngine.
            st.session_state.db.delete_document(file_to_del)
            st.toast(f"Removed {file_to_del}")
            SyncManager.push_data(selected_db)
            time.sleep(1)
            st.rerun()

        if st.button("⚠️ Nuke Database", type="primary"):
            for f in all_files:
                st.session_state.db.delete_document(f)
            st.session_state.search_engine.reset_index()
            SyncManager.push_data(selected_db)
            st.success("Database wiped.")
            time.sleep(1)
            st.rerun()
    else:
        st.info("Library is empty.")

# --- MAIN UI: SEARCH ---
def _html_safe(value):
    """Return *value* as text that is safe to embed in the result card HTML.

    Runs of whitespace (including newlines) are collapsed to single spaces,
    because in CommonMark a blank line ends a raw HTML block and the rest
    would be rendered as Markdown. HTML special characters are then escaped
    so that document content cannot inject markup into the page.
    """
    return html.escape(" ".join(str(value).split()))


st.title("⚓ Navy Policy Architect")
st.caption(f"Connected to Knowledge Base: {st.session_state.current_db_name}")

query = st.text_input("Enter your query...", placeholder="Search...")

if query:
    results = st.session_state.search_engine.search(query, top_k=5)

    if not results:
        st.info("No matching documents found.")
    else:
        # The assessment is generated from the full text of the best match.
        top_match = results[0]
        full_doc_text = st.session_state.db.get_doc_text(top_match['doc_id'])

        with st.container():
            st.markdown("### 🤖 Intelligence Hub")
            col1, col2 = st.columns(2)
            with col1:
                analysis_mode = st.selectbox(
                    "Select Analysis Type:",
                    ["Executive Summary", "Action Plan", "Risk Assessment", "Socratic Review", "Instructor Mode"]
                )
            with col2:
                model_choice = st.selectbox(
                    "Select Model:",
                    ["Gemini (Cloud - Smartest)", "Granite (Private Space)"]
                )
                # Map the display label to the provider name passed to ask_llm
                provider = "Gemini" if "Gemini" in model_choice else "Granite"

            if st.button("✨ Generate Assessment"):
                with st.spinner(f"Consulting {provider}..."):
                    response = ask_llm(query, full_doc_text, mode=analysis_mode, model_provider=provider)
                    st.markdown("---")
                    st.markdown(response)
                    st.markdown("---")

        # --- SEARCH RESULTS SECTION ---
        with st.expander("📚 Reference Documents", expanded=True):
            for res in results:
                score = res['score']
                # Accent colour by score: > 2 green, > 0 amber, otherwise red
                color = "#09ab3b" if score > 2 else "#ffbd45" if score > 0 else "#ff4b4b"

                # Everything that originates from documents (or the LLM) is
                # escaped before it is placed into HTML that is rendered with
                # unsafe_allow_html=True.
                source = _html_safe(res['source'])
                doc_abstract = _html_safe(st.session_state.db.get_doc_abstract(res['doc_id']))
                snippet = _html_safe(res['snippet'])

                # IMPORTANT: Left-aligned HTML string to prevent Code Block rendering
                html_content = f"""
<div style="
    border-left: 5px solid {color}; 
    padding: 15px; 
    background-color: #f0f2f6; 
    margin-bottom: 15px; 
    border-radius: 5px;
    color: #1f1f1f;
">
    <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;">
        <h4 style="margin:0; color: #0e1117;">📄 {source}</h4>
        <span style="font-size: 0.8em; color: #555; background: #ddd; padding: 2px 8px; border-radius: 4px;">Relevance: {score:.2f}</span>
    </div>
    <div style="background: #e3e6ea; padding: 10px; border-radius: 5px; margin-bottom: 10px;">
        <p style="margin: 0; font-size: 0.9em; color: #333;"><strong>🤖 Abstract:</strong> {doc_abstract}</p>
    </div>
    <p style="margin: 0; font-style: italic; font-size: 0.85em; color: #555;">
        "Matching Chunk: ...{snippet}..."
    </p>
</div>
"""
                st.markdown(html_content, unsafe_allow_html=True)