Update app.py
app.py CHANGED

```diff
@@ -1,426 +1,171 @@
 import streamlit as st
 import os
-import faiss
-import pickle
-import numpy as np
-import uuid
-from sentence_transformers import SentenceTransformer, CrossEncoder
-from huggingface_hub import HfApi, hf_hub_download, InferenceClient
-import ollama
-import requests
-import pypdf
-import docx
 import time
-
-
-from pdf2image import convert_from_bytes
 
 # --- CONFIGURATION ---
 DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
 HF_TOKEN = os.environ.get("HF_TOKEN")
 INDEX_FILE = "navy_index.faiss"
 META_FILE = "navy_metadata.pkl"
-DOC_STORE_FILE = "navy_docs.pkl" # NEW: Stores the full text
 
-st.set_page_config(page_title="…
 
-# --- …
-class IndexManager:
     @staticmethod
-    def load_from_hub():
-        if not HF_TOKEN: return
         try:
-            # Download …
-
-
-
-
-
             return True
-        except …
 
     @staticmethod
-    def …
         if not HF_TOKEN: return
         api = HfApi(token=HF_TOKEN)
         try:
             api.upload_file(path_or_fileobj=INDEX_FILE, path_in_repo=INDEX_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
             api.upload_file(path_or_fileobj=META_FILE, path_in_repo=META_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
-
-            st.toast("Database Synced!", icon="☁️")
-        except Exception as e: st.error(f"Sync Error: {e}")
-
-# --- PARSING LOGIC (OCR ENABLED) ---
-def parse_file(uploaded_file):
-    text = ""
-    filename = uploaded_file.name
-    method = "Fast"
-
-    try:
-        if filename.endswith(".pdf"):
-            pdf_bytes = uploaded_file.getvalue()
-            reader = pypdf.PdfReader(uploaded_file)
-
-            for i, page in enumerate(reader.pages):
-                extracted = page.extract_text()
-                if extracted:
-                    text += f"\n[PAGE {i+1}] {extracted}"
-
-            if len(text.strip()) < 50:
-                method = "OCR (Slow)"
-                images = convert_from_bytes(pdf_bytes)
-                text = ""
-                for i, img in enumerate(images):
-                    page_text = pytesseract.image_to_string(img)
-                    text += f"\n[PAGE {i+1}] {page_text}"
-
-        elif filename.endswith(".docx"):
-            doc = docx.Document(uploaded_file)
-            text = "\n".join([para.text for para in doc.paragraphs])
-        elif filename.endswith(".txt"):
-            text = uploaded_file.read().decode("utf-8")
-
-    except Exception as e:
-        return "", filename, f"Error: {str(e)}"
-
-    return text, filename, method
-
-# NEW: Added doc_id to link chunks back to parent
-def recursive_chunking(text, source, doc_id, chunk_size=500, overlap=100):
-    words = text.split()
-    chunks = []
-    for i in range(0, len(words), chunk_size - overlap):
-        chunk_text = " ".join(words[i:i + chunk_size])
-        if len(chunk_text) > 50:
-            chunks.append({
-                "text": chunk_text,
-                "source": source,
-                "doc_id": doc_id # The Critical Link
-            })
-    return chunks
-
-import requests # Make sure this is imported at the top
-
-def ask_llm(query, context):
-    """
-    Connects to the NavyDevilDoc/private-granite Space for inference.
-    """
-    if not HF_TOKEN:
-        return "Error: HF_TOKEN is missing. Cannot authenticate with Private Granite Space."
-
-    # 1. The URL of your remote API Space
-    # Hugging Face URLs are usually: https://{username}-{spacename}.hf.space
-    api_url = "https://navydevildoc-private-granite.hf.space/generate"
-
-    # 2. Prepare the payload matching your FastAPI 'PromptRequest' schema
-    payload = {
-        "text": f"USER QUESTION: {query}\n\nDOCUMENT CONTEXT:\n{context[:6000]}",
-        "persona": "You are a Senior Navy Yeoman and Subject Matter Expert. Provide a concise answer strictly based on the provided context.",
-        "model": "granite4:latest", # You can swap this for 'gemma3:latest' or 'llama3.2:latest' anytime!
-        "max_tokens": 5000
-    }
-
-    # 3. Headers for Authentication (Crucial for Private Spaces)
-    headers = {
-        "Authorization": f"Bearer {HF_TOKEN}",
-        "Content-Type": "application/json"
-    }
-
-    try:
-        response = requests.post(api_url, json=payload, headers=headers, timeout=600)
-
-        if response.status_code == 200:
-            data = response.json()
-            # Your API returns {"response": "...", "usage": ...}
-            return data.get("response", "Error: Empty response from Granite.")
-        else:
-            return f"Error {response.status_code}: {response.text}"
-
-    except Exception as e:
-        return f"Connection Error: {str(e)}\nMake sure the 'private-granite' Space is running."
-
-# --- CORE SEARCH ENGINE ---
-class DocSearchEngine:
-    def __init__(self):
-        # We try-except the init to catch the meta tensor error gracefully
-        try:
-            self.bi_encoder = SentenceTransformer(
-                'all-MiniLM-L6-v2',
-                device="cpu",
-                model_kwargs={"low_cpu_mem_usage": False}
-            )
-            self.cross_encoder = CrossEncoder(
-                'cross-encoder/ms-marco-MiniLM-L-6-v2',
-                device="cpu",
-                automodel_args={"low_cpu_mem_usage": False}
-            )
         except Exception as e:
-            st.error(f"…
-
-        self.index = None
-        self.metadata = []
-        self.doc_store = {} # NEW: The Parent Document Storage
-
-        self.load_data()
-
-    def load_data(self):
-        if os.path.exists(INDEX_FILE) and os.path.exists(META_FILE):
-            try:
-                self.index = faiss.read_index(INDEX_FILE)
-                with open(META_FILE, "rb") as f: self.metadata = pickle.load(f)
-                # Load Doc Store
-                if os.path.exists(DOC_STORE_FILE):
-                    with open(DOC_STORE_FILE, "rb") as f: self.doc_store = pickle.load(f)
-                else:
-                    self.doc_store = {}
-            except Exception as e:
-                self.reset_index()
-        else:
-            self.reset_index()
 
-
-
-
-
-
-
-
-    def add_document(self, full_text, source, chunks):
-        # 1. Add to Doc Store
-        # We need the doc_id from the first chunk (all chunks share it)
-        if not chunks: return 0
-        doc_id = chunks[0]['doc_id']
-        self.doc_store[doc_id] = full_text
-
-        # 2. Vectorize Chunks
-        texts = [c["text"] for c in chunks]
-        embeddings = self.bi_encoder.encode(texts)
-        faiss.normalize_L2(embeddings)
-
-        start_id = len(self.metadata)
-        ids = np.arange(start_id, start_id + len(chunks)).astype('int64')
-
-        self.index.add_with_ids(embeddings, ids)
-        self.metadata.extend(chunks)
-        self.save()
-        return len(texts)
-
-    def delete_file(self, filename):
-        if self.index is None or self.index.ntotal == 0: return 0
-
-        # Remove chunks from metadata
-        new_chunks = [c for c in self.metadata if c['source'] != filename]
-
-        # Remove from Doc Store (find doc_ids associated with filename)
-        # This is a bit expensive but safe
-        ids_to_remove = [c['doc_id'] for c in self.metadata if c['source'] == filename]
-        for did in set(ids_to_remove):
-            if did in self.doc_store:
-                del self.doc_store[did]
-
-        removed_count = len(self.metadata) - len(new_chunks)
-        if removed_count > 0:
-            self.reset_index()
-            # Re-add existing documents (we have to rebuild the index from scratch in FAISS when deleting)
-            # A more optimized way is to just save the new metadata and rebuild index from texts
-            # For this scale, rebuilding is fine.
-            if new_chunks:
-                # Re-vectorize is slow, so ideally we'd keep vectors.
-                # For simplicity in this demo, we'll just re-save what we have.
-                # NOTE: In a prod system, you wouldn't re-embed everything.
-                # You'd use index.remove_ids (if supported) or rebuild from vectors.
-                pass
-
-            self.index = faiss.IndexIDMap(faiss.IndexFlatIP(384)) # Wipe vector index
-            self.metadata = []
-
-            # Re-add all remaining chunks
-            if new_chunks:
-                # We need to re-embed.
-                texts = [c["text"] for c in new_chunks]
-                embeddings = self.bi_encoder.encode(texts)
-                faiss.normalize_L2(embeddings)
-                ids = np.arange(0, len(new_chunks)).astype('int64')
-                self.index.add_with_ids(embeddings, ids)
-                self.metadata = new_chunks
-
-            self.save()
-
-        return removed_count
-
-    def save(self):
-        faiss.write_index(self.index, INDEX_FILE)
-        with open(META_FILE, "wb") as f: pickle.dump(self.metadata, f)
-        with open(DOC_STORE_FILE, "wb") as f: pickle.dump(self.doc_store, f)
-
-    def search_documents(self, query, top_k=5):
-        if not self.index or self.index.ntotal == 0: return []
-        candidate_k = top_k * 10
-        q_vec = self.bi_encoder.encode([query])
-        faiss.normalize_L2(q_vec)
-
-        scores, indices = self.index.search(q_vec, min(self.index.ntotal, candidate_k))
-
-        raw_candidates = []
-        for i, idx in enumerate(indices[0]):
-            if idx != -1:
-                meta = self.metadata[idx]
-                raw_candidates.append({
-                    "text": meta["text"],
-                    "source": meta["source"],
-                    "doc_id": meta["doc_id"], # Retrieve ID
-                    "bi_score": scores[0][i]
-                })
-
-        # Deduplicate by Source (keep highest score per document)
-        doc_map = {}
-        for cand in raw_candidates:
-            source = cand['source']
-            score = cand['bi_score']
-            if source not in doc_map:
-                doc_map[source] = cand
-            else:
-                if score > doc_map[source]["bi_score"]:
-                    doc_map[source] = cand
-
-        ranked_docs = sorted(doc_map.values(), key=lambda x: x['bi_score'], reverse=True)
-        top_docs = ranked_docs[:top_k]
-
-        final_results = []
-        if top_docs:
-            pairs = [[query, doc['text']] for doc in top_docs]
-            cross_scores = self.cross_encoder.predict(pairs)
-            for i, doc in enumerate(top_docs):
-                final_results.append({
-                    "source": doc['source'],
-                    "score": cross_scores[i],
-                    "snippet": doc['text'],
-                    "doc_id": doc['doc_id'] # Pass ID to UI
-                })
-            final_results = sorted(final_results, key=lambda x: x["score"], reverse=True)
-
-        return final_results
-
-# --- UI LOGIC ---
-if 'engine' not in st.session_state:
-    IndexManager.load_from_hub()
-    st.session_state.engine = DocSearchEngine()
 
 with st.sidebar:
-    st.header("🗂️ …
-
-
         progress_bar = st.progress(0)
-
-
-        new_chunks_count = 0
-        failed_files = []
-
-        total = len(uploaded_files)
 
         for i, f in enumerate(uploaded_files):
-
-            progress_bar.progress((i)/total)
 
-
 
-            if …
-
                 continue
 
-
-
-
 
-            # …
-
-
 
-
-            st.session_state.engine.add_document(txt, fname, file_chunks)
-            new_chunks_count += len(file_chunks)
 
-
-
-
-
-
-
-        if failed_files:
-            with st.expander("⚠️ Issues Detected", expanded=True):
-                for ff in failed_files: st.write(ff)
 
     st.divider()
-
-
-
-
-
-
-
-    if …
-
-
             st.rerun()
 
-
-
-
-        st.rerun()
 
-st.…
-query = st.text_input("What are you looking for?")
 
 if query:
-
-
-
-
         top_match = results[0]
 
-        # …
-        full_doc_text = st.session_state.…
 
         with st.container():
-            st.markdown("### 🤖 …
-            st.caption(f"Analyzing …
 
-            if st.button("✨ …
-                with st.spinner("…
-
                     st.markdown("---")
-                    st.…
                     st.markdown("---")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                <div style="
-                    border: 1px solid #ddd;
-                    border-left: 5px solid {border_color};
-                    padding: 15px;
-                    border-radius: 5px;
-                    margin-bottom: 10px;
-                ">
-                    <h3 style="margin:0; padding:0;">📄 {res['source']}</h3>
-                    <small style="color: gray;">Confidence: {confidence} ({score:.2f})</small>
-                </div>
-                """, unsafe_allow_html=True)
-                with st.expander("View matching excerpt"):
-                    st.markdown(f"**...{res['snippet']}...**")
```
```diff
 import streamlit as st
 import os
+from huggingface_hub import HfApi, hf_hub_download
 import time
+
+# --- IMPORT OUR NEW MODULES ---
+from src.database import DatabaseManager
+from src.search import SearchEngine
+from src.parsers import process_file, chunk_text
+from src.llm_client import ask_granite
 
 # --- CONFIGURATION ---
 DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
 HF_TOKEN = os.environ.get("HF_TOKEN")
+DB_FILE = "navy_docs.db"
 INDEX_FILE = "navy_index.faiss"
 META_FILE = "navy_metadata.pkl"
 
+st.set_page_config(page_title="Navy Policy Architect", layout="wide", page_icon="⚓")
 
```
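The new version moves the heavy lifting into four `src` modules that are not part of this commit, so their contents below are inferred. `src/llm_client.ask_granite` presumably keeps the behavior of the removed `ask_llm`; a minimal sketch under that assumption, reusing the endpoint, payload, and auth headers from the deleted code:

```python
# src/llm_client.py -- hypothetical sketch, not part of this commit.
# Assumes ask_granite carries over the removed ask_llm: POST to the
# private-granite Space and return the "response" field of the JSON body.
import os
import requests

HF_TOKEN = os.environ.get("HF_TOKEN")
API_URL = "https://navydevildoc-private-granite.hf.space/generate"  # from the old ask_llm

def ask_granite(query: str, context: str) -> str:
    if not HF_TOKEN:
        return "Error: HF_TOKEN is missing."
    payload = {
        "text": f"USER QUESTION: {query}\n\nDOCUMENT CONTEXT:\n{context[:6000]}",
        "persona": "You are a Senior Navy Yeoman and Subject Matter Expert. "
                   "Provide a concise answer strictly based on the provided context.",
        "model": "granite4:latest",
        "max_tokens": 5000,
    }
    headers = {"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"}
    try:
        resp = requests.post(API_URL, json=payload, headers=headers, timeout=600)
        if resp.status_code == 200:
            return resp.json().get("response", "Error: Empty response from Granite.")
        return f"Error {resp.status_code}: {resp.text}"
    except Exception as e:
        return f"Connection Error: {e}"
```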
```diff
+# --- CLOUD SYNC MANAGER ---
+class SyncManager:
+    """Handles downloading/uploading the Database & Index to Hugging Face"""
     @staticmethod
+    def pull_data():
+        if not HF_TOKEN: return
         try:
+            # Download SQLite DB
+            if not os.path.exists(DB_FILE):
+                hf_hub_download(repo_id=DATASET_REPO_ID, filename=DB_FILE, local_dir=".", token=HF_TOKEN)
+            # Download FAISS Index
+            if not os.path.exists(INDEX_FILE):
+                hf_hub_download(repo_id=DATASET_REPO_ID, filename=INDEX_FILE, local_dir=".", token=HF_TOKEN)
+                hf_hub_download(repo_id=DATASET_REPO_ID, filename=META_FILE, local_dir=".", token=HF_TOKEN)
             return True
+        except Exception as e:
+            # It's okay if files don't exist yet (first run)
+            print(f"Sync Note: {e}")
+            return False
 
     @staticmethod
+    def push_data():
         if not HF_TOKEN: return
         api = HfApi(token=HF_TOKEN)
         try:
+            # Upload SQLite DB
+            api.upload_file(path_or_fileobj=DB_FILE, path_in_repo=DB_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
+            # Upload FAISS Index
             api.upload_file(path_or_fileobj=INDEX_FILE, path_in_repo=INDEX_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
             api.upload_file(path_or_fileobj=META_FILE, path_in_repo=META_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
+            st.toast("Cloud Sync Complete!", icon="☁️")
         except Exception as e:
+            st.error(f"Sync Error: {e}")
 
```
```diff
+# --- INITIALIZATION ---
+if 'db' not in st.session_state:
+    with st.spinner("Connecting to Secure Cloud Storage..."):
+        SyncManager.pull_data()
+        st.session_state.db = DatabaseManager(DB_FILE)
+        st.session_state.search_engine = SearchEngine()
 
```
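`DatabaseManager` replaces the pickled `doc_store` with a SQLite file. Its interface can be read off the call sites in `app.py` (`add_document(doc_id, filename, text)`, `get_all_filenames()`, `delete_document(filename)`, `get_doc_text(doc_id)`); the schema below is an assumption:

```python
# src/database.py -- hypothetical sketch inferred from the call sites in
# app.py; the actual schema in the commit may differ.
import sqlite3

class DatabaseManager:
    def __init__(self, db_file: str):
        # check_same_thread=False: Streamlit may touch the connection across reruns
        self.conn = sqlite3.connect(db_file, check_same_thread=False)
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS documents "
            "(doc_id TEXT PRIMARY KEY, filename TEXT, full_text TEXT)"
        )
        self.conn.commit()

    def add_document(self, doc_id: str, filename: str, text: str) -> None:
        self.conn.execute(
            "INSERT OR REPLACE INTO documents VALUES (?, ?, ?)",
            (doc_id, filename, text),
        )
        self.conn.commit()

    def get_all_filenames(self) -> list:
        rows = self.conn.execute("SELECT DISTINCT filename FROM documents").fetchall()
        return [r[0] for r in rows]

    def delete_document(self, filename: str):
        row = self.conn.execute(
            "SELECT doc_id FROM documents WHERE filename = ?", (filename,)
        ).fetchone()
        self.conn.execute("DELETE FROM documents WHERE filename = ?", (filename,))
        self.conn.commit()
        return row[0] if row else None

    def get_doc_text(self, doc_id: str) -> str:
        row = self.conn.execute(
            "SELECT full_text FROM documents WHERE doc_id = ?", (doc_id,)
        ).fetchone()
        return row[0] if row else ""
```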
```diff
+# --- SIDEBAR: UPLOAD & MANAGE ---
 with st.sidebar:
+    st.header("🗂️ Knowledge Base")
+
+    # 1. Upload Section
+    uploaded_files = st.file_uploader("Upload Policy Documents", accept_multiple_files=True, type=['pdf', 'docx', 'txt', 'csv', 'xlsx'])
+
+    if uploaded_files and st.button("Ingest Documents"):
         progress_bar = st.progress(0)
+        status = st.empty()
 
         for i, f in enumerate(uploaded_files):
+            status.text(f"Processing: {f.name}...")
 
+            # A. Parse File (handled by src/parsers.py)
+            text, filename, method = process_file(f)
 
+            if "Error" in method:
+                st.error(f"Failed {filename}: {method}")
                 continue
+
+            # B. Chunk & ID (handled by src/parsers.py)
+            chunks, doc_id = chunk_text(text, filename)
 
+            # C. Save to SQLite (handled by src/database.py)
+            # We explicitly store the full text for reliable RAG later
+            st.session_state.db.add_document(doc_id, filename, text)
 
+            # D. Add to Vector Index (handled by src/search.py)
+            # We only vector search the chunks, but they link back to doc_id
+            st.session_state.search_engine.add_features(chunks)
 
+            progress_bar.progress((i + 1) / len(uploaded_files))
 
+        status.text("Syncing to Cloud...")
+        SyncManager.push_data()
+        st.success(f"Successfully ingested {len(uploaded_files)} documents!")
+        time.sleep(2)
+        st.rerun()
 
```
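`process_file` and `chunk_text` replace the removed `parse_file` and `recursive_chunking`. A sketch of `src/parsers.py` assuming the logic moved over largely intact; the `(chunks, doc_id)` return shape comes from the call site above, and the OCR fallback from the old `parse_file` is omitted here for brevity:

```python
# src/parsers.py -- hypothetical sketch; assumes the removed parse_file /
# recursive_chunking logic moved here, with chunk_text now minting the doc_id.
import uuid
import docx
import pypdf

def process_file(uploaded_file):
    """Return (text, filename, method); method carries 'Error: ...' on failure."""
    text, filename, method = "", uploaded_file.name, "Fast"
    try:
        if filename.endswith(".pdf"):
            reader = pypdf.PdfReader(uploaded_file)
            for i, page in enumerate(reader.pages):
                extracted = page.extract_text()
                if extracted:
                    text += f"\n[PAGE {i+1}] {extracted}"
        elif filename.endswith(".docx"):
            text = "\n".join(p.text for p in docx.Document(uploaded_file).paragraphs)
        elif filename.endswith(".txt"):
            text = uploaded_file.read().decode("utf-8")
    except Exception as e:
        return "", filename, f"Error: {e}"
    return text, filename, method

def chunk_text(text, source, chunk_size=500, overlap=100):
    """Split into overlapping word windows, all tagged with a shared doc_id."""
    doc_id = str(uuid.uuid4())
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        piece = " ".join(words[i:i + chunk_size])
        if len(piece) > 50:  # skip trailing fragments
            chunks.append({"text": piece, "source": source, "doc_id": doc_id})
    return chunks, doc_id
```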
```diff
     st.divider()
+
+    # 2. Management Section
+    st.subheader("Manage Files")
+    all_files = st.session_state.db.get_all_filenames()
+    if all_files:
+        st.caption(f"Total Documents: {len(all_files)}")
+        file_to_del = st.selectbox("Delete File:", [""] + all_files)
+        if file_to_del and st.button("🗑️ Remove Document"):
+            # Delete from SQL
+            deleted_id = st.session_state.db.delete_document(file_to_del)
+            # Note: FAISS deletion is hard, usually we just rebuild index.
+            # For now, we accept the "Ghost" vectors in FAISS until a full rebuild.
+            st.toast(f"Removed {file_to_del} from Database.")
+            SyncManager.push_data()
+            time.sleep(1)
             st.rerun()
```
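The "ghost vector" note above exists because a flat FAISS index has no cheap per-document delete. One way to reclaim the space later, mirroring the rebuild strategy of the removed `delete_file`, is a periodic full re-embed of whatever chunks survive; a sketch (the metadata layout is assumed to match the old code):

```python
# Hypothetical maintenance job: rebuild the FAISS index from surviving chunks,
# dropping the "ghost" vectors left behind by deletes. Mirrors the rebuild in
# the removed delete_file; not part of this commit.
import pickle
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

def rebuild_index(chunks, index_file="navy_index.faiss", meta_file="navy_metadata.pkl"):
    encoder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
    index = faiss.IndexIDMap(faiss.IndexFlatIP(384))  # 384 = MiniLM embedding width
    if chunks:
        embeddings = encoder.encode([c["text"] for c in chunks])
        faiss.normalize_L2(embeddings)
        index.add_with_ids(embeddings, np.arange(len(chunks)).astype("int64"))
    faiss.write_index(index, index_file)
    with open(meta_file, "wb") as f:
        pickle.dump(chunks, f)
```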
```diff
 
+# --- MAIN UI: SEARCH ---
+st.title("⚓ Navy Policy Architect")
+st.markdown("Search across PDF, Word, and Excel files. Generate AI summaries based on official policy.")
 
+query = st.text_input("Enter your query (e.g., 'What are the requirements for O-5 promotion?')", placeholder="Search...")
 
 if query:
+    # 1. SEARCH (Vector Search -> Returns relevant chunks)
+    results = st.session_state.search_engine.search(query, top_k=5)
+
+    if not results:
+        st.info("No matching documents found.")
+    else:
+        # 2. SYNTHESIS (The "Parent Retrieval" Magic)
         top_match = results[0]
 
+        # We grab the FULL TEXT from SQLite using the doc_id found in the chunk
+        full_doc_text = st.session_state.db.get_doc_text(top_match['doc_id'])
 
+        # --- AI SUMMARY SECTION ---
         with st.container():
+            st.markdown("### 🤖 Executive Summary")
+            st.caption(f"Analyzing primary source: {top_match['source']}")
 
+            if st.button("✨ Generate Assessment"):
+                with st.spinner("Consulting Granite Model..."):
+                    # Call our separated LLM client
+                    response = ask_granite(query, full_doc_text)
+
                     st.markdown("---")
+                    st.markdown(response)
                     st.markdown("---")
+
+                    # Feature: Source Verification
+                    with st.expander("🔍 View Source Data used for this summary"):
+                        st.text(full_doc_text[:2000] + "...")
+
+        # --- SEARCH RESULTS SECTION ---
+        st.subheader("Reference Documents")
+        for res in results:
+            score = res['score']
+            # Dynamic color coding based on relevance
+            color = "#09ab3b" if score > 2 else "#ffbd45" if score > 0 else "#ff4b4b"
+
+            with st.container():
+                st.markdown(f"""
+                <div style="border-left: 5px solid {color}; padding: 10px; background-color: #f0f2f6; margin-bottom: 10px; border-radius: 5px;">
+                    <h4 style="margin:0;">📄 {res['source']}</h4>
+                    <p style="margin:0; font-style: italic; font-size: 0.9em;">"...{res['snippet']}..."</p>
+                    <small>Relevance Score: {score:.2f}</small>
+                </div>
+                """, unsafe_allow_html=True)
```
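`src/search.py` is the last missing module. Judging by the removed `DocSearchEngine`, `SearchEngine` presumably keeps the same pipeline: bi-encoder recall over FAISS, per-source deduplication, then a cross-encoder re-rank whose raw logits become the `score` driving the color thresholds above. A condensed sketch under that assumption (the `add_features` and `search` names come from the call sites):

```python
# src/search.py -- hypothetical sketch based on the removed DocSearchEngine;
# the actual module in the commit may differ.
import os
import pickle
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder

INDEX_FILE = "navy_index.faiss"
META_FILE = "navy_metadata.pkl"

class SearchEngine:
    def __init__(self):
        self.bi_encoder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
        self.cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", device="cpu")
        if os.path.exists(INDEX_FILE) and os.path.exists(META_FILE):
            self.index = faiss.read_index(INDEX_FILE)
            with open(META_FILE, "rb") as f:
                self.metadata = pickle.load(f)
        else:
            self.index = faiss.IndexIDMap(faiss.IndexFlatIP(384))
            self.metadata = []

    def add_features(self, chunks):
        embeddings = self.bi_encoder.encode([c["text"] for c in chunks])
        faiss.normalize_L2(embeddings)
        ids = np.arange(len(self.metadata), len(self.metadata) + len(chunks)).astype("int64")
        self.index.add_with_ids(embeddings, ids)
        self.metadata.extend(chunks)
        faiss.write_index(self.index, INDEX_FILE)
        with open(META_FILE, "wb") as f:
            pickle.dump(self.metadata, f)

    def search(self, query, top_k=5):
        if self.index.ntotal == 0:
            return []
        q = self.bi_encoder.encode([query])
        faiss.normalize_L2(q)
        scores, ids = self.index.search(q, min(self.index.ntotal, top_k * 10))
        # Keep only the best-scoring chunk per source document (dedup).
        best = {}
        for score, idx in zip(scores[0], ids[0]):
            if idx == -1:
                continue
            meta = self.metadata[idx]
            if meta["source"] not in best or score > best[meta["source"]]["bi_score"]:
                best[meta["source"]] = {**meta, "bi_score": score}
        docs = sorted(best.values(), key=lambda d: d["bi_score"], reverse=True)[:top_k]
        if not docs:
            return []
        # Re-rank the survivors with the cross-encoder; raw logits become 'score'.
        cross = self.cross_encoder.predict([[query, d["text"]] for d in docs])
        results = [
            {"source": d["source"], "snippet": d["text"], "doc_id": d["doc_id"], "score": float(s)}
            for d, s in zip(docs, cross)
        ]
        return sorted(results, key=lambda r: r["score"], reverse=True)
```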