Update app.py

app.py CHANGED
|
@@ -9,6 +9,9 @@ from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError
|
|
| 9 |
import pypdf
|
| 10 |
import docx
|
| 11 |
import time
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
# --- CONFIGURATION ---
|
| 14 |
DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
|
|
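A practical note on the three new imports: pdf2image and pytesseract are thin wrappers around system binaries (Poppler and Tesseract respectively), so the pip packages alone are not enough. On a Hugging Face Space the usual fix is a packages.txt next to requirements.txt; the Debian package names below are an assumption about the Space's base image:

poppler-utils
tesseract-ocr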
@@ -39,22 +42,45 @@ class IndexManager:
         st.toast("Database Synced!", icon="☁️")
     except Exception as e: st.error(f"Sync Error: {e}")
 
-# --- PARSING
+# --- PARSING LOGIC (OCR ENABLED) ---
 def parse_file(uploaded_file):
     text = ""
     filename = uploaded_file.name
+    method = "Fast"
+
     try:
         if filename.endswith(".pdf"):
+            # Method 1: Fast Text Extraction
+            pdf_bytes = uploaded_file.getvalue()
             reader = pypdf.PdfReader(uploaded_file)
+
             for i, page in enumerate(reader.pages):
-
+                extracted = page.extract_text()
+                if extracted:
+                    text += f"\n[PAGE {i+1}] {extracted}"
+
+            # Method 2: OCR Fallback
+            # If fast method yielded almost no text, switch to OCR
+            if len(text.strip()) < 50:
+                method = "OCR (Slow)"
+                # Reset file pointer or use bytes
+                images = convert_from_bytes(pdf_bytes)
+                text = "" # Reset text
+                for i, img in enumerate(images):
+                    # Tesseract reads the image
+                    page_text = pytesseract.image_to_string(img)
+                    text += f"\n[PAGE {i+1}] {page_text}"
+
         elif filename.endswith(".docx"):
             doc = docx.Document(uploaded_file)
             text = "\n".join([para.text for para in doc.paragraphs])
         elif filename.endswith(".txt"):
             text = uploaded_file.read().decode("utf-8")
-
-
+
+    except Exception as e:
+        return "", filename, f"Error: {str(e)}"
+
+    return text, filename, method
 
 def recursive_chunking(text, source, chunk_size=500, overlap=100):
     words = text.split()
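The fallback above is all-or-nothing: if the entire PDF extracts to fewer than 50 characters, every page gets rendered and OCRed. A per-page variant would pay the OCR cost only for pages that extract empty. A minimal sketch of that idea, not the committed code (the function name and the min_chars threshold are mine):

import io

import pypdf
import pytesseract
from pdf2image import convert_from_bytes

def parse_pdf_per_page(pdf_bytes, min_chars=5):
    # Fast extraction first; OCR only the pages that come back (nearly) empty.
    reader = pypdf.PdfReader(io.BytesIO(pdf_bytes))
    pages = []
    for i, page in enumerate(reader.pages):
        extracted = (page.extract_text() or "").strip()
        if len(extracted) < min_chars:
            # pdf2image page numbers are 1-indexed; render just this page
            img = convert_from_bytes(pdf_bytes, first_page=i + 1, last_page=i + 1)[0]
            extracted = pytesseract.image_to_string(img)
        pages.append(f"[PAGE {i+1}] {extracted}")
    return "\n".join(pages)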
@@ -68,7 +94,7 @@ def recursive_chunking(text, source, chunk_size=500, overlap=100):
 # --- CORE SEARCH ENGINE ---
 class DocSearchEngine:
     def __init__(self):
-        # Force CPU
+        # Force CPU
         self.bi_encoder = SentenceTransformer('all-MiniLM-L6-v2', device="cpu")
         self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device="cpu", automodel_args={"low_cpu_mem_usage": False})
 
@@ -80,13 +106,11 @@ class DocSearchEngine:
             self.index = faiss.read_index(INDEX_FILE)
             with open(META_FILE, "rb") as f: self.metadata = pickle.load(f)
         except Exception as e:
-            st.error(f"Index load failed, starting fresh: {e}")
             self.reset_index()
         else:
             self.reset_index()
 
     def reset_index(self):
-        """Wipes the index clean"""
        d = 384
        self.index = faiss.IndexIDMap(faiss.IndexFlatIP(d))
        self.metadata = []
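Context for the constants here: 384 is the output dimension of all-MiniLM-L6-v2, and IndexFlatIP scores by inner product, which equals cosine similarity once vectors are L2-normalized (search_documents normalizes the query with faiss.normalize_L2). A quick self-contained check of that pairing; my sketch with random vectors:

import numpy as np
import faiss

d = 384  # all-MiniLM-L6-v2 embedding size
index = faiss.IndexIDMap(faiss.IndexFlatIP(d))

vecs = np.random.rand(5, d).astype("float32")
faiss.normalize_L2(vecs)  # unit-norm rows: inner product == cosine
index.add_with_ids(vecs, np.arange(5, dtype="int64"))

scores, ids = index.search(vecs[:1], 1)
print(ids[0][0], scores[0][0])  # 0 and ~1.0: a vector matches itself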
@@ -102,23 +126,17 @@ class DocSearchEngine:
 
         self.index.add_with_ids(embeddings, ids)
         self.metadata.extend(chunks)
-
         self.save()
         return len(texts)
 
     def delete_file(self, filename):
         if self.index is None or self.index.ntotal == 0: return 0
-
         new_chunks = [c for c in self.metadata if c['source'] != filename]
         removed_count = len(self.metadata) - len(new_chunks)
-
         if removed_count > 0:
             self.reset_index()
-            if new_chunks:
-                self.add_documents(new_chunks)
-            else:
-                self.save()
-
+            if new_chunks: self.add_documents(new_chunks)
+            else: self.save()
         return removed_count
 
     def save(self):
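Rebuilding the whole index on delete is the simplest correct choice here, since the metadata list is the source of truth, though it re-embeds every surviving chunk. For larger indexes, FAISS can also drop vectors in place with remove_ids on an IndexIDMap. A hedged sketch of that alternative; it assumes each chunk dict carries an 'id' field matching the id passed to add_with_ids, which the current code does not store:

import numpy as np

def delete_file_in_place(self, filename):
    # Hypothetical DocSearchEngine variant; the per-chunk "id" key is an assumption.
    doomed = np.array(
        [c["id"] for c in self.metadata if c["source"] == filename],
        dtype="int64",
    )
    if doomed.size:
        self.index.remove_ids(doomed)  # IndexIDMap supports removal by id
        self.metadata = [c for c in self.metadata if c["source"] != filename]
        self.save()
    return int(doomed.size)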
@@ -127,9 +145,7 @@ class DocSearchEngine:
 
     def search_documents(self, query, top_k=5):
         if not self.index or self.index.ntotal == 0: return []
-
         candidate_k = top_k * 10
-
         q_vec = self.bi_encoder.encode([query])
         faiss.normalize_L2(q_vec)
 
@@ -156,14 +172,12 @@ class DocSearchEngine:
             doc_map[source]["snippet"] = cand['text']
 
         ranked_docs = sorted(doc_map.items(), key=lambda item: item[1]['score'], reverse=True)
-
         final_results = []
         top_docs = ranked_docs[:top_k]
 
         if top_docs:
             pairs = [[query, doc[1]['snippet']] for doc in top_docs]
             cross_scores = self.cross_encoder.predict(pairs)
-
             for i, (source, data) in enumerate(top_docs):
                 final_results.append({
                     "source": source,
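This is a standard retrieve-then-rerank pipeline: the bi-encoder plus FAISS recalls candidate_k = top_k * 10 chunks cheaply, hits are grouped per source document, and the cross-encoder rescores one snippet per document. Cross-encoder scores are unbounded logits, which appears to be what the score > 2 confidence buckets further down are thresholding. A minimal sketch of the rerank step in isolation, with invented snippets:

from sentence_transformers import CrossEncoder

cross = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", device="cpu")

query = "FY25 retention bonuses"
snippets = [
    "Selective Retention Bonus award levels for FY25 are listed in enclosure 2.",
    "Optional outerwear may be worn with the service uniform in cold weather.",
]
scores = cross.predict([[query, s] for s in snippets])  # one logit per pair
for snippet, score in sorted(zip(snippets, scores), key=lambda p: -p[1]):
    print(f"{score:+.2f}  {snippet}")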
@@ -187,65 +201,41 @@ with st.sidebar:
         status_text = st.empty()
 
         new_chunks = []
-        failed_files = []
-        empty_files = [] # Track files with no text (Scans?)
+        failed_files = []
 
-
+        total = len(uploaded_files)
 
         for i, f in enumerate(uploaded_files):
-
-
-            progress_bar.progress((i + 1) / total_files)
+            status_text.text(f"Processing {i+1}/{total}: {f.name}...")
+            progress_bar.progress((i)/total)
 
-            #
-            txt, fname = parse_file(f)
+            # PARSE (With OCR Auto-Switch)
+            txt, fname, method = parse_file(f)
 
-
+            if method == "OCR (Slow)":
+                st.toast(f"OCR Used for {fname}", icon="⚠️")
+
             if not txt.strip():
-
+                failed_files.append(f"{fname} (Empty/Unreadable)")
                 continue
 
-            # 2. Chunk
             file_chunks = recursive_chunking(txt, fname)
-
-            if not file_chunks:
-                # Text was found, but maybe it was too short/garbage
-                empty_files.append(f"{fname} (Too short)")
-                continue
-
             new_chunks.extend(file_chunks)
 
-
+        progress_bar.progress(1.0)
+
         if new_chunks:
-            with st.spinner("Saving
-                st.session_state.engine.add_documents(new_chunks)
-                IndexManager.save_to_hub()
-
-            st.success(f"Successfully indexed {len(new_chunks)} chunks from {total_files - len(empty_files)} files!")
-
-            # REPORT ERRORS
-            if empty_files:
-                with st.expander("⚠️ Skipped Documents (No Text Found)", expanded=True):
-                    st.warning("The following files appear to be empty or scanned images (OCR required):")
-                    for ef in empty_files:
-                        st.write(f"- {ef}")
-        else:
-            st.error("No valid text found in any of the uploaded files.")
-            if empty_files:
-                st.write("Files were detected but contained no extractable text (likely scanned images).")
-        with st.spinner("Indexing..."):
-            new_chunks = []
-            for f in uploaded_files:
-                txt, fname = parse_file(f)
-                new_chunks.extend(recursive_chunking(txt, fname))
-            if new_chunks:
+            with st.spinner("Saving database..."):
                 st.session_state.engine.add_documents(new_chunks)
                 IndexManager.save_to_hub()
-
+            st.success(f"Indexed {len(new_chunks)} chunks!")
+
+        if failed_files:
+            with st.expander("Failed Files"):
+                for ff in failed_files: st.write(ff)
 
     st.divider()
     st.header("⚙️ Manage Index")
-
    if st.session_state.engine.index:
        st.write(f"**Total Chunks:** {st.session_state.engine.index.ntotal}")
        unique_files = list(set([m['source'] for m in st.session_state.engine.metadata]))
@@ -261,7 +251,6 @@ with st.sidebar:
             st.rerun()
 
     st.divider()
-    # THE NUCLEAR OPTION
     if st.button("⚠️ Wipe Entire Index", type="primary"):
         with st.spinner("Nuking database..."):
             st.session_state.engine.reset_index()
@@ -271,21 +260,14 @@ with st.sidebar:
             st.rerun()
 
 st.title("⚓ Document Finder")
-st.
-
-query = st.text_input("What are you looking for?", placeholder="e.g. 'FY25 Retention Bonuses'")
+query = st.text_input("What are you looking for?")
 
 if query:
     results = st.session_state.engine.search_documents(query, top_k=5)
-
     st.subheader("Top Relevant Documents")
-
-    if not results:
-        st.info("No documents found.")
-
+    if not results: st.info("No documents found.")
     for res in results:
         score = res['score']
-
         if score > 2:
             border_color = "#09ab3b"
             confidence = "High Match"
@@ -309,6 +291,5 @@ if query:
                 <small style="color: gray;">Confidence: {confidence} ({score:.2f})</small>
             </div>
         """, unsafe_allow_html=True)
-
         with st.expander("View matching excerpt"):
             st.markdown(f"**...{res['snippet']}...**")