NavyDevilDoc committed on
Commit
e1daca2
·
verified ·
1 Parent(s): 5729a49

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -72
app.py CHANGED
@@ -9,6 +9,9 @@ from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError
9
  import pypdf
10
  import docx
11
  import time
 
 
 
12
 
13
  # --- CONFIGURATION ---
14
  DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
@@ -39,22 +42,45 @@ class IndexManager:
39
  st.toast("Database Synced!", icon="☁️")
40
  except Exception as e: st.error(f"Sync Error: {e}")
41
 
42
- # --- PARSING & CHUNKING ---
43
  def parse_file(uploaded_file):
44
  text = ""
45
  filename = uploaded_file.name
 
 
46
  try:
47
  if filename.endswith(".pdf"):
 
 
48
  reader = pypdf.PdfReader(uploaded_file)
 
49
  for i, page in enumerate(reader.pages):
50
- if page.extract_text(): text += f"\n[PAGE {i+1}] {page.extract_text()}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  elif filename.endswith(".docx"):
52
  doc = docx.Document(uploaded_file)
53
  text = "\n".join([para.text for para in doc.paragraphs])
54
  elif filename.endswith(".txt"):
55
  text = uploaded_file.read().decode("utf-8")
56
- except: pass
57
- return text, filename
 
 
 
58
 
59
  def recursive_chunking(text, source, chunk_size=500, overlap=100):
60
  words = text.split()
@@ -68,7 +94,7 @@ def recursive_chunking(text, source, chunk_size=500, overlap=100):
68
  # --- CORE SEARCH ENGINE ---
69
  class DocSearchEngine:
70
  def __init__(self):
71
- # Force CPU to avoid Docker memory issues
72
  self.bi_encoder = SentenceTransformer('all-MiniLM-L6-v2', device="cpu")
73
  self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device="cpu", automodel_args={"low_cpu_mem_usage": False})
74
 
@@ -80,13 +106,11 @@ class DocSearchEngine:
80
  self.index = faiss.read_index(INDEX_FILE)
81
  with open(META_FILE, "rb") as f: self.metadata = pickle.load(f)
82
  except Exception as e:
83
- st.error(f"Index load failed, starting fresh: {e}")
84
  self.reset_index()
85
  else:
86
  self.reset_index()
87
 
88
  def reset_index(self):
89
- """Wipes the index clean"""
90
  d = 384
91
  self.index = faiss.IndexIDMap(faiss.IndexFlatIP(d))
92
  self.metadata = []
@@ -102,23 +126,17 @@ class DocSearchEngine:
102
 
103
  self.index.add_with_ids(embeddings, ids)
104
  self.metadata.extend(chunks)
105
-
106
  self.save()
107
  return len(texts)
108
 
109
  def delete_file(self, filename):
110
  if self.index is None or self.index.ntotal == 0: return 0
111
-
112
  new_chunks = [c for c in self.metadata if c['source'] != filename]
113
  removed_count = len(self.metadata) - len(new_chunks)
114
-
115
  if removed_count > 0:
116
  self.reset_index()
117
- if new_chunks:
118
- self.add_documents(new_chunks)
119
- else:
120
- self.save()
121
-
122
  return removed_count
123
 
124
  def save(self):
@@ -127,9 +145,7 @@ class DocSearchEngine:
127
 
128
  def search_documents(self, query, top_k=5):
129
  if not self.index or self.index.ntotal == 0: return []
130
-
131
  candidate_k = top_k * 10
132
-
133
  q_vec = self.bi_encoder.encode([query])
134
  faiss.normalize_L2(q_vec)
135
 
@@ -156,14 +172,12 @@ class DocSearchEngine:
156
  doc_map[source]["snippet"] = cand['text']
157
 
158
  ranked_docs = sorted(doc_map.items(), key=lambda item: item[1]['score'], reverse=True)
159
-
160
  final_results = []
161
  top_docs = ranked_docs[:top_k]
162
 
163
  if top_docs:
164
  pairs = [[query, doc[1]['snippet']] for doc in top_docs]
165
  cross_scores = self.cross_encoder.predict(pairs)
166
-
167
  for i, (source, data) in enumerate(top_docs):
168
  final_results.append({
169
  "source": source,
@@ -187,65 +201,41 @@ with st.sidebar:
187
  status_text = st.empty()
188
 
189
  new_chunks = []
190
- failed_files = [] # Track crashes
191
- empty_files = [] # Track files with no text (Scans?)
192
 
193
- total_files = len(uploaded_files)
194
 
195
  for i, f in enumerate(uploaded_files):
196
- # Update Status
197
- status_text.text(f"Processing {i+1}/{total_files}: {f.name}")
198
- progress_bar.progress((i + 1) / total_files)
199
 
200
- # 1. Parse
201
- txt, fname = parse_file(f)
202
 
203
- # Check if text extraction failed (likely a scanned PDF)
 
 
204
  if not txt.strip():
205
- empty_files.append(fname)
206
  continue
207
 
208
- # 2. Chunk
209
  file_chunks = recursive_chunking(txt, fname)
210
-
211
- if not file_chunks:
212
- # Text was found, but maybe it was too short/garbage
213
- empty_files.append(f"{fname} (Too short)")
214
- continue
215
-
216
  new_chunks.extend(file_chunks)
217
 
218
- # 3. Save & Report
 
219
  if new_chunks:
220
- with st.spinner("Saving to database..."):
221
- st.session_state.engine.add_documents(new_chunks)
222
- IndexManager.save_to_hub()
223
-
224
- st.success(f"Successfully indexed {len(new_chunks)} chunks from {total_files - len(empty_files)} files!")
225
-
226
- # REPORT ERRORS
227
- if empty_files:
228
- with st.expander("⚠️ Skipped Documents (No Text Found)", expanded=True):
229
- st.warning("The following files appear to be empty or scanned images (OCR required):")
230
- for ef in empty_files:
231
- st.write(f"- {ef}")
232
- else:
233
- st.error("No valid text found in any of the uploaded files.")
234
- if empty_files:
235
- st.write("Files were detected but contained no extractable text (likely scanned images).")
236
- with st.spinner("Indexing..."):
237
- new_chunks = []
238
- for f in uploaded_files:
239
- txt, fname = parse_file(f)
240
- new_chunks.extend(recursive_chunking(txt, fname))
241
- if new_chunks:
242
  st.session_state.engine.add_documents(new_chunks)
243
  IndexManager.save_to_hub()
244
- st.success("Indexed!")
 
 
 
 
245
 
246
  st.divider()
247
  st.header("⚙️ Manage Index")
248
-
249
  if st.session_state.engine.index:
250
  st.write(f"**Total Chunks:** {st.session_state.engine.index.ntotal}")
251
  unique_files = list(set([m['source'] for m in st.session_state.engine.metadata]))
@@ -261,7 +251,6 @@ with st.sidebar:
261
  st.rerun()
262
 
263
  st.divider()
264
- # THE NUCLEAR OPTION
265
  if st.button("⚠️ Wipe Entire Index", type="primary"):
266
  with st.spinner("Nuking database..."):
267
  st.session_state.engine.reset_index()
@@ -271,21 +260,14 @@ with st.sidebar:
271
  st.rerun()
272
 
273
  st.title("⚓ Document Finder")
274
- st.caption("Locates the specific Instruction or NAVADMIN relevant to your query.")
275
-
276
- query = st.text_input("What are you looking for?", placeholder="e.g. 'FY25 Retention Bonuses'")
277
 
278
  if query:
279
  results = st.session_state.engine.search_documents(query, top_k=5)
280
-
281
  st.subheader("Top Relevant Documents")
282
-
283
- if not results:
284
- st.info("No documents found.")
285
-
286
  for res in results:
287
  score = res['score']
288
-
289
  if score > 2:
290
  border_color = "#09ab3b"
291
  confidence = "High Match"
@@ -309,6 +291,5 @@ if query:
309
  <small style="color: gray;">Confidence: {confidence} ({score:.2f})</small>
310
  </div>
311
  """, unsafe_allow_html=True)
312
-
313
  with st.expander("View matching excerpt"):
314
  st.markdown(f"**...{res['snippet']}...**")
 
9
  import pypdf
10
  import docx
11
  import time
12
+ from pdf2image import convert_from_bytes
13
+ import pytesseract
14
+ from PIL import Image
15
 
16
  # --- CONFIGURATION ---
17
  DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
 
42
  st.toast("Database Synced!", icon="☁️")
43
  except Exception as e: st.error(f"Sync Error: {e}")
44
 
45
+ # --- PARSING LOGIC (OCR ENABLED) ---
46
  def parse_file(uploaded_file):
47
  text = ""
48
  filename = uploaded_file.name
49
+ method = "Fast"
50
+
51
  try:
52
  if filename.endswith(".pdf"):
53
+ # Method 1: Fast Text Extraction
54
+ pdf_bytes = uploaded_file.getvalue()
55
  reader = pypdf.PdfReader(uploaded_file)
56
+
57
  for i, page in enumerate(reader.pages):
58
+ extracted = page.extract_text()
59
+ if extracted:
60
+ text += f"\n[PAGE {i+1}] {extracted}"
61
+
62
+ # Method 2: OCR Fallback
63
+ # If fast method yielded almost no text, switch to OCR
64
+ if len(text.strip()) < 50:
65
+ method = "OCR (Slow)"
66
+ # Reset file pointer or use bytes
67
+ images = convert_from_bytes(pdf_bytes)
68
+ text = "" # Reset text
69
+ for i, img in enumerate(images):
70
+ # Tesseract reads the image
71
+ page_text = pytesseract.image_to_string(img)
72
+ text += f"\n[PAGE {i+1}] {page_text}"
73
+
74
  elif filename.endswith(".docx"):
75
  doc = docx.Document(uploaded_file)
76
  text = "\n".join([para.text for para in doc.paragraphs])
77
  elif filename.endswith(".txt"):
78
  text = uploaded_file.read().decode("utf-8")
79
+
80
+ except Exception as e:
81
+ return "", filename, f"Error: {str(e)}"
82
+
83
+ return text, filename, method
84
 
85
  def recursive_chunking(text, source, chunk_size=500, overlap=100):
86
  words = text.split()
 
94
  # --- CORE SEARCH ENGINE ---
95
  class DocSearchEngine:
96
  def __init__(self):
97
+ # Force CPU
98
  self.bi_encoder = SentenceTransformer('all-MiniLM-L6-v2', device="cpu")
99
  self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device="cpu", automodel_args={"low_cpu_mem_usage": False})
100
 
 
106
  self.index = faiss.read_index(INDEX_FILE)
107
  with open(META_FILE, "rb") as f: self.metadata = pickle.load(f)
108
  except Exception as e:
 
109
  self.reset_index()
110
  else:
111
  self.reset_index()
112
 
113
  def reset_index(self):
 
114
  d = 384
115
  self.index = faiss.IndexIDMap(faiss.IndexFlatIP(d))
116
  self.metadata = []
 
126
 
127
  self.index.add_with_ids(embeddings, ids)
128
  self.metadata.extend(chunks)
 
129
  self.save()
130
  return len(texts)
131
 
132
  def delete_file(self, filename):
133
  if self.index is None or self.index.ntotal == 0: return 0
 
134
  new_chunks = [c for c in self.metadata if c['source'] != filename]
135
  removed_count = len(self.metadata) - len(new_chunks)
 
136
  if removed_count > 0:
137
  self.reset_index()
138
+ if new_chunks: self.add_documents(new_chunks)
139
+ else: self.save()
 
 
 
140
  return removed_count
141
 
142
  def save(self):
 
145
 
146
  def search_documents(self, query, top_k=5):
147
  if not self.index or self.index.ntotal == 0: return []
 
148
  candidate_k = top_k * 10
 
149
  q_vec = self.bi_encoder.encode([query])
150
  faiss.normalize_L2(q_vec)
151
 
 
172
  doc_map[source]["snippet"] = cand['text']
173
 
174
  ranked_docs = sorted(doc_map.items(), key=lambda item: item[1]['score'], reverse=True)
 
175
  final_results = []
176
  top_docs = ranked_docs[:top_k]
177
 
178
  if top_docs:
179
  pairs = [[query, doc[1]['snippet']] for doc in top_docs]
180
  cross_scores = self.cross_encoder.predict(pairs)
 
181
  for i, (source, data) in enumerate(top_docs):
182
  final_results.append({
183
  "source": source,
 
201
  status_text = st.empty()
202
 
203
  new_chunks = []
204
+ failed_files = []
 
205
 
206
+ total = len(uploaded_files)
207
 
208
  for i, f in enumerate(uploaded_files):
209
+ status_text.text(f"Processing {i+1}/{total}: {f.name}...")
210
+ progress_bar.progress((i)/total)
 
211
 
212
+ # PARSE (With OCR Auto-Switch)
213
+ txt, fname, method = parse_file(f)
214
 
215
+ if method == "OCR (Slow)":
216
+ st.toast(f"OCR Used for {fname}", icon="⚠️")
217
+
218
  if not txt.strip():
219
+ failed_files.append(f"{fname} (Empty/Unreadable)")
220
  continue
221
 
 
222
  file_chunks = recursive_chunking(txt, fname)
 
 
 
 
 
 
223
  new_chunks.extend(file_chunks)
224
 
225
+ progress_bar.progress(1.0)
226
+
227
  if new_chunks:
228
+ with st.spinner("Saving database..."):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  st.session_state.engine.add_documents(new_chunks)
230
  IndexManager.save_to_hub()
231
+ st.success(f"Indexed {len(new_chunks)} chunks!")
232
+
233
+ if failed_files:
234
+ with st.expander("Failed Files"):
235
+ for ff in failed_files: st.write(ff)
236
 
237
  st.divider()
238
  st.header("⚙️ Manage Index")
 
239
  if st.session_state.engine.index:
240
  st.write(f"**Total Chunks:** {st.session_state.engine.index.ntotal}")
241
  unique_files = list(set([m['source'] for m in st.session_state.engine.metadata]))
 
251
  st.rerun()
252
 
253
  st.divider()
 
254
  if st.button("⚠️ Wipe Entire Index", type="primary"):
255
  with st.spinner("Nuking database..."):
256
  st.session_state.engine.reset_index()
 
260
  st.rerun()
261
 
262
  st.title("⚓ Document Finder")
263
+ query = st.text_input("What are you looking for?")
 
 
264
 
265
  if query:
266
  results = st.session_state.engine.search_documents(query, top_k=5)
 
267
  st.subheader("Top Relevant Documents")
268
+ if not results: st.info("No documents found.")
 
 
 
269
  for res in results:
270
  score = res['score']
 
271
  if score > 2:
272
  border_color = "#09ab3b"
273
  confidence = "High Match"
 
291
  <small style="color: gray;">Confidence: {confidence} ({score:.2f})</small>
292
  </div>
293
  """, unsafe_allow_html=True)
 
294
  with st.expander("View matching excerpt"):
295
  st.markdown(f"**...{res['snippet']}...**")