NavyDevilDoc committed on
Commit
c6eeec6
·
verified ·
1 Parent(s): 39f39ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -0
app.py CHANGED
@@ -183,6 +183,56 @@ with st.sidebar:
183
  st.header("🗄️ Upload Documents")
184
  uploaded_files = st.file_uploader("Upload Files", accept_multiple_files=True)
185
  if uploaded_files and st.button("Index"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  with st.spinner("Indexing..."):
187
  new_chunks = []
188
  for f in uploaded_files:
 
183
  st.header("🗄️ Upload Documents")
184
  uploaded_files = st.file_uploader("Upload Files", accept_multiple_files=True)
185
  if uploaded_files and st.button("Index"):
186
+ progress_bar = st.progress(0)
187
+ status_text = st.empty()
188
+
189
+ new_chunks = []
190
+ failed_files = [] # Track crashes
191
+ empty_files = [] # Track files with no text (Scans?)
192
+
193
+ total_files = len(uploaded_files)
194
+
195
+ for i, f in enumerate(uploaded_files):
196
+ # Update Status
197
+ status_text.text(f"Processing {i+1}/{total_files}: {f.name}")
198
+ progress_bar.progress((i + 1) / total_files)
199
+
200
+ # 1. Parse
201
+ txt, fname = parse_file(f)
202
+
203
+ # Check if text extraction failed (likely a scanned PDF)
204
+ if not txt.strip():
205
+ empty_files.append(fname)
206
+ continue
207
+
208
+ # 2. Chunk
209
+ file_chunks = recursive_chunking(txt, fname)
210
+
211
+ if not file_chunks:
212
+ # Text was found, but maybe it was too short/garbage
213
+ empty_files.append(f"{fname} (Too short)")
214
+ continue
215
+
216
+ new_chunks.extend(file_chunks)
217
+
218
+ # 3. Save & Report
219
+ if new_chunks:
220
+ with st.spinner("Saving to database..."):
221
+ st.session_state.engine.add_documents(new_chunks)
222
+ IndexManager.save_to_hub()
223
+
224
+ st.success(f"Successfully indexed {len(new_chunks)} chunks from {total_files - len(empty_files)} files!")
225
+
226
+ # REPORT ERRORS
227
+ if empty_files:
228
+ with st.expander("⚠️ Skipped Documents (No Text Found)", expanded=True):
229
+ st.warning("The following files appear to be empty or scanned images (OCR required):")
230
+ for ef in empty_files:
231
+ st.write(f"- {ef}")
232
+ else:
233
+ st.error("No valid text found in any of the uploaded files.")
234
+ if empty_files:
235
+ st.write("Files were detected but contained no extractable text (likely scanned images).")
236
  with st.spinner("Indexing..."):
237
  new_chunks = []
238
  for f in uploaded_files: