Spaces:

NavyDevilDoc
/

Semantic_Search

Sleeping

App Files Files Community

NavyDevilDoc committed on Dec 16, 2025

Commit

c6eeec6

verified ·

1 Parent(s): 39f39ce

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -0

app.py CHANGED Viewed

@@ -183,6 +183,56 @@ with st.sidebar:
     st.header("🗄️ Upload Documents")
     uploaded_files = st.file_uploader("Upload Files", accept_multiple_files=True)
     if uploaded_files and st.button("Index"):
         with st.spinner("Indexing..."):
             new_chunks = []
             for f in uploaded_files:

     st.header("🗄️ Upload Documents")
     uploaded_files = st.file_uploader("Upload Files", accept_multiple_files=True)
     if uploaded_files and st.button("Index"):
+        # Index every uploaded file while recording a per-file outcome, so a
+        # single bad document cannot abort the rest of the batch.
+        progress_bar = st.progress(0)
+        status_text = st.empty()
+        new_chunks = []
+        failed_files = []   # Files whose parsing/chunking raised an exception
+        empty_files = []    # Files with no usable text (Scans?)
+        total_files = len(uploaded_files)
+        for i, f in enumerate(uploaded_files):
+            # Update Status
+            status_text.text(f"Processing {i+1}/{total_files}: {f.name}")
+            progress_bar.progress((i + 1) / total_files)
+            try:
+                # 1. Parse
+                txt, fname = parse_file(f)
+                # No text extracted (likely a scanned PDF). The `not txt`
+                # guard also covers a parser that hands back None.
+                if not txt or not txt.strip():
+                    empty_files.append(fname)
+                    continue
+                # 2. Chunk
+                file_chunks = recursive_chunking(txt, fname)
+            except Exception as exc:
+                # UI boundary: record the failure and keep going so the
+                # remaining uploads are still indexed.
+                failed_files.append(f"{f.name}: {exc}")
+                continue
+            if not file_chunks:
+                # Text was found, but maybe it was too short/garbage
+                empty_files.append(f"{fname} (Too short)")
+                continue
+            new_chunks.extend(file_chunks)
+        # 3. Save & Report
+        if new_chunks:
+            with st.spinner("Saving to database..."):
+                st.session_state.engine.add_documents(new_chunks)
+                IndexManager.save_to_hub()
+            # Each skipped or failed file is recorded exactly once, so the
+            # remainder is the number of files that contributed chunks.
+            indexed_files = total_files - len(empty_files) - len(failed_files)
+            st.success(f"Successfully indexed {len(new_chunks)} chunks from {indexed_files} files!")
+            # REPORT ERRORS
+            if empty_files:
+                with st.expander("⚠️ Skipped Documents (No Text Found)", expanded=True):
+                    st.warning("The following files appear to be empty or scanned images (OCR required):")
+                    for ef in empty_files:
+                        st.write(f"- {ef}")
+        else:
+            st.error("No valid text found in any of the uploaded files.")
+            if empty_files:
+                st.write("Files were detected but contained no extractable text (likely scanned images).")
+        # Surface processing failures regardless of whether anything was saved.
+        if failed_files:
+            with st.expander("❌ Failed Documents (Processing Error)", expanded=True):
+                st.warning("The following files could not be processed:")
+                for ff in failed_files:
+                    st.write(f"- {ff}")
+        # NOTE(review): the pre-existing `with st.spinner("Indexing...")` loop
+        # still follows this block - confirm it does not index the same
+        # uploads a second time.
         with st.spinner("Indexing..."):
             new_chunks = []
             for f in uploaded_files: