Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -183,6 +183,56 @@ with st.sidebar:
|
|
| 183 |
st.header("🗄️ Upload Documents")
|
| 184 |
uploaded_files = st.file_uploader("Upload Files", accept_multiple_files=True)
|
| 185 |
if uploaded_files and st.button("Index"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
with st.spinner("Indexing..."):
|
| 187 |
new_chunks = []
|
| 188 |
for f in uploaded_files:
|
|
|
|
| 183 |
st.header("🗄️ Upload Documents")
|
| 184 |
uploaded_files = st.file_uploader("Upload Files", accept_multiple_files=True)
|
| 185 |
if uploaded_files and st.button("Index"):
|
| 186 |
+
progress_bar = st.progress(0)
|
| 187 |
+
status_text = st.empty()
|
| 188 |
+
|
| 189 |
+
new_chunks = []
|
| 190 |
+
failed_files = [] # Track crashes
|
| 191 |
+
empty_files = [] # Track files with no text (Scans?)
|
| 192 |
+
|
| 193 |
+
total_files = len(uploaded_files)
|
| 194 |
+
|
| 195 |
+
for i, f in enumerate(uploaded_files):
|
| 196 |
+
# Update Status
|
| 197 |
+
status_text.text(f"Processing {i+1}/{total_files}: {f.name}")
|
| 198 |
+
progress_bar.progress((i + 1) / total_files)
|
| 199 |
+
|
| 200 |
+
# 1. Parse
|
| 201 |
+
txt, fname = parse_file(f)
|
| 202 |
+
|
| 203 |
+
# Check if text extraction failed (likely a scanned PDF)
|
| 204 |
+
if not txt.strip():
|
| 205 |
+
empty_files.append(fname)
|
| 206 |
+
continue
|
| 207 |
+
|
| 208 |
+
# 2. Chunk
|
| 209 |
+
file_chunks = recursive_chunking(txt, fname)
|
| 210 |
+
|
| 211 |
+
if not file_chunks:
|
| 212 |
+
# Text was found, but maybe it was too short/garbage
|
| 213 |
+
empty_files.append(f"{fname} (Too short)")
|
| 214 |
+
continue
|
| 215 |
+
|
| 216 |
+
new_chunks.extend(file_chunks)
|
| 217 |
+
|
| 218 |
+
# 3. Save & Report
|
| 219 |
+
if new_chunks:
|
| 220 |
+
with st.spinner("Saving to database..."):
|
| 221 |
+
st.session_state.engine.add_documents(new_chunks)
|
| 222 |
+
IndexManager.save_to_hub()
|
| 223 |
+
|
| 224 |
+
st.success(f"Successfully indexed {len(new_chunks)} chunks from {total_files - len(empty_files)} files!")
|
| 225 |
+
|
| 226 |
+
# REPORT ERRORS
|
| 227 |
+
if empty_files:
|
| 228 |
+
with st.expander("⚠️ Skipped Documents (No Text Found)", expanded=True):
|
| 229 |
+
st.warning("The following files appear to be empty or scanned images (OCR required):")
|
| 230 |
+
for ef in empty_files:
|
| 231 |
+
st.write(f"- {ef}")
|
| 232 |
+
else:
|
| 233 |
+
st.error("No valid text found in any of the uploaded files.")
|
| 234 |
+
if empty_files:
|
| 235 |
+
st.write("Files were detected but contained no extractable text (likely scanned images).")
|
| 236 |
with st.spinner("Indexing..."):
|
| 237 |
new_chunks = []
|
| 238 |
for f in uploaded_files:
|