Spaces:
Runtime error
Runtime error
delete docs
Browse files
app.py
CHANGED
|
@@ -1158,6 +1158,114 @@ class PDFSearchApp:
|
|
| 1158 |
else:
|
| 1159 |
return "medium"
|
| 1160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1161 |
def _optimize_consecutive_pages(self, selected, all_results, target_count=None):
|
| 1162 |
"""
|
| 1163 |
Optimize selection to include consecutive pages when beneficial
|
|
@@ -3498,14 +3606,45 @@ def create_ui():
|
|
| 3498 |
visible=True
|
| 3499 |
)
|
| 3500 |
|
| 3501 |
-
|
| 3502 |
-
|
| 3503 |
-
|
|
|
|
|
|
|
| 3504 |
|
| 3505 |
-
|
| 3506 |
-
|
| 3507 |
-
|
| 3508 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3509 |
|
| 3510 |
|
| 3511 |
# Event handlers
|
|
@@ -3522,6 +3661,46 @@ def create_ui():
|
|
| 3522 |
outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
|
| 3523 |
)
|
| 3524 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3525 |
|
| 3526 |
|
| 3527 |
|
|
|
|
| 1158 |
else:
|
| 1159 |
return "medium"
|
| 1160 |
|
| 1161 |
+
def delete_documents(self, collection_name=None):
    """
    Delete documents and their associated collections from the system

    For every targeted collection this drops the Milvus collection, removes
    the collection's page-image directory under the base output directory,
    and removes the entry from ``self.indexed_docs``. The Milvus drop and the
    directory removal are best-effort: a failure in either is printed as a
    warning and the collection is still removed from ``self.indexed_docs``.

    Args:
        collection_name: Name of the collection to delete. If None (or any
            other falsy value), deletes all collections.

    Returns:
        Status message about the deletion operation. Unexpected errors are
        caught and reported in the returned message rather than raised.
    """
    # NOTE(review): the success and warning markers below were restored to
    # U+2705 and U+26A0 U+FE0F. The other non-ASCII glyphs in these messages
    # are kept exactly as they appear, because the intended emoji cannot be
    # determined from the text alone - confirm against the real file.

    def purge(coll_name):
        """Tear down one collection's stored data; return descriptions of removed paths."""
        removed = []

        # Delete from Milvus (best-effort: a failure only produces a warning)
        try:
            Middleware(coll_name, create_collection=False).drop_collection()
            print(f"✅ Dropped Milvus collection: {coll_name}")
        except Exception as e:
            print(f"⚠️ Warning: Could not drop Milvus collection {coll_name}: {e}")

        # Delete page images (best-effort as well)
        try:
            base_output_dir = self._ensure_base_directory()
            collection_dir = os.path.join(base_output_dir, coll_name)
            if os.path.exists(collection_dir):
                shutil.rmtree(collection_dir)
                print(f"✅ Deleted page images directory: {collection_dir}")
                removed.append(f"Page images: {collection_dir}")
        except Exception as e:
            print(f"⚠️ Warning: Could not delete page images for {coll_name}: {e}")

        return removed

    try:
        print("ποΈ DELETE DOCUMENTS REQUESTED")
        print(f"π Collection to delete: {collection_name if collection_name else 'ALL COLLECTIONS'}")

        if not self.indexed_docs:
            return "β No documents found to delete. Please upload some documents first."

        if collection_name:
            # Delete specific collection
            if collection_name not in self.indexed_docs:
                return f"β Collection '{collection_name}' not found. Available collections: {list(self.indexed_docs.keys())}"

            deleted_files = purge(collection_name)

            # Remove from indexed_docs
            del self.indexed_docs[collection_name]

            return f"✅ Successfully deleted collection '{collection_name}'\nπ Deleted: {len(deleted_files)} file/directory items"

        # Delete all collections. Snapshot the names first so the registry
        # can be cleared after the loop without mutating it while iterating.
        deleted_collections = list(self.indexed_docs.keys())
        deleted_files = []
        for coll_name in deleted_collections:
            deleted_files.extend(purge(coll_name))

        # Clear all indexed docs
        self.indexed_docs.clear()

        return f"✅ Successfully deleted ALL collections ({len(deleted_collections)} total)\nπ Deleted: {len(deleted_files)} file/directory items\nποΈ Collections deleted: {', '.join(deleted_collections)}"

    except Exception as e:
        error_msg = f"β Error during document deletion: {str(e)}"
        print(error_msg)
        print(f"β Traceback: {traceback.format_exc()}")
        return error_msg
|
| 1246 |
+
|
| 1247 |
+
def get_available_collections(self):
    """
    Summarise the indexed collections as display text

    Returns:
        A newline-joined string with one heading line per collection in
        ``self.indexed_docs``, followed by file-count and page-count lines
        when the collection's entry is a dict carrying 'files' / 'pages',
        and a blank separator line. A fixed notice is returned when nothing
        is indexed.
    """
    docs = self.indexed_docs
    if not docs:
        return "No collections available for deletion."

    lines = []
    for name, info in docs.items():
        lines.append(f"π {name}")
        # Entries that are not dicts contribute only their heading line
        details = info if isinstance(info, dict) else {}
        if 'files' in details:
            lines.append(f" π Files: {len(details['files'])}")
        if 'pages' in details:
            lines.append(f" π Pages: {details['pages']}")
        lines.append("")

    return "\n".join(lines)
|
| 1268 |
+
|
| 1269 |
def _optimize_consecutive_pages(self, selected, all_results, target_count=None):
|
| 1270 |
"""
|
| 1271 |
Optimize selection to include consecutive pages when beneficial
|
|
|
|
| 3606 |
visible=True
|
| 3607 |
)
|
| 3608 |
|
| 3609 |
+
# Delete Documents Tab
|
| 3610 |
+
with gr.Tab("ποΈ Delete Documents"):
|
| 3611 |
+
with gr.Column():
|
| 3612 |
+
gr.Markdown("### Delete Document Collections")
|
| 3613 |
+
gr.Markdown("β οΈ **Warning**: This will permanently delete documents and their associated data from the system.")
|
| 3614 |
|
| 3615 |
+
# Show available collections
|
| 3616 |
+
gr.Markdown("#### Available Collections")
|
| 3617 |
+
collections_display = gr.Textbox(
|
| 3618 |
+
label="Current Collections",
|
| 3619 |
+
interactive=False,
|
| 3620 |
+
lines=8,
|
| 3621 |
+
value="No collections available. Upload some documents first."
|
| 3622 |
+
)
|
| 3623 |
+
|
| 3624 |
+
# Collection selection
|
| 3625 |
+
collection_dropdown = gr.Dropdown(
|
| 3626 |
+
label="Select Collection to Delete",
|
| 3627 |
+
choices=[],
|
| 3628 |
+
value=None,
|
| 3629 |
+
allow_custom_value=True,
|
| 3630 |
+
info="Select a specific collection to delete, or leave empty to delete all collections"
|
| 3631 |
+
)
|
| 3632 |
+
|
| 3633 |
+
# Delete options
|
| 3634 |
+
with gr.Row():
|
| 3635 |
+
delete_specific_btn = gr.Button("ποΈ Delete Selected Collection", variant="secondary")
|
| 3636 |
+
delete_all_btn = gr.Button("ποΈ Delete ALL Collections", variant="stop")
|
| 3637 |
+
|
| 3638 |
+
# Status output
|
| 3639 |
+
delete_status = gr.Textbox(
|
| 3640 |
+
label="Deletion Status",
|
| 3641 |
+
interactive=False,
|
| 3642 |
+
lines=6
|
| 3643 |
+
)
|
| 3644 |
+
|
| 3645 |
+
# Refresh button
|
| 3646 |
+
refresh_collections_btn = gr.Button("π Refresh Collections List", variant="secondary")
|
| 3647 |
+
|
| 3648 |
|
| 3649 |
|
| 3650 |
# Event handlers
|
|
|
|
| 3661 |
outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
|
| 3662 |
)
|
| 3663 |
|
| 3664 |
+
# Delete events
|
| 3665 |
+
def refresh_collections():
    """Rebuild the collections summary text and the dropdown's choices from the app's current state"""
    summary = app.get_available_collections()
    docs = app.indexed_docs
    names = list(docs.keys()) if docs else []
    # Two values are returned, one per output component wired to this handler
    return summary, gr.Dropdown(choices=names)
|
| 3670 |
+
|
| 3671 |
+
def delete_specific_collection(collection_name):
    """Delete the named collection, or return a prompt when no name was supplied"""
    # A missing value and a whitespace-only value are both treated as "nothing selected"
    chosen = collection_name.strip() if collection_name else ""
    if chosen == "":
        return "β Please select a collection to delete."
    return app.delete_documents(chosen)
|
| 3676 |
+
|
| 3677 |
+
def delete_all_collections():
    """Remove every collection by calling the app's delete routine with no collection name"""
    status = app.delete_documents()
    return status
|
| 3680 |
+
|
| 3681 |
+
# Delete event handlers
|
| 3682 |
+
refresh_collections_btn.click(
|
| 3683 |
+
fn=refresh_collections,
|
| 3684 |
+
outputs=[collections_display, collection_dropdown]
|
| 3685 |
+
)
|
| 3686 |
+
|
| 3687 |
+
delete_specific_btn.click(
|
| 3688 |
+
fn=delete_specific_collection,
|
| 3689 |
+
inputs=[collection_dropdown],
|
| 3690 |
+
outputs=[delete_status]
|
| 3691 |
+
)
|
| 3692 |
+
|
| 3693 |
+
delete_all_btn.click(
|
| 3694 |
+
fn=delete_all_collections,
|
| 3695 |
+
outputs=[delete_status]
|
| 3696 |
+
)
|
| 3697 |
+
|
| 3698 |
+
# Initialize collections on page load
|
| 3699 |
+
demo.load(
|
| 3700 |
+
fn=refresh_collections,
|
| 3701 |
+
outputs=[collections_display, collection_dropdown]
|
| 3702 |
+
)
|
| 3703 |
+
|
| 3704 |
|
| 3705 |
|
| 3706 |
|