Spaces:
Running
Running
Merge pull request #302 from saurabhhhcodes/backend/soft-delete-documents-285
Browse files- backend/app/database.py +2 -0
- backend/app/models.py +2 -1
- backend/app/routes/chat.py +3 -0
- backend/app/routes/documents.py +26 -32
- backend/tests/test_documents.py +15 -5
backend/app/database.py
CHANGED
|
@@ -75,6 +75,8 @@ def _migrate_schema():
|
|
| 75 |
existing_docs_columns = {c["name"] for c in inspector.get_columns("documents")}
|
| 76 |
docs_migrations = [
|
| 77 |
("documents", "last_accessed_at", "ALTER TABLE documents ADD COLUMN last_accessed_at TIMESTAMP"),
|
|
|
|
|
|
|
| 78 |
]
|
| 79 |
for table, column, ddl in docs_migrations:
|
| 80 |
if column not in existing_docs_columns:
|
|
|
|
| 75 |
existing_docs_columns = {c["name"] for c in inspector.get_columns("documents")}
|
| 76 |
docs_migrations = [
|
| 77 |
("documents", "last_accessed_at", "ALTER TABLE documents ADD COLUMN last_accessed_at TIMESTAMP"),
|
| 78 |
+
("documents", "is_deleted", "ALTER TABLE documents ADD COLUMN is_deleted BOOLEAN DEFAULT FALSE NOT NULL"),
|
| 79 |
+
("documents", "deleted_at", "ALTER TABLE documents ADD COLUMN deleted_at TIMESTAMP"),
|
| 80 |
]
|
| 81 |
for table, column, ddl in docs_migrations:
|
| 82 |
if column not in existing_docs_columns:
|
backend/app/models.py
CHANGED
|
@@ -182,6 +182,8 @@ class Document(Base):
|
|
| 182 |
drive_file_id = Column(String(255), unique=True, nullable=True, index=True)
|
| 183 |
drive_folder_id = Column(String(255), nullable=True, index=True)
|
| 184 |
drive_synced_at = Column(DateTime, nullable=True)
|
|
|
|
|
|
|
| 185 |
|
| 186 |
# Relationships
|
| 187 |
owner = relationship("User", back_populates="documents")
|
|
@@ -238,4 +240,3 @@ class SharedMessage(Base):
|
|
| 238 |
|
| 239 |
# Relationships
|
| 240 |
message = relationship("ChatMessage", back_populates="shared_message")
|
| 241 |
-
|
|
|
|
| 182 |
drive_file_id = Column(String(255), unique=True, nullable=True, index=True)
|
| 183 |
drive_folder_id = Column(String(255), nullable=True, index=True)
|
| 184 |
drive_synced_at = Column(DateTime, nullable=True)
|
| 185 |
+
is_deleted = Column(Boolean, default=False, nullable=False, index=True)
|
| 186 |
+
deleted_at = Column(DateTime, nullable=True)
|
| 187 |
|
| 188 |
# Relationships
|
| 189 |
owner = relationship("User", back_populates="documents")
|
|
|
|
| 240 |
|
| 241 |
# Relationships
|
| 242 |
message = relationship("ChatMessage", back_populates="shared_message")
|
|
|
backend/app/routes/chat.py
CHANGED
|
@@ -236,6 +236,7 @@ def ask_question(
|
|
| 236 |
doc = db.query(Document).filter(
|
| 237 |
Document.id == payload.document_id,
|
| 238 |
Document.user_id == user.id,
|
|
|
|
| 239 |
).first()
|
| 240 |
|
| 241 |
if not doc:
|
|
@@ -296,6 +297,7 @@ def ask_question_stream(
|
|
| 296 |
doc = db.query(Document).filter(
|
| 297 |
Document.id == payload.document_id,
|
| 298 |
Document.user_id == user.id,
|
|
|
|
| 299 |
).first()
|
| 300 |
|
| 301 |
if not doc:
|
|
@@ -437,6 +439,7 @@ def export_chat_history(
|
|
| 437 |
doc = db.query(Document).filter(
|
| 438 |
Document.id == document_id,
|
| 439 |
Document.user_id == resolved_user.id,
|
|
|
|
| 440 |
).first()
|
| 441 |
|
| 442 |
if not doc:
|
|
|
|
| 236 |
doc = db.query(Document).filter(
|
| 237 |
Document.id == payload.document_id,
|
| 238 |
Document.user_id == user.id,
|
| 239 |
+
Document.is_deleted.is_(False),
|
| 240 |
).first()
|
| 241 |
|
| 242 |
if not doc:
|
|
|
|
| 297 |
doc = db.query(Document).filter(
|
| 298 |
Document.id == payload.document_id,
|
| 299 |
Document.user_id == user.id,
|
| 300 |
+
Document.is_deleted.is_(False),
|
| 301 |
).first()
|
| 302 |
|
| 303 |
if not doc:
|
|
|
|
| 439 |
doc = db.query(Document).filter(
|
| 440 |
Document.id == document_id,
|
| 441 |
Document.user_id == resolved_user.id,
|
| 442 |
+
Document.is_deleted.is_(False),
|
| 443 |
).first()
|
| 444 |
|
| 445 |
if not doc:
|
backend/app/routes/documents.py
CHANGED
|
@@ -8,6 +8,7 @@ import uuid
|
|
| 8 |
import logging
|
| 9 |
import asyncio
|
| 10 |
import concurrent.futures
|
|
|
|
| 11 |
from typing import Optional
|
| 12 |
from pathlib import Path
|
| 13 |
import shutil
|
|
@@ -23,7 +24,7 @@ from app.schemas import DocumentResponse, DocumentListResponse, DocumentStatusRe
|
|
| 23 |
from app.auth import get_current_user
|
| 24 |
from app.config import get_settings
|
| 25 |
from app.rag.chunker import chunk_document, get_page_count
|
| 26 |
-
from app.rag.vectorstore import store_chunks
|
| 27 |
|
| 28 |
try:
|
| 29 |
from crawl4ai import AsyncWebCrawler
|
|
@@ -158,7 +159,11 @@ def _ingest_document(document_id: str, filepath: str, original_name: str, user_i
|
|
| 158 |
|
| 159 |
db = SessionLocal()
|
| 160 |
try:
|
| 161 |
-
doc =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
if not doc:
|
| 163 |
logger.error(f"Document {document_id} not found for ingestion")
|
| 164 |
return
|
|
@@ -236,7 +241,11 @@ def _ingest_document(document_id: str, filepath: str, original_name: str, user_i
|
|
| 236 |
except Exception as e:
|
| 237 |
logger.error(f"Ingestion error for {document_id}: {e}")
|
| 238 |
try:
|
| 239 |
-
doc =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
if doc:
|
| 241 |
doc.status = "failed"
|
| 242 |
doc.error_message = str(e)[:500]
|
|
@@ -476,6 +485,7 @@ def get_document_status(
|
|
| 476 |
doc = db.query(Document).filter(
|
| 477 |
Document.id == document_id,
|
| 478 |
Document.user_id == user.id,
|
|
|
|
| 479 |
).first()
|
| 480 |
|
| 481 |
if not doc:
|
|
@@ -517,7 +527,7 @@ def list_documents(
|
|
| 517 |
"""Total Pages"""
|
| 518 |
totalDocuments = (
|
| 519 |
db.query(Document)
|
| 520 |
-
.filter(Document.user_id == user.id)
|
| 521 |
.count()
|
| 522 |
)
|
| 523 |
"""Total Pages"""
|
|
@@ -526,7 +536,7 @@ def list_documents(
|
|
| 526 |
"""List all documents for the authenticated user in Paginated form"""
|
| 527 |
docs = ((
|
| 528 |
db.execute(select(Document)
|
| 529 |
-
.where(Document.user_id == user.id)
|
| 530 |
.order_by(Document.uploaded_at.desc())
|
| 531 |
.limit(per_page).offset(skip))
|
| 532 |
)
|
|
@@ -567,6 +577,7 @@ def get_document(
|
|
| 567 |
doc = db.query(Document).filter(
|
| 568 |
Document.id == document_id,
|
| 569 |
Document.user_id == user.id,
|
|
|
|
| 570 |
).first()
|
| 571 |
|
| 572 |
if not doc:
|
|
@@ -603,6 +614,7 @@ def serve_pdf(
|
|
| 603 |
doc = db.query(Document).filter(
|
| 604 |
Document.id == document_id,
|
| 605 |
Document.user_id == user.id,
|
|
|
|
| 606 |
).first()
|
| 607 |
|
| 608 |
if not doc:
|
|
@@ -627,12 +639,11 @@ def delete_document(
|
|
| 627 |
db: Session = Depends(get_db),
|
| 628 |
):
|
| 629 |
"""
|
| 630 |
-
|
| 631 |
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
overall operation.
|
| 636 |
|
| 637 |
Args:
|
| 638 |
document_id: The unique identifier of the document to delete.
|
|
@@ -653,32 +664,14 @@ def delete_document(
|
|
| 653 |
doc = db.query(Document).filter(
|
| 654 |
Document.id == document_id,
|
| 655 |
Document.user_id == user.id,
|
|
|
|
| 656 |
).first()
|
| 657 |
|
| 658 |
if not doc:
|
| 659 |
raise HTTPException(status_code=404, detail="Document not found")
|
| 660 |
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
if os.path.exists(filepath):
|
| 664 |
-
os.remove(filepath)
|
| 665 |
-
|
| 666 |
-
# Delete vectors from ChromaDB
|
| 667 |
-
try:
|
| 668 |
-
delete_document_chunks(document_id=document_id, user_id=user.id)
|
| 669 |
-
except Exception as e:
|
| 670 |
-
logger.warning(f"Error deleting vectors: {e}")
|
| 671 |
-
|
| 672 |
-
# Delete persisted knowledge graph
|
| 673 |
-
try:
|
| 674 |
-
from app.rag.graph_builder import delete_graph
|
| 675 |
-
|
| 676 |
-
delete_graph(user_id=user.id, document_id=document_id)
|
| 677 |
-
except Exception as e:
|
| 678 |
-
logger.warning(f"Error deleting knowledge graph: {e}")
|
| 679 |
-
|
| 680 |
-
# Delete from database (cascades to chat messages)
|
| 681 |
-
db.delete(doc)
|
| 682 |
db.commit()
|
| 683 |
|
| 684 |
return {"message": f"Document '{doc.original_name}' deleted successfully"}
|
|
@@ -714,6 +707,7 @@ def update_chunk_settings(
|
|
| 714 |
doc = db.query(Document).filter(
|
| 715 |
Document.id == document_id,
|
| 716 |
Document.user_id == user.id,
|
|
|
|
| 717 |
).first()
|
| 718 |
|
| 719 |
if not doc:
|
|
@@ -748,4 +742,4 @@ def update_chunk_settings(
|
|
| 748 |
user_id=user.id,
|
| 749 |
)
|
| 750 |
# Return the updated document record with new chunk settings
|
| 751 |
-
return DocumentResponse.model_validate(doc)
|
|
|
|
| 8 |
import logging
|
| 9 |
import asyncio
|
| 10 |
import concurrent.futures
|
| 11 |
+
from datetime import datetime, timezone
|
| 12 |
from typing import Optional
|
| 13 |
from pathlib import Path
|
| 14 |
import shutil
|
|
|
|
| 24 |
from app.auth import get_current_user
|
| 25 |
from app.config import get_settings
|
| 26 |
from app.rag.chunker import chunk_document, get_page_count
|
| 27 |
+
from app.rag.vectorstore import store_chunks
|
| 28 |
|
| 29 |
try:
|
| 30 |
from crawl4ai import AsyncWebCrawler
|
|
|
|
| 159 |
|
| 160 |
db = SessionLocal()
|
| 161 |
try:
|
| 162 |
+
doc = (
|
| 163 |
+
db.query(Document)
|
| 164 |
+
.filter(Document.id == document_id, Document.is_deleted.is_(False))
|
| 165 |
+
.first()
|
| 166 |
+
)
|
| 167 |
if not doc:
|
| 168 |
logger.error(f"Document {document_id} not found for ingestion")
|
| 169 |
return
|
|
|
|
| 241 |
except Exception as e:
|
| 242 |
logger.error(f"Ingestion error for {document_id}: {e}")
|
| 243 |
try:
|
| 244 |
+
doc = (
|
| 245 |
+
db.query(Document)
|
| 246 |
+
.filter(Document.id == document_id, Document.is_deleted.is_(False))
|
| 247 |
+
.first()
|
| 248 |
+
)
|
| 249 |
if doc:
|
| 250 |
doc.status = "failed"
|
| 251 |
doc.error_message = str(e)[:500]
|
|
|
|
| 485 |
doc = db.query(Document).filter(
|
| 486 |
Document.id == document_id,
|
| 487 |
Document.user_id == user.id,
|
| 488 |
+
Document.is_deleted.is_(False),
|
| 489 |
).first()
|
| 490 |
|
| 491 |
if not doc:
|
|
|
|
| 527 |
"""Total Pages"""
|
| 528 |
totalDocuments = (
|
| 529 |
db.query(Document)
|
| 530 |
+
.filter(Document.user_id == user.id, Document.is_deleted.is_(False))
|
| 531 |
.count()
|
| 532 |
)
|
| 533 |
"""Total Pages"""
|
|
|
|
| 536 |
"""List all documents for the authenticated user in Paginated form"""
|
| 537 |
docs = ((
|
| 538 |
db.execute(select(Document)
|
| 539 |
+
.where(Document.user_id == user.id, Document.is_deleted.is_(False))
|
| 540 |
.order_by(Document.uploaded_at.desc())
|
| 541 |
.limit(per_page).offset(skip))
|
| 542 |
)
|
|
|
|
| 577 |
doc = db.query(Document).filter(
|
| 578 |
Document.id == document_id,
|
| 579 |
Document.user_id == user.id,
|
| 580 |
+
Document.is_deleted.is_(False),
|
| 581 |
).first()
|
| 582 |
|
| 583 |
if not doc:
|
|
|
|
| 614 |
doc = db.query(Document).filter(
|
| 615 |
Document.id == document_id,
|
| 616 |
Document.user_id == user.id,
|
| 617 |
+
Document.is_deleted.is_(False),
|
| 618 |
).first()
|
| 619 |
|
| 620 |
if not doc:
|
|
|
|
| 639 |
db: Session = Depends(get_db),
|
| 640 |
):
|
| 641 |
"""
|
| 642 |
+
Soft-delete a document so it disappears from normal document APIs.
|
| 643 |
|
| 644 |
+
The underlying file, vectors, graph, and chat history are retained for a
|
| 645 |
+
future recycle-bin/restore flow. Normal read/list endpoints filter deleted
|
| 646 |
+
documents so accidental deletion is reversible at the database level.
|
|
|
|
| 647 |
|
| 648 |
Args:
|
| 649 |
document_id: The unique identifier of the document to delete.
|
|
|
|
| 664 |
doc = db.query(Document).filter(
|
| 665 |
Document.id == document_id,
|
| 666 |
Document.user_id == user.id,
|
| 667 |
+
Document.is_deleted.is_(False),
|
| 668 |
).first()
|
| 669 |
|
| 670 |
if not doc:
|
| 671 |
raise HTTPException(status_code=404, detail="Document not found")
|
| 672 |
|
| 673 |
+
doc.is_deleted = True
|
| 674 |
+
doc.deleted_at = datetime.now(timezone.utc)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 675 |
db.commit()
|
| 676 |
|
| 677 |
return {"message": f"Document '{doc.original_name}' deleted successfully"}
|
|
|
|
| 707 |
doc = db.query(Document).filter(
|
| 708 |
Document.id == document_id,
|
| 709 |
Document.user_id == user.id,
|
| 710 |
+
Document.is_deleted.is_(False),
|
| 711 |
).first()
|
| 712 |
|
| 713 |
if not doc:
|
|
|
|
| 742 |
user_id=user.id,
|
| 743 |
)
|
| 744 |
# Return the updated document record with new chunk settings
|
| 745 |
+
return DocumentResponse.model_validate(doc)
|
backend/tests/test_documents.py
CHANGED
|
@@ -93,14 +93,13 @@ def test_ingest_document_builds_and_saves_graph(db_session, monkeypatch, tmp_pat
|
|
| 93 |
assert refreshed.chunk_count == 1
|
| 94 |
|
| 95 |
|
| 96 |
-
def
|
| 97 |
-
|
| 98 |
doc_id = ready_document.id
|
| 99 |
|
| 100 |
-
monkeypatch.setattr("app.routes.documents.delete_document_chunks", lambda **kwargs: None)
|
| 101 |
monkeypatch.setattr(
|
| 102 |
"app.rag.graph_builder.delete_graph",
|
| 103 |
-
lambda user_id, document_id:
|
| 104 |
{"user_id": user_id, "document_id": document_id}
|
| 105 |
),
|
| 106 |
)
|
|
@@ -111,4 +110,15 @@ def test_delete_document_removes_knowledge_graph(client, auth_headers, ready_doc
|
|
| 111 |
)
|
| 112 |
|
| 113 |
assert response.status_code == 200
|
| 114 |
-
assert
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
assert refreshed.chunk_count == 1
|
| 94 |
|
| 95 |
|
| 96 |
+
def test_delete_document_soft_deletes_and_hides_document(client, auth_headers, ready_document, db_session, monkeypatch):
|
| 97 |
+
deletion_calls = []
|
| 98 |
doc_id = ready_document.id
|
| 99 |
|
|
|
|
| 100 |
monkeypatch.setattr(
|
| 101 |
"app.rag.graph_builder.delete_graph",
|
| 102 |
+
lambda user_id, document_id: deletion_calls.append(
|
| 103 |
{"user_id": user_id, "document_id": document_id}
|
| 104 |
),
|
| 105 |
)
|
|
|
|
| 110 |
)
|
| 111 |
|
| 112 |
assert response.status_code == 200
|
| 113 |
+
assert deletion_calls == []
|
| 114 |
+
|
| 115 |
+
db_session.refresh(ready_document)
|
| 116 |
+
assert ready_document.is_deleted is True
|
| 117 |
+
assert ready_document.deleted_at is not None
|
| 118 |
+
|
| 119 |
+
list_response = client.get("/api/v1/documents/", headers=auth_headers)
|
| 120 |
+
assert list_response.status_code == 200
|
| 121 |
+
assert list_response.json()["total"] == 0
|
| 122 |
+
|
| 123 |
+
get_response = client.get(f"/api/v1/documents/{doc_id}", headers=auth_headers)
|
| 124 |
+
assert get_response.status_code == 404
|