Saurabh Kumar Bajpai commited on
Commit
9005b3b
·
1 Parent(s): 234979c

feat: soft-delete documents

Browse files
backend/app/database.py CHANGED
@@ -75,6 +75,8 @@ def _migrate_schema():
75
  existing_docs_columns = {c["name"] for c in inspector.get_columns("documents")}
76
  docs_migrations = [
77
  ("documents", "last_accessed_at", "ALTER TABLE documents ADD COLUMN last_accessed_at TIMESTAMP"),
 
 
78
  ]
79
  for table, column, ddl in docs_migrations:
80
  if column not in existing_docs_columns:
 
75
  existing_docs_columns = {c["name"] for c in inspector.get_columns("documents")}
76
  docs_migrations = [
77
  ("documents", "last_accessed_at", "ALTER TABLE documents ADD COLUMN last_accessed_at TIMESTAMP"),
78
+ ("documents", "is_deleted", "ALTER TABLE documents ADD COLUMN is_deleted BOOLEAN DEFAULT FALSE NOT NULL"),
79
+ ("documents", "deleted_at", "ALTER TABLE documents ADD COLUMN deleted_at TIMESTAMP"),
80
  ]
81
  for table, column, ddl in docs_migrations:
82
  if column not in existing_docs_columns:
backend/app/models.py CHANGED
@@ -182,6 +182,8 @@ class Document(Base):
182
  drive_file_id = Column(String(255), unique=True, nullable=True, index=True)
183
  drive_folder_id = Column(String(255), nullable=True, index=True)
184
  drive_synced_at = Column(DateTime, nullable=True)
 
 
185
 
186
  # Relationships
187
  owner = relationship("User", back_populates="documents")
@@ -238,4 +240,3 @@ class SharedMessage(Base):
238
 
239
  # Relationships
240
  message = relationship("ChatMessage", back_populates="shared_message")
241
-
 
182
  drive_file_id = Column(String(255), unique=True, nullable=True, index=True)
183
  drive_folder_id = Column(String(255), nullable=True, index=True)
184
  drive_synced_at = Column(DateTime, nullable=True)
185
+ is_deleted = Column(Boolean, default=False, nullable=False, index=True)
186
+ deleted_at = Column(DateTime, nullable=True)
187
 
188
  # Relationships
189
  owner = relationship("User", back_populates="documents")
 
240
 
241
  # Relationships
242
  message = relationship("ChatMessage", back_populates="shared_message")
 
backend/app/routes/chat.py CHANGED
@@ -236,6 +236,7 @@ def ask_question(
236
  doc = db.query(Document).filter(
237
  Document.id == payload.document_id,
238
  Document.user_id == user.id,
 
239
  ).first()
240
 
241
  if not doc:
@@ -296,6 +297,7 @@ def ask_question_stream(
296
  doc = db.query(Document).filter(
297
  Document.id == payload.document_id,
298
  Document.user_id == user.id,
 
299
  ).first()
300
 
301
  if not doc:
@@ -437,6 +439,7 @@ def export_chat_history(
437
  doc = db.query(Document).filter(
438
  Document.id == document_id,
439
  Document.user_id == resolved_user.id,
 
440
  ).first()
441
 
442
  if not doc:
 
236
  doc = db.query(Document).filter(
237
  Document.id == payload.document_id,
238
  Document.user_id == user.id,
239
+ Document.is_deleted.is_(False),
240
  ).first()
241
 
242
  if not doc:
 
297
  doc = db.query(Document).filter(
298
  Document.id == payload.document_id,
299
  Document.user_id == user.id,
300
+ Document.is_deleted.is_(False),
301
  ).first()
302
 
303
  if not doc:
 
439
  doc = db.query(Document).filter(
440
  Document.id == document_id,
441
  Document.user_id == resolved_user.id,
442
+ Document.is_deleted.is_(False),
443
  ).first()
444
 
445
  if not doc:
backend/app/routes/documents.py CHANGED
@@ -8,6 +8,7 @@ import uuid
8
  import logging
9
  import asyncio
10
  import concurrent.futures
 
11
  from typing import Optional
12
  from pathlib import Path
13
  import shutil
@@ -23,7 +24,7 @@ from app.schemas import DocumentResponse, DocumentListResponse, DocumentStatusRe
23
  from app.auth import get_current_user
24
  from app.config import get_settings
25
  from app.rag.chunker import chunk_document, get_page_count
26
- from app.rag.vectorstore import store_chunks, delete_document_chunks
27
 
28
  try:
29
  from crawl4ai import AsyncWebCrawler
@@ -158,7 +159,11 @@ def _ingest_document(document_id: str, filepath: str, original_name: str, user_i
158
 
159
  db = SessionLocal()
160
  try:
161
- doc = db.query(Document).filter(Document.id == document_id).first()
 
 
 
 
162
  if not doc:
163
  logger.error(f"Document {document_id} not found for ingestion")
164
  return
@@ -236,7 +241,11 @@ def _ingest_document(document_id: str, filepath: str, original_name: str, user_i
236
  except Exception as e:
237
  logger.error(f"Ingestion error for {document_id}: {e}")
238
  try:
239
- doc = db.query(Document).filter(Document.id == document_id).first()
 
 
 
 
240
  if doc:
241
  doc.status = "failed"
242
  doc.error_message = str(e)[:500]
@@ -476,6 +485,7 @@ def get_document_status(
476
  doc = db.query(Document).filter(
477
  Document.id == document_id,
478
  Document.user_id == user.id,
 
479
  ).first()
480
 
481
  if not doc:
@@ -517,7 +527,7 @@ def list_documents(
517
  """Total Pages"""
518
  totalDocuments = (
519
  db.query(Document)
520
- .filter(Document.user_id == user.id)
521
  .count()
522
  )
523
  """Total Pages"""
@@ -526,7 +536,7 @@ def list_documents(
526
  """List all documents for the authenticated user in Paginated form"""
527
  docs = ((
528
  db.execute(select(Document)
529
- .where(Document.user_id == user.id)
530
  .order_by(Document.uploaded_at.desc())
531
  .limit(per_page).offset(skip))
532
  )
@@ -567,6 +577,7 @@ def get_document(
567
  doc = db.query(Document).filter(
568
  Document.id == document_id,
569
  Document.user_id == user.id,
 
570
  ).first()
571
 
572
  if not doc:
@@ -603,6 +614,7 @@ def serve_pdf(
603
  doc = db.query(Document).filter(
604
  Document.id == document_id,
605
  Document.user_id == user.id,
 
606
  ).first()
607
 
608
  if not doc:
@@ -627,12 +639,11 @@ def delete_document(
627
  db: Session = Depends(get_db),
628
  ):
629
  """
630
- Delete a document and its associated vector embeddings.
631
 
632
- Removes the document from the database, deletes the physical file from
633
- disk, and attempts to delete all corresponding vector chunks from ChromaDB.
634
- If ChromaDB deletion fails, the error is logged but does not block the
635
- overall operation.
636
 
637
  Args:
638
  document_id: The unique identifier of the document to delete.
@@ -653,32 +664,14 @@ def delete_document(
653
  doc = db.query(Document).filter(
654
  Document.id == document_id,
655
  Document.user_id == user.id,
 
656
  ).first()
657
 
658
  if not doc:
659
  raise HTTPException(status_code=404, detail="Document not found")
660
 
661
- # Delete file from disk
662
- filepath = os.path.join(settings.UPLOAD_DIR, user.id, doc.filename)
663
- if os.path.exists(filepath):
664
- os.remove(filepath)
665
-
666
- # Delete vectors from ChromaDB
667
- try:
668
- delete_document_chunks(document_id=document_id, user_id=user.id)
669
- except Exception as e:
670
- logger.warning(f"Error deleting vectors: {e}")
671
-
672
- # Delete persisted knowledge graph
673
- try:
674
- from app.rag.graph_builder import delete_graph
675
-
676
- delete_graph(user_id=user.id, document_id=document_id)
677
- except Exception as e:
678
- logger.warning(f"Error deleting knowledge graph: {e}")
679
-
680
- # Delete from database (cascades to chat messages)
681
- db.delete(doc)
682
  db.commit()
683
 
684
  return {"message": f"Document '{doc.original_name}' deleted successfully"}
@@ -714,6 +707,7 @@ def update_chunk_settings(
714
  doc = db.query(Document).filter(
715
  Document.id == document_id,
716
  Document.user_id == user.id,
 
717
  ).first()
718
 
719
  if not doc:
@@ -748,4 +742,4 @@ def update_chunk_settings(
748
  user_id=user.id,
749
  )
750
  # Return the updated document record with new chunk settings
751
- return DocumentResponse.model_validate(doc)
 
8
  import logging
9
  import asyncio
10
  import concurrent.futures
11
+ from datetime import datetime, timezone
12
  from typing import Optional
13
  from pathlib import Path
14
  import shutil
 
24
  from app.auth import get_current_user
25
  from app.config import get_settings
26
  from app.rag.chunker import chunk_document, get_page_count
27
+ from app.rag.vectorstore import store_chunks
28
 
29
  try:
30
  from crawl4ai import AsyncWebCrawler
 
159
 
160
  db = SessionLocal()
161
  try:
162
+ doc = (
163
+ db.query(Document)
164
+ .filter(Document.id == document_id, Document.is_deleted.is_(False))
165
+ .first()
166
+ )
167
  if not doc:
168
  logger.error(f"Document {document_id} not found for ingestion")
169
  return
 
241
  except Exception as e:
242
  logger.error(f"Ingestion error for {document_id}: {e}")
243
  try:
244
+ doc = (
245
+ db.query(Document)
246
+ .filter(Document.id == document_id, Document.is_deleted.is_(False))
247
+ .first()
248
+ )
249
  if doc:
250
  doc.status = "failed"
251
  doc.error_message = str(e)[:500]
 
485
  doc = db.query(Document).filter(
486
  Document.id == document_id,
487
  Document.user_id == user.id,
488
+ Document.is_deleted.is_(False),
489
  ).first()
490
 
491
  if not doc:
 
527
  """Total Pages"""
528
  totalDocuments = (
529
  db.query(Document)
530
+ .filter(Document.user_id == user.id, Document.is_deleted.is_(False))
531
  .count()
532
  )
533
  """Total Pages"""
 
536
  """List all documents for the authenticated user in Paginated form"""
537
  docs = ((
538
  db.execute(select(Document)
539
+ .where(Document.user_id == user.id, Document.is_deleted.is_(False))
540
  .order_by(Document.uploaded_at.desc())
541
  .limit(per_page).offset(skip))
542
  )
 
577
  doc = db.query(Document).filter(
578
  Document.id == document_id,
579
  Document.user_id == user.id,
580
+ Document.is_deleted.is_(False),
581
  ).first()
582
 
583
  if not doc:
 
614
  doc = db.query(Document).filter(
615
  Document.id == document_id,
616
  Document.user_id == user.id,
617
+ Document.is_deleted.is_(False),
618
  ).first()
619
 
620
  if not doc:
 
639
  db: Session = Depends(get_db),
640
  ):
641
  """
642
+ Soft-delete a document so it disappears from normal document APIs.
643
 
644
+ The underlying file, vectors, graph, and chat history are retained for a
645
+ future recycle-bin/restore flow. Normal read/list endpoints filter deleted
646
+ documents so accidental deletion is reversible at the database level.
 
647
 
648
  Args:
649
  document_id: The unique identifier of the document to delete.
 
664
  doc = db.query(Document).filter(
665
  Document.id == document_id,
666
  Document.user_id == user.id,
667
+ Document.is_deleted.is_(False),
668
  ).first()
669
 
670
  if not doc:
671
  raise HTTPException(status_code=404, detail="Document not found")
672
 
673
+ doc.is_deleted = True
674
+ doc.deleted_at = datetime.now(timezone.utc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
675
  db.commit()
676
 
677
  return {"message": f"Document '{doc.original_name}' deleted successfully"}
 
707
  doc = db.query(Document).filter(
708
  Document.id == document_id,
709
  Document.user_id == user.id,
710
+ Document.is_deleted.is_(False),
711
  ).first()
712
 
713
  if not doc:
 
742
  user_id=user.id,
743
  )
744
  # Return the updated document record with new chunk settings
745
+ return DocumentResponse.model_validate(doc)
backend/tests/test_documents.py CHANGED
@@ -93,14 +93,13 @@ def test_ingest_document_builds_and_saves_graph(db_session, monkeypatch, tmp_pat
93
  assert refreshed.chunk_count == 1
94
 
95
 
96
- def test_delete_document_removes_knowledge_graph(client, auth_headers, ready_document, monkeypatch):
97
- deleted = {}
98
  doc_id = ready_document.id
99
 
100
- monkeypatch.setattr("app.routes.documents.delete_document_chunks", lambda **kwargs: None)
101
  monkeypatch.setattr(
102
  "app.rag.graph_builder.delete_graph",
103
- lambda user_id, document_id: deleted.update(
104
  {"user_id": user_id, "document_id": document_id}
105
  ),
106
  )
@@ -111,4 +110,15 @@ def test_delete_document_removes_knowledge_graph(client, auth_headers, ready_doc
111
  )
112
 
113
  assert response.status_code == 200
114
- assert deleted["document_id"] == doc_id
 
 
 
 
 
 
 
 
 
 
 
 
93
  assert refreshed.chunk_count == 1
94
 
95
 
96
+ def test_delete_document_soft_deletes_and_hides_document(client, auth_headers, ready_document, db_session, monkeypatch):
97
+ deletion_calls = []
98
  doc_id = ready_document.id
99
 
 
100
  monkeypatch.setattr(
101
  "app.rag.graph_builder.delete_graph",
102
+ lambda user_id, document_id: deletion_calls.append(
103
  {"user_id": user_id, "document_id": document_id}
104
  ),
105
  )
 
110
  )
111
 
112
  assert response.status_code == 200
113
+ assert deletion_calls == []
114
+
115
+ db_session.refresh(ready_document)
116
+ assert ready_document.is_deleted is True
117
+ assert ready_document.deleted_at is not None
118
+
119
+ list_response = client.get("/api/v1/documents/", headers=auth_headers)
120
+ assert list_response.status_code == 200
121
+ assert list_response.json()["total"] == 0
122
+
123
+ get_response = client.get(f"/api/v1/documents/{doc_id}", headers=auth_headers)
124
+ assert get_response.status_code == 404