# Snapshot metadata: file size 4,400 bytes, revision 7c46845.
"""
Cleanup script for ChromaDB vectors and records of documents inactive for 30 days.

By default, a document is considered inactive if it has not been accessed (last_accessed_at)
or uploaded (if last_accessed_at is missing) for 30 days.

Run manually:
    python backend/scripts/document_cleanup.py

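Environment (optional; example values shown):
    DOCUMENT_CLEANUP_INACTIVE_DAYS=30   inactivity threshold in days (default: 30)
    DOCUMENT_CLEANUP_DRY_RUN=true       log what would be purged, delete nothing (default: false)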
"""
from __future__ import annotations

import logging
import os
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path

from sqlalchemy import or_, inspect, text

# Allow running this file directly from the repository root.
# parents[1] is the directory one level above this script's own directory
# (backend/ when the script lives at backend/scripts/, as in the module
# docstring). It is prepended to sys.path so the `app.*` imports below resolve.
BACKEND_DIR = Path(__file__).resolve().parents[1]
if str(BACKEND_DIR) not in sys.path:
    sys.path.insert(0, str(BACKEND_DIR))

from app.database import SessionLocal  # noqa: E402
from app.models import Document  # noqa: E402
from app.rag.vectorstore import delete_document_chunks  # noqa: E402
from app.config import get_settings  # noqa: E402

# Named logger for this script; basicConfig sets up root logging at INFO
# with a timestamp/level/message format as a side effect of importing the module.
logger = logging.getLogger("document_cleanup")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

# Application settings, loaded once at import time. This module reads
# settings.UPLOAD_DIR from it when building the path of a file to delete.
settings = get_settings()

def _env_bool(name: str, default: bool = False) -> bool:
    value = os.getenv(name)
    if value is None:
        return default
    return value.strip().lower() in {"1", "true", "yes", "on"}


def ensure_last_accessed_at_column() -> None:
    """Add ``documents.last_accessed_at`` if the table does not have it yet.

    Inspects the ``documents`` table through the session's bind and, when the
    column is absent, issues an ``ALTER TABLE`` and commits. The session is
    always closed, whether or not the column had to be added.
    """
    session = SessionLocal()
    try:
        table_columns = inspect(session.get_bind()).get_columns("documents")
        existing_names = {col["name"] for col in table_columns}
        if "last_accessed_at" in existing_names:
            return

        logger.info("Adding missing documents.last_accessed_at column")
        session.execute(text("ALTER TABLE documents ADD COLUMN last_accessed_at TIMESTAMP"))
        session.commit()
    finally:
        session.close()


def cleanup_inactive_documents(
    inactive_days: int | None = None,
    dry_run: bool | None = None,
) -> dict[str, int]:
    """Purge documents that have been inactive longer than the retention window.

    A document is selected when ``last_accessed_at`` is older than the cutoff,
    or when ``last_accessed_at`` is NULL and ``uploaded_at`` is older than the
    cutoff. For every selected document the uploaded file is removed from
    disk, its vectors are removed via ``delete_document_chunks``, and its
    database row is deleted. Each document is committed individually, so a
    failure on one document is rolled back and counted without affecting
    documents that were already purged or those still to be processed.

    Args:
        inactive_days: Retention window in days. ``None`` falls back to the
            ``DOCUMENT_CLEANUP_INACTIVE_DAYS`` environment variable
            (default 30). An explicit ``0`` is honoured (cutoff = now).
        dry_run: When true, only log what would be purged. ``None`` falls
            back to ``DOCUMENT_CLEANUP_DRY_RUN`` (default false).

    Returns:
        Counters keyed by ``scanned``, ``eligible``, ``deleted`` and
        ``failed``. Because the query already applies the inactivity filter,
        ``scanned`` and ``eligible`` are always equal.

    Raises:
        ValueError: If the retention window is negative, or the environment
            value cannot be parsed as an integer.
    """
    # Resolve configuration before touching the database. Use `is None`
    # rather than truthiness so an explicit 0 is not silently replaced by
    # the environment default.
    if inactive_days is None:
        days = int(os.getenv("DOCUMENT_CLEANUP_INACTIVE_DAYS", "30"))
    else:
        days = inactive_days
    if days < 0:
        # A negative window would put the cutoff in the future and match
        # every document; refuse instead of purging everything.
        raise ValueError(f"inactive_days must be >= 0, got {days}")

    is_dry_run = _env_bool("DOCUMENT_CLEANUP_DRY_RUN", False) if dry_run is None else dry_run

    # The selection query filters on last_accessed_at, so the column must
    # exist first. Note this schema check also runs in dry-run mode.
    ensure_last_accessed_at_column()

    cutoff = datetime.now(timezone.utc) - timedelta(days=days)

    stats = {
        "scanned": 0,
        "eligible": 0,
        "deleted": 0,
        "failed": 0,
    }

    db = SessionLocal()
    try:
        # Select documents whose last_accessed_at (or uploaded_at, when the
        # document was never accessed) is before the cutoff.
        docs = (
            db.query(Document)
            .filter(
                or_(
                    Document.last_accessed_at < cutoff,
                    Document.last_accessed_at.is_(None) & (Document.uploaded_at < cutoff),
                )
            )
            .all()
        )

        for doc in docs:
            stats["scanned"] += 1
            stats["eligible"] += 1

            # Captured up front so the failure log below does not need to
            # touch the ORM object after a rollback (attributes may be
            # expired by then, depending on the session configuration).
            doc_id = doc.id
            last_activity = doc.last_accessed_at or doc.uploaded_at

            logger.info(
                "Document %s ('%s') inactive since %s; purging dry_run=%s",
                doc_id,
                doc.original_name,
                last_activity,
                is_dry_run,
            )

            if is_dry_run:
                continue

            try:
                # 1. Delete file from disk. Attempt the removal directly
                #    instead of checking existence first, so a file that
                #    disappears in between is not treated as an error.
                filepath = os.path.join(settings.UPLOAD_DIR, doc.user_id, doc.filename)
                try:
                    os.remove(filepath)
                except FileNotFoundError:
                    pass  # Already gone; nothing to clean up on disk.
                else:
                    logger.info("Deleted physical file: %s", filepath)

                # 2. Delete vectors from ChromaDB
                delete_document_chunks(document_id=doc_id, user_id=doc.user_id)

                # 3. Delete from SQL database. Commit per document so the
                #    row removal is durable as soon as its file and vectors
                #    are gone, and database errors are attributed to this
                #    document rather than surfacing in one final commit.
                db.delete(doc)
                db.commit()
            except Exception as exc:
                # Discard the pending delete so the session stays usable
                # for the remaining documents.
                db.rollback()
                stats["failed"] += 1
                logger.warning(
                    "Failed purging document %s: %s",
                    doc_id,
                    exc,
                    exc_info=True,
                )
            else:
                # Counted only after the commit succeeded.
                stats["deleted"] += 1

        logger.info("Document cleanup complete: %s", stats)
        return stats
    finally:
        db.close()


# Script entry point: run one cleanup pass with no arguments, so the
# retention window and dry-run flag both come from the environment variables.
if __name__ == "__main__":
    cleanup_inactive_documents()