[NOTICKET][document] add updated_at to metadata and delete vector embeddings when a user deletes a document from knowledge

#11
src/document/document_service.py CHANGED
@@ -1,7 +1,7 @@
1
  """Service for managing documents."""
2
 
3
  from sqlalchemy.ext.asyncio import AsyncSession
4
- from sqlalchemy import select, delete
5
  from src.db.postgres.models import Document
6
  from src.storage.az_blob.az_blob import blob_storage
7
  from src.middlewares.logging import get_logger
@@ -77,6 +77,12 @@ class DocumentService:
77
  # Delete from blob storage
78
  await blob_storage.delete_file(document.blob_name)
79
 
 
 
 
 
 
 
80
  # Delete from database
81
  await db.execute(
82
  delete(Document).where(Document.id == document_id)
 
1
  """Service for managing documents."""
2
 
3
  from sqlalchemy.ext.asyncio import AsyncSession
4
+ from sqlalchemy import select, delete, text
5
  from src.db.postgres.models import Document
6
  from src.storage.az_blob.az_blob import blob_storage
7
  from src.middlewares.logging import get_logger
 
77
  # Delete from blob storage
78
  await blob_storage.delete_file(document.blob_name)
79
 
80
+ # Delete vector embeddings from pgvector
81
+ await db.execute(
82
+ text("DELETE FROM langchain_pg_embedding WHERE cmetadata->'data'->>'document_id' = :doc_id"),
83
+ {"doc_id": document_id}
84
+ )
85
+
86
  # Delete from database
87
  await db.execute(
88
  delete(Document).where(Document.id == document_id)
src/knowledge/processing_service.py CHANGED
@@ -8,6 +8,7 @@ from src.db.postgres.models import Document as DBDocument
8
  from sqlalchemy.ext.asyncio import AsyncSession
9
  from src.middlewares.logging import get_logger
10
  from typing import List
 
11
  import sys
12
  import docx
13
  import pandas as pd
@@ -15,6 +16,8 @@ import pytesseract
15
  from pdf2image import convert_from_bytes
16
  from io import BytesIO
17
 
 
 
18
  logger = get_logger("knowledge_processing")
19
 
20
 
@@ -55,6 +58,7 @@ class KnowledgeProcessingService:
55
  metadata={
56
  "user_id": db_doc.user_id,
57
  "source_type": "document",
 
58
  "data": {
59
  "document_id": db_doc.id,
60
  "filename": db_doc.filename,
@@ -103,6 +107,7 @@ class KnowledgeProcessingService:
103
  metadata={
104
  "user_id": db_doc.user_id,
105
  "source_type": "document",
 
106
  "data": {
107
  "document_id": db_doc.id,
108
  "filename": db_doc.filename,
@@ -150,6 +155,7 @@ class KnowledgeProcessingService:
150
  metadata={
151
  "user_id": db_doc.user_id,
152
  "source_type": "document",
 
153
  "data": {
154
  "document_id": db_doc.id,
155
  "filename": db_doc.filename,
 
8
  from sqlalchemy.ext.asyncio import AsyncSession
9
  from src.middlewares.logging import get_logger
10
  from typing import List
11
+ from datetime import datetime, timezone, timedelta
12
  import sys
13
  import docx
14
  import pandas as pd
 
16
  from pdf2image import convert_from_bytes
17
  from io import BytesIO
18
 
19
+ _JAKARTA_TZ = timezone(timedelta(hours=7))
20
+
21
  logger = get_logger("knowledge_processing")
22
 
23
 
 
58
  metadata={
59
  "user_id": db_doc.user_id,
60
  "source_type": "document",
61
+ "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
62
  "data": {
63
  "document_id": db_doc.id,
64
  "filename": db_doc.filename,
 
107
  metadata={
108
  "user_id": db_doc.user_id,
109
  "source_type": "document",
110
+ "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
111
  "data": {
112
  "document_id": db_doc.id,
113
  "filename": db_doc.filename,
 
155
  metadata={
156
  "user_id": db_doc.user_id,
157
  "source_type": "document",
158
+ "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
159
  "data": {
160
  "document_id": db_doc.id,
161
  "filename": db_doc.filename,
src/pipeline/document_pipeline/document_pipeline.py CHANGED
@@ -32,7 +32,7 @@ class DocumentPipeline:
32
  if file_type not in SUPPORTED_FILE_TYPES:
33
  raise HTTPException(
34
  status_code=400,
35
- detail=f"Unsupported file type. Supported: {SUPPORTED_FILE_TYPES}",
36
  )
37
 
38
  blob_name = await blob_storage.upload_file(content, file.filename, user_id)
 
32
  if file_type not in SUPPORTED_FILE_TYPES:
33
  raise HTTPException(
34
  status_code=400,
35
+ detail=f"Unsupported file type. Supported: {', '.join(SUPPORTED_FILE_TYPES)}",
36
  )
37
 
38
  blob_name = await blob_storage.upload_file(content, file.filename, user_id)