Spaces:
Running
Running
File size: 2,489 Bytes
409c17a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
"""
Domain Layer - Document Entity
Represents a document in the knowledge base.
"""
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from typing import Optional
from uuid import UUID, uuid4
class DocumentStatus(str, Enum):
    """Processing state of a document.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase value (for example ``DocumentStatus.PENDING == "pending"``).
    """

    # Default status assigned to a newly created Document.
    PENDING = "pending"
    # Set by Document.mark_as_processing().
    PROCESSING = "processing"
    # Set by Document.mark_as_indexed().
    INDEXED = "indexed"
    # Set by Document.mark_as_failed().
    FAILED = "failed"
class DocumentType(str, Enum):
    """Closed set of document formats the knowledge base accepts.

    Members are also ``str`` instances; each value is the lowercase form of
    the member name (presumably matching the file extension -- confirm
    against the upload code).
    """

    PDF = "pdf"
    DOCX = "docx"
    TXT = "txt"
    MD = "md"
    HTML = "html"
def _naive_utc_now() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Produces the same kind of value as ``datetime.utcnow()`` (UTC wall-clock
    time with ``tzinfo`` unset) without calling that method, which is
    deprecated since Python 3.12.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


@dataclass
class Document:
    """Document entity - core business object.

    The first six fields are required and positional; the rest have
    defaults. All timestamps are naive ``datetime`` values expressed in UTC.

    Note that ``uploaded_at``, ``created_at`` and ``updated_at`` are filled
    by separate factory calls, so on a fresh instance they may differ by a
    few microseconds.
    """

    title: str
    filename: str
    file_type: DocumentType
    file_size: int  # unit is not shown in this module; presumably bytes
    storage_path: str
    department: str
    id: UUID = field(default_factory=uuid4)
    status: DocumentStatus = DocumentStatus.PENDING
    upload_session_id: Optional[str] = None
    uploaded_at: datetime = field(default_factory=_naive_utc_now)
    indexed_at: Optional[datetime] = None  # set only by mark_as_indexed()
    metadata: dict = field(default_factory=dict)
    created_at: datetime = field(default_factory=_naive_utc_now)
    updated_at: datetime = field(default_factory=_naive_utc_now)

    def mark_as_processing(self) -> None:
        """Set the status to PROCESSING and refresh ``updated_at``."""
        self.status = DocumentStatus.PROCESSING
        self.updated_at = _naive_utc_now()

    def mark_as_indexed(self) -> None:
        """Set the status to INDEXED and stamp ``indexed_at``/``updated_at``."""
        # Read the clock once so both timestamps are exactly equal.
        now = _naive_utc_now()
        self.status = DocumentStatus.INDEXED
        self.indexed_at = now
        self.updated_at = now

    def mark_as_failed(self) -> None:
        """Set the status to FAILED and refresh ``updated_at``."""
        self.status = DocumentStatus.FAILED
        self.updated_at = _naive_utc_now()

    def is_indexed(self) -> bool:
        """Return True when the current status is INDEXED."""
        return self.status == DocumentStatus.INDEXED
@dataclass
class DocumentChunk:
    """Document chunk - piece of a document used for vector search.

    The first four fields are required and positional; the rest have
    defaults. ``created_at`` is a naive ``datetime`` expressed in UTC.
    """

    document_id: UUID
    chunk_index: int
    content: str
    token_count: int
    id: UUID = field(default_factory=uuid4)
    vector_id: Optional[str] = None  # unset until set_vector_id() is called
    metadata: dict = field(default_factory=dict)
    # Same naive-UTC value datetime.utcnow() returned, obtained without that
    # method because it is deprecated since Python 3.12.
    created_at: datetime = field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )

    def set_vector_id(self, vector_id: str) -> None:
        """Set Qdrant vector ID."""
        self.vector_id = vector_id
|