File size: 2,489 Bytes
409c17a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""

Domain Layer - Document Entity



Represents a document in the knowledge base.

"""
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Optional
from uuid import UUID, uuid4


class DocumentStatus(str, Enum):
    """Processing states a document can be in.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase string value.
    """

    PENDING = "pending"  # default status of a newly created Document
    PROCESSING = "processing"
    INDEXED = "indexed"
    FAILED = "failed"


class DocumentType(str, Enum):
    """File formats accepted for documents.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase string value.
    """

    PDF = "pdf"
    DOCX = "docx"
    TXT = "txt"
    MD = "md"
    HTML = "html"


@dataclass
class Document:
    """Document entity - core business object.

    Carries the descriptive fields of a document plus its processing
    ``status``. The ``mark_as_*`` methods set the status and refresh
    ``updated_at``; ``is_indexed`` reports whether the status is ``INDEXED``.

    All timestamps come from ``datetime.utcnow`` and are therefore naive
    ``datetime`` values (no ``tzinfo``) expressed in UTC.
    """

    # Required fields: no defaults, so they must be passed to the constructor.
    title: str
    filename: str
    file_type: DocumentType
    file_size: int  # presumably a size in bytes - TODO confirm with callers
    storage_path: str
    department: str

    # Optional fields: every instance gets its own freshly generated default.
    id: UUID = field(default_factory=uuid4)
    status: DocumentStatus = DocumentStatus.PENDING
    upload_session_id: Optional[str] = None
    uploaded_at: datetime = field(default_factory=datetime.utcnow)
    indexed_at: Optional[datetime] = None  # filled in by mark_as_indexed()
    metadata: dict = field(default_factory=dict)
    # NOTE: uploaded_at, created_at and updated_at defaults are produced by
    # separate clock reads, so they may differ by a few microseconds.
    created_at: datetime = field(default_factory=datetime.utcnow)
    updated_at: datetime = field(default_factory=datetime.utcnow)

    def mark_as_processing(self) -> None:
        """Mark document as being processed and refresh ``updated_at``."""
        self.status = DocumentStatus.PROCESSING
        self.updated_at = datetime.utcnow()

    def mark_as_indexed(self) -> None:
        """Mark document as successfully indexed.

        Sets ``status`` to ``INDEXED`` and stamps ``indexed_at`` and
        ``updated_at`` with the same timestamp.
        """
        # Read the clock once so both fields record exactly the same instant;
        # two separate reads could differ by microseconds.
        now = datetime.utcnow()
        self.status = DocumentStatus.INDEXED
        self.indexed_at = now
        self.updated_at = now

    def mark_as_failed(self) -> None:
        """Mark document processing as failed and refresh ``updated_at``."""
        self.status = DocumentStatus.FAILED
        self.updated_at = datetime.utcnow()

    def is_indexed(self) -> bool:
        """Return ``True`` when the document's status is ``INDEXED``."""
        return self.status == DocumentStatus.INDEXED


@dataclass
class DocumentChunk:
    """A piece of a document, used for vector search.

    ``document_id`` refers to the owning document and ``chunk_index`` is the
    chunk's index within it. ``created_at`` is a naive UTC timestamp taken
    from ``datetime.utcnow`` when the chunk is constructed.
    """

    # Required fields: must be supplied by the caller.
    document_id: UUID
    chunk_index: int
    content: str
    token_count: int

    # Optional fields: every instance gets its own freshly generated default.
    id: UUID = field(default_factory=uuid4)
    vector_id: Optional[str] = None  # stays None until set_vector_id() is called
    metadata: dict = field(default_factory=dict)
    created_at: datetime = field(default_factory=datetime.utcnow)

    def set_vector_id(self, vector_id: str) -> None:
        """Store the Qdrant vector ID on this chunk."""
        self.vector_id = vector_id