File size: 3,782 Bytes
01d5a5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from datetime import datetime
from typing import Optional, Dict
from .process_status import ProcessStatus
from pydantic import BaseModel, Field


class DocumentDTO(BaseModel):
    """Pydantic model holding a document's metadata, content and processing state.

    The three ``*_status`` fields default to ``ProcessStatus.INITIALIZED``;
    ``name`` defaults to an empty string and ``document_size`` to ``0``.
    All remaining fields are optional and default to ``None``.
    """

    id: Optional[int] = None
    name: str = Field(default="")
    title: Optional[str] = None
    extract_status: ProcessStatus = Field(default=ProcessStatus.INITIALIZED)
    embedding_status: ProcessStatus = Field(default=ProcessStatus.INITIALIZED)
    analyze_status: ProcessStatus = Field(default=ProcessStatus.INITIALIZED)
    mime_type: Optional[str] = None
    raw_content: Optional[str] = None
    user_description: Optional[str] = None
    create_time: Optional[datetime] = None
    url: Optional[str] = None
    document_size: int = Field(default=0)
    insight: Optional[Dict] = None
    summary: Optional[Dict] = None

    class Config:
        # Encoders used for JSON output: datetimes become ISO-8601 strings and
        # status enums collapse to their underlying value.
        json_encoders = {
            datetime: lambda v: v.isoformat() if v else None,
            ProcessStatus: lambda v: v.value if v else None,
        }

    def dict(self, *args, **kwargs):
        """Return the model as a dict with status enums replaced by their values.

        All positional and keyword arguments are forwarded unchanged to
        ``BaseModel.dict``. Status keys that are absent from the result
        (for example because they were excluded) are left alone.
        """
        d = super().dict(*args, **kwargs)
        for field_name in ("extract_status", "embedding_status", "analyze_status"):
            status = d.get(field_name)
            # Only unwrap real enum members: an entry that is already a plain
            # value has no ``.value`` attribute and must be passed through.
            if isinstance(status, ProcessStatus):
                d[field_name] = status.value
        return d

    @classmethod
    def from_dict(cls, data: Dict) -> Optional["DocumentDTO"]:
        """Build an instance from a plain dict, or return ``None`` for empty input.

        Status entries that are present are converted with ``ProcessStatus(...)``
        and a string ``create_time`` is parsed with ``datetime.fromisoformat``.
        The caller's dict is never modified.

        Args:
            data: Mapping of field names to values; may be empty or ``None``.

        Returns:
            A new instance of ``cls``, or ``None`` when ``data`` is falsy.

        Raises:
            ValueError: If a status entry is not a valid ``ProcessStatus`` value
                or ``create_time`` is not a valid ISO-format string.
        """
        if not data:
            return None

        # Work on a shallow copy so the conversions below do not leak back
        # into the dict owned by the caller.
        payload = {**data}

        for field_name in ("extract_status", "embedding_status", "analyze_status"):
            if field_name in payload:
                payload[field_name] = ProcessStatus(payload[field_name])

        created = payload.get("create_time")
        if isinstance(created, str):
            payload["create_time"] = datetime.fromisoformat(created)

        return cls(**payload)


class CreateDocumentRequest(BaseModel):
    """Request payload describing a document to be created.

    ``name`` is the only required field. The three status fields default to
    ``ProcessStatus.INITIALIZED`` and ``document_size`` defaults to ``0``.
    """

    # --- descriptive metadata -------------------------------------------
    name: str = Field(
        default=...,
        description="Document name",
        max_length=255,
        example="example.pdf",
    )
    title: Optional[str] = Field(
        default=None,
        description="Document title",
        max_length=255,
        example="Example Document",
    )
    mime_type: Optional[str] = Field(
        default=None,
        description="MIME type",
        max_length=100,
        example="application/pdf",
    )
    user_description: Optional[str] = Field(
        default=None,
        description="User provided description",
        example="This is an example document",
    )

    # --- location, size and content -------------------------------------
    url: Optional[str] = Field(
        default=None,
        description="Document URL or file path",
        example="https://example.com/doc.pdf or /path/to/file",
    )
    document_size: int = Field(
        default=0,
        description="Document size in bytes",
        ge=0,
        example=1024,
    )
    raw_content: Optional[str] = Field(
        default=None,
        description="Extracted raw content from the document",
    )

    # --- processing state -------------------------------------------------
    extract_status: ProcessStatus = Field(
        default=ProcessStatus.INITIALIZED,
        description="Extraction status",
    )
    embedding_status: ProcessStatus = Field(
        default=ProcessStatus.INITIALIZED,
        description="Embedding status",
    )
    analyze_status: ProcessStatus = Field(
        default=ProcessStatus.INITIALIZED,
        description="Analysis status",
    )

    def to_dto(self) -> "DocumentDTO":
        """Return a ``DocumentDTO`` populated from this request's fields.

        Every field declared on the request is passed through by name;
        DTO fields not present on the request keep their own defaults.
        """
        # Kept local rather than as a class attribute so pydantic does not
        # try to interpret it as a model field.
        copied_fields = (
            "name",
            "title",
            "mime_type",
            "user_description",
            "url",
            "document_size",
            "raw_content",
            "extract_status",
            "embedding_status",
            "analyze_status",
        )
        dto_kwargs = {field: getattr(self, field) for field in copied_fields}
        return DocumentDTO(**dto_kwargs)