File size: 8,649 Bytes
f02f2d2
 
 
b73db8b
f02f2d2
b73db8b
f02f2d2
 
 
 
 
 
 
 
 
 
 
 
 
b73db8b
f02f2d2
 
 
 
 
 
 
 
 
 
 
b73db8b
f02f2d2
 
 
 
 
 
 
b73db8b
f02f2d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b73db8b
 
 
f02f2d2
 
 
 
 
 
 
 
 
 
b73db8b
 
 
f02f2d2
 
 
 
 
 
b73db8b
 
 
f02f2d2
 
 
 
 
b73db8b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f02f2d2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
"""
Data models for documents and document chunks.
"""
from typing import List, Dict, Any, Optional
import uuid
from PIL import Image


class RawDocument:
    """Container for the unprocessed content extracted from one document."""

    def __init__(
        self,
        filename: str,
        file_type: str,
        pages: List[Dict[str, Any]],
        raw_text: str,
        raw_tables: List[Dict[str, Any]],
        total_pages: int,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """
        Store the extracted content of a document.

        Args:
            filename: Name of the document file
            file_type: Type of file (e.g., 'pdf', 'docx')
            pages: Page dictionaries with 'page_num' and 'text' keys
            raw_text: Full text extracted from the document
            raw_tables: Tables extracted from the document
            total_pages: Number of pages in the document
            metadata: Extra metadata (file_path, author, etc.); a falsy
                value is replaced by a new empty dict
        """
        # Identification of the source file.
        self.filename, self.file_type = filename, file_type
        # Extracted content, stored by reference (no copies are made).
        self.pages, self.total_pages = pages, total_pages
        self.raw_text, self.raw_tables = raw_text, raw_tables
        self.metadata = metadata if metadata else {}

    def __repr__(self) -> str:
        """Return a short summary showing the filename and page count."""
        return f"RawDocument(filename={self.filename}, pages={self.total_pages})"


class DocumentChunk:
    """Represents a chunk of document content with metadata."""

    def __init__(
        self,
        content: str,
        chunk_type: str,
        page_number: int,
        metadata: Optional[Dict[str, Any]] = None,
        chunk_id: Optional[str] = None,
    ):
        """
        Initialize a DocumentChunk.

        Args:
            content: The text content of the chunk
            chunk_type: Type of chunk (e.g., 'text', 'table')
            page_number: Page number where this chunk appears
            metadata: Additional metadata about the chunk; a falsy value
                (None or an empty dict) is replaced by a new empty dict
            chunk_id: Unique identifier for the chunk; a falsy value
                (None or "") is replaced by a random UUID4 string
        """
        self.content = content
        self.chunk_type = chunk_type
        self.page_number = page_number
        # A fresh dict per instance, so chunks never share a default.
        self.metadata = metadata or {}
        self.chunk_id = chunk_id or str(uuid.uuid4())

    def __repr__(self) -> str:
        """Return a summary with the chunk type, page and content length."""
        return (
            f"DocumentChunk(type={self.chunk_type}, page={self.page_number}, "
            f"length={len(self.content)})"
        )


class TableExtraction:
    """Represents a table extracted from a document."""

    def __init__(
        self,
        headers: List[str],
        rows: List[List[str]],
        page_number: int,
        schema_summary: str,
        table_id: Optional[str] = None,
    ):
        """
        Initialize a TableExtraction.

        Args:
            headers: List of column headers
            rows: List of rows, each containing cell values
            page_number: Page number where this table appears
            schema_summary: Summary description of the table schema
            table_id: Unique identifier for the table; a falsy value
                (None or "") is replaced by a random UUID4 string
        """
        self.headers = headers
        self.rows = rows
        self.page_number = page_number
        self.schema_summary = schema_summary
        self.table_id = table_id or str(uuid.uuid4())

    def __repr__(self) -> str:
        """Return a summary with the column count, row count and page."""
        return (
            f"TableExtraction(columns={len(self.headers)}, "
            f"rows={len(self.rows)}, page={self.page_number})"
        )


class ProcessedDocument:
    """Represents a fully processed document with text chunks and tables."""

    def __init__(
        self,
        filename: str,
        text_chunks: List["DocumentChunk"],
        tables: List["TableExtraction"],
        total_pages: int,
        file_type: str,
        images: Optional[List["ImageExtraction"]] = None,
        layout: Optional["LayoutExtraction"] = None,
        metadata: Optional["MetadataExtraction"] = None,
    ):
        """
        Initialize a ProcessedDocument.

        Args:
            filename: Name of the document file
            text_chunks: List of text chunks extracted from the document
            tables: List of tables extracted from the document
            total_pages: Total number of pages in the document
            file_type: Type of file (e.g., 'pdf', 'docx')
            images: List of images extracted from the document (Phase 2);
                a falsy value is replaced by a new empty list
            layout: Layout information (Phase 2); stored as given
            metadata: Document metadata (Phase 2); stored as given, so it
                stays None when omitted
        """
        # Every class named in the signature is written as a string forward
        # reference, so this definition does not depend on where the other
        # model classes appear in the module.
        self.filename = filename
        self.text_chunks = text_chunks
        self.tables = tables
        self.total_pages = total_pages
        self.file_type = file_type
        self.images = images or []
        self.layout = layout
        self.metadata = metadata

    def __repr__(self) -> str:
        """Return a summary with the filename and chunk/table/image counts."""
        return (
            f"ProcessedDocument(filename={self.filename}, "
            f"text_chunks={len(self.text_chunks)}, "
            f"tables={len(self.tables)}, "
            f"images={len(self.images)})"
        )


class ImageExtraction:
    """Represents an image extracted from a document."""

    def __init__(
        self,
        image: Image.Image,
        page_number: int,
        image_index: int,
        width: int,
        height: int,
        format: str,
        image_id: Optional[str] = None,
    ):
        """
        Initialize an ImageExtraction.

        Args:
            image: PIL Image object
            page_number: Page number where this image appears
            image_index: Index of image on the page
            width: Image width in pixels
            height: Image height in pixels
            format: Image format (png, jpg, etc.)
            image_id: Unique identifier for the image; a falsy value
                (None or "") is replaced by a random UUID4 string
        """
        self.image = image
        self.page_number = page_number
        self.image_index = image_index
        # width/height are stored exactly as passed; they are not read
        # from the image object.
        self.width = width
        self.height = height
        # `format` shadows the builtin inside this method only; the name is
        # kept because callers may pass it by keyword.
        self.format = format
        self.image_id = image_id or str(uuid.uuid4())

    def __repr__(self) -> str:
        """Return a summary with the page, pixel size and format."""
        return (
            f"ImageExtraction(page={self.page_number}, "
            f"size={self.width}x{self.height}, format={self.format})"
        )


class LayoutExtraction:
    """Holds the structural description (sections, hierarchy, pages) of a document."""

    def __init__(
        self,
        sections: List[Dict[str, Any]],
        hierarchy: Dict[str, Any],
        page_layouts: List[Dict[str, Any]],
        total_pages: int,
    ):
        """
        Store the layout information as given.

        Args:
            sections: Document sections with hierarchy info
            hierarchy: Document hierarchy tree
            page_layouts: Layout information, one entry per page
            total_pages: Total number of pages
        """
        # All arguments are kept by reference; nothing is copied or checked.
        self.sections, self.hierarchy = sections, hierarchy
        self.page_layouts, self.total_pages = page_layouts, total_pages

    def __repr__(self) -> str:
        """Return a summary with the section count and page count."""
        return (
            f"LayoutExtraction(sections={len(self.sections)}, "
            f"pages={self.total_pages})"
        )


class MetadataExtraction:
    """Holds the descriptive metadata fields of a document."""

    def __init__(
        self,
        title: Optional[str] = None,
        author: Optional[str] = None,
        subject: Optional[str] = None,
        keywords: Optional[List[str]] = None,
        creator: Optional[str] = None,
        producer: Optional[str] = None,
        creation_date: Optional[str] = None,
        modification_date: Optional[str] = None,
        page_count: Optional[int] = None,
        custom_properties: Optional[Dict[str, Any]] = None,
    ):
        """
        Store the metadata fields; every field is optional.

        Args:
            title: Document title
            author: Document author
            subject: Document subject
            keywords: List of keywords; a falsy value becomes a new empty list
            creator: Creator application
            producer: Producer application
            creation_date: Creation date
            modification_date: Last modification date
            page_count: Number of pages
            custom_properties: Additional custom properties; a falsy value
                becomes a new empty dict
        """
        # Descriptive fields.
        self.title, self.author, self.subject = title, author, subject
        # Creator / producer applications.
        self.creator, self.producer = creator, producer
        # Dates and size.
        self.creation_date = creation_date
        self.modification_date = modification_date
        self.page_count = page_count
        # Collections get a per-instance empty container when omitted.
        self.keywords = keywords if keywords else []
        self.custom_properties = custom_properties if custom_properties else {}

    def __repr__(self) -> str:
        """Return a summary with the title, author and page count."""
        return (
            f"MetadataExtraction(title={self.title}, "
            f"author={self.author}, pages={self.page_count})"
        )