""" Data models for documents and document chunks. """ from typing import List, Dict, Any, Optional import uuid from PIL import Image class RawDocument: """Represents a raw document with extracted content.""" def __init__( self, filename: str, file_type: str, pages: List[Dict[str, Any]], raw_text: str, raw_tables: List[Dict[str, Any]], total_pages: int, metadata: Optional[Dict[str, Any]] = None, ): """ Initialize a RawDocument. Args: filename: Name of the document file file_type: Type of file (e.g., 'pdf', 'docx') pages: List of page dictionaries with 'page_num' and 'text' keys raw_text: Full extracted text from the document raw_tables: List of tables extracted from the document total_pages: Total number of pages in the document metadata: Additional metadata (file_path, author, etc.) """ self.filename = filename self.file_type = file_type self.pages = pages self.raw_text = raw_text self.raw_tables = raw_tables self.total_pages = total_pages self.metadata = metadata or {} def __repr__(self) -> str: return f"RawDocument(filename={self.filename}, pages={self.total_pages})" class DocumentChunk: """Represents a chunk of document content with metadata.""" def __init__( self, content: str, chunk_type: str, page_number: int, metadata: Dict[str, Any] = None, chunk_id: str = None, ): """ Initialize a DocumentChunk. Args: content: The text content of the chunk chunk_type: Type of chunk (e.g., 'text', 'table') page_number: Page number where this chunk appears metadata: Additional metadata about the chunk chunk_id: Unique identifier for the chunk (auto-generated if not provided) """ self.content = content self.chunk_type = chunk_type self.page_number = page_number self.metadata = metadata or {} self.chunk_id = chunk_id or str(uuid.uuid4()) def __repr__(self) -> str: return ( f"DocumentChunk(type={self.chunk_type}, page={self.page_number}, " f"length={len(self.content)})" ) class TableExtraction: """Represents a table extracted from a document.""" def __init__( self, headers: List[str], rows: List[List[str]], page_number: int, schema_summary: str, table_id: str = None, ): """ Initialize a TableExtraction. Args: headers: List of column headers rows: List of rows, each containing cell values page_number: Page number where this table appears schema_summary: Summary description of the table schema table_id: Unique identifier for the table (auto-generated if not provided) """ self.headers = headers self.rows = rows self.page_number = page_number self.schema_summary = schema_summary self.table_id = table_id or str(uuid.uuid4()) def __repr__(self) -> str: return ( f"TableExtraction(columns={len(self.headers)}, " f"rows={len(self.rows)}, page={self.page_number})" ) class ProcessedDocument: """Represents a fully processed document with text chunks and tables.""" def __init__( self, filename: str, text_chunks: List[DocumentChunk], tables: List["TableExtraction"], total_pages: int, file_type: str, images: Optional[List["ImageExtraction"]] = None, layout: Optional["LayoutExtraction"] = None, metadata: Optional["MetadataExtraction"] = None, ): """ Initialize a ProcessedDocument. Args: filename: Name of the document file text_chunks: List of text chunks extracted from the document tables: List of tables extracted from the document total_pages: Total number of pages in the document file_type: Type of file (e.g., 'pdf', 'docx') images: List of images extracted from the document (Phase 2) layout: Layout information (Phase 2) metadata: Document metadata (Phase 2) """ self.filename = filename self.text_chunks = text_chunks self.tables = tables self.total_pages = total_pages self.file_type = file_type self.images = images or [] self.layout = layout self.metadata = metadata def __repr__(self) -> str: return ( f"ProcessedDocument(filename={self.filename}, " f"text_chunks={len(self.text_chunks)}, " f"tables={len(self.tables)}, " f"images={len(self.images)})" ) class ImageExtraction: """Represents an image extracted from a document.""" def __init__( self, image: Image.Image, page_number: int, image_index: int, width: int, height: int, format: str, image_id: str = None, ): """ Initialize an ImageExtraction. Args: image: PIL Image object page_number: Page number where this image appears image_index: Index of image on the page width: Image width in pixels height: Image height in pixels format: Image format (png, jpg, etc.) image_id: Unique identifier for the image (auto-generated if not provided) """ self.image = image self.page_number = page_number self.image_index = image_index self.width = width self.height = height self.format = format self.image_id = image_id or str(uuid.uuid4()) def __repr__(self) -> str: return ( f"ImageExtraction(page={self.page_number}, " f"size={self.width}x{self.height}, format={self.format})" ) class LayoutExtraction: """Represents document layout and structure information.""" def __init__( self, sections: List[Dict[str, Any]], hierarchy: Dict[str, Any], page_layouts: List[Dict[str, Any]], total_pages: int, ): """ Initialize a LayoutExtraction. Args: sections: List of document sections with hierarchy info hierarchy: Document hierarchy tree page_layouts: Layout information per page total_pages: Total number of pages """ self.sections = sections self.hierarchy = hierarchy self.page_layouts = page_layouts self.total_pages = total_pages def __repr__(self) -> str: return ( f"LayoutExtraction(sections={len(self.sections)}, " f"pages={self.total_pages})" ) class MetadataExtraction: """Represents document metadata.""" def __init__( self, title: Optional[str] = None, author: Optional[str] = None, subject: Optional[str] = None, keywords: Optional[List[str]] = None, creator: Optional[str] = None, producer: Optional[str] = None, creation_date: Optional[str] = None, modification_date: Optional[str] = None, page_count: Optional[int] = None, custom_properties: Optional[Dict[str, Any]] = None, ): """ Initialize a MetadataExtraction. Args: title: Document title author: Document author subject: Document subject keywords: List of keywords creator: Creator application producer: Producer application creation_date: Creation date modification_date: Last modification date page_count: Number of pages custom_properties: Additional custom properties """ self.title = title self.author = author self.subject = subject self.keywords = keywords or [] self.creator = creator self.producer = producer self.creation_date = creation_date self.modification_date = modification_date self.page_count = page_count self.custom_properties = custom_properties or {} def __repr__(self) -> str: return ( f"MetadataExtraction(title={self.title}, " f"author={self.author}, pages={self.page_count})" )