syedmohaiminulhoque's picture
feat: Implement Phase 2 enhancements for multi-modal document comparison
b73db8b
"""
Data models for documents and document chunks.
"""
from typing import List, Dict, Any, Optional
import uuid
from PIL import Image
class RawDocument:
    """Holds the content extracted from a single document before processing."""

    def __init__(
        self,
        filename: str,
        file_type: str,
        pages: List[Dict[str, Any]],
        raw_text: str,
        raw_tables: List[Dict[str, Any]],
        total_pages: int,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """
        Store the extracted content and descriptive fields of a document.

        Args:
            filename: Name of the document file
            file_type: Type of file (e.g., 'pdf', 'docx')
            pages: List of page dictionaries with 'page_num' and 'text' keys
            raw_text: Full extracted text from the document
            raw_tables: List of tables extracted from the document
            total_pages: Total number of pages in the document
            metadata: Additional metadata (file_path, author, etc.); a
                falsy value (None or an empty dict) is stored as a new
                empty dict
        """
        # Identity of the source file.
        self.filename, self.file_type = filename, file_type
        # Extracted content, stored exactly as supplied by the caller.
        self.pages, self.raw_text, self.raw_tables = pages, raw_text, raw_tables
        self.total_pages = total_pages
        # Guarantee a dict so attribute consumers never have to None-check.
        self.metadata = metadata if metadata else {}

    def __repr__(self) -> str:
        """Return a short summary with the filename and page count."""
        name, page_total = self.filename, self.total_pages
        return f"RawDocument(filename={name}, pages={page_total})"
class DocumentChunk:
    """Represents a chunk of document content with metadata."""

    def __init__(
        self,
        content: str,
        chunk_type: str,
        page_number: int,
        metadata: Optional[Dict[str, Any]] = None,
        chunk_id: Optional[str] = None,
    ):
        """
        Initialize a DocumentChunk.

        Args:
            content: The text content of the chunk
            chunk_type: Type of chunk (e.g., 'text', 'table')
            page_number: Page number where this chunk appears
            metadata: Additional metadata about the chunk; None or an
                empty dict is stored as a new empty dict
            chunk_id: Unique identifier for the chunk; when None or an
                empty string, a random UUID4 string is generated
        """
        self.content = content
        self.chunk_type = chunk_type
        self.page_number = page_number
        # A fresh dict per instance, so chunks never share default metadata.
        self.metadata = metadata or {}
        # Falsy ids (None, "") are replaced so every chunk has a usable id.
        self.chunk_id = chunk_id or str(uuid.uuid4())

    def __repr__(self) -> str:
        """Return a short summary: chunk type, page, and content length."""
        return (
            f"DocumentChunk(type={self.chunk_type}, page={self.page_number}, "
            f"length={len(self.content)})"
        )
class TableExtraction:
    """Represents a table extracted from a document."""

    def __init__(
        self,
        headers: List[str],
        rows: List[List[str]],
        page_number: int,
        schema_summary: str,
        table_id: Optional[str] = None,
    ):
        """
        Initialize a TableExtraction.

        Args:
            headers: List of column headers
            rows: List of rows, each containing cell values
            page_number: Page number where this table appears
            schema_summary: Summary description of the table schema
            table_id: Unique identifier for the table; when None or an
                empty string, a random UUID4 string is generated
        """
        self.headers = headers
        self.rows = rows
        self.page_number = page_number
        self.schema_summary = schema_summary
        # Falsy ids (None, "") are replaced so every table has a usable id.
        self.table_id = table_id or str(uuid.uuid4())

    def __repr__(self) -> str:
        """Return a short summary: column count, row count, and page."""
        return (
            f"TableExtraction(columns={len(self.headers)}, "
            f"rows={len(self.rows)}, page={self.page_number})"
        )
class ProcessedDocument:
    """Bundles everything produced by processing one document file."""

    def __init__(
        self,
        filename: str,
        text_chunks: List[DocumentChunk],
        tables: List["TableExtraction"],
        total_pages: int,
        file_type: str,
        images: Optional[List["ImageExtraction"]] = None,
        layout: Optional["LayoutExtraction"] = None,
        metadata: Optional["MetadataExtraction"] = None,
    ):
        """
        Store the processed pieces of a document.

        Args:
            filename: Name of the document file
            text_chunks: List of text chunks extracted from the document
            tables: List of tables extracted from the document
            total_pages: Total number of pages in the document
            file_type: Type of file (e.g., 'pdf', 'docx')
            images: List of images extracted from the document (Phase 2);
                a falsy value (None or an empty list) is stored as a new
                empty list
            layout: Layout information (Phase 2); stored as given, may be None
            metadata: Document metadata (Phase 2); stored as given, may be None
        """
        # Core results.
        self.filename = filename
        self.text_chunks, self.tables = text_chunks, tables
        self.total_pages, self.file_type = total_pages, file_type
        # Phase 2 extras: only `images` is normalised, so len() always works.
        self.images = images if images else []
        self.layout, self.metadata = layout, metadata

    def __repr__(self) -> str:
        """Return a short summary: filename plus chunk, table, and image counts."""
        chunk_count = len(self.text_chunks)
        table_count = len(self.tables)
        image_count = len(self.images)
        return (
            f"ProcessedDocument(filename={self.filename}, "
            f"text_chunks={chunk_count}, "
            f"tables={table_count}, "
            f"images={image_count})"
        )
class ImageExtraction:
    """Represents an image extracted from a document."""

    def __init__(
        self,
        image: Image.Image,
        page_number: int,
        image_index: int,
        width: int,
        height: int,
        format: str,
        image_id: Optional[str] = None,
    ):
        """
        Initialize an ImageExtraction.

        The width, height, and format are stored as passed in; they are
        not read from the image object here.

        Args:
            image: PIL Image object
            page_number: Page number where this image appears
            image_index: Index of image on the page
            width: Image width in pixels
            height: Image height in pixels
            format: Image format (png, jpg, etc.)
            image_id: Unique identifier for the image; when None or an
                empty string, a random UUID4 string is generated
        """
        self.image = image
        self.page_number = page_number
        self.image_index = image_index
        self.width = width
        self.height = height
        # NOTE: `format` shadows the builtin inside __init__; the name is
        # part of the public signature, so it is kept for compatibility.
        self.format = format
        # Falsy ids (None, "") are replaced so every image has a usable id.
        self.image_id = image_id or str(uuid.uuid4())

    def __repr__(self) -> str:
        """Return a short summary: page, pixel dimensions, and format."""
        return (
            f"ImageExtraction(page={self.page_number}, "
            f"size={self.width}x{self.height}, format={self.format})"
        )
class LayoutExtraction:
    """Holds the structural description of a document: sections, hierarchy, page layouts."""

    def __init__(
        self,
        sections: List[Dict[str, Any]],
        hierarchy: Dict[str, Any],
        page_layouts: List[Dict[str, Any]],
        total_pages: int,
    ):
        """
        Store the layout information exactly as supplied.

        Args:
            sections: List of document sections with hierarchy info
            hierarchy: Document hierarchy tree
            page_layouts: Layout information per page
            total_pages: Total number of pages
        """
        # No normalisation or copying: the caller's objects are kept as-is.
        self.sections, self.hierarchy = sections, hierarchy
        self.page_layouts, self.total_pages = page_layouts, total_pages

    def __repr__(self) -> str:
        """Return a short summary: section count and page count."""
        section_count = len(self.sections)
        return f"LayoutExtraction(sections={section_count}, pages={self.total_pages})"
class MetadataExtraction:
    """Holds descriptive metadata for a document; every field is optional."""

    def __init__(
        self,
        title: Optional[str] = None,
        author: Optional[str] = None,
        subject: Optional[str] = None,
        keywords: Optional[List[str]] = None,
        creator: Optional[str] = None,
        producer: Optional[str] = None,
        creation_date: Optional[str] = None,
        modification_date: Optional[str] = None,
        page_count: Optional[int] = None,
        custom_properties: Optional[Dict[str, Any]] = None,
    ):
        """
        Store the supplied metadata fields.

        Args:
            title: Document title
            author: Document author
            subject: Document subject
            keywords: List of keywords; a falsy value (None or an empty
                list) is stored as a new empty list
            creator: Creator application
            producer: Producer application
            creation_date: Creation date
            modification_date: Last modification date
            page_count: Number of pages
            custom_properties: Additional custom properties; a falsy value
                (None or an empty dict) is stored as a new empty dict
        """
        # Descriptive fields.
        self.title, self.author, self.subject = title, author, subject
        # Collections are normalised so they can always be iterated.
        self.keywords = keywords if keywords else []
        # Producing software and timestamps, kept as given (may be None).
        self.creator, self.producer = creator, producer
        self.creation_date = creation_date
        self.modification_date = modification_date
        self.page_count = page_count
        self.custom_properties = custom_properties if custom_properties else {}

    def __repr__(self) -> str:
        """Return a short summary: title, author, and page count."""
        summary = f"title={self.title}, author={self.author}, pages={self.page_count}"
        return f"MetadataExtraction({summary})"