Spaces:

syedmohaiminulhoque
/

agentic-doc-sim-streamlit

Running

App Files Files Community

agentic-doc-sim-streamlit / src /models /document.py

syedmohaiminulhoque

feat: Implement Phase 2 enhancements for multi-modal document comparison

b73db8b 17 days ago

raw

history blame contribute delete

8.65 kB

	"""
	Data models for documents and document chunks.
	"""
	from typing import List, Dict, Any, Optional
	import uuid
	from PIL import Image


	class RawDocument:
	    """Holds the unprocessed content pulled out of a single document file."""

	    def __init__(
	        self,
	        filename: str,
	        file_type: str,
	        pages: List[Dict[str, Any]],
	        raw_text: str,
	        raw_tables: List[Dict[str, Any]],
	        total_pages: int,
	        metadata: Optional[Dict[str, Any]] = None,
	    ):
	        """
	        Store the extracted content and descriptive fields of a document.

	        Args:
	            filename: Name of the document file
	            file_type: Type of file (e.g., 'pdf', 'docx')
	            pages: List of page dictionaries with 'page_num' and 'text' keys
	            raw_text: Full extracted text from the document
	            raw_tables: List of tables extracted from the document
	            total_pages: Total number of pages in the document
	            metadata: Additional metadata (file_path, author, etc.); a
	                falsy value (None or an empty dict) becomes a new empty dict
	        """
	        # Resolve the optional argument first so every instance gets its
	        # own dict when the caller supplies nothing.
	        if not metadata:
	            metadata = {}

	        # File identity.
	        self.filename, self.file_type = filename, file_type
	        # Extracted content.
	        self.pages, self.raw_text, self.raw_tables = pages, raw_text, raw_tables
	        # Size and free-form extras.
	        self.total_pages, self.metadata = total_pages, metadata

	    def __repr__(self) -> str:
	        summary = f"RawDocument(filename={self.filename}, pages={self.total_pages})"
	        return summary


	class DocumentChunk:
	    """Represents a chunk of document content with metadata."""

	    def __init__(
	        self,
	        content: str,
	        chunk_type: str,
	        page_number: int,
	        metadata: Optional[Dict[str, Any]] = None,
	        chunk_id: Optional[str] = None,
	    ):
	        """
	        Initialize a DocumentChunk.

	        Args:
	            content: The text content of the chunk
	            chunk_type: Type of chunk (e.g., 'text', 'table')
	            page_number: Page number where this chunk appears
	            metadata: Additional metadata about the chunk; a falsy value
	                (None or an empty dict) is stored as a new empty dict
	            chunk_id: Unique identifier for the chunk; when None or empty,
	                a random UUID4 string is generated
	        """
	        self.content = content
	        self.chunk_type = chunk_type
	        self.page_number = page_number
	        self.metadata = metadata or {}
	        # `or` also replaces an empty-string id with a fresh UUID.
	        self.chunk_id = chunk_id or str(uuid.uuid4())

	    def __repr__(self) -> str:
	        # Report the content length rather than the content itself to keep
	        # the representation short.
	        return (
	            f"DocumentChunk(type={self.chunk_type}, page={self.page_number}, "
	            f"length={len(self.content)})"
	        )


	class TableExtraction:
	    """Represents a table extracted from a document."""

	    def __init__(
	        self,
	        headers: List[str],
	        rows: List[List[str]],
	        page_number: int,
	        schema_summary: str,
	        table_id: Optional[str] = None,
	    ):
	        """
	        Initialize a TableExtraction.

	        Args:
	            headers: List of column headers
	            rows: List of rows, each containing cell values
	            page_number: Page number where this table appears
	            schema_summary: Summary description of the table schema
	            table_id: Unique identifier for the table; when None or empty,
	                a random UUID4 string is generated
	        """
	        self.headers = headers
	        self.rows = rows
	        self.page_number = page_number
	        self.schema_summary = schema_summary
	        # `or` also replaces an empty-string id with a fresh UUID.
	        self.table_id = table_id or str(uuid.uuid4())

	    def __repr__(self) -> str:
	        # Column count is taken from the headers, row count from the rows.
	        return (
	            f"TableExtraction(columns={len(self.headers)}, "
	            f"rows={len(self.rows)}, page={self.page_number})"
	        )


	class ProcessedDocument:
	    """Represents a fully processed document with text chunks and tables."""

	    def __init__(
	        self,
	        filename: str,
	        text_chunks: List["DocumentChunk"],
	        tables: List["TableExtraction"],
	        total_pages: int,
	        file_type: str,
	        images: Optional[List["ImageExtraction"]] = None,
	        layout: Optional["LayoutExtraction"] = None,
	        metadata: Optional["MetadataExtraction"] = None,
	    ):
	        """
	        Initialize a ProcessedDocument.

	        Args:
	            filename: Name of the document file
	            text_chunks: List of text chunks extracted from the document
	            tables: List of tables extracted from the document
	            total_pages: Total number of pages in the document
	            file_type: Type of file (e.g., 'pdf', 'docx')
	            images: List of images extracted from the document (Phase 2);
	                a falsy value is stored as a new empty list
	            layout: Layout information (Phase 2); stored as given, may be None
	            metadata: Document metadata (Phase 2); stored as given, may be None
	        """
	        self.filename = filename
	        self.text_chunks = text_chunks
	        self.tables = tables
	        self.total_pages = total_pages
	        self.file_type = file_type
	        # Only `images` gets a default container; `layout` and `metadata`
	        # stay None when not supplied.
	        self.images = images or []
	        self.layout = layout
	        self.metadata = metadata

	    def __repr__(self) -> str:
	        # Summarise by counts so large documents still print compactly.
	        return (
	            f"ProcessedDocument(filename={self.filename}, "
	            f"text_chunks={len(self.text_chunks)}, "
	            f"tables={len(self.tables)}, "
	            f"images={len(self.images)})"
	        )


	class ImageExtraction:
	    """Represents an image extracted from a document."""

	    def __init__(
	        self,
	        image: Image.Image,
	        page_number: int,
	        image_index: int,
	        width: int,
	        height: int,
	        format: str,
	        image_id: Optional[str] = None,
	    ):
	        """
	        Initialize an ImageExtraction.

	        Args:
	            image: PIL Image object
	            page_number: Page number where this image appears
	            image_index: Index of image on the page
	            width: Image width in pixels
	            height: Image height in pixels
	            format: Image format (png, jpg, etc.)
	            image_id: Unique identifier for the image; when None or empty,
	                a random UUID4 string is generated
	        """
	        self.image = image
	        self.page_number = page_number
	        self.image_index = image_index
	        # Width and height are stored exactly as passed; they are not read
	        # from the image object here.
	        self.width = width
	        self.height = height
	        # NOTE: `format` shadows the builtin inside this method; the name is
	        # kept because callers may pass it by keyword.
	        self.format = format
	        # `or` also replaces an empty-string id with a fresh UUID.
	        self.image_id = image_id or str(uuid.uuid4())

	    def __repr__(self) -> str:
	        return (
	            f"ImageExtraction(page={self.page_number}, "
	            f"size={self.width}x{self.height}, format={self.format})"
	        )


	class LayoutExtraction:
	    """Structural description of a document: sections, hierarchy and page layouts."""

	    def __init__(
	        self,
	        sections: List[Dict[str, Any]],
	        hierarchy: Dict[str, Any],
	        page_layouts: List[Dict[str, Any]],
	        total_pages: int,
	    ):
	        """
	        Store the layout information gathered for a document.

	        Args:
	            sections: List of document sections with hierarchy info
	            hierarchy: Document hierarchy tree
	            page_layouts: Layout information per page
	            total_pages: Total number of pages
	        """
	        # Document-wide structure.
	        self.sections, self.hierarchy = sections, hierarchy
	        # Per-page details and the overall page count.
	        self.page_layouts, self.total_pages = page_layouts, total_pages

	    def __repr__(self) -> str:
	        section_part = f"LayoutExtraction(sections={len(self.sections)}, "
	        page_part = f"pages={self.total_pages})"
	        return section_part + page_part


	class MetadataExtraction:
	    """Descriptive properties of a document; every field is optional."""

	    def __init__(
	        self,
	        title: Optional[str] = None,
	        author: Optional[str] = None,
	        subject: Optional[str] = None,
	        keywords: Optional[List[str]] = None,
	        creator: Optional[str] = None,
	        producer: Optional[str] = None,
	        creation_date: Optional[str] = None,
	        modification_date: Optional[str] = None,
	        page_count: Optional[int] = None,
	        custom_properties: Optional[Dict[str, Any]] = None,
	    ):
	        """
	        Store the metadata fields supplied by the caller.

	        Args:
	            title: Document title
	            author: Document author
	            subject: Document subject
	            keywords: List of keywords; a falsy value becomes a new empty list
	            creator: Creator application
	            producer: Producer application
	            creation_date: Creation date
	            modification_date: Last modification date
	            page_count: Number of pages
	            custom_properties: Additional custom properties; a falsy value
	                becomes a new empty dict
	        """
	        # Descriptive fields.
	        self.title, self.author, self.subject = title, author, subject
	        self.keywords = keywords if keywords else []
	        # Applications that created and produced the file.
	        self.creator, self.producer = creator, producer
	        # Timestamps are kept as the strings passed in.
	        self.creation_date, self.modification_date = creation_date, modification_date
	        self.page_count = page_count
	        self.custom_properties = custom_properties if custom_properties else {}

	    def __repr__(self) -> str:
	        head = f"MetadataExtraction(title={self.title}, "
	        tail = f"author={self.author}, pages={self.page_count})"
	        return head + tail