Spaces:

DrAbdulmalek
/

OmniFile-Processor

Running

OmniFile-Processor / modules /core /structure.py

Dr. Abdulmalek

deploy: OmniFile AI Processor v4.3.0

900df0b 7 days ago

10.3 kB

	"""
	وحدة البنية الأساسية للمستندات (Document Structure Module)
	=============================================================
	تعريف مخطط البيانات لنتائج OCR باستخدام نماذج Pydantic v2.
	جميع مكونات OCR تنتج وتستهلك هذه الأنواع المهيكلة.

	Defines the data schema for OCR results using Pydantic v2 models.
	All OCR components produce and consume these structured types.

	المصدر: دمج من مشروع arabic-ocr-pro
	Source: Merged from arabic-ocr-pro project

	OmniFile AI Processor - وحدة معالجة الملفات الذكية
	"""

	from __future__ import annotations

	from enum import Enum
	from typing import Optional

	from pydantic import BaseModel, Field


	class BlockType(str, Enum):
	    """Kinds of content block a document region can be classified as.

	    Every member is also a ``str``, so it compares equal to its lowercase
	    value (for example ``BlockType.TABLE == "table"``) and can be looked up
	    from that value with ``BlockType("table")``.
	    """

	    # Declaration order below is the order produced when iterating the enum.
	    TEXT = "text"
	    HEADING = "heading"
	    TABLE = "table"
	    IMAGE = "image"
	    FOOTER = "footer"
	    HEADER = "header"
	    UNKNOWN = "unknown"


	class BBox(BaseModel):
	    """Axis-aligned bounding box stored as a top-left corner plus a size.

	    All four fields are validated as integers greater than or equal to zero;
	    the field descriptions state that they are measured in pixels.

	    Attributes:
	        x: Left edge x-coordinate.
	        y: Top edge y-coordinate.
	        width: Horizontal extent of the box.
	        height: Vertical extent of the box.
	    """

	    x: int = Field(ge=0, description="Left edge x-coordinate in pixels")
	    y: int = Field(ge=0, description="Top edge y-coordinate in pixels")
	    width: int = Field(ge=0, description="Width in pixels")
	    height: int = Field(ge=0, description="Height in pixels")

	    @property
	    def x2(self) -> int:
	        """Right edge x-coordinate, i.e. ``x + width``."""
	        return self.width + self.x

	    @property
	    def y2(self) -> int:
	        """Bottom edge y-coordinate, i.e. ``y + height``."""
	        return self.height + self.y

	    @property
	    def area(self) -> int:
	        """Area of the box, ``width * height``."""
	        return self.height * self.width

	    @property
	    def center(self) -> tuple[int, int]:
	        """Center point as ``(cx, cy)``; each half-extent is floor-divided."""
	        half_width = self.width // 2
	        half_height = self.height // 2
	        return (self.x + half_width, self.y + half_height)

	    def intersection(self, other: BBox) -> Optional[BBox]:
	        """Return the overlapping region of this box and ``other``.

	        Args:
	            other: The box to intersect with.

	        Returns:
	            A new ``BBox`` covering the overlap, or ``None`` when the overlap
	            has no positive width or no positive height (boxes that are
	            disjoint or that only touch along an edge or corner).
	        """
	        left = max(self.x, other.x)
	        top = max(self.y, other.y)
	        right = min(self.x2, other.x2)
	        bottom = min(self.y2, other.y2)

	        overlap_width = right - left
	        overlap_height = bottom - top
	        if overlap_width <= 0 or overlap_height <= 0:
	            return None

	        return BBox(x=left, y=top, width=overlap_width, height=overlap_height)

	    def iou(self, other: BBox) -> float:
	        """Return the intersection-over-union ratio of this box and ``other``.

	        Args:
	            other: The box to compare against.

	        Returns:
	            ``0.0`` when the boxes have no overlap (or the union area is
	            zero); otherwise the overlap area divided by the union area.
	        """
	        overlap = self.intersection(other)
	        if overlap is None:
	            return 0.0

	        shared = overlap.area
	        combined = self.area + other.area - shared
	        # Guard the division even though a non-None overlap has positive area.
	        return 0.0 if combined == 0 else shared / combined

	    def contains(self, other: BBox) -> bool:
	        """Return ``True`` when ``other`` lies entirely inside this box.

	        Edges are inclusive: a box contains itself.
	        """
	        starts_inside = self.x <= other.x and self.y <= other.y
	        ends_inside = other.x2 <= self.x2 and other.y2 <= self.y2
	        return starts_inside and ends_inside


	class OCRToken(BaseModel):
	    """A single OCR result token (a word or a group of characters).

	    Attributes:
	        text: Recognized text content.
	        bbox: Bounding box of the token in the source image.
	        confidence: Recognition confidence, validated to lie in ``[0.0, 1.0]``.
	        engine: Name of the OCR engine that produced the token; defaults to
	            an empty string.
	    """

	    text: str = Field(description="Recognized text content")
	    bbox: BBox = Field(description="Bounding box in the source image")
	    confidence: float = Field(ge=0.0, le=1.0, description="Recognition confidence score")
	    engine: str = Field(default="", description="Source OCR engine name")

	    def is_arabic(self) -> bool:
	        """Return ``True`` when ``text`` holds at least one Arabic-script character.

	        The check is a code-point range test over the Arabic Unicode blocks
	        listed below. The Presentation Forms-B range deliberately stops at
	        U+FEFC: the block's final code point, U+FEFF, is ZERO WIDTH NO-BREAK
	        SPACE (the byte-order mark), and a stray BOM must not make a token
	        count as Arabic.
	        """
	        arabic_ranges = (
	            ("\u0600", "\u06FF"),  # Arabic
	            ("\u0750", "\u077F"),  # Arabic Supplement
	            ("\u08A0", "\u08FF"),  # Arabic Extended-A
	            ("\uFB50", "\uFDFF"),  # Arabic Presentation Forms-A
	            ("\uFE70", "\uFEFC"),  # Arabic Presentation Forms-B, minus U+FEFF (BOM)
	        )
	        return any(
	            low <= ch <= high
	            for ch in self.text
	            for low, high in arabic_ranges
	        )

	    def is_empty(self) -> bool:
	        """Return ``True`` when ``text`` is empty or contains only whitespace."""
	        return not self.text.strip()


	class DocumentBlock(BaseModel):
	    """One content block of a document (paragraph, heading, table, ...).

	    Attributes:
	        block_type: Classification of the block; defaults to ``BlockType.TEXT``.
	        tokens: OCR tokens that belong to the block.
	        bbox: Bounding box of the whole block, or ``None`` when not set.
	        confidence: Stored average confidence in ``[0.0, 1.0]``. It is a plain
	            field; ``compute_confidence`` does not write to it.
	        table_data: Table content as a list of rows of cell strings, or
	            ``None`` when not set.
	    """

	    block_type: BlockType = Field(default=BlockType.TEXT, description="Type of content block")
	    tokens: list[OCRToken] = Field(default_factory=list, description="OCR tokens in this block")
	    bbox: Optional[BBox] = Field(default=None, description="Bounding box of the entire block")
	    confidence: float = Field(default=0.0, ge=0.0, le=1.0, description="Average confidence score")
	    table_data: Optional[list[list[str]]] = Field(
	        default=None,
	        description="Structured table data (rows x cols) if this is a table block",
	    )

	    def get_text(self) -> str:
	        """Return the text of all non-empty tokens joined by single spaces."""
	        words = [tok.text for tok in self.tokens if not tok.is_empty()]
	        return " ".join(words)

	    def compute_confidence(self) -> float:
	        """Return the mean confidence over the block's non-empty tokens.

	        Returns:
	            The arithmetic mean of the token confidences, or ``0.0`` when the
	            block has no non-empty tokens. The result is only returned; the
	            ``confidence`` field is left unchanged.
	        """
	        scores = [tok.confidence for tok in self.tokens if not tok.is_empty()]
	        if scores:
	            return sum(scores) / len(scores)
	        return 0.0


	class DocumentPage(BaseModel):
	    """A single page of a document together with its detected blocks.

	    Attributes:
	        page_number: Page index, validated to be at least 1.
	        width: Page width, validated to be non-negative.
	        height: Page height, validated to be non-negative.
	        blocks: Content blocks detected on the page.
	        image_path: Path to the rendered page image, or ``None`` when not set.
	    """

	    page_number: int = Field(ge=1, description="1-based page index")
	    width: int = Field(ge=0, description="Page width in pixels")
	    height: int = Field(ge=0, description="Page height in pixels")
	    blocks: list[DocumentBlock] = Field(default_factory=list, description="Detected content blocks")
	    image_path: Optional[str] = Field(default=None, description="Path to the rendered page image")

	    def get_full_text(self) -> str:
	        """Return the text of every block on the page, one block per line.

	        Each block's text is stripped of surrounding whitespace; blocks whose
	        text is empty after stripping are left out.
	        """
	        stripped_texts = (block.get_text().strip() for block in self.blocks)
	        return "\n".join(text for text in stripped_texts if text)


	class DocumentMetadata(BaseModel):
	    """Descriptive metadata attached to a processed document.

	    Every field has a default, so the model can be built with no arguments.

	    Attributes:
	        filename: Original source file name; defaults to an empty string.
	        file_size: File size in bytes, non-negative; defaults to ``0``.
	        page_count: Total number of pages, non-negative; defaults to ``0``.
	        processing_time: Processing time in seconds, non-negative; defaults
	            to ``0.0``.
	        engine_used: Primary OCR engine used; defaults to an empty string.
	        preprocessing_applied: Names of the preprocessing steps that were
	            applied; defaults to a new empty list.
	    """

	    filename: str = Field(
	        default="",
	        description="Original source file name",
	    )
	    file_size: int = Field(
	        default=0,
	        ge=0,
	        description="File size in bytes",
	    )
	    page_count: int = Field(
	        default=0,
	        ge=0,
	        description="Total number of pages",
	    )
	    processing_time: float = Field(
	        default=0.0,
	        ge=0.0,
	        description="Processing time in seconds",
	    )
	    engine_used: str = Field(
	        default="",
	        description="Primary OCR engine used",
	    )
	    preprocessing_applied: list[str] = Field(
	        default_factory=list,
	        description="List of preprocessing steps that were applied",
	    )


	class Document(BaseModel):
	    """Complete OCR result for one document.

	    Attributes:
	        pages: Pages of the document with their OCR results.
	        metadata: Metadata describing the document; defaults to a
	            ``DocumentMetadata`` built with its own defaults.
	    """

	    pages: list[DocumentPage] = Field(default_factory=list, description="Document pages with OCR results")
	    metadata: DocumentMetadata = Field(default_factory=DocumentMetadata, description="Document metadata")

	    def get_full_text(self) -> str:
	        """Return the text of all pages, each under a ``--- Page N ---`` header.

	        Pages whose text is empty after stripping are skipped. The remaining
	        page sections are separated by one blank line.
	        """
	        numbered_texts = (
	            (page.page_number, page.get_full_text().strip())
	            for page in self.pages
	        )
	        return "\n\n".join(
	            f"--- Page {number} ---\n{body}"
	            for number, body in numbered_texts
	            if body
	        )

	    def get_page_text(self, page_number: int) -> str:
	        """Return the text of the first page whose number equals ``page_number``.

	        Args:
	            page_number: The ``page_number`` value to look for.

	        Returns:
	            That page's full text, or an empty string when no page matches.
	        """
	        matching_texts = (
	            page.get_full_text()
	            for page in self.pages
	            if page.page_number == page_number
	        )
	        # next() pulls only the first match, so later pages are never rendered.
	        return next(matching_texts, "")

	    def get_all_tables(self) -> list[list[list[str]]]:
	        """Return the ``table_data`` of every table block, in page/block order.

	        Only blocks typed ``BlockType.TABLE`` with non-empty ``table_data``
	        contribute to the result.
	        """
	        return [
	            block.table_data
	            for page in self.pages
	            for block in page.blocks
	            if block.block_type == BlockType.TABLE and block.table_data
	        ]

	    @property
	    def total_pages(self) -> int:
	        """Number of entries in ``pages``."""
	        return len(self.pages)

	    @property
	    def total_tokens(self) -> int:
	        """Number of OCR tokens across all blocks of all pages."""
	        token_count = 0
	        for page in self.pages:
	            for block in page.blocks:
	                token_count += len(block.tokens)
	        return token_count