Spaces:

DrAbdulmalek
/

OmniFile-Processor

Sleeping

OmniFile-Processor / modules /vision /text_reconstructor.py

Dr. Abdulmalek

deploy: OmniFile AI Processor v4.3.0

900df0b 20 days ago

22.5 kB

	"""
	مُعيد تجميع النصوص
	=====================
	إعادة تجميع الكلمات المكتشفة من OCR إلى نصوص مترابطة
	مع دعم خاص للنصوص العربية (RTL).

	القدرات:
	- تجميع الكلمات بناءً على إحداثياتها (x, y)
	- تجميع الكلمات في سطور حسب القرب العمودي
	- دعم النص العربي RTL باستخدام arabic-reshaper و python-bidi
	- التعامل مع النصوص المختلطة (عربي + إنجليزي)
	"""

	import logging
	import re
	from typing import Optional

	logger = logging.getLogger(__name__)


	class TextReconstructor:
	    """
	    Rebuild readable text from word-level OCR results.

	    Each word is a dict holding the recognised ``text`` and its bounding box:
	    ``x``/``y`` (top-left corner) and ``w``/``h`` (width and height). Words
	    are grouped into lines by vertical proximity and ordered inside each line
	    according to the text direction (right-to-left for Arabic). Display
	    shaping uses ``arabic-reshaper`` and ``python-bidi`` when both import.

	    Example:
	        >>> reconstructor = TextReconstructor(line_threshold=15)
	        >>> words = [
	        ...     {"text": "مرحبا", "x": 200, "y": 10, "w": 50, "h": 20},
	        ...     {"text": "بالعالم", "x": 140, "y": 10, "w": 60, "h": 20},
	        ...     {"text": "Hello", "x": 10, "y": 50, "w": 40, "h": 20},
	        ... ]
	        >>> text = reconstructor.reconstruct(words, direction="rtl")
	    """

	    # Code-point ranges treated as Arabic script. The Arabic-Indic digits
	    # (U+0660-U+0669) already sit inside the first range. The last range stops
	    # at U+FEFC so that U+FEFF (byte-order mark) is not counted as Arabic.
	    _ARABIC_RANGES = (
	        (0x0600, 0x06FF),  # Arabic
	        (0x0750, 0x077F),  # Arabic Supplement
	        (0x08A0, 0x08FF),  # Arabic Extended-A
	        (0xFB50, 0xFDFF),  # Arabic Presentation Forms-A
	        (0xFE70, 0xFEFC),  # Arabic Presentation Forms-B
	    )

	    # Keys a word needs before it can be positioned on the page.
	    _GEOMETRY_KEYS = ("x", "y", "w", "h")

	    def __init__(
	        self,
	        line_threshold: float = 15.0,
	        word_gap_threshold: float = 50.0,
	        default_direction: str = "auto",
	    ) -> None:
	        """
	        Configure the reconstructor and probe the optional Arabic libraries.

	        Args:
	            line_threshold: Largest vertical distance between a word's centre
	                and the current line's centre for the word to join that line.
	            word_gap_threshold: Horizontal gap threshold. It is stored for
	                callers, but line building currently separates every pair of
	                words with a single space regardless of the gap.
	            default_direction: Direction used by ``reconstruct`` when the call
	                itself asks for ``"auto"``: ``"auto"``, ``"rtl"`` or ``"ltr"``.
	        """
	        self.line_threshold = line_threshold
	        self.word_gap_threshold = word_gap_threshold
	        self.default_direction = default_direction

	        # Record which optional shaping libraries can be imported.
	        self._has_reshaper = self._check_library(
	            "arabic_reshaper", "arabic-reshaper"
	        )
	        self._has_bidi = self._check_library("bidi", "python-bidi")

	        if not self._has_reshaper:
	            logger.warning(
	                "arabic-reshaper غير مثبت. النص العربي قد لا يظهر بشكل صحيح. "
	                "قم بالتثبيت: pip install arabic-reshaper"
	            )
	        if not self._has_bidi:
	            logger.warning(
	                "python-bidi غير مثبت. اتجاه النص قد لا يكون صحيحاً. "
	                "قم بالتثبيت: pip install python-bidi"
	            )

	    @staticmethod
	    def _check_library(import_name: str, package_name: str) -> bool:
	        """
	        Return True when ``import_name`` can be imported.

	        Args:
	            import_name: Module name to import.
	            package_name: Distribution name; accepted for call compatibility
	                and not used by the check itself.
	        """
	        import importlib

	        try:
	            importlib.import_module(import_name)
	        except ImportError:
	            return False
	        return True

	    @classmethod
	    def _usable_words(
	        cls, words: list[dict], require_geometry: bool = True
	    ) -> list[dict]:
	        """
	        Keep the words that carry non-blank string text.

	        Args:
	            words: Raw OCR words; ``None`` or an empty list yields ``[]``.
	            require_geometry: Also demand the ``x``, ``y``, ``w``, ``h`` keys.

	        Returns:
	            The usable words in their original order.
	        """
	        usable: list[dict] = []
	        for word in words or []:
	            text = word.get("text", "")
	            # Non-string text (e.g. None) is skipped instead of crashing.
	            if not isinstance(text, str) or not text.strip():
	                continue
	            if require_geometry and not all(
	                key in word for key in cls._GEOMETRY_KEYS
	            ):
	                continue
	            usable.append(word)
	        return usable

	    # ------------------------------------------------------------------
	    # Public API
	    # ------------------------------------------------------------------

	    def reconstruct(
	        self,
	        words: list[dict],
	        direction: str = "auto",
	    ) -> str:
	        """
	        Join a list of OCR words into text, one output line per text line.

	        Args:
	            words: Word dicts with ``text``, ``x``, ``y``, ``w`` and ``h``.
	                Entries with blank text or missing geometry are ignored.
	            direction: ``"rtl"`` or ``"ltr"`` to force a direction. Any other
	                value (including ``"auto"``) defers to ``default_direction``;
	                if that is not ``"rtl"``/``"ltr"`` either, the direction is
	                detected from the characters in ``words``.

	        Returns:
	            The reconstructed text, or ``""`` when no usable word exists.
	        """
	        valid_words = self._usable_words(words)
	        if not valid_words:
	            return ""

	        hint = direction
	        if hint not in ("rtl", "ltr"):
	            # Honour the instance-level default before falling back to
	            # automatic detection.
	            hint = self.default_direction
	        resolved_direction = self._detect_direction(valid_words, hint)

	        lines = self._group_into_lines(valid_words)
	        ordered_lines = [
	            self._order_line(line_words, resolved_direction)
	            for line_words in lines
	        ]
	        return "\n".join(ordered_lines).strip()

	    def reconstruct_with_direction(
	        self,
	        words: list[dict],
	        direction: str = "rtl",
	    ) -> str:
	        """
	        Reconstruct text with an explicit direction and shape RTL output.

	        Args:
	            words: OCR word dicts.
	            direction: ``"rtl"`` or ``"ltr"``. Any other value logs a warning
	                and falls back to ``reconstruct(words, direction="auto")``.

	        Returns:
	            The reconstructed text. For ``"rtl"`` the text is additionally
	            passed through Arabic reshaping when both libraries are available.
	        """
	        if direction not in ("rtl", "ltr"):
	            logger.warning(
	                "اتجاه غير معروف '%s' - سيتم استخدام auto", direction
	            )
	            return self.reconstruct(words, direction="auto")

	        text = self.reconstruct(words, direction=direction)

	        if direction == "rtl" and self._has_reshaper and self._has_bidi:
	            text = self._apply_arabic_reshaping(text)

	        return text

	    def get_statistics(self, words: list[dict]) -> dict:
	        """
	        Summarise a list of OCR words.

	        Args:
	            words: OCR word dicts.

	        Returns:
	            ``{"total_words": 0}`` when no word has usable text. Otherwise a
	            dict with ``total_words``, ``total_lines`` (counted over the words
	            that have full geometry), ``arabic_words``, ``english_words``
	            (every word that contains no Arabic character), ``arabic_ratio``
	            and the auto-detected ``direction``.
	        """
	        valid_words = self._usable_words(words, require_geometry=False)
	        if not valid_words:
	            return {"total_words": 0}

	        # Line grouping reads y and h, so only positioned words take part.
	        positioned = [
	            word for word in valid_words
	            if all(key in word for key in self._GEOMETRY_KEYS)
	        ]
	        lines = self._group_into_lines(positioned)

	        arabic_count = sum(
	            1 for word in valid_words if self._is_arabic_text(word["text"])
	        )
	        total = len(valid_words)

	        return {
	            "total_words": total,
	            "total_lines": len(lines),
	            "arabic_words": arabic_count,
	            "english_words": total - arabic_count,
	            "arabic_ratio": arabic_count / max(1, total),
	            "direction": self._detect_direction(valid_words, "auto"),
	        }

	    # ------------------------------------------------------------------
	    # Internals - grouping and ordering
	    # ------------------------------------------------------------------

	    def _group_into_lines(
	        self, words: list[dict]
	    ) -> list[list[dict]]:
	        """
	        Group words into text lines by vertical proximity.

	        Words are visited top to bottom. A word joins the current line when
	        its vertical centre is within ``line_threshold`` of the line's centre,
	        where the line's centre is the mean top ``y`` of its words plus half
	        the height of the line's first word.

	        Args:
	            words: Words that all carry ``y`` and ``h``.

	        Returns:
	            Lines from top to bottom, each a list of words; ``[]`` for no
	            words.
	        """
	        if not words:
	            return []

	        ordered = sorted(words, key=lambda w: w["y"])

	        lines: list[list[dict]] = []
	        current_line: list[dict] = [ordered[0]]
	        top_sum = ordered[0]["y"]  # running sum of y over current_line

	        for word in ordered[1:]:
	            line_center = top_sum / len(current_line) + current_line[0]["h"] / 2
	            word_center = word["y"] + word["h"] / 2

	            if abs(word_center - line_center) <= self.line_threshold:
	                current_line.append(word)
	                top_sum += word["y"]
	            else:
	                lines.append(current_line)
	                current_line = [word]
	                top_sum = word["y"]

	        lines.append(current_line)

	        # Keep the lines ordered by their mean top coordinate.
	        lines.sort(key=lambda line: sum(w["y"] for w in line) / len(line))
	        return lines

	    def _order_line(
	        self,
	        line_words: list[dict],
	        direction: str,
	    ) -> str:
	        """
	        Order the words of one line and join them with single spaces.

	        Args:
	            line_words: Words that belong to the same line.
	            direction: ``"rtl"`` orders by descending ``x`` (rightmost word
	                first); anything else orders by ascending ``x``.

	        Returns:
	            The text of the line.
	        """
	        if not line_words:
	            return ""

	        ordered = sorted(
	            line_words,
	            key=lambda w: w["x"],
	            reverse=(direction == "rtl"),
	        )
	        texts = (word["text"].strip() for word in ordered)
	        return " ".join(text for text in texts if text)

	    @staticmethod
	    def _calculate_gap(
	        word1: dict, word2: dict, direction: str
	    ) -> float:
	        """
	        Return the horizontal gap in pixels between two words.

	        The gap is the distance from the right edge of the left-hand word to
	        the left edge of the right-hand word, clamped at zero for overlapping
	        boxes. The geometry is the same for both text directions.

	        Args:
	            word1: First word (needs ``x`` and ``w``).
	            word2: Second word (needs ``x`` and ``w``).
	            direction: Text direction; kept for call compatibility.

	        Returns:
	            The non-negative gap.
	        """
	        if word1["x"] < word2["x"]:
	            left_word, right_word = word1, word2
	        else:
	            left_word, right_word = word2, word1
	        return max(0, right_word["x"] - (left_word["x"] + left_word["w"]))

	    # ------------------------------------------------------------------
	    # Internals - direction detection
	    # ------------------------------------------------------------------

	    def _detect_direction(
	        self, words: list[dict], hint: str
	    ) -> str:
	        """
	        Resolve the text direction from a hint or from the characters.

	        Args:
	            words: Words whose ``text`` is inspected.
	            hint: ``"rtl"`` or ``"ltr"`` is returned unchanged; any other
	                value triggers detection.

	        Returns:
	            ``"rtl"`` when Arabic characters outnumber ASCII letters,
	            otherwise ``"ltr"`` (also when neither kind is present).
	        """
	        if hint in ("rtl", "ltr"):
	            return hint

	        arabic_chars = 0
	        latin_chars = 0
	        for word in words:
	            for char in word.get("text", ""):
	                if self._is_arabic_char(char):
	                    arabic_chars += 1
	                elif self._is_latin_char(char):
	                    latin_chars += 1

	        if arabic_chars == 0 and latin_chars == 0:
	            return "ltr"

	        if arabic_chars > latin_chars:
	            logger.debug(
	                "اتجاه RTL مكتشف (حروف عربية: %d، لاتينية: %d)",
	                arabic_chars,
	                latin_chars,
	            )
	            return "rtl"

	        logger.debug(
	            "اتجاه LTR مكتشف (حروف عربية: %d، لاتينية: %d)",
	            arabic_chars,
	            latin_chars,
	        )
	        return "ltr"

	    @staticmethod
	    def _is_arabic_char(char: str) -> bool:
	        """Return True when ``char`` falls inside one of the Arabic ranges."""
	        code = ord(char)
	        return any(
	            start <= code <= end
	            for start, end in TextReconstructor._ARABIC_RANGES
	        )

	    @staticmethod
	    def _is_latin_char(char: str) -> bool:
	        """Return True when ``char`` is an ASCII letter (A-Z or a-z)."""
	        return ("A" <= char <= "Z") or ("a" <= char <= "z")

	    @staticmethod
	    def _is_arabic_text(text: str) -> bool:
	        """
	        Return True when ``text`` contains at least one Arabic character.

	        Arabic-Indic digits and Arabic punctuation count as Arabic.

	        Args:
	            text: Text to inspect; empty or ``None`` gives ``False``.
	        """
	        if not text:
	            return False
	        return any(TextReconstructor._is_arabic_char(char) for char in text)

	    @staticmethod
	    def _is_latin_text(text: str) -> bool:
	        """
	        Return True when ``text`` contains at least one ASCII letter.

	        Args:
	            text: Text to inspect; empty or ``None`` gives ``False``.
	        """
	        if not text:
	            return False
	        return any(TextReconstructor._is_latin_char(char) for char in text)

	    # ------------------------------------------------------------------
	    # Arabic reshaping
	    # ------------------------------------------------------------------

	    def _apply_arabic_reshaping(self, text: str) -> str:
	        """
	        Shape the Arabic runs of ``text`` for display.

	        Each line is split into Arabic and non-Arabic segments. Arabic
	        segments go through ``arabic_reshaper.reshape`` and then
	        ``bidi.algorithm.get_display``; other segments are copied unchanged.

	        Args:
	            text: Raw text, possibly multi-line.

	        Returns:
	            The shaped text. If the libraries are missing or shaping fails,
	            a warning is logged and ``text`` is returned unchanged.
	        """
	        if not text:
	            return text

	        try:
	            import arabic_reshaper
	            from bidi.algorithm import get_display

	            reshaped_lines: list[str] = []
	            for line in text.split("\n"):
	                if not line.strip():
	                    reshaped_lines.append(line)
	                    continue

	                pieces: list[str] = []
	                for segment in self._split_mixed_text(line):
	                    chunk = segment["text"]
	                    if segment["type"] == "arabic":
	                        # Shape only the core text and put the surrounding
	                        # whitespace back where it was: handing it to
	                        # get_display would move a trailing space to the
	                        # front and glue this segment to the next one.
	                        core = chunk.strip()
	                        leading = chunk[: len(chunk) - len(chunk.lstrip())]
	                        trailing = chunk[len(chunk.rstrip()):]
	                        shaped = get_display(arabic_reshaper.reshape(core))
	                        chunk = leading + shaped + trailing
	                    pieces.append(chunk)

	                reshaped_lines.append("".join(pieces))

	            return "\n".join(reshaped_lines)

	        except Exception as e:
	            # Best effort: shaping problems must not lose the plain text.
	            logger.warning("فشل في إعادة تشكيل النص العربي: %s", e)
	            return text

	    @staticmethod
	    def _split_mixed_text(text: str) -> list[dict]:
	        """
	        Split text into consecutive Arabic and non-Arabic segments.

	        Spaces, tabs and newlines never start a segment; they are appended to
	        the segment being built, so concatenating the segment texts gives
	        back the input.

	        Args:
	            text: Text to split.

	        Returns:
	            Segments as ``{"text": ..., "type": "arabic" | "other"}``; ``[]``
	            for empty or whitespace-only input.
	        """
	        if not text:
	            return []

	        segments: list[dict] = []
	        buffer = ""
	        buffer_type = None

	        for char in text:
	            if char in (" ", "\t", "\n"):
	                buffer += char
	                continue

	            if TextReconstructor._is_arabic_char(char):
	                char_type = "arabic"
	            else:
	                char_type = "other"

	            if buffer_type is None or char_type == buffer_type:
	                # Appending (not assigning) keeps leading whitespace that
	                # was collected before the first visible character.
	                buffer_type = char_type
	                buffer += char
	            else:
	                segments.append({"text": buffer, "type": buffer_type})
	                buffer_type = char_type
	                buffer = char

	        if buffer.strip():
	            segments.append({"text": buffer, "type": buffer_type})

	        return segments

	    def reconstruct_mixed_paragraph(
	        self,
	        words: list[dict],
	    ) -> str:
	        """
	        Reconstruct a paragraph that mixes Arabic and Latin words.

	        Every line is ordered in reading order for its dominant script, with
	        runs of the other script kept in their own reading order. The result
	        is passed through Arabic reshaping when both libraries are available.

	        Args:
	            words: OCR word dicts.

	        Returns:
	            The reconstructed text, or ``""`` when no usable word exists.
	        """
	        valid_words = self._usable_words(words)
	        if not valid_words:
	            return ""

	        result_lines: list[str] = []
	        for line_words in self._group_into_lines(valid_words):
	            arabic_words = [
	                word for word in line_words
	                if self._is_arabic_text(word["text"])
	            ]
	            english_words = [
	                word for word in line_words
	                if self._is_latin_text(word["text"])
	            ]
	            result_lines.append(
	                self._merge_mixed_line(arabic_words, english_words, line_words)
	            )

	        full_text = "\n".join(result_lines)

	        if self._has_reshaper and self._has_bidi:
	            full_text = self._apply_arabic_reshaping(full_text)

	        return full_text.strip()

	    @staticmethod
	    def _merge_mixed_line(
	        arabic_words: list[dict],
	        english_words: list[dict],
	        all_words: list[dict],
	    ) -> str:
	        """
	        Put the words of one mixed-script line into reading order.

	        The line is right-to-left when it has more Arabic words than Latin
	        words, otherwise left-to-right. Words are first placed in that base
	        direction by the horizontal centre of their boxes; each run of
	        consecutive words of the opposite script is then reversed so that it
	        reads in its own direction. Words sharing a position are all kept.

	        Args:
	            arabic_words: Words of the line that contain Arabic characters;
	                only the count is used.
	            english_words: Words of the line that contain ASCII letters;
	                only the count is used.
	            all_words: Every word of the line (needs ``text``, ``x``, ``w``).

	        Returns:
	            The words joined with single spaces; ``""`` for an empty line.
	        """
	        if not all_words:
	            return ""

	        base_is_rtl = len(arabic_words) > len(english_words)

	        def center_x(word: dict) -> float:
	            return word["x"] + word["w"] / 2

	        def is_embedded(word: dict) -> bool:
	            # True for words that read against the base direction.
	            has_arabic = TextReconstructor._is_arabic_text(word["text"])
	            if base_is_rtl:
	                has_latin = TextReconstructor._is_latin_text(word["text"])
	                return has_latin and not has_arabic
	            return has_arabic

	        in_base_order = sorted(all_words, key=center_x, reverse=base_is_rtl)

	        reading_order: list[dict] = []
	        embedded_run: list[dict] = []
	        for word in in_base_order:
	            if is_embedded(word):
	                embedded_run.append(word)
	                continue
	            reading_order.extend(reversed(embedded_run))
	            embedded_run = []
	            reading_order.append(word)
	        reading_order.extend(reversed(embedded_run))

	        return " ".join(word["text"].strip() for word in reading_order)