Spaces:

DrAbdulmalek
/

OmniFile-Processor

Sleeping

App Files Files Community

DrAbdulmalek committed 29 days ago

Commit

4c2e90c

verified ·

1 Parent(s): 3bce188

Upload modules/vision/text_reconstructor.py with huggingface_hub

Browse files

Files changed (1) hide show

modules/vision/text_reconstructor.py +649 -0

modules/vision/text_reconstructor.py ADDED Viewed

	@@ -0,0 +1,649 @@

+"""
+Text reconstructor
+==================
+Reassembles the words detected by OCR into coherent text,
+with dedicated support for Arabic (RTL) text.
+Capabilities:
+- Position words using their (x, y) coordinates
+- Group words into lines by vertical proximity
+- Support Arabic RTL text via arabic-reshaper and python-bidi
+- Handle mixed text (Arabic + English)
+"""
+import logging
+import re
+from typing import Optional
+logger = logging.getLogger(__name__)
class TextReconstructor:
    """Rebuild coherent text from word-level OCR results.

    Words are grouped into lines by vertical proximity, ordered inside each
    line according to the text direction, and optionally reshaped for display
    with ``arabic-reshaper`` and ``python-bidi`` when both are importable.

    Each word is a dict with a ``text`` string and the box geometry keys
    ``x``, ``y`` (top-left corner), ``w`` and ``h``.

    Example:
        >>> reconstructor = TextReconstructor(line_threshold=15)
        >>> words = [
        ...     {"text": "مرحبا", "x": 200, "y": 10, "w": 50, "h": 20},
        ...     {"text": "بالعالم", "x": 140, "y": 10, "w": 60, "h": 20},
        ...     {"text": "Hello", "x": 10, "y": 50, "w": 40, "h": 20},
        ... ]
        >>> text = reconstructor.reconstruct(words, direction="rtl")
    """

    # Keys a word must carry before it can be positioned.
    _GEOMETRY_KEYS = ("x", "y", "w", "h")

    # Whitespace characters that never start or end a script segment.
    _INLINE_SPACES = (" ", "\t", "\n")

    # Inclusive code-point ranges treated as Arabic script.
    _ARABIC_RANGES = (
        (0x0600, 0x06FF),  # Arabic, including Arabic-Indic digits U+0660-U+0669
        (0x0750, 0x077F),  # Arabic Supplement
        (0x08A0, 0x08FF),  # Arabic Extended-A
        (0xFB50, 0xFDFF),  # Arabic Presentation Forms-A
        # Arabic Presentation Forms-B; stops at U+FEFC so that U+FEFF
        # (byte order mark / zero width no-break space) is not counted.
        (0xFE70, 0xFEFC),
    )

    def __init__(
        self,
        line_threshold: float = 15.0,
        word_gap_threshold: float = 50.0,
        default_direction: str = "auto",
    ) -> None:
        """Configure the reconstructor and probe for the optional libraries.

        Args:
            line_threshold: Largest vertical distance between a word's centre
                and the running centre of a line for the word to join it.
            word_gap_threshold: Horizontal gap threshold in pixels. Stored for
                API compatibility; line building currently separates every
                pair of adjacent words with a single space regardless of it.
            default_direction: Direction used by ``reconstruct`` when it is
                called with ``direction="auto"``. One of ``"auto"``,
                ``"rtl"`` or ``"ltr"``; ``"auto"`` means detect from content.
        """
        self.line_threshold = line_threshold
        self.word_gap_threshold = word_gap_threshold
        self.default_direction = default_direction

        # Probe the optional Arabic display libraries once, up front.
        self._has_reshaper = self._check_library(
            "arabic_reshaper", "arabic-reshaper"
        )
        self._has_bidi = self._check_library("bidi", "python-bidi")

        if not self._has_reshaper:
            logger.warning(
                "arabic-reshaper غير مثبت. النص العربي قد لا يظهر بشكل صحيح. "
                "قم بالتثبيت: pip install arabic-reshaper"
            )
        if not self._has_bidi:
            logger.warning(
                "python-bidi غير مثبت. اتجاه النص قد لا يكون صحيحاً. "
                "قم بالتثبيت: pip install python-bidi"
            )

    @staticmethod
    def _check_library(import_name: str, package_name: str) -> bool:
        """Return True when ``import_name`` can be imported.

        Args:
            import_name: Module name handed to ``__import__``.
            package_name: Distribution name; not used by the check itself,
                it only documents the call site.
        """
        try:
            __import__(import_name)
        except ImportError:
            return False
        return True

    @staticmethod
    def _is_valid_word(word: dict) -> bool:
        """Return True for a word with non-blank string text and full geometry."""
        text = word.get("text", "")
        if not isinstance(text, str) or not text.strip():
            return False
        return all(key in word for key in TextReconstructor._GEOMETRY_KEYS)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def reconstruct(
        self,
        words: list[dict],
        direction: str = "auto",
    ) -> str:
        """Reassemble a list of OCR words into line-separated text.

        Args:
            words: Word dicts (``text``, ``x``, ``y``, ``w``, ``h``). Words
                with blank or non-string text, or missing geometry, are
                ignored.
            direction: ``"rtl"``, ``"ltr"`` or ``"auto"``. With ``"auto"`` the
                instance's ``default_direction`` is applied; if that is also
                ``"auto"`` the direction is detected from the words.

        Returns:
            The reconstructed text, one line per detected row, stripped of
            surrounding whitespace. Empty string when nothing is usable.
        """
        if not words:
            return ""
        valid_words = [w for w in words if self._is_valid_word(w)]
        if not valid_words:
            return ""

        # An explicit argument wins; "auto" defers to the configured default.
        hint = self.default_direction if direction == "auto" else direction
        resolved = self._detect_direction(valid_words, hint)

        lines = self._group_into_lines(valid_words)
        full_text = "\n".join(
            self._order_line(line_words, resolved) for line_words in lines
        )
        return full_text.strip()

    def reconstruct_with_direction(
        self,
        words: list[dict],
        direction: str = "rtl",
    ) -> str:
        """Reassemble words with an explicitly chosen direction.

        Args:
            words: OCR word dicts.
            direction: ``"rtl"`` or ``"ltr"``. Any other value is logged and
                handled as ``"auto"`` (without display reshaping).

        Returns:
            The reconstructed text. For ``"rtl"`` the text is additionally
            reshaped for display when both optional libraries are available.
        """
        if direction not in ("rtl", "ltr"):
            logger.warning(
                "اتجاه غير معروف '%s' - سيتم استخدام auto", direction
            )
            return self.reconstruct(words, direction="auto")

        text = self.reconstruct(words, direction=direction)
        if direction == "rtl" and self._has_reshaper and self._has_bidi:
            text = self._apply_arabic_reshaping(text)
        return text

    def get_statistics(self, words: list[dict]) -> dict:
        """Summarise an OCR result.

        Args:
            words: OCR word dicts.

        Returns:
            ``{"total_words": 0}`` when no usable word is present; otherwise a
            dict with ``total_words``, ``total_lines``, ``arabic_words``,
            ``english_words`` (all non-Arabic words), ``arabic_ratio`` and the
            auto-detected ``direction``.
        """
        if not words:
            return {"total_words": 0}
        # Same validity rule as reconstruct(), so grouping never sees a word
        # without geometry and never receives an empty list.
        valid_words = [w for w in words if self._is_valid_word(w)]
        if not valid_words:
            return {"total_words": 0}

        lines = self._group_into_lines(valid_words)
        total = len(valid_words)
        arabic_count = sum(
            1 for w in valid_words if self._is_arabic_text(w["text"])
        )
        return {
            "total_words": total,
            "total_lines": len(lines),
            "arabic_words": arabic_count,
            "english_words": total - arabic_count,
            "arabic_ratio": arabic_count / total,
            "direction": self._detect_direction(valid_words, "auto"),
        }

    # ------------------------------------------------------------------
    # Internal helpers - grouping and ordering
    # ------------------------------------------------------------------
    def _group_into_lines(
        self, words: list[dict]
    ) -> list[list[dict]]:
        """Group words into lines by vertical proximity.

        Words are visited from top to bottom (sorted by ``y``). A word joins
        the current line when its vertical centre lies within
        ``line_threshold`` of the mean vertical centre of the words already in
        that line; otherwise it opens a new line.

        Args:
            words: Valid word dicts.

        Returns:
            Lines from top to bottom, each a list of word dicts. Empty list
            for empty input.
        """
        if not words:
            return []

        lines: list[list[dict]] = []
        current_line: list[dict] = []
        center_sum = 0.0  # running sum of centres, avoids re-summing the line

        for word in sorted(words, key=lambda w: w["y"]):
            center_y = word["y"] + word["h"] / 2
            if current_line:
                line_center = center_sum / len(current_line)
                if abs(center_y - line_center) > self.line_threshold:
                    lines.append(current_line)
                    current_line = []
                    center_sum = 0.0
            current_line.append(word)
            center_sum += center_y

        lines.append(current_line)
        # Lines are consecutive runs of a y-sorted sequence, so they are
        # already ordered from top to bottom.
        return lines

    def _order_line(
        self,
        line_words: list[dict],
        direction: str,
    ) -> str:
        """Order the words of one line and join them into text.

        Args:
            line_words: Words belonging to the same line.
            direction: ``"rtl"`` orders by descending ``x`` (rightmost word
                first); anything else orders by ascending ``x``.

        Returns:
            The non-blank word texts joined by single spaces.
        """
        if not line_words:
            return ""
        ordered = sorted(
            line_words,
            key=lambda w: w["x"],
            reverse=(direction == "rtl"),
        )
        stripped = (w["text"].strip() for w in ordered)
        return " ".join(text for text in stripped if text)

    @staticmethod
    def _calculate_gap(
        word1: dict, word2: dict, direction: str
    ) -> float:
        """Return the horizontal gap in pixels between two word boxes.

        The gap is the distance from the right edge of the left-hand box to
        the left edge of the right-hand box, clamped at zero for touching or
        overlapping boxes. Which word is "left" is decided from the ``x``
        values, so the result is the same for both reading directions.

        Args:
            word1: First word dict (needs ``x`` and ``w``).
            word2: Second word dict (needs ``x`` and ``w``).
            direction: Reading direction; kept for interface compatibility.

        Returns:
            The non-negative gap.
        """
        if word1["x"] <= word2["x"]:
            left_word, right_word = word1, word2
        else:
            left_word, right_word = word2, word1
        return max(0, right_word["x"] - (left_word["x"] + left_word["w"]))

    # ------------------------------------------------------------------
    # Internal helpers - direction detection
    # ------------------------------------------------------------------
    def _detect_direction(
        self, words: list[dict], hint: str
    ) -> str:
        """Resolve the text direction from a hint or from the words.

        Args:
            words: Word dicts to inspect.
            hint: ``"rtl"`` or ``"ltr"`` is returned unchanged; any other
                value (including ``"auto"``) triggers detection.

        Returns:
            ``"rtl"`` when more than 30% of the non-blank words contain
            Arabic characters, otherwise ``"ltr"``.
        """
        if hint in ("rtl", "ltr"):
            return hint

        texts = (word.get("text", "") for word in words)
        non_blank = [t for t in texts if isinstance(t, str) and t.strip()]
        if not non_blank:
            return "ltr"

        arabic_count = sum(1 for t in non_blank if self._is_arabic_text(t))
        arabic_ratio = arabic_count / len(non_blank)
        if arabic_ratio > 0.3:
            logger.debug(
                "اتجاه RTL مكتشف (نسبة العربية: %.1f%%)",
                arabic_ratio * 100,
            )
            return "rtl"
        logger.debug(
            "اتجاه LTR مكتشف (نسبة الإنجليزية: %.1f%%)",
            (1 - arabic_ratio) * 100,
        )
        return "ltr"

    @staticmethod
    def _is_arabic_char(char: str) -> bool:
        """Return True when ``char`` falls inside one of the Arabic ranges."""
        code = ord(char)
        return any(
            start <= code <= end
            for start, end in TextReconstructor._ARABIC_RANGES
        )

    @staticmethod
    def _is_arabic_text(text: str) -> bool:
        """Return True when ``text`` contains at least one Arabic character.

        Arabic letters, Arabic-Indic digits and Arabic presentation forms all
        count. Empty or missing text yields False.
        """
        if not text:
            return False
        return any(TextReconstructor._is_arabic_char(c) for c in text)

    @staticmethod
    def _is_latin_text(text: str) -> bool:
        """Return True when ``text`` contains at least one ASCII letter."""
        if not text:
            return False
        return any(
            ("A" <= char <= "Z") or ("a" <= char <= "z") for char in text
        )

    # ------------------------------------------------------------------
    # Arabic display reshaping
    # ------------------------------------------------------------------
    def _apply_arabic_reshaping(self, text: str) -> str:
        """Reshape the Arabic parts of ``text`` for display.

        Each line is split into Arabic and non-Arabic segments. Arabic
        segments are passed through ``arabic_reshaper.reshape`` and then
        ``bidi.algorithm.get_display``; other segments are left untouched.
        This is best-effort: on any failure the problem is logged and the
        input is returned unchanged.

        Args:
            text: Text in logical order, possibly multi-line.

        Returns:
            The reshaped text, or ``text`` itself when reshaping fails.
        """
        if not text:
            return text
        try:
            import arabic_reshaper
            from bidi.algorithm import get_display

            spaces = "".join(self._INLINE_SPACES)
            reshaped_lines: list[str] = []
            for line in text.split("\n"):
                if not line.strip():
                    reshaped_lines.append(line)
                    continue

                pieces: list[str] = []
                for segment in self._split_mixed_text(line):
                    raw = segment["text"]
                    if segment["type"] != "arabic":
                        pieces.append(raw)
                        continue
                    # Keep edge whitespace out of the bidi call so the
                    # separator between this segment and its neighbours
                    # stays exactly where it was.
                    lead_len = len(raw) - len(raw.lstrip(spaces))
                    core = raw.strip(spaces)
                    lead = raw[:lead_len]
                    trail = raw[lead_len + len(core):]
                    shaped = get_display(arabic_reshaper.reshape(core))
                    pieces.append(lead + shaped + trail)
                reshaped_lines.append("".join(pieces))
            return "\n".join(reshaped_lines)
        except Exception as e:  # deliberate best-effort boundary
            logger.warning("فشل في إعادة تشكيل النص العربي: %s", e)
            return text

    @staticmethod
    def _split_mixed_text(text: str) -> list[dict]:
        """Split text into consecutive Arabic and non-Arabic segments.

        Spaces, tabs and newlines never start a segment: they are attached to
        the segment being built, and leading ones are attached to the first
        segment. Concatenating the segment texts therefore reproduces
        ``text`` whenever it contains at least one non-space character.

        Args:
            text: Text that may mix scripts.

        Returns:
            ``[{"text": ..., "type": "arabic" | "other"}, ...]`` in input
            order; an empty list for empty or space-only input.
        """
        if not text:
            return []

        segments: list[dict] = []
        buffer: list[str] = []
        current_type = None

        for char in text:
            if char in TextReconstructor._INLINE_SPACES:
                buffer.append(char)
                continue
            if TextReconstructor._is_arabic_char(char):
                char_type = "arabic"
            else:
                char_type = "other"
            if current_type is not None and char_type != current_type:
                # Script changed: close the segment built so far.
                segments.append(
                    {"text": "".join(buffer), "type": current_type}
                )
                buffer = []
            current_type = char_type
            buffer.append(char)

        # current_type stays None only when no non-space character was seen.
        if current_type is not None:
            segments.append({"text": "".join(buffer), "type": current_type})
        return segments

    def reconstruct_mixed_paragraph(
        self,
        words: list[dict],
    ) -> str:
        """Reassemble a paragraph that mixes Arabic and English words.

        Words are grouped into lines, each line is merged by horizontal
        position, and the result is reshaped for display when both optional
        libraries are available.

        Args:
            words: OCR word dicts.

        Returns:
            The reconstructed text, stripped of surrounding whitespace.
        """
        if not words:
            return ""
        valid_words = [w for w in words if self._is_valid_word(w)]
        if not valid_words:
            return ""

        result_lines: list[str] = []
        for line_words in self._group_into_lines(valid_words):
            arabic_sorted = sorted(
                (w for w in line_words if self._is_arabic_text(w["text"])),
                key=lambda w: w["x"],
                reverse=True,
            )
            english_sorted = sorted(
                (w for w in line_words if self._is_latin_text(w["text"])),
                key=lambda w: w["x"],
            )
            result_lines.append(
                self._merge_mixed_line(
                    arabic_sorted, english_sorted, line_words
                )
            )

        full_text = "\n".join(result_lines)
        if self._has_reshaper and self._has_bidi:
            full_text = self._apply_arabic_reshaping(full_text)
        return full_text.strip()

    @staticmethod
    def _merge_mixed_line(
        arabic_words: list[dict],
        english_words: list[dict],
        all_words: list[dict],
    ) -> str:
        """Join the words of one line from left to right.

        Words are ordered by the horizontal centre of their box (ascending)
        and joined by single spaces. Every word is kept, including words
        whose boxes share the same centre.

        Args:
            arabic_words: Arabic words of the line; accepted for interface
                compatibility and not consulted.
            english_words: Latin-script words of the line; accepted for
                interface compatibility and not consulted.
            all_words: Every word of the line.

        Returns:
            The merged line text.
        """
        # NOTE(review): ordering is purely positional; the per-script lists
        # are not used to build direction-aware runs - confirm that is the
        # intended behaviour for mixed lines.
        ordered = sorted(all_words, key=lambda w: w["x"] + w["w"] // 2)
        stripped = (w["text"].strip() for w in ordered)
        return " ".join(text for text in stripped if text)