Spaces:

Mazenbs
/

extract_html_full

Sleeping

App Files Files Community

Mazenbs committed on Dec 2, 2025

Commit

ef66367

verified ·

1 Parent(s): 3f58b85

Update parser/section_extractor.py

Browse files

Files changed (1) hide show

parser/section_extractor.py +53 -20

parser/section_extractor.py CHANGED Viewed

@@ -1,38 +1,71 @@
 # parser/section_extractor.py
 import re
 from typing import List, Dict, Any, Tuple
 SECTION_KEYWORDS = ["الكتاب", "الباب", "الفصل"]
 def is_section_line(line: str) -> bool:
     """Return True when *line* begins with a section keyword as a whole word.

     The keywords come from the module-level SECTION_KEYWORDS list. The match
     is anchored at the very first character, so leading whitespace prevents
     a match.
     """
     alternatives = "|".join(SECTION_KEYWORDS)
     return re.match(rf"^(?:{alternatives})\b", line) is not None
 def extract_sections_from_text_blocks(text_blocks: List[str]) -> Tuple[List[Dict[str, Any]], List[str]]:
     """Split text blocks into sections and the preamble that precedes them.

     Each block is cut after every "." and in front of every whole-word
     section keyword. Every non-empty, stripped piece is then classified:

     * a piece accepted by ``is_section_line`` opens a new section whose
       title is that piece;
     * a piece seen before any section has been opened goes to the preamble;
     * any other piece is appended to the ``"lines"`` of the latest section.

     Returns:
         ``(sections, preamble)`` where each section is a dict with the keys
         ``"title"`` (str) and ``"lines"`` (list of str).
     """
     preamble = []
     sections = []
     active = None
     seen_section = False
     for block in text_blocks:
         pieces = re.split(r"(?<=\.)|(?=\b(?:الكتاب|الباب|الفصل)\b)", block)
         for piece in (chunk.strip() for chunk in pieces):
             if piece == "":
                 continue
             if is_section_line(piece):
                 seen_section = True
                 active = {"title": piece, "lines": []}
                 sections.append(active)
             elif seen_section:
                 # Defensive fallback kept from the original flow: open an
                 # untitled section if none is active.
                 if active is None:
                     active = {"title": "", "lines": []}
                     sections.append(active)
                 active["lines"].append(piece)
             else:
                 preamble.append(piece)
     return sections, preamble

 # parser/section_extractor.py
 import re
 from typing import List, Dict, Any, Tuple
+from parser.article_extractor import ARTICLE_KEYWORD
 SECTION_KEYWORDS = ["الكتاب", "الباب", "الفصل"]
 # Matches only when the line *starts* (after optional whitespace) with a
 # section keyword as a whole word; a keyword appearing later in the line does
 # not count. Keywords are escaped so that adding one containing a regex
 # metacharacter cannot corrupt the pattern. No re.IGNORECASE: the keywords
 # are Arabic, which has no letter case, so the flag had no effect.
 SECTION_RE = re.compile(
     rf"^\s*(?:{'|'.join(map(re.escape, SECTION_KEYWORDS))})\b"
 )


 def is_section_line(line: str) -> bool:
     """Return True when *line* opens with one of SECTION_KEYWORDS.

     Leading whitespace is tolerated. The keyword must be followed by a word
     boundary, so a longer word that merely starts with a keyword is rejected.
     """
     return SECTION_RE.match(line) is not None
def split_title_and_following(line: str) -> Tuple[str, str]:
    """Separate a section heading from article text that shares its line.

    A heading line may also contain the beginning of an article, e.g.
    'الفصل الاول ... مادة(1) ...'. The line is cut at the first whole-word
    occurrence of ARTICLE_KEYWORD.

    Returns:
        ``(title, rest)``, both stripped. ``title`` is the text before the
        keyword and ``rest`` starts with the keyword itself. When the keyword
        does not occur, returns ``(line.strip(), "")``.
    """
    # NOTE(review): ARTICLE_KEYWORD is interpolated into the pattern as-is;
    # this presumes it is a plain word (or an intentional regex fragment) --
    # confirm against parser.article_extractor.
    match = re.search(rf"\b{ARTICLE_KEYWORD}\b", line)
    if match is None:
        return line.strip(), ""
    cut = match.start()
    return line[:cut].strip(), line[cut:].strip()
 def extract_sections_from_text_blocks(text_blocks: List[str]) -> Tuple[List[Dict[str, Any]], List[str]]:
     """Group the lines of *text_blocks* into titled sections.

     Every block is split into physical lines (order preserved); blank lines
     are dropped and the rest are stripped. Each remaining line is handled as
     follows:

     * A line accepted by ``is_section_line`` opens a new section. Its title
       is the first element returned by ``split_title_and_following`` (or the
       whole line if that element is empty); a non-empty second element is
       stored as the section's first line so it can be processed later.
     * A line seen before any section has been opened goes to the preamble.
     * Any other line is appended to the most recently opened section.

     Returns:
         ``(sections, preamble)`` where each section is a dict with the keys
         ``"title"`` (str) and ``"lines"`` (list of str), and ``preamble`` is
         the list of lines that precede the first section.
     """
     sections: List[Dict[str, Any]] = []
     preamble: List[str] = []
     # Most recently opened section; stays None until the first heading, which
     # is exactly the "still in the preamble" condition.
     current = None
     for block in text_blocks:
         for raw_line in block.splitlines():
             line = raw_line.strip()
             if not line:
                 continue
             if is_section_line(line):
                 title, follow = split_title_and_following(line)
                 current = {"title": title or line, "lines": []}
                 sections.append(current)
                 # Article text found on the heading line is kept as the
                 # section's first line.
                 if follow:
                     current["lines"].append(follow)
             elif current is None:
                 preamble.append(line)
             else:
                 current["lines"].append(line)
     return sections, preamble