Spaces:

Mazenbs
/

extract_html_full

Running

App Files Files Community

Mazenbs committed on Dec 2, 2025

Commit

828df36

verified ·

1 Parent(s): 20ee4ec

Update parser/section_extractor.py

Browse files

Files changed (1) hide show

parser/section_extractor.py +21 -50

parser/section_extractor.py CHANGED Viewed

@@ -1,67 +1,38 @@
 import re
 from typing import List, Dict, Any, Tuple
-from parser.article_extractor import ARTICLE_PATTERN
# Arabic structural headings that open a new section:
# "the book", "the part", "the chapter", "the division".
SECTION_KEYWORDS = ["الكتاب", "الباب", "الفصل", "القسم"]

# Matches a line whose first non-blank token is one of SECTION_KEYWORDS as a
# whole word. Keywords are escaped so a future entry containing a regex
# metacharacter cannot silently change the pattern.
SECTION_RE = re.compile(
    rf"^\s*(?:{'|'.join(re.escape(keyword) for keyword in SECTION_KEYWORDS)})\b",
    re.IGNORECASE,
)


def is_section_line(line: str) -> bool:
    """Return True when *line* starts with a section keyword.

    Leading whitespace is tolerated, and the keyword must end at a word
    boundary, so a longer word that merely begins with a keyword does not
    count.
    """
    return bool(SECTION_RE.match(line))
# The standalone word "مادة" ("article"); compiled once rather than on every call.
_ARTICLE_WORD_RE = re.compile(r"\bمادة\b")


def split_title_and_following(line: str) -> Tuple[str, str]:
    """Split a heading line into its title and any article text that follows.

    The line is cut at the first standalone occurrence of the word "مادة".

    Returns:
        A ``(title, follow)`` pair, both stripped of surrounding whitespace.
        ``follow`` is the text from the article word onwards. When the word
        is absent, or nothing but whitespace precedes it, the whole stripped
        line is returned as the title and ``follow`` is ``""``.
    """
    match = _ARTICLE_WORD_RE.search(line)
    if match is None:
        return line.strip(), ""

    cut = match.start()
    title = line[:cut].strip()
    if not title:
        # The line *starts* with the article word: there is no separate title.
        return line.strip(), ""
    return title, line[cut:].strip()
 def extract_sections_from_text_blocks(text_blocks: List[str]) -> Tuple[List[Dict[str, Any]], List[str]]:
-    """
-    ترجع:
-      - sections: قائمة بالأقسام وكل قسم يحتوي "title" و "lines"
-      - preamble: المقدمة الحقيقية، تتوقف عند أول قسم أو أول مادة
-    """
-    sections: List[Dict[str, Any]] = []
-    preamble: List[str] = []
-    current: Dict[str, Any] = None
-    found_structure = False  # يعني: وجدنا أول قسم أو أول مادة
     for block in text_blocks:
-        for raw_line in block.splitlines():
-            line = raw_line.strip()
-            if not line:
-                continue
-            # التحقق من بداية القسم أولاً
-            if not found_structure and is_section_line(line):
-                found_structure = True
-            # التحقق من بداية المادة
-            elif not found_structure and ARTICLE_PATTERN.match(line):
-                found_structure = True
-            if not found_structure:
-                preamble.append(line)
                 continue
-            # بدأنا الأقسام
-            if is_section_line(line):
-                title, follow = split_title_and_following(line)
-                current = {"title": title, "lines": []}
                 sections.append(current)
-                if follow:
-                    current["lines"].append(follow)
                 continue
-            if current is None:
-                current = {"title": "", "lines": []}
-                sections.append(current)
-            current["lines"].append(line)
     return sections, preamble

+# parser/section_extractor.py
 import re
 from typing import List, Dict, Any, Tuple
# Arabic structural headings that open a new section:
# "the book", "the part", "the chapter".
SECTION_KEYWORDS = ["الكتاب", "الباب", "الفصل"]

# Both patterns are derived from SECTION_KEYWORDS so the keyword list lives in
# exactly one place, and both are compiled once at import time instead of on
# every call / every block.
_SECTION_ALTERNATION = "|".join(re.escape(keyword) for keyword in SECTION_KEYWORDS)
_SECTION_START_RE = re.compile(rf"^(?:{_SECTION_ALTERNATION})\b")
# Zero-width split points: immediately after a period, or immediately before
# a section keyword that stands as a whole word.
_PART_SPLIT_RE = re.compile(rf"(?<=\.)|(?=\b(?:{_SECTION_ALTERNATION})\b)")


def is_section_line(line: str) -> bool:
    """Return True when *line* begins with a section keyword as a whole word.

    No leading whitespace is skipped; callers are expected to strip first.
    """
    return bool(_SECTION_START_RE.match(line))


def extract_sections_from_text_blocks(text_blocks: List[str]) -> Tuple[List[Dict[str, Any]], List[str]]:
    """Split *text_blocks* into a preamble and keyword-titled sections.

    Each block is cut after every period and before every standalone section
    keyword. The resulting parts are stripped and empty ones are discarded.
    A part that starts with a section keyword opens a new section and becomes
    its title; any other part is appended to the most recent section, or to
    the preamble if no section has been opened yet. State carries over from
    one block to the next.

    Returns:
        A ``(sections, preamble)`` pair. ``sections`` is a list of dicts with
        keys ``"title"`` (str) and ``"lines"`` (list of str) in document
        order; ``preamble`` is the list of parts seen before the first
        section.
    """
    sections: List[Dict[str, Any]] = []
    preamble: List[str] = []
    current = None  # section currently receiving parts; None while in the preamble

    for block in text_blocks:
        for raw_part in _PART_SPLIT_RE.split(block):
            part = raw_part.strip()
            if not part:
                continue

            if is_section_line(part):
                current = {"title": part, "lines": []}
                sections.append(current)
            elif current is None:
                preamble.append(part)
            else:
                current["lines"].append(part)

    return sections, preamble