Spaces:

Mazenbs
/

extract_html_full

Sleeping

Mazenbs committed on Dec 2, 2025

Commit

19dc866

verified ·

1 Parent(s): 9bbdf9a

Create parser/section_extractor.py

Files changed (1) hide show

parser/section_extractor.py ADDED Viewed

+# parser/section_extractor.py
+import re
+from typing import List, Dict, Any, Tuple
# Arabic heading words that open a new section (roughly "the book",
# "the part" and "the chapter").
SECTION_KEYWORDS = ["الكتاب", "الباب", "الفصل"]


def is_section_line(line: str) -> bool:
    """Return True if *line* starts with a section keyword as a whole word.

    Leading whitespace is ignored. The keyword must be followed by a word
    boundary, so a longer word that merely begins with a keyword is not
    treated as a heading.

    Args:
        line: A single line (or text fragment) to classify.

    Returns:
        True when ``line`` begins, after optional whitespace, with one of
        ``SECTION_KEYWORDS`` followed by a word boundary; False otherwise,
        including for empty input or an empty keyword list.
    """
    if not line or not SECTION_KEYWORDS:
        # An empty alternation would match any line that starts with a word
        # character, so bail out instead of flagging arbitrary text.
        return False
    # Escape each keyword so that regex metacharacters in a keyword are
    # matched literally. The list is read at call time, so runtime changes
    # to SECTION_KEYWORDS are honoured; ``re`` caches the compiled pattern.
    alternatives = "|".join(re.escape(keyword) for keyword in SECTION_KEYWORDS)
    return re.match(rf"\s*(?:{alternatives})\b", line) is not None
def extract_sections_from_text_blocks(
    text_blocks: List[str],
) -> Tuple[List[Dict[str, Any]], List[str]]:
    """Group text fragments into keyword-delimited sections.

    Each block is cut into fragments at two kinds of boundary: after a
    sentence-ending period, and immediately before any whole-word occurrence
    of a keyword from ``SECTION_KEYWORDS`` (wherever it appears in the
    block). Fragments are stripped and empty ones are dropped. A fragment
    for which ``is_section_line`` is true opens a new section and becomes
    its title; every other fragment is appended to the section that is
    currently open, or to the preamble if no section has been opened yet.

    Args:
        text_blocks: Text blocks in document order. Falsy entries (empty
            strings, ``None``) are skipped.

    Returns:
        A ``(sections, preamble)`` tuple. ``sections`` is a list of
        ``{"title": str, "lines": list[str]}`` dicts in order of appearance;
        ``preamble`` holds the fragments seen before the first section.
    """
    sections: List[Dict[str, Any]] = []
    preamble: List[str] = []
    if not text_blocks:
        return sections, preamble

    # Split after a period, except inside a run of periods ("...") or between
    # two digits ("1.2"), so ellipses, decimals and dotted section numbers
    # stay in one fragment.
    boundary = r"(?<=\.)(?!\.)(?!(?<=\d\.)\d)"
    # Derive the keyword boundary from SECTION_KEYWORDS so that splitting and
    # is_section_line always agree on what counts as a heading word.
    keyword_alternatives = "|".join(
        re.escape(keyword) for keyword in SECTION_KEYWORDS
    )
    if keyword_alternatives:
        boundary += rf"|(?=\b(?:{keyword_alternatives})\b)"
    # Compile once; the pattern is the same for every block.
    splitter = re.compile(boundary)

    # Fragments go to the preamble until the first heading is seen, then to
    # the "lines" list of the most recently opened section.
    target = preamble
    for block in text_blocks:
        if not block:
            continue
        for fragment in splitter.split(block):
            fragment = fragment.strip()
            if not fragment:
                continue
            if is_section_line(fragment):
                section = {"title": fragment, "lines": []}
                sections.append(section)
                target = section["lines"]
            else:
                target.append(fragment)
    return sections, preamble