Spaces:

Mazenbs
/

extract_html_full

Sleeping

Mazenbs committed on Dec 3, 2025

Commit

8734fe8

verified ·

1 Parent(s): 471980d

Update parser/section_extractor.py

Files changed (1) hide show

parser/section_extractor.py CHANGED Viewed

@@ -1,16 +1,20 @@
 def extract_sections(texts: list):
     sections = []
     current = {"name": "", "texts": []}
     for t in texts:
-        if "الباب" in t or "الفصل" in t:
-            if current["texts"]:
                 sections.append(current)
-            current = {"name": t, "texts": []}
         else:
-            current["texts"].append(t)
-    if current["texts"]:
         sections.append(current)
     return sections

 def extract_sections(texts: list):
+    """
+    تقسيم النصوص إلى أقسام (أبواب/فصول) مع تجميع النصوص داخل كل قسم
+    """
     sections = []
     current = {"name": "", "texts": []}
     for t in texts:
+        # التعرف على البواب/الفصول
+        if any(keyword in t for keyword in ["الباب", "الفصل"]):
+            if current["texts"] or current["name"]:
                 sections.append(current)
+            current = {"name": t.strip(), "texts": []}
         else:
+            current["texts"].append(t.strip())
+    if current["texts"] or current["name"]:
         sections.append(current)
     return sections