Spaces:

Mazenbs
/

extract_html_full

Running

Mazenbs committed on Dec 3, 2025

Commit

b771743

verified ·

1 Parent(s): 8db4722

Update parser/section_extractor.py

Files changed (1) hide show

parser/section_extractor.py CHANGED Viewed

@@ -1,19 +1,33 @@
 def extract_sections(texts: list):
     """
-    تقسيم النصوص إلى أقسام وفصول
     """
     sections = []
     current = {"name": "", "texts": []}
     for t in texts:
-        t = t.strip()
-        if any(keyword in t for keyword in ["الباب", "الفصل"]):
             if current["texts"] or current["name"]:
                 sections.append(current)
-            current = {"name": t, "texts": []}
         else:
-            current["texts"].append(t)
     if current["texts"] or current["name"]:
         sections.append(current)

+# parser/section_extractor.py
+from helpers.utils import is_section, normalize_digits
 def extract_sections(texts: list):
     """
+    تقسيم النصوص إلى أقسام وفصول باستخدام الأنماط الذكية
     """
     sections = []
     current = {"name": "", "texts": []}
     for t in texts:
+        if not isinstance(t, str):
+            continue
+        t_norm = normalize_digits(t.strip())
+        # هل السطر عنوان قسم أو فصل؟
+        if is_section(t_norm):
+            # لو يوجد قسم سابق → نحفظه
             if current["texts"] or current["name"]:
                 sections.append(current)
+            # نبدأ قسم/فصل جديد
+            current = {"name": t_norm, "texts": []}
         else:
+            # السطور التابعة للقسم الحالي
+            current["texts"].append(t_norm)
+    # إضافة آخر قسم
     if current["texts"] or current["name"]:
         sections.append(current)