Spaces:

Mazenbs
/

extract_html_full

Running

Mazenbs committed on Dec 3, 2025

Commit

e5a8aa5

verified ·

1 Parent(s): 8734fe8

Update parser/article_extractor.py

Files changed (1) hide show

parser/article_extractor.py CHANGED Viewed

@@ -1,19 +1,34 @@
 import re
 def extract_articles(texts: list):
     articles = []
-    current = {"number": None, "text": ""}
     for t in texts:
-        m = re.match(r"مادة\s*\((\d+)\)", t)
         if m:
-            if current["number"]:
                 articles.append(current)
             current = {"number": m.group(1), "text": ""}
         else:
-            current["text"] += " " + t
-    if current["number"]:
         articles.append(current)
     return articles

 import re
 # Article heading at the start of a fragment: the word "مادة" followed by a
 # number, optionally wrapped in parentheses ("مادة (1)", "مادة(1)", "مادة 1").
 # The leading \s* tolerates indentation left over from text extraction.
 _ARTICLE_HEADING = re.compile(r"\s*مادة\s*\(?(\d+)\)?")


 def extract_articles(texts: list) -> list:
     """Split text fragments into legal articles and collect each article's text.

     A fragment that starts with an article heading opens a new article; any
     text following the heading on the same fragment becomes the beginning of
     that article's body. Every later fragment is appended to the open article
     until the next heading is seen. Fragments that appear before the first
     heading are ignored.

     Args:
         texts: Ordered text fragments (strings) to scan.

     Returns:
         A list of ``{"number": str, "text": str}`` dicts in order of
         appearance. ``number`` is the digit string captured from the heading;
         ``text`` is the article's fragments, each stripped of surrounding
         whitespace and joined with single spaces (empty if the article has
         no body).

     Note:
         Any fragment that begins with the heading pattern is treated as a new
         article, so body text that happens to start with a reference such as
         "مادة 5 ..." will also open one.
     """
     articles = []
     number = None  # number of the article currently being collected
     parts = []     # non-empty, stripped body fragments of that article

     for t in texts:
         m = _ARTICLE_HEADING.match(t)
         if m:
             # Flush the previous article before starting a new one.
             if number is not None:
                 articles.append({"number": number, "text": " ".join(parts)})
             number = m.group(1)
             parts = []
             # Keep whatever follows the heading on the same fragment.
             fragment = t[m.end():].strip()
         elif number is None:
             # Preamble before the first article heading is ignored.
             continue
         else:
             fragment = t.strip()

         # Skip blank fragments so they do not inject runs of spaces.
         if fragment:
             parts.append(fragment)

     if number is not None:
         articles.append({"number": number, "text": " ".join(parts)})
     return articles