Spaces:
Running
Running
Update parser/article_extractor.py
Browse files- parser/article_extractor.py +20 -5
parser/article_extractor.py
CHANGED
|
@@ -1,19 +1,34 @@
|
|
| 1 |
import re
|
| 2 |
|
| 3 |
def extract_articles(texts: list):
|
|
|
|
|
|
|
|
|
|
| 4 |
articles = []
|
| 5 |
-
current =
|
| 6 |
|
| 7 |
for t in texts:
|
| 8 |
-
|
|
|
|
| 9 |
if m:
|
| 10 |
-
if current
|
|
|
|
|
|
|
| 11 |
articles.append(current)
|
| 12 |
current = {"number": m.group(1), "text": ""}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
else:
|
| 14 |
-
current
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
if current
|
|
|
|
| 17 |
articles.append(current)
|
| 18 |
|
| 19 |
return articles
|
|
|
|
| 1 |
import re
|
| 2 |
|
| 3 |
# Matches an article header such as "مادة 1", "مادة(1)" or "مادة (1)" and
# captures the article number. Compiled once so the loop does not repeat the
# pattern lookup for every line.
_ARTICLE_HEADER = re.compile(r"مادة\s*\(?(\d+)\)?")


def extract_articles(texts: list) -> list:
    """Split a sequence of text lines into legal articles.

    A line that starts (after optional leading whitespace) with an article
    header opens a new article. Any text following the header on the same
    line, and every subsequent line up to the next header, is collected as
    that article's text, joined with single spaces and stripped at both ends.
    Lines that appear before the first header are ignored.

    Args:
        texts: Text lines (strings) in document order.

    Returns:
        A list of ``{"number": str, "text": str}`` dicts, one per article,
        in the order the headers were encountered. Empty if no header is
        found.
    """
    articles = []
    number = None  # number of the article currently being collected
    parts = []     # text fragments belonging to the current article

    for t in texts:
        # Extracted text often carries leading whitespace; without stripping
        # it an indented header would be swallowed into the previous article.
        line = t.lstrip()
        m = _ARTICLE_HEADER.match(line)
        if m:
            if number is not None:
                # Close the previous article before starting a new one.
                articles.append(
                    {"number": number, "text": " ".join(parts).strip()}
                )
            number = m.group(1)
            # Keep whatever follows the header on the same line.
            remaining_text = line[m.end():].strip()
            parts = [remaining_text] if remaining_text else []
        elif number is not None:
            parts.append(t)
        # else: preamble before the first article header is skipped.

    if number is not None:
        # Flush the last article, which has no following header to close it.
        articles.append({"number": number, "text": " ".join(parts).strip()})

    return articles
|