Spaces:
Sleeping
Sleeping
Update parser/assembler.py
Browse files- parser/assembler.py +12 -12
parser/assembler.py
CHANGED
|
@@ -2,33 +2,34 @@ from typing import List, Dict
|
|
| 2 |
from helpers.utils import normalize_digits, extract_article_number, is_article, is_section
|
| 3 |
|
| 4 |
|
| 5 |
-
def extract_title_and_preamble(
|
| 6 |
"""
|
| 7 |
استخراج عنوان القانون والمقدمة، مع التوقف عند أول قسم أو مادة.
|
| 8 |
"""
|
| 9 |
title = ""
|
| 10 |
-
|
| 11 |
-
remaining_texts = []
|
| 12 |
|
| 13 |
# استخراج العنوان
|
| 14 |
-
while
|
| 15 |
-
|
|
|
|
| 16 |
if t.lower() != "html" and t != "":
|
| 17 |
title = t
|
| 18 |
break
|
| 19 |
|
| 20 |
# استخراج المقدمة حتى أول قسم أو مادة
|
| 21 |
-
while
|
| 22 |
-
t =
|
| 23 |
if is_section(t) or is_article(t):
|
| 24 |
break
|
| 25 |
-
|
| 26 |
|
| 27 |
# البقية تعتبر نصوص للتحليل (أقسام ومواد)
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
-
preamble = "\n".join(preamble_lines).strip()
|
| 31 |
-
return title, preamble, remaining_texts
|
| 32 |
|
| 33 |
def extract_articles_from_blocks(blocks: List[Dict[str, str]]) -> List[Dict]:
|
| 34 |
"""
|
|
@@ -49,7 +50,6 @@ def extract_articles_from_blocks(blocks: List[Dict[str, str]]) -> List[Dict]:
|
|
| 49 |
current["text"] += "\n" + t
|
| 50 |
current["blocks"].append(block)
|
| 51 |
else:
|
| 52 |
-
# نص غير مصنف → نحفظه كمادة بدون رقم
|
| 53 |
current = {"number": None, "text": t, "blocks": [block]}
|
| 54 |
|
| 55 |
if current:
|
|
|
|
| 2 |
from helpers.utils import normalize_digits, extract_article_number, is_article, is_section
|
| 3 |
|
| 4 |
|
| 5 |
+
def extract_title_and_preamble(blocks: List[Dict[str, str]]) -> (str, str, List[Dict[str, str]]):
    """
    Extract the law title and the preamble, stopping at the first section or article.

    Args:
        blocks: Ordered block dicts; each block's text is read from its
            "text" key (a missing key is treated as an empty string).
            The list is NOT modified.

    Returns:
        A ``(title, preamble, remaining_blocks)`` tuple:
        * ``title`` - stripped text of the first block that is neither empty
          nor the literal "html" (case-insensitive); "" if there is none.
        * ``preamble`` - the "text" of every block after the title and before
          the first block recognised by ``is_section`` / ``is_article``,
          joined with newlines and stripped.
        * ``remaining_blocks`` - a new list holding the blocks from that first
          section/article onward (sections and articles still to be parsed).
    """
    title = ""
    total = len(blocks)
    cursor = 0

    # Title: walk forward with an index instead of pop(0), so the caller's
    # list is left untouched and the scan stays linear.
    while cursor < total:
        text = blocks[cursor].get("text", "").strip()
        cursor += 1  # the inspected block is consumed whether or not it is the title
        if text.lower() != "html" and text != "":
            title = text
            break

    # Preamble: everything up to (but not including) the first section or article.
    preamble_start = cursor
    while cursor < total:
        text = blocks[cursor].get("text", "").strip()
        if is_section(text) or is_article(text):
            break
        cursor += 1

    # Use .get() here as well: a block without "text" passes the scan above
    # (it reads as ""), so indexing it directly would raise KeyError.
    preamble = "\n".join(
        [b.get("text", "") for b in blocks[preamble_start:cursor]]
    ).strip()

    # Whatever is left is the material for section/article parsing.
    remaining_blocks = blocks[cursor:]
    return title, preamble, remaining_blocks
|
| 32 |
|
|
|
|
|
|
|
| 33 |
|
| 34 |
def extract_articles_from_blocks(blocks: List[Dict[str, str]]) -> List[Dict]:
|
| 35 |
"""
|
|
|
|
| 50 |
current["text"] += "\n" + t
|
| 51 |
current["blocks"].append(block)
|
| 52 |
else:
|
|
|
|
| 53 |
current = {"number": None, "text": t, "blocks": [block]}
|
| 54 |
|
| 55 |
if current:
|