Spaces:
Running
Running
File size: 4,960 Bytes
ac8768a cdb4964 560009a 392a934 cdb4964 ac8768a cdb4964 349aab0 cdb4964 72a807d cdb4964 349aab0 87cfaef 50eda68 87cfaef cdb4964 228d412 50eda68 127ed78 50eda68 5771f15 228d412 50eda68 d3f0826 5771f15 349aab0 50eda68 127ed78 50eda68 038eebb 87cfaef 560009a 87cfaef cdb4964 560009a 87cfaef 560009a 87cfaef 560009a 87cfaef 560009a 39285a6 d4584e2 709465a d4584e2 709465a d4584e2 9a1cbd4 fa02b5f d4584e2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
# parser/assembler.py
from typing import Dict, List, Optional, Tuple

from helpers.utils import normalize_digits, extract_article_number, is_article, is_section
from helpers.cleaner import extract_law_number_and_year, merge_colon_lines
def extract_title_and_preamble(
    blocks: List[Dict[str, str]],
) -> Tuple[str, str, List[Dict[str, str]]]:
    """Split typed blocks into a title string, a preamble string and body blocks.

    Args:
        blocks: Dicts carrying a ``"type"`` key and a ``"text"`` key.

    Returns:
        A ``(title, preamble, body_blocks)`` tuple. ``title`` and ``preamble``
        are the newline-joined, stripped texts of the ``"title"`` and
        ``"preamble"`` blocks in input order. ``body_blocks`` holds the
        ``"body"`` block dicts themselves. Blocks of any other type are
        dropped.
    """
    title_entries = []
    preamble_entries = []
    body_entries = []

    # Classify every block first; texts are only read afterwards.
    for entry in blocks:
        kind = entry["type"]
        if kind == "title":
            title_entries.append(entry)
        elif kind == "preamble":
            preamble_entries.append(entry)
        elif kind == "body":
            body_entries.append(entry)

    title_text = "\n".join(entry["text"] for entry in title_entries)
    preamble_text = "\n".join(entry["text"] for entry in preamble_entries)
    return title_text.strip(), preamble_text.strip(), body_entries
def extract_articles_from_blocks(blocks: List[Dict[str, str]]) -> List[Dict]:
    """Group consecutive blocks into article entries.

    A block whose stripped text satisfies ``is_article`` opens a new entry;
    every following block is appended to that entry on its own line until the
    next article heading. Blocks that appear before the first heading are
    gathered into a leading entry whose ``"number"`` is ``None``.

    Args:
        blocks: Dicts carrying a ``"text"`` key.

    Returns:
        A list of ``{"number": ..., "text": ...}`` dicts in input order. Each
        ``"text"`` is the stripped, newline-joined block texts passed through
        ``merge_colon_lines``.
    """
    collected = []
    number = None
    # Lines of the entry currently being built; empty means no entry is open.
    lines = []

    for block in blocks:
        line = block["text"].strip()
        if is_article(line):
            if lines:
                # Close the previous entry before opening the new article.
                merged = merge_colon_lines("\n".join(lines).strip())
                collected.append({"number": number, "text": merged})
            number = extract_article_number(line)
            lines = [line]
        else:
            # Continuation of the open entry, or the start of the leading
            # un-numbered entry (number is still None in that case).
            lines.append(line)

    if lines:
        merged = merge_colon_lines("\n".join(lines).strip())
        collected.append({"number": number, "text": merged})
    return collected
def extract_sections(blocks: List[Dict[str, str]]) -> List[Dict]:
    """Partition blocks into sections delimited by section headings.

    A block is a heading when its stripped, digit-normalized text satisfies
    ``is_section``. Blocks that precede the first heading are collected under
    a section with an empty name.

    Args:
        blocks: Dicts carrying a ``"text"`` key.

    Returns:
        A list of ``{"name": ..., "texts": [...]}`` dicts in input order.
        ``"name"`` is the normalized heading text and ``"texts"`` holds the
        original block dicts that follow it. A section with neither a name
        nor any blocks is not emitted.
    """
    grouped = []
    name = ""
    members = []

    for block in blocks:
        heading = normalize_digits(block["text"].strip())
        if not is_section(heading):
            members.append(block)
            continue
        # A new heading closes whatever section was being accumulated.
        if members or name:
            grouped.append({"name": name, "texts": members})
        name, members = heading, []

    if members or name:
        grouped.append({"name": name, "texts": members})
    return grouped
def parse_law_from_texts(
    text_blocks: List[Dict[str, str]], url: Optional[str] = None
) -> Dict:
    """Assemble typed text blocks into a structured law response.

    Args:
        text_blocks: Dicts carrying ``"type"`` and ``"text"`` keys, as
            consumed by ``extract_title_and_preamble``.
        url: Optional source URL, echoed back unchanged in the response.

    Returns:
        A dict with ``"message"``, ``"url"``, ``"count"`` (the number of
        input blocks) and ``"law"``. ``"law"`` holds ``"title"``,
        ``"preamble"``, ``"number"``, ``"year"`` and ``"sections"``. Each
        section is ``{"title", "content", "articles"}``; each article is
        either ``{"number", "text"}`` or, for text that has no article
        number, ``{"tag": text}``.
    """
    title, preamble, remaining_blocks = extract_title_and_preamble(text_blocks)
    sections_raw = extract_sections(remaining_blocks)

    # Law number and year: the title is the preferred source, the preamble
    # is the backup for whatever the title did not provide.
    law_info_title = extract_law_number_and_year(title)
    law_info_preamble = extract_law_number_and_year(preamble)

    law_number = None
    law_year = None
    if law_info_title:
        law_number = law_info_title.get("law_number")
        law_year = law_info_title.get("year")
    if (not law_number or not law_year) and law_info_preamble:
        law_number = law_number or law_info_preamble.get("law_number")
        law_year = law_year or law_info_preamble.get("year")

    sections = []
    for sec in sections_raw:
        raw_blocks = sec["texts"]

        # Section content: every block that is not an article heading.
        # The check runs on stripped text so it agrees with the decision
        # extract_articles_from_blocks makes for the same block.
        # NOTE(review): lines that follow an article heading are kept here
        # and also end up inside that article's text - confirm intended.
        content = "\n".join(
            b["text"] for b in raw_blocks if not is_article(b["text"].strip())
        ).strip()
        content = merge_colon_lines(content)

        # Shape the section's articles for the response: entries without a
        # number are emitted as {"tag": text}.
        articles_cleaned = []
        for article in extract_articles_from_blocks(raw_blocks):
            if article["number"] is None:
                articles_cleaned.append({"tag": article["text"]})
            else:
                articles_cleaned.append({
                    "number": article["number"],
                    "text": article["text"],
                })

        sections.append({
            "title": sec["name"],
            "content": content,
            "articles": articles_cleaned,
        })

    return {
        "message": "success",
        "url": url,
        "count": len(text_blocks),
        "law": {
            "title": title,
            "preamble": preamble,
            "number": law_number,
            "year": law_year,
            "sections": sections,
        },
    }
|