extract_html_full / parser /assembler.py
Mazenbs's picture
Update parser/assembler.py
cd3c116 verified
# parser/assembler.py
from typing import List, Dict, Tuple
from helpers.utils import normalize_digits, extract_article_number, is_article, is_section
from helpers.cleaner import extract_law_number_and_year, merge_colon_lines
def extract_title_and_preamble(
    blocks: List[Dict[str, str]],
) -> Tuple[str, str, List[Dict[str, str]]]:
    """Split classified blocks into a title, a preamble and the body blocks.

    Args:
        blocks: Dicts carrying a "type" key; "title" and "preamble" blocks
            must also carry a "text" key.

    Returns:
        A ``(title, preamble, body_blocks)`` tuple. ``title`` and
        ``preamble`` are the newline-joined texts of the matching blocks
        with outer whitespace stripped; ``body_blocks`` are the original
        "body" dicts in input order. Blocks of any other type are ignored.
    """
    title_lines: List[str] = []
    preamble_lines: List[str] = []
    body_blocks: List[Dict[str, str]] = []

    for block in blocks:
        kind = block["type"]
        if kind == "title":
            title_lines.append(block["text"])
        elif kind == "preamble":
            preamble_lines.append(block["text"])
        elif kind == "body":
            # Body blocks are passed through untouched for later stages.
            body_blocks.append(block)

    title = "\n".join(title_lines).strip()
    preamble = "\n".join(preamble_lines).strip()
    return title, preamble, body_blocks
def extract_articles_from_blocks(blocks: List[Dict[str, str]]) -> List[Dict]:
    """Group consecutive blocks into article entries.

    A block whose stripped text satisfies ``is_article`` opens a new entry;
    every following non-article block is appended to that entry on its own
    line. Text that appears before the first article heading is collected
    into an entry whose "number" is ``None``.

    Args:
        blocks: Dicts carrying a "text" key.

    Returns:
        A list of ``{"number": ..., "text": ...}`` dicts in input order. Each
        "text" is the newline-joined, stripped text of the entry after being
        passed through ``merge_colon_lines``.
    """
    articles: List[Dict] = []
    number = None
    lines: List[str] = []  # empty only while no entry has been opened yet

    for block in blocks:
        line = block["text"].strip()

        if not is_article(line):
            # Continuation of the open entry, or the start of an
            # un-numbered leading entry when nothing is open yet.
            lines.append(line)
            continue

        if lines:
            # A new heading closes the entry collected so far.
            finished_text = merge_colon_lines("\n".join(lines).strip())
            articles.append({"number": number, "text": finished_text})
        number = extract_article_number(line)
        lines = [line]

    if lines:
        finished_text = merge_colon_lines("\n".join(lines).strip())
        articles.append({"number": number, "text": finished_text})
    return articles
def extract_sections(blocks: List[Dict[str, str]]) -> List[Dict]:
    """Partition blocks into sections delimited by section headings.

    A block is a heading when its stripped, digit-normalized text satisfies
    ``is_section``. Blocks seen before the first heading are gathered into a
    section with an empty name.

    Args:
        blocks: Dicts carrying a "text" key.

    Returns:
        A list of ``{"name": ..., "texts": [...]}`` dicts in input order,
        where "name" is the normalized heading text and "texts" holds the
        original (unmodified) member blocks. A section with neither a name
        nor members is omitted.
    """
    result: List[Dict] = []
    name = ""
    members: List[Dict[str, str]] = []

    for block in blocks:
        heading = normalize_digits(block["text"].strip())
        if is_section(heading):
            # Close the section in progress, unless it is still empty.
            if members or name:
                result.append({"name": name, "texts": members})
            name, members = heading, []
            continue
        members.append(block)

    if members or name:
        result.append({"name": name, "texts": members})
    return result
def _resolve_law_number_and_year(title: str, preamble: str) -> Tuple:
    """Return ``(law_number, law_year)``, preferring the title over the preamble."""
    info_from_title = extract_law_number_and_year(title)
    # The preamble is only a fallback for whatever the title lacks.
    info_from_preamble = extract_law_number_and_year(preamble)

    law_number = None
    law_year = None
    if info_from_title:
        law_number = info_from_title.get("law_number")
        law_year = info_from_title.get("year")
    if (not law_number or not law_year) and info_from_preamble:
        law_number = law_number or info_from_preamble.get("law_number")
        law_year = law_year or info_from_preamble.get("year")
    return law_number, law_year


def _assemble_section(sec: Dict) -> Dict:
    """Turn one raw section (``{"name", "texts"}``) into its output dict."""
    raw_blocks = sec["texts"]

    # Section content: every block except article headings. The text is
    # stripped before classification so a heading is recognised exactly as
    # extract_articles_from_blocks recognises it (that function also strips
    # before calling is_article).
    # NOTE(review): only heading lines are filtered here; the lines that
    # follow a heading stay in "content" as well as in the article text.
    # Confirm this duplication is intended.
    content = "\n".join(
        b["text"] for b in raw_blocks if not is_article(b["text"].strip())
    ).strip()
    content = merge_colon_lines(content)

    # Shape the articles for the response: entries without a number are
    # emitted as {"tag": text}, numbered ones as {"number", "text"}.
    articles_cleaned = []
    for article in extract_articles_from_blocks(raw_blocks):
        if article["number"] is None:
            articles_cleaned.append({"tag": article["text"]})
        else:
            articles_cleaned.append({
                "number": article["number"],
                "text": article["text"],
            })

    return {
        "title": sec["name"],
        "content": content,
        "articles": articles_cleaned,
    }


def parse_law_from_texts(text_blocks: List[Dict[str, str]], url: str = None) -> Dict:
    """Assemble the structured representation of a law from classified blocks.

    Args:
        text_blocks: Dicts carrying "type" and "text" keys, as consumed by
            ``extract_title_and_preamble``.
        url: Value echoed back under the "url" key of the result; may be
            ``None``.

    Returns:
        A dict with "message" (always "success"), "url", "count" (the number
        of input blocks) and "law", which holds "title", "preamble",
        "number", "year" and the list of "sections". Each section has
        "title", "content" and "articles".
    """
    title, preamble, remaining_blocks = extract_title_and_preamble(text_blocks)
    sections_raw = extract_sections(remaining_blocks)

    # Law number and year: the title takes priority; missing parts are
    # filled in from the preamble.
    law_number, law_year = _resolve_law_number_and_year(title, preamble)

    sections = [_assemble_section(sec) for sec in sections_raw]

    # Final response.
    return {
        "message": "success",
        "url": url,
        "count": len(text_blocks),
        "law": {
            "title": title,
            "preamble": preamble,
            "number": law_number,
            "year": law_year,
            "sections": sections,
        },
    }