# parser/assembler.py
"""Assemble classified text blocks into a structured law document."""

from typing import Dict, List, Optional, Tuple

from helpers.utils import (
    normalize_digits,
    extract_article_number,
    is_article,
    is_section,
)
from helpers.cleaner import extract_law_number_and_year, merge_colon_lines


def extract_title_and_preamble(
    blocks: List[Dict[str, str]],
) -> Tuple[str, str, List[Dict[str, str]]]:
    """Split classified blocks into title text, preamble text and body blocks.

    Args:
        blocks: Dicts carrying a ``"type"`` key and a ``"text"`` key.

    Returns:
        A ``(title, preamble, body_blocks)`` tuple. ``title`` and
        ``preamble`` are the newline-joined, stripped texts of the blocks
        typed ``"title"`` and ``"preamble"``. ``body_blocks`` are the blocks
        typed ``"body"``, in their original order. Blocks of any other type
        are not returned.
    """
    title = "\n".join(
        b["text"] for b in blocks if b["type"] == "title"
    ).strip()
    preamble = "\n".join(
        b["text"] for b in blocks if b["type"] == "preamble"
    ).strip()
    body_blocks = [b for b in blocks if b["type"] == "body"]
    return title, preamble, body_blocks


def extract_articles_from_blocks(blocks: List[Dict[str, str]]) -> List[Dict]:
    """Group consecutive blocks into articles.

    A block whose stripped text satisfies ``is_article`` opens a new article;
    every following block is appended to that article (newline-separated)
    until the next article header. Blocks that appear before the first header
    are collected into a single entry whose ``"number"`` is ``None``.

    Args:
        blocks: Dicts carrying a ``"text"`` key.

    Returns:
        A list of ``{"number": ..., "text": ...}`` dicts. Each text has been
        stripped and passed through ``merge_colon_lines``.
    """
    articles: List[Dict] = []
    current = None

    for block in blocks:
        text = block["text"].strip()
        if is_article(text):
            if current is not None:
                # Finalize the previous article before opening the next one.
                current["text"] = merge_colon_lines(current["text"].strip())
                articles.append(current)
            current = {"number": extract_article_number(text), "text": text}
        elif current is not None:
            current["text"] += "\n" + text
        else:
            # Text that precedes any article header.
            current = {"number": None, "text": text}

    if current is not None:
        current["text"] = merge_colon_lines(current["text"].strip())
        articles.append(current)

    return articles


def extract_sections(blocks: List[Dict[str, str]]) -> List[Dict]:
    """Group blocks under the section heading that precedes them.

    A block whose stripped, ``normalize_digits``-processed text satisfies
    ``is_section`` starts a new section named after that processed text.
    Blocks seen before the first heading are collected under a section whose
    name is the empty string.

    Args:
        blocks: Dicts carrying a ``"text"`` key.

    Returns:
        A list of ``{"name": str, "texts": [block, ...]}`` dicts. ``texts``
        holds the original block dicts (not their processed text). A section
        with neither a name nor any blocks is omitted.
    """
    sections: List[Dict] = []
    current: Dict = {"name": "", "texts": []}

    for block in blocks:
        heading = normalize_digits(block["text"].strip())
        if is_section(heading):
            if current["texts"] or current["name"]:
                sections.append(current)
            current = {"name": heading, "texts": []}
        else:
            current["texts"].append(block)

    if current["texts"] or current["name"]:
        sections.append(current)

    return sections


def _resolve_law_number_and_year(title: str, preamble: str) -> Tuple:
    """Return ``(law_number, law_year)``, preferring the title over the preamble."""
    title_info = extract_law_number_and_year(title)
    # The preamble is only a fallback for whatever the title does not provide.
    preamble_info = extract_law_number_and_year(preamble)

    law_number = None
    law_year = None

    if title_info:
        law_number = title_info.get("law_number")
        law_year = title_info.get("year")

    if (not law_number or not law_year) and preamble_info:
        law_number = law_number or preamble_info.get("law_number")
        law_year = law_year or preamble_info.get("year")

    return law_number, law_year


def _build_section(section: Dict) -> Dict:
    """Turn one raw section from ``extract_sections`` into its output form."""
    raw_blocks = section["texts"]

    # Section content: every block except the article header lines. The text
    # is stripped before the is_article test so that this filter agrees with
    # extract_articles_from_blocks, which also tests the stripped text.
    # Blocks that follow an article header are not filtered out here.
    content = "\n".join(
        b["text"] for b in raw_blocks if not is_article(b["text"].strip())
    ).strip()
    content = merge_colon_lines(content)

    # Numbered entries keep number + text; entries without a number (text
    # found before the first article header) are emitted as a "tag".
    articles = [
        {"tag": article["text"]}
        if article["number"] is None
        else {"number": article["number"], "text": article["text"]}
        for article in extract_articles_from_blocks(raw_blocks)
    ]

    return {
        "title": section["name"],
        "content": content,
        "articles": articles,
    }


def parse_law_from_texts(
    text_blocks: List[Dict[str, str]], url: Optional[str] = None
) -> Dict:
    """Build the structured law response from classified text blocks.

    Args:
        text_blocks: Dicts carrying a ``"type"`` key and a ``"text"`` key.
        url: Optional source URL, echoed back unchanged in the response.

    Returns:
        A dict with ``"message"`` (always ``"success"``), ``"url"``,
        ``"count"`` (the number of input blocks) and ``"law"``. ``"law"``
        holds the title, preamble, law number, year and the list of sections,
        each section having ``"title"``, ``"content"`` and ``"articles"``.
    """
    title, preamble, remaining_blocks = extract_title_and_preamble(text_blocks)
    sections_raw = extract_sections(remaining_blocks)

    # Law number and year: the title has priority, the preamble fills gaps.
    law_number, law_year = _resolve_law_number_and_year(title, preamble)

    sections = [_build_section(sec) for sec in sections_raw]

    return {
        "message": "success",
        "url": url,
        "count": len(text_blocks),
        "law": {
            "title": title,
            "preamble": preamble,
            "number": law_number,
            "year": law_year,
            "sections": sections,
        },
    }