Spaces:
Running
Running
| # parser/assembler.py | |
| from typing import List, Dict, Tuple | |
| from helpers.utils import normalize_digits, extract_article_number, is_article, is_section | |
| from helpers.cleaner import extract_law_number_and_year, merge_colon_lines | |
def extract_title_and_preamble(
    blocks: List[Dict[str, str]],
) -> Tuple[str, str, List[Dict[str, str]]]:
    """Split classified blocks into a title, a preamble and the body blocks.

    Args:
        blocks: Dicts that each carry a ``"type"`` and a ``"text"`` key.

    Returns:
        A ``(title, preamble, body_blocks)`` tuple. ``title`` and
        ``preamble`` are the newline-joined, stripped texts of the blocks
        typed ``"title"`` and ``"preamble"`` respectively; ``body_blocks``
        is the list of blocks typed ``"body"``. Blocks of any other type
        are left out of all three results.
    """

    def of_kind(kind: str) -> List[Dict[str, str]]:
        # Keep the input order of the blocks that match the requested type.
        return [entry for entry in blocks if entry["type"] == kind]

    def joined_text(entries: List[Dict[str, str]]) -> str:
        return "\n".join(entry["text"] for entry in entries).strip()

    heading_entries = of_kind("title")
    intro_entries = of_kind("preamble")
    body_entries = of_kind("body")

    return joined_text(heading_entries), joined_text(intro_entries), body_entries
def extract_articles_from_blocks(blocks: List[Dict[str, str]]) -> List[Dict]:
    """Group consecutive blocks into article entries.

    A block whose stripped text satisfies ``is_article`` opens a new entry
    whose ``"number"`` comes from ``extract_article_number``. Every other
    block is appended (newline-separated) to the entry currently open.
    Blocks seen before the first article heading are gathered into an entry
    whose ``"number"`` is ``None``.

    Args:
        blocks: Dicts that each carry a ``"text"`` key.

    Returns:
        A list of ``{"number": ..., "text": ...}`` dicts in input order.
        Each entry's text is stripped and passed through
        ``merge_colon_lines`` when the entry is closed.
    """
    collected: List[Dict] = []
    pending = None

    def close(entry: Dict) -> None:
        # Apply merge_colon_lines to the finished article's text.
        entry["text"] = merge_colon_lines(entry["text"].strip())
        collected.append(entry)

    for item in blocks:
        line = item["text"].strip()

        if not is_article(line):
            if pending is None:
                # Text ahead of any article heading: keep it as an
                # un-numbered entry.
                pending = {"number": None, "text": line}
            else:
                pending["text"] = pending["text"] + "\n" + line
            continue

        # An article heading closes whatever entry was open before it.
        if pending is not None:
            close(pending)
        pending = {"number": extract_article_number(line), "text": line}

    if pending is not None:
        close(pending)
    return collected
def extract_sections(blocks: List[Dict[str, str]]) -> List[Dict]:
    """Partition blocks into sections delimited by section headings.

    Each block's text is stripped and run through ``normalize_digits``; when
    the result satisfies ``is_section`` it becomes the name of a new
    section. All other blocks are stored, unmodified, under the section that
    is currently open. Blocks preceding the first heading land in a section
    whose name is the empty string.

    Args:
        blocks: Dicts that each carry a ``"text"`` key.

    Returns:
        A list of ``{"name": str, "texts": [block, ...]}`` dicts in input
        order. A section that has neither a name nor any blocks is not
        emitted.
    """
    result = []
    heading = ""
    members = []

    def emit() -> None:
        # Skip the initial placeholder when nothing was collected for it.
        if members or heading:
            result.append({"name": heading, "texts": members})

    for item in blocks:
        label = normalize_digits(item["text"].strip())
        if not is_section(label):
            members.append(item)
            continue
        emit()
        heading = label
        members = []

    emit()
    return result
def parse_law_from_texts(text_blocks: List[Dict[str, str]], url: str = None) -> Dict:
    """Assemble the structured representation of a law from classified blocks.

    Args:
        text_blocks: Dicts carrying ``"type"`` and ``"text"`` keys, as
            consumed by ``extract_title_and_preamble``.
        url: Value echoed back unchanged under the ``"url"`` key.

    Returns:
        A dict with the keys ``"message"`` (always ``"success"``), ``"url"``,
        ``"count"`` (number of input blocks) and ``"law"``. The ``"law"``
        dict holds ``"title"``, ``"preamble"``, ``"number"``, ``"year"`` and
        ``"sections"``; each section is
        ``{"title": ..., "content": ..., "articles": [...]}`` where an
        article is either ``{"number": ..., "text": ...}`` or, for text with
        no article number, ``{"tag": ...}``.
    """
    title, preamble, remaining_blocks = extract_title_and_preamble(text_blocks)
    sections_raw = extract_sections(remaining_blocks)

    # Extract the law number and year from the title, and from the preamble
    # as a backup source.
    law_info_title = extract_law_number_and_year(title)
    law_info_preamble = extract_law_number_and_year(preamble)

    # The title has priority; whatever it lacks is filled in from the
    # preamble.
    law_number = law_info_title.get("law_number") if law_info_title else None
    law_year = law_info_title.get("year") if law_info_title else None
    if law_info_preamble and not (law_number and law_year):
        law_number = law_number or law_info_preamble.get("law_number")
        law_year = law_year or law_info_preamble.get("year")

    def shape_article(entry: Dict) -> Dict:
        # Entries without a number are emitted as a bare tag.
        if entry["number"] is None:
            return {"tag": entry["text"]}
        return {"number": entry["number"], "text": entry["text"]}

    def build_section(raw: Dict) -> Dict:
        members = raw["texts"]

        # Section content: every block except the article heading lines,
        # merged through merge_colon_lines.
        # NOTE(review): non-heading lines kept here are also appended to the
        # articles' text by extract_articles_from_blocks — confirm that this
        # duplication is intended.
        plain_lines = []
        for member in members:
            body = member["text"]
            if is_article(body):
                continue
            plain_lines.append(body)
        content = merge_colon_lines("\n".join(plain_lines).strip())

        # Extract the section's articles and put them in the output shape.
        shaped = [shape_article(found) for found in extract_articles_from_blocks(members)]

        return {
            "title": raw["name"],
            "content": content,
            "articles": shaped,
        }

    # Process the sections.
    sections = [build_section(raw) for raw in sections_raw]

    # Final response.
    law = {
        "title": title,
        "preamble": preamble,
        "number": law_number,
        "year": law_year,
        "sections": sections,
    }
    return {
        "message": "success",
        "url": url,
        "count": len(text_blocks),
        "law": law,
    }