Spaces:

Mazenbs
/

extract_html_full

Running

File size: 4,960 Bytes

ac8768a
cdb4964
560009a
392a934
cdb4964
ac8768a
 
 
cdb4964
 
 
349aab0
cdb4964
72a807d
cdb4964
 
349aab0
 
87cfaef
50eda68
 
 
87cfaef
cdb4964
228d412
50eda68
127ed78
 
50eda68
5771f15
228d412
50eda68
 
d3f0826
5771f15
349aab0
50eda68
127ed78
50eda68
 
 
038eebb
 
87cfaef
560009a
 
 
87cfaef
cdb4964
560009a
87cfaef
560009a
 
87cfaef
560009a
87cfaef
560009a
 
 
 
 
 
39285a6
d4584e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709465a
d4584e2
 
 
709465a
d4584e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9a1cbd4
fa02b5f
d4584e2

# parser/assembler.py
from typing import List, Dict, Tuple
from helpers.utils import normalize_digits, extract_article_number, is_article, is_section
from helpers.cleaner import extract_law_number_and_year, merge_colon_lines

def extract_title_and_preamble(
    blocks: List[Dict[str, str]],
) -> Tuple[str, str, List[Dict[str, str]]]:
    """Separate classified blocks into title text, preamble text and body blocks.

    Each block is read through its ``"type"`` and ``"text"`` keys. Blocks whose
    type is not ``"title"``, ``"preamble"`` or ``"body"`` are dropped.

    Returns:
        A ``(title, preamble, body_blocks)`` tuple. ``title`` and ``preamble``
        are the newline-joined, stripped texts of their blocks; ``body_blocks``
        is the list of body block dicts in their original order.
    """
    # One filtering pass per kind, in this fixed order.
    titles, preambles, body = [
        [blk for blk in blocks if blk["type"] == kind]
        for kind in ("title", "preamble", "body")
    ]

    title_text = "\n".join(blk["text"] for blk in titles).strip()
    preamble_text = "\n".join(blk["text"] for blk in preambles).strip()

    return title_text, preamble_text, body


def extract_articles_from_blocks(blocks: List[Dict[str, str]]) -> List[Dict]:
    """Group consecutive blocks into articles.

    A block whose stripped text satisfies ``is_article`` opens a new article;
    every following block is appended to it (newline-separated) until the next
    article heading. Blocks seen before the first heading are collected into a
    leading entry whose ``"number"`` is ``None``.

    Returns:
        A list of ``{"number": ..., "text": ...}`` dicts in document order,
        where ``text`` is the stripped, joined group passed through
        ``merge_colon_lines``.
    """

    def close_group(number, lines):
        # merge_colon_lines is applied once, when a group is complete.
        return {
            "number": number,
            "text": merge_colon_lines("\n".join(lines).strip()),
        }

    finished = []
    number = None
    lines = None  # stays None until the first block has been seen

    for block in blocks:
        line = block["text"].strip()

        if not is_article(line):
            if lines is None:
                # Text before any article heading: unnumbered group.
                lines = [line]
            else:
                lines.append(line)
            continue

        # A new heading closes whatever group was open.
        if lines is not None:
            finished.append(close_group(number, lines))
        number, lines = extract_article_number(line), [line]

    if lines is not None:
        finished.append(close_group(number, lines))

    return finished


def extract_sections(blocks: List[Dict[str, str]]) -> List[Dict]:
    """Group blocks under the section heading that precedes them.

    Each block's stripped text is passed through ``normalize_digits`` and then
    tested with ``is_section``. A heading starts a new section named by that
    normalized text; any other block is stored, unmodified, in the current
    section's ``"texts"`` list. Blocks before the first heading go into a
    section whose name is the empty string.

    Returns:
        A list of ``{"name": str, "texts": [block, ...]}`` dicts. A section
        that has neither a name nor any blocks is not emitted.
    """
    grouped = []
    heading = ""
    members = []

    for block in blocks:
        normalized = normalize_digits(block["text"].strip())

        if not is_section(normalized):
            members.append(block)
            continue

        # New heading: store the section collected so far, if it has anything.
        if members or heading:
            grouped.append({"name": heading, "texts": members})
        heading, members = normalized, []

    if members or heading:
        grouped.append({"name": heading, "texts": members})

    return grouped

def parse_law_from_texts(text_blocks: List[Dict[str, str]], url: str = None) -> Dict:
    """Assemble classified text blocks into the final law response dict.

    Args:
        text_blocks: Blocks carrying ``"type"`` and ``"text"`` keys.
        url: Value echoed back unchanged under the ``"url"`` key.

    Returns:
        A dict with ``"message"`` (always ``"success"``), ``"url"``,
        ``"count"`` (number of input blocks) and ``"law"``. ``"law"`` holds
        the title, preamble, law number, year and the list of sections; each
        section has ``"title"``, ``"content"`` and ``"articles"``.
    """
    title, preamble, body_blocks = extract_title_and_preamble(text_blocks)
    raw_sections = extract_sections(body_blocks)

    # Law number and year: the title has priority, the preamble is a backup
    # that only fills in whichever value the title did not provide.
    from_title = extract_law_number_and_year(title)
    from_preamble = extract_law_number_and_year(preamble)

    number = from_title.get("law_number") if from_title else None
    year = from_title.get("year") if from_title else None

    if from_preamble and not (number and year):
        number = number or from_preamble.get("law_number")
        year = year or from_preamble.get("year")

    # Build the output sections.
    assembled = []
    for raw in raw_sections:
        members = raw["texts"]

        # Section content: only blocks that is_article() flags are left out;
        # the blocks that follow an article heading stay in the content.
        # NOTE(review): is_article() is given the unstripped text here, while
        # extract_articles_from_blocks() tests the stripped text - confirm the
        # two agree for blocks with surrounding whitespace.
        kept = []
        for member in members:
            if is_article(member["text"]):
                continue
            kept.append(member["text"])
        content = merge_colon_lines("\n".join(kept).strip())

        # Articles of the section; a group without a number becomes a "tag".
        entries = [
            {"tag": art["text"]}
            if art["number"] is None
            else {"number": art["number"], "text": art["text"]}
            for art in extract_articles_from_blocks(members)
        ]

        assembled.append({
            "title": raw["name"],
            "content": content,
            "articles": entries,
        })

    # Final response.
    law = {
        "title": title,
        "preamble": preamble,
        "number": number,
        "year": year,
        "sections": assembled,
    }
    return {
        "message": "success",
        "url": url,
        "count": len(text_blocks),
        "law": law,
    }