Spaces:

Mazenbs
/

extract_html_full

Sleeping

App Files Community

Mazenbs committed on Dec 11, 2025

Commit

d4584e2

verified ·

1 Parent(s): ebc846d

Update parser/assembler.py

Browse files

Files changed (1) hide show

parser/assembler.py +82 -0

parser/assembler.py CHANGED Viewed

@@ -61,6 +61,88 @@ def extract_sections(blocks: List[Dict[str, str]]) -> List[Dict]:
     return sections
 def parse_law_from_texts(text_blocks: List[Dict[str, str]]) -> Dict:
     title, preamble, remaining_blocks = extract_title_and_preamble(text_blocks)
     sections_raw = extract_sections(remaining_blocks)

     return sections
 def parse_law_from_texts(text_blocks: List[Dict[str, str]]) -> Dict:
+    # Build the law response dict: first split the input into title, preamble, and the remaining blocks
+    title, preamble, remaining_blocks = extract_title_and_preamble(text_blocks)
+    sections_raw = extract_sections(remaining_blocks)
+    # -------------------------------------------------------
+    # Extract the law number and year from the title
+    # -------------------------------------------------------
+    law_info_title = extract_law_number_and_year(title)
+    # Extract the law number and year from the preamble as well (used as a fallback)
+    law_info_preamble = extract_law_number_and_year(preamble)
+    # -------------------------------------------------------
+    # Pick the best values:
+    # the title takes priority; whatever it lacks is filled in from the preamble
+    # -------------------------------------------------------
+    law_number = None
+    law_year = None
+    if law_info_title:
+        law_number = law_info_title.get("number")
+        law_year = law_info_title.get("year")
+    if (not law_number or not law_year) and law_info_preamble:  # any falsy value counts as missing
+        law_number = law_number or law_info_preamble.get("number")
+        law_year   = law_year or law_info_preamble.get("year")
+    # -------------------------------------------------------
+    # Process the sections
+    # -------------------------------------------------------
+    sections = []
+    for sec in sections_raw:
+        raw_blocks = sec["texts"]
+        # Join the section's block texts (one per line), leaving out article blocks,
+        # then pass the joined text through merge_colon_lines
+        content = "\n".join([
+            b["text"] for b in raw_blocks
+            if not is_article(b["text"])
+        ]).strip()
+        content = merge_colon_lines(content)
+        # Extract the articles from the same section blocks
+        articles = extract_articles_from_blocks(raw_blocks)
+        # Reshape the articles into the output format
+        articles_cleaned = []
+        for a in articles:
+            if a["number"] is None:  # entries without a number are emitted as a bare "tag"
+                articles_cleaned.append({"tag": a["text"]})
+            else:
+                articles_cleaned.append({
+                    "number": a["number"],
+                    "text": a["text"]
+                })
+        # Append the finished section
+        sections.append({
+            "title": sec["name"],
+            "content": content,
+            "articles": articles_cleaned
+        })
+    # -------------------------------------------------------
+    # Final response
+    # -------------------------------------------------------
+    return {
+        "message": "success",
+        "blocks": {
+            "count": len(text_blocks),  # counts all input blocks, not just remaining_blocks
+        },
+        "law": {
+            "title": title,
+            "preamble": preamble,
+            "number": law_number,
+            "year": law_year,
+            "sections": sections
+        }
+    }
+def parse_law_from_textsx(text_blocks: List[Dict[str, str]]) -> Dict:
     title, preamble, remaining_blocks = extract_title_and_preamble(text_blocks)
     sections_raw = extract_sections(remaining_blocks)