Spaces:

Mazenbs
/

extract_html_full

Running

App Files Files Community

Mazenbs committed on Dec 8, 2025

Commit

cdb4964

verified ·

1 Parent(s): 7f167ce

Update parser/assembler.py

Browse files

Files changed (1) hide show

parser/assembler.py +25 -52

parser/assembler.py CHANGED Viewed

@@ -1,65 +1,34 @@
 # parser/assembler.py
-from typing import List, Dict, Optional, Tuple
 from helpers.utils import normalize_digits, extract_article_number, is_article, is_section
 def extract_title_and_preamble(
     blocks: List[Dict[str, str]],
-    *,
-    default_title: Optional[str] = None,
-    default_preamble: Optional[str] = None
 ) -> Tuple[str, str, List[Dict[str, str]]]:
     """
-    Operate on a list of blocks, each of which may already carry a "type" key:
-      type = "title" | "preamble" | "body"
-    If any block has such a type it is used directly; otherwise the traditional method is used.
-    When both a default title and a default preamble are passed, they are returned instead of extracted values.
     """
-    # If both default values were supplied, use them immediately
-    if default_title is not None and default_preamble is not None:
-        # Keep only the body blocks (anything not typed as title or preamble)
-        body_blocks = [b for b in blocks if b.get("type") != "title" and b.get("type") != "preamble"]
-        return default_title, default_preamble, body_blocks
-    # Does the list contain a pre-assigned type field?
-    if any(b.get("type") in {"title", "preamble", "body"} for b in blocks):
-        title_blocks   = [b for b in blocks if b.get("type") == "title"]
-        preamble_blocks = [b for b in blocks if b.get("type") == "preamble"]
-        body_blocks     = [b for b in blocks if b.get("type") == "body"]
-        title   = "\n".join([b["text"].strip() for b in title_blocks]).strip()
-        preamble = "\n".join([b["text"].strip() for b in preamble_blocks]).strip()
-        return title, preamble, body_blocks
-    # Traditional method (old logic unchanged); consumes `blocks` in place via pop(0)
-    title = ""
-    preamble_blocks = []
-    # Extract the title: first non-empty text that is not "html" (case-insensitive)
-    while blocks:
-        block = blocks.pop(0)
-        t = block.get("text", "").strip()
-        if t.lower() != "html" and t:
-            title = t
-            break
-    # Extract the preamble up to the first section or article
-    while blocks:
-        t = blocks[0].get("text", "").strip()
-        if is_section(t) or is_article(t):
-            break
-        preamble_blocks.append(blocks.pop(0))
     preamble = "\n".join([b["text"] for b in preamble_blocks]).strip()
-    return title, preamble, blocks
 def extract_articles_from_blocks(blocks: List[Dict[str, str]]) -> List[Dict]:
     articles = []
     current = None
     for block in blocks:
-        t = block.get("text", "").strip()
         if is_article(t):
             if current:
                 current["text"] = current["text"].strip()
@@ -79,11 +48,14 @@ def extract_articles_from_blocks(blocks: List[Dict[str, str]]) -> List[Dict]:
 def extract_sections(blocks: List[Dict[str, str]]) -> List[Dict]:
     sections = []
     current = {"name": "", "texts": []}
     for block in blocks:
-        t = normalize_digits(block.get("text", "").strip())
         if is_section(t):
             if current["texts"] or current["name"]:
@@ -99,16 +71,17 @@ def extract_sections(blocks: List[Dict[str, str]]) -> List[Dict]:
 def parse_law_from_texts(text_blocks: List[Dict[str, str]]) -> Dict:
-    raw_blocks = [b for b in text_blocks if b.get("text")]
-    title, preamble, remaining_blocks = extract_title_and_preamble(raw_blocks)
     sections_raw = extract_sections(remaining_blocks)
     sections = []
     for sec in sections_raw:
         raw_blocks = sec["texts"]
-        content = "\n".join([b["text"] for b in raw_blocks if not is_article(b.get("text", ""))]).strip()
         articles = extract_articles_from_blocks(raw_blocks)
         sections.append({
@@ -121,10 +94,10 @@ def parse_law_from_texts(text_blocks: List[Dict[str, str]]) -> Dict:
         })
     return {
-        "message": "success",  # كلمة إنجليزية بدلًا من تم التحليل بنجاح
         "all_blocks": {
             "count": len(text_blocks),
-            #"items": text_blocks
         },
         "law": {
             "title": title,

 # parser/assembler.py
+from typing import List, Dict, Tuple
 from helpers.utils import normalize_digits, extract_article_number, is_article, is_section
 def extract_title_and_preamble(
     blocks: List[Dict[str, str]],
 ) -> Tuple[str, str, List[Dict[str, str]]]:
     """
+    Extract the title, the preamble and the remaining (body) blocks.
+    Each block is expected to look like {"text": ..., "type": ...}. Blocks are grouped by
+    "type" into "title", "preamble" and "body"; title and preamble texts are joined with
+    newlines and stripped, while the body is returned as the list of blocks.
+    NOTE(review): a block without a "type" key raises KeyError, and a block whose type is
+    none of the three values lands in no group - confirm callers always tag every block.
     """
+    title_blocks = [b for b in blocks if b["type"] == "title"]
+    preamble_blocks = [b for b in blocks if b["type"] == "preamble"]
+    body_blocks = [b for b in blocks if b["type"] == "body"]
+    title = "\n".join([b["text"] for b in title_blocks]).strip()
     preamble = "\n".join([b["text"] for b in preamble_blocks]).strip()
+    return title, preamble, body_blocks
 def extract_articles_from_blocks(blocks: List[Dict[str, str]]) -> List[Dict]:
+    """
+    تقسيم النصوص إلى مقالات حسب is_article.
+    """
     articles = []
     current = None
     for block in blocks:
+        t = block["text"].strip()
         if is_article(t):
             if current:
                 current["text"] = current["text"].strip()
 def extract_sections(blocks: List[Dict[str, str]]) -> List[Dict]:
+    """
+    تقسيم النصوص إلى أقسام حسب is_section.
+    """
     sections = []
     current = {"name": "", "texts": []}
     for block in blocks:
+        t = normalize_digits(block["text"].strip())
         if is_section(t):
             if current["texts"] or current["name"]:
 def parse_law_from_texts(text_blocks: List[Dict[str, str]]) -> Dict:
+    """
+    تحويل قائمة النصوص المفهرسة إلى هيكل القانون النهائي مع أقسامه ومقالاته.
+    """
+    title, preamble, remaining_blocks = extract_title_and_preamble(text_blocks)
     sections_raw = extract_sections(remaining_blocks)
     sections = []
     for sec in sections_raw:
         raw_blocks = sec["texts"]
+        content = "\n".join([b["text"] for b in raw_blocks if not is_article(b["text"])]).strip()
         articles = extract_articles_from_blocks(raw_blocks)
         sections.append({
         })
     return {
+        "message": "success",
         "all_blocks": {
             "count": len(text_blocks),
+            #"items": text_blocks  # يمكن إلغاء التعليق إذا أردت كل العناصر
         },
         "law": {
             "title": title,