Spaces:

Mazenbs
/

extract_html_full

Sleeping

App Files Files Community

Mazenbs committed on Dec 7, 2025

Commit

ac8768a

verified ·

1 Parent(s): eca2f80

Update parser/assembler.py

Browse files

Files changed (1) hide show

parser/assembler.py +35 -4

parser/assembler.py CHANGED Viewed

@@ -1,8 +1,40 @@
 from typing import List, Dict
 from helpers.utils import normalize_digits, extract_article_number, is_article, is_section
-def extract_title_and_preamble(blocks: List[Dict[str, str]]) -> (str, str, List[Dict[str, str]]):
     title = ""
     preamble_blocks = []
@@ -10,7 +42,7 @@ def extract_title_and_preamble(blocks: List[Dict[str, str]]) -> (str, str, List[
     while blocks:
         block = blocks.pop(0)
         t = block.get("text", "").strip()
-        if t.lower() != "html" and t != "":
             title = t
             break
@@ -21,9 +53,8 @@ def extract_title_and_preamble(blocks: List[Dict[str, str]]) -> (str, str, List[
             break
         preamble_blocks.append(blocks.pop(0))
-    remaining_blocks = blocks
     preamble = "\n".join([b["text"] for b in preamble_blocks]).strip()
-    return title, preamble, remaining_blocks
 def extract_articles_from_blocks(blocks: List[Dict[str, str]]) -> List[Dict]:

+# parser/assembler.py
 from typing import Dict, List, Optional, Tuple
 from helpers.utils import normalize_digits, extract_article_number, is_article, is_section
+def extract_title_and_preamble(
+    blocks: List[Dict[str, str]],
+    *,
+    default_title: Optional[str] = None,
+    default_preamble: Optional[str] = None
+) -> Tuple[str, str, List[Dict[str, str]]]:
+    """
+    تعمل على قائمة من الكتل ويمكن أن تحتوي كل كتلة على مفتاح type مسبقاً:
+      type = "title" | "preamble" | "body"
+    إذا وُجد type يُستخدم مباشرة، وإلا تُستخدم الطريقة التقليدية.
+    يمكن أيضاً تمرير عنوان أو مقدمة افتراضية لتُستخدم بدلاً من الاستخراج التلقائي.
+    """
+    # إذا أُرسلت قيم افتراضية نستخدمها فوراً
+    if default_title is not None and default_preamble is not None:
+        # نفصل كتل body فقط (أي شيء لا يُعتبر title أو preamble)
+        body_blocks = [b for b in blocks if b.get("type") != "title" and b.get("type") != "preamble"]
+        return default_title, default_preamble, body_blocks
+    # هل القائمة تحتوي على حقل type مُحدد مسبقاً؟
+    if any(b.get("type") in {"title", "preamble", "body"} for b in blocks):
+        title_blocks   = [b for b in blocks if b.get("type") == "title"]
+        preamble_blocks = [b for b in blocks if b.get("type") == "preamble"]
+        body_blocks     = [b for b in blocks if b.get("type") == "body"]
+        title   = "\n".join([b["text"].strip() for b in title_blocks]).strip()
+        preamble = "\n".join([b["text"].strip() for b in preamble_blocks]).strip()
+        return title, preamble, body_blocks
+    # الطريقة التقليدية (المنطق القديم دون تغيير)
     title = ""
     preamble_blocks = []
     while blocks:
         block = blocks.pop(0)
         t = block.get("text", "").strip()
+        if t.lower() != "html" and t:
             title = t
             break
             break
         preamble_blocks.append(blocks.pop(0))
     preamble = "\n".join([b["text"] for b in preamble_blocks]).strip()
+    return title, preamble, blocks
 def extract_articles_from_blocks(blocks: List[Dict[str, str]]) -> List[Dict]: