Spaces:

Mazenbs
/

extract_html_full

Running

App Files Files Community

Mazenbs committed on Dec 6, 2025

Commit

87cfaef

verified ·

1 Parent(s): 560009a

Update parser/assembler.py

Browse files

Files changed (1) hide show

parser/assembler.py +39 -36

parser/assembler.py CHANGED Viewed

@@ -2,49 +2,56 @@ from typing import List, Dict
 from helpers.utils import normalize_digits, extract_article_number, is_article, is_section
-def extract_title_and_preamble(texts: List[str]) -> (str, str, List[str]):
     """
     استخراج عنوان القانون والمقدمة، مع الاحتفاظ بالنصوص غير المصنفة.
     """
     title = ""
-    while texts:
-        t = texts.pop(0).strip()
         if t.lower() != "html" and t != "":
             title = t
             break
-    preamble_lines = []
-    remaining_texts = []
-    for t in texts:
         if is_section(t) or is_article(t):
-            remaining_texts.append(t)
         else:
-            preamble_lines.append(t)
-    preamble = "\n".join(preamble_lines).strip()
-    return title, preamble, remaining_texts
-def extract_articles_from_texts(texts: List[str]) -> List[Dict]:
     """
     استخراج المواد مع الاحتفاظ بالنصوص التابعة لها.
     """
     articles = []
     current = None
-    for t in texts:
-        t = t.strip()
         if is_article(t):
             if current:
                 current["text"] = current["text"].strip()
                 articles.append(current)
-            current = {"number": extract_article_number(t), "text": t}
         else:
             if current:
                 current["text"] += "\n" + t
             else:
-                current = {"number": None, "text": t}
     if current:
         current["text"] = current["text"].strip()
@@ -53,30 +60,25 @@ def extract_articles_from_texts(texts: List[str]) -> List[Dict]:
     return articles
-def extract_sections(texts: List[str]) -> List[Dict]:
     """
     تقسيم النصوص إلى أقسام وفصول باستخدام الأنماط الذكية.
     """
     sections = []
     current = {"name": "", "texts": []}
-    for t in texts:
-        if not isinstance(t, str):
-            continue
-        t_norm = normalize_digits(t.strip())
-        # هل السطر عنوان قسم أو فصل؟
-        if is_section(t_norm):
             # لو يوجد قسم سابق → نحفظه
             if current["texts"] or current["name"]:
                 sections.append(current)
             # نبدأ قسم/فصل جديد
-            current = {"name": t_norm, "texts": []}
         else:
-            # السطور التابعة للقسم الحالي
-            current["texts"].append(t_norm)
     # إضافة آخر قسم
     if current["texts"] or current["name"]:
@@ -90,29 +92,29 @@ def parse_law_from_texts(text_blocks: List[Dict[str, str]]) -> Dict:
     تحويل النصوص إلى هيكل قانوني منظم مع ضمان عدم فقد أي نص.
     """
     # النصوص الخام (منظفة مسبقًا في text_blocks.py)
-    raw_texts = [b.get("text", "") for b in text_blocks if b.get("text")]
     # استخراج العنوان والمقدمة
-    title, preamble, remaining_texts = extract_title_and_preamble(raw_texts)
     # استخراج الأقسام
-    sections_raw = extract_sections(remaining_texts)
     sections = []
     for sec in sections_raw:
-        raw_texts = sec["texts"]
-        # المحتوى غير المواد
-        content = "\n".join(t for t in raw_texts if not is_article(t)).strip()
         # استخراج المواد
-        articles = extract_articles_from_texts(raw_texts)
         sections.append({
             "title": sec["name"],
             "content": content,
             "articles": [
-                {"number": a["number"], "text": a["text"]}
                 for a in articles
             ]
         })
@@ -123,6 +125,7 @@ def parse_law_from_texts(text_blocks: List[Dict[str, str]]) -> Dict:
         "law": {
             "title": title,
             "preamble": preamble,
-            "sections": sections
         }
     }

 from helpers.utils import normalize_digits, extract_article_number, is_article, is_section
def extract_title_and_preamble(text_blocks: List[Dict[str, str]]) -> (str, str, List[Dict[str, str]]):
    """
    Split a flat list of text blocks into title, preamble and body.

    Args:
        text_blocks: Block dicts; each block's text is read from its
            ``"text"`` key. The list is only read, never modified.

    Returns:
        A 3-tuple ``(title, preamble, remaining_blocks)``:

        * ``title`` - stripped text of the first block that is neither
          empty nor the literal ``"html"`` (case-insensitive); ``""`` if
          no such block exists.
        * ``preamble`` - the texts of the blocks between the title and the
          first heading, joined with newlines and stripped.
        * ``remaining_blocks`` - the first block recognised by
          ``is_section`` / ``is_article`` and every block after it, in
          their original order.
    """
    title = ""
    preamble_blocks = []
    remaining_blocks = []

    # A single shared iterator lets the second loop resume right after the
    # title without popping from (and thereby mutating) the caller's list.
    pending = iter(text_blocks)

    # Title: first block with real text, ignoring a stray "html" marker.
    for block in pending:
        t = (block.get("text") or "").strip()
        if t and t.lower() != "html":
            title = t
            break

    # Preamble ends at the first section/article heading. Everything from
    # that heading onward stays in the body, so text that follows a heading
    # remains attached to it instead of being swept into the preamble.
    body_started = False
    for block in pending:
        if not body_started:
            t = (block.get("text") or "").strip()
            body_started = bool(is_section(t) or is_article(t))
        if body_started:
            remaining_blocks.append(block)
        else:
            preamble_blocks.append(block)

    # The preamble blocks are joined only for presentation; `or ""` keeps
    # a missing or None text from raising here.
    preamble = "\n".join((b.get("text") or "") for b in preamble_blocks).strip()
    return title, preamble, remaining_blocks
def extract_articles_from_blocks(blocks: List[Dict[str, str]]) -> List[Dict]:
    """
    Group consecutive text blocks into article records.

    A block whose stripped text satisfies ``is_article`` opens a new
    article; each following non-article block is attached to the article
    currently being built. Blocks that appear before any article heading
    are collected into a record whose ``"number"`` is ``None``.

    Args:
        blocks: Block dicts; each block's text is read from its ``"text"``
            key (a missing or None text is treated as an empty string).

    Returns:
        A list of dicts with the keys ``"number"`` (the result of
        ``extract_article_number`` on the heading, or ``None``),
        ``"text"`` (heading and continuation lines joined with newlines,
        stripped) and ``"blocks"`` (the source block dicts, in order).
    """
    articles = []
    current = None
    lines = []  # stripped text lines of the record being built

    for block in blocks:
        t = (block.get("text") or "").strip()
        if is_article(t):
            # A new heading closes the record in progress.
            if current is not None:
                current["text"] = "\n".join(lines).strip()
                articles.append(current)
            current = {"number": extract_article_number(t), "text": t, "blocks": [block]}
            lines = [t]
        elif current is not None:
            # Continuation text belongs to the current record.
            lines.append(t)
            current["blocks"].append(block)
        else:
            # Unclassified leading text is kept as an unnumbered record.
            current = {"number": None, "text": t, "blocks": [block]}
            lines = [t]

    # Flush the last record; without this append it would be dropped.
    if current is not None:
        current["text"] = "\n".join(lines).strip()
        articles.append(current)
    return articles
+def extract_sections(blocks: List[Dict[str, str]]) -> List[Dict]:
     """
     تقسيم النصوص إلى أقسام وفصول باستخدام الأنماط الذكية.
     """
     sections = []
     current = {"name": "", "texts": []}
+    for block in blocks:
+        t = normalize_digits(block.get("text", "").strip())
+        if is_section(t):
             # لو يوجد قسم سابق → نحفظه
             if current["texts"] or current["name"]:
                 sections.append(current)
             # نبدأ قسم/فصل جديد
+            current = {"name": t, "texts": []}
         else:
+            current["texts"].append(block)
     # إضافة آخر قسم
     if current["texts"] or current["name"]:
     تحويل النصوص إلى هيكل قانوني منظم مع ضمان عدم فقد أي نص.
     """
     # النصوص الخام (منظفة مسبقًا في text_blocks.py)
+    raw_blocks = [b for b in text_blocks if b.get("text")]
     # استخراج العنوان والمقدمة
+    title, preamble, remaining_blocks = extract_title_and_preamble(raw_blocks)
     # استخراج الأقسام
+    sections_raw = extract_sections(remaining_blocks)
     sections = []
     for sec in sections_raw:
+        raw_blocks = sec["texts"]
+        # المحتوى غير المواد (يبقى كـ نصوص منفصلة لكن نعرضه مجمّعًا)
+        content = "\n".join([b["text"] for b in raw_blocks if not is_article(b.get("text", ""))]).strip()
         # استخراج المواد
+        articles = extract_articles_from_blocks(raw_blocks)
         sections.append({
             "title": sec["name"],
             "content": content,
             "articles": [
+                {"number": a["number"], "text": a["text"], "blocks": a["blocks"]}
                 for a in articles
             ]
         })
         "law": {
             "title": title,
             "preamble": preamble,
+            "sections": sections,
+            "all_blocks": text_blocks  # ضمان أن كل النصوص محفوظة
         }
     }