Spaces:
Sleeping
Sleeping
Update parser/assembler.py
Browse files- parser/assembler.py +71 -89
parser/assembler.py
CHANGED
|
@@ -1,116 +1,98 @@
|
|
| 1 |
from typing import List, Dict
|
| 2 |
from .section_extractor import extract_sections
|
| 3 |
from helpers.cleaner import clean_text
|
| 4 |
-
from helpers.utils import normalize_digits, is_article, extract_article_number
|
| 5 |
|
| 6 |
|
| 7 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
"""
|
| 9 |
-
ت
|
| 10 |
-
واستخراج المواد بشكل صحيح باستخدام الأنماط الحديثة من utils.
|
| 11 |
"""
|
| 12 |
-
|
| 13 |
-
# -----------------------------------
|
| 14 |
-
# 1) تنظيف النصوص الخام + تحويل الأرقام
|
| 15 |
-
# -----------------------------------
|
| 16 |
-
pure_texts = [
|
| 17 |
-
clean_text(normalize_digits(block.get("text", "").strip()))
|
| 18 |
-
for block in text_blocks
|
| 19 |
-
if block.get("text")
|
| 20 |
-
]
|
| 21 |
-
|
| 22 |
-
# -----------------------------------
|
| 23 |
-
# 2) استخراج العنوان
|
| 24 |
-
# -----------------------------------
|
| 25 |
title = ""
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
if candidate.lower() != "html":
|
| 31 |
-
title = candidate
|
| 32 |
break
|
| 33 |
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
while pure_texts and not any(
|
| 38 |
-
keyword in pure_texts[0]
|
| 39 |
-
for keyword in ["الباب", "الفصل", "القسم"]
|
| 40 |
-
):
|
| 41 |
-
preamble_lines.append(pure_texts.pop(0))
|
| 42 |
|
| 43 |
preamble = "\n".join(preamble_lines)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
-
# -----------------------------------
|
| 46 |
-
# 4) استخراج الأقسام الخام
|
| 47 |
-
# -----------------------------------
|
| 48 |
-
sections_raw = extract_sections(pure_texts)
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
sections = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
# -------- المحتوى غير المادة --------
|
| 59 |
-
content_lines = [t for t in raw_texts if not is_article(t)]
|
| 60 |
-
content = "\n".join(content_lines).strip()
|
| 61 |
-
|
| 62 |
-
# -------- استخراج المواد --------
|
| 63 |
-
articles = []
|
| 64 |
-
current_article = None
|
| 65 |
-
|
| 66 |
-
for t in raw_texts:
|
| 67 |
-
t = t.strip()
|
| 68 |
-
|
| 69 |
-
if is_article(t):
|
| 70 |
-
# إضافة المادة السابقة إن وجدت
|
| 71 |
-
if current_article:
|
| 72 |
-
current_article["text"] = current_article["text"].strip()
|
| 73 |
-
articles.append(current_article)
|
| 74 |
-
|
| 75 |
-
# بدء مادة جديدة
|
| 76 |
-
current_article = {
|
| 77 |
-
"number": extract_article_number(t),
|
| 78 |
-
"text": t,
|
| 79 |
-
}
|
| 80 |
-
else:
|
| 81 |
-
# إلحاق النص بالمادة الحالية
|
| 82 |
-
if current_article:
|
| 83 |
-
current_article["text"] += "\n" + t
|
| 84 |
-
|
| 85 |
-
# إضافة آخر مادة
|
| 86 |
-
if current_article:
|
| 87 |
-
current_article["text"] = current_article["text"].strip()
|
| 88 |
-
articles.append(current_article)
|
| 89 |
-
|
| 90 |
-
# -------- تنظيف بيانات القسم --------
|
| 91 |
-
clean_section = {
|
| 92 |
-
"title": clean_text(normalize_digits(s["name"])),
|
| 93 |
-
"content": clean_text(normalize_digits(content)),
|
| 94 |
"articles": [
|
| 95 |
-
{
|
| 96 |
-
"number": a["number"],
|
| 97 |
-
"text": clean_text(normalize_digits(a["text"]))
|
| 98 |
-
}
|
| 99 |
for a in articles
|
| 100 |
]
|
| 101 |
-
}
|
| 102 |
-
|
| 103 |
-
sections.append(clean_section)
|
| 104 |
|
| 105 |
-
#
|
| 106 |
-
# 6) إرجاع المستند القانوني الكامل
|
| 107 |
-
# -----------------------------------
|
| 108 |
return {
|
| 109 |
"message": "تم التحليل بنجاح",
|
| 110 |
"saved_to_db": False,
|
| 111 |
"law": {
|
| 112 |
-
"title":
|
| 113 |
-
"preamble":
|
| 114 |
"sections": sections
|
| 115 |
}
|
| 116 |
}
|
|
|
|
| 1 |
from typing import Dict, List, Tuple

from helpers.cleaner import clean_text
from helpers.utils import normalize_digits, is_article, extract_article_number, is_section

from .section_extractor import extract_sections
|
| 5 |
|
| 6 |
|
| 7 |
+
def clean_text_block(text: str) -> str:
    """Strip the text, normalize Eastern-Arabic digits to Western, then clean it."""
    stripped = text.strip()
    normalized = normalize_digits(stripped)
    return clean_text(normalized)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def extract_title_and_preamble(texts: List[str]) -> Tuple[str, str, List[str]]:
    """Extract the law title and preamble from the head of ``texts``.

    Consumes lines from the front of ``texts`` IN PLACE:
    - the first line that is not the literal ``"html"`` scraping artifact
      becomes the title;
    - following lines, up to (but not including) the first section heading
      (as decided by ``is_section``), form the preamble.

    Args:
        texts: Cleaned document lines; mutated in place by ``pop(0)``.

    Returns:
        ``(title, preamble, remaining_texts)`` where ``remaining_texts`` is
        the same (now shortened) list object that was passed in.
    """
    title = ""
    while texts:
        line = texts.pop(0)
        # Skip stray "html" markers left over from the extraction stage.
        if line.lower() != "html":
            title = line
            break

    preamble_lines = []
    # Everything before the first section heading belongs to the preamble.
    # (Original wrapped this in a one-element any(); is_section alone suffices.)
    while texts and not is_section(texts[0]):
        preamble_lines.append(texts.pop(0))

    preamble = "\n".join(preamble_lines)
    return title, preamble, texts
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def extract_articles_from_texts(texts: List[str]) -> List[Dict]:
    """Split a section's lines into articles.

    A line recognized by ``is_article`` opens a new article (numbered via
    ``extract_article_number``); every subsequent line is appended to that
    article's text until the next article header. Lines appearing before the
    first article header are ignored here.
    """
    collected: List[Dict] = []
    open_article = None

    def _finalize(article: Dict) -> None:
        # Trim the accumulated text and store the finished article.
        article["text"] = article["text"].strip()
        collected.append(article)

    for line in texts:
        line = line.strip()
        if is_article(line):
            if open_article:
                _finalize(open_article)
            open_article = {"number": extract_article_number(line), "text": line}
        elif open_article:
            open_article["text"] += "\n" + line

    # Flush the article still in progress at end of input, if any.
    if open_article:
        _finalize(open_article)

    return collected
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
+
def parse_law_from_texts(text_blocks: List[Dict[str, str]]) -> Dict:
    """Parse raw law text blocks into a structured document dict.

    Pipeline: clean every block, peel the title and preamble off the front,
    group the remainder into raw sections, then extract each section's
    articles and non-article content.

    Args:
        text_blocks: Dicts each carrying one raw line/paragraph under "text".

    Returns:
        A response dict with a status message, a ``saved_to_db`` flag, and the
        parsed ``law`` structure (title, preamble, sections with articles).
    """
    # 1) Clean the raw blocks (strip + digit normalization + text cleanup).
    cleaned_lines = []
    for block in text_blocks:
        raw = block.get("text")
        if raw:
            cleaned_lines.append(clean_text_block(raw))

    # 2) Title and preamble come off the front of the cleaned lines.
    title, preamble, body_lines = extract_title_and_preamble(cleaned_lines)

    # 3) Group the remaining lines into raw sections.
    sections_raw = extract_sections(body_lines)

    # 4) Build each section: non-article content plus its extracted articles.
    sections = []
    for raw_section in sections_raw:
        section_lines = raw_section["texts"]

        non_article_lines = [t for t in section_lines if not is_article(t)]
        content = "\n".join(non_article_lines).strip()

        article_entries = []
        for article in extract_articles_from_texts(section_lines):
            article_entries.append({
                "number": article["number"],
                "text": clean_text_block(article["text"]),
            })

        sections.append({
            "title": clean_text_block(raw_section["name"]),
            "content": clean_text_block(content),
            "articles": article_entries,
        })

    # 5) Assemble the full legal-document payload.
    return {
        "message": "تم التحليل بنجاح",
        "saved_to_db": False,
        "law": {
            "title": clean_text_block(title),
            "preamble": clean_text_block(preamble),
            "sections": sections
        }
    }
|