Spaces:

Mazenbs
/

extract_html_full

Running

App Files Community

Mazenbs committed on Dec 4, 2025

Commit

d3f0826

verified ·

1 Parent(s): 7afec87

Update parser/assembler.py

Browse files

Files changed (1) hide show

parser/assembler.py +25 -8

parser/assembler.py CHANGED Viewed

@@ -1,7 +1,8 @@
 from typing import List, Dict
 from .section_extractor import extract_sections
 from helpers.cleaner import clean_text
-from helpers.utils import normalize_digits, is_article, extract_article_number, is_section, clean_text_block
 def extract_title_and_preamble(texts: List[str]) -> (str, str, List[str]):
     """
@@ -18,7 +19,8 @@ def extract_title_and_preamble(texts: List[str]) -> (str, str, List[str]):
     remaining_texts = []
     for t in texts:
-        if is_section(t) or is_article(t):
             remaining_texts.append(t)
         else:
             preamble_lines.append(t)
@@ -28,20 +30,34 @@ def extract_title_and_preamble(texts: List[str]) -> (str, str, List[str]):
 def extract_articles_from_texts(texts: List[str]) -> List[Dict]:
-    """استخراج المواد من قائمة نصوص القسم باستخدام is_article و extract_article_number."""
     articles = []
     current = None
     for t in texts:
         t = t.strip()
-        if is_article(t):
             if current:
                 current["text"] = current["text"].strip()
                 articles.append(current)
             current = {"number": extract_article_number(t), "text": t}
-        else:
             if current:
                 current["text"] += "\n" + t
     if current:
         current["text"] = current["text"].strip()
@@ -53,7 +69,7 @@ def extract_articles_from_texts(texts: List[str]) -> List[Dict]:
 def parse_law_from_texts(text_blocks: List[Dict[str, str]]) -> Dict:
     """
     تحويل نصوص القانون إلى هيكل JSON مع تنظيف واستخراج مواد وأقسام
-    بالاعتماد على دوال utils.py.
     """
     # 1) تنظيف النصوص الخام
@@ -70,8 +86,9 @@ def parse_law_from_texts(text_blocks: List[Dict[str, str]]) -> Dict:
     for sec in sections_raw:
         raw_texts = sec["texts"]
-        # المحتوى غير المواد
-        content = "\n".join(t for t in raw_texts if not is_article(t)).strip()
         # استخراج المواد
         articles = extract_articles_from_texts(raw_texts)

 from typing import List, Dict
 from .section_extractor import extract_sections
 from helpers.cleaner import clean_text
+from helpers.utils import normalize_digits, extract_article_number, clean_text_block, detect_line_type
 def extract_title_and_preamble(texts: List[str]) -> (str, str, List[str]):
     """
     remaining_texts = []
     for t in texts:
+        line_type = detect_line_type(t)
+        if line_type in ("section", "article"):
             remaining_texts.append(t)
         else:
             preamble_lines.append(t)
 def extract_articles_from_texts(texts: List[str]) -> List[Dict]:
+    """
+    استخراج المواد مع ضمان عدم فقد أي نص باستخدام detect_line_type.
+    """
     articles = []
     current = None
     for t in texts:
         t = t.strip()
+        line_type = detect_line_type(t)
+        if line_type == "article":
+            # حفظ المادة السابقة قبل الانتقال للجديدة
             if current:
                 current["text"] = current["text"].strip()
                 articles.append(current)
+            # إنشاء مادة جديدة
             current = {"number": extract_article_number(t), "text": t}
+        elif line_type == "text":
+            # إضافة نص عادي للمادة الحالية أو إنشاء مادة بدون رقم
             if current:
                 current["text"] += "\n" + t
+            else:
+                current = {"number": None, "text": t}
+        elif line_type == "section":
+            # تجاهل بداية قسم هنا (سيتم التعامل معها في parse_law_from_texts)
+            continue
     if current:
         current["text"] = current["text"].strip()
 def parse_law_from_texts(text_blocks: List[Dict[str, str]]) -> Dict:
     """
     تحويل نصوص القانون إلى هيكل JSON مع تنظيف واستخراج مواد وأقسام
+    باستخدام detect_line_type.
     """
     # 1) تنظيف النصوص الخام
     for sec in sections_raw:
         raw_texts = sec["texts"]
+        # المحتوى غير المواد (نصوص عادية داخل القسم)
+        content_lines = [t for t in raw_texts if detect_line_type(t) == "text"]
+        content = "\n".join(content_lines).strip()
         # استخراج المواد
         articles = extract_articles_from_texts(raw_texts)