Spaces:

Mazenbs
/

extract_html_full

Running

App Files Community

Mazenbs committed on Dec 4, 2025

Commit

038eebb

verified ·

1 Parent(s): 3178e95

Update parser/assembler.py

Browse files

Files changed (1) hide show

parser/assembler.py +20 -40

parser/assembler.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from typing import List, Dict
 from helpers.utils import normalize_digits, extract_article_number, detect_line_type
 def extract_title_and_preamble(texts: List[str]) -> (str, str, List[str]):
     """
     استخراج عنوان القانون والمقدمة، وإرجاع بقية النصوص بعد المقدمة.
@@ -29,8 +28,7 @@ def extract_title_and_preamble(texts: List[str]) -> (str, str, List[str]):
 def extract_articles_from_texts(texts: List[str]) -> List[Dict]:
     """
-    استخراج المواد مع ضمان عدم فقد أي نص.
-    أي نص عادي يتم إضافته للمادة السابقة أو كمادة بدون رقم.
     """
     articles = []
     current = None
@@ -40,34 +38,29 @@ def extract_articles_from_texts(texts: List[str]) -> List[Dict]:
         line_type = detect_line_type(t)
         if line_type == "article":
-            # حفظ المادة السابقة قبل الانتقال للجديدة
             if current:
                 current["text"] = current["text"].strip()
                 articles.append(current)
-            # بدء مادة جديدة
             current = {"number": extract_article_number(t), "text": t}
         elif line_type == "text":
-            # إضافة النص للمادة الحالية، أو إنشاء مادة بدون رقم إذا لم توجد
             if current:
                 current["text"] += "\n" + t
             else:
                 current = {"number": None, "text": t}
         elif line_type == "section":
-            # تجاهل السطر هنا، سيتم التعامل معه في parse_law_from_texts
             continue
-    # إضافة آخر مادة
     if current:
         current["text"] = current["text"].strip()
         articles.append(current)
     return articles
-def extract_sections(texts: list):
     """
-    تقسيم النصوص إلى أقسام وفصول باستخدام الأنماط الذكية.
-    يعتمد على detect_line_type لتحديد نوع السطر.
     """
     sections = []
     current = {"name": "", "texts": []}
@@ -79,20 +72,13 @@ def extract_sections(texts: list):
         t_norm = normalize_digits(t.strip())
         line_type = detect_line_type(t_norm)
-        # إذا كان السطر بداية قسم/باب/فصل
         if line_type == "section":
-            # حفظ القسم السابق إذا وجد
             if current["texts"] or current["name"]:
                 sections.append(current)
-            # بدء قسم جديد
             current = {"name": t_norm, "texts": []}
         else:
-            # إضافة السطر إلى القسم الحالي (سواء كان مادة أو نص عادي)
             current["texts"].append(t_norm)
-    # إضافة آخر قسم
     if current["texts"] or current["name"]:
         sections.append(current)
@@ -101,40 +87,34 @@ def extract_sections(texts: list):
 def parse_law_from_texts(text_blocks: List[Dict[str, str]]) -> Dict:
     """
-    تحويل نصوص القانون إلى هيكل JSON مع استخراج مواد وأقسام
-    باستخدام detect_line_type بدون تنظيف النصوص.
     """
-    # 1) استخدام النصوص الخام كما هي
-    pure_texts = [b.get("text", "") for b in text_blocks if b.get("text")]
-    # 2) استخراج العنوان والمقدمة
-    title, preamble, remaining_texts = extract_title_and_preamble(pure_texts)
-    # 3) استخراج الأقسام الخام
     sections_raw = extract_sections(remaining_texts)
-    # 4) بناء الأقسام مع المواد
     sections = []
     for sec in sections_raw:
-        raw_texts = sec["texts"]
-        # النصوص غير المواد داخل القسم
-        content_lines = [t for t in raw_texts if detect_line_type(t) == "text"]
         content = "\n".join(content_lines).strip()
-        # استخراج المواد مع ضم النصوص العادية التابعة لكل مادة
-        articles = extract_articles_from_texts(raw_texts)
         sections.append({
             "title": sec["name"],
             "content": content,
-            "articles": [
-                {"number": a["number"], "text": a["text"]}
-                for a in articles
-            ]
         })
-    # 5) إرجاع المستند القانوني الكامل
     return {
         "message": "تم التحليل بنجاح",
         "saved_to_db": False,

 from typing import List, Dict
 from helpers.utils import normalize_digits, extract_article_number, detect_line_type
 def extract_title_and_preamble(texts: List[str]) -> (str, str, List[str]):
     """
     استخراج عنوان القانون والمقدمة، وإرجاع بقية النصوص بعد المقدمة.
 def extract_articles_from_texts(texts: List[str]) -> List[Dict]:
     """
+    استخراج المواد من قائمة نصوص معينة، مع ضم أي نصوص تتبع المادة.
     """
     articles = []
     current = None
         line_type = detect_line_type(t)
         if line_type == "article":
             if current:
                 current["text"] = current["text"].strip()
                 articles.append(current)
             current = {"number": extract_article_number(t), "text": t}
         elif line_type == "text":
             if current:
                 current["text"] += "\n" + t
             else:
                 current = {"number": None, "text": t}
         elif line_type == "section":
+            # لا نفعل شيئًا هنا
             continue
     if current:
         current["text"] = current["text"].strip()
         articles.append(current)
     return articles
+def extract_sections(texts: List[str]) -> List[Dict]:
     """
+    تقسيم النصوص إلى أقسام وفصول.
     """
     sections = []
     current = {"name": "", "texts": []}
         t_norm = normalize_digits(t.strip())
         line_type = detect_line_type(t_norm)
         if line_type == "section":
             if current["texts"] or current["name"]:
                 sections.append(current)
             current = {"name": t_norm, "texts": []}
         else:
             current["texts"].append(t_norm)
     if current["texts"] or current["name"]:
         sections.append(current)
 def parse_law_from_texts(text_blocks: List[Dict[str, str]]) -> Dict:
     """
+    استخراج القانون بشكل منظم:
+    1) العنوان والمقدمة
+    2) الأقسام
+    3) المواد داخل كل قسم
     """
+    # النصوص الخام
+    raw_texts = [b.get("text", "") for b in text_blocks if b.get("text")]
+    # 1) استخراج العنوان والمقدمة
+    title, preamble, remaining_texts = extract_title_and_preamble(raw_texts)
+    # 2) استخراج الأقسام
     sections_raw = extract_sections(remaining_texts)
+    # 3) لكل قسم، استخراج المواد
     sections = []
     for sec in sections_raw:
+        articles = extract_articles_from_texts(sec["texts"])
+        # المحتوى النصي العادي داخل القسم
+        content_lines = [t for t in sec["texts"] if detect_line_type(t) == "text"]
         content = "\n".join(content_lines).strip()
         sections.append({
             "title": sec["name"],
             "content": content,
+            "articles": [{"number": a["number"], "text": a["text"]} for a in articles]
         })
     return {
         "message": "تم التحليل بنجاح",
         "saved_to_db": False,