Spaces:

Mazenbs
/

extract_html_full

Running

App Files Community

Mazenbs committed on Dec 9, 2025

Commit

f54552b

verified ·

1 Parent(s): fd49a85

Update parser/table_extractorgo.py

Browse files

Files changed (1) hide show

parser/table_extractorgo.py +17 -6

parser/table_extractorgo.py CHANGED Viewed

@@ -2,13 +2,15 @@
 from bs4 import BeautifulSoup, Tag
 from typing import List, Dict, Any
 from helpers.cleaner import clean_text
-from parser.article_extractor import is_article, extract_article_number  # <-- تعديل الاستيراد
-from parser.section_extractor import is_section_line
 def tables_from_soup(soup: BeautifulSoup) -> List[Tag]:
     return soup.find_all("table")
 def table_to_struct(table: Tag) -> Dict[str, Any]:
     trs = table.find_all("tr")
     if not trs:
         return {"headers": [], "rows": []}
@@ -29,7 +31,13 @@ def table_to_struct(table: Tag) -> Dict[str, Any]:
     return {"headers": headers, "rows": rows}
-def link_tables_to_sections_and_articles(soup: BeautifulSoup, sections: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     tables = tables_from_soup(soup)
     for idx, table in enumerate(tables):
@@ -44,10 +52,11 @@ def link_tables_to_sections_and_articles(soup: BeautifulSoup, sections: List[Dic
         while prev:
             text = prev.strip()
-            if is_article(text):  # <-- استخدم is_article
                 num = extract_article_number(text)
                 for sec in reversed(sections):
-                    for art in reversed(sec["articles"]):
                         if art.get("number") == num:
                             target_article = art
                             break
@@ -56,7 +65,8 @@ def link_tables_to_sections_and_articles(soup: BeautifulSoup, sections: List[Dic
                 if target_article:
                     break
-            if is_section_line(text):
                 for sec in reversed(sections):
                     if sec["title"] == text:
                         target_section = sec
@@ -66,6 +76,7 @@ def link_tables_to_sections_and_articles(soup: BeautifulSoup, sections: List[Dic
             prev = prev.find_previous(string=True)
         if target_article:
             target_article.setdefault("tables", []).append(struct)
         elif target_section:

 from bs4 import BeautifulSoup, Tag
 from typing import List, Dict, Any
 from helpers.cleaner import clean_text
+from parser.article_extractor import is_article, extract_article_number  # دوال المواد
+from parser.section_extractor import is_section  # دالة الأقسام
 def tables_from_soup(soup: BeautifulSoup) -> List[Tag]:
+    """استخراج جميع عناصر الجدول من صفحة HTML"""
     return soup.find_all("table")
 def table_to_struct(table: Tag) -> Dict[str, Any]:
+    """تحويل جدول HTML إلى هيكل JSON يحتوي على headers و rows"""
     trs = table.find_all("tr")
     if not trs:
         return {"headers": [], "rows": []}
     return {"headers": headers, "rows": rows}
+def link_tables_to_sections_and_articles(
+    soup: BeautifulSoup, sections: List[Dict[str, Any]]
+) -> List[Dict[str, Any]]:
+    """
+    ربط الجداول بالأقسام أو المواد الأقرب لها.
+    كل جدول مرتبط بمادة أو قسم حسب النص السابق له مباشرة.
+    """
     tables = tables_from_soup(soup)
     for idx, table in enumerate(tables):
         while prev:
             text = prev.strip()
+            # إذا كان السطر يمثل بداية مادة
+            if is_article(text):
                 num = extract_article_number(text)
                 for sec in reversed(sections):
+                    for art in reversed(sec.get("articles", [])):
                         if art.get("number") == num:
                             target_article = art
                             break
                 if target_article:
                     break
+            # إذا كان السطر يمثل بداية قسم
+            if is_section(text):
                 for sec in reversed(sections):
                     if sec["title"] == text:
                         target_section = sec
             prev = prev.find_previous(string=True)
+        # ربط الجدول بالمادة أو القسم أو القسم الأول إذا لم يكن هناك هدف
         if target_article:
             target_article.setdefault("tables", []).append(struct)
         elif target_section: