Spaces:
Running
Running
Update parser/table_extractorgo.py
Browse files - parser/table_extractorgo.py +17 -6
parser/table_extractorgo.py
CHANGED
|
@@ -2,13 +2,15 @@
|
|
| 2 |
from bs4 import BeautifulSoup, Tag
|
| 3 |
from typing import List, Dict, Any
|
| 4 |
from helpers.cleaner import clean_text
|
| 5 |
-
from parser.article_extractor import is_article, extract_article_number #
|
| 6 |
-
from parser.section_extractor import
|
| 7 |
|
| 8 |
def tables_from_soup(soup: BeautifulSoup) -> List[Tag]:
|
|
|
|
| 9 |
return soup.find_all("table")
|
| 10 |
|
| 11 |
def table_to_struct(table: Tag) -> Dict[str, Any]:
|
|
|
|
| 12 |
trs = table.find_all("tr")
|
| 13 |
if not trs:
|
| 14 |
return {"headers": [], "rows": []}
|
|
@@ -29,7 +31,13 @@ def table_to_struct(table: Tag) -> Dict[str, Any]:
|
|
| 29 |
|
| 30 |
return {"headers": headers, "rows": rows}
|
| 31 |
|
| 32 |
-
def link_tables_to_sections_and_articles(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
tables = tables_from_soup(soup)
|
| 34 |
|
| 35 |
for idx, table in enumerate(tables):
|
|
@@ -44,10 +52,11 @@ def link_tables_to_sections_and_articles(soup: BeautifulSoup, sections: List[Dic
|
|
| 44 |
while prev:
|
| 45 |
text = prev.strip()
|
| 46 |
|
| 47 |
-
|
|
|
|
| 48 |
num = extract_article_number(text)
|
| 49 |
for sec in reversed(sections):
|
| 50 |
-
for art in reversed(sec
|
| 51 |
if art.get("number") == num:
|
| 52 |
target_article = art
|
| 53 |
break
|
|
@@ -56,7 +65,8 @@ def link_tables_to_sections_and_articles(soup: BeautifulSoup, sections: List[Dic
|
|
| 56 |
if target_article:
|
| 57 |
break
|
| 58 |
|
| 59 |
-
|
|
|
|
| 60 |
for sec in reversed(sections):
|
| 61 |
if sec["title"] == text:
|
| 62 |
target_section = sec
|
|
@@ -66,6 +76,7 @@ def link_tables_to_sections_and_articles(soup: BeautifulSoup, sections: List[Dic
|
|
| 66 |
|
| 67 |
prev = prev.find_previous(string=True)
|
| 68 |
|
|
|
|
| 69 |
if target_article:
|
| 70 |
target_article.setdefault("tables", []).append(struct)
|
| 71 |
elif target_section:
|
|
|
|
| 2 |
from bs4 import BeautifulSoup, Tag
|
| 3 |
from typing import List, Dict, Any
|
| 4 |
from helpers.cleaner import clean_text
|
| 5 |
+
from parser.article_extractor import is_article, extract_article_number # دوال المواد
|
| 6 |
+
from parser.section_extractor import is_section # دالة الأقسام
|
| 7 |
|
| 8 |
def tables_from_soup(soup: BeautifulSoup) -> List[Tag]:
|
| 9 |
+
"""استخراج جميع عناصر الجدول من صفحة HTML"""
|
| 10 |
return soup.find_all("table")
|
| 11 |
|
| 12 |
def table_to_struct(table: Tag) -> Dict[str, Any]:
|
| 13 |
+
"""تحويل جدول HTML إلى هيكل JSON يحتوي على headers و rows"""
|
| 14 |
trs = table.find_all("tr")
|
| 15 |
if not trs:
|
| 16 |
return {"headers": [], "rows": []}
|
|
|
|
| 31 |
|
| 32 |
return {"headers": headers, "rows": rows}
|
| 33 |
|
| 34 |
+
def link_tables_to_sections_and_articles(
|
| 35 |
+
soup: BeautifulSoup, sections: List[Dict[str, Any]]
|
| 36 |
+
) -> List[Dict[str, Any]]:
|
| 37 |
+
"""
|
| 38 |
+
ربط الجداول بالأقسام أو المواد الأقرب لها.
|
| 39 |
+
كل جدول مرتبط بمادة أو قسم حسب النص السابق له مباشرة.
|
| 40 |
+
"""
|
| 41 |
tables = tables_from_soup(soup)
|
| 42 |
|
| 43 |
for idx, table in enumerate(tables):
|
|
|
|
| 52 |
while prev:
|
| 53 |
text = prev.strip()
|
| 54 |
|
| 55 |
+
# إذا كان السطر يمثل بداية مادة
|
| 56 |
+
if is_article(text):
|
| 57 |
num = extract_article_number(text)
|
| 58 |
for sec in reversed(sections):
|
| 59 |
+
for art in reversed(sec.get("articles", [])):
|
| 60 |
if art.get("number") == num:
|
| 61 |
target_article = art
|
| 62 |
break
|
|
|
|
| 65 |
if target_article:
|
| 66 |
break
|
| 67 |
|
| 68 |
+
# إذا كان السطر يمثل بداية قسم
|
| 69 |
+
if is_section(text):
|
| 70 |
for sec in reversed(sections):
|
| 71 |
if sec["title"] == text:
|
| 72 |
target_section = sec
|
|
|
|
| 76 |
|
| 77 |
prev = prev.find_previous(string=True)
|
| 78 |
|
| 79 |
+
# ربط الجدول بالمادة أو القسم أو القسم الأول إذا لم يكن هناك هدف
|
| 80 |
if target_article:
|
| 81 |
target_article.setdefault("tables", []).append(struct)
|
| 82 |
elif target_section:
|