# extract_html_full / parser/extract_tables_by_article.py
# Author: Mazenbs — "Update parser/extract_tables_by_article.py"
# Commit: 46646ce (verified)
# parser/extract_tables_by_article.py
from typing import List, Dict, Any
from bs4 import BeautifulSoup
import httpx
from helpers.cleaner import clean_text
from helpers.utils import is_article, extract_article_number
from parser.table_extractorgo import tables_from_soup, table_to_struct
async def extract_tables_from_url(url: str, timeout: int = 10) -> Dict[str, Any]:
    """Fetch an HTML page and extract tables associated with legal articles.

    Downloads the page at *url*, parses it, and for each table that has no
    empty headers/cells, walks backwards through the document text to find
    the nearest preceding "article" heading. Tables with any empty value
    are skipped entirely.

    Args:
        url: The page URL to fetch.
        timeout: HTTP request timeout in seconds (default 10).

    Returns:
        A dict with keys:
            - "url": the requested URL,
            - "tables_count": number of tables kept,
            - "tables": list of {"article_number", "article_snippet", "table"}.

    Raises:
        httpx.HTTPStatusError: if the server returns an error status.
        httpx.TimeoutException: if the request exceeds *timeout*.
    """
    result: List[Dict[str, Any]] = []

    # Client is scoped to this call; the connection is closed on exit.
    async with httpx.AsyncClient(timeout=timeout) as client:
        response = await client.get(url)
        response.raise_for_status()
        html_content = response.text

    # Parse the HTML document.
    soup = BeautifulSoup(html_content, "html.parser")
    tables = tables_from_soup(soup)

    for table in tables:
        struct = table_to_struct(table)

        # ===== Skip any table containing an empty header, no rows, =====
        # ===== or any empty cell.                                  =====
        if not struct["headers"] or any(h.strip() == "" for h in struct["headers"]):
            continue
        if not struct["rows"]:
            continue
        if any(
            cell is None or cell.strip() == ""
            for row in struct["rows"]
            for cell in row
        ):
            continue
        # ==============================================================

        # Walk backwards through preceding text nodes to find the
        # nearest article heading before this table.
        prev = table.find_previous(string=True)
        target_article_number = None
        target_article_snippet = None
        while prev:
            text = prev.strip()
            if is_article(text):
                target_article_number = extract_article_number(text)
                # Slicing is a no-op for strings shorter than 100 chars.
                target_article_snippet = text[:100]
                break
            prev = prev.find_previous(string=True)

        # Keep the table only if an article was found. Use an explicit
        # None check so falsy-but-valid numbers (e.g. 0) are not dropped.
        if target_article_number is not None:
            result.append({
                "article_number": target_article_number,
                "article_snippet": target_article_snippet,
                "table": struct
            })

    return {
        "url": url,
        "tables_count": len(result),
        "tables": result
    }