Spaces:
Running
Running
File size: 2,433 Bytes
3cff8e2 56d507d 8dd48f5 fd49a85 56d507d 46646ce 56d507d 46646ce 8dd48f5 accec64 8dd48f5 accec64 46646ce accec64 46646ce accec64 46646ce 8dd48f5 accec64 8dd48f5 56d507d 8dd48f5 3cff8e2 56d507d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# parser/extract_tables_by_article.py
from typing import List, Dict, Any
from bs4 import BeautifulSoup
import httpx
from helpers.cleaner import clean_text
from helpers.utils import is_article, extract_article_number
from parser.table_extractorgo import tables_from_soup, table_to_struct
async def extract_tables_from_url(url: str, timeout: int = 10) -> Dict[str, Any]:
    """Fetch an HTML page and extract the tables attached to legal articles.

    A table is kept only if it has non-empty headers, at least one row,
    no empty cell anywhere, and an article heading can be found in the
    document text preceding it.

    Args:
        url: Page URL to fetch.
        timeout: HTTP request timeout in seconds.

    Returns:
        JSON-serializable dict: ``{"url", "tables_count", "tables"}`` where
        ``tables`` is a list of ``{"article_number", "article_snippet", "table"}``.

    Raises:
        httpx.HTTPStatusError: If the server returns an error status.
    """
    result: List[Dict[str, Any]] = []

    async with httpx.AsyncClient(timeout=timeout) as client:
        response = await client.get(url)
        response.raise_for_status()
        html_content = response.text

    # Parse outside the client context — the text is already captured,
    # so the connection can be released before the (potentially slow) parse.
    soup = BeautifulSoup(html_content, "html.parser")

    for table in tables_from_soup(soup):
        struct = table_to_struct(table)

        # ===== Skip any table with a blank header or any empty cell =====
        # Guard headers against None as well (the row check below already
        # did; the original header check would raise on a None header).
        if not struct["headers"] or any(
            h is None or not h.strip() for h in struct["headers"]
        ):
            continue
        if not struct["rows"]:
            continue
        if any(
            cell is None or not cell.strip()
            for row in struct["rows"]
            for cell in row
        ):
            continue
        # ================================================================

        # Walk backwards through the document's text nodes to find the
        # nearest article heading that precedes this table.
        target_article_number = None
        target_article_snippet = None
        prev = table.find_previous(string=True)
        while prev:
            text = prev.strip()
            if is_article(text):
                target_article_number = extract_article_number(text)
                # Slicing is a no-op when the text is already short.
                target_article_snippet = text[:100]
                break
            prev = prev.find_previous(string=True)

        # Keep the table only when an owning article was found.
        if target_article_number:
            result.append({
                "article_number": target_article_number,
                "article_snippet": target_article_snippet,
                "table": struct,
            })

    return {
        "url": url,
        "tables_count": len(result),
        "tables": result,
    }