# parser/extract_tables_by_article.py
from typing import Any, Dict, List

import httpx
from bs4 import BeautifulSoup

from helpers.cleaner import clean_text  # NOTE(review): imported but unused here — confirm before removing
from helpers.utils import extract_article_number, is_article

# NOTE(review): "table_extractorgo" looks like a typo for "table_extractor" — confirm the module name.
from parser.table_extractorgo import table_to_struct, tables_from_soup


async def extract_tables_from_url(url: str, timeout: float = 10) -> Dict[str, Any]:
    """Fetch an HTML page and extract the tables attributed to legal articles.

    Downloads *url*, parses the HTML, and keeps only tables that are fully
    populated (non-blank headers, at least one row, no blank cells).  Each
    kept table is attributed to the nearest "article" heading found in the
    text nodes preceding it; tables with no preceding article are dropped.

    Args:
        url: Page to download.
        timeout: Per-request timeout in seconds, forwarded to httpx.

    Returns:
        Dict with keys ``url``, ``tables_count``, and ``tables`` — the latter
        a list of ``{"article_number", "article_snippet", "table"}`` entries.

    Raises:
        httpx.HTTPStatusError: If the server responds with an error status.
        httpx.HTTPError: On network/transport failures.
    """
    matched: List[Dict[str, Any]] = []

    # Fetch inside the client context, but parse outside it so the
    # connection is released as soon as the body has been read.
    async with httpx.AsyncClient(timeout=timeout) as client:
        response = await client.get(url)
        response.raise_for_status()
        html_content = response.text

    soup = BeautifulSoup(html_content, "html.parser")

    for table in tables_from_soup(soup):
        struct = table_to_struct(table)

        # ===== Skip any table containing blank headers, no rows, or blank cells =====
        headers = struct["headers"]
        if not headers or any(not h.strip() for h in headers):
            continue

        rows = struct["rows"]
        if not rows:
            continue

        if any(cell is None or not cell.strip() for row in rows for cell in row):
            continue
        # ===========================================================================

        # Walk backwards through the text nodes preceding the table to find
        # the nearest article heading.
        target_article_number = None
        target_article_snippet = None
        prev = table.find_previous(string=True)
        while prev:
            text = prev.strip()
            if is_article(text):
                target_article_number = extract_article_number(text)
                # Slicing is safe even when text is shorter than 100 chars.
                target_article_snippet = text[:100]
                break
            prev = prev.find_previous(string=True)

        # Keep the table only if it could be attributed to an article.
        if target_article_number:
            matched.append({
                "article_number": target_article_number,
                "article_snippet": target_article_snippet,
                "table": struct,
            })

    return {
        "url": url,
        "tables_count": len(matched),
        "tables": matched,
    }