Spaces:
Running
Running
| # parser/extract_tables_by_article.py | |
| from typing import List, Dict, Any | |
| from bs4 import BeautifulSoup | |
| import httpx | |
| from helpers.cleaner import clean_text | |
| from helpers.utils import is_article, extract_article_number | |
| from parser.table_extractorgo import tables_from_soup, table_to_struct | |
def _is_blank(value: Any) -> bool:
    """Return True when *value* is None or contains only whitespace."""
    return value is None or value.strip() == ""


def _is_complete_table(struct: Dict[str, Any]) -> bool:
    """Return True when the table has headers, rows, and no blank header/cell."""
    headers = struct["headers"]
    # Headers are checked with the same None-safe test as body cells, so a
    # None header rejects the table instead of raising AttributeError.
    if not headers or any(_is_blank(header) for header in headers):
        return False
    rows = struct["rows"]
    if not rows:
        return False
    return not any(_is_blank(cell) for row in rows for cell in row)


def _find_preceding_article(table: Any) -> tuple:
    """Walk backwards from *table* to the nearest text node that is an article.

    Returns ``(article_number, snippet)`` for the first text node accepted by
    ``is_article``, or ``(None, None)`` when no such node precedes the table.
    The snippet is the stripped text truncated to 100 characters.
    """
    node = table.find_previous(string=True)
    # Compare against None explicitly: an empty text node is falsy and must
    # not end the backward search early.
    while node is not None:
        text = node.strip()
        if is_article(text):
            return extract_article_number(text), text[:100]
        node = node.find_previous(string=True)
    return None, None


async def extract_tables_from_url(url: str, timeout: int = 10) -> Dict[str, Any]:
    """Fetch an HTML page and return its tables that follow an article heading.

    The page at *url* is downloaded, every table found by ``tables_from_soup``
    is converted with ``table_to_struct``, and tables with missing headers,
    no rows, or any blank header/cell are ignored. Each remaining table is
    attached to the closest preceding text recognised by ``is_article``;
    tables with no such article (or a falsy article number) are dropped.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Timeout passed to ``httpx.AsyncClient``.

    Returns:
        A dict with the keys ``"url"``, ``"tables_count"`` and ``"tables"``.
        Each entry of ``"tables"`` holds ``"article_number"``,
        ``"article_snippet"`` and ``"table"`` (the structure returned by
        ``table_to_struct``).

    Raises:
        Whatever ``client.get`` / ``response.raise_for_status()`` raise for
        transport failures or error status codes; nothing is caught here.
    """
    result: List[Dict[str, Any]] = []

    async with httpx.AsyncClient(timeout=timeout) as client:
        response = await client.get(url)
        response.raise_for_status()
        html_content = response.text

    # Parse the HTML.
    soup = BeautifulSoup(html_content, "html.parser")

    for table in tables_from_soup(soup):
        struct = table_to_struct(table)

        # Ignore any table that contains an empty header or cell.
        if not _is_complete_table(struct):
            continue

        # Look for the closest article before the table.
        article_number, article_snippet = _find_preceding_article(table)

        # Keep the table only if an article was found.
        if article_number:
            result.append({
                "article_number": article_number,
                "article_snippet": article_snippet,
                "table": struct,
            })

    return {
        "url": url,
        "tables_count": len(result),
        "tables": result,
    }