Spaces:
Running
Update parser/extract_tables_by_article.py
Browse files
parser/extract_tables_by_article.py
CHANGED
|
@@ -9,7 +9,7 @@ from parser.table_extractorgo import tables_from_soup, table_to_struct
|
|
| 9 |
async def extract_tables_from_url(url: str, timeout: int = 10) -> Dict[str, Any]:
|
| 10 |
"""
|
| 11 |
جلب صفحة HTML من URL، استخراج الجداول المرتبطة بالمواد،
|
| 12 |
-
وإرجاعها كـ JSON بعد تجاهل
|
| 13 |
"""
|
| 14 |
result: List[Dict[str, Any]] = []
|
| 15 |
|
|
@@ -25,19 +25,19 @@ async def extract_tables_from_url(url: str, timeout: int = 10) -> Dict[str, Any]
|
|
| 25 |
for table in tables:
|
| 26 |
struct = table_to_struct(table)
|
| 27 |
|
| 28 |
-
# =====
|
| 29 |
-
if not struct["headers"] or
|
| 30 |
continue
|
| 31 |
|
| 32 |
if not struct["rows"]:
|
| 33 |
continue
|
| 34 |
|
| 35 |
-
|
| 36 |
for row in struct["rows"]:
|
| 37 |
-
if any(cell
|
| 38 |
-
|
| 39 |
break
|
| 40 |
-
if
|
| 41 |
continue
|
| 42 |
# ==============================================
|
| 43 |
|
|
|
|
| 9 |
async def extract_tables_from_url(url: str, timeout: int = 10) -> Dict[str, Any]:
|
| 10 |
"""
|
| 11 |
جلب صفحة HTML من URL، استخراج الجداول المرتبطة بالمواد،
|
| 12 |
+
وإرجاعها كـ JSON بعد تجاهل أي جدول يحتوي على قيم فارغة.
|
| 13 |
"""
|
| 14 |
result: List[Dict[str, Any]] = []
|
| 15 |
|
|
|
|
| 25 |
for table in tables:
|
| 26 |
struct = table_to_struct(table)
|
| 27 |
|
| 28 |
+
# ===== تجاهل أي جدول يحتوي على أي خلية فارغة =====
|
| 29 |
+
if not struct["headers"] or any(h.strip() == "" for h in struct["headers"]):
|
| 30 |
continue
|
| 31 |
|
| 32 |
if not struct["rows"]:
|
| 33 |
continue
|
| 34 |
|
| 35 |
+
has_empty_cell = False
|
| 36 |
for row in struct["rows"]:
|
| 37 |
+
if any(cell is None or cell.strip() == "" for cell in row):
|
| 38 |
+
has_empty_cell = True
|
| 39 |
break
|
| 40 |
+
if has_empty_cell:
|
| 41 |
continue
|
| 42 |
# ==============================================
|
| 43 |
|