# extract_html_full / parser/extract_tables_by_article.py
# Author: Mazenbs — "Update parser/extract_tables_by_article.py"
# Commit: 46646ce (verified)
# parser/extract_tables_by_article.py
from typing import List, Dict, Any
from bs4 import BeautifulSoup
import httpx
from helpers.cleaner import clean_text
from helpers.utils import is_article, extract_article_number
from parser.table_extractorgo import tables_from_soup, table_to_struct
async def extract_tables_from_url(url: str, timeout: int = 10) -> Dict[str, Any]:
    """Fetch an HTML page and extract tables associated with legal articles.

    Downloads the page at *url*, parses it, and for each table that has no
    empty headers/cells, walks backwards through the document text to find
    the nearest preceding "article" heading. Tables with any empty value
    are skipped entirely.

    Args:
        url: The page URL to fetch.
        timeout: HTTP request timeout in seconds (default 10).

    Returns:
        A dict with keys:
            - "url": the requested URL,
            - "tables_count": number of tables kept,
            - "tables": list of {"article_number", "article_snippet", "table"}.

    Raises:
        httpx.HTTPStatusError: if the server returns an error status.
        httpx.TimeoutException: if the request exceeds *timeout*.
    """
    result: List[Dict[str, Any]] = []

    # Client is scoped to this call; the connection is closed on exit.
    async with httpx.AsyncClient(timeout=timeout) as client:
        response = await client.get(url)
        response.raise_for_status()
        html_content = response.text

    # Parse the HTML document.
    soup = BeautifulSoup(html_content, "html.parser")
    tables = tables_from_soup(soup)

    for table in tables:
        struct = table_to_struct(table)

        # ===== Skip any table containing an empty header, no rows, =====
        # ===== or any empty cell.                                  =====
        if not struct["headers"] or any(h.strip() == "" for h in struct["headers"]):
            continue
        if not struct["rows"]:
            continue
        if any(
            cell is None or cell.strip() == ""
            for row in struct["rows"]
            for cell in row
        ):
            continue
        # ==============================================================

        # Walk backwards through preceding text nodes to find the
        # nearest article heading before this table.
        prev = table.find_previous(string=True)
        target_article_number = None
        target_article_snippet = None
        while prev:
            text = prev.strip()
            if is_article(text):
                target_article_number = extract_article_number(text)
                # Slicing is a no-op for strings shorter than 100 chars.
                target_article_snippet = text[:100]
                break
            prev = prev.find_previous(string=True)

        # Keep the table only if an article was found. Use an explicit
        # None check so falsy-but-valid numbers (e.g. 0) are not dropped.
        if target_article_number is not None:
            result.append({
                "article_number": target_article_number,
                "article_snippet": target_article_snippet,
                "table": struct
            })

    return {
        "url": url,
        "tables_count": len(result),
        "tables": result
    }