Spaces:
Running
Running
File size: 2,433 Bytes
3cff8e2 56d507d 8dd48f5 fd49a85 56d507d 46646ce 56d507d 46646ce 8dd48f5 accec64 8dd48f5 accec64 46646ce accec64 46646ce accec64 46646ce 8dd48f5 accec64 8dd48f5 56d507d 8dd48f5 3cff8e2 56d507d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# parser/extract_tables_by_article.py
from typing import List, Dict, Any
from bs4 import BeautifulSoup
import httpx
from helpers.cleaner import clean_text
from helpers.utils import is_article, extract_article_number
from parser.table_extractorgo import tables_from_soup, table_to_struct
async def extract_tables_from_url(url: str, timeout: int = 10) -> Dict[str, Any]:
    """Fetch an HTML page and extract the tables attached to legal articles.

    A table is kept only if it has non-empty headers, at least one row,
    no empty cell anywhere, and an article heading can be found in the
    document text preceding it.

    Args:
        url: Page URL to fetch.
        timeout: HTTP request timeout in seconds.

    Returns:
        JSON-serializable dict: ``{"url", "tables_count", "tables"}`` where
        ``tables`` is a list of ``{"article_number", "article_snippet", "table"}``.

    Raises:
        httpx.HTTPStatusError: If the server returns an error status.
    """
    result: List[Dict[str, Any]] = []

    async with httpx.AsyncClient(timeout=timeout) as client:
        response = await client.get(url)
        response.raise_for_status()
        html_content = response.text

    # Parse outside the client context — the text is already captured,
    # so the connection can be released before the (potentially slow) parse.
    soup = BeautifulSoup(html_content, "html.parser")

    for table in tables_from_soup(soup):
        struct = table_to_struct(table)

        # ===== Skip any table with a blank header or any empty cell =====
        # Guard headers against None as well (the row check below already
        # did; the original header check would raise on a None header).
        if not struct["headers"] or any(
            h is None or not h.strip() for h in struct["headers"]
        ):
            continue
        if not struct["rows"]:
            continue
        if any(
            cell is None or not cell.strip()
            for row in struct["rows"]
            for cell in row
        ):
            continue
        # ================================================================

        # Walk backwards through the document's text nodes to find the
        # nearest article heading that precedes this table.
        target_article_number = None
        target_article_snippet = None
        prev = table.find_previous(string=True)
        while prev:
            text = prev.strip()
            if is_article(text):
                target_article_number = extract_article_number(text)
                # Slicing is a no-op when the text is already short.
                target_article_snippet = text[:100]
                break
            prev = prev.find_previous(string=True)

        # Keep the table only when an owning article was found.
        if target_article_number:
            result.append({
                "article_number": target_article_number,
                "article_snippet": target_article_snippet,
                "table": struct,
            })

    return {
        "url": url,
        "tables_count": len(result),
        "tables": result,
    }