Spaces:

Mazenbs
/

extract_html_full

Sleeping

App Files Files Community

Mazenbs committed on Dec 9, 2025

Commit

56d507d

verified ·

1 Parent(s): 6e6e56f

Create extract_tables_by_article.py

Browse files

Files changed (1) hide show

parser/extract_tables_by_article.py +56 -0

parser/extract_tables_by_article.py ADDED Viewed

	@@ -0,0 +1,56 @@

+# extract_tables_by_article.py
+from typing import List, Dict, Any
+from bs4 import BeautifulSoup
+import httpx
+from helpers.cleaner import clean_text
+from parser.article_extractor import is_article_line, extract_article_number
+from parser.table_extractor import tables_from_soup, table_to_struct
+async def extract_tables_from_url(url: str, timeout: int = 10) -> Dict[str, Any]:
+    """
+    جلب صفحة HTML من URL، استخراج الجداول المرتبطة بالمواد،
+    وإرجاعها كـ JSON.
+    """
+    result: List[Dict[str, Any]] = []
+    async with httpx.AsyncClient(timeout=timeout) as client:
+        response = await client.get(url)
+        response.raise_for_status()
+        html_content = response.text
+    # تحليل HTML
+    soup = BeautifulSoup(html_content, "html.parser")
+    # استخراج جميع الجداول
+    tables = tables_from_soup(soup)
+    for table in tables:
+        struct = table_to_struct(table)
+        # البحث عن المادة الأقرب قبل الجدول
+        prev = table.find_previous(string=True)
+        target_article_number = None
+        target_article_snippet = None
+        while prev:
+            text = prev.strip()
+            if is_article_line(text):
+                target_article_number = extract_article_number(text)
+                target_article_snippet = text[:100] if len(text) > 100 else text
+                break
+            prev = prev.find_previous(string=True)
+        # إضافة الجدول فقط إذا وجدنا مادة
+        if target_article_number:
+            result.append({
+                "article_number": target_article_number,
+                "article_snippet": target_article_snippet,
+                "table": struct
+            })
+    return {
+        "url": url,
+        "tables_count": len(result),
+        "tables": result
+    }