Spaces:
Sleeping
Sleeping
Create extract_tables_by_article.py
Browse files
parser/extract_tables_by_article.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# extract_tables_by_article.py
|
| 2 |
+
from typing import List, Dict, Any
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
import httpx
|
| 5 |
+
from helpers.cleaner import clean_text
|
| 6 |
+
from parser.article_extractor import is_article_line, extract_article_number
|
| 7 |
+
from parser.table_extractor import tables_from_soup, table_to_struct
|
| 8 |
+
|
| 9 |
+
async def extract_tables_from_url(url: str, timeout: int = 10) -> Dict[str, Any]:
    """Fetch an HTML page and return its tables with their preceding article.

    The page at ``url`` is downloaded with ``httpx`` and parsed with
    BeautifulSoup's ``html.parser``. Every table yielded by
    ``tables_from_soup`` is converted with ``table_to_struct``. For each
    table, the text nodes that precede it in the document are scanned
    backwards until one satisfies ``is_article_line``. Tables with no such
    line, or whose extracted article number is falsy, are left out.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Timeout handed to ``httpx.AsyncClient`` (seconds, per the
            httpx timeout configuration).

    Returns:
        A dict with the keys ``"url"``, ``"tables_count"`` and ``"tables"``.
        Each entry of ``"tables"`` holds ``"article_number"``,
        ``"article_snippet"`` (at most the first 100 characters of the
        stripped article line) and ``"table"`` (the ``table_to_struct``
        result).

    Raises:
        httpx.HTTPStatusError: Raised by ``response.raise_for_status()``.
            Errors raised by ``client.get`` are not caught here and
            propagate unchanged.
    """
    async with httpx.AsyncClient(timeout=timeout) as client:
        response = await client.get(url)
        response.raise_for_status()
        html_content = response.text

    # Parse the HTML.
    soup = BeautifulSoup(html_content, "html.parser")

    result: List[Dict[str, Any]] = []

    # Walk over every table found in the document.
    for table in tables_from_soup(soup):
        struct = table_to_struct(table)

        target_article_number = None
        target_article_snippet = None

        # Look for the closest article line that appears before the table.
        prev = table.find_previous(string=True)

        # Compare against None explicitly: text nodes are str subclasses, so
        # an empty node is falsy and a plain truthiness test would stop the
        # backward scan before the start of the document is reached.
        while prev is not None:
            text = prev.strip()
            if is_article_line(text):
                target_article_number = extract_article_number(text)
                # Slicing already caps the length; no separate len() check.
                target_article_snippet = text[:100]
                break
            prev = prev.find_previous(string=True)

        # Keep the table only when an article was found. The truthiness
        # check is intentional, so a falsy article number is still skipped.
        if target_article_number:
            result.append({
                "article_number": target_article_number,
                "article_snippet": target_article_snippet,
                "table": struct,
            })

    return {
        "url": url,
        "tables_count": len(result),
        "tables": result,
    }
|