Mazenbs committed on
Commit
56d507d
·
verified ·
1 Parent(s): 6e6e56f

Create extract_tables_by_article.py

Browse files
Files changed (1) hide show
  1. parser/extract_tables_by_article.py +56 -0
parser/extract_tables_by_article.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # extract_tables_by_article.py
2
+ from typing import List, Dict, Any
3
+ from bs4 import BeautifulSoup
4
+ import httpx
5
+ from helpers.cleaner import clean_text
6
+ from parser.article_extractor import is_article_line, extract_article_number
7
+ from parser.table_extractor import tables_from_soup, table_to_struct
8
+
9
def _nearest_article_before(table: Any) -> tuple:
    """Locate the closest article line that precedes ``table`` in the document.

    Walks backwards through the text nodes before the table and stops at the
    first one whose stripped text ``is_article_line`` accepts.

    Returns:
        ``(article_number, snippet)`` for that line, where the snippet is at
        most the first 100 characters of the stripped text, or
        ``(None, None)`` when the walk ends without a match.
    """
    node = table.find_previous(string=True)
    # The walk ends when there is no earlier text node; a falsy (empty)
    # text node ends it as well, since the loop tests truthiness.
    while node:
        line = node.strip()
        if is_article_line(line):
            # Slicing already leaves shorter strings untouched.
            return extract_article_number(line), line[:100]
        node = node.find_previous(string=True)
    return None, None


async def extract_tables_from_url(url: str, timeout: int = 10) -> Dict[str, Any]:
    """Fetch an HTML page and return the tables that follow an article line.

    The page is downloaded with ``httpx``, parsed with BeautifulSoup, and each
    table found by ``tables_from_soup`` is paired with the nearest preceding
    article line. Tables with no (truthy) article number are left out.

    Args:
        url: Address of the HTML page to download.
        timeout: Timeout passed to ``httpx.AsyncClient``.

    Returns:
        A dict with the keys ``"url"``, ``"tables_count"`` and ``"tables"``;
        every entry of ``"tables"`` holds ``"article_number"``,
        ``"article_snippet"`` and ``"table"``.

    Raises:
        httpx.HTTPStatusError: Propagated from ``response.raise_for_status()``.
    """
    async with httpx.AsyncClient(timeout=timeout) as client:
        response = await client.get(url)
        response.raise_for_status()
        html_content = response.text

    # Parse the HTML and collect every table in it.
    soup = BeautifulSoup(html_content, "html.parser")
    tables = tables_from_soup(soup)

    matched: List[Dict[str, Any]] = []
    for table in tables:
        # Convert first, as the original does, so every table is converted
        # even when it ends up being skipped.
        struct = table_to_struct(table)

        number, snippet = _nearest_article_before(table)
        if not number:
            # Keep only tables for which an article was found.
            continue

        matched.append({
            "article_number": number,
            "article_snippet": snippet,
            "table": struct
        })

    return {
        "url": url,
        "tables_count": len(matched),
        "tables": matched
    }