Mazenbs committed on
Commit
f54552b
·
verified ·
1 Parent(s): fd49a85

Update parser/table_extractorgo.py

Browse files
Files changed (1) hide show
  1. parser/table_extractorgo.py +17 -6
parser/table_extractorgo.py CHANGED
@@ -2,13 +2,15 @@
2
  from bs4 import BeautifulSoup, Tag
3
  from typing import List, Dict, Any
4
  from helpers.cleaner import clean_text
5
- from parser.article_extractor import is_article, extract_article_number # <-- تعديل الاستيراد
6
- from parser.section_extractor import is_section_line
7
 
8
  def tables_from_soup(soup: BeautifulSoup) -> List[Tag]:
 
9
  return soup.find_all("table")
10
 
11
  def table_to_struct(table: Tag) -> Dict[str, Any]:
 
12
  trs = table.find_all("tr")
13
  if not trs:
14
  return {"headers": [], "rows": []}
@@ -29,7 +31,13 @@ def table_to_struct(table: Tag) -> Dict[str, Any]:
29
 
30
  return {"headers": headers, "rows": rows}
31
 
32
- def link_tables_to_sections_and_articles(soup: BeautifulSoup, sections: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
 
 
 
 
 
 
33
  tables = tables_from_soup(soup)
34
 
35
  for idx, table in enumerate(tables):
@@ -44,10 +52,11 @@ def link_tables_to_sections_and_articles(soup: BeautifulSoup, sections: List[Dic
44
  while prev:
45
  text = prev.strip()
46
 
47
- if is_article(text): # <-- استخدم is_article
 
48
  num = extract_article_number(text)
49
  for sec in reversed(sections):
50
- for art in reversed(sec["articles"]):
51
  if art.get("number") == num:
52
  target_article = art
53
  break
@@ -56,7 +65,8 @@ def link_tables_to_sections_and_articles(soup: BeautifulSoup, sections: List[Dic
56
  if target_article:
57
  break
58
 
59
- if is_section_line(text):
 
60
  for sec in reversed(sections):
61
  if sec["title"] == text:
62
  target_section = sec
@@ -66,6 +76,7 @@ def link_tables_to_sections_and_articles(soup: BeautifulSoup, sections: List[Dic
66
 
67
  prev = prev.find_previous(string=True)
68
 
 
69
  if target_article:
70
  target_article.setdefault("tables", []).append(struct)
71
  elif target_section:
 
2
  from bs4 import BeautifulSoup, Tag
3
  from typing import List, Dict, Any
4
  from helpers.cleaner import clean_text
5
+ from parser.article_extractor import is_article, extract_article_number # دوال المواد
6
+ from parser.section_extractor import is_section # دالة الأقسام
7
 
8
  def tables_from_soup(soup: BeautifulSoup) -> List[Tag]:
9
+ """استخراج جميع عناصر الجدول من صفحة HTML"""
10
  return soup.find_all("table")
11
 
12
  def table_to_struct(table: Tag) -> Dict[str, Any]:
13
+ """تحويل جدول HTML إلى هيكل JSON يحتوي على headers و rows"""
14
  trs = table.find_all("tr")
15
  if not trs:
16
  return {"headers": [], "rows": []}
 
31
 
32
  return {"headers": headers, "rows": rows}
33
 
34
+ def link_tables_to_sections_and_articles(
35
+ soup: BeautifulSoup, sections: List[Dict[str, Any]]
36
+ ) -> List[Dict[str, Any]]:
37
+ """
38
+ ربط الجداول بالأقسام أو المواد الأقرب لها.
39
+ كل جدول مرتبط بمادة أو قسم حسب النص السابق له مباشرة.
40
+ """
41
  tables = tables_from_soup(soup)
42
 
43
  for idx, table in enumerate(tables):
 
52
  while prev:
53
  text = prev.strip()
54
 
55
+ # إذا كان السطر يمثل بداية مادة
56
+ if is_article(text):
57
  num = extract_article_number(text)
58
  for sec in reversed(sections):
59
+ for art in reversed(sec.get("articles", [])):
60
  if art.get("number") == num:
61
  target_article = art
62
  break
 
65
  if target_article:
66
  break
67
 
68
+ # إذا كان السطر يمثل بداية قسم
69
+ if is_section(text):
70
  for sec in reversed(sections):
71
  if sec["title"] == text:
72
  target_section = sec
 
76
 
77
  prev = prev.find_previous(string=True)
78
 
79
+ # ربط الجدول بالمادة أو القسم أو القسم الأول إذا لم يكن هناك هدف
80
  if target_article:
81
  target_article.setdefault("tables", []).append(struct)
82
  elif target_section: