Mazenbs committed on
Commit
26aeab0
·
verified ·
1 Parent(s): 33ad481

Create parser/table_extractor.py

Browse files
Files changed (1) hide show
  1. parser/table_extractor.py +76 -0
parser/table_extractor.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # parser/table_extractor.py
2
+ from bs4 import BeautifulSoup, Tag
3
+ from typing import List, Dict, Any
4
+ from helpers.cleaner import clean_text
5
+ from parser.article_extractor import is_article_line, extract_article_number
6
+ from parser.section_extractor import is_section_line
7
+
8
def tables_from_soup(soup: BeautifulSoup):
    """Collect every ``<table>`` element contained in *soup*.

    Returns whatever ``soup.find_all("table")`` yields; the search is not
    restricted to top-level tables.
    """
    table_tags = soup.find_all("table")
    return table_tags
10
+
11
def table_to_struct(table: Tag):
    """Convert an HTML table element into a ``{"headers", "rows"}`` dict.

    The first ``<tr>`` supplies the headers; every following ``<tr>`` becomes
    one row. Each cell's text is its stripped strings joined with a single
    space and passed through ``clean_text``. Rows with fewer cells than the
    header are right-padded with empty strings; rows with more cells are kept
    at their full length.

    Returns ``{"headers": [], "rows": []}`` when the table has no ``<tr>``.
    """
    row_tags = table.find_all("tr")
    if len(row_tags) == 0:
        return {"headers": [], "rows": []}

    def cell_texts(row_tag, cell_names):
        # One cleaned text value per matching cell of the row.
        return [
            clean_text(" ".join(cell.stripped_strings))
            for cell in row_tag.find_all(cell_names)
        ]

    header_tag, *body_tags = row_tags
    headers = cell_texts(header_tag, ["th", "td"])
    width = len(headers)

    rows = []
    for body_tag in body_tags:
        cells = cell_texts(body_tag, ["td", "th"])
        # A non-positive multiplier yields [], so wide rows are left untouched.
        cells.extend([""] * (width - len(cells)))
        rows.append(cells)

    return {"headers": headers, "rows": rows}
31
+
32
def _last_article_with_number(sections, number):
    """Return the last article (in section/article order) whose "number" equals *number*, else None."""
    for sec in reversed(sections):
        for art in reversed(sec["articles"]):
            if art["number"] == number:
                return art
    return None


def _last_section_with_title(sections, title):
    """Return the last section whose "title" equals *title*, else None."""
    for sec in reversed(sections):
        if sec["title"] == title:
            return sec
    return None


def _find_table_owner(table, sections):
    """Walk the text nodes preceding *table* backwards and return the first known article or section hit, else None."""
    node = table.find_previous(string=True)
    # Compare against None explicitly: an empty text node is falsy and would
    # otherwise end the walk before the start of the document is reached.
    while node is not None:
        text = node.strip()

        if is_article_line(text):
            article = _last_article_with_number(
                sections, extract_article_number(text)
            )
            if article is not None:
                return article

        # Checked even when the line also looked like an article line but no
        # article with that number was found.
        if is_section_line(text):
            section = _last_section_with_title(sections, text)
            if section is not None:
                return section

        node = node.find_previous(string=True)

    return None


def link_tables_to_sections_and_articles(soup: BeautifulSoup, sections: List[Dict[str, Any]]):
    """Attach every table in *soup* to the article or section that precedes it.

    For each table (in ``tables_from_soup`` order) the table is converted with
    ``table_to_struct``, tagged with its index under ``"position"``, and then
    appended to the ``"tables"`` list of:

    * the nearest preceding article line whose number matches an article in
      *sections*, or
    * the nearest preceding section line whose stripped text equals a
      section's ``"title"``, whichever is met first walking backwards, or
    * ``sections[0]`` when neither is found.

    Args:
        soup: Parsed document to scan for tables.
        sections: Section dicts; each is read via ``"title"`` and
            ``"articles"``, and each article via ``"number"``. Mutated in
            place.

    Returns:
        The same *sections* list. When *sections* is empty there is nothing
        to attach tables to, so it is returned unchanged instead of raising
        ``IndexError`` on the fallback.
    """
    if not sections:
        return sections

    for idx, table in enumerate(tables_from_soup(soup)):
        struct = table_to_struct(table)
        struct["position"] = idx

        owner = _find_table_owner(table, sections)
        if owner is None:
            # No preceding article/section matched: fall back to the first section.
            owner = sections[0]
        owner.setdefault("tables", []).append(struct)

    return sections