Spaces:
Sleeping
Sleeping
Create parser/table_extractor.py
Browse files- parser/table_extractor.py +76 -0
parser/table_extractor.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# parser/table_extractor.py
|
| 2 |
+
from bs4 import BeautifulSoup, Tag
|
| 3 |
+
from typing import List, Dict, Any
|
| 4 |
+
from helpers.cleaner import clean_text
|
| 5 |
+
from parser.article_extractor import is_article_line, extract_article_number
|
| 6 |
+
from parser.section_extractor import is_section_line
|
| 7 |
+
|
| 8 |
+
def tables_from_soup(soup: BeautifulSoup):
    """Return every ``<table>`` element found in *soup*.

    The search uses ``find_all``, so the result is in document order and
    includes tables nested inside other elements.
    """
    table_tags = soup.find_all("table")
    return table_tags
|
| 10 |
+
|
| 11 |
+
def table_to_struct(table: Tag):
    """Convert a ``<table>`` element into a ``{"headers": ..., "rows": ...}`` dict.

    The first ``<tr>`` found under *table* supplies the headers; each later
    ``<tr>`` becomes one row. A cell's text is its stripped text fragments
    joined with single spaces and passed through ``clean_text``.

    Rows with fewer cells than the header are right-padded with empty
    strings. Rows with more cells than the header are kept unchanged.
    A table without any ``<tr>`` yields empty ``headers`` and ``rows``.
    """

    def _cell_texts(row):
        # Both <th> and <td> count as cells, in document order.
        return [
            clean_text(" ".join(cell.stripped_strings))
            for cell in row.find_all(["th", "td"])
        ]

    all_rows = table.find_all("tr")
    if len(all_rows) == 0:
        return {"headers": [], "rows": []}

    header_row, *body_rows = all_rows
    headers = _cell_texts(header_row)
    width = len(headers)

    rows = []
    for body_row in body_rows:
        cells = _cell_texts(body_row)
        # A non-positive shortfall multiplies to an empty list, so rows that
        # are already wide enough are left untouched.
        cells.extend([""] * (width - len(cells)))
        rows.append(cells)

    return {"headers": headers, "rows": rows}
|
| 31 |
+
|
| 32 |
+
def _find_article_by_number(sections: List[Dict[str, Any]], number):
    """Return the last article in *sections* whose ``"number"`` equals *number*.

    Sections and their ``"articles"`` lists are scanned from the end, so the
    latest matching article wins. Returns ``None`` when nothing matches.
    """
    for sec in reversed(sections):
        for art in reversed(sec["articles"]):
            if art["number"] == number:
                return art
    return None


def _find_section_by_title(sections: List[Dict[str, Any]], title):
    """Return the last section whose ``"title"`` equals *title*, else ``None``."""
    for sec in reversed(sections):
        if sec["title"] == title:
            return sec
    return None


def link_tables_to_sections_and_articles(soup: BeautifulSoup, sections: List[Dict[str, Any]]):
    """Attach every table in *soup* to the article or section that precedes it.

    For each table (in document order) the table is converted with
    ``table_to_struct`` and tagged with its index under ``"position"``. The
    text nodes before the table are then walked backwards until one of these
    is found:

    * a line accepted by ``is_article_line`` whose extracted number matches
      an article in *sections* -- the table is attached to that article;
    * a line accepted by ``is_section_line`` whose stripped text equals a
      section's ``"title"`` -- the table is attached to that section.

    If the walk reaches the start of the document without a match, the table
    is attached to the first section. Tables are appended to the owner's
    ``"tables"`` list, which is created on demand.

    Args:
        soup: Parsed document to search for tables.
        sections: Section dicts; each is read for ``"title"`` and
            ``"articles"`` (a list of dicts with a ``"number"`` key).

    Returns:
        The same *sections* list, mutated in place.
    """
    # With no sections there is nothing to attach a table to; the fallback
    # to sections[0] below would otherwise raise IndexError.
    if not sections:
        return sections

    for idx, table in enumerate(tables_from_soup(soup)):
        struct = table_to_struct(table)
        struct["position"] = idx

        target_article = None
        target_section = None

        # Compare against None rather than relying on truthiness: an empty
        # text node is falsy and would cut the backward walk short.
        prev = table.find_previous(string=True)
        while prev is not None:
            text = prev.strip()

            if is_article_line(text):
                target_article = _find_article_by_number(
                    sections, extract_article_number(text)
                )
                if target_article is not None:
                    break

            # An article line with no matching article may still be a
            # section line, so this check is not an elif.
            if is_section_line(text):
                target_section = _find_section_by_title(sections, text)
                if target_section is not None:
                    break

            prev = prev.find_previous(string=True)

        if target_article is not None:
            owner = target_article
        elif target_section is not None:
            owner = target_section
        else:
            owner = sections[0]
        owner.setdefault("tables", []).append(struct)

    return sections
|