extract_html_full / parser /table_extractor.py
Mazenbs's picture
Create parser/table_extractor.py
26aeab0 verified
# parser/table_extractor.py
from bs4 import BeautifulSoup, Tag
from typing import List, Dict, Any
from helpers.cleaner import clean_text
from parser.article_extractor import is_article_line, extract_article_number
from parser.section_extractor import is_section_line
def tables_from_soup(soup: BeautifulSoup):
    """Collect every ``<table>`` element contained in *soup*.

    Args:
        soup: Parsed document (or any object exposing ``find_all``).

    Returns:
        The result of ``soup.find_all("table")``, in document order. The
        search covers all descendants, so tables nested inside other tables
        are returned as separate entries.
    """
    table_tags = soup.find_all("table")
    return table_tags
def _in_nested_table(tag: Tag, root: Tag) -> bool:
    """Return True if a ``<table>`` sits strictly between *tag* and *root*."""
    for ancestor in tag.parents:
        if ancestor is root:
            return False
        if ancestor.name == "table":
            return True
    return False


def table_to_struct(table: Tag):
    """Convert an HTML table element into a ``{"headers", "rows"}`` dict.

    The first ``<tr>`` of the table supplies the headers; every following
    ``<tr>`` becomes one row. Cell text is the cell's stripped strings joined
    with single spaces and passed through ``clean_text``.

    Rows and cells that belong to a table nested inside *table* are ignored
    here, so an inner table does not inject extra rows or columns into the
    outer one. The text of a nested table still appears inside the outer
    cell that contains it.

    Args:
        table: The element to convert (normally a ``<table>`` tag).

    Returns:
        ``{"headers": [...], "rows": [[...], ...]}``. Rows shorter than the
        header row are right-padded with empty strings; rows longer than the
        header row are kept as they are. A table without any row yields
        ``{"headers": [], "rows": []}``.
    """
    own_rows = [
        tr for tr in table.find_all("tr") if not _in_nested_table(tr, table)
    ]
    if not own_rows:
        return {"headers": [], "rows": []}

    def cell_texts(row: Tag) -> List[str]:
        # find_all() is recursive, so filter out cells of nested tables.
        return [
            clean_text(" ".join(cell.stripped_strings))
            for cell in row.find_all(["th", "td"])
            if not _in_nested_table(cell, table)
        ]

    headers = cell_texts(own_rows[0])
    num_cols = len(headers)

    rows = []
    for tr in own_rows[1:]:
        cols = cell_texts(tr)
        # A negative difference multiplies to an empty list, so rows that
        # are already wide enough are left untouched.
        cols.extend([""] * (num_cols - len(cols)))
        rows.append(cols)

    return {"headers": headers, "rows": rows}
def _find_article_by_number(sections: List[Dict[str, Any]], number: Any):
    """Return the last article whose ``"number"`` equals *number*, else None."""
    for sec in reversed(sections):
        # Sections without an "articles" key are treated as having none.
        for art in reversed(sec.get("articles", [])):
            if art["number"] == number:
                return art
    return None


def _find_section_by_title(sections: List[Dict[str, Any]], title: str):
    """Return the last section whose ``"title"`` equals *title*, else None."""
    for sec in reversed(sections):
        if sec["title"] == title:
            return sec
    return None


def _find_table_owner(table: Tag, sections: List[Dict[str, Any]]):
    """Walk backwards from *table* to the nearest known article or section.

    Each preceding text node is stripped and tested, nearest first. An
    article line is resolved through ``extract_article_number``; a section
    line is matched against section titles. Lines that look like a heading
    but match nothing in *sections* are skipped and the walk continues.

    Returns:
        The matching article or section dict, or None if the start of the
        document is reached without a match.
    """
    prev = table.find_previous(string=True)
    # Compare against None explicitly: an empty text node is falsy and would
    # otherwise end the walk before the real heading is reached.
    while prev is not None:
        text = prev.strip()
        if is_article_line(text):
            article = _find_article_by_number(
                sections, extract_article_number(text)
            )
            if article is not None:
                return article
        if is_section_line(text):
            section = _find_section_by_title(sections, text)
            if section is not None:
                return section
        prev = prev.find_previous(string=True)
    return None


def link_tables_to_sections_and_articles(soup: BeautifulSoup, sections: List[Dict[str, Any]]):
    """Attach every table in *soup* to the article or section preceding it.

    Each table is converted with ``table_to_struct``, tagged with its index
    among all tables under the ``"position"`` key, and appended to the
    ``"tables"`` list of its owner (the list is created on first use). The
    owner is the nearest preceding article or section heading that can be
    matched in *sections*; tables with no such heading go to ``sections[0]``.

    Args:
        soup: Parsed document to scan for tables.
        sections: Section dicts, mutated in place. Each is read via its
            ``"title"`` key and, when present, its ``"articles"`` list, whose
            entries are read via their ``"number"`` key.

    Returns:
        The same *sections* list. If it is empty there is nothing to attach
        tables to, so it is returned unchanged instead of raising
        ``IndexError`` on the fallback.
    """
    if not sections:
        return sections

    for idx, table in enumerate(tables_from_soup(soup)):
        struct = table_to_struct(table)
        struct["position"] = idx

        owner = _find_table_owner(table, sections)
        if owner is None:
            # No recognisable heading precedes the table.
            owner = sections[0]
        owner.setdefault("tables", []).append(struct)

    return sections