# parser/table_extractor.py
"""Extract HTML tables and attach them to previously parsed sections/articles."""
from typing import Any, Dict, List, Optional

from bs4 import BeautifulSoup, Tag

from helpers.cleaner import clean_text
from parser.article_extractor import is_article_line, extract_article_number
from parser.section_extractor import is_section_line


def tables_from_soup(soup: BeautifulSoup):
    """Return the result of ``soup.find_all("table")``.

    The search is recursive, so tables nested inside other tables are
    returned as separate entries as well.
    """
    return soup.find_all("table")


def _cell_text(cell: Tag) -> str:
    """Join a cell's stripped text fragments with spaces and run ``clean_text``."""
    return clean_text(" ".join(cell.stripped_strings))


def table_to_struct(table: Tag) -> Dict[str, Any]:
    """Convert a ``<table>`` tag into ``{"headers": [...], "rows": [[...], ...]}``.

    The first ``<tr>`` supplies the headers (both ``<th>`` and ``<td>`` cells
    are accepted); every following ``<tr>`` becomes one row of cell texts.
    Rows shorter than the header are right-padded with empty strings; rows
    longer than the header are kept as they are (not truncated).

    A table without any ``<tr>`` yields empty ``headers`` and ``rows``.
    """
    trs = table.find_all("tr")
    if not trs:
        return {"headers": [], "rows": []}

    headers = [_cell_text(cell) for cell in trs[0].find_all(["th", "td"])]
    width = len(headers)

    rows = []
    for tr in trs[1:]:
        cells = [_cell_text(cell) for cell in tr.find_all(["td", "th"])]
        # A negative repeat count yields an empty list, so rows that are
        # already wide enough are left untouched.
        cells.extend([""] * (width - len(cells)))
        rows.append(cells)

    return {"headers": headers, "rows": rows}


def _find_article(
    sections: List[Dict[str, Any]], number: Any
) -> Optional[Dict[str, Any]]:
    """Return the last article (searching sections and articles backwards)
    whose ``"number"`` equals *number*, or ``None``."""
    for sec in reversed(sections):
        for art in reversed(sec["articles"]):
            if art["number"] == number:
                return art
    return None


def _find_section(
    sections: List[Dict[str, Any]], title: str
) -> Optional[Dict[str, Any]]:
    """Return the last section whose ``"title"`` equals *title*, or ``None``."""
    for sec in reversed(sections):
        if sec["title"] == title:
            return sec
    return None


def _find_owner(
    table: Tag, sections: List[Dict[str, Any]]
) -> Optional[Dict[str, Any]]:
    """Walk backwards through the text nodes preceding *table* and return the
    first known article or section they identify, or ``None`` if none match."""
    node = table.find_previous(string=True)
    # Compare against None explicitly: an empty text node is falsy and would
    # otherwise end the walk before the start of the document is reached.
    while node is not None:
        text = node.strip()

        if is_article_line(text):
            article = _find_article(sections, extract_article_number(text))
            if article is not None:
                return article

        # Checked even when the text looked like an article line but no
        # matching article was found.
        if is_section_line(text):
            section = _find_section(sections, text)
            if section is not None:
                return section

        node = node.find_previous(string=True)
    return None


def link_tables_to_sections_and_articles(
    soup: BeautifulSoup, sections: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Attach every table in *soup* to the article or section that precedes it.

    Each table is converted with ``table_to_struct`` and given a
    ``"position"`` key holding its index among all tables of the document.
    The text nodes before the table are then scanned backwards; the first one
    that identifies a known article (matched on ``"number"``) or a known
    section (matched on ``"title"``) receives the table in its ``"tables"``
    list. Tables with no such predecessor are attached to ``sections[0]``.

    Args:
        soup: Parsed document to search for tables.
        sections: Section dicts; each is read for ``"title"`` and
            ``"articles"`` (a sequence of dicts with a ``"number"`` key).
            The dicts are mutated in place.

    Returns:
        The same *sections* list. If *sections* is empty there is nothing to
        attach tables to, so it is returned unchanged.
    """
    if not sections:
        # Without this guard the sections[0] fallback below raises IndexError.
        return sections

    for idx, table in enumerate(tables_from_soup(soup)):
        struct = table_to_struct(table)
        struct["position"] = idx

        owner = _find_owner(table, sections)
        if owner is None:
            owner = sections[0]
        owner.setdefault("tables", []).append(struct)

    return sections