Spaces:
Running
Running
| # parser/table_extractor.py | |
| from bs4 import BeautifulSoup, Tag | |
| from typing import List, Dict, Any | |
| from helpers.cleaner import clean_text | |
| from parser.article_extractor import is_article_line, extract_article_number | |
| from parser.section_extractor import is_section_line | |
def tables_from_soup(soup: BeautifulSoup):
    """Collect the ``<table>`` elements of a parsed document.

    Args:
        soup: Parsed document (or any object exposing ``find_all``).

    Returns:
        Whatever ``soup.find_all("table")`` yields, unmodified.
    """
    table_tags = soup.find_all("table")
    return table_tags
def table_to_struct(table: Tag):
    """Convert a ``<table>`` tag into a plain ``{"headers", "rows"}`` dict.

    The first ``<tr>`` supplies the headers; every following ``<tr>``
    becomes one row. A cell's text is its stripped strings joined with a
    single space and then passed through ``clean_text``.

    Args:
        table: The table element to convert.

    Returns:
        ``{"headers": [...], "rows": [[...], ...]}``. Rows with fewer cells
        than the header are right-padded with ``""``; rows with more cells
        are kept at their full length. A table without any ``<tr>`` yields
        empty lists for both keys.
    """
    # NOTE(review): find_all searches all descendants, so rows and cells of
    # tables nested inside this one are picked up as well - confirm intended.

    def cell_texts(row):
        # Header and data cells are treated alike, in the order find_all
        # returns them.
        return [
            clean_text(" ".join(cell.stripped_strings))
            for cell in row.find_all(["th", "td"])
        ]

    row_tags = table.find_all("tr")
    if not row_tags:
        return {"headers": [], "rows": []}

    header_tag, *body_tags = row_tags
    headers = cell_texts(header_tag)
    width = len(headers)

    rows = []
    for body_tag in body_tags:
        cells = cell_texts(body_tag)
        # A negative shortfall multiplies to an empty list, so rows that are
        # already wide enough are left untouched.
        cells.extend([""] * (width - len(cells)))
        rows.append(cells)

    return {"headers": headers, "rows": rows}
def _match_article(sections, number):
    """Return the article dict whose ``"number"`` equals *number*, else None.

    Sections and their articles are scanned last-to-first, so when several
    articles carry the same number the one listed latest wins.
    """
    for section in reversed(sections):
        # A section with no (or a None) "articles" entry has nothing to match.
        for article in reversed(section.get("articles") or []):
            if article["number"] == number:
                return article
    return None


def _match_section(sections, title):
    """Return the last section whose ``"title"`` equals *title*, else None."""
    for section in reversed(sections):
        if section["title"] == title:
            return section
    return None


def _find_table_owner(table, sections):
    """Walk backwards from *table* to the nearest known article/section line.

    Each text node preceding the table is stripped and tested: an article
    line is resolved by its number, a section line by exact title match.
    Lines that look like headings but match nothing in *sections* are
    skipped and the walk continues further back.

    Returns the matching article or section dict, or None if the start of
    the document is reached without a match.
    """
    node = table.find_previous(string=True)
    while node:
        text = node.strip()
        owner = None
        if is_article_line(text):
            owner = _match_article(sections, extract_article_number(text))
        # An article match takes precedence; only fall back to the section
        # test when the line did not resolve to a known article.
        if owner is None and is_section_line(text):
            owner = _match_section(sections, text)
        if owner is not None:
            return owner
        node = node.find_previous(string=True)
    return None


def link_tables_to_sections_and_articles(
    soup: BeautifulSoup, sections: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Attach every table in *soup* to the article or section it follows.

    Each table is converted with ``table_to_struct``, tagged with its
    zero-based ``"position"`` among the document's tables, and appended to
    the ``"tables"`` list (created on demand) of the nearest preceding
    article or section found in *sections*. Tables with no matching
    heading before them are attached to the first section.

    Args:
        soup: Parsed document to take the tables from.
        sections: Section dicts, each with a ``"title"`` and optionally an
            ``"articles"`` list of dicts that carry a ``"number"``. The
            dicts are modified in place.

    Returns:
        The same *sections* list that was passed in. When *sections* is
        empty there is nothing to attach to, so it is returned unchanged
        (previously this case raised ``IndexError`` on the fallback).
    """
    if not sections:
        return sections

    for position, table in enumerate(tables_from_soup(soup)):
        struct = table_to_struct(table)
        struct["position"] = position

        owner = _find_table_owner(table, sections)
        if owner is None:
            # No heading precedes the table: file it under the first section.
            owner = sections[0]
        owner.setdefault("tables", []).append(struct)

    return sections