Spaces:

Mazenbs
/

extract_html_full

Running

App Files Files Community

extract_html_full / parser /table_extractor.py

Mazenbs

Create parser/table_extractor.py

26aeab0 verified about 1 month ago

raw

history blame contribute delete

2.36 kB

	# parser/table_extractor.py
	from bs4 import BeautifulSoup, Tag
	from typing import List, Dict, Any
	from helpers.cleaner import clean_text
	from parser.article_extractor import is_article_line, extract_article_number
	from parser.section_extractor import is_section_line

	def tables_from_soup(soup: BeautifulSoup):
	    """Return every ``<table>`` element found in *soup*.

	    This is a thin wrapper around ``soup.find_all("table")``; the default
	    recursive search means tables nested inside other tables are returned
	    as separate entries as well.
	    """
	    return soup.find_all("table")

	def _in_nested_table(node: Tag, root: Tag) -> bool:
	    """Return True when a ``<table>`` lies strictly between *node* and *root*."""
	    for parent in node.parents:
	        if parent is root:
	            return False
	        if parent.name == "table":
	            return True
	    return False


	def _cell_texts(tr: Tag):
	    """Return the cleaned text of each ``<th>``/``<td>`` that belongs to *tr* itself."""
	    return [
	        clean_text(" ".join(cell.stripped_strings))
	        for cell in tr.find_all(["th", "td"])
	        # find_all is recursive: drop cells that live in a table nested in
	        # one of this row's cells (their text is already part of that cell).
	        if not _in_nested_table(cell, tr)
	    ]


	def table_to_struct(table: Tag):
	    """Convert a table element into ``{"headers": [...], "rows": [[...], ...]}``.

	    The first ``<tr>`` supplies the headers and every following ``<tr>``
	    becomes one row. For each cell the stripped text fragments are joined
	    with single spaces and passed through ``clean_text``.

	    Rows with fewer cells than the header row are right-padded with ``""``;
	    rows with more cells are kept unchanged. When no ``<tr>`` is found the
	    result has empty ``headers`` and ``rows`` lists.

	    Rows and cells that belong to a table nested inside *table* are not
	    reported as rows/cells of *table*; their text still shows up inside the
	    text of the enclosing cell.
	    """
	    # find_all("tr") also descends into nested tables, so keep only the rows
	    # whose nearest enclosing table is the one being converted.
	    trs = [tr for tr in table.find_all("tr") if not _in_nested_table(tr, table)]
	    if not trs:
	        return {"headers": [], "rows": []}

	    headers = _cell_texts(trs[0])
	    num_cols = len(headers)

	    rows = []
	    for tr in trs[1:]:
	        cols = _cell_texts(tr)
	        # Pad short rows up to the header width; a non-positive count adds nothing.
	        cols.extend([""] * (num_cols - len(cols)))
	        rows.append(cols)

	    return {"headers": headers, "rows": rows}

	def _find_article(sections, number):
	    """Return the last article in *sections* whose ``"number"`` equals *number*, or None."""
	    for sec in reversed(sections):
	        for art in reversed(sec.get("articles") or []):
	            if "number" in art and art["number"] == number:
	                return art
	    return None


	def _find_section(sections, title):
	    """Return the last section in *sections* whose ``"title"`` equals *title*, or None."""
	    for sec in reversed(sections):
	        if sec.get("title") == title:
	            return sec
	    return None


	def link_tables_to_sections_and_articles(soup: BeautifulSoup, sections: List[Dict[str, Any]]):
	    """Attach every table in *soup* to the article or section that precedes it.

	    Each table is converted with ``table_to_struct`` and tagged with a
	    ``"position"`` key holding its index among the tables of *soup*. The
	    text nodes before the table are then scanned backwards:

	    * the first article line whose number matches an article in *sections*
	      makes that article the owner;
	    * the first section line whose text equals a section ``"title"`` makes
	      that section the owner;
	    * if the scan reaches the start of the document without a match, the
	      first section is used.

	    The struct is appended to the owner's ``"tables"`` list (created on
	    demand). *sections* is modified in place and also returned. When
	    *sections* is empty there is nothing to attach to, so it is returned
	    unchanged.
	    """
	    if not sections:
	        return sections

	    for idx, table in enumerate(tables_from_soup(soup)):
	        struct = table_to_struct(table)
	        struct["position"] = idx

	        owner = None
	        prev = table.find_previous(string=True)

	        # Compare against None rather than relying on truthiness: text nodes
	        # are str subclasses, so an empty one would be falsy and would end
	        # the backward walk too early.
	        while prev is not None:
	            text = prev.strip()

	            if is_article_line(text):
	                owner = _find_article(sections, extract_article_number(text))
	                if owner is not None:
	                    break

	            if is_section_line(text):
	                owner = _find_section(sections, text)
	                if owner is not None:
	                    break

	            prev = prev.find_previous(string=True)

	        if owner is None:
	            # Nothing recognisable precedes the table: fall back to the first section.
	            owner = sections[0]
	        owner.setdefault("tables", []).append(struct)

	    return sections