|
|
from bs4 import BeautifulSoup |
|
|
import json |
|
|
import pandas as pd |
|
|
import re |
|
|
|
|
|
|
|
|
def get_table_metadata(table, base_url):
    """Collect metadata describing a <table> element in a PMC article page.

    Parameters
    ----------
    table : bs4.Tag
        The <table> element to describe.
    base_url : str
        Article URL ending in "#"; the enclosing section id is appended to
        build a deep link to the table's section.

    Returns
    -------
    tuple
        (name, caption, footnotes, headings, label, referee_id, section_url)
        where ``footnotes`` maps superscript markers to footnote text.
    """
    # Walk up to the nearest <section> ancestor that carries an id.
    section = table.find_parent('section')
    while section and not section.get('id'):
        section = section.find_parent('section')

    section_id = section.get("id") if section else None

    if section_id:
        section_url = base_url + section_id

        # The heading element is tagged with the same anchor id. Quote the
        # attribute value: unquoted CSS attribute values may not contain
        # dots or start with a digit, so ids like "sec-1.2" would raise.
        heading_el = section.select_one(f'[data-anchor-id="{section_id}"]')
        section_heading = heading_el.get_text(strip=True) if heading_el else ""

        # One level up gives the enclosing section's title, used as a
        # secondary breadcrumb component.
        parent_sec = section.find_parent('section')
        subheading_el = parent_sec.select_one(".pmc_sec_title") if parent_sec else None
        section_subheading = subheading_el.get_text(strip=True) if subheading_el else ""

        headings = " > ".join(filter(None, [section_heading, section_subheading]))
    else:
        # Table is not inside an id-bearing section; no anchor to link to.
        section_url = base_url
        headings = ""

    # Table title and caption, falling back to the <caption> element when
    # no enclosing section exists.
    name_el = section.find("h4") if section else table.find("caption")
    name = name_el.get_text(strip=True) if name_el else "Table"
    caption_el = section.select_one('.caption p') if section else table.find("caption")
    caption = caption_el.get_text(strip=True) if caption_el else ""

    # Derive the table number from a "tbl-N" style section id.
    section = table.find_parent("section", id=True)
    table_id = section["id"] if section and "tbl-" in section["id"] else None
    match = re.search(r"tbl-(\d+)", table_id or "")
    number = match.group(1).lstrip("0") if match else ""
    referee_id = f"table_{number}" if number else "table_unknown"
    label = f"Table {number}. " + caption if number else "Table"

    # Footnotes keyed by their superscript marker (e.g. "a", "*", "#").
    footnotes = {}
    for sup in section.select('.fn sup') if section else []:
        sibling = sup.find_next_sibling("p")
        if sibling:
            key = sup.get_text(strip=True)
            footnotes[key] = sibling.get_text(strip=True)

    # Also capture "*"/"#"-prefixed footnotes embedded inside a single <p>.
    for p in section.select('.fn p') if section else []:
        matches = re.findall(r"(?<=(\*|#))\s*(.*?)(?=\s\*|\s#|$)", p.get_text())
        for key, text in matches:
            footnotes[key] = text.strip()

    return name, caption, footnotes, headings, label, referee_id, section_url
|
|
|
|
|
|
|
|
def _consume_rowspans(tracker, row, col_index):
    """Append values carried down by active rowspans starting at *col_index*.

    Mutates *row* and *tracker* in place; returns the first column index not
    occupied by a carried-down value.
    """
    while col_index in tracker:
        value, remaining = tracker[col_index]
        row.append(value)
        if remaining > 1:
            tracker[col_index] = (value, remaining - 1)
        else:
            # Last row covered by this rowspan — stop tracking the column.
            del tracker[col_index]
        col_index += 1
    return col_index


def get_table_data(table, footnotes):
    """Flatten a <table> element into a list of rows of cell strings.

    Rowspan cells are repeated into the rows they span; full-width
    (colspan > 1) cells are treated as subsection headers and prepended to
    every following data row. Footnote markers found in a cell's <sup>
    elements are expanded inline using *footnotes*.

    Parameters
    ----------
    table : bs4.Tag
        The <table> element to extract.
    footnotes : dict
        Mapping of superscript marker -> footnote text (see
        get_table_metadata).

    Returns
    -------
    list[list[str]]
        One list of cell strings per non-empty table row.
    """
    table_data = []
    rowspan_tracker = {}  # col index -> (cell text, rows still spanned)
    subsec = ""  # most recent subsection header seen

    for tr in table.find_all("tr"):
        row = []
        col_index = 0

        for cell in tr.find_all(["th", "td"]):
            # FIX: drain carried rowspan values before *every* cell, not
            # only at the start of the row — otherwise a rowspan in a
            # middle column is dropped and later columns shift left.
            col_index = _consume_rowspans(rowspan_tracker, row, col_index)

            cell_text = cell.get_text(separator="\n", strip=True)
            cell_sups = [sup.get_text() for sup in cell.find_all("sup")]

            if cell_sups:
                # The "\n" separator splits superscript markers onto their
                # own lines; keep only the substantive fragments.
                lines = [t for t in cell_text.split("\n") if len(t) > 1]
                cell_text = " ".join(lines)

                # Expand each known footnote marker inline.
                for sup in cell_sups:
                    if sup in footnotes:
                        cell_text += f" ({footnotes[sup]})"

            colspan = int(cell.get("colspan", 1))
            if colspan > 1:
                # A wide cell is a subsection header, not row data.
                subsec = cell_text
                continue

            row.append(cell_text)

            rowspan = int(cell.get("rowspan", 1))
            if rowspan > 1:
                # Remember this value for the next (rowspan - 1) rows.
                rowspan_tracker[col_index] = (cell_text, rowspan - 1)

            col_index += 1

        # FIX: also drain rowspans that cover columns after the row's last
        # real cell, so trailing rowspan columns are not silently lost.
        col_index = _consume_rowspans(rowspan_tracker, row, col_index)

        if row:
            if subsec:
                row.insert(0, subsec)
            table_data.append(row)

    return table_data
|
|
|
|
|
|
|
|
def to_text(table_data, label, caption):
    """Render extracted table rows as a bracketed text block.

    The first row of *table_data* is taken as the header; every other row
    becomes "{Row i - header: value, ...}" with empty values omitted.
    *caption* is accepted for interface compatibility but is not rendered
    here (the caption is already folded into *label* upstream).
    """
    if table_data:
        header_row, body_rows = table_data[0], table_data[1:]
    else:
        header_row, body_rows = [], []

    parts = [f"**{label}**"]
    for index, cells in enumerate(body_rows, start=1):
        row_text = ", ".join(
            f"{col}: {val}" for col, val in zip(header_row, cells) if val
        )
        parts.append(f"{{Row {index} - {row_text}}}")

    return "[" + "\n".join(parts) + "]"
|
|
|
|
|
|
|
|
def to_chunk(text_block, section_url, referee_id, headings):
    """Wrap a rendered table in a retrieval-chunk dict.

    Parameters
    ----------
    text_block : str
        Rendered table text (see to_text).
    section_url : str
        Deep link to the table's section.
    referee_id : str
        Stable identifier such as "table_1".
    headings : str
        Section breadcrumb such as "Results > Demographics".

    Returns
    -------
    dict
        {"text": ..., "metadata": {...}}.

    FIX: *headings* was accepted but never stored, so the breadcrumb was
    silently lost; it is now included in the metadata (backward-compatible
    key addition).
    """
    return {
        "text": text_block,
        "metadata": {
            "section": section_url,
            "type": "HTML table",
            "referee_id": referee_id,
            "headings": headings,
        },
    }
|
|
|
|
|
|
|
|
def tables_to_json(input_path="bipolar.html", base_url="https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#"):
    """Parse every <table> in *input_path* into retrieval-chunk dicts.

    Reads the HTML file, extracts metadata, data rows and footnotes for
    each table, renders them to text, and returns the resulting list of
    chunks (see to_chunk).
    """
    with open(input_path, encoding="utf-8") as handle:
        markup = handle.read()
    soup = BeautifulSoup(markup, features="html.parser")

    tables = soup.find_all("table")
    print(f"Found {len(tables)} tables in document.")

    chunks = []
    for table in tables:
        (name, caption, footnotes, headings,
         label, referee_id, section_url) = get_table_metadata(table, base_url)
        rows = get_table_data(table, footnotes)
        rendered = to_text(rows, label, caption)
        chunks.append(to_chunk(rendered, section_url, referee_id, headings))

    return chunks
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # NOTE(review): entry point is a stub. Presumably it should call
    # tables_to_json() and serialize the result (the `json` import above
    # is otherwise unused) — confirm intended behavior before wiring up.
    pass