# File size: 6,035 Bytes  (extraction artifact)
# 3530638
# NOTE: a line-number gutter from the original extraction was removed here.
from bs4 import BeautifulSoup
import json
import pandas as pd
import re
def get_table_metadata(table, base_url):
    """Extract display metadata for one <table> from a PMC article page.

    Parameters
    ----------
    table : bs4.Tag
        The <table> element to describe.
    base_url : str
        Article URL ending in "#", used to build section anchor URLs.

    Returns
    -------
    tuple
        (name, caption, footnotes, headings, label, referee_id, section_url)
        where *footnotes* maps marker symbols (e.g. "*", "#") to note text.
    """
    # Find the nearest ancestor <section> that carries an id attribute.
    section = table.find_parent('section')
    while section and not section.get('id'):
        section = section.find_parent('section')
    section_id = section.get("id") if section else None

    if section_id:
        section_url = base_url + section_id
        # Main heading: the element anchored to this section. Quote the
        # attribute value so ids containing CSS metacharacters (dots,
        # colons) do not break the soupsieve selector.
        heading_el = section.select_one(f'[data-anchor-id="{section_id}"]')
        section_heading = heading_el.get_text(strip=True) if heading_el else ""
        # Subheading comes from the enclosing parent section, if any.
        parent_sec = section.find_parent('section')
        subheading_el = parent_sec.select_one(".pmc_sec_title") if parent_sec else None
        section_subheading = subheading_el.get_text(strip=True) if subheading_el else ""
        headings = " > ".join(filter(None, [section_heading, section_subheading]))
    else:
        # Fallback if no section id is found.
        section_url = base_url
        headings = ""

    # Table name and caption.
    name_el = section.find("h4") if section else table.find("caption")
    name = name_el.get_text(strip=True) if name_el else "Table"
    caption_el = section.select_one('.caption p') if section else table.find("caption")
    caption = caption_el.get_text(strip=True) if caption_el else ""

    # Derive referee_id from the section id, e.g. "tbl-1" -> "table_1".
    # Look for a parent <section> whose id contains 'tbl-'.
    section = table.find_parent("section", id=True)
    table_id = section["id"] if section and "tbl-" in section["id"] else None
    match = re.search(r"tbl-(\d+)", table_id or "")
    number = match.group(1).lstrip("0") if match else ""
    referee_id = f"table_{number}" if number else "table_unknown"
    label = f"Table {number}. " + caption if number else "Table"

    # Collect footnotes keyed by their marker symbol.
    footnotes = {}
    # Case 1: <sup> marker followed by a sibling <p> holding the note text.
    for sup in section.select('.fn sup') if section else []:
        sibling = sup.find_next_sibling("p")
        if sibling:
            key = sup.get_text(strip=True)
            footnotes[key] = sibling.get_text(strip=True)
    # Case 2: marker and text live in the same <p>, e.g. "* text # text".
    for p in section.select('.fn p') if section else []:
        matches = re.findall(r"(?<=(\*|#))\s*(.*?)(?=\s\*|\s#|$)", p.get_text())
        for key, text in matches:
            footnotes[key] = text.strip()

    return name, caption, footnotes, headings, label, referee_id, section_url
def get_table_data(table, footnotes):
    """Convert a <table> element into a list of rows of cell strings.

    Footnote markers found in a cell's <sup> tags are expanded inline
    from the *footnotes* mapping. A cell with colspan > 1 is treated as
    a subsection header and prepended to subsequent data rows; a cell
    with rowspan > 1 is repeated into every row it covers.

    Parameters
    ----------
    table : bs4.Tag
        The <table> element.
    footnotes : dict
        Marker symbol -> footnote text.

    Returns
    -------
    list[list[str]]
    """
    table_data = []
    rowspan_tracker = {}  # column index -> (carried value, rows remaining)
    subsec = ""

    def _consume_carried(row, col_index):
        # Append any values carried into consecutive columns by earlier
        # rowspans, decrementing their remaining-row counters.
        while col_index in rowspan_tracker:
            value, remaining = rowspan_tracker[col_index]
            row.append(value)
            remaining -= 1
            if remaining:
                rowspan_tracker[col_index] = (value, remaining)
            else:
                del rowspan_tracker[col_index]
            col_index += 1
        return col_index

    for tr in table.find_all("tr"):
        row = []
        col_index = _consume_carried(row, 0)
        for cell in tr.find_all(["th", "td"]):
            cell_text = cell.get_text(separator="\n", strip=True)
            cell_sups = [sup.get_text() for sup in cell.find_all("sup")]
            if cell_sups:
                # Re-join multi-line text, dropping 1-char fragments
                # (the superscript markers themselves).
                lines = [t for t in cell_text.split("\n") if len(t) > 1]
                cell_text = " ".join(lines)
                # Append footnote text for any recognized marker.
                for sup in cell_sups:
                    if sup in footnotes:
                        cell_text += f" ({footnotes[sup]})"
            # A wide cell acts as a subsection marker, not data.
            colspan = int(cell.get("colspan", 1))
            if colspan > 1:
                subsec = cell_text
                continue
            row.append(cell_text)
            # Track rowspan for this column.
            rowspan = int(cell.get("rowspan", 1))
            if rowspan > 1:
                rowspan_tracker[col_index] = (cell_text, rowspan - 1)
            col_index += 1
            # BUG FIX: carried rowspans were previously consumed only at
            # the start of a row, so a rowspan originating in an interior
            # or trailing column was dropped from the rows it covered.
            # Consume them between cells (and after the last cell) too.
            col_index = _consume_carried(row, col_index)
        if row:
            if subsec:
                row.insert(0, subsec)
            table_data.append(row)
    return table_data
def to_text(table_data, label, caption):
    """Render a parsed table as a bracketed, line-per-row text block.

    The first row of *table_data* is treated as the header; each later
    row becomes a "{Row i - header: value, ...}" line, skipping empty
    values. *caption* is accepted for interface compatibility but is
    not used here (the label already embeds it upstream).
    """
    headers = table_data[0] if table_data else []
    pieces = [f"**{label}**"]
    for idx, values in enumerate(table_data[1:], start=1):
        pairs = [f"{col}: {val}" for col, val in zip(headers, values) if val]
        pieces.append("{Row " + str(idx) + " - " + ", ".join(pairs) + "}")
    return "[" + "\n".join(pieces) + "]"
def to_chunk(text_block, section_url, referee_id, headings):
    """Wrap a rendered table in the chunk dict shape used downstream.

    *headings* is accepted for interface compatibility; it is currently
    excluded from the metadata (see the commented-out key below).
    """
    metadata = {
        "section": section_url,
        "type": "HTML table",
        "referee_id": referee_id,
        # "headings": headings,
    }
    return {"text": text_block, "metadata": metadata}
def tables_to_json(input_path="bipolar.html", base_url="https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#"):
    """Parse every <table> in *input_path* into a list of chunk dicts.

    Parameters
    ----------
    input_path : str
        Path to the saved article HTML.
    base_url : str
        Article URL ending in "#", used to build section anchor URLs.

    Returns
    -------
    list[dict]
        One chunk (text + metadata) per table, in document order.
    """
    with open(input_path, encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, features="html.parser")
    tables = soup.find_all("table")
    print(f"Found {len(tables)} tables in document.")

    doc = []
    for tbl in tables:
        (name, caption, footnotes, headings,
         label, referee_id, section_url) = get_table_metadata(tbl, base_url)
        rows = get_table_data(tbl, footnotes)
        rendered = to_text(rows, label, caption)
        doc.append(to_chunk(rendered, section_url, referee_id, headings))
    return doc
if __name__ == "__main__":
    # Extraction is disabled by default; uncomment to regenerate tables.json.
    # (Fixed: a stray '|' artifact after `pass` made this line invalid Python.)
    # doc = tables_to_json()
    # with open("tables.json", "w", encoding="utf-8") as f:
    #     json.dump(doc, f, indent=4)
    pass