"""Extract HTML tables from a PMC article page into JSON-ready text chunks.

Each <table> in the document is paired with metadata pulled from its
enclosing <section> (anchor URL, headings, caption, footnotes) and
flattened into a text block suitable for downstream chunking/indexing.
"""

import json
import re

from bs4 import BeautifulSoup
import pandas as pd  # noqa: F401 -- currently unused; kept for compatibility


def get_table_metadata(table, base_url):
    """Collect metadata for *table* from its surrounding markup.

    Returns a 7-tuple:
    ``(name, caption, footnotes, headings, label, referee_id, section_url)``.
    """
    # Find the nearest ancestor <section> that has an id.
    section = table.find_parent("section")
    while section and not section.get("id"):
        section = section.find_parent("section")
    section_id = section.get("id") if section else None

    if section_id:
        section_url = base_url + section_id
        # Main heading: the element anchored to this section id.  The
        # attribute value is quoted so ids containing CSS metacharacters
        # (e.g. '.') do not break the selector.
        heading_el = section.select_one(f'[data-anchor-id="{section_id}"]')
        section_heading = heading_el.get_text(strip=True) if heading_el else ""
        # Subheading: the title of the parent section, if any.
        parent_sec = section.find_parent("section")
        subheading_el = parent_sec.select_one(".pmc_sec_title") if parent_sec else None
        section_subheading = subheading_el.get_text(strip=True) if subheading_el else ""
        headings = " > ".join(filter(None, [section_heading, section_subheading]))
    else:
        # Fallback if no section id is found.
        section_url = base_url
        headings = ""

    # Table name and caption.
    name_el = section.find("h4") if section else table.find("caption")
    name = name_el.get_text(strip=True) if name_el else "Table"
    caption_el = section.select_one(".caption p") if section else table.find("caption")
    caption = caption_el.get_text(strip=True) if caption_el else ""

    # Generate a referee_id from the enclosing section id containing
    # 'tbl-', e.g. "tbl-1" -> number "1" -> referee_id "table_1".
    section = table.find_parent("section", id=True)
    table_id = section["id"] if section and "tbl-" in section["id"] else None
    match = re.search(r"tbl-(\d+)", table_id or "")
    number = match.group(1).lstrip("0") if match else ""
    referee_id = f"table_{number}" if number else "table_unknown"
    label = f"Table {number}. {caption}" if number else "Table"

    # Collect footnotes.
    footnotes = {}
    # Case 1: the marker is a <sup> whose text lives in a sibling <p>.
    for sup in section.select(".fn sup") if section else []:
        sibling = sup.find_next_sibling("p")
        if sibling:
            footnotes[sup.get_text(strip=True)] = sibling.get_text(strip=True)
    # Case 2: marker and text share one <p>, e.g. "* note one # note two".
    for p in section.select(".fn p") if section else []:
        matches = re.findall(r"(?<=(\*|#))\s*(.*?)(?=\s\*|\s#|$)", p.get_text())
        for key, text in matches:
            footnotes[key] = text.strip()

    return name, caption, footnotes, headings, label, referee_id, section_url


def _drain_carryover(rowspan_tracker, row, col_index):
    """Append pending rowspan values starting at *col_index*; return the
    first column index with no pending value.

    *rowspan_tracker* maps column index -> (value, rows still to fill).
    """
    while col_index in rowspan_tracker:
        value, remaining = rowspan_tracker[col_index]
        row.append(value)
        if remaining > 1:
            rowspan_tracker[col_index] = (value, remaining - 1)
        else:
            del rowspan_tracker[col_index]
        col_index += 1
    return col_index


def get_table_data(table, footnotes):
    """Flatten *table* into a list of rows (lists of cell strings).

    Footnote markers found in cells are expanded inline from *footnotes*;
    a cell spanning multiple columns is treated as a sub-section header
    and prepended to the rows that follow it; rowspan values are repeated
    into every covered row.
    """
    table_data = []
    rowspan_tracker = {}  # col_index -> (value, rows still to fill)
    subsec = ""
    for tr in table.find_all("tr"):
        row = []
        # Cells carried over by rowspan from previous rows.
        col_index = _drain_carryover(rowspan_tracker, row, 0)
        for cell in tr.find_all(["th", "td"]):
            cell_text = cell.get_text(separator="\n", strip=True)
            cell_sups = [sup.get_text() for sup in cell.find_all("sup")]
            if cell_sups:
                # Drop the short superscript tokens, rejoin the rest.
                fragments = [t for t in cell_text.split("\n") if len(t) > 1]
                cell_text = " ".join(fragments)
                # Expand known footnote markers inline.
                for sup in cell_sups:
                    if sup in footnotes:
                        cell_text += f" ({footnotes[sup]})"
            # A wide (colspan) cell is a sub-section header, not data.
            colspan = int(cell.get("colspan", 1))
            if colspan > 1:
                subsec = cell_text
                continue
            row.append(cell_text)
            # Track rowspan for this column.
            rowspan = int(cell.get("rowspan", 1))
            if rowspan > 1:
                rowspan_tracker[col_index] = (cell_text, rowspan - 1)
            col_index += 1
            # BUGFIX: carried-over cells may sit between (or after) real
            # cells, not only at the start of the row.
            col_index = _drain_carryover(rowspan_tracker, row, col_index)
        if row:
            if subsec:
                row.insert(0, subsec)
            table_data.append(row)
    return table_data


def to_text(table_data, label, caption):
    """Render *table_data* as a labelled text block.

    The first row is treated as the header; every following row is
    rendered as "header: value" pairs.  *caption* is accepted for
    interface compatibility but unused (it is already folded into
    *label* by get_table_metadata).
    """
    lines = [f"**{label}**"]
    headers = table_data[0] if table_data else []
    for i, row in enumerate(table_data[1:], start=1):
        row_text = ", ".join(f"{h}: {v}" for h, v in zip(headers, row) if v)
        lines.append(f"{{Row {i} - {row_text}}}")
    return "[" + "\n".join(lines) + "]"


def to_chunk(text_block, section_url, referee_id, headings):
    """Wrap a rendered table text block with its metadata."""
    return {
        "text": text_block,
        "metadata": {
            "section": section_url,
            "type": "HTML table",
            "referee_id": referee_id,
            # "headings": headings,  # intentionally disabled; kept for future use
        },
    }


def tables_to_json(input_path="bipolar.html",
                   base_url="https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#"):
    """Parse every <table> in *input_path* and return a list of chunk dicts."""
    doc = []
    with open(input_path, encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, features="html.parser")
    tables = soup.find_all("table")
    print(f"Found {len(tables)} tables in document.")
    for tbl in tables:
        (name, caption, footnotes, headings,
         label, referee_id, section_url) = get_table_metadata(tbl, base_url)
        table_data = get_table_data(tbl, footnotes)
        text_block = to_text(table_data, label, caption)
        doc.append(to_chunk(text_block, section_url, referee_id, headings))
    return doc


if __name__ == "__main__":
    # Example usage (disabled):
    # doc = tables_to_json()
    # with open("tables.json", "w", encoding="utf-8") as f:
    #     json.dump(doc, f, indent=4)
    pass