# yash's HF bipolar demo code, with GitHub Actions set up (commit 3530638)
from bs4 import BeautifulSoup
import json
import pandas as pd
import re
def get_table_metadata(table, base_url):
    """Extract naming, caption, and footnote metadata for one <table> element.

    Parameters
    ----------
    table : bs4 Tag
        The <table> element to describe.
    base_url : str
        Article URL ending in "#"; the enclosing section id is appended
        to build a deep link.

    Returns
    -------
    tuple
        (name, caption, footnotes, headings, label, referee_id, section_url)
        where ``footnotes`` maps a marker string (e.g. "*", "#") to its text.
    """
    # Find the nearest ancestor <section> that has a (non-empty) id.
    section = table.find_parent('section')
    while section and not section.get('id'):
        section = section.find_parent('section')
    section_id = section.get("id") if section else None

    if section_id:
        section_url = base_url + section_id
        # Main heading: the element anchored to this section id.
        # Quote the attribute value so ids containing CSS metacharacters
        # (dots, colons, ...) do not break the selector.
        heading_el = section.select_one(f'[data-anchor-id="{section_id}"]')
        section_heading = heading_el.get_text(strip=True) if heading_el else ""
        # Subheading comes from the enclosing parent section, if any.
        parent_sec = section.find_parent('section')
        subheading_el = parent_sec.select_one(".pmc_sec_title") if parent_sec else None
        section_subheading = subheading_el.get_text(strip=True) if subheading_el else ""
        headings = " > ".join(filter(None, [section_heading, section_subheading]))
    else:
        # Fallback if no section id is found.
        section_url = base_url
        headings = ""

    # Table name and caption (fall back to the <caption> when no section).
    name_el = section.find("h4") if section else table.find("caption")
    name = name_el.get_text(strip=True) if name_el else "Table"
    caption_el = section.select_one('.caption p') if section else table.find("caption")
    caption = caption_el.get_text(strip=True) if caption_el else ""

    # Derive referee_id from the nearest id-bearing <section> whose id
    # contains 'tbl-', e.g. id="tbl-01" -> number "1" -> "table_1".
    id_section = table.find_parent("section", id=True)
    table_id = id_section["id"] if id_section and "tbl-" in id_section["id"] else None
    match = re.search(r"tbl-(\d+)", table_id or "")
    number = match.group(1).lstrip("0") if match else ""
    referee_id = f"table_{number}" if number else "table_unknown"
    label = f"Table {number}. " + caption if number else "Table"

    # Collect footnotes (scoped to the same id-bearing section, as before).
    footnotes = {}
    # Case 1: marker <sup> is a sibling of the footnote's <p>.
    for sup in id_section.select('.fn sup') if id_section else []:
        sibling = sup.find_next_sibling("p")
        if sibling:
            key = sup.get_text(strip=True)
            footnotes[key] = sibling.get_text(strip=True)
    # Case 2: markers embedded inside the <p> text, like "* text # text".
    for p in id_section.select('.fn p') if id_section else []:
        matches = re.findall(r"(?<=(\*|#))\s*(.*?)(?=\s\*|\s#|$)", p.get_text())
        for key, text in matches:
            footnotes[key] = text.strip()

    return name, caption, footnotes, headings, label, referee_id, section_url
def get_table_data(table, footnotes):
    """Flatten a <table> element into a list of text rows.

    Handles rowspan (value repeated into following rows), colspan
    (a cell spanning multiple columns is treated as a subsection label
    that is prepended to subsequent rows), and footnote superscripts
    (looked up in *footnotes* and appended in parentheses).

    Parameters
    ----------
    table : bs4 Tag for the <table>.
    footnotes : dict mapping marker text (e.g. "*") to footnote text.

    Returns
    -------
    list[list[str]] : one list of cell strings per non-empty <tr>.
    """
    table_data = []
    rowspan_tracker = {}  # col_index -> (carried value, rows remaining)
    subsec = ""

    def _fill_carryover(row, col_index):
        # Insert values carried down from earlier rows via rowspan,
        # starting at col_index; returns the next free column index.
        while col_index in rowspan_tracker:
            value, remaining = rowspan_tracker[col_index]
            row.append(value)
            remaining -= 1
            if remaining:
                rowspan_tracker[col_index] = (value, remaining)
            else:
                del rowspan_tracker[col_index]
            col_index += 1
        return col_index

    for tr in table.find_all("tr"):
        row = []
        col_index = _fill_carryover(row, 0)
        for cell in tr.find_all(["th", "td"]):
            cell_text = cell.get_text(separator="\n", strip=True)
            cell_sups = [sup.get_text() for sup in cell.find_all("sup")]
            # Normalize text if superscripts are inside the cell.
            if cell_sups:
                # Drop short tokens (the superscript markers themselves).
                lines = [t for t in cell_text.split("\n") if len(t) > 1]
                cell_text = " ".join(lines)
                # Append the matching footnote text, if any.
                for sup in cell_sups:
                    if sup in footnotes:
                        cell_text += f" ({footnotes[sup]})"
            # A cell spanning several columns is a subsection marker,
            # not data; remember it and skip the cell.
            colspan = int(cell.get("colspan", 1))
            if colspan > 1:
                subsec = cell_text
                continue
            row.append(cell_text)
            # Track rowspan for this column.
            rowspan = int(cell.get("rowspan", 1))
            if rowspan > 1:
                rowspan_tracker[col_index] = (cell_text, rowspan - 1)
            col_index += 1
            # BUG FIX: a rowspan may start at ANY column, not only the
            # leftmost ones — re-check the tracker after each placed cell
            # so mid-row carry-overs are filled in too.
            col_index = _fill_carryover(row, col_index)
        if row:
            if subsec:
                row.insert(0, subsec)
            table_data.append(row)
    return table_data
def to_text(table_data, label, caption):
    """Render extracted table rows as a single bracketed text block.

    The first row of *table_data* is treated as the header; each
    following row is emitted as "{Row i - header: value, ...}" with
    empty values skipped. *caption* is accepted for interface
    compatibility but unused (the caption is already baked into *label*).
    """
    if table_data:
        headers, *body = table_data
    else:
        headers, body = [], []
    rendered = [f"**{label}**"]
    for idx, values in enumerate(body, start=1):
        pairs = [f"{h}: {v}" for h, v in zip(headers, values) if v]
        rendered.append("{Row %d - %s}" % (idx, ", ".join(pairs)))
    return "[" + "\n".join(rendered) + "]"
def to_chunk(text_block, section_url, referee_id, headings):
    """Wrap a rendered table text block in the chunk schema used downstream.

    *headings* is accepted for interface compatibility; it is currently
    excluded from the emitted metadata.
    """
    metadata = {
        "section": section_url,
        "type": "HTML table",
        "referee_id": referee_id,
        # "headings": headings,  # intentionally omitted for now
    }
    return {"text": text_block, "metadata": metadata}
def tables_to_json(input_path="bipolar.html", base_url="https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#"):
    """Parse every <table> in *input_path* into a list of chunk dicts.

    Each chunk pairs a table's rendered text with metadata (section URL,
    type, referee id) suitable for JSON serialization.

    Parameters
    ----------
    input_path : str — path to the saved PMC article HTML.
    base_url : str — article URL ending in "#" used to build deep links.

    Returns
    -------
    list[dict] : one chunk per <table> found in the document.
    """
    with open(input_path, encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, features="html.parser")
    tables = soup.find_all("table")
    print(f"Found {len(tables)} tables in document.")
    doc = []
    # (fixed: the previous enumerate index was never used)
    for tbl in tables:
        name, caption, footnotes, headings, label, referee_id, section_url = \
            get_table_metadata(tbl, base_url)
        table_data = get_table_data(tbl, footnotes)
        text_block = to_text(table_data, label, caption)
        doc.append(to_chunk(text_block, section_url, referee_id, headings))
    return doc
# Entry point: the pipeline is currently disabled; uncomment to parse
# bipolar.html and dump the resulting chunks to tables.json.
if __name__ == "__main__":
    # doc = tables_to_json()
    # with open("tables.json", "w", encoding="utf-8") as f:
    #     json.dump(doc, f, indent=4)
    pass