"""Extract HTML tables from a PMC article page into JSON-ready text chunks.

Each <table> in the document is paired with metadata pulled from its
enclosing <section> (anchor URL, headings, caption, footnotes) and
flattened into a text block suitable for downstream chunking/indexing.
"""

import json
import re

from bs4 import BeautifulSoup
import pandas as pd  # noqa: F401 -- currently unused; kept for compatibility


def get_table_metadata(table, base_url):
    """Collect metadata for *table* from its surrounding markup.

    Returns a 7-tuple:
    ``(name, caption, footnotes, headings, label, referee_id, section_url)``.
    """
    # Find the nearest ancestor <section> that has an id.
    section = table.find_parent("section")
    while section and not section.get("id"):
        section = section.find_parent("section")
    section_id = section.get("id") if section else None

    if section_id:
        section_url = base_url + section_id
        # Main heading: the element anchored to this section id.  The
        # attribute value is quoted so ids containing CSS metacharacters
        # (e.g. '.') do not break the selector.
        heading_el = section.select_one(f'[data-anchor-id="{section_id}"]')
        section_heading = heading_el.get_text(strip=True) if heading_el else ""
        # Subheading: the title of the parent section, if any.
        parent_sec = section.find_parent("section")
        subheading_el = parent_sec.select_one(".pmc_sec_title") if parent_sec else None
        section_subheading = subheading_el.get_text(strip=True) if subheading_el else ""
        headings = " > ".join(filter(None, [section_heading, section_subheading]))
    else:
        # Fallback if no section id is found.
        section_url = base_url
        headings = ""

    # Table name and caption.
    name_el = section.find("h4") if section else table.find("caption")
    name = name_el.get_text(strip=True) if name_el else "Table"
    caption_el = section.select_one(".caption p") if section else table.find("caption")
    caption = caption_el.get_text(strip=True) if caption_el else ""

    # Generate a referee_id from the enclosing section id containing
    # 'tbl-', e.g. "tbl-1" -> number "1" -> referee_id "table_1".
    section = table.find_parent("section", id=True)
    table_id = section["id"] if section and "tbl-" in section["id"] else None
    match = re.search(r"tbl-(\d+)", table_id or "")
    number = match.group(1).lstrip("0") if match else ""
    referee_id = f"table_{number}" if number else "table_unknown"
    label = f"Table {number}. {caption}" if number else "Table"

    # Collect footnotes.
    footnotes = {}
    # Case 1: the marker is a <sup> whose text lives in a sibling <p>.
    for sup in section.select(".fn sup") if section else []:
        sibling = sup.find_next_sibling("p")
        if sibling:
            footnotes[sup.get_text(strip=True)] = sibling.get_text(strip=True)
    # Case 2: marker and text share one <p>, e.g. "* note one # note two".
    for p in section.select(".fn p") if section else []:
        matches = re.findall(r"(?<=(\*|#))\s*(.*?)(?=\s\*|\s#|$)", p.get_text())
        for key, text in matches:
            footnotes[key] = text.strip()

    return name, caption, footnotes, headings, label, referee_id, section_url


def _drain_carryover(rowspan_tracker, row, col_index):
    """Append pending rowspan values starting at *col_index*; return the
    first column index with no pending value.

    *rowspan_tracker* maps column index -> (value, rows still to fill).
    """
    while col_index in rowspan_tracker:
        value, remaining = rowspan_tracker[col_index]
        row.append(value)
        if remaining > 1:
            rowspan_tracker[col_index] = (value, remaining - 1)
        else:
            del rowspan_tracker[col_index]
        col_index += 1
    return col_index


def get_table_data(table, footnotes):
    """Flatten *table* into a list of rows (lists of cell strings).

    Footnote markers found in cells are expanded inline from *footnotes*;
    a cell spanning multiple columns is treated as a sub-section header
    and prepended to the rows that follow it; rowspan values are repeated
    into every covered row.
    """
    table_data = []
    rowspan_tracker = {}  # col_index -> (value, rows still to fill)
    subsec = ""
    for tr in table.find_all("tr"):
        row = []
        # Cells carried over by rowspan from previous rows.
        col_index = _drain_carryover(rowspan_tracker, row, 0)
        for cell in tr.find_all(["th", "td"]):
            cell_text = cell.get_text(separator="\n", strip=True)
            cell_sups = [sup.get_text() for sup in cell.find_all("sup")]
            if cell_sups:
                # Drop the short superscript tokens, rejoin the rest.
                fragments = [t for t in cell_text.split("\n") if len(t) > 1]
                cell_text = " ".join(fragments)
                # Expand known footnote markers inline.
                for sup in cell_sups:
                    if sup in footnotes:
                        cell_text += f" ({footnotes[sup]})"
            # A wide (colspan) cell is a sub-section header, not data.
            colspan = int(cell.get("colspan", 1))
            if colspan > 1:
                subsec = cell_text
                continue
            row.append(cell_text)
            # Track rowspan for this column.
            rowspan = int(cell.get("rowspan", 1))
            if rowspan > 1:
                rowspan_tracker[col_index] = (cell_text, rowspan - 1)
            col_index += 1
            # BUGFIX: carried-over cells may sit between (or after) real
            # cells, not only at the start of the row.
            col_index = _drain_carryover(rowspan_tracker, row, col_index)
        if row:
            if subsec:
                row.insert(0, subsec)
            table_data.append(row)
    return table_data


def to_text(table_data, label, caption):
    """Render *table_data* as a labelled text block.

    The first row is treated as the header; every following row is
    rendered as "header: value" pairs.  *caption* is accepted for
    interface compatibility but unused (it is already folded into
    *label* by get_table_metadata).
    """
    lines = [f"**{label}**"]
    headers = table_data[0] if table_data else []
    for i, row in enumerate(table_data[1:], start=1):
        row_text = ", ".join(f"{h}: {v}" for h, v in zip(headers, row) if v)
        lines.append(f"{{Row {i} - {row_text}}}")
    return "[" + "\n".join(lines) + "]"


def to_chunk(text_block, section_url, referee_id, headings):
    """Wrap a rendered table text block with its metadata."""
    return {
        "text": text_block,
        "metadata": {
            "section": section_url,
            "type": "HTML table",
            "referee_id": referee_id,
            # "headings": headings,  # intentionally disabled; kept for future use
        },
    }


def tables_to_json(input_path="bipolar.html",
                   base_url="https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#"):
    """Parse every <table> in *input_path* and return a list of chunk dicts."""
    doc = []
    with open(input_path, encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, features="html.parser")
    tables = soup.find_all("table")
    print(f"Found {len(tables)} tables in document.")
    for tbl in tables:
        (name, caption, footnotes, headings,
         label, referee_id, section_url) = get_table_metadata(tbl, base_url)
        table_data = get_table_data(tbl, footnotes)
        text_block = to_text(table_data, label, caption)
        doc.append(to_chunk(text_block, section_url, referee_id, headings))
    return doc


if __name__ == "__main__":
    # Example usage (disabled):
    # doc = tables_to_json()
    # with open("tables.json", "w", encoding="utf-8") as f:
    #     json.dump(doc, f, indent=4)
    pass