# File size: 6,035 Bytes  (extraction artifact)
# 3530638
# NOTE: a line-number gutter from the original extraction was removed here.
from bs4 import BeautifulSoup
import json
import pandas as pd
import re
def get_table_metadata(table, base_url):
    """Extract display metadata for one <table> from a PMC article page.

    Parameters
    ----------
    table : bs4.Tag
        The <table> element to describe.
    base_url : str
        Article URL ending in "#", used to build section anchor URLs.

    Returns
    -------
    tuple
        (name, caption, footnotes, headings, label, referee_id, section_url)
        where *footnotes* maps marker symbols (e.g. "*", "#") to note text.
    """
    # Find the nearest ancestor <section> that carries an id attribute.
    section = table.find_parent('section')
    while section and not section.get('id'):
        section = section.find_parent('section')
    section_id = section.get("id") if section else None

    if section_id:
        section_url = base_url + section_id
        # Main heading: the element anchored to this section. Quote the
        # attribute value so ids containing CSS metacharacters (dots,
        # colons) do not break the soupsieve selector.
        heading_el = section.select_one(f'[data-anchor-id="{section_id}"]')
        section_heading = heading_el.get_text(strip=True) if heading_el else ""
        # Subheading comes from the enclosing parent section, if any.
        parent_sec = section.find_parent('section')
        subheading_el = parent_sec.select_one(".pmc_sec_title") if parent_sec else None
        section_subheading = subheading_el.get_text(strip=True) if subheading_el else ""
        headings = " > ".join(filter(None, [section_heading, section_subheading]))
    else:
        # Fallback if no section id is found.
        section_url = base_url
        headings = ""

    # Table name and caption.
    name_el = section.find("h4") if section else table.find("caption")
    name = name_el.get_text(strip=True) if name_el else "Table"
    caption_el = section.select_one('.caption p') if section else table.find("caption")
    caption = caption_el.get_text(strip=True) if caption_el else ""

    # Derive referee_id from the section id, e.g. "tbl-1" -> "table_1".
    # Look for a parent <section> whose id contains 'tbl-'.
    section = table.find_parent("section", id=True)
    table_id = section["id"] if section and "tbl-" in section["id"] else None
    match = re.search(r"tbl-(\d+)", table_id or "")
    number = match.group(1).lstrip("0") if match else ""
    referee_id = f"table_{number}" if number else "table_unknown"
    label = f"Table {number}. " + caption if number else "Table"

    # Collect footnotes keyed by their marker symbol.
    footnotes = {}
    # Case 1: <sup> marker followed by a sibling <p> holding the note text.
    for sup in section.select('.fn sup') if section else []:
        sibling = sup.find_next_sibling("p")
        if sibling:
            key = sup.get_text(strip=True)
            footnotes[key] = sibling.get_text(strip=True)
    # Case 2: marker and text live in the same <p>, e.g. "* text # text".
    for p in section.select('.fn p') if section else []:
        matches = re.findall(r"(?<=(\*|#))\s*(.*?)(?=\s\*|\s#|$)", p.get_text())
        for key, text in matches:
            footnotes[key] = text.strip()

    return name, caption, footnotes, headings, label, referee_id, section_url
def get_table_data(table, footnotes):
    """Convert a <table> element into a list of rows of cell strings.

    Footnote markers found in a cell's <sup> tags are expanded inline
    from the *footnotes* mapping. A cell with colspan > 1 is treated as
    a subsection header and prepended to subsequent data rows; a cell
    with rowspan > 1 is repeated into every row it covers.

    Parameters
    ----------
    table : bs4.Tag
        The <table> element.
    footnotes : dict
        Marker symbol -> footnote text.

    Returns
    -------
    list[list[str]]
    """
    table_data = []
    rowspan_tracker = {}  # column index -> (carried value, rows remaining)
    subsec = ""

    def _consume_carried(row, col_index):
        # Append any values carried into consecutive columns by earlier
        # rowspans, decrementing their remaining-row counters.
        while col_index in rowspan_tracker:
            value, remaining = rowspan_tracker[col_index]
            row.append(value)
            remaining -= 1
            if remaining:
                rowspan_tracker[col_index] = (value, remaining)
            else:
                del rowspan_tracker[col_index]
            col_index += 1
        return col_index

    for tr in table.find_all("tr"):
        row = []
        col_index = _consume_carried(row, 0)
        for cell in tr.find_all(["th", "td"]):
            cell_text = cell.get_text(separator="\n", strip=True)
            cell_sups = [sup.get_text() for sup in cell.find_all("sup")]
            if cell_sups:
                # Re-join multi-line text, dropping 1-char fragments
                # (the superscript markers themselves).
                lines = [t for t in cell_text.split("\n") if len(t) > 1]
                cell_text = " ".join(lines)
                # Append footnote text for any recognized marker.
                for sup in cell_sups:
                    if sup in footnotes:
                        cell_text += f" ({footnotes[sup]})"
            # A wide cell acts as a subsection marker, not data.
            colspan = int(cell.get("colspan", 1))
            if colspan > 1:
                subsec = cell_text
                continue
            row.append(cell_text)
            # Track rowspan for this column.
            rowspan = int(cell.get("rowspan", 1))
            if rowspan > 1:
                rowspan_tracker[col_index] = (cell_text, rowspan - 1)
            col_index += 1
            # BUG FIX: carried rowspans were previously consumed only at
            # the start of a row, so a rowspan originating in an interior
            # or trailing column was dropped from the rows it covered.
            # Consume them between cells (and after the last cell) too.
            col_index = _consume_carried(row, col_index)
        if row:
            if subsec:
                row.insert(0, subsec)
            table_data.append(row)
    return table_data
def to_text(table_data, label, caption):
    """Render a parsed table as a bracketed, line-per-row text block.

    The first row of *table_data* is treated as the header; each later
    row becomes a "{Row i - header: value, ...}" line, skipping empty
    values. *caption* is accepted for interface compatibility but is
    not used here (the label already embeds it upstream).
    """
    headers = table_data[0] if table_data else []
    pieces = [f"**{label}**"]
    for idx, values in enumerate(table_data[1:], start=1):
        pairs = [f"{col}: {val}" for col, val in zip(headers, values) if val]
        pieces.append("{Row " + str(idx) + " - " + ", ".join(pairs) + "}")
    return "[" + "\n".join(pieces) + "]"
def to_chunk(text_block, section_url, referee_id, headings):
    """Wrap a rendered table in the chunk dict shape used downstream.

    *headings* is accepted for interface compatibility; it is currently
    excluded from the metadata (see the commented-out key below).
    """
    metadata = {
        "section": section_url,
        "type": "HTML table",
        "referee_id": referee_id,
        # "headings": headings,
    }
    return {"text": text_block, "metadata": metadata}
def tables_to_json(input_path="bipolar.html", base_url="https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#"):
    """Parse every <table> in *input_path* into a list of chunk dicts.

    Parameters
    ----------
    input_path : str
        Path to the saved article HTML.
    base_url : str
        Article URL ending in "#", used to build section anchor URLs.

    Returns
    -------
    list[dict]
        One chunk (text + metadata) per table, in document order.
    """
    with open(input_path, encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, features="html.parser")
    tables = soup.find_all("table")
    print(f"Found {len(tables)} tables in document.")

    doc = []
    for tbl in tables:
        (name, caption, footnotes, headings,
         label, referee_id, section_url) = get_table_metadata(tbl, base_url)
        rows = get_table_data(tbl, footnotes)
        rendered = to_text(rows, label, caption)
        doc.append(to_chunk(rendered, section_url, referee_id, headings))
    return doc
if __name__ == "__main__":
    # Extraction is disabled by default; uncomment to regenerate tables.json.
    # (Fixed: a stray '|' artifact after `pass` made this line invalid Python.)
    # doc = tables_to_json()
    # with open("tables.json", "w", encoding="utf-8") as f:
    #     json.dump(doc, f, indent=4)
    pass