File size: 6,035 Bytes
3530638
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
from bs4 import BeautifulSoup
import json
import pandas as pd
import re


def get_table_metadata(table, base_url):
    """Extract display metadata for one ``<table>`` element of a PMC article.

    Parameters
    ----------
    table : bs4.Tag
        The ``<table>`` element.
    base_url : str
        Article URL ending in ``#``; the enclosing section id is appended
        to build a direct link to the table's section.

    Returns
    -------
    tuple
        ``(name, caption, footnotes, headings, label, referee_id, section_url)``
        where ``footnotes`` maps marker symbols (e.g. ``*``, ``#``) to note text.
    """
    # Find the nearest ancestor <section> that carries an id attribute.
    section = table.find_parent('section')
    while section and not section.get('id'):
        section = section.find_parent('section')

    section_id = section.get("id") if section else None
    if section_id:
        section_url = base_url + section_id
        # Main heading: the element anchored to this section id.
        # The attribute value must be quoted — unquoted values break the
        # CSS selector when the id contains characters like '.' or ':'.
        heading_el = section.select_one(f'[data-anchor-id="{section_id}"]')
        section_heading = heading_el.get_text(strip=True) if heading_el else ""
        # Subheading comes from the enclosing (parent) section, if any.
        parent_sec = section.find_parent('section')
        subheading_el = parent_sec.select_one(".pmc_sec_title") if parent_sec else None
        section_subheading = subheading_el.get_text(strip=True) if subheading_el else ""
        headings = " > ".join(filter(None, [section_heading, section_subheading]))
    else:
        # Fallback if no section id is found.
        section_url = base_url
        headings = ""

    # Table name and caption.
    name_el = section.find("h4") if section else table.find("caption")
    name = name_el.get_text(strip=True) if name_el else "Table"
    caption_el = section.select_one('.caption p') if section else table.find("caption")
    caption = caption_el.get_text(strip=True) if caption_el else ""

    # Derive referee_id from the nearest ancestor <section> with an id
    # containing 'tbl-', e.g. id="tbl-1" -> referee_id = "table_1".
    # (Debug print statements removed here.)
    section = table.find_parent("section", id=True)
    table_id = section["id"] if section and "tbl-" in section["id"] else None
    match = re.search(r"tbl-(\d+)", table_id or "")
    number = match.group(1).lstrip("0") if match else ""
    referee_id = f"table_{number}" if number else "table_unknown"
    label = f"Table {number}. " + caption if number else "Table"

    # Collect footnotes keyed by their marker symbol.
    footnotes = {}
    # Case 1: the <sup> marker sits outside the <p> holding the note text.
    for sup in section.select('.fn sup') if section else []:
        sibling = sup.find_next_sibling("p")
        if sibling:
            key = sup.get_text(strip=True)
            footnotes[key] = sibling.get_text(strip=True)

    # Case 2: marker and text share one <p>, e.g. "* first note # second note".
    for p in section.select('.fn p') if section else []:
        matches = re.findall(r"(?<=(\*|#))\s*(.*?)(?=\s\*|\s#|$)", p.get_text())
        for key, text in matches:
            footnotes[key] = text.strip()

    return name, caption, footnotes, headings, label, referee_id, section_url


def get_table_data(table, footnotes):
    """Flatten a ``<table>`` element into a list of rows (lists of cell strings).

    Handles three HTML quirks:
      * ``rowspan`` — a spanning cell's text is repeated into the same column
        of the following rows via ``rowspan_tracker``;
      * ``colspan`` > 1 — such a cell is treated as a subsection header: its
        text is NOT emitted as a cell but prepended to every subsequent row
        until the next subsection cell appears;
      * ``<sup>`` footnote markers — the matching entry from ``footnotes``
        (marker -> note text) is appended to the cell text in parentheses.

    Parameters
    ----------
    table : bs4.Tag
        The ``<table>`` element to flatten.
    footnotes : dict
        Mapping of footnote marker (e.g. ``"*"``) to footnote text, as
        produced by ``get_table_metadata``.

    Returns
    -------
    list[list[str]]
        One list of strings per non-empty ``<tr>``.

    NOTE(review): the rowspan pre-fill loop below only walks *contiguous*
    tracked columns starting at index 0; a tracked column that follows an
    untracked one in the same row is filled late. This appears to be the
    intended simple heuristic — confirm against real PMC tables if changed.
    """
    table_data = []
    # column index -> (cell text, remaining rows the text must be repeated in)
    rowspan_tracker = {}
    subsec = ""

    for tr in table.find_all("tr"):
        row = []
        col_index = 0

        # Pre-fill cells carried over by rowspan (contiguous from column 0).
        while col_index in rowspan_tracker:
            value, remaining = rowspan_tracker[col_index]
            row.append(value)
            remaining -= 1
            if remaining:
                rowspan_tracker[col_index] = (value, remaining)
            else:
                # Span exhausted — stop carrying this column forward.
                del rowspan_tracker[col_index]
            col_index += 1

        for cell in tr.find_all(["th", "td"]):
            cell_text = cell.get_text(separator="\n", strip=True)
            cell_sups = [sup.get_text() for sup in cell.find_all("sup")]

            # Normalize text if superscripts are inside: drop 1-char lines
            # (the marker glyphs themselves) and re-join on spaces.
            if cell_sups:
                # remove short tokens
                lines = [t for t in cell_text.split("\n") if len(t) > 1]
                cell_text = " ".join(lines)

            # Append footnote text if any marker matches a known footnote.
            for sup in cell_sups:
                if sup in footnotes:
                    cell_text += f" ({footnotes[sup]})"

            # Handle colspan as a subsection marker: remember the text and
            # emit nothing for this cell (col_index intentionally not advanced).
            colspan = int(cell.get("colspan", 1))
            if colspan > 1:
                subsec = cell_text
                continue

            row.append(cell_text)

            # Track rowspan for this column so later rows repeat the value.
            rowspan = int(cell.get("rowspan", 1))
            if rowspan > 1:
                rowspan_tracker[col_index] = (cell_text, rowspan - 1)

            col_index += 1

        if row:
            # Prefix the current subsection label, if one is active.
            if subsec:
                row.insert(0, subsec)
            table_data.append(row)

    return table_data


def to_text(table_data, label, caption):
    """Render parsed table rows as one bracketed text block.

    The first row of *table_data* is treated as the header row; every
    following row is emitted as ``{Row i - header: value, ...}`` with
    empty values skipped.  *caption* is accepted for interface
    compatibility but is not used here (upstream already folds the
    caption into *label*).
    """
    headers = table_data[0] if table_data else []

    def render_row(row):
        # Pair each value with its header; drop empty values.
        return ", ".join(
            f"{header}: {value}"
            for header, value in zip(headers, row)
            if value
        )

    body = [f"**{label}**"]
    body.extend(
        f"{{Row {index} - {render_row(row)}}}"
        for index, row in enumerate(table_data[1:], start=1)
    )
    return "[" + "\n".join(body) + "]"


def to_chunk(text_block, section_url, referee_id, headings):
    """Wrap a rendered table text block in the chunk schema used downstream.

    *headings* is accepted for interface compatibility; it is currently
    not included in the emitted metadata.
    """
    metadata = {
        "section": section_url,
        "type": "HTML table",
        "referee_id": referee_id,
    }
    return {"text": text_block, "metadata": metadata}


def tables_to_json(input_path="bipolar.html", base_url="https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#"):
    """Parse every ``<table>`` in *input_path* into a list of chunk dicts.

    Parameters
    ----------
    input_path : str
        Path to the saved PMC article HTML file.
    base_url : str
        Article URL ending in ``#``; used to build per-section links.

    Returns
    -------
    list[dict]
        One chunk (``text`` + ``metadata``) per table, in document order.
    """
    # Only the read needs the file handle open; parse after it is closed.
    with open(input_path, encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, features="html.parser")
    tables = soup.find_all("table")

    print(f"Found {len(tables)} tables in document.")

    doc = []
    # Iterate tables directly — the previous enumerate index was unused.
    for tbl in tables:
        name, caption, footnotes, headings, label, referee_id, section_url = \
            get_table_metadata(tbl, base_url)
        table_data = get_table_data(tbl, footnotes)
        text_block = to_text(table_data, label, caption)
        doc.append(to_chunk(text_block, section_url, referee_id, headings))

    return doc


if __name__ == "__main__":
    # Entry point is currently disabled; uncomment the lines below to
    # regenerate tables.json from the default input HTML file.
    # doc = tables_to_json()
    # with open("tables.json", "w", encoding="utf-8") as f:
    #     json.dump(doc, f, indent=4)
    pass