|
|
from bs4 import BeautifulSoup |
|
|
import json |
|
|
import pandas as pd |
|
|
import re |
|
|
|
|
|
|
|
|
def get_table_metadata(table, base_url):
    """Collect metadata describing a <table> element in a PMC article page.

    Parameters
    ----------
    table : bs4.Tag
        The <table> element to describe.
    base_url : str
        Article URL ending in "#"; the enclosing section id is appended to
        build a deep link to the table's section.

    Returns
    -------
    tuple
        (name, caption, footnotes, headings, label, referee_id, section_url)
        where ``footnotes`` maps superscript markers to footnote text.
    """
    # Walk up to the nearest <section> ancestor that carries an id.
    section = table.find_parent('section')
    while section and not section.get('id'):
        section = section.find_parent('section')

    section_id = section.get("id") if section else None

    if section_id:
        section_url = base_url + section_id

        # The heading element is tagged with the same anchor id. Quote the
        # attribute value: unquoted CSS attribute values may not contain
        # dots or start with a digit, so ids like "sec-1.2" would raise.
        heading_el = section.select_one(f'[data-anchor-id="{section_id}"]')
        section_heading = heading_el.get_text(strip=True) if heading_el else ""

        # One level up gives the enclosing section's title, used as a
        # secondary breadcrumb component.
        parent_sec = section.find_parent('section')
        subheading_el = parent_sec.select_one(".pmc_sec_title") if parent_sec else None
        section_subheading = subheading_el.get_text(strip=True) if subheading_el else ""

        headings = " > ".join(filter(None, [section_heading, section_subheading]))
    else:
        # Table is not inside an id-bearing section; no anchor to link to.
        section_url = base_url
        headings = ""

    # Table title and caption, falling back to the <caption> element when
    # no enclosing section exists.
    name_el = section.find("h4") if section else table.find("caption")
    name = name_el.get_text(strip=True) if name_el else "Table"
    caption_el = section.select_one('.caption p') if section else table.find("caption")
    caption = caption_el.get_text(strip=True) if caption_el else ""

    # Derive the table number from a "tbl-N" style section id.
    section = table.find_parent("section", id=True)
    table_id = section["id"] if section and "tbl-" in section["id"] else None
    match = re.search(r"tbl-(\d+)", table_id or "")
    number = match.group(1).lstrip("0") if match else ""
    referee_id = f"table_{number}" if number else "table_unknown"
    label = f"Table {number}. " + caption if number else "Table"

    # Footnotes keyed by their superscript marker (e.g. "a", "*", "#").
    footnotes = {}
    for sup in section.select('.fn sup') if section else []:
        sibling = sup.find_next_sibling("p")
        if sibling:
            key = sup.get_text(strip=True)
            footnotes[key] = sibling.get_text(strip=True)

    # Also capture "*"/"#"-prefixed footnotes embedded inside a single <p>.
    for p in section.select('.fn p') if section else []:
        matches = re.findall(r"(?<=(\*|#))\s*(.*?)(?=\s\*|\s#|$)", p.get_text())
        for key, text in matches:
            footnotes[key] = text.strip()

    return name, caption, footnotes, headings, label, referee_id, section_url
|
|
|
|
|
|
|
|
def _consume_rowspans(tracker, row, col_index):
    """Append values carried down by active rowspans starting at *col_index*.

    Mutates *row* and *tracker* in place; returns the first column index not
    occupied by a carried-down value.
    """
    while col_index in tracker:
        value, remaining = tracker[col_index]
        row.append(value)
        if remaining > 1:
            tracker[col_index] = (value, remaining - 1)
        else:
            # Last row covered by this rowspan — stop tracking the column.
            del tracker[col_index]
        col_index += 1
    return col_index


def get_table_data(table, footnotes):
    """Flatten a <table> element into a list of rows of cell strings.

    Rowspan cells are repeated into the rows they span; full-width
    (colspan > 1) cells are treated as subsection headers and prepended to
    every following data row. Footnote markers found in a cell's <sup>
    elements are expanded inline using *footnotes*.

    Parameters
    ----------
    table : bs4.Tag
        The <table> element to extract.
    footnotes : dict
        Mapping of superscript marker -> footnote text (see
        get_table_metadata).

    Returns
    -------
    list[list[str]]
        One list of cell strings per non-empty table row.
    """
    table_data = []
    rowspan_tracker = {}  # col index -> (cell text, rows still spanned)
    subsec = ""  # most recent subsection header seen

    for tr in table.find_all("tr"):
        row = []
        col_index = 0

        for cell in tr.find_all(["th", "td"]):
            # FIX: drain carried rowspan values before *every* cell, not
            # only at the start of the row — otherwise a rowspan in a
            # middle column is dropped and later columns shift left.
            col_index = _consume_rowspans(rowspan_tracker, row, col_index)

            cell_text = cell.get_text(separator="\n", strip=True)
            cell_sups = [sup.get_text() for sup in cell.find_all("sup")]

            if cell_sups:
                # The "\n" separator splits superscript markers onto their
                # own lines; keep only the substantive fragments.
                lines = [t for t in cell_text.split("\n") if len(t) > 1]
                cell_text = " ".join(lines)

                # Expand each known footnote marker inline.
                for sup in cell_sups:
                    if sup in footnotes:
                        cell_text += f" ({footnotes[sup]})"

            colspan = int(cell.get("colspan", 1))
            if colspan > 1:
                # A wide cell is a subsection header, not row data.
                subsec = cell_text
                continue

            row.append(cell_text)

            rowspan = int(cell.get("rowspan", 1))
            if rowspan > 1:
                # Remember this value for the next (rowspan - 1) rows.
                rowspan_tracker[col_index] = (cell_text, rowspan - 1)

            col_index += 1

        # FIX: also drain rowspans that cover columns after the row's last
        # real cell, so trailing rowspan columns are not silently lost.
        col_index = _consume_rowspans(rowspan_tracker, row, col_index)

        if row:
            if subsec:
                row.insert(0, subsec)
            table_data.append(row)

    return table_data
|
|
|
|
|
|
|
|
def to_text(table_data, label, caption):
    """Render extracted table rows as a bracketed text block.

    The first row of *table_data* is taken as the header; every other row
    becomes "{Row i - header: value, ...}" with empty values omitted.
    *caption* is accepted for interface compatibility but is not rendered
    here (the caption is already folded into *label* upstream).
    """
    if table_data:
        header_row, body_rows = table_data[0], table_data[1:]
    else:
        header_row, body_rows = [], []

    parts = [f"**{label}**"]
    for index, cells in enumerate(body_rows, start=1):
        row_text = ", ".join(
            f"{col}: {val}" for col, val in zip(header_row, cells) if val
        )
        parts.append(f"{{Row {index} - {row_text}}}")

    return "[" + "\n".join(parts) + "]"
|
|
|
|
|
|
|
|
def to_chunk(text_block, section_url, referee_id, headings):
    """Wrap a rendered table in a retrieval-chunk dict.

    Parameters
    ----------
    text_block : str
        Rendered table text (see to_text).
    section_url : str
        Deep link to the table's section.
    referee_id : str
        Stable identifier such as "table_1".
    headings : str
        Section breadcrumb such as "Results > Demographics".

    Returns
    -------
    dict
        {"text": ..., "metadata": {...}}.

    FIX: *headings* was accepted but never stored, so the breadcrumb was
    silently lost; it is now included in the metadata (backward-compatible
    key addition).
    """
    return {
        "text": text_block,
        "metadata": {
            "section": section_url,
            "type": "HTML table",
            "referee_id": referee_id,
            "headings": headings,
        },
    }
|
|
|
|
|
|
|
|
def tables_to_json(input_path="bipolar.html", base_url="https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#"):
    """Parse every <table> in *input_path* into retrieval-chunk dicts.

    Reads the HTML file, extracts metadata, data rows and footnotes for
    each table, renders them to text, and returns the resulting list of
    chunks (see to_chunk).
    """
    with open(input_path, encoding="utf-8") as handle:
        markup = handle.read()
    soup = BeautifulSoup(markup, features="html.parser")

    tables = soup.find_all("table")
    print(f"Found {len(tables)} tables in document.")

    chunks = []
    for table in tables:
        (name, caption, footnotes, headings,
         label, referee_id, section_url) = get_table_metadata(table, base_url)
        rows = get_table_data(table, footnotes)
        rendered = to_text(rows, label, caption)
        chunks.append(to_chunk(rendered, section_url, referee_id, headings))

    return chunks
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # NOTE(review): entry point is a stub. Presumably it should call
    # tables_to_json() and serialize the result (the `json` import above
    # is otherwise unused) — confirm intended behavior before wiring up.
    pass