Spaces:

ymali
/

bipolar

Sleeping

App Files Files Community

bipolar / src /data_processing /image_processing.py

zzejiao

yash's hf bipolar demo code with github action set

3530638 6 months ago

raw

history blame contribute delete

2.66 kB

	from bs4 import BeautifulSoup
	import json
	import re

	def _element_text(node, default=""):
	    """Return ``node.get_text()``, or *default* when *node* is ``None``."""
	    return default if node is None else node.get_text()


	def _require(node, description):
	    """Return *node*, raising ``ValueError`` naming *description* if it is ``None``."""
	    if node is None:
	        raise ValueError(f"graphic metadata: could not find {description}")
	    return node


	def _reference_number(name):
	    """Join the tokens that sit directly before each period in *name*.

	    A run of digits is kept whole, so ``"Figure 12."`` yields ``"12"``
	    rather than only its last digit; any other token is the single
	    character before the period. Tokens are joined with ``_`` and
	    lower-cased.
	    """
	    return "_".join(re.findall(r"(\d+|.)\.", name)).lower()


	def get_graph_metadata(graph, url="https://pmc.ncbi.nlm.nih.gov/articles/PMC11351064/#"):
	    """Collect caption, heading and link metadata for one graphic element.

	    The graphic is treated as a figure when it has a ``<figure>`` ancestor;
	    otherwise its name and caption are read from the nearest ``<section>``
	    ancestor and it is given a ``table_`` reference id.

	    Args:
	        graph: Parsed element for the image. It must provide ``find_parent``
	            and ``get`` (as BeautifulSoup tags do).
	        url: Prefix to which the id of the enclosing article section is
	            appended to form ``section_url``.

	    Returns:
	        A 9-tuple ``(attribution, caption, figure_flag, headings, image_url,
	        label, name, referee_id, section_url)``. ``attribution`` is ``""``
	        for tables and for figures without an attribution element;
	        ``caption`` is ``""`` when no caption paragraph is found.

	    Raises:
	        ValueError: If the graphic has no ancestor whose id looks like
	            ``section<N>-<M>``, no ``.obj_head`` name element, or (for
	            tables) no enclosing ``<section>``.
	    """
	    section = _require(
	        graph.find_parent(id=re.compile(r'^section\d+-\d+$')),
	        "an ancestor whose id matches 'section<N>-<M>'",
	    )
	    section_url = url + section.get("id")

	    # Not every section carries a sub-heading; join whichever headings exist.
	    heading_nodes = (section.find("h2"), section.find("h3"))
	    headings = " > ".join(
	        node.get_text() for node in heading_nodes if node is not None
	    )

	    image_url = graph.get("src")
	    figure = graph.find_parent("figure")
	    figure_flag = figure is not None

	    if figure_flag:
	        container, prefix = figure, "figure"
	        # The caption is the first <p> that carries no attributes at all.
	        plain_paragraphs = [
	            p.get_text() for p in figure.find_all("p") if not p.attrs
	        ]
	        caption = plain_paragraphs[0] if plain_paragraphs else ""
	        attribution_node = figure.select_one('[aria-label="Attribution"]')
	        if attribution_node is None:
	            attribution = ""
	        else:
	            attribution = "(" + attribution_node.get_text() + ")"
	    else:
	        container = _require(
	            graph.find_parent("section"), "an enclosing <section> element"
	        )
	        prefix = "table"
	        caption = _element_text(container.select_one(".caption p"))
	        attribution = ""

	    name = _require(
	        container.select_one(".obj_head"), "the '.obj_head' name element"
	    ).get_text()
	    label = name + " " + caption if caption else name
	    referee_id = f"{prefix}_{_reference_number(name)}"

	    return attribution, caption, figure_flag, headings, image_url, label, name, referee_id, section_url

	def to_chunk(text_block, section_url, referee_id, headings):
	    """Wrap one parsed text block and its metadata in a chunk dictionary.

	    Args:
	        text_block: Text stored under the ``"text"`` key.
	        section_url: Value stored under ``metadata["section"]``.
	        referee_id: Value stored under ``metadata["referee_id"]``.
	        headings: Value stored under ``metadata["headings"]``.

	    Returns:
	        A dict with keys ``"text"`` and ``"metadata"``; the metadata type is
	        always the literal ``"table image"``.
	    """
	    metadata = {}
	    metadata["section"] = section_url
	    metadata["type"] = "table image"
	    metadata["referee_id"] = referee_id
	    metadata["headings"] = headings
	    return {"text": text_block, "metadata": metadata}

	def main():
	    """Build ``graphs.json`` from the article HTML and the parsed image text.

	    Reads ``../data/raw/source.html`` and
	    ``../data/processed/parsed_images.txt`` (both relative to the current
	    working directory), pairs each ``.graphic`` element with the text block
	    at the same position (blocks are separated by ``------``), and writes
	    the resulting chunks to ``../data/processed/graphs.json``.
	    """
	    with open('../data/raw/source.html', encoding="utf-8") as f:
	        html = f.read()
	    # Name the parser explicitly: without it BeautifulSoup picks whichever
	    # parser happens to be installed (and warns), so the parse tree could
	    # differ between machines.
	    soup = BeautifulSoup(html, "html.parser")

	    with open("../data/processed/parsed_images.txt", encoding="utf-8") as f:
	        text_blocks = f.read().split("------")

	    docs = []
	    # zip pairs graphics and blocks by position and stops at the shorter one.
	    for graph, text_block in zip(soup.select(".graphic"), text_blocks):
	        _, _, _, headings, _, _, name, referee_id, section_url = get_graph_metadata(graph)
	        text_block = text_block.strip()

	        # Only blocks that begin with the graphic's own name are kept --
	        # presumably a check that the block really belongs to this graphic.
	        if text_block.startswith(name):
	            docs.append(to_chunk(text_block, section_url, referee_id, headings))

	    with open("../data/processed/graphs.json", "w", encoding="utf-8") as f:
	        json.dump(docs, f, indent=4)

	# Run the pipeline only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()