bipolar / src /data_processing /image_processing.py
zzejiao's picture
yash's hf bipolar demo code with github action set
3530638
from bs4 import BeautifulSoup
import json
import re
def get_graph_metadata(graph, url="https://pmc.ncbi.nlm.nih.gov/articles/PMC11351064/#"):
    """Collect caption, heading and link metadata for one graphic element.

    The graphic is treated as a figure when it has a ``<figure>`` ancestor and
    as a table image otherwise. Optional pieces of markup (section headings,
    caption paragraph, attribution) that are absent yield empty strings
    instead of raising.

    Args:
        graph: A parsed HTML element (a BeautifulSoup tag) for the image.
        url: Prefix to which the enclosing section's ``id`` is appended to
            build ``section_url``.

    Returns:
        A 9-tuple ``(attribution, caption, figure_flag, headings, image_url,
        label, name, referee_id, section_url)``.

    Raises:
        AttributeError: If ``graph`` has no ancestor whose id matches
            ``section<digits>-<digits>``, if the figure (or, for tables, the
            enclosing ``<section>``) has no ``.obj_head`` element, or if a
            table image has no ``<section>`` ancestor.
    """
    section = graph.find_parent(id=re.compile(r'^section\d+-\d+$'))
    section_id = section.get("id")
    section_url = url + section_id

    # A section is not guaranteed to contain both an <h2> and an <h3>;
    # join only the headings that are actually present.
    heading_nodes = (section.find("h2"), section.find("h3"))
    headings = " > ".join(
        node.get_text() for node in heading_nodes if node is not None
    )

    image_url = graph.get("src")
    figure = graph.find_parent("figure")
    figure_flag = figure is not None
    attribution = ""

    if figure_flag:
        kind = "figure"
        name = figure.select_one(".obj_head").get_text()
        # The caption is the first <p> that carries no attributes at all.
        plain_paragraphs = [
            p.get_text() for p in figure.find_all("p") if not p.attrs
        ]
        caption = plain_paragraphs[0] if plain_paragraphs else ""
        attribution_node = figure.select_one('[aria-label="Attribution"]')
        if attribution_node is not None:
            attribution = "(" + attribution_node.get_text() + ")"
    else:
        kind = "table"
        table_section = graph.find_parent("section")
        name = table_section.select_one(".obj_head").get_text()
        caption_node = table_section.select_one(".caption p")
        caption = caption_node.get_text() if caption_node is not None else ""

    label = name + " " + caption
    # Each run of word characters directly before a period becomes one id
    # component, so "Figure 10." gives "10". Capturing a single character
    # here would truncate multi-digit numbers and make ids collide.
    number = "_".join(re.findall(r"(\w+)\.", name)).lower()
    referee_id = f"{kind}_{number}"

    return attribution, caption, figure_flag, headings, image_url, label, name, referee_id, section_url
def to_chunk(text_block, section_url, referee_id, headings):
    """Package a text block with its location metadata as a chunk dict.

    Args:
        text_block: The text stored under the ``"text"`` key.
        section_url: Stored under ``metadata["section"]``.
        referee_id: Stored under ``metadata["referee_id"]``.
        headings: Stored under ``metadata["headings"]``.

    Returns:
        A dict with keys ``"text"`` and ``"metadata"``; the metadata's
        ``"type"`` entry is always the fixed string ``"table image"``.
    """
    metadata = {}
    metadata["section"] = section_url
    metadata["type"] = "table image"
    metadata["referee_id"] = referee_id
    metadata["headings"] = headings

    chunk = {"text": text_block}
    chunk["metadata"] = metadata
    return chunk
def main():
    """Pair each graphic in the article HTML with its parsed text and dump JSON.

    Reads ``../data/raw/source.html`` and ``../data/processed/parsed_images.txt``
    (both relative to the current working directory), splits the text file on
    the ``------`` delimiter, and walks the ``.graphic`` elements and text
    blocks in lockstep. A pair is kept only when the stripped text block
    starts with the graphic's name; the kept chunks are written to
    ``../data/processed/graphs.json``.
    """
    # NOTE(review): no parser is passed to BeautifulSoup, so the parser used
    # depends on what is installed -- confirm which one the pipeline expects.
    with open('../data/raw/source.html', encoding="utf-8") as html_file:
        soup = BeautifulSoup(html_file.read())

    with open("../data/processed/parsed_images.txt", encoding="utf-8") as text_file:
        text_blocks = text_file.read().split("------")

    docs = []
    # zip stops at the shorter sequence, so surplus graphics or blocks are ignored.
    for graph, raw_block in zip(soup.select(".graphic"), text_blocks):
        _, _, _, headings, _, _, name, referee_id, section_url = get_graph_metadata(graph)
        block = raw_block.strip()
        if not block.startswith(name):
            # The text does not belong to this graphic; drop the pair.
            continue
        docs.append(to_chunk(block, section_url, referee_id, headings))

    with open("../data/processed/graphs.json", "w", encoding="utf-8") as out_file:
        json.dump(docs, out_file, indent=4)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()