File size: 2,655 Bytes
3530638 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
from bs4 import BeautifulSoup
import json
import re
def _node_text(node):
    """Return the text of a parsed element, or "" when the element is missing."""
    return node.get_text() if node is not None else ""


def _label_number(name):
    """Build the number part of a referee id from a label such as "Table 10."."""
    # Collect whatever directly precedes each dot. A run of digits is kept whole
    # so "Figure 10." yields "10" instead of only its last digit; any other
    # label still contributes the single character before the dot.
    return "_".join(re.findall(r"(\d+|.)\.", name)).lower()


def get_graph_metadata(
    graph, url="https://pmc.ncbi.nlm.nih.gov/articles/PMC11351064/#"
):
    """Collect descriptive metadata for one graphic element of the parsed page.

    The graphic is treated as a figure when it has a ``<figure>`` ancestor;
    otherwise its nearest ``<section>`` ancestor is used as the container
    (the table case, judging by the ``table_`` id prefix).

    Args:
        graph: Parsed element for the graphic (a BeautifulSoup tag in ``main``).
        url: Prefix to which the enclosing section's id is appended to form
            the section URL.

    Returns:
        A 9-tuple ``(attribution, caption, figure_flag, headings, image_url,
        label, name, referee_id, section_url)`` where:

        * ``attribution`` is the parenthesised attribution text of a figure,
          or ``""`` when there is none (always ``""`` for non-figures).
        * ``caption`` is the first attribute-less ``<p>`` of a figure, or the
          ``.caption p`` text otherwise; ``""`` when that element is missing.
        * ``figure_flag`` is ``True`` when a ``<figure>`` ancestor exists.
        * ``headings`` is the section's ``h2`` and ``h3`` text joined with
          ``" > "``; a missing heading is simply left out.
        * ``image_url`` is the graphic's ``src`` attribute.
        * ``label`` is ``name + " " + caption``.
        * ``name`` is the text of the container's ``.obj_head`` element.
        * ``referee_id`` is ``"figure_<n>"`` or ``"table_<n>"``.
        * ``section_url`` is ``url`` followed by the section id.

    Raises:
        AttributeError: If a required ancestor or the ``.obj_head`` element is
            not found (the lookup yields ``None``), as in the original code.
    """
    figure = graph.find_parent("figure")
    figure_flag = figure is not None

    # Nearest ancestor whose id looks like "section<digits>-<digits>".
    section = graph.find_parent(id=re.compile(r'^section\d+-\d+$'))
    section_url = url + section.get("id")

    # A section may lack one of the heading levels; join only those present.
    heading_nodes = (section.find("h2"), section.find("h3"))
    headings = " > ".join(
        node.get_text() for node in heading_nodes if node is not None
    )

    image_url = graph.get("src")

    if figure_flag:
        container = figure
        prefix = "figure"
        # The caption is the first paragraph that carries no attributes.
        caption = next(
            (p.get_text() for p in figure.find_all("p") if not p.attrs), ""
        )
        credit = figure.select_one('[aria-label="Attribution"]')
        attribution = "(" + credit.get_text() + ")" if credit is not None else ""
    else:
        container = graph.find_parent("section")
        prefix = "table"
        caption = _node_text(container.select_one(".caption p"))
        attribution = ""

    name = container.select_one(".obj_head").get_text()
    label = name + " " + caption
    referee_id = f"{prefix}_{_label_number(name)}"

    return (
        attribution,
        caption,
        figure_flag,
        headings,
        image_url,
        label,
        name,
        referee_id,
        section_url,
    )
def to_chunk(text_block, section_url, referee_id, headings):
    """Wrap a text block and its location details into a chunk dictionary.

    Args:
        text_block: Text stored under the ``"text"`` key.
        section_url: Value stored under ``metadata["section"]``.
        referee_id: Value stored under ``metadata["referee_id"]``.
        headings: Value stored under ``metadata["headings"]``.

    Returns:
        A dict with the keys ``"text"`` and ``"metadata"``; the metadata
        ``"type"`` is always the fixed string ``"table image"``.
    """
    metadata = {
        "section": section_url,
        "type": "table image",
        "referee_id": referee_id,
        "headings": headings,
    }
    return {"text": text_block, "metadata": metadata}
def main():
    """Pair page graphics with parsed text blocks and write them out as JSON.

    Reads ``../data/raw/source.html`` and
    ``../data/processed/parsed_images.txt`` (blocks separated by ``------``),
    walks the ``.graphic`` elements and the text blocks in lockstep, and keeps
    a chunk for every pair whose stripped text starts with the graphic's name.
    The kept chunks are written to ``../data/processed/graphs.json``.

    All paths are relative to the current working directory.
    """
    with open('../data/raw/source.html', encoding="utf-8") as f:
        html = f.read()
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the resulting tree could
    # differ from one environment to another.
    soup = BeautifulSoup(html, "html.parser")

    with open("../data/processed/parsed_images.txt", encoding="utf-8") as f:
        text_blocks = f.read().split("------")

    docs = []
    # zip() stops at the shorter sequence, so surplus graphics or text blocks
    # are ignored.
    for graph, text_block in zip(soup.select(".graphic"), text_blocks):
        (
            _attribution,
            _caption,
            _figure_flag,
            headings,
            _image_url,
            _label,
            name,
            referee_id,
            section_url,
        ) = get_graph_metadata(graph)
        text_block = text_block.strip()
        # Only keep blocks that begin with the graphic's own name; anything
        # else is skipped.
        if text_block.startswith(name):
            docs.append(to_chunk(text_block, section_url, referee_id, headings))

    with open("../data/processed/graphs.json", "w", encoding="utf-8") as f:
        json.dump(docs, f, indent=4)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()