from bs4 import BeautifulSoup import json import re def get_graph_metadata(graph, url="https://pmc.ncbi.nlm.nih.gov/articles/PMC11351064/#"): figure = graph.find_parent("figure") figure_flag = False section = graph.find_parent(id=re.compile(r'^section\d+-\d+$')) section_id = section.get("id") section_url = url + section_id section_heading = section.find("h2").get_text() section_subheading = section.find("h3").get_text() headings = section_heading + " > " + section_subheading attribution = "" if figure: figure_flag = True image_url = graph.get("src") name = figure.select_one(".obj_head").get_text() all_p = [p.get_text() for p in figure.find_all("p") if not p.attrs] caption = all_p[0] label = name + " " + caption attribution = "(" + figure.select_one('[aria-label="Attribution"]').get_text() + ")" number = "_".join(re.findall(r"(.{1})\.", name)).lower() referee_id = f"figure_{number}" else: image_url = graph.get("src") table_section = graph.find_parent("section") name = table_section.select_one(".obj_head").get_text() caption = table_section.select_one(".caption p").get_text() label = name + " " + caption number = "_".join(re.findall(r"(.{1})\.", name)).lower() referee_id = f"table_{number}" return attribution, caption, figure_flag, headings, image_url, label, name, referee_id, section_url def to_chunk(text_block, section_url, referee_id, headings): d = { "text": text_block, "metadata": { "section": section_url, "type": "table image", "referee_id": referee_id, "headings": headings, } } return d def main(): with open('../data/raw/source.html', encoding="utf-8") as f: html = f.read() soup = BeautifulSoup(html) with open("../data/processed/parsed_images.txt", encoding="utf-8") as f: text = f.read() text_blocks = text.split("------") docs = [] for graph, text_block in zip(soup.select(".graphic"), text_blocks): attribution, caption, figure_flag, headings, image_url, label, name, referee_id, section_url = get_graph_metadata(graph) text_block = text_block.strip() if text_block.startswith(name): chunk = to_chunk(text_block, section_url, referee_id, headings) docs.append(chunk) with open("../data/processed/graphs.json", "w", encoding="utf-8") as f: json.dump(docs, f, indent=4) if __name__ == "__main__": main()