Spaces:

ymali
/

bipolar

Sleeping

File size: 2,655 Bytes
from bs4 import BeautifulSoup
import json
import re

def _referee_number(name):
    """Join every dot-terminated token of *name* with ``_``, lower-cased.

    "Figure 1." gives "1", "Table 12." gives "12" and "Figure 1.A." gives
    "1_a".
    """
    # Capture the whole token in front of each dot. Capturing a single
    # character would truncate "12." to "2" and make ids collide.
    return "_".join(re.findall(r"([^\s.]+)\.", name)).lower()


def get_graph_metadata(graph, url="https://pmc.ncbi.nlm.nih.gov/articles/PMC11351064/#"):
    """Collect the metadata describing one graphic element of the article.

    Args:
        graph: Parsed element of the graphic. It must support
            ``find_parent`` and ``get`` (a BeautifulSoup tag does).
        url: Prefix the enclosing section's ``id`` is appended to in order
            to build ``section_url``.

    Returns:
        A 9-tuple, in this order:
        ``(attribution, caption, figure_flag, headings, image_url, label,
        name, referee_id, section_url)``.

        * ``attribution`` is the attribution text wrapped in parentheses,
          or ``""`` when the graphic is not inside a ``<figure>`` or the
          figure has no attribution element.
        * ``caption`` is ``""`` when no caption paragraph is found.
        * ``figure_flag`` is ``True`` when the graphic sits inside a
          ``<figure>``; otherwise the graphic is handled as a table image.
        * ``headings`` is the section's ``<h2>`` and ``<h3>`` texts joined
          with ``" > "``; a heading that is absent is left out.
        * ``label`` is ``name + " " + caption``.
        * ``referee_id`` is ``figure_<n>`` or ``table_<n>``.

    Raises:
        AttributeError: If the graphic has no ancestor whose id matches
            ``section<N>-<M>``, or if the element holding its name
            (``.obj_head``) cannot be found.
    """
    section = graph.find_parent(id=re.compile(r'^section\d+-\d+$'))
    section_url = url + section.get("id")

    # NOTE(review): find() returns the first <h3> of the section, which is
    # not necessarily the one the graphic sits under when a section has
    # several sub-headings. Confirm against the source document.
    heading_nodes = (section.find("h2"), section.find("h3"))
    headings = " > ".join(
        node.get_text() for node in heading_nodes if node is not None
    )

    image_url = graph.get("src")
    figure = graph.find_parent("figure")
    figure_flag = figure is not None
    attribution = ""

    if figure_flag:
        name = figure.select_one(".obj_head").get_text()
        # The caption is the first attribute-less paragraph of the figure.
        caption = next(
            (p.get_text() for p in figure.find_all("p") if not p.attrs),
            "",
        )
        attribution_node = figure.select_one('[aria-label="Attribution"]')
        if attribution_node is not None:
            attribution = "(" + attribution_node.get_text() + ")"
        referee_id = f"figure_{_referee_number(name)}"
    else:
        container = graph.find_parent("section")
        name = container.select_one(".obj_head").get_text()
        caption_node = container.select_one(".caption p")
        caption = "" if caption_node is None else caption_node.get_text()
        referee_id = f"table_{_referee_number(name)}"

    label = name + " " + caption

    return attribution, caption, figure_flag, headings, image_url, label, name, referee_id, section_url

def to_chunk(text_block, section_url, referee_id, headings):
    """Wrap one parsed text block and its location data in a chunk record.

    Returns a dict with the text under ``"text"`` and, under ``"metadata"``,
    the section URL, the fixed type ``"table image"``, the referee id and
    the heading path.
    """
    metadata = {
        "section": section_url,
        "type": "table image",
        "referee_id": referee_id,
        "headings": headings,
    }
    return {"text": text_block, "metadata": metadata}

def main():
    """Pair each ``.graphic`` element of the article with its parsed text and save the chunks.

    Reads ``../data/raw/source.html`` and
    ``../data/processed/parsed_images.txt`` (text blocks separated by
    ``------``), and writes the resulting list of chunks to
    ``../data/processed/graphs.json``. All paths are relative to the
    current working directory.
    """
    with open('../data/raw/source.html', encoding="utf-8") as f:
        html = f.read()
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the resulting tree can
    # differ from one environment to the next.
    soup = BeautifulSoup(html, "html.parser")

    with open("../data/processed/parsed_images.txt",  encoding="utf-8") as f:
        text_blocks = f.read().split("------")

    docs = []
    # Graphics and text blocks are matched purely by position; zip() stops
    # at the shorter of the two sequences.
    for graph, text_block in zip(soup.select(".graphic"), text_blocks):
        _, _, _, headings, _, _, name, referee_id, section_url = get_graph_metadata(graph)
        text_block = text_block.strip()

        # Keep the pair only when the parsed text begins with the graphic's
        # own name; any other block is skipped.
        if text_block.startswith(name):
            docs.append(to_chunk(text_block, section_url, referee_id, headings))

    with open("../data/processed/graphs.json", "w", encoding="utf-8") as f:
        json.dump(docs, f, indent=4)

# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()