Spaces:

ymali
/

bipolar

Sleeping

File size: 11,465 Bytes
import re
import json
from tables import get_table_metadata, to_text, get_table_data
from bs4 import BeautifulSoup, Tag, NavigableString

# Abbreviation -> expansion lookup used by append_definition() to build the
# "Abbreviations:" footer of a chunk. Keys are matched case-sensitively, so
# "rTMS" and "TMS" are distinct entries.
abbr_map = {
    "ACT":   "Acceptance and commitment therapy",
    "ADHD":  "Attention-deficit hyperactivity disorder",
    "AI":    "Artificial intelligence",
    "BA":    "Behavioural activation",
    "CAM":   "Complementary and alternative medicine",
    # "CANMAT":"Canadian Network for Mood and Anxiety Treatments",
    "CBASP": "Cognitive behavioural analysis system of psychotherapy",
    "CBT":   "Cognitive-behavioural therapy",
    "CPD":   "Continuing professional development",
    "CYP":   "Cytochrome P450",
    "DBS":   "Deep brain stimulation",
    "DHI":   "Digital health intervention",
    "DLPFC": "Dorsolateral prefrontal cortex",
    "DSM-5": "Diagnostic and Statistical Manual",
    "DSM-5-TR": "Diagnostic and Statistical Manual, 5th edition, Text Revision",
    "DSM-IV-TR":"Diagnostic and Statistical Manual, 4th edition, Text Revision",
    "DTD":   "Difficult-to-treat depression",
    "ECG":   "Electrocardiography",
    "ECT":   "Electroconvulsive therapy",
    "EEG":   "Electroencephalography",
    "GRADE": "Grading of Recommendations Assessment, Development, and Evaluation",
    "ICD":   "International Classification of Diseases",
    "IPT":   "Interpersonal therapy",
    "MAOI":  "Monoamine oxidase inhibitor",
    "MBC":   "Measurement-based care",
    "MBCT":  "Mindfulness-based cognitive therapy",
    "MCT":   "Metacognitive therapy",
    "MDD":   "Major depressive disorder",
    "MDE":   "Major depressive episode",
    "MI":    "Motivational interviewing",
    "MST":   "Magnetic seizure therapy",
    "NbN":   "Neuroscience-based nomenclature",
    "NDRI":  "Norepinephrine-dopamine reuptake inhibitor",
    "NMDA":  "N-methyl-D-aspartate",
    "NSAID": "Nonsteroidal anti-inflammatory drug",
    "PDD":   "Persistent depressive disorder",
    "PDT":   "Psychodynamic psychotherapy",
    "PHQ":   "Patient health questionnaire",
    "PST":   "Problem-solving therapy",
    "RCT":   "Randomized controlled trial",
    "rTMS":  "Repetitive transcranial magnetic stimulation",
    "SDM":   "Shared decision-making",
    "SNRI":  "Serotonin-norepinephrine reuptake inhibitor",
    "SSRI":  "Selective serotonin reuptake inhibitor",
    "STPP":  "Short-term psychodynamic psychotherapy",
    "TBS":   "Theta burst stimulation",
    "TCA":   "Tricyclic antidepressants",
    "tDCS":  "Transcranial direct current stimulation",
    "TMS":   "Transcranial magnetic stimulation",
    "TRD":   "Treatment-resistant depression",
    "VNS":   "Vagus nerve stimulation",
    "WHO":   "World Health Organization",
    }


def append_definition(guideline):
    """Append an "Abbreviations: ..." footer to every chunk that uses known abbreviations.

    Each chunk is a dict with a 'text' string and a 'metadata' dict that
    carries a 'referee_id'. Chunks whose referee_id is 'table_c' are left
    untouched. For every other chunk, each key of ``abbr_map`` that occurs
    in the text as a whole token is listed (sorted) with its expansion.

    The list is modified in place and also returned.
    """
    if not abbr_map:
        # An empty alternation would match the empty string everywhere.
        return guideline

    # Build the pattern from the keys themselves: a generic "run of capitals"
    # regex cannot match mixed-case or hyphenated keys such as "rTMS", "NbN"
    # or "DSM-5-TR". Longest keys come first so "DSM-5-TR" wins over "DSM-5"
    # and "MBCT" over "MBC". The lookarounds keep whole-token semantics:
    # "SSRIs" or "xTMS" do not count as hits.
    alternatives = "|".join(
        re.escape(key) for key in sorted(abbr_map, key=len, reverse=True)
    )
    pattern = re.compile(rf'(?<!\w)(?:{alternatives})(?!\w)')

    for chunk in guideline:
        if chunk['metadata']['referee_id'] == 'table_c':
            continue

        text = chunk['text']
        found_abbrs = set(pattern.findall(text))
        if not found_abbrs:
            continue

        definitions = [f"{abbr}: {abbr_map[abbr]}" for abbr in sorted(found_abbrs)]
        definitions_text = "Abbreviations: " + "; ".join(definitions) + "\n\n"
        chunk['text'] = text + "\n" + definitions_text

    return guideline

def parse_title(soup):
    """Build the title chunk from the first <h1> of the document.

    Args:
        soup: parsed document; only ``soup.find("h1")`` is used.

    Returns:
        A chunk dict with the title as 'text' and fixed title metadata.
        'text' is the inner markup of the <h1> with newlines removed, or an
        empty string when the document has no <h1>. It is never None, so
        later string concatenation on the chunk text cannot fail.
    """
    heading = soup.find("h1")
    if heading is not None:
        # decode_contents() keeps inline markup inside the heading; only
        # newline characters are dropped.
        title = heading.decode_contents().replace('\n', '')
    else:
        title = ""
    return {
        "text": title,
        "metadata": {
            "section": "title",
            "type": "title",
            "headings": "Title of the guideline document",
            "referenced_tables": [],
            "referee_id": ""
        }
    }

def prepend_headings_to_text(guideline):
    """Number the chunks and prefix each text with its heading trail.

    For the chunk at position n, metadata['chunk_id'] is set to n and the
    text becomes "<headings> > paragraph id: n", a blank line, then the
    original text. The list is updated in place; nothing is returned.
    """
    for chunk_id, chunk in enumerate(guideline):
        meta = chunk['metadata']
        meta['chunk_id'] = chunk_id
        prefix = meta['headings'] + " > paragraph id: " + str(chunk_id) + "\n\n"
        chunk['text'] = prefix + chunk['text']


def build_headings_trail(p):
    """Return the " > "-joined trail of headings that leads to element *p*.

    The trail ends with the nearest h2-h6 that precedes *p* among its
    siblings ('No heading' if there is none). It is then extended to the
    left: starting from the closest <section>/<figure> ancestor that has an
    id, and continuing through enclosing <section> elements with an id, the
    nearest h2-h6 preceding each ancestor among its siblings is prepended.

    Args:
        p: the element to locate, or None. Callers pass the result of
            ``find(...)``, which is None when nothing matched.

    Returns:
        The trail as a single line; 'No heading' when *p* is None.
    """
    if p is None:
        return 'No heading'

    def is_heading(tag):
        # h1 is excluded on purpose: only h2..h6 count as section headings.
        return bool(re.match(r'^h[2-6]$', tag.name))

    heading = p.find_previous_sibling(is_heading)
    headings = heading.get_text(strip=True) if heading is not None else 'No heading'

    # The first hop may land on a <figure>; later hops only follow <section>s.
    parent_sec = p.find_parent(["section", 'figure'], id=True)
    while parent_sec is not None:
        heading = parent_sec.find_previous_sibling(is_heading)
        if heading is not None:
            headings = heading.get_text(strip=True) + ' > ' + headings
        parent_sec = parent_sec.find_parent("section", id=True)

    return headings.strip().replace('\n', ' ')


def delete_bib_links(soup):
    """Strip bibliography citation links from the document.

    Every <a> whose href starts with "#bdi12609-bib-" is removed from the
    tree via decompose(). The same soup object is modified and returned.
    """
    bib_prefix = "#bdi12609-bib-"
    for anchor in soup.find_all("a", href=True):
        target = anchor["href"]
        if not target.startswith(bib_prefix):
            continue
        anchor.decompose()
    return soup

def delete_fig_and_tbl_sections(soup):
    """Drop figures and table sections so only body paragraphs remain.

    Removes every <figure>, then every <section> whose class is
    "tw xbox font-sm" (the table wrappers). The soup is modified in place
    and the same object is returned.
    """
    removal_passes = (
        ('figure', {}),
        ('section', {"class_": "tw xbox font-sm"}),
    )
    # The second lookup runs only after the first pass has removed its nodes.
    for tag_name, filters in removal_passes:
        for node in soup.find_all(tag_name, **filters):
            node.decompose()
    return soup
    

def scan_links_and_tables(p):
    """Collect the figure/table ids that element *p* links to.

    Looks at every <a> inside *p* whose href contains "-fig-" or "-tbl-"
    and extracts each "fig-<digits>" / "tbl-<digits>" occurrence from it.
    Each hit is echoed to stdout, followed by a separator line when at
    least one id was found.

    Returns:
        A set of strings such as "tbl-0012" or "fig-0001".
    """
    found = set()
    for anchor in p.find_all('a', href=re.compile(r'-(fig|tbl)-')):
        for kind, number in re.findall(r'(fig|tbl)-(\d+)', anchor['href']):
            print(f"Found table links: {kind}-{number}")
            found.add(f"{kind}-{number}")

    if found:
        print("--------")
    return found

def parse_paragraph(soup, output):
    """Append one chunk per <p> element of *soup* to *output*.

    Each chunk holds the paragraph text (whitespace preserved), a link to
    the enclosing section of the PMC article, the heading trail and the
    figure/table ids the paragraph links to.

    Args:
        soup: parsed document; every <p> in it is turned into a chunk.
        output: list that receives the chunk dicts (modified in place).
    """
    for p in soup.find_all('p'):
        # Anchor on the nearest section that actually has an id: a bare
        # <section> has no fragment to link to and would raise KeyError on
        # ["id"]. build_headings_trail() uses the same id=True filter.
        parent = p.find_parent("section", id=True)
        sec_id = parent["id"] if parent is not None else "unknown"

        output.append({
            "text": p.get_text(strip=False),
            "metadata": {
                "section": f"https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#{sec_id}",
                "type": "paragraph",
                "headings": build_headings_trail(p),
                "referenced_tables": list(scan_links_and_tables(p)),
                "referee_id": "",
            }
        })

    
def parse_figures(soup, output):
    """Append one chunk per <figure> element of *soup* to *output*.

    The chunk text carries the image link and the caption. The metadata
    links to the figure's anchor in the PMC article and records its
    "fig-<digits>" id as 'referee_id' ("fig_unknown" when the figure id does
    not contain one).

    Missing pieces degrade to placeholders instead of raising: no id ->
    "unknown", no <img> or no src -> "No image link found", no <figcaption>
    -> empty caption.

    Args:
        soup: parsed document.
        output: list that receives the chunk dicts (modified in place).
    """
    no_image = "No image link found"
    for fig in soup.find_all('figure'):
        sec_id = fig.get("id", "unknown")

        figcaption = fig.find('figcaption')
        if figcaption is None:
            fig_caption = ""
        else:
            # Prefer the caption's first paragraph; fall back to the whole
            # caption when it is not wrapped in a <p>.
            caption_p = figcaption.find('p')
            caption_node = caption_p if caption_p is not None else figcaption
            fig_caption = caption_node.get_text()

        img = fig.find('img')
        img_src_link = img.get('src', no_image) if img is not None else no_image
        referee_id = re.search(r'(fig)-(\d+)', sec_id)

        # Headings and links are resolved relative to the figure's first
        # <p>; a figure without any <p> is used as its own anchor.
        first_p = fig.find('p')
        anchor = first_p if first_p is not None else fig
        output.append({
            "text": f" > Figure: Image link: {img_src_link}-----\nFigure Caption: {fig_caption}\n",
            "metadata": {
                "section": f"https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#{sec_id}",
                "type": "figure",
                "headings": build_headings_trail(anchor),
                "referenced_tables": list(scan_links_and_tables(anchor)),
                "referee_id": referee_id.group(0) if referee_id else "fig_unknown",
            }
        })
        
def parse_tables(soup, output):
    """Append one chunk per table section of *soup* to *output*.

    Table sections are <section> elements with class "tw xbox font-sm".
    Two layouts are handled:

    * image tables (the section contains an <img>): the text is the image
      link plus the concatenated text of all "caption p" <div>s;
    * HTML tables: the text is produced by to_text() from the data returned
      by get_table_data(), using the caption, footnotes and label reported
      by get_table_metadata().

    The metadata links to the section's anchor in the PMC article and
    records its "tbl-<digits>" id as 'referee_id' ("tbl_unknown" when the
    section id does not contain one).

    Args:
        soup: parsed document.
        output: list that receives the chunk dicts (modified in place).
    """
    for table in soup.find_all("section", class_="tw xbox font-sm"):
        sec_id = table.get("id", "unknown")
        referee_id = re.search(r'(tbl)-(\d+)', sec_id)

        img = table.find('img')
        if img is not None:  # table published as an image
            img_src_link = img.get('src', "No image link found")
            # Every caption part is followed by a single space, including
            # the last one.
            caption = "".join(
                cap.get_text(strip=True) + " "
                for cap in table.find_all('div', class_='caption p')
            )
            text = f" > Table: Image link: {img_src_link}-----\nTable Caption: {caption}\n"
            anchor = table.find('p')
        else:  # regular HTML table
            # Only caption, footnotes and label of the returned tuple are needed.
            _name, caption, footnotes, _headings, label, _ref_id, _section_url = \
                get_table_metadata(table, base_url="")
            table_data = get_table_data(table, footnotes)
            text = to_text(table_data, label, caption)
            anchor = table.find('div')

        if anchor is None:
            # No inner <p>/<div> to anchor on: resolve headings from the
            # section itself rather than passing None on.
            anchor = table

        output.append({
            "text": text,
            "metadata": {
                "section": f"https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#{sec_id}",
                "type": "table",
                "headings": build_headings_trail(anchor),
                "referenced_tables": [],
                "referee_id": referee_id.group(0) if referee_id else "tbl_unknown",
            }
        })
        
        

def parse_main_article(soup, output):
    """Extract figure, table and paragraph chunks from the article.

    Bibliography links are removed first. Figures and tables are parsed
    before paragraphs because delete_fig_and_tbl_sections() removes them
    from the same soup, leaving only body paragraphs for the final pass.

    Args:
        soup: parsed document (modified in place by the delete_* helpers).
        output: list that receives the chunks, in the order figures,
            tables, paragraphs.

    Returns:
        The same *output* list.
    """
    cleaned = delete_bib_links(soup)

    for extract in (parse_figures, parse_tables):
        extract(cleaned, output)

    parse_paragraph(delete_fig_and_tbl_sections(cleaned), output)
    return output


def main():
    """Convert bipolar.html into guideline_db.json.

    Reads the article HTML, builds the title, figure, table and paragraph
    chunks, prefixes each text with its heading trail and chunk id, appends
    the abbreviation footers and writes the result as indented JSON.
    """
    with open('bipolar.html', 'r', encoding='utf-8') as source:
        html = source.read()
    soup = BeautifulSoup(html, "html.parser")

    # The title chunk always comes first; the article chunks follow it.
    chunks = [parse_title(soup)]
    chunks = parse_main_article(soup, chunks)
    print(f"Parsed {len(chunks)} paragraphs from the main article.")

    prepend_headings_to_text(chunks)
    append_definition(chunks)

    with open("guideline_db.json", "w", encoding="utf-8") as sink:
        json.dump(chunks, sink, ensure_ascii=False, indent=4)
        print(f"guideline_db.json for bipolar created with {len(chunks)} chunks.")


def write_referenced_tables():
    """Patch three table chunks with hand-written text and export the table chunks.

    Steps:
      1. load guideline_db.json;
      2. append the stripped contents of table12/14/17_textual.txt to the
         text of chunks 21, 23 and 26 respectively;
      3. write the patched database back to guideline_db.json;
      4. write chunks 1..33 to referenced_table_chunks.json.

    NOTE(review): the chunk indices are positional and tied to one specific
    guideline_db.json. Chunks 1..33 are assumed to be the figure/table
    chunks (they follow the title chunk in main()'s output); confirm
    whenever the source HTML changes.
    NOTE: the text is appended to whatever is stored, so running this twice
    on the same guideline_db.json appends the table text twice.
    """
    # chunk index in guideline_db.json -> file with the textual rendering
    table_text_files = {
        21: "table12_textual.txt",
        23: "table14_textual.txt",
        26: "table17_textual.txt",
    }

    # Read and close the database before it is reopened for writing below.
    with open("guideline_db.json", "r", encoding="utf-8") as db_file:
        guideline_db = json.load(db_file)

    for chunk_index, text_path in table_text_files.items():
        with open(text_path, "r", encoding="utf-8") as text_file:
            guideline_db[chunk_index]['text'] += text_file.read().strip()

    figures_and_tables = guideline_db[1:34]

    with open("guideline_db.json", "w", encoding="utf-8") as db_file:
        json.dump(guideline_db, db_file, ensure_ascii=False, indent=4)
    print("guideline_db.json updated with table 12 14 17 chunks.")

    with open("referenced_table_chunks.json", "w", encoding="utf-8") as chunks_file:
        json.dump(figures_and_tables, chunks_file, ensure_ascii=False, indent=4)
    print(f"referenced_table_chunks.json created with {len(figures_and_tables)} chunks.")
            

if __name__ == "__main__":
    # main() rebuilds guideline_db.json from bipolar.html. It is disabled
    # here, so running this script only performs the post-processing step
    # below, which reads an existing guideline_db.json.
    # main()

    # Append the textual table renderings to guideline_db.json and write
    # referenced_table_chunks.json.
    write_referenced_tables()