"""Parse a guideline article (HTML) into text chunks with metadata.

``main`` reads ``bipolar.html``, extracts the title, figures, tables and
paragraphs as chunks and writes them to ``guideline_db.json``.
``write_referenced_tables`` post-processes that JSON file.
"""

import json
import re

from bs4 import BeautifulSoup, Tag, NavigableString

from tables import get_table_metadata, to_text, get_table_data

abbr_map = {
    "ACT": "Acceptance and commitment therapy",
    "ADHD": "Attention-deficit hyperactivity disorder",
    "AI": "Artificial intelligence",
    "BA": "Behavioural activation",
    "CAM": "Complementary and alternative medicine",
    # "CANMAT":"Canadian Network for Mood and Anxiety Treatments",
    "CBASP": "Cognitive behavioural analysis system of psychotherapy",
    "CBT": "Cognitive-behavioural therapy",
    "CPD": "Continuing professional development",
    "CYP": "Cytochrome P450",
    "DBS": "Deep brain stimulation",
    "DHI": "Digital health intervention",
    "DLPFC": "Dorsolateral prefrontal cortex",
    "DSM-5": "Diagnostic and Statistical Manual",
    "DSM-5-TR": "Diagnostic and Statistical Manual, 5th edition, Text Revision",
    "DSM-IV-TR": "Diagnostic and Statistical Manual, 4th edition, Text Revision",
    "DTD": "Difficult-to-treat depression",
    "ECG": "Electrocardiography",
    "ECT": "Electroconvulsive therapy",
    "EEG": "Electroencephalography",
    "GRADE": "Grading of Recommendations Assessment, Development, and Evaluation",
    "ICD": "International Classification of Diseases",
    "IPT": "Interpersonal therapy",
    "MAOI": "Monoamine oxidase inhibitor",
    "MBC": "Measurement-based care",
    "MBCT": "Mindfulness-based cognitive therapy",
    "MCT": "Metacognitive therapy",
    "MDD": "Major depressive disorder",
    "MDE": "Major depressive episode",
    "MI": "Motivational interviewing",
    "MST": "Magnetic seizure therapy",
    "NbN": "Neuroscience-based nomenclature",
    "NDRI": "Norepinephrine-dopamine reuptake inhibitor",
    "NMDA": "N-methyl-D-aspartate",
    "NSAID": "Nonsteroidal anti-inflammatory drug",
    "PDD": "Persistent depressive disorder",
    "PDT": "Psychodynamic psychotherapy",
    "PHQ": "Patient health questionnaire",
    "PST": "Problem-solving therapy",
    "RCT": "Randomized controlled trial",
    "rTMS": "Repetitive transcranial magnetic stimulation",
    "SDM": "Shared decision-making",
    "SNRI": "Serotonin-norepinephrine reuptake inhibitor",
    "SSRI": "Selective serotonin reuptake inhibitor",
    "STPP": "Short-term psychodynamic psychotherapy",
    "TBS": "Theta burst stimulation",
    "TCA": "Tricyclic antidepressants",
    "tDCS": "Transcranial direct current stimulation",
    "TMS": "Transcranial magnetic stimulation",
    "TRD": "Treatment-resistant depression",
    "VNS": "Vagus nerve stimulation",
    "WHO": "World Health Organization",
}

# Prefix of the "section" metadata URL; the element id is appended to it.
_ARTICLE_URL = "https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#"

# Exact class attribute value of the <section> elements treated as tables.
_TABLE_SECTION_CLASS = "tw xbox font-sm"

# Tag names treated as headings when building a headings trail (h2..h6).
_HEADING_NAME = re.compile(r'^h[2-6]$')

# Links whose href points at a figure or table, and the id parts inside them.
_FIG_TBL_HREF = re.compile(r'-(fig|tbl)-')
_FIG_TBL_ID = re.compile(r'(fig|tbl)-(\d+)')

# (chunk index in guideline_db.json, file holding the text to append to it).
# NOTE(review): the indices are positional and tied to one particular parse
# result - confirm them whenever the input document or parser changes.
_TABLE_TEXT_FILES = (
    (21, "table12_textual.txt"),
    (23, "table14_textual.txt"),
    (26, "table17_textual.txt"),
)


def _is_heading(tag):
    """Return True when *tag* is an h2-h6 element."""
    return bool(_HEADING_NAME.match(tag.name))


def _section_url(sec_id):
    """Return the article URL pointing at the element with id *sec_id*."""
    return f"{_ARTICLE_URL}{sec_id}"


def _make_chunk(text, section, chunk_type, headings, referenced_tables, referee_id):
    """Build one chunk dict in the layout shared by every parser here."""
    return {
        "text": text,
        "metadata": {
            "section": section,
            "type": chunk_type,
            "headings": headings,
            "referenced_tables": referenced_tables,
            "referee_id": referee_id,
        },
    }


def append_definition(guideline):
    """Append an "Abbreviations: ..." footer to every chunk that uses one.

    Each chunk's text is scanned for the keys of ``abbr_map``; the
    definitions of the ones found are appended in sorted order. Chunks whose
    ``referee_id`` is ``'table_c'`` are left untouched.

    Args:
        guideline: list of chunk dicts; modified in place.

    Returns:
        The same list, for convenience.
    """
    if not abbr_map:
        return guideline

    # Build the pattern from the actual keys: a plain [A-Z]{2,} pattern can
    # never match mixed-case or hyphenated keys such as "rTMS" or "DSM-5-TR".
    # Longest keys go first so "DSM-5-TR" wins over "DSM-5".
    alternatives = "|".join(
        re.escape(key) for key in sorted(abbr_map, key=len, reverse=True)
    )
    pattern = re.compile(rf'\b(?:{alternatives})\b')

    for chunk in guideline:
        if chunk['metadata']['referee_id'] == 'table_c':
            continue

        text = chunk['text']
        found_abbrs = set(pattern.findall(text))
        if not found_abbrs:
            continue

        definitions = [f"{abbr}: {abbr_map[abbr]}" for abbr in sorted(found_abbrs)]
        definitions_text = "Abbreviations: " + "; ".join(definitions) + "\n\n"
        chunk['text'] = text + "\n" + definitions_text

    return guideline


def parse_title(soup):
    """Return the title chunk built from the first <h1> of *soup*.

    The text is the inner HTML of the <h1> with newlines removed, or an
    empty string when the document has no <h1> (so that later string
    concatenation on the chunk text cannot fail on ``None``).
    """
    h1 = soup.find("h1")
    title = h1.decode_contents().replace('\n', '') if h1 is not None else ""
    return _make_chunk(
        text=title,
        section="title",
        chunk_type="title",
        headings="Title of the guideline document",
        referenced_tables=[],
        referee_id="",
    )


def prepend_headings_to_text(guideline):
    """Number the chunks and prefix each text with its headings trail.

    Sets ``metadata['chunk_id']`` to the chunk's position and rewrites the
    text as ``"<headings> > paragraph id: <i>\\n\\n<text>"``. Works in place
    and returns ``None``.
    """
    for i, chunk in enumerate(guideline):
        metadata = chunk['metadata']
        metadata['chunk_id'] = i
        chunk['text'] = (
            metadata['headings'] + " > paragraph id: " + str(i) + "\n\n" + chunk['text']
        )


def build_headings_trail(p):
    """Return the ``' > '``-joined headings that lead to element *p*.

    Starts with the nearest preceding h2-h6 sibling of *p* (or
    ``'No heading'``), then walks up through the enclosing sections that
    have an id and prepends the heading preceding each of them.

    Args:
        p: a BeautifulSoup tag, or ``None`` when the caller found no anchor
            element.

    Returns:
        The headings trail with newlines replaced by spaces.
    """
    if p is None:
        # Callers pass the result of ``find(...)``, which may be None.
        return 'No heading'

    heading = p.find_previous_sibling(_is_heading)
    headings = heading.get_text(strip=True) if heading else 'No heading'

    parent_sec = p.find_parent(["section", 'figure'], id=True)
    while parent_sec is not None:
        heading = parent_sec.find_previous_sibling(_is_heading)
        if heading:
            headings = heading.get_text(strip=True) + ' > ' + headings
        parent_sec = parent_sec.find_parent("section", id=True)

    return headings.strip().replace('\n', ' ')


def delete_bib_links(soup):
    """Remove every bibliography link (href ``#bdi12609-bib-...``) in place.

    Returns the same *soup* object.
    """
    for a in soup.find_all("a", href=True):
        href = a.get("href")
        if href and href.startswith("#bdi12609-bib-"):
            a.decompose()
    return soup


def delete_fig_and_tbl_sections(soup):
    """Remove all <figure> elements and table sections from *soup* in place.

    Returns the same *soup* object.
    """
    # Remove all <figure> elements.
    for fig in soup.find_all('figure'):
        fig.decompose()

    # Remove every <section> whose class is "tw xbox font-sm" (the tables).
    for section in soup.find_all('section', class_=_TABLE_SECTION_CLASS):
        section.decompose()

    return soup


def scan_links_and_tables(p):
    """Collect the figure/table ids linked from inside element *p*.

    Looks at every <a> whose href contains ``-fig-`` or ``-tbl-`` and
    extracts ids of the form ``fig-<n>`` / ``tbl-<n>``. Each hit is also
    printed to stdout as a progress trace.

    Args:
        p: a BeautifulSoup tag, or ``None``.

    Returns:
        A set of id strings (empty when *p* is ``None`` or has no such link).
    """
    referenced_tables = set()
    if p is None:
        return referenced_tables

    for link in p.find_all('a', href=_FIG_TBL_HREF):
        for kind, number in _FIG_TBL_ID.findall(link['href']):
            print(f"Found table links: {kind}-{number}")
            referenced_tables.add(f"{kind}-{number}")

    if referenced_tables:
        print("--------")
    return referenced_tables


def parse_paragraph(soup, output):
    """Append one chunk per <p> element of *soup* to *output*."""
    for p in soup.find_all('p'):
        parent = p.find_parent("section")
        # A section without an id is reported like a missing section.
        sec_id = parent.get("id", "unknown") if parent is not None else "unknown"
        output.append(_make_chunk(
            text=p.get_text(strip=False),
            section=_section_url(sec_id),
            chunk_type="paragraph",
            headings=build_headings_trail(p),
            # Sorted so the JSON output is stable between runs.
            referenced_tables=sorted(scan_links_and_tables(p)),
            referee_id="",
        ))


def parse_figures(soup, output):
    """Append one chunk per <figure> element of *soup* to *output*.

    The chunk text holds the image link and the caption; missing ids,
    captions or images are replaced by placeholders instead of raising.
    """
    for fig in soup.find_all('figure'):
        sec_id = fig.get("id", "unknown")

        figcaption = fig.find('figcaption')
        if figcaption is None:
            fig_caption = ""
        else:
            caption_p = figcaption.find('p')
            # Fall back to the whole caption when it has no <p> child.
            caption_node = caption_p if caption_p is not None else figcaption
            fig_caption = caption_node.get_text()

        img = fig.find('img')
        if img is not None:
            img_src_link = img.get('src', "No image link found")
        else:
            img_src_link = "No image link found"

        referee_id = re.search(r'(fig)-(\d+)', sec_id)
        p = fig.find('p')
        output.append(_make_chunk(
            text=f" > Figure: Image link: {img_src_link}-----\nFigure Caption: {fig_caption}\n",
            section=_section_url(sec_id),
            chunk_type="figure",
            headings=build_headings_trail(p),
            referenced_tables=sorted(scan_links_and_tables(p)),
            referee_id=referee_id.group(0) if referee_id else "fig_unknown",
        ))


def parse_tables(soup, output):
    """Append one chunk per table section of *soup* to *output*.

    A table section containing an <img> is stored as an image link plus its
    caption; any other table section is converted to text through the
    helpers imported from ``tables``.
    """
    for table in soup.find_all("section", class_=_TABLE_SECTION_CLASS):
        sec_id = table.get("id", "unknown")
        match = re.search(r'(tbl)-(\d+)', sec_id)
        referee_id = match.group(0) if match else "tbl_unknown"

        img = table.find('img')
        if img is not None:
            # Table published as an image.
            img_src_link = img.get('src', "No image link found")
            captions = table.find_all('div', class_='caption p')
            # Every caption part is followed by a single space.
            caption = "".join(cap.get_text(strip=True) + " " for cap in captions)
            text = f" > Table: Image link: {img_src_link}-----\nTable Caption: {caption}\n"
            anchor = table.find('p')
        else:
            # Regular HTML table; only caption, footnotes and label are used.
            (_name, caption, footnotes, _headings, label,
             _ref_id, _url) = get_table_metadata(table, base_url="")
            table_data = get_table_data(table, footnotes)
            text = to_text(table_data, label, caption)
            anchor = table.find('div')

        output.append(_make_chunk(
            text=text,
            section=_section_url(sec_id),
            chunk_type="table",
            headings=build_headings_trail(anchor),
            referenced_tables=[],
            referee_id=referee_id,
        ))


def parse_main_article(soup, output):
    """Parse figures, tables and paragraphs of *soup* into *output*.

    *soup* is modified: bibliography links are removed first, and the figure
    and table sections are removed once they have been parsed, so that their
    inner <p> elements are not emitted again as paragraphs.

    Returns:
        *output*, with figure chunks first, then tables, then paragraphs.
    """
    soup = delete_bib_links(soup)
    parse_figures(soup, output)
    parse_tables(soup, output)
    # Must run after the two parsers above: it destroys what they read.
    paragraph_only_soup = delete_fig_and_tbl_sections(soup)
    parse_paragraph(paragraph_only_soup, output)
    return output


def main():
    """Parse ``bipolar.html`` and write the chunks to ``guideline_db.json``."""
    with open('bipolar.html', 'r', encoding='utf-8') as f:
        html = f.read()

    soup = BeautifulSoup(html, "html.parser")
    output = [parse_title(soup)]
    output = parse_main_article(soup, output)
    print(f"Parsed {len(output)} paragraphs from the main article.")

    prepend_headings_to_text(output)
    append_definition(output)

    with open("guideline_db.json", "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=4)
    print(f"guideline_db.json for bipolar created with {len(output)} chunks.")


def write_referenced_tables():
    """Add textual table descriptions to the database and export a slice.

    Reads ``guideline_db.json``, appends the stripped content of each file
    in ``_TABLE_TEXT_FILES`` to the chunk at the paired index, writes the
    database back and writes chunks 1..33 to
    ``referenced_table_chunks.json``.

    Raises:
        IndexError: if the database has fewer chunks than the indices used.
        OSError: if one of the input files cannot be read.
    """
    with open("guideline_db.json", "r", encoding="utf-8") as f:
        guideline_db = json.load(f)

    for index, filename in _TABLE_TEXT_FILES:
        with open(filename, "r", encoding="utf-8") as f:
            extra = f.read().strip()
        chunk = guideline_db[index]
        # The database is rewritten in place, so a second run would append
        # the same text again; skip chunks that already end with it.
        if not chunk['text'].endswith(extra):
            chunk['text'] += extra

    figures_and_tables = guideline_db[1:34]  # Assuming these are the table chunks

    # Write back to the original file.
    with open("guideline_db.json", "w", encoding="utf-8") as f:
        json.dump(guideline_db, f, ensure_ascii=False, indent=4)
    print("guideline_db.json updated with table 12 14 17 chunks.")

    with open("referenced_table_chunks.json", "w", encoding="utf-8") as f:
        json.dump(figures_and_tables, f, ensure_ascii=False, indent=4)
    print(f"referenced_table_chunks.json created with {len(figures_and_tables)} chunks.")


if __name__ == "__main__":
    # main() creates guideline_db.json; it is currently disabled and only the
    # post-processing step below is run on the existing file.
    # main()
    write_referenced_tables()