|
|
import re |
|
|
import json |
|
|
from tables import get_table_metadata, to_text, get_table_data |
|
|
from bs4 import BeautifulSoup, Tag, NavigableString |
|
|
|
|
|
# Abbreviations that may occur in the guideline text, mapped to their
# expansions. Keys are matched case-sensitively by append_definition().
abbr_map = {
    "ACT": "Acceptance and commitment therapy",
    "ADHD": "Attention-deficit hyperactivity disorder",
    "AI": "Artificial intelligence",
    "BA": "Behavioural activation",
    "CAM": "Complementary and alternative medicine",

    "CBASP": "Cognitive behavioural analysis system of psychotherapy",
    "CBT": "Cognitive-behavioural therapy",
    "CPD": "Continuing professional development",
    "CYP": "Cytochrome P450",
    "DBS": "Deep brain stimulation",
    "DHI": "Digital health intervention",
    "DLPFC": "Dorsolateral prefrontal cortex",
    "DSM-5": "Diagnostic and Statistical Manual",
    "DSM-5-TR": "Diagnostic and Statistical Manual, 5th edition, Text Revision",
    "DSM-IV-TR": "Diagnostic and Statistical Manual, 4th edition, Text Revision",
    "DTD": "Difficult-to-treat depression",
    "ECG": "Electrocardiography",
    "ECT": "Electroconvulsive therapy",
    "EEG": "Electroencephalography",
    "GRADE": "Grading of Recommendations Assessment, Development, and Evaluation",
    "ICD": "International Classification of Diseases",
    "IPT": "Interpersonal therapy",
    "MAOI": "Monoamine oxidase inhibitor",
    "MBC": "Measurement-based care",
    "MBCT": "Mindfulness-based cognitive therapy",
    "MCT": "Metacognitive therapy",
    "MDD": "Major depressive disorder",
    "MDE": "Major depressive episode",
    "MI": "Motivational interviewing",
    "MST": "Magnetic seizure therapy",
    "NbN": "Neuroscience-based nomenclature",
    "NDRI": "Norepinephrine-dopamine reuptake inhibitor",
    "NMDA": "N-methyl-D-aspartate",
    "NSAID": "Nonsteroidal anti-inflammatory drug",
    "PDD": "Persistent depressive disorder",
    "PDT": "Psychodynamic psychotherapy",
    "PHQ": "Patient health questionnaire",
    "PST": "Problem-solving therapy",
    "RCT": "Randomized controlled trial",
    "rTMS": "Repetitive transcranial magnetic stimulation",
    "SDM": "Shared decision-making",
    "SNRI": "Serotonin-norepinephrine reuptake inhibitor",
    "SSRI": "Selective serotonin reuptake inhibitor",
    "STPP": "Short-term psychodynamic psychotherapy",
    "TBS": "Theta burst stimulation",
    "TCA": "Tricyclic antidepressants",
    "tDCS": "Transcranial direct current stimulation",
    "TMS": "Transcranial magnetic stimulation",
    "TRD": "Treatment-resistant depression",
    "VNS": "Vagus nerve stimulation",
    "WHO": "World Health Organization",
}


def _build_abbr_pattern():
    """Compile a regex matching any key of ``abbr_map`` as a whole token.

    The pattern is derived from the keys themselves so that mixed-case
    ("rTMS", "NbN") and hyphenated ("DSM-5-TR") abbreviations are found; a
    plain ``[A-Z]{2,}`` scan can never produce those tokens.
    """
    # Longest keys first: regex alternation takes the first alternative that
    # matches, so "DSM-5-TR" must be tried before "DSM-5".
    keys = sorted(abbr_map, key=len, reverse=True)
    alternatives = "|".join(re.escape(key) for key in keys)
    # The look-arounds require a non-word character (or string edge) on both
    # sides, so "TMS" is not reported inside "rTMS" nor "CYP" inside "CYP2D6".
    return re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)")


def append_definition(guideline):
    """Append an "Abbreviations: ..." footer to each chunk that uses any.

    Args:
        guideline: list of chunk dicts; each must provide ``['text']`` (str)
            and ``['metadata']['referee_id']``. The dicts are modified in place.

    Returns:
        The same ``guideline`` list that was passed in.
    """
    if not abbr_map:
        # An empty alternation would match the empty string everywhere.
        return guideline

    pattern = _build_abbr_pattern()

    for chunk in guideline:
        # Chunks tagged 'table_c' are left untouched.
        # NOTE(review): presumably the chunk that already lists the
        # abbreviations -- confirm against the producer of that id.
        if chunk['metadata']['referee_id'] == 'table_c':
            continue

        text = chunk['text']
        found_abbrs = set(pattern.findall(text))
        if not found_abbrs:
            continue

        definitions = "; ".join(
            f"{abbr}: {abbr_map[abbr]}" for abbr in sorted(found_abbrs)
        )
        chunk['text'] = text + "\n" + "Abbreviations: " + definitions + "\n\n"

    return guideline
|
|
|
|
|
def parse_title(soup):
    """Build the title chunk from the first ``<h1>`` element of the document.

    Args:
        soup: parsed document exposing ``find("h1")``.

    Returns:
        A chunk dict with the title's inner markup (newlines removed) as
        ``text`` and fixed title metadata. ``text`` is an empty string when the
        document has no ``<h1>``.
    """
    h1 = soup.find("h1")
    # find() yields None when there is no <h1>. Keep ``text`` a str in that
    # case: prepend_headings_to_text() concatenates onto it and would raise
    # TypeError on None.
    title = h1.decode_contents().replace('\n', '') if h1 is not None else ""
    return {
        "text": title,
        "metadata": {
            "section": "title",
            "type": "title",
            "headings": "Title of the guideline document",
            "referenced_tables": [],
            "referee_id": ""
        }
    }
|
|
|
|
|
def prepend_headings_to_text(guideline):
    """Number every chunk and prefix its text with its heading trail.

    For the chunk at position ``n`` this stores ``n`` under
    ``metadata['chunk_id']`` and rewrites ``text`` to
    ``"<headings> > paragraph id: <n>\\n\\n<original text>"``.
    The chunks are modified in place; nothing is returned.
    """
    for position, chunk in enumerate(guideline):
        meta = chunk['metadata']
        meta['chunk_id'] = position
        prefix = f"{meta['headings']} > paragraph id: {position}\n\n"
        chunk['text'] = prefix + chunk['text']
|
|
|
|
|
|
|
|
def build_headings_trail(p):
    """Return the " > "-joined trail of h2-h6 headings that lead to ``p``.

    The innermost entry is the nearest preceding sibling heading of ``p``
    ("No heading" if there is none). Walking outwards through the enclosing
    ``<section>``/``<figure>`` elements that carry an ``id``, the nearest
    preceding sibling heading of each one is prepended when present.

    Args:
        p: the element to locate, or None.

    Returns:
        The trail as a single-line string; "No heading" when ``p`` is None.
    """
    # Callers pass the result of find('p') / find('div'), which is None when
    # the element is missing; report that instead of raising AttributeError.
    if p is None:
        return 'No heading'

    def is_subheading(tag):
        # Only h2..h6 count; h1 is excluded by the pattern.
        return bool(re.match(r'^h[2-6]$', tag.name))

    heading = p.find_previous_sibling(is_subheading)
    headings = heading.get_text(strip=True) if heading else 'No heading'

    # The first hop may land on a <figure>; further hops only follow <section>.
    parent_sec = p.find_parent(["section", 'figure'], id=True)
    while parent_sec:
        heading = parent_sec.find_previous_sibling(is_subheading)
        if heading:
            headings = heading.get_text(strip=True) + ' > ' + headings
        parent_sec = parent_sec.find_parent("section", id=True)

    return headings.strip().replace('\n', ' ')
|
|
|
|
|
|
|
|
def delete_bib_links(soup):
    """Remove every ``<a>`` whose href starts with "#bdi12609-bib-".

    The anchors are decomposed in place and the same ``soup`` object is
    returned.
    """
    bib_prefix = "#bdi12609-bib-"
    anchors = soup.find_all("a", href=True)
    for anchor in anchors:
        if not anchor["href"].startswith(bib_prefix):
            continue
        anchor.decompose()
    return soup
|
|
|
|
|
def delete_fig_and_tbl_sections(soup):
    """Strip all ``<figure>`` elements, then all table sections, from ``soup``.

    Table sections are ``<section>`` elements matched with
    ``class_="tw xbox font-sm"``. The tree is modified in place and the same
    ``soup`` object is returned.
    """
    # Figures are removed before table sections are searched for, so the
    # second query runs against the already-pruned tree.
    selectors = (
        ('figure', {}),
        ('section', {'class_': "tw xbox font-sm"}),
    )
    for tag_name, filters in selectors:
        for node in soup.find_all(tag_name, **filters):
            node.decompose()
    return soup
|
|
|
|
|
|
|
|
def scan_links_and_tables(p):
    """Collect the figure/table ids linked from within ``p``.

    Every ``<a>`` whose href contains "-fig-" or "-tbl-" is inspected and each
    "fig-<digits>" / "tbl-<digits>" occurrence in the href is recorded. Each
    hit is echoed to stdout, followed by a separator line when at least one
    id was found.

    Returns:
        A set of strings such as ``{"fig-1", "tbl-12"}`` (possibly empty).
    """
    link_filter = re.compile(r'-(fig|tbl)-')
    referenced_tables = set()

    for anchor in p.find_all('a', href=link_filter):
        for kind, number in re.findall(r'(fig|tbl)-(\d+)', anchor['href']):
            label = f"{kind}-{number}"
            print(f"Found table links: {label}")
            referenced_tables.add(label)

    # NOTE(review): the flattened source does not show whether this separator
    # belonged inside the link loop; it is emitted once per call here.
    if referenced_tables:
        print("--------")

    return referenced_tables
|
|
|
|
|
def parse_paragraph(soup, output):
    """Append one "paragraph" chunk to ``output`` for every ``<p>`` in ``soup``.

    Each chunk carries the paragraph text, a section URL built from the id of
    the nearest enclosing ``<section>``, the heading trail and the figure/table
    ids linked from the paragraph.

    Args:
        soup: parsed document to scan.
        output: list that receives the chunk dicts (modified in place).
    """
    for p in soup.find_all('p'):
        parent = p.find_parent("section")
        # A paragraph may sit outside any <section>, or inside one that has no
        # id attribute; both cases map to "unknown" instead of raising KeyError.
        sec_id = parent.get("id", "unknown") if parent else "unknown"

        output.append({
            "text": p.get_text(strip=False),
            "metadata": {
                "section": f"https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#{sec_id}",
                "type": "paragraph",
                "headings": build_headings_trail(p),
                "referenced_tables": list(scan_links_and_tables(p)),
                "referee_id": "",
            }
        })
|
|
|
|
|
|
|
|
def parse_figures(soup, output):
    """Append one "figure" chunk to ``output`` for every ``<figure>`` in ``soup``.

    The chunk text combines the image source (or a placeholder when the figure
    has no ``<img>``) with the text of the first ``<p>`` in the figure's
    ``<figcaption>``. ``referee_id`` is the "fig-<digits>" part of the figure
    id, or "fig_unknown" when the id has none.
    """
    for fig in soup.find_all('figure'):
        sec_id = fig["id"]
        fig_caption = fig.find('figcaption').find('p').get_text()

        image = fig.find('img')
        img_src_link = image['src'] if image else "No image link found"

        id_match = re.search(r'(fig)-(\d+)', sec_id)
        first_paragraph = fig.find('p')

        metadata = {
            "section": f"https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#{sec_id}",
            "type": "figure",
            "headings": build_headings_trail(first_paragraph),
            "referenced_tables": list(scan_links_and_tables(first_paragraph)),
            "referee_id": id_match.group(0) if id_match else "fig_unknown",
        }
        output.append({
            "text": f" > Figure: Image link: {img_src_link}-----\nFigure Caption: {fig_caption}\n",
            "metadata": metadata,
        })
|
|
|
|
|
def parse_tables(soup, output):
    """Append one "table" chunk to ``output`` for every table section.

    Table sections are ``<section>`` elements matched with
    ``class_="tw xbox font-sm"``. A section that contains an ``<img>`` is
    described by its image link and caption text; any other section is
    rendered to text through ``get_table_metadata`` / ``get_table_data`` /
    ``to_text``.

    Args:
        soup: parsed document to scan.
        output: list that receives the chunk dicts (modified in place).
    """
    for table in soup.find_all("section", class_="tw xbox font-sm"):
        sec_id = table["id"]
        referee_id = re.search(r'(tbl)-(\d+)', sec_id)

        img = table.find('img')
        if img:
            # Table published as an image: keep the link and the caption.
            anchor = table.find('p')
            # Every caption fragment is followed by a single space, including
            # the last one.
            caption = "".join(
                cap.get_text(strip=True) + " "
                for cap in table.find_all('div', class_='caption p')
            )
            text = f" > Table: Image link: {img['src']}-----\nTable Caption: {caption}\n"
        else:
            # Only caption, footnotes and label of the metadata tuple are used.
            _name, caption, footnotes, _headings, label, _ref_id, _section_url = (
                get_table_metadata(table, base_url="")
            )
            table_data = get_table_data(table, footnotes)
            text = to_text(table_data, label, caption)
            anchor = table.find('div')

        # Without a <p>/<div> inside the section, locate headings from the
        # section element itself rather than passing None on.
        if anchor is None:
            anchor = table

        output.append({
            "text": text,
            "metadata": {
                "section": f"https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#{sec_id}",
                "type": "table",
                "headings": build_headings_trail(anchor),
                "referenced_tables": [],
                "referee_id": referee_id.group(0) if referee_id else "tbl_unknown",
            }
        })
|
|
|
|
|
|
|
|
|
|
|
def parse_main_article(soup, output):
    """Fill ``output`` with figure, table and paragraph chunks, in that order.

    Bibliography links are removed first. Figures and tables are extracted
    while they are still in the tree, then stripped so that the paragraph pass
    only sees the remaining ``<p>`` elements.

    Returns:
        The same ``output`` list that was passed in.
    """
    soup = delete_bib_links(soup)

    # Must run before the figure/table elements are deleted below.
    for extract in (parse_figures, parse_tables):
        extract(soup, output)

    parse_paragraph(delete_fig_and_tbl_sections(soup), output)
    return output
|
|
|
|
|
|
|
|
def main():
    """Parse ``bipolar.html`` into chunks and write them to ``guideline_db.json``.

    The chunk list starts with the title chunk, is extended by
    parse_main_article(), then gets heading prefixes and abbreviation footers
    before being dumped as indented, non-ASCII-escaped JSON.
    """
    with open('bipolar.html', 'r', encoding='utf-8') as source:
        soup = BeautifulSoup(source.read(), "html.parser")

    chunks = parse_main_article(soup, [parse_title(soup)])
    print(f"Parsed {len(chunks)} paragraphs from the main article.")

    # Both helpers modify the chunk dicts in place.
    prepend_headings_to_text(chunks)
    append_definition(chunks)

    with open("guideline_db.json", "w", encoding="utf-8") as sink:
        json.dump(chunks, sink, ensure_ascii=False, indent=4)
    print(f"guideline_db.json for bipolar created with {len(chunks)} chunks.")
|
|
|
|
|
|
|
|
def write_referenced_tables():
    """Add textual table renderings to ``guideline_db.json`` and export a slice.

    Reads ``guideline_db.json``, appends the stripped contents of three
    ``table*_textual.txt`` files to the chunks at fixed positions, rewrites
    ``guideline_db.json`` and writes chunks 1..33 to
    ``referenced_table_chunks.json``.

    Running the function again does not append the same table text a second
    time.
    """
    with open("guideline_db.json", "r", encoding="utf-8") as f:
        guideline_db = json.load(f)

    # (chunk position, file holding the textual rendering).
    # NOTE(review): the positions are hard-coded; presumably they are where
    # tables 12, 14 and 17 land in the current guideline_db.json -- verify
    # whenever the parsing order changes.
    supplements = (
        (21, "table12_textual.txt"),
        (23, "table14_textual.txt"),
        (26, "table17_textual.txt"),
    )
    for index, filename in supplements:
        with open(filename, "r", encoding="utf-8") as f:
            extra = f.read().strip()
        chunk = guideline_db[index]
        # Skip when an earlier run already appended this text; otherwise each
        # invocation would grow the chunk by another copy.
        if not chunk['text'].endswith(extra):
            chunk['text'] += extra

    # Slice taken after the appends so the exported chunks include them.
    figures_and_tables = guideline_db[1:34]

    with open("guideline_db.json", "w", encoding="utf-8") as f:
        json.dump(guideline_db, f, ensure_ascii=False, indent=4)
    print("guideline_db.json updated with table 12 14 17 chunks.")

    with open("referenced_table_chunks.json", "w", encoding="utf-8") as f:
        json.dump(figures_and_tables, f, ensure_ascii=False, indent=4)
    print(f"referenced_table_chunks.json created with {len(figures_and_tables)} chunks.")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: only the post-processing step is run here.
    # write_referenced_tables() reads guideline_db.json, which main() writes,
    # so that file must already exist.
    # NOTE(review): main() is not called from this guard -- confirm that
    # running it separately beforehand is the intended workflow.
    write_referenced_tables()
|
|
|