# zzejiao's picture
# yash's hf bipolar demo code with github action set
# 3530638
import re
import json
from tables import get_table_metadata, to_text, get_table_data
from bs4 import BeautifulSoup, Tag, NavigableString
# Lookup table from an abbreviation (exact, case-sensitive spelling) to its
# expansion. append_definition() reads this table to build the
# "Abbreviations: ..." footer it appends to each chunk's text.
# NOTE(review): several keys contain lowercase letters, digits or hyphens
# ("rTMS", "tDCS", "NbN", "DSM-5", ...); an uppercase-only matcher such as
# r'\b([A-Z]{2,})\b' can never produce those keys.
abbr_map = {
    "ACT": "Acceptance and commitment therapy",
    "ADHD": "Attention-deficit hyperactivity disorder",
    "AI": "Artificial intelligence",
    "BA": "Behavioural activation",
    "CAM": "Complementary and alternative medicine",
    # "CANMAT":"Canadian Network for Mood and Anxiety Treatments",
    "CBASP": "Cognitive behavioural analysis system of psychotherapy",
    "CBT": "Cognitive-behavioural therapy",
    "CPD": "Continuing professional development",
    "CYP": "Cytochrome P450",
    "DBS": "Deep brain stimulation",
    "DHI": "Digital health intervention",
    "DLPFC": "Dorsolateral prefrontal cortex",
    "DSM-5": "Diagnostic and Statistical Manual",
    "DSM-5-TR": "Diagnostic and Statistical Manual, 5th edition, Text Revision",
    "DSM-IV-TR":"Diagnostic and Statistical Manual, 4th edition, Text Revision",
    "DTD": "Difficult-to-treat depression",
    "ECG": "Electrocardiography",
    "ECT": "Electroconvulsive therapy",
    "EEG": "Electroencephalography",
    "GRADE": "Grading of Recommendations Assessment, Development, and Evaluation",
    "ICD": "International Classification of Diseases",
    "IPT": "Interpersonal therapy",
    "MAOI": "Monoamine oxidase inhibitor",
    "MBC": "Measurement-based care",
    "MBCT": "Mindfulness-based cognitive therapy",
    "MCT": "Metacognitive therapy",
    "MDD": "Major depressive disorder",
    "MDE": "Major depressive episode",
    "MI": "Motivational interviewing",
    "MST": "Magnetic seizure therapy",
    "NbN": "Neuroscience-based nomenclature",
    "NDRI": "Norepinephrine-dopamine reuptake inhibitor",
    "NMDA": "N-methyl-D-aspartate",
    "NSAID": "Nonsteroidal anti-inflammatory drug",
    "PDD": "Persistent depressive disorder",
    "PDT": "Psychodynamic psychotherapy",
    "PHQ": "Patient health questionnaire",
    "PST": "Problem-solving therapy",
    "RCT": "Randomized controlled trial",
    "rTMS": "Repetitive transcranial magnetic stimulation",
    "SDM": "Shared decision-making",
    "SNRI": "Serotonin-norepinephrine reuptake inhibitor",
    "SSRI": "Selective serotonin reuptake inhibitor",
    "STPP": "Short-term psychodynamic psychotherapy",
    "TBS": "Theta burst stimulation",
    "TCA": "Tricyclic antidepressants",
    "tDCS": "Transcranial direct current stimulation",
    "TMS": "Transcranial magnetic stimulation",
    "TRD": "Treatment-resistant depression",
    "VNS": "Vagus nerve stimulation",
    "WHO": "World Health Organization",
}
def append_definition(guideline, abbreviations=None):
    """Append an "Abbreviations: ..." footer to every chunk that uses one.

    Each chunk's text is scanned for the known abbreviations; when at least
    one is present, a line of the form
    ``Abbreviations: A: expansion; B: expansion`` (sorted by abbreviation)
    is appended after a newline, followed by a blank line.

    Args:
        guideline: list of chunk dicts, each with a ``'text'`` string and a
            ``'metadata'`` dict containing ``'referee_id'``. Modified in place.
        abbreviations: optional mapping of abbreviation -> expansion. Defaults
            to the module-level ``abbr_map``.

    Returns:
        The same ``guideline`` list that was passed in.
    """
    if abbreviations is None:
        abbreviations = abbr_map
    if not abbreviations:
        # An empty alternation would match the empty string everywhere.
        return guideline

    # Build the matcher from the actual keys instead of an uppercase-only
    # pattern, so mixed-case / hyphenated / numbered keys such as "rTMS",
    # "NbN" or "DSM-5-TR" can be found. Longest keys go first so that
    # "DSM-5-TR" is preferred over its prefix "DSM-5".
    alternation = "|".join(
        re.escape(abbr) for abbr in sorted(abbreviations, key=len, reverse=True)
    )
    # The lookarounds require a non-word character (or string edge) on both
    # sides, so "TMS" is not reported inside "rTMS" or "SSRI" inside "SSRIs".
    pattern = re.compile(rf"(?<!\w)(?:{alternation})(?!\w)")

    for chunk in guideline:
        # Chunks whose referee_id is 'table_c' are left untouched.
        if chunk['metadata']['referee_id'] == 'table_c':
            continue
        text = chunk['text']
        found_abbrs = set(pattern.findall(text))
        if not found_abbrs:
            continue
        definitions = "; ".join(
            f"{abbr}: {abbreviations[abbr]}" for abbr in sorted(found_abbrs)
        )
        chunk['text'] = text + "\n" + "Abbreviations: " + definitions + "\n\n"
    return guideline
def parse_title(soup):
    """Build the title chunk from the document's first ``<h1>``.

    Args:
        soup: parsed document (anything exposing ``find``).

    Returns:
        A chunk dict whose ``"text"`` is the inner markup of the first
        ``<h1>`` with newlines removed, or an empty string when the document
        has no ``<h1>``. The metadata marks the chunk as the title.
    """
    heading = soup.find("h1")
    # Fall back to "" rather than None: downstream code concatenates the
    # chunk text with other strings and would raise TypeError on None.
    if heading is not None:
        title = heading.decode_contents().replace('\n', '')
    else:
        title = ""
    return {
        "text": title,
        "metadata": {
            "section": "title",
            "type": "title",
            "headings": "Title of the guideline document",
            "referenced_tables": [],
            "referee_id": ""
        }
    }
def prepend_headings_to_text(guideline):
    """Number every chunk and prefix its text with its headings trail.

    For the chunk at position ``n`` this stores ``n`` under
    ``metadata['chunk_id']`` and rewrites ``text`` to
    ``"<headings> > paragraph id: <n>\\n\\n<old text>"``.
    The list is modified in place; nothing is returned.
    """
    for position, chunk in enumerate(guideline):
        meta = chunk['metadata']
        meta['chunk_id'] = position
        prefix = meta['headings'] + " > paragraph id: " + str(position) + "\n\n"
        chunk['text'] = prefix + chunk['text']
def build_headings_trail(p):
    """Return the " > "-joined trail of headings that lead to ``p``.

    The trail starts with the closest preceding sibling ``<h2>``..``<h6>`` of
    ``p`` (or ``'No heading'`` if there is none). It then walks outward: first
    to the nearest enclosing ``<section>``/``<figure>`` that has an ``id``,
    afterwards through enclosing ``<section>`` elements with an ``id``. At
    each level the preceding sibling heading, if any, is prepended.

    Args:
        p: the element to locate, or ``None``.

    Returns:
        The trail as a single-line string; ``'No heading'`` when ``p`` is
        ``None``.
    """
    # Callers pass the result of find(...), which is None when nothing
    # matched; treat that like an element without any heading.
    if p is None:
        return 'No heading'

    def is_heading(tag):
        return bool(re.match(r'^h[2-6]$', tag.name))

    nearest = p.find_previous_sibling(is_heading)
    headings = nearest.get_text(strip=True) if nearest is not None else 'No heading'

    # The first hop may land on a <figure>; later hops only follow <section>.
    parent_sec = p.find_parent(["section", 'figure'], id=True)
    while parent_sec is not None:
        outer = parent_sec.find_previous_sibling(is_heading)
        if outer is not None:
            headings = outer.get_text(strip=True) + ' > ' + headings
        parent_sec = parent_sec.find_parent("section", id=True)

    return headings.strip().replace('\n', ' ')
def delete_bib_links(soup):
    """Drop every ``<a>`` whose href starts with the bibliography prefix.

    The anchors are removed from ``soup`` in place via ``decompose()``;
    the same ``soup`` object is returned.
    """
    bib_prefix = "#bdi12609-bib-"
    for anchor in soup.find_all("a", href=True):
        if not anchor["href"].startswith(bib_prefix):
            continue
        anchor.decompose()
    return soup
def delete_fig_and_tbl_sections(soup):
    """Strip figures and table sections out of ``soup`` in place.

    First every ``<figure>`` is decomposed, then every ``<section>`` whose
    class is "tw xbox font-sm". The same ``soup`` object is returned.
    """
    # Each query runs only after the previous group has been removed,
    # so figures are gone before the sections are looked up.
    removal_queries = (
        ('figure', {}),
        ('section', {'class_': "tw xbox font-sm"}),
    )
    for tag_name, filters in removal_queries:
        for node in soup.find_all(tag_name, **filters):
            node.decompose()
    return soup
def scan_links_and_tables(p):
    """Collect the figure/table ids that links inside ``p`` point to.

    Looks at every ``<a>`` under ``p`` whose href contains ``-fig-`` or
    ``-tbl-`` and extracts each ``fig-<digits>`` / ``tbl-<digits>`` pair from
    that href. Every hit is printed, and a separator line is printed when at
    least one id was found.

    Returns:
        A set of strings such as ``"tbl-0003"`` (empty if nothing matched).
    """
    link_filter = re.compile(r'-(fig|tbl)-')
    referenced_tables = set()
    for anchor in p.find_all('a', href=link_filter):
        for kind, number in re.findall(r'(fig|tbl)-(\d+)', anchor['href']):
            table_ref = f"{kind}-{number}"
            print(f"Found table links: {table_ref}")
            referenced_tables.add(table_ref)
    if referenced_tables:
        print("--------")
    return referenced_tables
def parse_paragraph(soup, output):
    """Append one chunk per ``<p>`` element of ``soup`` to ``output``.

    Each chunk carries the paragraph's full text, a section URL built from
    the id of the nearest enclosing ``<section>``, the headings trail, and
    the figure/table ids linked from the paragraph.

    Args:
        soup: parsed document to scan.
        output: list the chunk dicts are appended to (modified in place).
    """
    for p in soup.find_all('p'):
        parent = p.find_parent("section")
        # A <section> is not guaranteed to carry an id; use the same
        # "unknown" placeholder as for paragraphs outside any section
        # instead of raising KeyError.
        sec_id = parent.get("id", "unknown") if parent is not None else "unknown"
        output.append({
            "text": p.get_text(strip=False),
            "metadata": {
                "section": f"https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#{sec_id}",
                "type": "paragraph",
                "headings": build_headings_trail(p),
                "referenced_tables": list(scan_links_and_tables(p)),
                "referee_id": "",
            }
        })
def parse_figures(soup, output):
    """Append one chunk per ``<figure>`` element of ``soup`` to ``output``.

    The chunk text contains the image link and the caption; the metadata
    contains the section URL built from the figure's id, the headings trail,
    linked figure/table ids, and a ``referee_id`` of the form
    ``fig-<digits>`` taken from the figure's id (``"fig_unknown"`` if the id
    does not contain one).

    Args:
        soup: parsed document to scan.
        output: list the chunk dicts are appended to (modified in place).
    """
    for fig in soup.find_all('figure'):
        # Missing id: keep going with a placeholder instead of KeyError.
        sec_id = fig.get("id", "unknown")

        # Caption: prefer the first <p> of the <figcaption>; degrade to the
        # whole <figcaption> text, then to "", rather than crashing on None.
        figcaption = fig.find('figcaption')
        caption_p = figcaption.find('p') if figcaption is not None else None
        if caption_p is not None:
            fig_caption = caption_p.get_text()
        elif figcaption is not None:
            fig_caption = figcaption.get_text()
        else:
            fig_caption = ""

        img = fig.find('img')
        if img is not None:
            img_src_link = img.get('src', "No image link found")
        else:
            img_src_link = "No image link found"

        referee_id = re.search(r'(fig)-(\d+)', sec_id)

        # The first <p> of the figure anchors the heading/link lookups; a
        # figure without any <p> uses the figure element itself.
        anchor = fig.find('p')
        if anchor is None:
            anchor = fig

        output.append({
            "text": f" > Figure: Image link: {img_src_link}-----\nFigure Caption: {fig_caption}\n",
            "metadata": {
                "section": f"https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#{sec_id}",
                "type": "figure",
                "headings": build_headings_trail(anchor),
                "referenced_tables": list(scan_links_and_tables(anchor)),
                "referee_id": referee_id.group(0) if referee_id else "fig_unknown",
            }
        })
def parse_tables(soup, output):
    """Append one chunk per table section of ``soup`` to ``output``.

    A table section is a ``<section>`` whose class is "tw xbox font-sm".
    Sections containing an ``<img>`` are treated as image tables: the chunk
    text holds the image link and the concatenated captions. All other
    sections are treated as HTML tables and rendered to text through
    ``get_table_metadata`` / ``get_table_data`` / ``to_text``.

    Args:
        soup: parsed document to scan.
        output: list the chunk dicts are appended to (modified in place).
    """
    for table in soup.find_all("section", class_="tw xbox font-sm"):
        # Missing id: keep going with a placeholder instead of KeyError.
        sec_id = table.get("id", "unknown")
        id_match = re.search(r'(tbl)-(\d+)', sec_id)
        referee_id = id_match.group(0) if id_match else "tbl_unknown"

        img = table.find('img')
        if img is not None:
            # Image table: describe it by its image link and captions.
            img_src_link = img.get('src', "No image link found")
            caption = "".join(
                cap.get_text(strip=True) + " "
                for cap in table.find_all('div', class_='caption p')
            )
            text = f" > Table: Image link: {img_src_link}-----\nTable Caption: {caption}\n"
            anchor = table.find('p')
        else:
            # HTML table: only caption, footnotes and label are needed here.
            (_name, caption, footnotes, _headings,
             label, _ref_id, _section_url) = get_table_metadata(table, base_url="")
            table_data = get_table_data(table, footnotes)
            text = to_text(table_data, label, caption)
            anchor = table.find('div')

        # If the section has no <p>/<div> to anchor on, use the section
        # itself so the headings lookup still has an element to start from.
        if anchor is None:
            anchor = table

        output.append({
            "text": text,
            "metadata": {
                "section": f"https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#{sec_id}",
                "type": "table",
                "headings": build_headings_trail(anchor),
                "referenced_tables": [],
                "referee_id": referee_id,
            }
        })
def parse_main_article(soup, output):
    """Run the full extraction pipeline over ``soup``.

    Bibliography links are removed first. Figures and tables are extracted
    next, then removed from the tree, so that the final paragraph pass only
    sees the remaining ``<p>`` elements. All chunks are appended to
    ``output``, which is also returned.
    """
    cleaned = delete_bib_links(soup)
    for extract in (parse_figures, parse_tables):
        extract(cleaned, output)
    # Figures/tables must be gone before paragraphs are collected.
    parse_paragraph(delete_fig_and_tbl_sections(cleaned), output)
    return output
def main():
    """Convert bipolar.html into guideline_db.json.

    Parses the HTML, collects the title chunk followed by the article chunks,
    prefixes each chunk with its headings, appends abbreviation definitions,
    and dumps the result as indented UTF-8 JSON.
    """
    with open('bipolar.html', 'r', encoding='utf-8') as source:
        markup = source.read()
    soup = BeautifulSoup(markup, "html.parser")

    chunks = parse_main_article(soup, [parse_title(soup)])
    print(f"Parsed {len(chunks)} paragraphs from the main article.")

    prepend_headings_to_text(chunks)
    append_definition(chunks)

    with open("guideline_db.json", "w", encoding="utf-8") as target:
        json.dump(chunks, target, ensure_ascii=False, indent=4)
    print(f"guideline_db.json for bipolar created with {len(chunks)} chunks.")
def write_referenced_tables():
    """Merge textual table files into guideline_db.json and export a slice.

    Reads ``guideline_db.json``, appends the stripped contents of
    ``table12_textual.txt``, ``table14_textual.txt`` and
    ``table17_textual.txt`` to the text of chunks 21, 23 and 26 respectively,
    writes the database back, and writes chunks 1..33 to
    ``referenced_table_chunks.json``.

    The function is safe to re-run: a chunk whose text already ends with the
    corresponding file content is left unchanged, so repeated runs do not
    duplicate the appended text.

    Raises:
        FileNotFoundError: if any input file is missing.
        IndexError: if the database has fewer chunks than the hard-coded
            positions require.
    """
    with open("guideline_db.json", "r", encoding="utf-8") as f:
        guideline_db = json.load(f)

    # Hard-coded chunk positions in guideline_db.json.
    # NOTE(review): presumably the chunks of tables 12, 14 and 17 - these
    # indices depend on the parse order; confirm after regenerating the db.
    textual_tables = {
        21: "table12_textual.txt",
        23: "table14_textual.txt",
        26: "table17_textual.txt",
    }
    for chunk_index, path in textual_tables.items():
        with open(path, "r", encoding="utf-8") as f:
            addition = f.read().strip()
        chunk = guideline_db[chunk_index]
        # This function rewrites the file it reads, so guard against
        # appending the same text again on a second run.
        if not chunk['text'].endswith(addition):
            chunk['text'] += addition

    figures_and_tables = guideline_db[1:34]  # Assuming these are the table chunks

    # Write back to the original file.
    with open("guideline_db.json", "w", encoding="utf-8") as f:
        json.dump(guideline_db, f, ensure_ascii=False, indent=4)
    print("guideline_db.json updated with table 12 14 17 chunks.")

    with open("referenced_table_chunks.json", "w", encoding="utf-8") as f:
        json.dump(figures_and_tables, f, ensure_ascii=False, indent=4)
    print(f"referenced_table_chunks.json created with {len(figures_and_tables)} chunks.")
if __name__ == "__main__":
    # Script entry point. The main() call is currently commented out, so
    # running this file does not rebuild guideline_db.json from bipolar.html.
    # main()
    # Instead, read the existing guideline_db.json, merge the textual table
    # files into it, and write referenced_table_chunks.json.
    write_referenced_tables()