Spaces:

ymali
/

bipolar

Sleeping

App Files Files Community

bipolar / src /data_processing /main.py

zzejiao

yash's hf bipolar demo code with github action set

3530638 6 months ago

raw

history blame contribute delete

11.5 kB

	import re
	import json
	from tables import get_table_metadata, to_text, get_table_data
	from bs4 import BeautifulSoup, Tag, NavigableString

	# Abbreviation -> full-form lookup table.
	# append_definition() looks abbreviations found in a chunk's text up in this
	# map and uses the values to build the "Abbreviations: ..." footer.
	# Keys are matched case-sensitively (plain dict lookup).
	abbr_map = {
	"ACT": "Acceptance and commitment therapy",
	"ADHD": "Attention-deficit hyperactivity disorder",
	"AI": "Artificial intelligence",
	"BA": "Behavioural activation",
	"CAM": "Complementary and alternative medicine",
	# "CANMAT":"Canadian Network for Mood and Anxiety Treatments",
	"CBASP": "Cognitive behavioural analysis system of psychotherapy",
	"CBT": "Cognitive-behavioural therapy",
	"CPD": "Continuing professional development",
	"CYP": "Cytochrome P450",
	"DBS": "Deep brain stimulation",
	"DHI": "Digital health intervention",
	"DLPFC": "Dorsolateral prefrontal cortex",
	"DSM-5": "Diagnostic and Statistical Manual",
	"DSM-5-TR": "Diagnostic and Statistical Manual, 5th edition, Text Revision",
	"DSM-IV-TR":"Diagnostic and Statistical Manual, 4th edition, Text Revision",
	"DTD": "Difficult-to-treat depression",
	"ECG": "Electrocardiography",
	"ECT": "Electroconvulsive therapy",
	"EEG": "Electroencephalography",
	"GRADE": "Grading of Recommendations Assessment, Development, and Evaluation",
	"ICD": "International Classification of Diseases",
	"IPT": "Interpersonal therapy",
	"MAOI": "Monoamine oxidase inhibitor",
	"MBC": "Measurement-based care",
	"MBCT": "Mindfulness-based cognitive therapy",
	"MCT": "Metacognitive therapy",
	"MDD": "Major depressive disorder",
	"MDE": "Major depressive episode",
	"MI": "Motivational interviewing",
	"MST": "Magnetic seizure therapy",
	"NbN": "Neuroscience-based nomenclature",
	"NDRI": "Norepinephrine-dopamine reuptake inhibitor",
	"NMDA": "N-methyl-D-aspartate",
	"NSAID": "Nonsteroidal anti-inflammatory drug",
	"PDD": "Persistent depressive disorder",
	"PDT": "Psychodynamic psychotherapy",
	"PHQ": "Patient health questionnaire",
	"PST": "Problem-solving therapy",
	"RCT": "Randomized controlled trial",
	"rTMS": "Repetitive transcranial magnetic stimulation",
	"SDM": "Shared decision-making",
	"SNRI": "Serotonin-norepinephrine reuptake inhibitor",
	"SSRI": "Selective serotonin reuptake inhibitor",
	"STPP": "Short-term psychodynamic psychotherapy",
	"TBS": "Theta burst stimulation",
	"TCA": "Tricyclic antidepressants",
	"tDCS": "Transcranial direct current stimulation",
	"TMS": "Transcranial magnetic stimulation",
	"TRD": "Treatment-resistant depression",
	"VNS": "Vagus nerve stimulation",
	"WHO": "World Health Organization",
	}

	def append_definition(guideline, abbreviations=None):
	    """Append an "Abbreviations: ..." footer to chunks that use known abbreviations.

	    Each chunk's text is scanned for the keys of *abbreviations*; when any
	    are found, a line of the form
	    ``"Abbreviations: ABBR: expansion; ABBR2: expansion2"`` (sorted by
	    abbreviation) is appended to the chunk's ``'text'``. Chunks whose
	    ``metadata['referee_id']`` equals ``'table_c'`` are left untouched.

	    Args:
	        guideline: list of chunk dicts with ``'text'`` and
	            ``'metadata'['referee_id']`` entries. Modified in place.
	        abbreviations: optional mapping of abbreviation -> expansion.
	            Defaults to the module-level ``abbr_map``.

	    Returns:
	        The same *guideline* list, for convenience.
	    """
	    if abbreviations is None:
	        abbreviations = abbr_map
	    if not abbreviations:
	        # An empty alternation would match the empty string everywhere.
	        return guideline

	    # Build the pattern from the actual keys rather than assuming "two or
	    # more capitals": keys such as "rTMS", "NbN" or "DSM-5-TR" contain
	    # lowercase letters, digits and hyphens. Longest keys go first so that
	    # "DSM-5-TR" wins over "DSM-5" and "rTMS" over "TMS".
	    alternatives = "|".join(
	        re.escape(key) for key in sorted(abbreviations, key=len, reverse=True)
	    )
	    # The lookarounds keep whole-token matching: no word character may
	    # touch the abbreviation on either side.
	    pattern = re.compile(r"(?<!\w)(?:" + alternatives + r")(?!\w)")

	    for chunk in guideline:
	        if chunk['metadata']['referee_id'] == 'table_c':
	            continue

	        text = chunk['text']
	        found_abbrs = sorted(set(pattern.findall(text)))
	        if not found_abbrs:
	            continue

	        definitions = "; ".join(
	            f"{abbr}: {abbreviations[abbr]}" for abbr in found_abbrs
	        )
	        chunk['text'] = text + "\n" + "Abbreviations: " + definitions + "\n\n"

	    return guideline

	def parse_title(soup):
	    """Build the title chunk from the first <h1> element of *soup*.

	    The chunk's text is the inner markup of the <h1> with newline
	    characters removed.

	    Returns:
	        The title chunk dict, or None when *soup* has no <h1>.
	    """
	    h1 = soup.find("h1")
	    if not h1:
	        return None

	    title_text = h1.decode_contents().replace('\n', '')
	    metadata = {
	        "section": "title",
	        "type": "title",
	        "headings": "Title of the guideline document",
	        "referenced_tables": [],
	        "referee_id": ""
	    }
	    return {"text": title_text, "metadata": metadata}

	def prepend_headings_to_text(guideline):
	    """Number every chunk and prefix its text with its headings trail.

	    For the chunk at position ``n``, stores ``n`` as
	    ``metadata['chunk_id']`` and rewrites ``text`` to
	    ``"<headings> > paragraph id: <n>\\n\\n<original text>"``.
	    The list is modified in place; nothing is returned.
	    """
	    for chunk_id, chunk in enumerate(guideline):
	        meta = chunk['metadata']
	        meta['chunk_id'] = chunk_id
	        prefix = meta['headings'] + " > paragraph id: " + str(chunk_id)
	        chunk['text'] = prefix + "\n\n" + chunk['text']


	def build_headings_trail(p):
	    """Return the " > "-joined trail of headings that lead to element *p*.

	    The trail ends with the nearest preceding sibling <h2>-<h6> of *p*
	    (or ``'No heading'`` when there is none). It is then extended to the
	    left with the preceding-sibling heading of every enclosing
	    <section>/<figure> that has an ``id`` (outer levels: <section> only).

	    Args:
	        p: a BeautifulSoup element, or None. Callers obtain it with
	            ``find(...)``, which yields None when nothing matches.

	    Returns:
	        The trail as a single-line string; ``'No heading'`` for None.
	    """
	    if p is None:
	        # Nothing to anchor on; behave like an element without headings
	        # instead of raising AttributeError.
	        return 'No heading'

	    def is_heading(tag):
	        return bool(re.match(r'^h[2-6]$', tag.name))

	    nearest = p.find_previous_sibling(is_heading)
	    headings = nearest.get_text(strip=True) if nearest else 'No heading'

	    parent_sec = p.find_parent(["section", 'figure'], id=True)
	    while parent_sec:
	        outer = parent_sec.find_previous_sibling(is_heading)
	        if outer:
	            headings = outer.get_text(strip=True) + ' > ' + headings
	        # Always climb, even when this level has no heading.
	        parent_sec = parent_sec.find_parent("section", id=True)

	    return headings.strip().replace('\n', ' ')


	def delete_bib_links(soup):
	    """Remove bibliography citation anchors from *soup* and return it.

	    An anchor is removed when its ``href`` starts with
	    ``"#bdi12609-bib-"``. The tree is modified in place.
	    """
	    bib_prefix = "#bdi12609-bib-"
	    for anchor in soup.find_all("a", href=True):
	        if not anchor["href"].startswith(bib_prefix):
	            continue
	        anchor.decompose()
	    return soup

	def delete_fig_and_tbl_sections(soup):
	    """Remove figure and table containers from *soup* and return it.

	    Every <figure> is removed first, then every <section> whose class is
	    "tw xbox font-sm". The tree is modified in place.
	    """
	    selectors = (
	        ('figure', {}),
	        ('section', {'class_': "tw xbox font-sm"}),
	    )
	    for tag_name, filters in selectors:
	        for node in soup.find_all(tag_name, **filters):
	            node.decompose()
	    return soup


	def scan_links_and_tables(p):
	    """Collect the figure/table IDs that element *p* links to.

	    Looks at every <a> under *p* whose ``href`` contains ``-fig-`` or
	    ``-tbl-`` and extracts each ``fig-<digits>`` / ``tbl-<digits>``
	    occurrence from it. Each hit is also printed as a progress message.

	    Args:
	        p: a BeautifulSoup element, or None.

	    Returns:
	        A set of strings such as ``"tbl-0012"``; empty when *p* is None
	        or contains no matching links.
	    """
	    referenced_tables = set()
	    if p is None:
	        return referenced_tables

	    # NOTE: the alternation must be a bare "|"; an escaped "\|" matches a
	    # literal pipe character, which never occurs in these hrefs.
	    for link in p.find_all('a', href=re.compile(r'-(fig|tbl)-')):
	        for kind, number in re.findall(r'(fig|tbl)-(\d+)', link['href']):
	            ref = f"{kind}-{number}"
	            print(f"Found table links: {ref}")
	            referenced_tables.add(ref)
	    if referenced_tables:
	        print("--------")

	    return referenced_tables

	def parse_paragraph(soup, output):
	    """Append one "paragraph" chunk to *output* for every <p> in *soup*.

	    Each chunk holds the paragraph text, a section URL built from the id
	    of the nearest enclosing <section>, the headings trail and the
	    figure/table IDs the paragraph links to.

	    Args:
	        soup: parsed document to scan.
	        output: list that receives the chunk dicts (modified in place).
	    """
	    for p in soup.find_all('p'):
	        parent = p.find_parent("section")
	        # A <section> without an id attribute must not raise KeyError;
	        # treat it like a paragraph with no enclosing section.
	        sec_id = parent.get("id", "unknown") if parent else "unknown"

	        output.append({
	            "text": p.get_text(strip=False),
	            "metadata": {
	                "section": f"https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#{sec_id}",
	                "type": "paragraph",
	                "headings": build_headings_trail(p),
	                "referenced_tables": list(scan_links_and_tables(p)),
	                "referee_id": "",
	            }
	        })


	def parse_figures(soup, output):
	    """Append one "figure" chunk to *output* for every <figure> in *soup*.

	    The chunk text carries the image link and the caption; the metadata
	    carries the section URL, the headings trail, linked figure/table IDs
	    and a ``referee_id`` of the form ``fig-<digits>`` taken from the
	    figure's id (``"fig_unknown"`` when the id has no such part).

	    Missing pieces no longer abort the run: a figure without an id uses
	    ``"unknown"``, without an image (or image ``src``) uses
	    ``"No image link found"``, and without a caption uses
	    ``"No caption found"``.

	    Args:
	        soup: parsed document to scan.
	        output: list that receives the chunk dicts (modified in place).
	    """
	    for fig in soup.find_all('figure'):
	        sec_id = fig.get("id", "unknown")

	        figcaption = fig.find('figcaption')
	        caption_p = figcaption.find('p') if figcaption is not None else None
	        if caption_p is not None:
	            fig_caption = caption_p.get_text()
	        elif figcaption is not None:
	            # Caption present but not wrapped in <p>: use its whole text.
	            fig_caption = figcaption.get_text()
	        else:
	            fig_caption = "No caption found"

	        img = fig.find('img')
	        if img is not None:
	            img_src_link = img.get('src', "No image link found")
	        else:
	            img_src_link = "No image link found"

	        referee_id = re.search(r'(fig)-(\d+)', sec_id)

	        # Headings and links are resolved relative to the figure's first
	        # <p>; fall back to the figure itself when it has none.
	        p = fig.find('p')
	        anchor = p if p is not None else fig

	        output.append({
	            "text": f" > Figure: Image link: {img_src_link}-----\nFigure Caption: {fig_caption}\n",
	            "metadata": {
	                "section": f"https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#{sec_id}",
	                "type": "figure",
	                "headings": build_headings_trail(anchor),
	                "referenced_tables": list(scan_links_and_tables(anchor)),
	                "referee_id": referee_id.group(0) if referee_id else "fig_unknown",
	            }
	        })

	def parse_tables(soup, output):
	    """Append one "table" chunk to *output* for every table section in *soup*.

	    Table sections are <section> elements whose class is
	    "tw xbox font-sm". Two layouts are handled:

	    * image tables (the section contains an <img>): the text carries the
	      image link and the concatenated text of the "caption p" <div>s;
	    * HTML tables: the text is produced by ``get_table_metadata``,
	      ``get_table_data`` and ``to_text`` from the ``tables`` module.

	    ``referee_id`` is the ``tbl-<digits>`` part of the section id, or
	    ``"tbl_unknown"`` when the id has no such part.

	    Args:
	        soup: parsed document to scan.
	        output: list that receives the chunk dicts (modified in place).
	    """
	    for table in soup.find_all("section", class_="tw xbox font-sm"):
	        sec_id = table.get("id", "unknown")
	        referee_id = re.search(r'(tbl)-(\d+)', sec_id)

	        img = table.find('img')
	        if img is not None:  # image table
	            img_src_link = img.get('src', "No image link found")
	            # Every caption is followed by one space, including the last.
	            caption = "".join(
	                cap.get_text(strip=True) + " "
	                for cap in table.find_all('div', class_='caption p')
	            )
	            text = f" > Table: Image link: {img_src_link}-----\nTable Caption: {caption}\n"
	            anchor = table.find('p')
	        else:  # HTML table
	            # Only caption, footnotes and label are used from the metadata.
	            (_name, caption, footnotes, _headings,
	             label, _ref_id, _section_url) = get_table_metadata(table, base_url="")
	            table_data = get_table_data(table, footnotes)
	            text = to_text(table_data, label, caption)
	            anchor = table.find('div')

	        if anchor is None:
	            # No inner <p>/<div>: resolve headings from the section itself.
	            anchor = table

	        output.append({
	            "text": text,
	            "metadata": {
	                "section": f"https://pmc.ncbi.nlm.nih.gov/articles/PMC5947163/#{sec_id}",
	                "type": "table",
	                "headings": build_headings_trail(anchor),
	                "referenced_tables": [],
	                "referee_id": referee_id.group(0) if referee_id else "tbl_unknown",
	            }
	        })



	def parse_main_article(soup, output):
	    """Extract figure, table and paragraph chunks from *soup* into *output*.

	    Steps, in order: drop bibliography links, collect figure chunks,
	    collect table chunks, remove the figure/table containers, then collect
	    paragraph chunks from what remains. Figures and tables must be
	    collected before their containers are removed.

	    Returns:
	        The same *output* list.
	    """
	    cleaned = delete_bib_links(soup)

	    for extract in (parse_figures, parse_tables):
	        extract(cleaned, output)

	    paragraph_only_soup = delete_fig_and_tbl_sections(cleaned)
	    parse_paragraph(paragraph_only_soup, output)

	    return output


	def main():
	    """Parse bipolar.html into chunks and write them to guideline_db.json.

	    Reads ``bipolar.html`` from the current directory, builds the title,
	    figure, table and paragraph chunks, prefixes each chunk with its
	    headings trail, appends abbreviation definitions, and dumps the list
	    as JSON.
	    """
	    with open('bipolar.html', 'r', encoding='utf-8') as f:
	        soup = BeautifulSoup(f.read(), "html.parser")

	    output = []
	    # parse_title() returns None when the page has no <h1>; appending that
	    # None would crash the post-processing steps below.
	    # NOTE(review): write_referenced_tables() addresses chunks by fixed
	    # index, so a missing title shifts those positions - confirm inputs.
	    title_chunk = parse_title(soup)
	    if title_chunk is not None:
	        output.append(title_chunk)

	    output = parse_main_article(soup, output)
	    print(f"Parsed {len(output)} paragraphs from the main article.")

	    combined = output
	    prepend_headings_to_text(combined)
	    append_definition(combined)

	    with open("guideline_db.json", "w", encoding="utf-8") as f:
	        json.dump(combined, f, ensure_ascii=False, indent=4)
	    print(f"guideline_db.json for bipolar created with {len(combined)} chunks.")


	def write_referenced_tables():
	    """Add textual table renderings to guideline_db.json and export table chunks.

	    Reads ``guideline_db.json``, appends the stripped contents of
	    ``table12_textual.txt``, ``table14_textual.txt`` and
	    ``table17_textual.txt`` to the text of chunks 21, 23 and 26, writes
	    the database back, and writes chunks 1..33 to
	    ``referenced_table_chunks.json``.

	    The function is safe to re-run: a chunk whose text already ends with
	    the table text is not extended again.
	    """
	    # Chunk index in guideline_db.json -> file with that table's text.
	    textual_tables = {
	        21: "table12_textual.txt",
	        23: "table14_textual.txt",
	        26: "table17_textual.txt",
	    }

	    with open("guideline_db.json", "r", encoding="utf-8") as f:
	        guideline_db = json.load(f)

	    for index, path in textual_tables.items():
	        with open(path, "r", encoding="utf-8") as f:
	            extra = f.read().strip()
	        chunk = guideline_db[index]
	        # The database is rewritten in place, so without this check every
	        # run would append the same table text once more.
	        if not chunk['text'].endswith(extra):
	            chunk['text'] += extra

	    figures_and_tables = guideline_db[1:34]  # Assuming these are the table chunks

	    # Write back to the original file.
	    with open("guideline_db.json", "w", encoding="utf-8") as f:
	        json.dump(guideline_db, f, ensure_ascii=False, indent=4)
	    print("guideline_db.json updated with table 12 14 17 chunks.")

	    with open("referenced_table_chunks.json", "w", encoding="utf-8") as f:
	        json.dump(figures_and_tables, f, ensure_ascii=False, indent=4)
	    print(f"referenced_table_chunks.json created with {len(figures_and_tables)} chunks.")


	if __name__ == "__main__":
	    # Full rebuild of guideline_db.json from bipolar.html (currently disabled):
	    # main()

	    # Post-process the existing guideline_db.json and produce
	    # referenced_table_chunks.json.
	    write_referenced_tables()