| #! /usr/bin/env python3 | |
| import os | |
| import gzip | |
| import json | |
| import re | |
| import subprocess | |
| import xml.etree.ElementTree as ET | |
| from tqdm import tqdm | |
| from urllib.request import urlopen, urlretrieve | |
# Root of the NCBI PubMed annual-baseline file listing (one .xml.gz per shard).
PUBMED_DATASET_BASE_URL = "https://ftp.ncbi.nlm.nih.gov/pubmed/baseline"
# Cap on the number of baseline shards fetched when this file is run as a script.
PUBMED_FILE_LIMIT = 10
def get_pubmed_dataset_size():
    """Count the distinct ``.xml.gz`` shards listed on the PubMed baseline page.

    Returns:
        int: number of unique ``pubmed<YY>n<NNNN>`` archives found in the HTML
        listing, or 0 when the listing cannot be fetched.
    """
    try:
        with urlopen(PUBMED_DATASET_BASE_URL) as response:
            listing = response.read().decode("utf-8")
        # (?!\.) skips companion checksum entries such as pubmed25n0001.xml.gz.md5
        shard_names = re.findall(r"(pubmed\d+n\d+)\.xml\.gz(?!\.)", listing)
        return len(set(shard_names))
    except Exception as err:
        # Best-effort: a failed count only degrades the progress-bar total.
        print(f"Unable to count PubMed files: {err}")
        return 0
def download_pubmed_xml(output_dir, num_files=1, year='25'):
    """Download the first *num_files* gzipped PubMed baseline XML shards.

    Args:
        output_dir: directory to store the ``.xml.gz`` files (created if absent).
        num_files: how many shards (``n0001``..) to fetch.
        year: two-digit baseline year embedded in the shard filenames.

    Returns:
        list[str]: local paths of all requested shards (downloaded or cached).
    """
    os.makedirs(output_dir, exist_ok=True)
    total_dataset_size = get_pubmed_dataset_size()
    files = []
    pbar = tqdm(total=total_dataset_size,
                desc=f"Downloading {num_files}/{total_dataset_size} files in PubMed dataset")
    for i in range(1, num_files + 1):
        filename = f"pubmed{year}n{i:04d}.xml.gz"
        filepath = os.path.join(output_dir, filename)
        if not os.path.exists(filepath):
            # BUG FIX: the original interpolated a stray "(unknown)" placeholder
            # into the URL instead of the shard filename, so nothing valid was
            # ever downloaded.
            urlretrieve(f"{PUBMED_DATASET_BASE_URL}/{filename}", filepath)
        pbar.update(1)
        files.append(filepath)
    pbar.close()
    return files
def parse_pubmed_to_jsonl(xml_files, output_jsonl):
    """Convert gzipped PubMed XML shards into a Pyserini-style JSONL corpus.

    Each output line is ``{"id": PMID, "title": ..., "contents": title+abstract}``.

    Args:
        xml_files: paths to ``.xml.gz`` PubMed baseline shards.
        output_jsonl: destination JSONL file (overwritten).
    """
    with open(output_jsonl, 'w', encoding='utf-8') as out:
        for xml_file in xml_files:
            print(f"Parsing {xml_file}...")
            with gzip.open(xml_file, 'rt', encoding='utf-8') as f:
                root = ET.parse(f).getroot()
            for article in tqdm(root.findall('.//PubmedArticle')):
                pmid_elem = article.find('.//PMID')
                if pmid_elem is None:
                    continue  # no usable document id
                title_elem = article.find('.//ArticleTitle')
                # BUG FIX: .text is None when the title contains nested markup
                # (<i>, <sup>, ...), which produced a literal "None" in contents;
                # itertext() gathers all text inside the element.
                title = ''.join(title_elem.itertext()).strip() if title_elem is not None else ""
                # Structured abstracts split text across several AbstractText
                # sections; the original kept only the first one.
                abstract = ' '.join(
                    ''.join(sec.itertext()).strip()
                    for sec in article.findall('.//Abstract/AbstractText')
                )
                doc = {
                    'id': pmid_elem.text,
                    'title': title,
                    'contents': f"{title} {abstract}".strip(),
                }
                out.write(json.dumps(doc) + '\n')
def download_pubmed(output_jsonl, num_files=1):
    """Ensure *output_jsonl* exists, downloading and parsing PubMed if needed.

    Skips all work when the corpus file is already present; otherwise fetches
    the raw shards into a sibling ``pubmed-xml`` directory and converts them.
    """
    if os.path.exists(output_jsonl):
        print(f"Already downloaded PubMed dataset: {output_jsonl}")
        return
    # Keep raw XML next to (not inside) the corpus directory.
    xml_dir = os.path.join(os.path.dirname(output_jsonl), '../pubmed-xml')
    shards = download_pubmed_xml(xml_dir, num_files=num_files)
    parse_pubmed_to_jsonl(shards, output_jsonl)
def build_index_cmd(input_file, index_dir):
    """Return the Pyserini Lucene indexing command for *input_file*'s directory.

    The command indexes every JSON document found in the directory that
    contains *input_file* and writes the index to *index_dir*.
    """
    collection_dir = os.path.dirname(input_file)
    cmd = ["python", "-m", "pyserini.index.lucene"]
    cmd += ["--collection", "JsonCollection"]
    cmd += ["--input", collection_dir]
    cmd += ["--index", index_dir]
    cmd += ["--generator", "DefaultLuceneDocumentGenerator"]
    cmd += ["--threads", "32"]
    # Store extras needed for reranking / raw-document retrieval.
    cmd += ["--storePositions", "--storeDocvectors", "--storeRaw"]
    return cmd
def build_index(input_file, index_dir, cmd_generator=build_index_cmd):
    """Run the Lucene indexer for *input_file*, skipping non-empty indexes.

    Args:
        input_file: JSONL corpus whose directory will be indexed.
        index_dir: output index location.
        cmd_generator: callable producing the subprocess argv (injectable for tests).

    Raises:
        subprocess.CalledProcessError: if the indexing process exits non-zero.
    """
    index_exists = os.path.exists(index_dir) and bool(os.listdir(index_dir))
    if index_exists:
        print(f"Skipping existing index: {index_dir}")
        return
    parent_dir = os.path.dirname(index_dir) or '.'
    os.makedirs(parent_dir, exist_ok=True)
    subprocess.run(cmd_generator(input_file, index_dir), check=True)
def main(base_data_dir="data", base_index_dir="indexes", num_files=1):
    """Download the PubMed corpus and build a Lucene index over it.

    Args:
        base_data_dir: root directory for the JSONL corpus.
        base_index_dir: root directory for the Lucene index.
        num_files: number of PubMed baseline shards to fetch.
    """
    corpus_jsonl = os.path.join(base_data_dir, "pubmed", "corpus.jsonl")
    download_pubmed(corpus_jsonl, num_files=num_files)
    build_index(corpus_jsonl, os.path.join(base_index_dir, "pubmed"))
if __name__ == "__main__":
    # Script entry point: fetch a capped number of baseline shards and index them.
    main(num_files=PUBMED_FILE_LIMIT)