#!/usr/bin/env python3
import os
import gzip
import json
import re
import subprocess
import xml.etree.ElementTree as ET
from tqdm import tqdm
from urllib.request import urlopen, urlretrieve
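# Pipeline: download PubMed baseline archives, flatten them into a Pyserini
# JsonCollection (one JSON document per line), then build a Lucene index.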
PUBMED_DATASET_BASE_URL = "https://ftp.ncbi.nlm.nih.gov/pubmed/baseline"
PUBMED_FILE_LIMIT = 10
def get_pubmed_dataset_size():
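    """Count the unique ``pubmed*.xml.gz`` archives listed on the baseline page.

    The negative lookahead skips ``.xml.gz.md5`` checksum files, and the set()
    collapses duplicate matches (each filename appears in both the link href
    and the link text of the HTML listing).
    """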
try:
with urlopen(PUBMED_DATASET_BASE_URL) as response:
html = response.read().decode("utf-8")
files = re.findall(r"(pubmed\d+n\d+)\.xml\.gz(?!\.)", html)
unique_files = set(files)
return len(unique_files)
except Exception as e:
print(f"Unable to count PubMed files: {e}")
return 0
def download_pubmed_xml(output_dir, num_files=1, year='25'):
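    """Download the first ``num_files`` baseline archives for the given
    two-digit year (``'25'`` = the 2025 baseline), skipping files already on disk.
    """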
os.makedirs(output_dir, exist_ok=True)
total_dataset_size = get_pubmed_dataset_size()
files = []
    # Track progress against the files we actually request; the description
    # still reports how large the full baseline is.
    pbar = tqdm(total=num_files, desc=f"Downloading {num_files}/{total_dataset_size} files in PubMed dataset")
    for i in range(1, num_files + 1):
        filename = f"pubmed{year}n{i:04d}.xml.gz"
        filepath = os.path.join(output_dir, filename)
        if not os.path.exists(filepath):
            urlretrieve(f"{PUBMED_DATASET_BASE_URL}/{filename}", filepath)
        pbar.update(1)  # advance even when the file was already cached on disk
files.append(filepath)
pbar.close()
return files
def parse_pubmed_to_jsonl(xml_files, output_jsonl):
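    """Convert gzipped PubMed XML into Pyserini's JsonCollection format:
    one JSON object per line with ``id``, ``title``, and ``contents`` fields.
    """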
with open(output_jsonl, 'w') as out:
for xml_file in xml_files:
print(f"Parsing {xml_file}...")
with gzip.open(xml_file, 'rt', encoding='utf-8') as f:
tree = ET.parse(f)
root = tree.getroot()
for article in tqdm(root.findall('.//PubmedArticle')):
pmid_elem = article.find('.//PMID')
title_elem = article.find('.//ArticleTitle')
                    # Structured abstracts contain several AbstractText sections,
                    # so collect them all rather than only the first.
                    abstract_elems = article.findall('.//Abstract/AbstractText')
                    if pmid_elem is not None:
                        # itertext() flattens inline markup (<i>, <sup>, ...) and cannot
                        # return None, unlike .text on elements that have child nodes.
                        title = "".join(title_elem.itertext()).strip() if title_elem is not None else ""
                        abstract = " ".join("".join(e.itertext()).strip() for e in abstract_elems)
doc = {
'id': pmid_elem.text,
'title': title,
'contents': f"{title} {abstract}".strip()
}
out.write(json.dumps(doc) + '\n')
def download_pubmed(output_jsonl, num_files=1):
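    """Fetch and parse the PubMed corpus unless ``output_jsonl`` already exists."""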
if os.path.exists(output_jsonl):
print(f"Already downloaded PubMed dataset: {output_jsonl}")
        return
    # Create the corpus directory explicitly instead of relying on the '..'
    # component of xml_dir to create it as a side effect of makedirs below.
    os.makedirs(os.path.dirname(output_jsonl) or '.', exist_ok=True)
    xml_dir = os.path.join(os.path.dirname(output_jsonl), '../pubmed-xml')
xml_files = download_pubmed_xml(xml_dir, num_files=num_files)
parse_pubmed_to_jsonl(xml_files, output_jsonl)
def build_index_cmd(input_file, index_dir):
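    """Build the Pyserini CLI command that indexes the JSONL corpus directory
    into a Lucene index with positions, docvectors, and raw documents stored.
    """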
return [
"python", "-m", "pyserini.index.lucene",
"--collection", "JsonCollection",
"--input", os.path.dirname(input_file),
"--index", index_dir,
"--generator", "DefaultLuceneDocumentGenerator",
"--threads", "32",
"--storePositions",
"--storeDocvectors",
"--storeRaw",
]
def build_index(input_file, index_dir, cmd_generator=build_index_cmd):
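    """Run the indexing command unless ``index_dir`` already contains files."""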
if os.path.exists(index_dir) and os.listdir(index_dir):
print(f"Skipping existing index: {index_dir}")
return
os.makedirs(os.path.dirname(index_dir) or '.', exist_ok=True)
cmd = cmd_generator(input_file, index_dir)
subprocess.run(cmd, check=True)
def main(base_data_dir="data", base_index_dir="indexes", num_files=1):
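    """Download the PubMed corpus and build a Lucene index over it."""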
corpus_jsonl = os.path.join(base_data_dir, "pubmed", "corpus.jsonl")
index_dir = os.path.join(base_index_dir, "pubmed")
download_pubmed(corpus_jsonl, num_files=num_files)
build_index(corpus_jsonl, index_dir)
if __name__ == "__main__":
main(num_files=PUBMED_FILE_LIMIT)
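# A minimal retrieval sketch over the resulting index (assumes Pyserini and a
# JDK are installed; the query string is illustrative):
#
#   from pyserini.search.lucene import LuceneSearcher
#   searcher = LuceneSearcher("indexes/pubmed")
#   hits = searcher.search("BRCA1 breast cancer", k=10)
#   print(hits[0].docid, hits[0].score)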