| #! /usr/bin/env python3 | |
| import os | |
| import gzip | |
| import json | |
| import re | |
| import subprocess | |
| import xml.etree.ElementTree as ET | |
| from tqdm import tqdm | |
| from urllib.request import urlopen, urlretrieve | |
# Root of the NCBI PubMed annual-baseline file listing (one .xml.gz per shard).
PUBMED_DATASET_BASE_URL = "https://ftp.ncbi.nlm.nih.gov/pubmed/baseline"
# Cap on the number of baseline shards fetched when this file is run as a script.
PUBMED_FILE_LIMIT = 10
def get_pubmed_dataset_size():
    """Count the distinct ``.xml.gz`` shards listed on the PubMed baseline page.

    Returns:
        int: number of unique ``pubmed<YY>n<NNNN>`` archives found in the HTML
        listing, or 0 when the listing cannot be fetched.
    """
    try:
        with urlopen(PUBMED_DATASET_BASE_URL) as response:
            listing = response.read().decode("utf-8")
        # (?!\.) skips companion checksum entries such as pubmed25n0001.xml.gz.md5
        shard_names = re.findall(r"(pubmed\d+n\d+)\.xml\.gz(?!\.)", listing)
        return len(set(shard_names))
    except Exception as err:
        # Best-effort: a failed count only degrades the progress-bar total.
        print(f"Unable to count PubMed files: {err}")
        return 0
def download_pubmed_xml(output_dir, num_files=1, year='25'):
    """Download the first *num_files* gzipped PubMed baseline XML shards.

    Args:
        output_dir: directory to store the ``.xml.gz`` files (created if absent).
        num_files: how many shards (``n0001``..) to fetch.
        year: two-digit baseline year embedded in the shard filenames.

    Returns:
        list[str]: local paths of all requested shards (downloaded or cached).
    """
    os.makedirs(output_dir, exist_ok=True)
    total_dataset_size = get_pubmed_dataset_size()
    files = []
    pbar = tqdm(total=total_dataset_size,
                desc=f"Downloading {num_files}/{total_dataset_size} files in PubMed dataset")
    for i in range(1, num_files + 1):
        filename = f"pubmed{year}n{i:04d}.xml.gz"
        filepath = os.path.join(output_dir, filename)
        if not os.path.exists(filepath):
            # BUG FIX: the original interpolated a stray "(unknown)" placeholder
            # into the URL instead of the shard filename, so nothing valid was
            # ever downloaded.
            urlretrieve(f"{PUBMED_DATASET_BASE_URL}/{filename}", filepath)
        pbar.update(1)
        files.append(filepath)
    pbar.close()
    return files
def parse_pubmed_to_jsonl(xml_files, output_jsonl):
    """Convert gzipped PubMed XML shards into a Pyserini-style JSONL corpus.

    Each output line is ``{"id": PMID, "title": ..., "contents": title+abstract}``.

    Args:
        xml_files: paths to ``.xml.gz`` PubMed baseline shards.
        output_jsonl: destination JSONL file (overwritten).
    """
    with open(output_jsonl, 'w', encoding='utf-8') as out:
        for xml_file in xml_files:
            print(f"Parsing {xml_file}...")
            with gzip.open(xml_file, 'rt', encoding='utf-8') as f:
                root = ET.parse(f).getroot()
            for article in tqdm(root.findall('.//PubmedArticle')):
                pmid_elem = article.find('.//PMID')
                if pmid_elem is None:
                    continue  # no usable document id
                title_elem = article.find('.//ArticleTitle')
                # BUG FIX: .text is None when the title contains nested markup
                # (<i>, <sup>, ...), which produced a literal "None" in contents;
                # itertext() gathers all text inside the element.
                title = ''.join(title_elem.itertext()).strip() if title_elem is not None else ""
                # Structured abstracts split text across several AbstractText
                # sections; the original kept only the first one.
                abstract = ' '.join(
                    ''.join(sec.itertext()).strip()
                    for sec in article.findall('.//Abstract/AbstractText')
                )
                doc = {
                    'id': pmid_elem.text,
                    'title': title,
                    'contents': f"{title} {abstract}".strip(),
                }
                out.write(json.dumps(doc) + '\n')
def download_pubmed(output_jsonl, num_files=1):
    """Ensure *output_jsonl* exists, downloading and parsing PubMed if needed.

    Skips all work when the corpus file is already present; otherwise fetches
    the raw shards into a sibling ``pubmed-xml`` directory and converts them.
    """
    if os.path.exists(output_jsonl):
        print(f"Already downloaded PubMed dataset: {output_jsonl}")
        return
    # Keep raw XML next to (not inside) the corpus directory.
    xml_dir = os.path.join(os.path.dirname(output_jsonl), '../pubmed-xml')
    shards = download_pubmed_xml(xml_dir, num_files=num_files)
    parse_pubmed_to_jsonl(shards, output_jsonl)
def build_index_cmd(input_file, index_dir):
    """Return the Pyserini Lucene indexing command for *input_file*'s directory.

    The command indexes every JSON document found in the directory that
    contains *input_file* and writes the index to *index_dir*.
    """
    collection_dir = os.path.dirname(input_file)
    cmd = ["python", "-m", "pyserini.index.lucene"]
    cmd += ["--collection", "JsonCollection"]
    cmd += ["--input", collection_dir]
    cmd += ["--index", index_dir]
    cmd += ["--generator", "DefaultLuceneDocumentGenerator"]
    cmd += ["--threads", "32"]
    # Store extras needed for reranking / raw-document retrieval.
    cmd += ["--storePositions", "--storeDocvectors", "--storeRaw"]
    return cmd
def build_index(input_file, index_dir, cmd_generator=build_index_cmd):
    """Run the Lucene indexer for *input_file*, skipping non-empty indexes.

    Args:
        input_file: JSONL corpus whose directory will be indexed.
        index_dir: output index location.
        cmd_generator: callable producing the subprocess argv (injectable for tests).

    Raises:
        subprocess.CalledProcessError: if the indexing process exits non-zero.
    """
    index_exists = os.path.exists(index_dir) and bool(os.listdir(index_dir))
    if index_exists:
        print(f"Skipping existing index: {index_dir}")
        return
    parent_dir = os.path.dirname(index_dir) or '.'
    os.makedirs(parent_dir, exist_ok=True)
    subprocess.run(cmd_generator(input_file, index_dir), check=True)
def main(base_data_dir="data", base_index_dir="indexes", num_files=1):
    """Download the PubMed corpus and build a Lucene index over it.

    Args:
        base_data_dir: root directory for the JSONL corpus.
        base_index_dir: root directory for the Lucene index.
        num_files: number of PubMed baseline shards to fetch.
    """
    corpus_jsonl = os.path.join(base_data_dir, "pubmed", "corpus.jsonl")
    download_pubmed(corpus_jsonl, num_files=num_files)
    build_index(corpus_jsonl, os.path.join(base_index_dir, "pubmed"))
if __name__ == "__main__":
    # Script entry point: fetch a capped number of baseline shards and index them.
    main(num_files=PUBMED_FILE_LIMIT)