#!/usr/bin/env python3
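"""Download a slice of the PubMed annual baseline, convert it to a JSONL
corpus, and build a Lucene index over it with pyserini."""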

import os
import gzip
import json
import re
import subprocess
import xml.etree.ElementTree as ET

from tqdm import tqdm
from urllib.request import urlopen, urlretrieve


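# Directory listing for the NCBI annual baseline; archives are named pubmed<YY>n<NNNN>.xml.gz.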
PUBMED_DATASET_BASE_URL = "https://ftp.ncbi.nlm.nih.gov/pubmed/baseline"

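# Number of baseline archives main() downloads by default.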
PUBMED_FILE_LIMIT = 10


def get_pubmed_dataset_size():
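    """Count the unique .xml.gz archives listed on the baseline index page (0 on failure)."""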
    try:
        with urlopen(PUBMED_DATASET_BASE_URL) as response:
            html = response.read().decode("utf-8")

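            # Match names like pubmed25n0001.xml.gz; the negative lookahead skips
            # companion checksum entries such as pubmed25n0001.xml.gz.md5.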
            files = re.findall(r"(pubmed\d+n\d+)\.xml\.gz(?!\.)", html)
            unique_files = set(files)

            return len(unique_files)

    except Exception as e:
        print(f"Unable to count PubMed files: {e}")

        return 0


def download_pubmed_xml(output_dir, num_files=1, year='25'):
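    """Download the first num_files baseline archives into output_dir,
    skipping any already present, and return their local paths."""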
    os.makedirs(output_dir, exist_ok=True)

    total_dataset_size = get_pubmed_dataset_size()

    files = []
    pbar = tqdm(total=num_files, desc=f"Downloading {num_files} of {total_dataset_size} PubMed baseline files")

    for i in range(1, num_files + 1):
        filename = f"pubmed{year}n{i:04d}.xml.gz"
        filepath = os.path.join(output_dir, filename)

        if not os.path.exists(filepath):
            urlretrieve(f"{PUBMED_DATASET_BASE_URL}/{filename}", filepath)

        pbar.update(1)

        files.append(filepath)

    pbar.close()

    return files


def parse_pubmed_to_jsonl(xml_files, output_jsonl):
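    """Write one JSON document per article, using the 'id'/'contents'
    layout that pyserini's JsonCollection expects."""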
    with open(output_jsonl, 'w') as out:
        for xml_file in xml_files:
            print(f"Parsing {xml_file}...")
            with gzip.open(xml_file, 'rt', encoding='utf-8') as f:
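                # ET.parse loads the whole decompressed file into memory;
                # for large-scale runs, ET.iterparse would stream instead.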
                tree = ET.parse(f)
                root = tree.getroot()

                for article in tqdm(root.findall('.//PubmedArticle')):
                    pmid_elem = article.find('.//PMID')
                    title_elem = article.find('.//ArticleTitle')
                    abstract_elem = article.find('.//Abstract/AbstractText')

                    if pmid_elem is not None:
                        # .text can be None (e.g. titles with inline markup), so fall back to ""
                        title = (title_elem.text or "") if title_elem is not None else ""
                        abstract = (abstract_elem.text or "") if abstract_elem is not None else ""

                        doc = {
                            'id': pmid_elem.text,
                            'title': title,
                            'contents': f"{title} {abstract}".strip()
                        }
                        out.write(json.dumps(doc) + '\n')


def download_pubmed(output_jsonl, num_files=1):
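    """Produce the JSONL corpus at output_jsonl, downloading and parsing the
    raw XML archives (kept in a sibling pubmed-xml/ directory) if needed."""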
    if os.path.exists(output_jsonl):
        print(f"Already downloaded PubMed dataset: {output_jsonl}")

        return

    xml_dir = os.path.join(os.path.dirname(output_jsonl), '../pubmed-xml')
    xml_files = download_pubmed_xml(xml_dir, num_files=num_files)
    parse_pubmed_to_jsonl(xml_files, output_jsonl)


def build_index_cmd(input_file, index_dir):
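    """Return the pyserini indexing command; JsonCollection indexes a whole
    directory, so the parent directory of input_file is passed as --input."""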
    return [
        "python", "-m", "pyserini.index.lucene",
        "--collection", "JsonCollection",
        "--input", os.path.dirname(input_file),
        "--index", index_dir,
        "--generator", "DefaultLuceneDocumentGenerator",
        "--threads", "32",
        "--storePositions",
        "--storeDocvectors",
        "--storeRaw",
    ]


def build_index(input_file, index_dir, cmd_generator=build_index_cmd):
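    """Run the indexing command unless index_dir already contains files."""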
    if os.path.exists(index_dir) and os.listdir(index_dir):
        print(f"Skipping existing index: {index_dir}")

        return

    os.makedirs(os.path.dirname(index_dir) or '.', exist_ok=True)

    cmd = cmd_generator(input_file, index_dir)

    subprocess.run(cmd, check=True)


def main(base_data_dir="data", base_index_dir="indexes", num_files=1):
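    """Download num_files PubMed baseline archives and build a Lucene index over them."""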
    corpus_jsonl = os.path.join(base_data_dir, "pubmed", "corpus.jsonl")
    index_dir = os.path.join(base_index_dir, "pubmed")

    download_pubmed(corpus_jsonl, num_files=num_files)

    build_index(corpus_jsonl, index_dir)


if __name__ == "__main__":
    main(num_files=PUBMED_FILE_LIMIT)
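

# A minimal sketch of querying the resulting index (assumes pyserini is
# installed; the query string is illustrative, not from this script):
#
#   from pyserini.search.lucene import LuceneSearcher
#
#   searcher = LuceneSearcher("indexes/pubmed")
#   hits = searcher.search("insulin resistance", k=10)
#   for hit in hits:
#       print(hit.docid, hit.score)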