Spaces:

joytheslothh
/

MediRAG-API

Sleeping

File size: 16,758 Bytes

b6f9fa8

"""
scripts/download_guidelines.py
================================
Downloads clinical guidelines as PubMed Central (PMC) article HTML pages
and chunks them for ingestion into the MediRAG FAISS index.

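Sources:
  - ADA Standards of Medical Care in Diabetes 2024 (12 sections via PMC)
  - 2018 ACC/AHA Guideline on Management of Blood Cholesterol
  - 2019 ACC/AHA Guideline on Primary Prevention of Cardiovascular Disease
  - More guidelines can be added to GUIDELINE_SOURCES below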

Chunking strategy (based on structural analysis):
  - Primary boundary: H2 clinical topic + Recommendations block + evidence narrative
  - Never split a Recommendations block
  - Store evidence grades (A/B/C/E) and recommendation numbers as metadata

Usage:
    python scripts/download_guidelines.py
    python scripts/download_guidelines.py --source ada_diabetes
    python scripts/download_guidelines.py --dry-run
"""
from __future__ import annotations

import argparse
import json
import logging
import re
import time
import uuid
from pathlib import Path

import requests

# Configure logging at import time: INFO level, timestamped "[LEVEL] message" lines.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
# Module-level logger for this script's progress and warning messages.
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Guideline sources — PMC IDs for ADA 2024 Standards of Care
# ---------------------------------------------------------------------------
# Maps a CLI source key (the --source choices) to one guideline definition.
# Per-guideline fields, as consumed by chunk_guideline_text():
#   name, source, pub_year, pub_type, journal -> copied into every chunk's metadata
#   key                                       -> short slug embedded in chunk_id / doc_id
#   sections                                  -> one entry per PMC article to fetch:
#       pmcid   -> substituted into PMC_HTML_URL by fetch_pmc_xml()
#       section -> section number string stored in chunk metadata and IDs
#       title   -> section title used in the chunk prefix and log messages
GUIDELINE_SOURCES = {
    "ada_diabetes": {
        "name": "ADA Standards of Medical Care in Diabetes 2024",
        "key": "ada",
        "pub_type": "clinical_guideline",
        "source": "American Diabetes Association",
        "pub_year": 2024,
        "journal": "Diabetes Care",
        # Entries are fetched in list order; section 1 is listed last and
        # sections 3, 10, 11 and 12 are not configured here.
        "sections": [
            {"pmcid": "PMC10725812", "section": "2", "title": "Diagnosis and Classification of Diabetes"},
            {"pmcid": "PMC10725809", "section": "4", "title": "Comprehensive Medical Evaluation and Assessment of Comorbidities"},
            {"pmcid": "PMC10725816", "section": "5", "title": "Facilitating Positive Health Behaviors and Well-being"},
            {"pmcid": "PMC10725808", "section": "6", "title": "Glycemic Goals and Hypoglycemia"},
            {"pmcid": "PMC10725813", "section": "7", "title": "Diabetes Technology"},
            {"pmcid": "PMC10725806", "section": "8", "title": "Obesity and Weight Management for the Prevention and Treatment of Type 2 Diabetes"},
            {"pmcid": "PMC10725810", "section": "9", "title": "Pharmacologic Approaches to Glycemic Treatment"},
            {"pmcid": "PMC10725804", "section": "13", "title": "Older Adults"},
            {"pmcid": "PMC10725814", "section": "14", "title": "Children and Adolescents"},
            {"pmcid": "PMC10725801", "section": "15", "title": "Management of Diabetes in Pregnancy"},
            {"pmcid": "PMC10725815", "section": "16", "title": "Diabetes Care in the Hospital"},
            {"pmcid": "PMC10725798", "section": "1", "title": "Improving Care and Promoting Health in Populations"},
        ],
    },
    "acc_aha_cholesterol": {
        "name": "2018 ACC/AHA Guideline on Management of Blood Cholesterol",
        "key": "acc_aha_chol",
        "pub_type": "clinical_guideline",
        "source": "American College of Cardiology/American Heart Association",
        "pub_year": 2018,
        "journal": "Circulation",
        "sections": [
            # PMC7403606: Grundy et al. 2018 executive summary, freely accessible full text
            {"pmcid": "PMC7403606", "section": "1", "title": "Management of Blood Cholesterol — Statin Therapy and LDL Targets"},
        ],
    },
    "acc_aha_prevention": {
        "name": "2019 ACC/AHA Guideline on Primary Prevention of Cardiovascular Disease",
        "key": "acc_aha_prev",
        "pub_type": "clinical_guideline",
        "source": "American College of Cardiology/American Heart Association",
        "pub_year": 2019,
        "journal": "Journal of the American College of Cardiology",
        "sections": [
            # PMC7685565: Arnett et al. 2019, full guideline open access
            {"pmcid": "PMC7685565", "section": "1", "title": "Primary Prevention — Blood Pressure, Cholesterol, Aspirin, Lifestyle"},
        ],
    },
}

# URL template for a BioC JSON endpoint and the E-utilities efetch URL.
# NOTE(review): neither constant is referenced by the code visible in this
# file; fetching goes through PMC_HTML_URL below — confirm before removing.
PMC_API = "https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{pmcid}/unicode"
PMC_EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

# Evidence grade pattern: single letter A/B/C/E at end of recommendation
# (a standalone letter followed only by optional trailing whitespace).
_GRADE_RE = re.compile(r'\b([ABCE])\s*$')
# Recommendation number pattern: e.g. "9.18", "2.1a", "6.5b"
# (anchored at the start of the line and followed by whitespace).
_REC_NUM_RE = re.compile(r'^(\d+\.\d+[a-z]?)\s+')


# Article page URL template; fetch_pmc_xml() fills in {pmcid}.
PMC_HTML_URL = "https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/"


def fetch_pmc_xml(pmcid: str) -> str | None:
    """
    Download one PMC article page and return its extracted text.

    Despite the name, the HTML article page (PMC_HTML_URL) is requested and
    parsed with lxml, then handed to _extract_pmc_html_text. This is a
    best-effort call: any failure (missing lxml, network error, HTTP error
    status, parse error) is logged as a warning and reported as None.
    """
    try:
        # Imported lazily, inside the try, so a missing lxml is reported
        # like any other fetch failure.
        from lxml import html as lxml_html

        response = requests.get(
            PMC_HTML_URL.format(pmcid=pmcid),
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=30,
        )
        response.raise_for_status()
        document = lxml_html.fromstring(response.content)
        return _extract_pmc_html_text(document)
    except Exception as exc:
        logger.warning("PMC HTML fetch failed for %s: %s", pmcid, exc)
        return None


def _extract_pmc_html_text(tree) -> str:
    """
    Extract clean structured text from PMC article HTML.

    Uses lxml XPath to navigate the first <article> element, or the whole
    tree when the page has no <article>. Output conventions:

      - Headings (h1-h4) become Markdown-style lines: "##" for a top-level
        section plus one extra "#" per level of section nesting.
      - Paragraphs and list items ("• " prefixed) become plain lines.
      - Tables become an optional "[Table: caption]" line followed by
        pipe-separated rows.

    Body lines of 10 characters or fewer are dropped, and text that was
    already emitted is skipped (deduplication exists because PMC renders
    recommendation paragraphs twice, per the original author's note).
    Headings are appended directly and are not deduplicated.

    Args:
        tree: Parsed lxml HTML element for the article page.

    Returns:
        The collected lines joined with blank lines between them.
    """
    # Get main article element
    articles = tree.xpath('//article')
    root = articles[0] if articles else tree

    lines = []
    seen_texts: set[str] = set()  # Deduplication for repeated elements

    def clean(el) -> str:
        # Collapse every run of whitespace in the element's text to one space.
        return " ".join(el.text_content().split()).strip()

    def add_line(text: str) -> None:
        # Keep only non-trivial text that has not been emitted before.
        if text and len(text) > 10 and text not in seen_texts:
            seen_texts.add(text)
            lines.append(text)

    def extract_table(table_el):
        """Extract a table element as readable pipe-separated rows."""
        caption = table_el.xpath('.//caption')
        if caption:
            add_line(f"[Table: {clean(caption[0])}]")
        for tr in table_el.xpath('.//tr'):
            cells = [" ".join(td.text_content().split()).strip()
                     for td in tr.xpath('.//td | .//th')]
            row = " | ".join(c for c in cells if c)
            if row:
                add_line(row)

    def process_section(sec, depth=0):
        # Deep-search for tables first (they may be nested inside divs/figures)
        for table in sec.xpath('.//table'):
            # Only process tables whose nearest section ancestor is this sec
            # (the ancestor axis is returned in document order, so the last
            # entry is the closest enclosing <section>).
            ancestors = table.xpath('ancestor::section')
            if not ancestors or ancestors[-1] == sec:
                extract_table(table)

        for child in sec:
            # Comments / processing instructions have a non-string tag.
            tag = child.tag.lower() if isinstance(child.tag, str) else ""

            if tag in ("h1", "h2", "h3", "h4"):
                text = clean(child)
                if text and text not in ("Abstract", "References", "Footnotes"):
                    lines.append(f"\n{'#' * (depth + 2)} {text}")

            elif tag == "p":
                text = clean(child)
                add_line(text)

            elif tag in ("ul", "ol"):
                for li in child.xpath('.//li'):
                    text = clean(li)
                    add_line(f"• {text}")

            elif tag == "section":
                process_section(child, depth + 1)

            elif tag == "table":
                pass  # Already handled above via deep-search

            elif tag == "div":
                # Recurse into divs that might contain content
                cls = child.get("class", "")
                if any(k in cls for k in ("content", "body", "text", "article")):
                    process_section(child, depth)

    for sec in root.xpath('.//section'):
        # Only process top-level sections (not deeply nested)
        parent = sec.getparent()
        if parent is not None and parent.tag.lower() not in ("section",):
            process_section(sec)

    # If no sections found, fall back to all paragraphs.
    # The XPath is evaluated relative to `root`, which is either the <article>
    # itself or a tree without any <article>; the previous
    # './/article//p | .//p[@class]' therefore only ever matched paragraphs
    # carrying a class attribute. './/p' matches every descendant paragraph.
    if len(lines) < 5:
        for p in root.xpath('.//p'):
            add_line(clean(p))

    return "\n\n".join(line for line in lines if line.strip())


def extract_recommendations(text: str) -> list[dict]:
    """
    Collect numbered recommendations from guideline text.

    Each line is stripped and tested against _REC_NUM_RE; lines that do not
    start with a recommendation number are ignored. For a matching line the
    result holds the number, the remaining text, and the evidence grade
    found by _GRADE_RE at the end of that text ("E" when none is found).

    Returns:
        A list of {"number", "text", "grade"} dicts in order of appearance.
    """
    found: list[dict] = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        number_match = _REC_NUM_RE.match(stripped)
        if number_match is None:
            continue

        body = stripped[number_match.end():].strip()
        grade_match = _GRADE_RE.search(body)
        found.append({
            "number": number_match.group(1),
            "text": body,
            # No trailing grade letter: fall back to "E".
            "grade": "E" if grade_match is None else grade_match.group(1),
        })
    return found


def chunk_guideline_text(
    text: str,
    section_meta: dict,
    guideline_meta: dict,
    max_chunk_chars: int = 2000,
) -> list[dict]:
    """
    Chunk guideline text at ## heading boundaries produced by _extract_pmc_html_text.

    Each chunk = one heading (1-4 leading '#') + its paragraphs/recommendations.
    Blocks longer than ``max_chunk_chars`` are split at blank-line paragraph
    boundaries; a single paragraph is never split, so a chunk may exceed the
    limit when one paragraph alone is longer than it.

    Args:
        text: Extracted article text with Markdown-style heading lines.
        section_meta: Section entry providing "section" and "title".
        guideline_meta: Guideline entry providing "name", "source",
            "pub_year" and "pub_type"; "key" defaults to "ada" and
            "journal" to "Diabetes Care" when absent.
        max_chunk_chars: Soft upper bound on chunk content length.

    Returns:
        Chunk dicts in document order. ``chunk_index`` is the position in the
        returned list and ``total_chunks`` its length. ``chunk_id`` contains
        a random uuid fragment, so it differs between runs.

    Raises:
        KeyError: If a required metadata key is missing.
    """
    chunks: list[dict] = []
    section_num = section_meta["section"]
    section_title = section_meta["title"]
    guideline_name = guideline_meta["name"]
    source = guideline_meta["source"]
    pub_year = guideline_meta["pub_year"]
    pub_type = guideline_meta["pub_type"]
    source_key = guideline_meta.get("key", "ada")
    journal = guideline_meta.get("journal", "Diabetes Care")

    # Headings that introduce article boilerplate rather than guideline text.
    skipped_headings = ("Abstract", "References", "Footnotes", "Author notes",
                        "Conflicts of interest", "Acknowledgments")
    heading_re = re.compile(r'^(#{1,4})\s+(.+)$', re.MULTILINE)
    heading_matches = list(heading_re.finditer(text))

    if not heading_matches:
        # No headings found — treat the whole text as one block (split by
        # size below if needed), titled with the section title.
        blocks = [(section_title, text)]
    else:
        blocks = []
        # Each block runs from the end of its heading line to the start of
        # the next heading (or the end of the text for the last one).
        block_ends = [m.start() for m in heading_matches[1:]] + [len(text)]
        for match, end in zip(heading_matches, block_ends):
            heading_text = match.group(2).strip()
            if heading_text in skipped_headings:
                continue
            content = text[match.end():end].strip()
            if content:
                blocks.append((heading_text, content))

    def make_chunk(heading: str, content: str, part_idx: int = 0) -> dict:
        # Recommendation numbers/grades are stored as metadata and the set of
        # grades is summarised in the chunk's text prefix.
        recs = extract_recommendations(content)
        rec_nums = [r["number"] for r in recs]
        grades = {r["number"]: r["grade"] for r in recs}
        grade_summary = "/".join(sorted({r["grade"] for r in recs})) if recs else ""

        prefix = f"[{guideline_name} | Section {section_num}: {section_title} | {heading}]"
        if grade_summary:
            prefix += f" [Evidence: {grade_summary}]"

        return {
            "chunk_id":           f"guideline_{source_key}_{section_num}_{uuid.uuid4().hex[:8]}_{part_idx}",
            "doc_id":             f"guideline_{source_key}_section_{section_num}",
            "chunk_text":         f"{prefix}\n{content}",
            "chunk_index":        len(chunks),
            "total_chunks":       0,  # patched once all chunks are known
            "pub_type":           pub_type,
            "source":             source,
            "title":              f"{guideline_name} — Section {section_num}: {heading}",
            "pub_year":           pub_year,
            "journal":            journal,
            "section_number":     section_num,
            "section_title":      section_title,
            "h2_heading":         heading,
            "recommendation_numbers": rec_nums,
            "evidence_grades":    grades,
        }

    for heading, content in blocks:
        if len(content) <= max_chunk_chars:
            chunks.append(make_chunk(heading, content))
            continue

        # Split long blocks at paragraph boundaries
        paras = [p.strip() for p in re.split(r'\n{2,}', content) if p.strip()]
        current: list[str] = []
        part = 0
        for para in paras:
            current.append(para)
            # Flush the paragraphs gathered *before* this one once the limit
            # is reached. Requiring at least one earlier paragraph prevents
            # emitting an empty chunk when a single paragraph is itself
            # longer than max_chunk_chars.
            if len(current) > 1 and len("\n\n".join(current)) >= max_chunk_chars:
                chunks.append(make_chunk(heading, "\n\n".join(current[:-1]), part))
                current = [para]
                part += 1
        if current:
            chunks.append(make_chunk(heading, "\n\n".join(current), part))

    for chunk in chunks:
        chunk["total_chunks"] = len(chunks)

    return chunks


def download_guidelines(
    source_key: str,
    output_path: str,
    dry_run: bool = False,
    append: bool = False,
) -> None:
    """
    Fetch, chunk and write one guideline source as JSON Lines.

    Every section of the source is fetched with fetch_pmc_xml and chunked
    with chunk_guideline_text. Sections that yield no text, or fewer than
    200 characters, are skipped and reported in a final warning.

    Args:
        source_key: Key into GUIDELINE_SOURCES.
        output_path: Destination .jsonl file; missing parent directories are
            created before writing.
        dry_run: When true, fetch and chunk but leave the output file
            untouched (it is neither created nor truncated); a sample of the
            first chunk of each section is logged instead.
        append: When true, add to an existing output file instead of
            replacing it. Used to combine several sources in one file.

    Raises:
        KeyError: If ``source_key`` is not in GUIDELINE_SOURCES.
    """
    # Function-scope import keeps this change self-contained. nullcontext
    # stands in for the output file during a dry run.
    from contextlib import nullcontext

    source = GUIDELINE_SOURCES[source_key]
    out = Path(output_path)

    if dry_run:
        sink_cm = nullcontext()
    else:
        out.parent.mkdir(parents=True, exist_ok=True)
        sink_cm = open(out, "a" if append else "w", encoding="utf-8")

    total_chunks = 0
    failed_sections: list[str] = []

    with sink_cm as sink:
        for section in source["sections"]:
            pmcid = section["pmcid"]
            logger.info("Fetching %s — Section %s: %s", pmcid, section["section"], section["title"])

            text = fetch_pmc_xml(pmcid)
            # Be polite to NCBI: pause after every request, whether it
            # succeeded, failed, or this is a dry run.
            time.sleep(0.5)

            if not text or len(text) < 200:
                logger.warning("No text retrieved for %s — skipping", pmcid)
                failed_sections.append(section["title"])
                continue

            logger.info("  Retrieved %d chars", len(text))

            chunks = chunk_guideline_text(text, section, source)
            logger.info("  → %d chunks extracted", len(chunks))

            if dry_run:
                if chunks:
                    logger.info("  Sample chunk:\n%s\n...", chunks[0]["chunk_text"][:300])
                continue

            for chunk in chunks:
                sink.write(json.dumps(chunk) + "\n")
            total_chunks += len(chunks)

    if not dry_run:
        logger.info("Done. %d total chunks written to %s", total_chunks, out)
    if failed_sections:
        logger.warning("Failed sections: %s", failed_sections)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--source", default=None,
                        choices=list(GUIDELINE_SOURCES.keys()),
                        help="Guideline source to download (default: all sources)")
    # Kept for backward compatibility; omitting --source already selects all.
    parser.add_argument("--all", action="store_true",
                        help="Download all guideline sources")
    parser.add_argument("--output", default="data/guidelines_chunks.jsonl")
    parser.add_argument("--dry-run", action="store_true",
                        help="Fetch and parse but don't write output")
    args = parser.parse_args()

    sources_to_run = list(GUIDELINE_SOURCES.keys()) if (args.all or args.source is None) else [args.source]

    for position, source_key in enumerate(sources_to_run):
        logger.info("Downloading: %s", GUIDELINE_SOURCES[source_key]["name"])
        # The first source replaces the output file; every later source is
        # appended so a multi-source run ends up in a single file.
        download_guidelines(
            source_key,
            args.output,
            dry_run=args.dry_run,
            append=position > 0,
        )