""" FR-20: build_rxnorm_cache.py — Offline Drug Name Normalisation Cache Builder ============================================================================= Accepts EITHER: A) DrugBank vocabulary CSV (--drugbank-csv) ← recommended, immediate B) DrugBank Open Data XML (--drugbank-xml) ← requires registration at drugbank.com DrugBank vocabulary CSV is freely downloadable (no account needed) from: https://go.drugbank.com/releases/latest#open-data → "DrugBank Vocabulary" Queries RxNorm REST API (single approximateTerm call per drug) and saves results to data/rxnorm_cache.csv. Runtime: ~14,000 names × 0.1s delay × 1 API call ≈ 24 minutes Usage: python scripts/build_rxnorm_cache.py --drugbank-csv "data/drugbank vocabulary.csv" python scripts/build_rxnorm_cache.py --drugbank-csv "data/drugbank vocabulary.csv" --dry-run 50 python scripts/build_rxnorm_cache.py --drugbank-xml data/raw/drugbank_open_data.xml """ from __future__ import annotations import argparse import csv import logging import sys import time import xml.etree.ElementTree as ET from pathlib import Path import requests logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", ) logger = logging.getLogger("build_rxnorm_cache") # RxNorm approximateTerm endpoint — returns rxcui + name in ONE call (v1.4 fix) RXNORM_APPROX_URL = "https://rxnav.nlm.nih.gov/REST/approximateTerm.json" # DrugBank Open Data XML namespace (XML path only) NS = {"db": "http://www.drugbank.ca"} # --------------------------------------------------------------------------- # Step 1A: Extract drug names from DrugBank Vocabulary CSV ← preferred # --------------------------------------------------------------------------- def extract_drug_names_from_csv(csv_path: str) -> list[str]: """ Parse the DrugBank vocabulary CSV and return all drug name strings. CSV columns: DrugBank ID | Accession Numbers | Common name | CAS | UNII | Synonyms | Standard InChI Key Synonyms column is pipe-separated (e.g. "Drug A | Alias B | Trade Name C"). Args: csv_path : path to the DrugBank vocabulary CSV file Returns: Sorted deduplicated list of drug name strings. """ path = Path(csv_path) if not path.exists(): logger.error( "DrugBank vocabulary CSV not found at '%s'. " "Download it from https://go.drugbank.com/releases/latest#open-data " "(look for 'DrugBank Vocabulary' — no account needed).", csv_path, ) sys.exit(1) logger.info("Parsing DrugBank vocabulary CSV: %s", path) names: set[str] = set() with open(path, "r", encoding="utf-8") as f: reader = csv.DictReader(f) for row in reader: # Common name common = row.get("Common name", "").strip() if common: names.add(common) # Pipe-separated synonyms synonyms_raw = row.get("Synonyms", "") if synonyms_raw: for syn in synonyms_raw.split("|"): syn = syn.strip() if syn: names.add(syn) result = sorted(names) logger.info("Extracted %d unique drug names/synonyms from CSV", len(result)) return result # --------------------------------------------------------------------------- # Step 1B: Extract drug names from DrugBank Open Data XML ← needs account # --------------------------------------------------------------------------- def extract_drug_names_from_xml(xml_path: str) -> list[str]: """ Parse DrugBank Open Data XML and extract all drug names + synonyms. Args: xml_path : Path to drugbank_open_data.xml Returns: Sorted deduplicated list of drug name strings. """ logger.info("Parsing DrugBank XML: %s", xml_path) try: tree = ET.parse(xml_path) except FileNotFoundError: logger.error( "DrugBank XML not found at '%s'. " "Download it from https://go.drugbank.com/releases/latest#open-data " "(free academic registration required), or use --drugbank-csv instead.", xml_path, ) sys.exit(1) except ET.ParseError as exc: logger.error("Failed to parse DrugBank XML: %s", exc) sys.exit(1) root = tree.getroot() names: set[str] = set() for drug in root.findall("db:drug", NS): name_el = drug.find("db:name", NS) if name_el is not None and name_el.text: names.add(name_el.text.strip()) for syn in drug.findall("db:synonyms/db:synonym", NS): if syn.text: names.add(syn.text.strip()) for brand in drug.findall( "db:international-brands/db:international-brand/db:name", NS ): if brand.text: names.add(brand.text.strip()) result = sorted(names) logger.info("Extracted %d unique drug names/synonyms from XML", len(result)) return result # --------------------------------------------------------------------------- # Step 2: Query RxNorm (single API call per drug — v1.4) # --------------------------------------------------------------------------- def query_rxnorm(drug_name: str, timeout: int = 5) -> tuple[str, str]: """ Look up a drug name in RxNorm using approximateTerm endpoint. Returns (rxcui, canonical_name). Returns ("", "") on any failure. Uses /approximateTerm — single HTTP call returning both rxcui and name. (Previous 2-call approach was replaced in v1.4, cutting runtime by ~50%.) """ try: resp = requests.get( RXNORM_APPROX_URL, params={"term": drug_name, "maxEntries": "1", "option": "1"}, timeout=timeout, ) if resp.status_code != 200: return "", "" candidates: list[dict] = ( resp.json() .get("approximateGroup", {}) .get("candidate", []) ) if not candidates: return "", "" rxcui = candidates[0].get("rxcui", "") name = candidates[0].get("name", drug_name) # fallback to input return rxcui, name except Exception: return "", "" # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- def main() -> None: parser = argparse.ArgumentParser( description="Build offline RxNorm cache from DrugBank data (FR-20)" ) source = parser.add_mutually_exclusive_group() source.add_argument( "--drugbank-csv", metavar="PATH", default=None, help=( "Path to DrugBank vocabulary CSV [RECOMMENDED — no account needed]. " "Download from https://go.drugbank.com/releases/latest#open-data" ), ) source.add_argument( "--drugbank-xml", metavar="PATH", default=None, help="Path to DrugBank Open Data XML (requires free academic registration).", ) parser.add_argument( "--output-csv", default="data/rxnorm_cache.csv", help="Path for output CSV", ) parser.add_argument( "--delay", type=float, default=0.1, help="Seconds to wait between API calls (default 0.1 — ~24 min total)", ) parser.add_argument( "--dry-run", type=int, default=0, metavar="N", help="Only process first N drug names (for testing)", ) parser.add_argument( "--resume", action="store_true", help=( "Resume a previously interrupted run. Reads already-completed entries " "from --output-csv and skips them, appending only the missing ones." ), ) args = parser.parse_args() # ------------------------------------------------------------------ # Auto-detect source if neither flag was given # ------------------------------------------------------------------ csv_default = "data/drugbank vocabulary.csv" xml_default = "data/raw/drugbank_open_data.xml" if args.drugbank_csv: drug_names = extract_drug_names_from_csv(args.drugbank_csv) elif args.drugbank_xml: drug_names = extract_drug_names_from_xml(args.drugbank_xml) elif Path(csv_default).exists(): logger.info("Auto-detected DrugBank vocabulary CSV at '%s'", csv_default) drug_names = extract_drug_names_from_csv(csv_default) elif Path(xml_default).exists(): logger.info("Auto-detected DrugBank XML at '%s'", xml_default) drug_names = extract_drug_names_from_xml(xml_default) else: logger.error( "No DrugBank source found. Pass --drugbank-csv or --drugbank-xml. " "See script docstring for download links." ) sys.exit(1) if args.dry_run > 0: drug_names = drug_names[: args.dry_run] logger.info("Dry-run mode: processing %d names only", len(drug_names)) # ------------------------------------------------------------------ # Resume: skip names already in the output CSV # ------------------------------------------------------------------ out_path = Path(args.output_csv) out_path.parent.mkdir(parents=True, exist_ok=True) already_done: set[str] = set() if args.resume and out_path.exists(): try: with open(out_path, "r", encoding="utf-8") as f: reader = csv.DictReader(f) for row in reader: name = row.get("drug_name", "").strip() if name: already_done.add(name) logger.info( "Resume mode: %d entries already in cache — skipping these.", len(already_done), ) except Exception as exc: logger.warning("Could not read existing cache for resume: %s", exc) already_done = set() remaining = [n for n in drug_names if n not in already_done] skipped = len(drug_names) - len(remaining) if skipped: logger.info("Skipping %d already-resolved names. %d remaining.", skipped, len(remaining)) total = len(remaining) if total == 0: logger.info("Nothing to do — cache is already complete.") sys.exit(0) est_minutes = total * (args.delay + 0.05) / 60 logger.info( "Starting cache build: %d names to process, delay=%.2fs, estimated %.0f minutes", total, args.delay, est_minutes, ) # ------------------------------------------------------------------ # Write CSV — append if resuming, overwrite otherwise # ------------------------------------------------------------------ file_mode = "a" if args.resume and out_path.exists() and already_done else "w" write_header = file_mode == "w" found = len(already_done) # count previously resolved entries too new_found = 0 with open(out_path, file_mode, newline="", encoding="utf-8") as f: writer = csv.writer(f) if write_header: writer.writerow(["drug_name", "rxcui", "canonical_name"]) for i, name in enumerate(remaining): rxcui, canonical = query_rxnorm(name) writer.writerow([name, rxcui, canonical]) if rxcui: new_found += 1 found += 1 if i % 25 == 0 or i == total - 1: pct = 100 * (i + 1) / total logger.info( "Progress: %d/%d (%.1f%%) — %d resolved this run (%d total)", i + 1, total, pct, new_found, found, ) time.sleep(args.delay) logger.info( "Cache saved to %s — %d/%d names resolved to RxNorm IDs (this run: +%d)", out_path, found, len(drug_names), new_found, ) logger.info( "Commit this file to the repo: git add %s && git commit -m 'Add RxNorm cache'", out_path, ) if __name__ == "__main__": main()