blood-test-explainer / scripts /expand_lab_knowledge_graph.py
Codex
Improve lab knowledge graph from common labs PDF
c394cf5
Raw
History Blame Contribute Delete
21 kB
#!/usr/bin/env python3
"""Expand the lab knowledge graph to cover all canonical markers and attach video URLs."""
from __future__ import annotations
import json
import re
from pathlib import Path
from typing import Any
# Parent of the directory that contains this script; the kb/ data files are
# resolved relative to it, and main() also puts it on sys.path.
ROOT = Path(__file__).resolve().parents[1]
# Knowledge-graph JSON that main() reads and then overwrites.
KG_PATH = ROOT.joinpath("kb", "cbc_knowledge_graph.json")
# Video catalog JSON that main() reads to look up per-test video URLs.
VIDEO_PATH = ROOT.joinpath("kb", "marker_videos.json")
# Canonical marker name -> knowledge-graph test id.
#
# Keys are matched against ``marker.name`` for every entry of
# ``src.markers.MARKERS`` inside main(); the lookup is a plain ``[]`` index, so
# a marker whose name is missing here makes the script fail with KeyError.
# Values become the ``"id"`` field of the generated node and are also the keys
# used to look up video URLs and EXTRA_ALIAS_UPDATES entries.
MARKER_IDS: dict[str, str] = {
"Hemoglobin": "hemoglobin",
"Hematocrit": "hct",
"White Blood Cell Count": "wbc",
"Platelet Count": "plt",
"Red Blood Cell Count": "rbc",
"MCV": "mcv",
"MCH": "mch",
"MCHC": "mchc",
"RDW": "rdw_cv",
"MPV": "mpv",
"Absolute Neutrophil Count": "neu_absolute",
"Absolute Lymphocyte Count": "lym_absolute",
"Absolute Monocyte Count": "mon_absolute",
"Absolute Eosinophil Count": "eos_absolute",
"Absolute Basophil Count": "bas_absolute",
"Band Neutrophils Percent": "band_neutrophils_percent",
"Reticulocyte Count": "reticulocyte_count",
"Haptoglobin": "haptoglobin",
"G6PD": "g6pd",
"Erythropoietin": "erythropoietin",
"Glucose": "glucose",
"Creatinine": "creatinine",
"eGFR": "egfr",
"Blood Urea Nitrogen": "bun",
"Sodium": "sodium",
"Potassium": "potassium",
"Chloride": "chloride",
"Calcium": "calcium",
"Albumin": "albumin",
"Total Protein": "total_protein",
"Globulin": "globulin",
"Bicarbonate": "bicarbonate",
"Anion Gap": "anion_gap",
"Magnesium": "magnesium",
"Phosphate": "phosphate",
"Uric Acid": "uric_acid",
"Serum Iron": "serum_iron",
"TIBC": "tibc",
"Transferrin": "transferrin",
"Transferrin Saturation": "transferrin_saturation",
"LDH": "ldh",
"Osmolality": "osmolality",
"Ammonia": "ammonia",
"Lactate": "lactate",
"Homocysteine": "homocysteine",
"Methylmalonic Acid": "methylmalonic_acid",
"Cystatin C": "cystatin_c",
"Prealbumin": "prealbumin",
"Beta-2 Microglobulin": "beta_2_microglobulin",
"C-Peptide": "c_peptide",
"Fructosamine": "fructosamine",
"Beta-Hydroxybutyrate": "beta_hydroxybutyrate",
"HbA1c": "hba1c",
"ALT": "alt",
"AST": "ast",
"ALP": "alp",
"GGT": "ggt",
"Total Bilirubin": "total_bilirubin",
"Direct Bilirubin": "direct_bilirubin",
"Lipase": "lipase",
"Amylase": "amylase",
"Total Cholesterol": "total_cholesterol",
"LDL Cholesterol": "ldl_cholesterol",
"HDL Cholesterol": "hdl_cholesterol",
"Triglycerides": "triglycerides",
"Non-HDL Cholesterol": "non_hdl_cholesterol",
"Apolipoprotein B": "apolipoprotein_b",
"Apolipoprotein A-1": "apolipoprotein_a1",
"Lipoprotein(a)": "lipoprotein_a",
"TSH": "tsh",
"Free T4": "free_t4",
"Free T3": "free_t3",
"Total T4": "total_t4",
"Total T3": "total_t3",
"Anti-TPO Antibodies": "anti_tpo_antibodies",
"TSH Receptor Antibodies": "tsh_receptor_antibodies",
"Thyroglobulin Antibodies": "thyroglobulin_antibodies",
"Vitamin D": "vitamin_d",
"Vitamin B12": "vitamin_b12",
"Ferritin": "ferritin",
"Zinc": "zinc",
"Copper": "copper",
"Ceruloplasmin": "ceruloplasmin",
"Selenium": "selenium",
"Vitamin E": "vitamin_e",
"Coenzyme Q10": "coenzyme_q10",
"Prothrombin Time": "prothrombin_time",
"INR": "inr",
"aPTT": "aptt",
"Fibrinogen": "fibrinogen",
"D-Dimer": "d_dimer",
"C-Reactive Protein": "crp",
"hs-CRP": "hs_crp",
"ESR": "esr",
"Procalcitonin": "procalcitonin",
"Complement C3": "complement_c3",
"Complement C4": "complement_c4",
"Rheumatoid Factor": "rheumatoid_factor",
"Anti-CCP Antibodies": "anti_ccp_antibodies",
"Immunoglobulin G": "immunoglobulin_g",
"Immunoglobulin A": "immunoglobulin_a",
"Immunoglobulin M": "immunoglobulin_m",
"Immunoglobulin E": "immunoglobulin_e",
"BNP": "bnp",
"NT-proBNP": "nt_probnp",
"Troponin I": "troponin_i",
"Creatine Kinase": "creatine_kinase",
"CK-MB": "ck_mb",
"Myoglobin": "myoglobin",
"Cortisol": "cortisol",
"Insulin": "insulin",
"Testosterone": "testosterone",
"Free Testosterone": "free_testosterone",
"Estradiol": "estradiol",
"Prolactin": "prolactin",
"FSH": "fsh",
"LH": "lh",
"Progesterone": "progesterone",
"Parathyroid Hormone": "pth",
"ACTH": "acth",
"DHEA-S": "dhea_s",
"Androstenedione": "androstenedione",
"Anti-Mullerian Hormone": "anti_mullerian_hormone",
"Beta-hCG": "beta_hcg",
"SHBG": "shbg",
"IGF-1": "igf_1",
"IGF Binding Protein-3": "igfbp_3",
"PSA": "psa",
"CEA": "cea",
"CA-125": "ca_125",
"CA 19-9": "ca_19_9",
"Alpha-Fetoprotein": "alpha_fetoprotein",
"CA 15-3": "ca_15_3",
"Folate": "folate",
"Vitamin A": "vitamin_a",
}
# Test id -> additional aliases to append to that node's "aliases" list.
#
# main() applies an entry only when a node with that id exists in the graph,
# and appends through _merge_aliases(), which compares case-insensitively
# (casefold) and skips aliases that are already present. Spellings below that
# differ only by case (e.g. "Hemoglobin A1c" / "Hemoglobin A1C") therefore
# collapse to whichever one is encountered first.
EXTRA_ALIAS_UPDATES: dict[str, list[str]] = {
"gra_absolute": ["Absolute Granulocyte Count", "Granulocytes Absolute", "Abs Granulocytes"],
"neu_absolute": ["ANC", "Absolute Neutrophil Count", "Abs Neutrophils", "Neutrophils Absolute"],
"mon_absolute": ["AMC", "Absolute Monocyte Count", "Abs Monocytes", "Monocytes Absolute"],
"eos_absolute": ["AEC", "Absolute Eosinophil Count", "Abs Eosinophils", "Eosinophils Absolute"],
"bas_absolute": ["ABC", "Absolute Basophil Count", "Abs Basophils", "Basophils Absolute"],
"band_neutrophils_percent": ["Band Neutrophil %", "Band Neutrophils", "Band %", "Bands", "Stab Neutrophils"],
"mpv": ["Mean Platelet Volume"],
"reticulocyte_count": ["Retic Count", "Retics"],
"bun": ["BUN", "Urea Nitrogen", "Blood Urea Nitrogen"],
"egfr": ["GFR", "Estimated GFR"],
"hba1c": ["A1c", "HgbA1C", "Hemoglobin A1c", "Hemoglobin A1C", "Glycated Hemoglobin"],
"aptt": ["PTT", "APTT", "Activated Partial Thromboplastin Time"],
"pth": ["PTH", "Intact PTH", "Parathyroid Hormone"],
"hs_crp": ["High-Sensitivity CRP", "High Sensitivity C-Reactive Protein"],
"d_dimer": ["D Dimer"],
}
# Marker category -> generic lifestyle guidance, split into "food",
# "exercises", and "supplements" lists of sentences.
#
# _build_test() selects the entry by ``marker.category`` and copies the three
# lists into the node's "instructions_to_improve" field. A category that has
# no entry here falls back to the "Metabolic" guidance, so the "Metabolic" key
# must always exist.
#
# NOTE(review): the "Oncology" text refers specifically to prostate follow-up
# and PSA. Category assignment lives outside this file; if other tumor markers
# share this category the wording may not fit them -- confirm.
CATEGORY_GUIDANCE: dict[str, dict[str, list[str]]] = {
"CBC": {
"food": [
"Support healthy blood production with balanced protein, iron, B12, folate, and vitamin C from whole foods.",
"Stay well hydrated unless a clinician advises fluid restriction.",
],
"exercises": [
"Use moderate activity as tolerated when blood counts are stable.",
"Avoid intense training if anemia, infection, or bleeding symptoms are present until evaluated.",
],
"supplements": [
"Discuss iron, B12, or folate supplementation only when testing supports a deficiency.",
"Do not start blood-building supplements without clinician guidance.",
],
},
"Metabolic": {
"food": [
"Favor minimally processed meals with vegetables, lean protein, whole grains, and healthy fats.",
"Limit excess added sugar, alcohol, and high-sodium ultra-processed foods when relevant to the marker.",
],
"exercises": [
"Aim for regular aerobic activity and resistance training if cleared by a clinician.",
"Match hydration and recovery to kidney, glucose, or electrolyte concerns.",
],
"supplements": [
"Use electrolyte, vitamin, or mineral supplements only when labs or diet indicate a need.",
"Review medications and supplements with a clinician because they can shift metabolic markers.",
],
},
"Liver": {
"food": [
"Limit alcohol and avoid unnecessary hepatotoxic exposures when liver enzymes are abnormal.",
"Choose balanced meals with vegetables, fiber, and moderate healthy fats.",
],
"exercises": [
"Stay active within symptom limits; avoid heavy alcohol-related training recovery patterns.",
"Seek medical review before intense exercise if jaundice or severe abdominal pain is present.",
],
"supplements": [
"Avoid unverified liver detox products.",
"Discuss medication, herb, and supplement use because many affect liver tests.",
],
},
"Lipid": {
"food": [
"Emphasize fiber-rich plants, fish, legumes, nuts, and unsaturated fats.",
"Reduce trans fats, excess saturated fat, and refined carbohydrates when triglycerides or LDL are high.",
],
"exercises": [
"Use regular aerobic and resistance exercise to support lipid and cardiovascular health.",
"Maintain consistency rather than extreme short-term training bursts.",
],
"supplements": [
"Discuss statins, fibrates, omega-3 prescriptions, or other lipid therapies with a clinician.",
"Do not rely on unproven supplement cocktails for cholesterol management.",
],
},
"Thyroid": {
"food": [
"Ensure adequate iodine and selenium from a balanced diet unless a clinician advises otherwise.",
"Keep a stable diet around thyroid testing when possible.",
],
"exercises": [
"Match activity to thyroid symptoms such as fatigue, palpitations, or heat intolerance.",
"Build up gradually after thyroid treatment changes.",
],
"supplements": [
"Avoid starting high-dose iodine or thyroid-support supplements without medical supervision.",
"Take prescribed thyroid medication consistently and separately from interfering foods or supplements.",
],
},
"Vitamin": {
"food": [
"Correct deficiencies first with food sources such as fortified grains, dairy, eggs, fish, legumes, and leafy greens.",
"Pair nutrient-dense meals with safe sun exposure for vitamin D when appropriate.",
],
"exercises": [
"Use weight-bearing and muscle-strengthening activity to support bone and metabolic health.",
"Adjust activity if deficiency symptoms such as fatigue or neuropathy are present.",
],
"supplements": [
"Supplement only after testing confirms deficiency or insufficiency.",
"Use clinician-guided dosing, especially for iron, vitamin A, and fat-soluble vitamins.",
],
},
"Coagulation": {
"food": [
"Maintain consistent vitamin K intake if on warfarin, rather than large day-to-day swings.",
"Use a balanced diet unless anticoagulation counseling specifies otherwise.",
],
"exercises": [
"Stay active, but use contact-sport caution when bleeding risk is elevated.",
"Seek urgent care for unexplained bruising, bleeding, or clot symptoms.",
],
"supplements": [
"Avoid starting aspirin, fish oil, or herbals that affect clotting without clinician review.",
"Take anticoagulants exactly as prescribed and monitor INR/PT when required.",
],
},
"Inflammation": {
"food": [
"Use an anti-inflammatory dietary pattern rich in vegetables, fruit, legumes, and omega-3 sources.",
"Limit excess alcohol and ultra-processed foods when inflammation markers are high.",
],
"exercises": [
"Use regular moderate exercise, which can lower chronic inflammation over time.",
"Rest during acute infection or inflammatory flares as advised by a clinician.",
],
"supplements": [
"Treat the underlying cause rather than relying on generic anti-inflammatory supplements.",
"Discuss persistent abnormal inflammatory markers with a clinician.",
],
},
"Cardiac": {
"food": [
"Follow a heart-healthy diet low in excess sodium and harmful fats when cardiac markers are abnormal.",
"Limit alcohol and manage blood pressure, glucose, and lipids together.",
],
"exercises": [
"Use clinician-approved cardiac rehabilitation or gradual aerobic training when safe.",
"Seek emergency care for chest pain, severe shortness of breath, or syncope.",
],
"supplements": [
"Do not self-treat suspected heart injury with supplements.",
"Take prescribed cardiac medications consistently and review interactions.",
],
},
"Hormone": {
"food": [
"Support hormone health with adequate protein, healthy fats, fiber, and micronutrients.",
"Avoid extreme dieting or rapid weight changes unless medically supervised.",
],
"exercises": [
"Use resistance training and sleep regularity to support hormonal balance.",
"Adjust training load during symptomatic hormone disorders.",
],
"supplements": [
"Avoid unsupervised hormone-boosting products.",
"Use prescribed hormone therapies only under endocrine or reproductive specialist guidance.",
],
},
"Oncology": {
"food": [
"Follow a balanced, nutrient-dense diet unless oncology care provides specific restrictions.",
"Limit charred processed meats and excess alcohol when discussing cancer screening markers.",
],
"exercises": [
"Stay physically active within the limits of prostate or oncology follow-up plans.",
"Report new urinary, bone, or systemic symptoms promptly.",
],
"supplements": [
"Do not use high-dose supplements to try to normalize screening markers without specialist input.",
"Discuss PSA changes with a clinician rather than self-interpreting a single value.",
],
},
}
# Test ids whose generated node gets SEX_HIGH_TEMPLATE as its
# "sex_significance" value; _build_test() gives every other id SEX_LOW.
SEX_SIGNIFICANCE_HIGH = {
"hemoglobin",
"rbc",
"hct",
"esr",
}
# "sex_significance" payload for test ids that are NOT in
# SEX_SIGNIFICANCE_HIGH. _build_test() stores a dict() copy, so nodes do not
# share this object.
SEX_LOW = {
"level": "low",
"summary": "This marker is usually interpreted with age and lab method rather than sex-specific reference intervals.",
"pipeline_guidance": "Use the age-group interval unless the lab report provides a sex-specific range.",
}
# "sex_significance" payload for test ids listed in SEX_SIGNIFICANCE_HIGH;
# copied per node in the same way as SEX_LOW.
SEX_HIGH_TEMPLATE = {
"level": "high",
"summary": "Reference intervals for this marker can differ by sex after puberty.",
"pipeline_guidance": "Prefer sex-specific lab ranges when available and include clinician context when sex is unknown.",
}
def _round_mid(lo: float, hi: float) -> float:
return round((lo + hi) / 2, 2)
def _stats_block(lo: float | None, hi: float | None) -> dict[str, dict[str, float]]:
if lo is None and hi is None:
lo, hi = 0.0, 1.0
elif lo is None and hi is not None:
lo = max(0.0, hi * 0.5)
elif hi is None and lo is not None:
hi = lo * 1.5 if lo > 0 else lo + 1.0
assert lo is not None and hi is not None
block = {
"minimal_value": lo,
"normal_value": _round_mid(lo, hi),
"maximum_value": hi,
}
return {group: dict(block) for group in ("child", "teenager", "adult", "elder")}
def _why_important(name: str, kb_entry: Any) -> str:
if kb_entry is None:
return f"Abnormal {name} values can be clinically meaningful and should be interpreted with symptoms, history, and related tests."
parts = []
if kb_entry.high:
parts.append(kb_entry.high)
if kb_entry.low:
parts.append(kb_entry.low)
return " ".join(parts) if parts else f"{name} helps clinicians evaluate related organ systems and disease patterns."
def _build_test(marker: Any, test_id: str, video_url: str, kb_entry: Any) -> dict[str, Any]:
    """Create a complete knowledge-graph node for one marker.

    Args:
        marker: Marker object; its ``name``, ``aliases``, ``category``,
            ``unit``, ``measures``, ``ref_low`` and ``ref_high`` attributes
            are read.
        test_id: Knowledge-graph id stored in the node's ``"id"`` field.
        video_url: URL stored in the node's ``"video_url"`` field.
        kb_entry: Knowledge-base entry passed to ``_why_important`` (may be
            ``None``).

    Returns:
        A new dict holding every node field. ``"related_tests"`` and
        ``"source_ids"`` start out as empty lists, and the guidance lists and
        sex-significance dict are copies, so the module-level templates are
        never shared with a node.
    """
    # dict.fromkeys drops exact duplicates while keeping first-seen order.
    unique_aliases = list(dict.fromkeys(marker.aliases))
    # Categories without their own guidance reuse the "Metabolic" text.
    category_advice = CATEGORY_GUIDANCE.get(marker.category, CATEGORY_GUIDANCE["Metabolic"])
    if test_id in SEX_SIGNIFICANCE_HIGH:
        sex_template = SEX_HIGH_TEMPLATE
    else:
        sex_template = SEX_LOW
    improvement_sections = ("food", "exercises", "supplements")
    return {
        "id": test_id,
        "display_name": marker.name,
        "aliases": unique_aliases,
        "category": marker.category,
        "unit": marker.unit,
        "description": f"{marker.name} measures {marker.measures}.",
        "why_important": _why_important(marker.name, kb_entry),
        "sex_significance": dict(sex_template),
        "instructions_to_improve": {
            section: list(category_advice[section]) for section in improvement_sections
        },
        "statistics_per_group_age": _stats_block(marker.ref_low, marker.ref_high),
        "related_tests": [],
        "source_ids": [],
        "video_url": video_url,
    }
def _merge_aliases(test: dict[str, Any], extra: list[str]) -> None:
aliases = list(test.get("aliases") or [])
seen = {a.casefold() for a in aliases}
for alias in extra:
if alias.casefold() not in seen:
aliases.append(alias)
seen.add(alias.casefold())
test["aliases"] = aliases
def _remove_aliases(test: dict[str, Any], stale: set[str]) -> None:
test["aliases"] = [alias for alias in test.get("aliases", []) if alias.casefold() not in stale]
def _refresh_generated_fields(test: dict[str, Any], marker: Any, test_id: str, video_url: str, kb_entry: Any) -> None:
    """Overwrite the generated fields of an existing node in place.

    A fresh node is built with ``_build_test`` and its generated fields
    replace the ones in ``test``. Keys outside that list (for example
    ``"id"``, ``"related_tests"`` and ``"source_ids"``) are left as they are,
    and aliases are merged into the existing list rather than replaced.

    Args:
        test: Existing knowledge-graph node, updated in place.
        marker: Marker object forwarded to ``_build_test``.
        test_id: Knowledge-graph id forwarded to ``_build_test``.
        video_url: Video URL forwarded to ``_build_test``.
        kb_entry: Knowledge-base entry forwarded to ``_build_test``.
    """
    regenerated_keys = (
        "display_name",
        "category",
        "unit",
        "description",
        "why_important",
        "sex_significance",
        "instructions_to_improve",
        "statistics_per_group_age",
        "video_url",
    )
    fresh = _build_test(marker, test_id, video_url, kb_entry)
    test.update((key, fresh[key]) for key in regenerated_keys)
    _merge_aliases(test, fresh["aliases"])
def main() -> None:
    """Regenerate the knowledge-graph JSON so every canonical marker has a node.

    Steps:

    1. Load the existing graph (``KG_PATH``) and the video catalog
       (``VIDEO_PATH``).
    2. For every marker in ``src.markers.MARKERS``, refresh the generated
       fields of its existing node or create a new node.
    3. For legacy CBC-only nodes that have no marker, update the video URL
       and aliases; ``gra_absolute`` additionally loses its neutrophil
       aliases.
    4. Merge ``EXTRA_ALIAS_UPDATES`` into every node that exists.
    5. Update the top-level metadata and write the graph back to ``KG_PATH``
       with nodes sorted by id.

    Raises:
        KeyError: If a marker name is missing from ``MARKER_IDS``, the video
            catalog has no ``"videos"`` key, or the graph has no ``"tests"``
            key (or a test without ``"id"``).
    """
    import sys

    # Make the project packages importable from the repository root.
    sys.path.insert(0, str(ROOT))
    from kb.knowledge_base import KB
    from src.markers import MARKERS

    payload = json.loads(KG_PATH.read_text(encoding="utf-8"))
    video_catalog = json.loads(VIDEO_PATH.read_text(encoding="utf-8"))
    videos: dict[str, str] = video_catalog["videos"]
    default_video_url = video_catalog.get("default_video_url", "")
    existing_by_id = {test["id"]: test for test in payload["tests"]}

    # After this loop every marker id is present in existing_by_id, so no
    # later "add missing markers" pass is needed.
    for marker in MARKERS:
        test_id = MARKER_IDS[marker.name]
        video_url = videos.get(test_id, default_video_url)
        kb_entry = KB.get(marker.name)
        if test_id in existing_by_id:
            # Also merges the marker's aliases into the existing list.
            _refresh_generated_fields(existing_by_id[test_id], marker, test_id, video_url, kb_entry)
        else:
            existing_by_id[test_id] = _build_test(marker, test_id, video_url, kb_entry)

    # Keep legacy CBC-only nodes that are not in MARKERS but still useful.
    legacy_ids = ("rdw_sd", "neu_percent", "lym_percent", "mon_percent", "eos_percent", "bas_percent", "gra_absolute")
    # Neutrophil aliases that must not stay attached to the granulocyte node.
    stale_granulocyte_aliases = {
        "anc",
        "anc when neutrophil-dominant",
        "absolute neutrophil count",
        "abs neutrophils",
        "neutrophils absolute",
    }
    for legacy_id in legacy_ids:
        if legacy_id not in existing_by_id:
            continue
        legacy_test = existing_by_id[legacy_id]
        # Keep the node's current URL when the catalog has none for this id.
        legacy_test["video_url"] = videos.get(legacy_id, legacy_test.get("video_url", ""))
        _merge_aliases(legacy_test, EXTRA_ALIAS_UPDATES.get(legacy_id, []))
        if legacy_id == "gra_absolute":
            _remove_aliases(legacy_test, stale_granulocyte_aliases)

    for test_id, extra_aliases in EXTRA_ALIAS_UPDATES.items():
        if test_id in existing_by_id:
            _merge_aliases(existing_by_id[test_id], extra_aliases)

    payload["schema_version"] = "2.0"
    payload["title"] = "Lab Marker Knowledge Graph"
    payload["purpose"] = (
        "Educational knowledge graph for common laboratory markers used by Blood Test Explainer. "
        "It supports explanation agents, not diagnosis or treatment."
    )
    payload["video_url_policy"] = video_catalog.get("description", payload.get("video_url_policy", ""))
    payload["video_catalog_path"] = "kb/marker_videos.json"
    payload["video_notes"] = video_catalog.get("notes", {})
    # Sorting by id keeps the written file stable between runs.
    payload["tests"] = [existing_by_id[test_id] for test_id in sorted(existing_by_id)]
    KG_PATH.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    print(f"Wrote {len(payload['tests'])} tests to {KG_PATH}")


if __name__ == "__main__":
    main()