Agentic-RagBot / src /shared_utils.py
T0X1N's picture
chore: codebase audit and fixes (ruff, mypy, pytest)
9659593
"""
MediGuard AI — Shared Utilities
Common functions used by both the main API and HuggingFace deployment:
- Biomarker parsing
- Disease scoring heuristics
- Result formatting
"""
from __future__ import annotations
import json
import logging
import re
from typing import Any
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Biomarker Parsing
# ---------------------------------------------------------------------------
# Canonical biomarker name mapping (aliases -> standard name)
# Lookup keys are lower-case with words separated by single spaces; values
# are the canonical biomarker names used throughout the module. The table is
# written grouped by canonical name and flattened into alias -> canonical.
BIOMARKER_ALIASES: dict[str, str] = {
    alias: canonical
    for canonical, aliases in (
        (
            "Glucose",
            (
                "glucose",
                "fasting glucose",
                "fastingglucose",
                "blood sugar",
                "blood glucose",
                "fbg",
                "fbs",
            ),
        ),
        (
            "HbA1c",
            (
                "hba1c",
                "a1c",
                "hemoglobin a1c",
                "hemoglobina1c",
                "glycated hemoglobin",
            ),
        ),
        (
            "Cholesterol",
            ("cholesterol", "total cholesterol", "totalcholesterol", "tc"),
        ),
        ("LDL", ("ldl", "ldl cholesterol", "ldlcholesterol", "ldl-c")),
        ("HDL", ("hdl", "hdl cholesterol", "hdlcholesterol", "hdl-c")),
        ("Triglycerides", ("triglycerides", "tg", "trigs")),
        ("Hemoglobin", ("hemoglobin", "hgb", "hb")),
        ("TSH", ("tsh", "thyroid stimulating hormone")),
        ("Creatinine", ("creatinine", "cr")),
        # ALT/AST
        ("ALT", ("alt", "sgpt")),
        ("AST", ("ast", "sgot")),
        # Blood pressure
        ("Systolic_BP", ("systolic", "systolic bp", "sbp")),
        ("Diastolic_BP", ("diastolic", "diastolic bp", "dbp")),
        ("BMI", ("bmi", "body mass index")),
    )
    for alias in aliases
}
def normalize_biomarker_name(name: str) -> str:
    """
    Normalize a biomarker name to its canonical form.

    The lookup key is built by lower-casing the name, treating underscores
    as spaces, and collapsing every run of whitespace into a single space
    (which also trims both ends). Underscores are converted *before*
    trimming so that inputs such as "_glucose_" or "fasting__glucose" still
    resolve to their alias.

    Args:
        name: Raw biomarker name (may be alias, mixed case, etc.)

    Returns:
        Canonical biomarker name, or ``name`` exactly as given when no
        alias matches.
    """
    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so join() yields single-spaced words.
    key = " ".join(name.lower().replace("_", " ").split())
    return BIOMARKER_ALIASES.get(key, name)
def parse_biomarkers(text: str) -> dict[str, float]:
    """
    Parse biomarkers from natural language text or JSON.

    Supports formats like:
    - JSON: {"Glucose": 140, "HbA1c": 7.5}
    - Key-value: "Glucose: 140, HbA1c: 7.5"
    - Natural: "glucose 140 mg/dL and hba1c 7.5%"

    JSON entries whose value cannot be converted to a number are skipped
    individually rather than discarding the whole object. If the JSON yields
    no numeric entries at all (or is malformed), the text is scanned with the
    regex patterns instead.

    Args:
        text: Input text containing biomarker values

    Returns:
        Dictionary of normalized biomarker names to float values
    """
    text = text.strip()
    if not text:
        return {}

    # Try JSON first
    if text.startswith("{"):
        try:
            raw = json.loads(text)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            logger.debug("Input starts with '{' but is not valid JSON; using regex extraction")
            raw = None
        if isinstance(raw, dict):
            parsed: dict[str, float] = {}
            for key, raw_value in raw.items():
                try:
                    number = float(raw_value)
                except (TypeError, ValueError, OverflowError):
                    # Non-numeric field (e.g. a free-text note): skip only this entry.
                    continue
                parsed[normalize_biomarker_name(key)] = number
            if parsed:
                return parsed

    # Words that may precede a biomarker name in running text but are never
    # part of one.
    stop_words = {"the", "a", "an", "and", "or", "is", "was", "are", "were", "be"}

    # Regex patterns for biomarker extraction
    patterns = [
        # "Glucose: 140" or "Glucose = 140" or "Glucose - 140".
        # The name may contain spaces/tabs but must not span lines, otherwise
        # a unit at the end of the previous line gets fused into the name.
        r"([A-Za-z][A-Za-z0-9_ \t]{0,30})\s*[:=\-]\s*([\d.]+)",
        # "Glucose 140 mg/dL" (value after name with optional unit)
        r"\b([A-Za-z][A-Za-z0-9_]{0,15})\s+([\d.]+)\s*(?:mg/dL|mmol/L|%|g/dL|U/L|mIU/L|ng/mL|pg/mL|[μµu]mol/L)?(?:\s|,|$)",
    ]

    biomarkers: dict[str, float] = {}
    for pattern in patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            raw_name, value = match.groups()

            # Drop leading filler words ("and hba1c" -> "hba1c"); a name made
            # up only of filler words is not a biomarker at all.
            words = raw_name.split()
            while words and words[0].lower() in stop_words:
                del words[0]
            if not words:
                continue

            try:
                fval = float(value)
            except ValueError:
                # e.g. "1.2.3" or "..." matched by [\d.]+ but not a number
                continue

            canonical = normalize_biomarker_name(" ".join(words))
            # Don't overwrite if already found (first match wins)
            if canonical not in biomarkers:
                biomarkers[canonical] = fval
    return biomarkers
# ---------------------------------------------------------------------------
# Disease Scoring Heuristics
# ---------------------------------------------------------------------------
# Reference ranges for biomarkers (approximate clinical ranges)
# Each entry maps a canonical biomarker name to (low, high, unit); values
# inside the inclusive [low, high] interval are treated as normal.
BIOMARKER_REFERENCE_RANGES: dict[str, tuple[float, float, str]] = {
    # (low, high, unit)
    "Glucose": (70, 100, "mg/dL"),
    "HbA1c": (4.0, 5.6, "%"),
    "Cholesterol": (0, 200, "mg/dL"),
    "LDL": (0, 100, "mg/dL"),
    "HDL": (40, 999, "mg/dL"),  # Higher is better; 999 acts as "no upper limit"
    "Triglycerides": (0, 150, "mg/dL"),
    "Hemoglobin": (12.0, 17.5, "g/dL"),
    "TSH": (0.4, 4.0, "mIU/L"),
    "Creatinine": (0.6, 1.2, "mg/dL"),
    "ALT": (7, 56, "U/L"),
    "AST": (10, 40, "U/L"),
    "Systolic_BP": (90, 120, "mmHg"),
    "Diastolic_BP": (60, 80, "mmHg"),
    "BMI": (18.5, 24.9, "kg/m²"),
}
def classify_biomarker(name: str, value: float) -> str:
    """
    Compare a measurement against the reference range for its biomarker.

    Args:
        name: Canonical biomarker name
        value: Measured value

    Returns:
        "low" when the value is below the range, "high" when above it,
        "normal" when inside it (bounds included), or "unknown" when no
        reference range is registered for ``name``.
    """
    reference = BIOMARKER_REFERENCE_RANGES.get(name)
    if not reference:
        return "unknown"

    lower_bound, upper_bound, _unit = reference
    if value < lower_bound:
        return "low"
    if value > upper_bound:
        return "high"
    return "normal"
def score_disease_diabetes(biomarkers: dict[str, float]) -> tuple[float, str]:
    """
    Score diabetes risk based on biomarkers.

    HbA1c contributes 0.5 at >= 6.5 or 0.3 at >= 5.7; glucose contributes
    0.35 at >= 126 or 0.2 at >= 100. Missing biomarkers count as 0 and
    therefore contribute nothing.

    Args:
        biomarkers: Dictionary of biomarker values; reads "Glucose" and "HbA1c"

    Returns:
        (score 0-1, severity) where severity is "high" for a score >= 0.7,
        "moderate" for >= 0.4, and "low" otherwise
    """
    glucose = biomarkers.get("Glucose", 0)
    hba1c = biomarkers.get("HbA1c", 0)

    score = 0.0

    # HbA1c scoring (most important)
    if hba1c >= 6.5:
        score += 0.5
    elif hba1c >= 5.7:
        score += 0.3

    # Fasting glucose scoring
    if glucose >= 126:
        score += 0.35
    elif glucose >= 100:
        score += 0.2

    # Normalize to 0-1
    score = min(1.0, score)

    # Determine severity
    if score >= 0.7:
        severity = "high"
    elif score >= 0.4:
        severity = "moderate"
    else:
        severity = "low"
    return score, severity
def score_disease_dyslipidemia(biomarkers: dict[str, float]) -> tuple[float, str]:
    """
    Rate dyslipidemia risk from the lipid panel values.

    Reads "Cholesterol", "LDL", "HDL" and "Triglycerides". Absent values
    default to figures that add nothing to the score (0 for the three
    "lower is better" markers, 999 for HDL).

    Returns:
        (score 0-1, severity) with severity "high" at >= 0.6, "moderate" at
        >= 0.3, otherwise "low"
    """
    total_chol = biomarkers.get("Cholesterol", 0)
    ldl_level = biomarkers.get("LDL", 0)
    hdl_level = biomarkers.get("HDL", 999)  # High default (higher is better)
    trig_level = biomarkers.get("Triglycerides", 0)

    chol_points = 0.3 if total_chol >= 240 else (0.15 if total_chol >= 200 else 0.0)
    ldl_points = 0.3 if ldl_level >= 160 else (0.15 if ldl_level >= 130 else 0.0)
    hdl_points = 0.2 if hdl_level < 40 else 0.0
    trig_points = 0.2 if trig_level >= 200 else (0.1 if trig_level >= 150 else 0.0)

    # Added strictly left to right so the floating-point result is the same
    # as accumulating each contribution in turn.
    risk = min(1.0, 0.0 + chol_points + ldl_points + hdl_points + trig_points)

    if risk >= 0.6:
        return risk, "high"
    if risk >= 0.3:
        return risk, "moderate"
    return risk, "low"
def score_disease_anemia(biomarkers: dict[str, float]) -> tuple[float, str]:
    """
    Rate anemia risk from the "Hemoglobin" value.

    Returns:
        (score, severity). A missing or zero hemoglobin value yields
        (0.0, "unknown"); otherwise the first tier whose upper limit exceeds
        the value applies, and values of 13 or more are "normal".
    """
    level = biomarkers.get("Hemoglobin", 0)
    if not level:
        return 0.0, "unknown"

    # (exclusive upper limit, score, severity), checked from most to least severe
    tiers = (
        (8, 0.9, "critical"),
        (10, 0.7, "high"),
        (12, 0.5, "moderate"),
        (13, 0.2, "low"),
    )
    for upper_limit, risk, label in tiers:
        if level < upper_limit:
            return risk, label
    return 0.0, "normal"
def score_disease_thyroid(biomarkers: dict[str, float]) -> tuple[float, str, str]:
    """
    Rate thyroid disorder risk from the "TSH" value.

    Returns:
        (score, severity, direction). A missing or zero TSH value yields
        (0.0, "unknown", "none"). TSH above 4.5 points to "hypothyroid"
        (high above 10), TSH below 0.4 points to "hyperthyroid" (high below
        0.1), and anything in between is (0.0, "normal", "none").
    """
    level = biomarkers.get("TSH", 0)
    if not level:
        return 0.0, "unknown", "none"

    if level > 4.5:
        if level > 10:
            return 0.8, "high", "hypothyroid"
        return 0.5, "moderate", "hypothyroid"

    if level < 0.4:
        if level < 0.1:
            return 0.8, "high", "hyperthyroid"
        return 0.5, "moderate", "hyperthyroid"

    return 0.0, "normal", "none"
def score_all_diseases(biomarkers: dict[str, float]) -> dict[str, dict[str, Any]]:
    """
    Run every disease scorer and collect the ones with a positive score.

    Args:
        biomarkers: Dictionary of biomarker values

    Returns:
        Dictionary keyed by "diabetes", "dyslipidemia", "anemia" and
        "thyroid" (only those whose score is above zero), each mapping to
        {"disease", "confidence", "severity"}
    """
    findings: dict[str, dict[str, Any]] = {}

    # Scorers that return a plain (score, severity) pair, in output order.
    paired_scorers = (
        ("diabetes", "Diabetes", score_disease_diabetes),
        ("dyslipidemia", "Dyslipidemia", score_disease_dyslipidemia),
        ("anemia", "Anemia", score_disease_anemia),
    )
    for result_key, label, scorer in paired_scorers:
        confidence, level = scorer(biomarkers)
        if confidence > 0:
            findings[result_key] = {
                "disease": label,
                "confidence": confidence,
                "severity": level,
            }

    # Thyroid additionally reports a direction that selects the disease label.
    confidence, level, direction = score_disease_thyroid(biomarkers)
    if confidence > 0:
        if direction == "hypothyroid":
            label = "Hypothyroidism"
        else:
            label = "Hyperthyroidism"
        findings["thyroid"] = {
            "disease": label,
            "confidence": confidence,
            "severity": level,
        }

    return findings
def get_primary_prediction(biomarkers: dict[str, float]) -> dict[str, Any]:
    """
    Pick the disease prediction with the greatest confidence.

    Args:
        biomarkers: Dictionary of biomarker values

    Returns:
        Dictionary with disease, confidence, severity. When no disease
        scores above zero, a generic "General Health Screening" entry with
        confidence 0.5 and severity "low" is returned instead.
    """
    candidates = score_all_diseases(biomarkers)
    if candidates:
        # max() keeps the first entry on ties, i.e. the earliest-scored disease.
        return max(candidates.values(), key=lambda entry: entry["confidence"])

    return {
        "disease": "General Health Screening",
        "confidence": 0.5,
        "severity": "low",
    }
# ---------------------------------------------------------------------------
# Biomarker Flagging
# ---------------------------------------------------------------------------
def flag_biomarkers(biomarkers: dict[str, float]) -> list[dict[str, Any]]:
    """
    Describe each biomarker with its status and, where known, its reference range.

    Every entry has "name", "value" and "status". Biomarkers with a
    registered reference range also get "reference_range" and "unit". Any
    status other than "normal" (including "unknown") adds "flagged": True.

    Args:
        biomarkers: Dictionary of biomarker values

    Returns:
        List of entries, flagged ones first, each group sorted by name
    """
    entries: list[dict[str, Any]] = []

    for marker, reading in biomarkers.items():
        status = classify_biomarker(marker, reading)
        entry: dict[str, Any] = {"name": marker, "value": reading, "status": status}

        reference = BIOMARKER_REFERENCE_RANGES.get(marker)
        if reference:
            lower_bound, upper_bound, unit = reference
            entry["reference_range"] = f"{lower_bound}-{upper_bound} {unit}"
            entry["unit"] = unit

        if status != "normal":
            entry["flagged"] = True

        entries.append(entry)

    # False sorts before True, so negating "flagged" puts flagged entries first.
    return sorted(
        entries,
        key=lambda item: (not item.get("flagged", False), item["name"]),
    )
# ---------------------------------------------------------------------------
# Utility Functions
# ---------------------------------------------------------------------------
def format_confidence_percent(score: float) -> str:
    """
    Format confidence score as percentage string.

    The percentage is truncated to a whole number. Before truncating, the
    product is rounded to 9 decimal places so binary floating-point error
    does not drop a whole percent (0.29 * 100 evaluates to
    28.999999999999996, which would otherwise be shown as "28%").

    Args:
        score: Confidence as a fraction, e.g. 0.85

    Returns:
        Whole-number percentage followed by "%", e.g. "85%"
    """
    percent = round(score * 100, 9)
    return f"{int(percent)}%"
def severity_to_emoji(severity: str) -> str:
    """
    Convert severity level to emoji.

    Args:
        severity: Severity label; matched case-insensitively

    Returns:
        The emoji for the severity, or a white circle for any label that
        is not in the mapping
    """
    mapping = {
        "critical": "🔴",
        "high": "🟠",
        "moderate": "🟡",
        "low": "🟢",
        "normal": "✅",
        "unknown": "❓",
    }
    return mapping.get(severity.lower(), "⚪")