# Source: Hugging Face Space "T0X1N/Agentic-RagBot" (Space status at capture: Sleeping).
# File size: 12,374 bytes.

"""
MediGuard AI — Shared Utilities

Common functions used by both the main API and HuggingFace deployment:
- Biomarker parsing
- Disease scoring heuristics
- Result formatting
"""

from __future__ import annotations

import json
import logging
import re
from typing import Any

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Biomarker Parsing
# ---------------------------------------------------------------------------

# Alias groups: each canonical biomarker name paired with the lowercase
# spellings (spaced, unspaced, abbreviated) that should resolve to it.
_ALIAS_GROUPS: tuple[tuple[str, tuple[str, ...]], ...] = (
    (
        "Glucose",
        (
            "glucose",
            "fasting glucose",
            "fastingglucose",
            "blood sugar",
            "blood glucose",
            "fbg",
            "fbs",
        ),
    ),
    (
        "HbA1c",
        (
            "hba1c",
            "a1c",
            "hemoglobin a1c",
            "hemoglobina1c",
            "glycated hemoglobin",
        ),
    ),
    (
        "Cholesterol",
        ("cholesterol", "total cholesterol", "totalcholesterol", "tc"),
    ),
    ("LDL", ("ldl", "ldl cholesterol", "ldlcholesterol", "ldl-c")),
    ("HDL", ("hdl", "hdl cholesterol", "hdlcholesterol", "hdl-c")),
    ("Triglycerides", ("triglycerides", "tg", "trigs")),
    ("Hemoglobin", ("hemoglobin", "hgb", "hb")),
    ("TSH", ("tsh", "thyroid stimulating hormone")),
    ("Creatinine", ("creatinine", "cr")),
    ("ALT", ("alt", "sgpt")),
    ("AST", ("ast", "sgot")),
    ("Systolic_BP", ("systolic", "systolic bp", "sbp")),
    ("Diastolic_BP", ("diastolic", "diastolic bp", "dbp")),
    ("BMI", ("bmi", "body mass index")),
)

# Flat lookup table (lowercase alias -> canonical biomarker name), derived
# from the groups above in declaration order.
BIOMARKER_ALIASES: dict[str, str] = {
    alias: canonical
    for canonical, aliases in _ALIAS_GROUPS
    for alias in aliases
}


def normalize_biomarker_name(name: str) -> str:
    """
    Normalize a biomarker name to its canonical form.

    The lookup key is built by replacing underscores with spaces,
    lower-casing, and collapsing every run of whitespace (spaces, tabs,
    newlines) into a single space, so inputs such as ``"Total_Cholesterol"``,
    ``"  fasting   glucose "`` or ``"blood\\tsugar"`` all resolve to their
    alias entry.

    Args:
        name: Raw biomarker name (may be alias, mixed case, etc.)

    Returns:
        Canonical biomarker name if the key is a known alias; otherwise the
        original ``name`` is returned unchanged.
    """
    # str.split() with no arguments drops leading/trailing whitespace and
    # splits on any whitespace run, which also covers the spaces introduced
    # by the underscore replacement.
    key = " ".join(name.replace("_", " ").lower().split())
    return BIOMARKER_ALIASES.get(key, name)


# Numeric literal: "140", "7.5" or ".5". The trailing negative lookahead
# rejects malformed runs such as "1.2.3" while still allowing a
# sentence-ending period to follow the number ("HbA1c: 7.5.").
_NUMBER_PATTERN = r"(\d+(?:\.\d+)?|\.\d+)(?!\.?\d)"

# Extraction patterns, tried in order; each captures (name, value).
_BIOMARKER_PATTERNS = (
    # "Glucose: 140" or "Glucose = 140" or "Glucose - 140".
    # The name may contain spaces/tabs but not newlines, so a preceding
    # header line is not glued onto the biomarker name.
    re.compile(
        r"([A-Za-z][A-Za-z0-9_ \t]{0,30})\s*[:=\-]\s*" + _NUMBER_PATTERN,
        re.IGNORECASE,
    ),
    # "Glucose 140 mg/dL" (value after name with optional unit and an
    # optional sentence-ending period).
    re.compile(
        r"\b([A-Za-z][A-Za-z0-9_]{0,15})\s+"
        + _NUMBER_PATTERN
        + r"\s*(?:mg/dL|mmol/L|%|g/dL|U/L|mIU/L|ng/mL|pg/mL|μmol/L|umol/L)?"
        + r"\.?(?:\s|,|$)",
        re.IGNORECASE,
    ),
)

# Common words that can precede a number but are never biomarker names.
_NON_BIOMARKER_WORDS = frozenset(
    {"the", "a", "an", "and", "or", "is", "was", "are", "were", "be"}
)


def _coerce_json_biomarkers(raw: dict[str, Any]) -> dict[str, float]:
    """Convert a decoded JSON object to {canonical name: float}, skipping bad values."""
    parsed: dict[str, float] = {}
    for key, value in raw.items():
        try:
            number = float(value)
        except (TypeError, ValueError):
            # Non-numeric entry (null, nested object, free text): skip just
            # this entry instead of discarding the whole payload.
            continue
        parsed[normalize_biomarker_name(key)] = number
    return parsed


def parse_biomarkers(text: str) -> dict[str, float]:
    """
    Parse biomarkers from natural language text or JSON.

    Supports formats like:
    - JSON: {"Glucose": 140, "HbA1c": 7.5}
    - Key-value: "Glucose: 140, HbA1c: 7.5"
    - Natural: "glucose 140 mg/dL and hba1c 7.5%"

    JSON input is tried first when the text starts with ``{``. Entries whose
    value cannot be converted to ``float`` are skipped individually. If the
    text is not valid JSON, or no entry could be converted, parsing falls
    back to the regex patterns. In the regex path the first value found for
    a canonical name wins.

    Args:
        text: Input text containing biomarker values

    Returns:
        Dictionary of normalized biomarker names to float values
    """
    text = text.strip()

    if not text:
        return {}

    # Try JSON first
    if text.startswith("{"):
        try:
            raw = json.loads(text)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            raw = None
        if isinstance(raw, dict):
            parsed = _coerce_json_biomarkers(raw)
            if parsed:
                return parsed

    biomarkers: dict[str, float] = {}

    for pattern in _BIOMARKER_PATTERNS:
        for match in pattern.finditer(text):
            name, value = match.groups()
            name = name.strip()

            # Skip common non-biomarker words
            if name.lower() in _NON_BIOMARKER_WORDS:
                continue

            canonical = normalize_biomarker_name(name)
            # Don't overwrite if already found (first match wins).
            # float() cannot fail here: the number pattern only matches
            # well-formed decimal literals.
            biomarkers.setdefault(canonical, float(value))

    return biomarkers


# ---------------------------------------------------------------------------
# Disease Scoring Heuristics
# ---------------------------------------------------------------------------

# Approximate clinical reference ranges, keyed by canonical biomarker name.
# Each entry is a (lower bound, upper bound, unit) triple.
BIOMARKER_REFERENCE_RANGES: dict[str, tuple[float, float, str]] = dict(
    Glucose=(70, 100, "mg/dL"),
    HbA1c=(4.0, 5.6, "%"),
    Cholesterol=(0, 200, "mg/dL"),
    LDL=(0, 100, "mg/dL"),
    # Higher is better, so the upper bound is a large placeholder value.
    HDL=(40, 999, "mg/dL"),
    Triglycerides=(0, 150, "mg/dL"),
    Hemoglobin=(12.0, 17.5, "g/dL"),
    TSH=(0.4, 4.0, "mIU/L"),
    Creatinine=(0.6, 1.2, "mg/dL"),
    ALT=(7, 56, "U/L"),
    AST=(10, 40, "U/L"),
    Systolic_BP=(90, 120, "mmHg"),
    Diastolic_BP=(60, 80, "mmHg"),
    BMI=(18.5, 24.9, "kg/m²"),
)


def classify_biomarker(name: str, value: float) -> str:
    """
    Compare a biomarker value against its reference range.

    Args:
        name: Canonical biomarker name (a key of BIOMARKER_REFERENCE_RANGES)
        value: Measured value

    Returns:
        "low" if the value is below the range, "high" if above it,
        "normal" if within it (bounds inclusive), or "unknown" when no
        reference range is registered for ``name``.
    """
    reference = BIOMARKER_REFERENCE_RANGES.get(name)
    if not reference:
        return "unknown"

    lower_bound, upper_bound, _unit = reference

    if value < lower_bound:
        return "low"
    return "high" if value > upper_bound else "normal"


def score_disease_diabetes(biomarkers: dict[str, float]) -> tuple[float, str]:
    """
    Score diabetes risk based on biomarkers.

    Reads "Glucose" and "HbA1c" from ``biomarkers``; a missing entry is
    treated as 0 and therefore contributes nothing to the score.

    Args:
        biomarkers: Dictionary of canonical biomarker names to values

    Returns:
        (score, severity): score is in the range 0-1; severity is "high"
        (score >= 0.7), "moderate" (score >= 0.4) or "low".
    """
    glucose = biomarkers.get("Glucose", 0)
    hba1c = biomarkers.get("HbA1c", 0)

    score = 0.0

    # HbA1c carries the largest weight: >= 6.5 is treated as the diabetes
    # threshold, 5.7 up to 6.5 as the prediabetes range.
    if hba1c >= 6.5:
        score += 0.5
    elif hba1c >= 5.7:
        score += 0.3

    # Fasting glucose: >= 126 is treated as the diabetes threshold,
    # 100 up to 126 as the prediabetes range.
    if glucose >= 126:
        score += 0.35
    elif glucose >= 100:
        score += 0.2

    # Clamp to the documented 0-1 range.
    score = min(1.0, score)

    if score >= 0.7:
        severity = "high"
    elif score >= 0.4:
        severity = "moderate"
    else:
        severity = "low"

    return score, severity


def score_disease_dyslipidemia(biomarkers: dict[str, float]) -> tuple[float, str]:
    """
    Score dyslipidemia risk from the lipid panel.

    Reads "Cholesterol", "LDL", "HDL" and "Triglycerides". Missing entries
    default to values that add nothing to the score (0 for the first, second
    and fourth; 999 for HDL, where higher is better).

    Returns:
        (score, severity): score is capped at 1.0; severity is "high"
        (score >= 0.6), "moderate" (score >= 0.3) or "low".
    """

    def tiered(value: float, upper_cut: float, upper_pts: float,
               lower_cut: float, lower_pts: float) -> float:
        # Points for a two-tier threshold; nothing below the lower cut.
        if value >= upper_cut:
            return upper_pts
        if value >= lower_cut:
            return lower_pts
        return 0.0

    total = 0.0
    total += tiered(biomarkers.get("Cholesterol", 0), 240, 0.3, 200, 0.15)
    total += tiered(biomarkers.get("LDL", 0), 160, 0.3, 130, 0.15)
    # HDL is the only marker where a low reading adds risk.
    total += 0.2 if biomarkers.get("HDL", 999) < 40 else 0.0
    total += tiered(biomarkers.get("Triglycerides", 0), 200, 0.2, 150, 0.1)

    total = min(1.0, total)

    label = "high" if total >= 0.6 else ("moderate" if total >= 0.3 else "low")
    return total, label


def score_disease_anemia(biomarkers: dict[str, float]) -> tuple[float, str]:
    """
    Score anemia risk from the "Hemoglobin" entry.

    Returns:
        (score, severity). A missing or zero hemoglobin value yields
        (0.0, "unknown"); otherwise the first band whose ceiling exceeds the
        value decides the result, and values of 13 or more are "normal".
    """
    level = biomarkers.get("Hemoglobin", 0)
    if not level:
        return 0.0, "unknown"

    # (exclusive ceiling, score, severity), ordered from most to least severe.
    bands = (
        (8, 0.9, "critical"),
        (10, 0.7, "high"),
        (12, 0.5, "moderate"),
        (13, 0.2, "low"),
    )
    for ceiling, risk, label in bands:
        if level < ceiling:
            return risk, label

    return 0.0, "normal"


def score_disease_thyroid(biomarkers: dict[str, float]) -> tuple[float, str, str]:
    """
    Score thyroid disorder risk from the "TSH" entry.

    Returns:
        (score, severity, direction). direction is "hypothyroid" for
        elevated TSH, "hyperthyroid" for suppressed TSH, and "none"
        otherwise. A missing or zero TSH yields (0.0, "unknown", "none").
    """
    level = biomarkers.get("TSH", 0)
    if not level:
        return 0.0, "unknown", "none"

    # Elevated TSH: above 10 is high risk, above 4.5 is moderate.
    if level > 4.5:
        if level > 10:
            return 0.8, "high", "hypothyroid"
        return 0.5, "moderate", "hypothyroid"

    # Suppressed TSH: below 0.1 is high risk, below 0.4 is moderate.
    if level < 0.4:
        if level < 0.1:
            return 0.8, "high", "hyperthyroid"
        return 0.5, "moderate", "hyperthyroid"

    return 0.0, "normal", "none"


def score_all_diseases(biomarkers: dict[str, float]) -> dict[str, dict[str, Any]]:
    """
    Run every disease scorer and collect the ones with a positive score.

    Args:
        biomarkers: Dictionary of biomarker values

    Returns:
        Dictionary keyed by "diabetes", "dyslipidemia", "anemia" and/or
        "thyroid"; each value is {"disease", "confidence", "severity"}.
        Scorers that return a score of 0 are omitted.
    """
    findings: dict[str, dict[str, Any]] = {}

    def record(key: str, label: str, confidence: float, severity: str) -> None:
        # Only conditions with a positive score are reported.
        if confidence > 0:
            findings[key] = {
                "disease": label,
                "confidence": confidence,
                "severity": severity,
            }

    record("diabetes", "Diabetes", *score_disease_diabetes(biomarkers))
    record("dyslipidemia", "Dyslipidemia", *score_disease_dyslipidemia(biomarkers))
    record("anemia", "Anemia", *score_disease_anemia(biomarkers))

    # The thyroid scorer also reports a direction, which picks the label.
    tsh_score, tsh_severity, tsh_direction = score_disease_thyroid(biomarkers)
    if tsh_direction == "hypothyroid":
        thyroid_label = "Hypothyroidism"
    else:
        thyroid_label = "Hyperthyroidism"
    record("thyroid", thyroid_label, tsh_score, tsh_severity)

    return findings


def get_primary_prediction(biomarkers: dict[str, float]) -> dict[str, Any]:
    """
    Pick the disease prediction with the highest confidence.

    Args:
        biomarkers: Dictionary of biomarker values

    Returns:
        The {"disease", "confidence", "severity"} entry with the largest
        confidence from score_all_diseases (the earliest entry wins ties),
        or a "General Health Screening" placeholder when nothing scored.
    """
    candidates = list(score_all_diseases(biomarkers).values())

    if candidates:
        return max(candidates, key=lambda entry: entry["confidence"])

    # No scorer produced a positive score: fall back to a generic result.
    return {
        "disease": "General Health Screening",
        "confidence": 0.5,
        "severity": "low",
    }


# ---------------------------------------------------------------------------
# Biomarker Flagging
# ---------------------------------------------------------------------------


def flag_biomarkers(biomarkers: dict[str, float]) -> list[dict[str, Any]]:
    """
    Flag abnormal biomarkers with classification and reference ranges.

    Every input biomarker produces one entry with "name", "value" and
    "status" (the result of classify_biomarker). Entries with a known
    reference range also carry "reference_range" and "unit". Only values
    classified as "low" or "high" get ``"flagged": True``; a biomarker
    without a reference range keeps status "unknown" but is not flagged,
    because there is no evidence that it is abnormal.

    Args:
        biomarkers: Dictionary of biomarker values

    Returns:
        List of biomarker entries, flagged ones first, then sorted by name
    """
    flags: list[dict[str, Any]] = []

    for name, value in biomarkers.items():
        status = classify_biomarker(name, value)
        entry: dict[str, Any] = {
            "name": name,
            "value": value,
            "status": status,
        }

        ranges = BIOMARKER_REFERENCE_RANGES.get(name)
        if ranges:
            low, high, unit = ranges
            entry["reference_range"] = f"{low}-{high} {unit}"
            entry["unit"] = unit

        # Flag only confirmed out-of-range values, not unknown biomarkers.
        if status in ("low", "high"):
            entry["flagged"] = True

        flags.append(entry)

    # Sort: flagged first, then by name
    flags.sort(key=lambda item: (not item.get("flagged", False), item["name"]))

    return flags


# ---------------------------------------------------------------------------
# Utility Functions
# ---------------------------------------------------------------------------


def format_confidence_percent(score: float) -> str:
    """
    Format a confidence score as a whole-number percentage string.

    The value is rounded to the nearest integer rather than truncated, so
    binary floating-point artefacts (``0.29 * 100 == 28.999999999999996``)
    do not shave a point off the displayed percentage.

    Args:
        score: Confidence as a fraction (e.g. 0.85)

    Returns:
        Percentage string such as "85%"
    """
    return f"{round(score * 100)}%"


def severity_to_emoji(severity: str) -> str:
    """
    Map a severity label to its display emoji.

    The lookup is case-insensitive; labels outside the known set map to
    the neutral "⚪" marker.
    """
    icons = dict(
        critical="🔴",
        high="🟠",
        moderate="🟡",
        low="🟢",
        normal="✅",
        unknown="❓",
    )
    key = severity.lower()
    if key in icons:
        return icons[key]
    return "⚪"