Spaces:

T0X1N
/

Agentic-RagBot

Sleeping

File size: 3,748 Bytes

"""
MediGuard AI — Biomarker Extraction Service

Extracts biomarker values from natural language text using LLM.
"""

from __future__ import annotations

import json
import logging
import re
from typing import Any

from src.biomarker_normalization import normalize_biomarker_name

logger = logging.getLogger(__name__)


class ExtractionService:
    """Extracts biomarkers from natural language text."""

    def __init__(self, llm=None):
        self._llm = llm

    def _parse_llm_json(self, content: str) -> dict[str, Any]:
        """Parse JSON payload from LLM output with fallback recovery."""
        text = content.strip()

        if "```json" in text:
            text = text.split("```json")[1].split("```")[0].strip()
        elif "```" in text:
            text = text.split("```")[1].split("```")[0].strip()

        try:
            return json.loads(text)
        except json.JSONDecodeError:
            left = text.find("{")
            right = text.rfind("}")
            if left != -1 and right != -1 and right > left:
                return json.loads(text[left : right + 1])
            raise

    def _regex_extract(self, text: str) -> dict[str, float]:
        """Fallback regex-based extraction."""
        biomarkers = {}

        # Pattern: "Glucose: 140" or "Glucose = 140" or "glucose 140"
        patterns = [
            r"([A-Za-z0-9_\s]+?)[\s:=]+(\d+\.?\d*)\s*(?:mg/dL|mmol/L|%|g/dL|U/L|mIU/L|cells/μL)?",
        ]

        for pattern in patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            for name, value in matches:
                name = name.strip()
                try:
                    canonical = normalize_biomarker_name(name)
                    biomarkers[canonical] = float(value)
                except (ValueError, KeyError):
                    continue

        return biomarkers

    async def extract_biomarkers(self, text: str) -> dict[str, float]:
        """
        Extract biomarkers from natural language text.

        Returns:
            Dict mapping biomarker names to values
        """
        if not self._llm:
            # Fallback to regex extraction
            return self._regex_extract(text)

        prompt = f"""You are a medical data extraction assistant. 
Extract biomarker values from the user's message.

Known biomarkers (24 total):
Glucose, Cholesterol, Triglycerides, HbA1c, LDL, HDL, Insulin, BMI,
Hemoglobin, Platelets, WBC (White Blood Cells), RBC (Red Blood Cells), 
Hematocrit, MCV, MCH, MCHC, Heart Rate, Systolic BP, Diastolic BP, 
Troponin, C-reactive Protein, ALT, AST, Creatinine

User message: {text}

Extract all biomarker names and their values. Return ONLY valid JSON (no other text):
{{"Glucose": 140, "HbA1c": 7.5}}

If you cannot find any biomarkers, return {{}}.
"""

        try:
            response = await self._llm.ainvoke(prompt)
            content = response.content.strip()
            extracted = self._parse_llm_json(content)

            # Normalize biomarker names
            normalized = {}
            for key, value in extracted.items():
                try:
                    standard_name = normalize_biomarker_name(key)
                    normalized[standard_name] = float(value)
                except (ValueError, KeyError, TypeError):
                    logger.warning(f"Skipping invalid biomarker: {key}={value}")
                    continue

            return normalized

        except Exception as e:
            logger.warning(f"LLM extraction failed: {e}, falling back to regex")
            return self._regex_extract(text)


def make_extraction_service(llm=None) -> ExtractionService:
    """Factory function for extraction service."""
    return ExtractionService(llm=llm)