File size: 3,748 Bytes
3ca1d38
 
 
 
 
 
 
 
 
 
 
696f787
3ca1d38
 
 
 
 
 
 
 
 
 
 
 
696f787
3ca1d38
 
 
 
 
 
 
 
 
 
 
 
 
 
9659593
3ca1d38
 
696f787
3ca1d38
 
696f787
3ca1d38
 
 
 
696f787
3ca1d38
 
 
 
 
 
 
 
 
696f787
3ca1d38
 
696f787
3ca1d38
 
9659593
3ca1d38
 
 
 
 
 
696f787
3ca1d38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
696f787
3ca1d38
696f787
3ca1d38
 
696f787
3ca1d38
 
 
 
 
 
 
 
 
696f787
3ca1d38
696f787
3ca1d38
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""
MediGuard AI — Biomarker Extraction Service

Extracts biomarker values from natural language text using LLM.
"""

from __future__ import annotations

import json
import logging
import math
import re
from typing import Any

from src.biomarker_normalization import normalize_biomarker_name

logger = logging.getLogger(__name__)


class ExtractionService:
    """Extract biomarker readings from free-form text.

    An optional LLM client performs the primary extraction. A regular
    expression scan is used when no client is configured or when the LLM
    path raises. Names are passed through ``normalize_biomarker_name`` and
    values are stored as finite floats.
    """

    # Matches "Glucose: 140", "Glucose = 140" or "glucose 140", optionally
    # followed by a unit. Compiled once here rather than on every call.
    _VALUE_PATTERNS = (
        re.compile(
            r"([A-Za-z0-9_\s]+?)[\s:=]+(\d+\.?\d*)\s*(?:mg/dL|mmol/L|%|g/dL|U/L|mIU/L|cells/μL)?",
            re.IGNORECASE,
        ),
    )

    def __init__(self, llm=None):
        """Store the optional LLM client.

        Args:
            llm: Object exposing an awaitable ``ainvoke(prompt)`` whose result
                has a ``content`` string, or ``None`` to always use the regex
                fallback.
        """
        self._llm = llm

    @staticmethod
    def _to_measurement(value: Any) -> float:
        """Convert a raw extracted value to a finite float.

        Args:
            value: Number or numeric string produced by the LLM or the regex.

        Returns:
            The value as a float.

        Raises:
            TypeError: If ``value`` is a bool or cannot be converted at all
                (for example ``None``).
            ValueError: If ``value`` is not numeric text, or converts to
                NaN or an infinity.
        """
        # bool is an int subclass, so float(True) would silently become 1.0.
        if isinstance(value, bool):
            raise TypeError("boolean is not a biomarker measurement")
        number = float(value)
        # json.loads accepts NaN/Infinity, and float() overflows very long
        # digit strings to inf; neither is a usable reading.
        if not math.isfinite(number):
            raise ValueError(f"non-finite biomarker value: {value!r}")
        return number

    def _parse_llm_json(self, content: str) -> dict[str, Any]:
        """Parse the JSON object contained in an LLM reply.

        A Markdown code fence around the payload is removed first. If the
        remaining text is not valid JSON, the span from the first ``{`` to
        the last ``}`` is parsed instead.

        Args:
            content: Raw text returned by the LLM.

        Returns:
            The decoded JSON object.

        Raises:
            json.JSONDecodeError: If no parsable JSON can be recovered.
            ValueError: If the decoded JSON is not an object (for example a
                list or a bare number).
        """
        text = content.strip()

        # Unwrap ```json ... ``` or a plain ``` ... ``` fence.
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0].strip()
        elif "```" in text:
            text = text.split("```")[1].split("```")[0].strip()

        try:
            payload = json.loads(text)
        except json.JSONDecodeError:
            # Recovery: the model may have wrapped the object in prose.
            left = text.find("{")
            right = text.rfind("}")
            if left == -1 or right <= left:
                raise
            payload = json.loads(text[left : right + 1])

        # Enforce the declared return type instead of letting a list or
        # scalar fail later with an unrelated AttributeError.
        if not isinstance(payload, dict):
            raise ValueError(
                f"Expected a JSON object from the LLM, got {type(payload).__name__}"
            )
        return payload

    def _regex_extract(self, text: str) -> dict[str, float]:
        """Extract ``name value`` pairs from text with regular expressions.

        Candidates whose name is rejected by ``normalize_biomarker_name``
        (presumably via ``ValueError``/``KeyError`` for unknown names; confirm
        against that helper) or whose value is not a finite number are
        skipped. A later match for the same canonical name overwrites an
        earlier one.

        Args:
            text: Free-form user text.

        Returns:
            Dict mapping normalized biomarker names to float values.
        """
        biomarkers: dict[str, float] = {}

        for pattern in self._VALUE_PATTERNS:
            for name, value in pattern.findall(text):
                try:
                    canonical = normalize_biomarker_name(name.strip())
                    biomarkers[canonical] = self._to_measurement(value)
                except (ValueError, KeyError):
                    continue

        return biomarkers

    async def extract_biomarkers(self, text: str) -> dict[str, float]:
        """Extract biomarkers from natural language text.

        Uses the configured LLM when available. Falls back to regex
        extraction when no LLM is configured or when the LLM call or the
        parsing of its reply raises.

        Args:
            text: Free-form user message.

        Returns:
            Dict mapping normalized biomarker names to finite float values.
            Entries with an unrecognised name or an unusable value are
            logged and omitted.
        """
        if not self._llm:
            # Fallback to regex extraction
            return self._regex_extract(text)

        prompt = f"""You are a medical data extraction assistant. 
Extract biomarker values from the user's message.

Known biomarkers (24 total):
Glucose, Cholesterol, Triglycerides, HbA1c, LDL, HDL, Insulin, BMI,
Hemoglobin, Platelets, WBC (White Blood Cells), RBC (Red Blood Cells), 
Hematocrit, MCV, MCH, MCHC, Heart Rate, Systolic BP, Diastolic BP, 
Troponin, C-reactive Protein, ALT, AST, Creatinine

User message: {text}

Extract all biomarker names and their values. Return ONLY valid JSON (no other text):
{{"Glucose": 140, "HbA1c": 7.5}}

If you cannot find any biomarkers, return {{}}.
"""

        try:
            response = await self._llm.ainvoke(prompt)
            content = response.content.strip()
            extracted = self._parse_llm_json(content)

            # Normalize biomarker names and validate each value.
            normalized: dict[str, float] = {}
            for key, value in extracted.items():
                try:
                    standard_name = normalize_biomarker_name(key)
                    normalized[standard_name] = self._to_measurement(value)
                except (ValueError, KeyError, TypeError):
                    logger.warning("Skipping invalid biomarker: %s=%s", key, value)
                    continue

            return normalized

        except Exception as e:
            # Deliberate best-effort boundary: any LLM or parsing failure
            # degrades to the regex path instead of failing the request.
            logger.warning("LLM extraction failed: %s, falling back to regex", e)
            return self._regex_extract(text)


def make_extraction_service(llm=None) -> ExtractionService:
    """Build an ``ExtractionService`` around the given LLM client.

    Args:
        llm: Client forwarded unchanged to ``ExtractionService``; ``None``
            (the default) is passed through as-is.

    Returns:
        A newly constructed ``ExtractionService``.
    """
    service = ExtractionService(llm=llm)
    return service