"""
MediGuard AI — Biomarker Extraction Service
Extracts biomarker values from natural language text using LLM.
"""
from __future__ import annotations

import json
import logging
import math
import re
from typing import Any

from src.biomarker_normalization import normalize_biomarker_name
logger = logging.getLogger(__name__)
class ExtractionService:
    """Extract biomarker name/value pairs from natural language text.

    When an LLM client is configured, the text is sent to it with a prompt
    asking for a JSON object of biomarkers. When no client is configured, or
    anything in the LLM path fails, a regex scan of the text is used instead.
    """

    # Pattern: "Glucose: 140" or "Glucose = 140" or "glucose 140".
    # Compiled once at class-definition time instead of on every call.
    _VALUE_PATTERNS = (
        re.compile(
            r"([A-Za-z0-9_\s]+?)[\s:=]+(\d+\.?\d*)\s*(?:mg/dL|mmol/L|%|g/dL|U/L|mIU/L|cells/μL)?",
            re.IGNORECASE,
        ),
    )

    def __init__(self, llm=None):
        """Store the optional LLM client.

        Args:
            llm: Object exposing an awaitable ``ainvoke(prompt)`` whose result
                has a string ``content`` attribute. A falsy value (the
                default ``None``) selects regex-only extraction.
        """
        self._llm = llm

    @staticmethod
    def _coerce_value(value: Any) -> float:
        """Convert *value* to a finite float.

        Raises:
            TypeError: If *value* is a bool or a type ``float()`` rejects.
            ValueError: If *value* cannot be parsed or is NaN/infinite.
            OverflowError: If *value* is an int too large for a float.
        """
        # bool is an int subclass: float(True) == 1.0 would silently turn a
        # JSON `true` into a measurement.
        if isinstance(value, bool):
            raise TypeError(f"boolean is not a biomarker value: {value!r}")
        number = float(value)
        # json.loads accepts the non-standard NaN/Infinity literals.
        if not math.isfinite(number):
            raise ValueError(f"non-finite biomarker value: {value!r}")
        return number

    def _parse_llm_json(self, content: str) -> dict[str, Any]:
        """Parse a JSON object from LLM output with fallback recovery.

        A Markdown code fence is unwrapped first. The remaining text is then
        parsed as JSON; if that fails or does not yield an object, the span
        from the first ``{`` to the last ``}`` is tried.

        Args:
            content: Raw text returned by the LLM.

        Returns:
            The decoded JSON object.

        Raises:
            json.JSONDecodeError: If no JSON object can be recovered.
        """
        text = content.strip()
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0].strip()
        elif "```" in text:
            text = text.split("```")[1].split("```")[0].strip()

        candidates = [text]
        left = text.find("{")
        right = text.rfind("}")
        if left != -1 and right > left:
            inner = text[left : right + 1]
            if inner != text:
                candidates.append(inner)

        last_error: json.JSONDecodeError | None = None
        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError as exc:
                last_error = exc
                continue
            # Valid JSON that is not an object (list, number, string) cannot
            # be a biomarker mapping; keep looking.
            if isinstance(parsed, dict):
                return parsed

        if last_error is not None:
            raise last_error
        raise json.JSONDecodeError("LLM output is not a JSON object", text, 0)

    def _regex_extract(self, text: str) -> dict[str, float]:
        """Fallback regex-based extraction.

        Args:
            text: Free-form text to scan for ``name <sep> number`` pairs.

        Returns:
            Dict mapping the name returned by ``normalize_biomarker_name`` to
            the parsed value. Pairs for which normalization or conversion
            raises ``ValueError``/``KeyError`` are skipped; a later match for
            the same name overwrites an earlier one.
        """
        biomarkers: dict[str, float] = {}
        for pattern in self._VALUE_PATTERNS:
            for raw_name, raw_value in pattern.findall(text):
                try:
                    canonical = normalize_biomarker_name(raw_name.strip())
                    biomarkers[canonical] = self._coerce_value(raw_value)
                except (ValueError, KeyError):
                    continue
        return biomarkers

    async def extract_biomarkers(self, text: str) -> dict[str, float]:
        """Extract biomarkers from natural language text.

        Args:
            text: The user's message.

        Returns:
            Dict mapping biomarker names to values. Produced by the LLM when
            one is configured and its reply can be parsed, otherwise by the
            regex fallback.
        """
        if not self._llm:
            # Fallback to regex extraction
            return self._regex_extract(text)

        prompt = f"""You are a medical data extraction assistant.
Extract biomarker values from the user's message.
Known biomarkers (24 total):
Glucose, Cholesterol, Triglycerides, HbA1c, LDL, HDL, Insulin, BMI,
Hemoglobin, Platelets, WBC (White Blood Cells), RBC (Red Blood Cells),
Hematocrit, MCV, MCH, MCHC, Heart Rate, Systolic BP, Diastolic BP,
Troponin, C-reactive Protein, ALT, AST, Creatinine
User message: {text}
Extract all biomarker names and their values. Return ONLY valid JSON (no other text):
{{"Glucose": 140, "HbA1c": 7.5}}
If you cannot find any biomarkers, return {{}}.
"""
        # Deliberately broad: any failure in the LLM path (transport error,
        # unexpected response shape, unparseable reply) degrades to regex.
        try:
            response = await self._llm.ainvoke(prompt)
            content = response.content.strip()
            extracted = self._parse_llm_json(content)

            # Normalize biomarker names
            normalized: dict[str, float] = {}
            for key, value in extracted.items():
                try:
                    standard_name = normalize_biomarker_name(key)
                    normalized[standard_name] = self._coerce_value(value)
                except (ValueError, KeyError, TypeError, OverflowError):
                    # One bad entry must not discard the rest of the reply.
                    logger.warning("Skipping invalid biomarker: %s=%s", key, value)
                    continue
            return normalized
        except Exception as e:
            logger.warning("LLM extraction failed: %s, falling back to regex", e)
            return self._regex_extract(text)
def make_extraction_service(llm=None) -> ExtractionService:
    """Build an ExtractionService around the given (optional) LLM client."""
    service = ExtractionService(llm=llm)
    return service
|