# T0X1N's picture
# chore: codebase audit and fixes (ruff, mypy, pytest)
# 9659593
"""
MediGuard AI — Biomarker Extraction Service
Extracts biomarker values from natural language text using LLM.
"""
from __future__ import annotations

import json
import logging
import math
import re
from typing import Any

from src.biomarker_normalization import normalize_biomarker_name
logger = logging.getLogger(__name__)
class ExtractionService:
"""Extracts biomarkers from natural language text."""
def __init__(self, llm=None):
self._llm = llm
def _parse_llm_json(self, content: str) -> dict[str, Any]:
"""Parse JSON payload from LLM output with fallback recovery."""
text = content.strip()
if "```json" in text:
text = text.split("```json")[1].split("```")[0].strip()
elif "```" in text:
text = text.split("```")[1].split("```")[0].strip()
try:
return json.loads(text)
except json.JSONDecodeError:
left = text.find("{")
right = text.rfind("}")
if left != -1 and right != -1 and right > left:
return json.loads(text[left : right + 1])
raise
def _regex_extract(self, text: str) -> dict[str, float]:
"""Fallback regex-based extraction."""
biomarkers = {}
# Pattern: "Glucose: 140" or "Glucose = 140" or "glucose 140"
patterns = [
r"([A-Za-z0-9_\s]+?)[\s:=]+(\d+\.?\d*)\s*(?:mg/dL|mmol/L|%|g/dL|U/L|mIU/L|cells/μL)?",
]
for pattern in patterns:
matches = re.findall(pattern, text, re.IGNORECASE)
for name, value in matches:
name = name.strip()
try:
canonical = normalize_biomarker_name(name)
biomarkers[canonical] = float(value)
except (ValueError, KeyError):
continue
return biomarkers
async def extract_biomarkers(self, text: str) -> dict[str, float]:
"""
Extract biomarkers from natural language text.
Returns:
Dict mapping biomarker names to values
"""
if not self._llm:
# Fallback to regex extraction
return self._regex_extract(text)
prompt = f"""You are a medical data extraction assistant.
Extract biomarker values from the user's message.
Known biomarkers (24 total):
Glucose, Cholesterol, Triglycerides, HbA1c, LDL, HDL, Insulin, BMI,
Hemoglobin, Platelets, WBC (White Blood Cells), RBC (Red Blood Cells),
Hematocrit, MCV, MCH, MCHC, Heart Rate, Systolic BP, Diastolic BP,
Troponin, C-reactive Protein, ALT, AST, Creatinine
User message: {text}
Extract all biomarker names and their values. Return ONLY valid JSON (no other text):
{{"Glucose": 140, "HbA1c": 7.5}}
If you cannot find any biomarkers, return {{}}.
"""
try:
response = await self._llm.ainvoke(prompt)
content = response.content.strip()
extracted = self._parse_llm_json(content)
# Normalize biomarker names
normalized = {}
for key, value in extracted.items():
try:
standard_name = normalize_biomarker_name(key)
normalized[standard_name] = float(value)
except (ValueError, KeyError, TypeError):
logger.warning(f"Skipping invalid biomarker: {key}={value}")
continue
return normalized
except Exception as e:
logger.warning(f"LLM extraction failed: {e}, falling back to regex")
return self._regex_extract(text)
def make_extraction_service(llm=None) -> ExtractionService:
    """Build an ExtractionService wired to the given (optional) LLM client."""
    service = ExtractionService(llm=llm)
    return service