Spaces:
Sleeping
Sleeping
| """ | |
| MediGuard AI — Biomarker Extraction Service | |
| Extracts biomarker values from natural language text using LLM. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import logging | |
| import re | |
| from typing import Any | |
| from src.biomarker_normalization import normalize_biomarker_name | |
| logger = logging.getLogger(__name__) | |
class ExtractionService:
    """Extract biomarker name/value pairs from natural language text.

    When an LLM client is supplied, the text is sent to it and the JSON reply
    is parsed; when no client is configured, or the LLM path raises, a regex
    scan of the text is used instead.
    """

    # Pattern: "Glucose: 140" or "Glucose = 140" or "glucose 140".
    # Compiled once at class creation instead of on every call.
    # NOTE(review): the name group also accepts whitespace and digits, so a
    # whole phrase in front of a number can be captured as the "name";
    # filtering of such names is left to normalize_biomarker_name.
    _FALLBACK_PATTERNS = (
        re.compile(
            r"([A-Za-z0-9_\s]+?)[\s:=]+(\d+\.?\d*)\s*(?:mg/dL|mmol/L|%|g/dL|U/L|mIU/L|cells/μL)?",
            re.IGNORECASE,
        ),
    )

    def __init__(self, llm=None):
        """Store the optional LLM client.

        Args:
            llm: Object exposing an awaitable ``ainvoke(prompt)`` whose result
                has a string ``content`` attribute, or None to use only the
                regex fallback.
        """
        self._llm = llm

    @staticmethod
    def _coerce_value(value: Any) -> float:
        """Convert an extracted value to a finite float.

        Raises:
            ValueError: If the value is a boolean, is not numeric, or is
                NaN/infinite.
            TypeError: If ``float()`` cannot accept the value (e.g. None).
            OverflowError: If an integer is too large for a float.
        """
        # bool is an int subclass; float(True) == 1.0 would be a bogus reading.
        if isinstance(value, bool):
            raise ValueError(f"boolean is not a biomarker value: {value!r}")
        number = float(value)
        # json.loads accepts NaN/Infinity and float() accepts "nan"/"inf";
        # neither is a usable measurement. (NaN is the only value != itself.)
        if number != number or number in (float("inf"), float("-inf")):
            raise ValueError(f"non-finite biomarker value: {value!r}")
        return number

    def _parse_llm_json(self, content: str) -> dict[str, Any]:
        """Parse a JSON object from LLM output with fallback recovery.

        A Markdown code fence around the payload is stripped first. If the
        remaining text is not itself a JSON object, the text is scanned for
        the first position at which a JSON object can be decoded, so prose
        before or after the object is tolerated.

        Args:
            content: Raw text returned by the LLM.

        Returns:
            The decoded JSON object.

        Raises:
            json.JSONDecodeError: If no JSON could be decoded from the text.
            ValueError: If the text is valid JSON but contains no object
                (for example a bare list or number).
        """
        text = content.strip()
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0].strip()
        elif "```" in text:
            text = text.split("```")[1].split("```")[0].strip()

        try:
            parsed = json.loads(text)
        except json.JSONDecodeError as exc:
            failure: Exception = exc
        else:
            if isinstance(parsed, dict):
                return parsed
            # Callers iterate .items(); anything but an object is unusable.
            failure = ValueError(
                f"Expected a JSON object from the LLM, got {type(parsed).__name__}"
            )

        # Recovery: decode the first JSON object that starts at some "{".
        # raw_decode stops at the end of that object, so trailing text (even
        # text containing braces) no longer breaks parsing.
        decoder = json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                candidate, _end = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, dict):
                return candidate
            start = text.find("{", start + 1)
        raise failure

    def _regex_extract(self, text: str) -> dict[str, float]:
        """Fallback regex-based extraction.

        Args:
            text: Free-form text to scan for "<name> <number>" pairs.

        Returns:
            Dict mapping normalized biomarker names to float values. Pairs
            whose name contains no letter, or for which normalization or
            float conversion raises ValueError/KeyError, are skipped.
        """
        biomarkers: dict[str, float] = {}
        for pattern in self._FALLBACK_PATTERNS:
            for raw_name, raw_value in pattern.findall(text):
                name = raw_name.strip()
                # The name group can match only whitespace or digits
                # (e.g. "140 150"); such a match is never a biomarker.
                if not any(ch.isalpha() for ch in name):
                    continue
                try:
                    canonical = normalize_biomarker_name(name)
                    biomarkers[canonical] = float(raw_value)
                except (ValueError, KeyError):
                    continue
        return biomarkers

    async def extract_biomarkers(self, text: str) -> dict[str, float]:
        """Extract biomarkers from natural language text.

        Args:
            text: The user's message.

        Returns:
            Dict mapping biomarker names to values. Empty when ``text`` is
            empty, None, or whitespace only (no LLM call is made then).
        """
        if not text or not text.strip():
            return {}

        if not self._llm:
            # Fallback to regex extraction
            return self._regex_extract(text)

        # NOTE(review): the user's text is interpolated into the prompt
        # verbatim, so it can carry instructions to the model; the reply is
        # therefore only trusted after JSON parsing and per-value validation.
        prompt = f"""You are a medical data extraction assistant.
Extract biomarker values from the user's message.
Known biomarkers (24 total):
Glucose, Cholesterol, Triglycerides, HbA1c, LDL, HDL, Insulin, BMI,
Hemoglobin, Platelets, WBC (White Blood Cells), RBC (Red Blood Cells),
Hematocrit, MCV, MCH, MCHC, Heart Rate, Systolic BP, Diastolic BP,
Troponin, C-reactive Protein, ALT, AST, Creatinine
User message: {text}
Extract all biomarker names and their values. Return ONLY valid JSON (no other text):
{{"Glucose": 140, "HbA1c": 7.5}}
If you cannot find any biomarkers, return {{}}.
"""
        try:
            response = await self._llm.ainvoke(prompt)
            content = response.content.strip()
            extracted = self._parse_llm_json(content)

            # Normalize biomarker names
            normalized: dict[str, float] = {}
            for key, value in extracted.items():
                try:
                    standard_name = normalize_biomarker_name(key)
                    normalized[standard_name] = self._coerce_value(value)
                except (ValueError, KeyError, TypeError, OverflowError):
                    # One bad entry must not discard the rest of the reply.
                    logger.warning("Skipping invalid biomarker: %s=%s", key, value)
                    continue
            return normalized
        except Exception as e:
            # Deliberate best-effort boundary: any LLM/parsing failure
            # degrades to the regex scan instead of propagating.
            logger.warning("LLM extraction failed: %s, falling back to regex", e)
            return self._regex_extract(text)
def make_extraction_service(llm=None) -> ExtractionService:
    """Build an ExtractionService around the given LLM client.

    Args:
        llm: Optional LLM client, forwarded unchanged to the service
            constructor. Defaults to None.

    Returns:
        A newly constructed ExtractionService.
    """
    service = ExtractionService(llm=llm)
    return service