Spaces:

T0X1N
/

Agentic-RagBot

Sleeping

App Files Files Community

Agentic-RagBot / src /services /extraction /service.py

T0X1N

chore: codebase audit and fixes (ruff, mypy, pytest)

9659593 1 day ago

raw

history blame contribute delete

3.75 kB

	"""
	MediGuard AI — Biomarker Extraction Service

	Extracts biomarker values from natural language text using LLM.
	"""

	from __future__ import annotations

	import json
	import logging
	import re
	from typing import Any

	from src.biomarker_normalization import normalize_biomarker_name

	# Module-level logger, named after this module via ``__name__``.
	logger = logging.getLogger(__name__)


	class ExtractionService:
	    """Extract biomarker readings from natural language text.

	    When an LLM client is supplied, extraction is delegated to it and the
	    regex scanner is used only as a fallback. Without a client, the regex
	    scanner is the sole extraction path.
	    """

	    # Matches "Glucose: 140", "Glucose = 140" or "glucose 140", optionally
	    # followed by a unit. The value accepts a single digit as well as
	    # decimals, and neither trailing whitespace nor a unit is required.
	    # Compiled once here rather than on every call.
	    _VALUE_PATTERNS = (
	        re.compile(
	            r"([A-Za-z0-9_\s]+?)[\s:=]+(\d+(?:\.\d+)?)\s*"
	            r"(?:mg/dL|mmol/L|%|g/dL|U/L|mIU/L|cells/μL)?",
	            re.IGNORECASE,
	        ),
	    )

	    def __init__(self, llm=None):
	        """Store the optional LLM client.

	        Args:
	            llm: Object exposing an awaitable ``ainvoke(prompt)`` whose result
	                has a ``content`` string, or ``None`` (or any falsy value)
	                to use regex extraction only.
	        """
	        self._llm = llm

	    def _parse_llm_json(self, content: str) -> dict[str, Any]:
	        """Parse the JSON object contained in an LLM reply.

	        Markdown code fences are stripped first. If the remaining text is not
	        valid JSON on its own, the span from the first ``{`` to the last ``}``
	        is tried as a recovery step.

	        Args:
	            content: Raw text returned by the LLM.

	        Returns:
	            The decoded JSON object.

	        Raises:
	            json.JSONDecodeError: If no parseable JSON can be recovered.
	            ValueError: If the decoded JSON is not an object (for example a
	                list or a bare number).
	        """
	        text = content.strip()

	        # Prefer an explicitly tagged ```json fence, then any fence at all.
	        if "```json" in text:
	            text = text.split("```json", 1)[1].split("```", 1)[0].strip()
	        elif "```" in text:
	            text = text.split("```")[1].strip()

	        try:
	            parsed = json.loads(text)
	        except json.JSONDecodeError:
	            left = text.find("{")
	            right = text.rfind("}")
	            if left == -1 or right <= left:
	                raise
	            # The reply wrapped the object in prose; parse just the braces.
	            parsed = json.loads(text[left : right + 1])

	        if not isinstance(parsed, dict):
	            raise ValueError(
	                f"Expected a JSON object from the LLM, got {type(parsed).__name__}"
	            )
	        return parsed

	    @staticmethod
	    def _scan_pairs(text: str) -> list[tuple[str, float]]:
	        """Return raw ``(name, value)`` pairs found by the regex patterns.

	        Names are whitespace-stripped but not normalized.
	        """
	        pairs: list[tuple[str, float]] = []
	        for pattern in ExtractionService._VALUE_PATTERNS:
	            for name, value in pattern.findall(text):
	                pairs.append((name.strip(), float(value)))
	        return pairs

	    def _regex_extract(self, text: str) -> dict[str, float]:
	        """Fallback regex-based extraction.

	        Args:
	            text: Free-form text to scan.

	        Returns:
	            Dict mapping normalized biomarker names to values. A pair whose
	            name makes ``normalize_biomarker_name`` raise ``ValueError`` or
	            ``KeyError`` is skipped.
	        """
	        biomarkers: dict[str, float] = {}
	        for name, value in self._scan_pairs(text):
	            try:
	                canonical = normalize_biomarker_name(name)
	            except (ValueError, KeyError):
	                continue
	            biomarkers[canonical] = value
	        return biomarkers

	    @staticmethod
	    def _build_prompt(text: str) -> str:
	        """Return the extraction prompt with the user's message embedded."""
	        return f"""You are a medical data extraction assistant.
	Extract biomarker values from the user's message.

	Known biomarkers (24 total):
	Glucose, Cholesterol, Triglycerides, HbA1c, LDL, HDL, Insulin, BMI,
	Hemoglobin, Platelets, WBC (White Blood Cells), RBC (Red Blood Cells),
	Hematocrit, MCV, MCH, MCHC, Heart Rate, Systolic BP, Diastolic BP,
	Troponin, C-reactive Protein, ALT, AST, Creatinine

	User message: {text}

	Extract all biomarker names and their values. Return ONLY valid JSON (no other text):
	{{"Glucose": 140, "HbA1c": 7.5}}

	If you cannot find any biomarkers, return {{}}.
	"""

	    async def extract_biomarkers(self, text: str) -> dict[str, float]:
	        """
	        Extract biomarkers from natural language text.

	        Uses the LLM when one was supplied. Any failure on the LLM path
	        (invocation, JSON parsing, normalization) is logged and the regex
	        scanner is used instead, so this method is best-effort and does not
	        propagate LLM errors.

	        Args:
	            text: The user's message.

	        Returns:
	            Dict mapping biomarker names to values. Empty when ``text`` is
	            empty or whitespace-only.
	        """
	        # Nothing to extract; avoid a pointless LLM round trip.
	        if not text or not text.strip():
	            return {}

	        if not self._llm:
	            # Fallback to regex extraction
	            return self._regex_extract(text)

	        try:
	            response = await self._llm.ainvoke(self._build_prompt(text))
	            extracted = self._parse_llm_json(response.content)

	            # Normalize biomarker names
	            normalized: dict[str, float] = {}
	            for key, value in extracted.items():
	                # float(True) == 1.0, so a JSON boolean would otherwise be
	                # recorded as a measurement.
	                if isinstance(value, bool):
	                    logger.warning("Skipping invalid biomarker: %s=%s", key, value)
	                    continue
	                try:
	                    standard_name = normalize_biomarker_name(key)
	                    normalized[standard_name] = float(value)
	                except (ValueError, KeyError, TypeError):
	                    logger.warning("Skipping invalid biomarker: %s=%s", key, value)
	                    continue

	            return normalized

	        except Exception as e:
	            # Deliberately broad: any LLM-path failure degrades to regex.
	            logger.warning("LLM extraction failed: %s, falling back to regex", e)
	            return self._regex_extract(text)


	def make_extraction_service(llm=None) -> ExtractionService:
	    """Build an ``ExtractionService`` around the given LLM client.

	    Args:
	        llm: Passed through unchanged as the service's ``llm`` argument;
	            defaults to ``None``.

	    Returns:
	        A newly constructed ``ExtractionService``.
	    """
	    service = ExtractionService(llm=llm)
	    return service