Spaces:

MedSwin
/

MedicalDiagnosisSystem

Runtime error

MedicalDiagnosisSystem / src /services /extractor.py

dylanglenister

FIX: Improve local llm loading

f73c316 3 months ago

16.5 kB

	# src/services/extractor.py

	import base64
	import json
	import mimetypes
	import re
	from typing import Any, Dict, List, Optional, Tuple

	from src.models.emr import ExtractedData, LabResult, Medication, VitalSigns
	from src.services import local_llm_service
	from src.services.gemini import gemini_chat
	from src.utils.logger import logger
	from src.utils.rotator import APIKeyRotator


	class EMRExtractor:
	    """Extract structured medical data from chat messages and documents with an LLM.

	    Prompts are sent to the local LLM service when it reports a loaded model and
	    to the Gemini API (through the supplied key rotator) otherwise. The class
	    also offers two regex-based helpers that need no model at all.
	    """

	    # Confidence used when the model response carries no "CONFIDENCE:" line.
	    _DEFAULT_CONFIDENCE = 0.5

	    # The numeric group cannot end in a bare period, so a sentence-final
	    # "CONFIDENCE: 0.85." still parses instead of making float() raise.
	    _CONFIDENCE_RE = re.compile(r'CONFIDENCE:\s*(\d+(?:\.\d+)?|\.\d+)')

	    # Medication names recognised by extract_medications_from_text().
	    _MEDICATION_NAMES = (
	        'acetaminophen', 'tylenol', 'ibuprofen', 'advil', 'motrin', 'aspirin', 'naproxen', 'aleve',
	        'metformin', 'insulin', 'glipizide', 'metoprolol', 'lisinopril', 'amlodipine', 'atorvastatin', 'simvastatin',
	        'omeprazole', 'pantoprazole', 'ranitidine', 'famotidine', 'sertraline', 'fluoxetine', 'paroxetine',
	        'prednisone', 'hydrocortisone', 'dexamethasone', 'methylprednisolone',
	        'warfarin', 'heparin', 'clopidogrel',
	        'furosemide', 'hydrochlorothiazide', 'spironolactone', 'triamterene',
	    )
	    _MEDICATION_RE = re.compile(r'\b(?:' + '|'.join(_MEDICATION_NAMES) + r')\b', re.IGNORECASE)

	    # (result key, pattern) pairs; group 1 of each pattern is the measured value.
	    # The leading \b keeps short keywords such as "hr" or "rr" from matching
	    # inside longer tokens (e.g. "4hr 20").
	    _VITAL_SIGN_PATTERNS = (
	        ('blood_pressure', re.compile(
	            r'\b(?:blood pressure|bp|pressure)\s*:?\s*(\d{2,3}/\d{2,3})', re.IGNORECASE)),
	        ('heart_rate', re.compile(
	            r'\b(?:heart rate|hr|pulse)\s*:?\s*(\d{2,3})\s*(?:bpm|beats per minute)?', re.IGNORECASE)),
	        ('temperature', re.compile(
	            r'\b(?:temperature|temp|fever)\s*:?\s*(\d{2,3}(?:\.\d)?)\s*(?:°?[fc])?', re.IGNORECASE)),
	        ('respiratory_rate', re.compile(
	            r'\b(?:respiratory rate|rr|breathing rate)\s*:?\s*(\d{1,2})\s*(?:breaths per minute|bpm)?', re.IGNORECASE)),
	        ('oxygen_saturation', re.compile(
	            r'\b(?:oxygen saturation|o2 sat|spo2)\s*:?\s*(\d{2,3})\s*%?', re.IGNORECASE)),
	    )

	    def __init__(self, gemini_rotator: APIKeyRotator):
	        """Store the API key rotator that is handed to gemini_chat on every call."""
	        self.gemini_rotator = gemini_rotator

	    async def extract_medical_data(self, message: str, patient_context: Optional[Dict[str, Any]] = None) -> Tuple[ExtractedData, float]:
	        """
	        Extract structured medical data from a chat message.

	        Args:
	            message: The chat message to analyze
	            patient_context: Optional patient context information

	        Returns:
	            Tuple of (ExtractedData, confidence_score). Any failure is logged and
	            reported as an empty ExtractedData with confidence 0.0 rather than
	            raised.
	        """
	        try:
	            prompt = self._build_extraction_prompt(message, patient_context)
	            response = await self._run_inference(prompt)
	            extracted_data, confidence = self._parse_gemini_response(response)

	            logger().info(f"Successfully extracted medical data with confidence {confidence:.2f}")
	            return extracted_data, confidence

	        except Exception as e:
	            # Deliberate best-effort boundary: callers always get a result tuple.
	            logger().error(f"Error extracting medical data: {e}")
	            return ExtractedData(), 0.0

	    async def _run_inference(self, prompt: str) -> str:
	        """Send the prompt to the local model when one is loaded, otherwise to Gemini."""
	        if local_llm_service.model_loaded:
	            # NOTE(review): get_inference is called without await, so it appears to
	            # run synchronously inside this coroutine - confirm it does not stall
	            # the event loop for long generations.
	            return local_llm_service.get_inference(prompt=prompt)
	        return await self._call_gemini_api(prompt)

	    @staticmethod
	    def _format_patient_context(patient_context: Optional[Dict[str, Any]]) -> str:
	        """Render the optional patient context block shared by both prompts.

	        Returns an empty string when no context is given. A missing or null
	        medication list is rendered as empty, a single string is kept whole, and
	        non-string entries are converted with str() so that join() cannot raise.
	        """
	        if not patient_context:
	            return ""

	        medications = patient_context.get('medications') or []
	        if isinstance(medications, str):
	            # Joining a bare string would split it into single characters.
	            medications = [medications]
	        medication_text = ', '.join(str(item) for item in medications)

	        return f"""
	Patient Context:
	- Name: {patient_context.get('name', 'Unknown')}
	- Age: {patient_context.get('age', 'Unknown')}
	- Sex: {patient_context.get('sex', 'Unknown')}
	- Current Medications: {medication_text}
	- Past Assessment Summary: {patient_context.get('past_assessment_summary', 'None')}
	"""

	    def _build_extraction_prompt(self, message: str, patient_context: Optional[Dict[str, Any]] = None) -> str:
	        """Build the prompt that asks the model to extract medical data from a message."""
	        context_info = self._format_patient_context(patient_context)

	        # NOTE(review): guideline 8 asks for "ONLY the JSON object" while the
	        # confidence section asks for a trailing CONFIDENCE line; the parser
	        # accepts both shapes, but the wording is contradictory.
	        prompt = f"""You are a medical AI assistant specialized in extracting structured medical data from clinical conversations.

	{context_info}

	Please analyze the following medical message and extract all relevant clinical information in the specified JSON format:

	Message: "{message}"

	Extract the following information and return ONLY a valid JSON object with this exact structure:

	{{
	"diagnosis": ["list of diagnoses mentioned"],
	"symptoms": ["list of symptoms described"],
	"medications": [
	{{
	"name": "medication name",
	"dosage": "dosage if mentioned",
	"frequency": "frequency if mentioned",
	"duration": "duration if mentioned"
	}}
	],
	"vital_signs": {{
	"blood_pressure": "value if mentioned",
	"heart_rate": "value if mentioned",
	"temperature": "value if mentioned",
	"respiratory_rate": "value if mentioned",
	"oxygen_saturation": "value if mentioned"
	}},
	"lab_results": [
	{{
	"test_name": "test name",
	"value": "test value",
	"unit": "unit if mentioned",
	"reference_range": "normal range if mentioned"
	}}
	],
	"procedures": ["list of procedures mentioned"],
	"notes": "additional clinical notes and observations"
	}}

	Guidelines:
	1. Only extract information that is explicitly mentioned or clearly implied
	2. Use medical terminology appropriately
	3. If a field has no relevant information, use an empty array [] or null
	4. For medications, only include those that are prescribed, recommended, or mentioned as current
	5. Extract vital signs only if specific values are mentioned
	6. Include lab results only if specific test values are provided
	7. Be conservative - it's better to miss something than to hallucinate information
	8. Return ONLY the JSON object, no additional text or explanation

	Confidence Assessment:
	After the JSON, provide a confidence score (0.0-1.0) based on:
	- Clarity of medical information in the message
	- Specificity of clinical details
	- Presence of measurable values (vitals, lab results)
	- Overall clinical relevance

	Format: CONFIDENCE: 0.85

	Return the JSON followed by the confidence score on a new line."""

	        return prompt

	    async def _call_gemini_api(self, prompt: str) -> str:
	        """Call gemini_chat with the prompt and this instance's key rotator.

	        Raises:
	            Exception: whatever gemini_chat raises, re-raised after being logged.
	        """
	        try:
	            response = await gemini_chat(prompt, self.gemini_rotator)
	            return response
	        except Exception as e:
	            logger().error(f"Error calling Gemini API: {e}")
	            raise

	    @classmethod
	    def _parse_confidence(cls, response: str) -> float:
	        """Read the "CONFIDENCE: x" score from a response, clamped to 0.0-1.0.

	        Returns the default confidence (0.5) when no score is present.
	        """
	        match = cls._CONFIDENCE_RE.search(response)
	        if match is None:
	            return cls._DEFAULT_CONFIDENCE
	        # The prompts define the score as 0.0-1.0; clamp anything outside it.
	        return min(1.0, max(0.0, float(match.group(1))))

	    @staticmethod
	    def _as_list(value: Any) -> List[Any]:
	        """Normalise a JSON field to a list: null -> [], scalar/object -> [value]."""
	        if value is None:
	            return []
	        if isinstance(value, list):
	            return value
	        return [value]

	    @staticmethod
	    def _field(mapping: Dict[str, Any], key: str, default: Any) -> Any:
	        """Like dict.get, but also substitutes the default for an explicit null."""
	        value = mapping.get(key)
	        return default if value is None else value

	    def _parse_gemini_response(self, response: str) -> Tuple[ExtractedData, float]:
	        """Parse a model response into structured data and a confidence score.

	        The response is expected to hold one JSON object, optionally followed by
	        a "CONFIDENCE: x" line. The prompts allow null for empty fields, so null
	        lists, null notes and null vital signs are all treated as empty.

	        Returns:
	            Tuple of (ExtractedData, confidence). With no JSON object in the
	            response the data is empty and the parsed (or default) confidence is
	            kept; on any parsing error the data is empty and confidence is 0.0.
	        """
	        try:
	            confidence = self._parse_confidence(response)

	            start = response.find('{')
	            if start == -1 or response.rfind('}') < start:
	                logger().warning("No JSON found in Gemini response")
	                return ExtractedData(), confidence

	            # Decode exactly one object starting at the first brace; unlike a
	            # greedy regex this ignores any trailing text that contains braces.
	            data, _ = json.JSONDecoder().raw_decode(response, start)

	            medications = [
	                Medication(
	                    name=self._field(med_data, 'name', ''),
	                    dosage=med_data.get('dosage'),
	                    frequency=med_data.get('frequency'),
	                    duration=med_data.get('duration')
	                )
	                for med_data in self._as_list(data.get('medications'))
	                if isinstance(med_data, dict)
	            ]

	            # Only build VitalSigns when at least one value is actually present.
	            vital_signs = None
	            vital_signs_data = data.get('vital_signs')
	            if isinstance(vital_signs_data, dict) and any(vital_signs_data.values()):
	                vital_signs = VitalSigns(
	                    blood_pressure=vital_signs_data.get('blood_pressure'),
	                    heart_rate=vital_signs_data.get('heart_rate'),
	                    temperature=vital_signs_data.get('temperature'),
	                    respiratory_rate=vital_signs_data.get('respiratory_rate'),
	                    oxygen_saturation=vital_signs_data.get('oxygen_saturation')
	                )

	            lab_results = [
	                LabResult(
	                    test_name=self._field(lab_data, 'test_name', ''),
	                    value=self._field(lab_data, 'value', ''),
	                    unit=lab_data.get('unit'),
	                    reference_range=lab_data.get('reference_range')
	                )
	                for lab_data in self._as_list(data.get('lab_results'))
	                if isinstance(lab_data, dict)
	            ]

	            notes = str(self._field(data, 'notes', ''))
	            overview = data.get('overview')
	            if overview:
	                # "overview" is only requested by the document-analysis prompt.
	                notes += f"\n\nDocument Overview: {overview}"

	            extracted_data = ExtractedData(
	                diagnosis=self._as_list(data.get('diagnosis')),
	                symptoms=self._as_list(data.get('symptoms')),
	                medications=medications,
	                vital_signs=vital_signs,
	                lab_results=lab_results,
	                procedures=self._as_list(data.get('procedures')),
	                notes=notes
	            )

	            return extracted_data, confidence

	        except json.JSONDecodeError as e:
	            logger().error(f"Error parsing JSON from Gemini response: {e}")
	            return ExtractedData(), 0.0
	        except Exception as e:
	            logger().error(f"Error parsing Gemini response: {e}")
	            return ExtractedData(), 0.0

	    def extract_medications_from_text(self, text: str) -> List[str]:
	        """Find known medication names in text using pattern matching.

	        Matching is case-insensitive and limited to the built-in name list.

	        Returns:
	            Each distinct medication once, spelled as first seen in the text and
	            ordered by first appearance. Empty list for empty or missing text.
	        """
	        if not text:
	            return []

	        first_spelling: Dict[str, str] = {}
	        for match in self._MEDICATION_RE.finditer(text):
	            found = match.group(0)
	            # Key on the lowercase form so "Aspirin" and "ASPIRIN" count once.
	            first_spelling.setdefault(found.lower(), found)

	        return list(first_spelling.values())

	    def extract_vital_signs_from_text(self, text: str) -> Dict[str, str]:
	        """Find vital sign values in text using pattern matching.

	        Each vital sign is looked up as "<keyword> [:] <value>", e.g.
	        "BP: 120/80" or "HR 72 bpm"; only the first occurrence is used.

	        Returns:
	            Dict with any of the keys blood_pressure, heart_rate, temperature,
	            respiratory_rate and oxygen_saturation mapped to the matched value
	            as a string. Empty dict for empty or missing text.
	        """
	        vital_signs: Dict[str, str] = {}
	        if not text:
	            return vital_signs

	        for key, pattern in self._VITAL_SIGN_PATTERNS:
	            match = pattern.search(text)
	            if match:
	                vital_signs[key] = match.group(1)

	        return vital_signs

	    async def analyze_document(self, file_content: bytes, filename: str, patient_context: Optional[Dict[str, Any]] = None) -> Tuple[ExtractedData, float]:
	        """
	        Analyze a medical document (PDF, image, or text) and extract structured medical data.

	        Args:
	            file_content: The binary content of the uploaded file
	            filename: The name of the uploaded file; its extension decides the MIME type
	            patient_context: Optional patient context information

	        Returns:
	            Tuple of (ExtractedData, confidence_score). Unknown file types and any
	            failure are logged and reported as empty data with confidence 0.0.
	        """
	        try:
	            mime_type, _ = mimetypes.guess_type(filename)

	            if not mime_type:
	                logger().warning(f"Unknown file type for {filename}")
	                return ExtractedData(), 0.0

	            # The file travels inside the text prompt as base64, so the prompt
	            # grows with the file size (about 4/3 of the raw byte count).
	            file_base64 = base64.b64encode(file_content).decode('utf-8')

	            prompt = self._build_document_analysis_prompt(file_base64, mime_type, filename, patient_context)
	            response = await self._run_inference(prompt)
	            extracted_data, confidence = self._parse_gemini_response(response)

	            logger().info(f"Successfully analyzed document {filename} with confidence {confidence:.2f}")
	            return extracted_data, confidence

	        except Exception as e:
	            # Deliberate best-effort boundary: callers always get a result tuple.
	            logger().error(f"Error analyzing document {filename}: {e}")
	            return ExtractedData(), 0.0

	    def _build_document_analysis_prompt(self, file_base64: str, mime_type: str, filename: str, patient_context: Optional[Dict[str, Any]] = None) -> str:
	        """Build the prompt that asks the model to analyze a base64-encoded document."""
	        context_info = self._format_patient_context(patient_context)

	        # Coarse content label shown to the model next to the exact MIME type.
	        if mime_type.startswith('image/'):
	            content_type = "image"
	        elif mime_type == 'application/pdf':
	            content_type = "pdf"
	        elif mime_type in ['application/msword', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']:
	            content_type = "document"
	        else:
	            content_type = "text"

	        prompt = f"""You are a medical AI assistant specialized in analyzing medical documents and extracting structured clinical information.

	{context_info}

	Please analyze the following medical document and extract all relevant clinical information in the specified JSON format.

	Document Information:
	- Filename: {filename}
	- Content Type: {content_type}
	- MIME Type: {mime_type}

	Document Content (Base64 encoded):
	{file_base64}

	Extract the following information and return ONLY a valid JSON object with this exact structure:

	{{
	"overview": "Brief summary of the document content and main findings",
	"diagnosis": ["list of diagnoses mentioned or identified"],
	"symptoms": ["list of symptoms described"],
	"medications": [
	{{
	"name": "medication name",
	"dosage": "dosage if mentioned",
	"frequency": "frequency if mentioned",
	"duration": "duration if mentioned"
	}}
	],
	"vital_signs": {{
	"blood_pressure": "value if mentioned",
	"heart_rate": "value if mentioned",
	"temperature": "value if mentioned",
	"respiratory_rate": "value if mentioned",
	"oxygen_saturation": "value if mentioned"
	}},
	"lab_results": [
	{{
	"test_name": "test name",
	"value": "test value",
	"unit": "unit if mentioned",
	"reference_range": "normal range if mentioned"
	}}
	],
	"procedures": ["list of procedures mentioned or performed"],
	"notes": "additional clinical notes and observations"
	}}

	Guidelines for Document Analysis:
	1. Carefully read and analyze the entire document content
	2. Extract information that is explicitly mentioned or clearly documented
	3. Use medical terminology appropriately and maintain accuracy
	4. If a field has no relevant information, use an empty array [] or null
	5. For medications, include all prescribed, recommended, or mentioned medications
	6. Extract vital signs only if specific values are documented
	7. Include lab results only if specific test values are provided
	8. Be thorough but conservative - prioritize accuracy over completeness
	9. For images, focus on visible text, charts, and medical data
	10. For PDFs and documents, analyze all text content systematically
	11. Return ONLY the JSON object, no additional text or explanation

	Confidence Assessment:
	After the JSON, provide a confidence score (0.0-1.0) based on:
	- Document clarity and readability
	- Specificity of medical information
	- Presence of measurable values (vitals, lab results)
	- Overall clinical relevance and completeness
	- Document type and quality

	Format: CONFIDENCE: 0.85

	Return the JSON followed by the confidence score on a new line."""

	        return prompt