File size: 16,507 Bytes
dcf4aad
ba5b57f
55b641e
e218489
55b641e
e218489
ba5b57f
 
dcf4aad
f73c316
 
ba5b57f
dcf4aad
ba5b57f
 
 
 
dcf4aad
458cb55
ba5b57f
dcf4aad
ba5b57f
 
 
dcf4aad
ba5b57f
 
 
dcf4aad
ba5b57f
 
 
 
 
 
dcf4aad
f73c316
 
 
 
 
dcf4aad
ba5b57f
 
dcf4aad
ba5b57f
 
dcf4aad
ba5b57f
 
 
 
dcf4aad
ba5b57f
 
dcf4aad
ba5b57f
 
 
 
 
 
 
 
 
 
dcf4aad
 
ba5b57f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dcf4aad
ba5b57f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dcf4aad
ba5b57f
 
 
458cb55
f73c316
ba5b57f
 
 
 
dcf4aad
ba5b57f
 
 
 
 
 
 
 
dcf4aad
ba5b57f
 
 
 
 
dcf4aad
ba5b57f
 
dcf4aad
ba5b57f
 
 
 
 
 
 
 
 
 
dcf4aad
ba5b57f
 
 
 
 
 
 
 
 
 
 
dcf4aad
ba5b57f
 
 
 
 
 
 
 
 
 
dcf4aad
ba5b57f
 
 
 
 
 
 
 
55b641e
ba5b57f
dcf4aad
ba5b57f
dcf4aad
ba5b57f
 
 
 
 
 
dcf4aad
ba5b57f
 
 
 
 
 
 
 
 
 
 
dcf4aad
ba5b57f
 
 
 
dcf4aad
ba5b57f
dcf4aad
ba5b57f
 
 
dcf4aad
ba5b57f
 
 
 
 
dcf4aad
ba5b57f
 
 
 
 
dcf4aad
ba5b57f
 
 
 
 
dcf4aad
ba5b57f
 
 
 
 
dcf4aad
ba5b57f
 
 
 
 
dcf4aad
ba5b57f
55b641e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e218489
55b641e
 
 
 
 
 
e218489
55b641e
 
 
f73c316
 
 
 
 
55b641e
 
 
 
 
 
 
 
 
 
 
 
 
 
e218489
55b641e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
# src/services/extractor.py

import base64
import json
import mimetypes
import re
from typing import Any, Dict, List, Optional, Tuple

from src.models.emr import ExtractedData, LabResult, Medication, VitalSigns
from src.services import local_llm_service
from src.services.gemini import gemini_chat
from src.utils.logger import logger
from src.utils.rotator import APIKeyRotator


class EMRExtractor:
    """Service for extracting structured medical data from chat messages using Gemini AI."""

    def __init__(self, gemini_rotator: APIKeyRotator) -> None:
        """Create the extractor.

        Args:
            gemini_rotator: API key rotator that is handed to ``gemini_chat``
                whenever this service calls the Gemini API.
        """
        self.gemini_rotator = gemini_rotator

    async def extract_medical_data(self, message: str, patient_context: Optional[Dict[str, Any]] = None) -> Tuple[ExtractedData, float]:
        """
        Turn a free-text chat message into structured medical data.

        The prompt is answered by the local LLM service when its model is
        loaded, otherwise by the Gemini API. This method never raises: any
        failure is logged and reported as empty data with confidence 0.0.

        Args:
            message: The chat message to analyze
            patient_context: Optional patient context information

        Returns:
            Tuple of (ExtractedData, confidence_score)
        """
        try:
            llm_prompt = self._build_extraction_prompt(message, patient_context)

            # Prefer the locally loaded model; fall back to Gemini otherwise.
            raw_reply = (
                local_llm_service.get_inference(prompt=llm_prompt)
                if local_llm_service.model_loaded
                else await self._call_gemini_api(llm_prompt)
            )

            parsed_data, score = self._parse_gemini_response(raw_reply)
            logger().info(f"Successfully extracted medical data with confidence {score:.2f}")
        except Exception as exc:
            logger().error(f"Error extracting medical data: {exc}")
            # Best-effort contract: empty result, zero confidence.
            return ExtractedData(), 0.0
        else:
            return parsed_data, score

    def _build_extraction_prompt(self, message: str, patient_context: Optional[Dict[str, Any]] = None) -> str:
        """Build the prompt for Gemini AI to extract medical data."""

        context_info = ""
        if patient_context:
            context_info = f"""
Patient Context:
- Name: {patient_context.get('name', 'Unknown')}
- Age: {patient_context.get('age', 'Unknown')}
- Sex: {patient_context.get('sex', 'Unknown')}
- Current Medications: {', '.join(patient_context.get('medications', []))}
- Past Assessment Summary: {patient_context.get('past_assessment_summary', 'None')}
"""

        prompt = f"""You are a medical AI assistant specialized in extracting structured medical data from clinical conversations.

{context_info}

Please analyze the following medical message and extract all relevant clinical information in the specified JSON format:

Message: "{message}"

Extract the following information and return ONLY a valid JSON object with this exact structure:

{{
    "diagnosis": ["list of diagnoses mentioned"],
    "symptoms": ["list of symptoms described"],
    "medications": [
        {{
            "name": "medication name",
            "dosage": "dosage if mentioned",
            "frequency": "frequency if mentioned",
            "duration": "duration if mentioned"
        }}
    ],
    "vital_signs": {{
        "blood_pressure": "value if mentioned",
        "heart_rate": "value if mentioned",
        "temperature": "value if mentioned",
        "respiratory_rate": "value if mentioned",
        "oxygen_saturation": "value if mentioned"
    }},
    "lab_results": [
        {{
            "test_name": "test name",
            "value": "test value",
            "unit": "unit if mentioned",
            "reference_range": "normal range if mentioned"
        }}
    ],
    "procedures": ["list of procedures mentioned"],
    "notes": "additional clinical notes and observations"
}}

Guidelines:
1. Only extract information that is explicitly mentioned or clearly implied
2. Use medical terminology appropriately
3. If a field has no relevant information, use an empty array [] or null
4. For medications, only include those that are prescribed, recommended, or mentioned as current
5. Extract vital signs only if specific values are mentioned
6. Include lab results only if specific test values are provided
7. Be conservative - it's better to miss something than to hallucinate information
8. Return ONLY the JSON object, no additional text or explanation

Confidence Assessment:
After the JSON, provide a confidence score (0.0-1.0) based on:
- Clarity of medical information in the message
- Specificity of clinical details
- Presence of measurable values (vitals, lab results)
- Overall clinical relevance

Format: CONFIDENCE: 0.85

Return the JSON followed by the confidence score on a new line."""

        return prompt

    async def _call_gemini_api(self, prompt: str) -> str:
        """Send ``prompt`` to Gemini through ``gemini_chat`` and return the reply.

        The call uses this service's key rotator. Any exception is logged and
        then re-raised unchanged so the caller decides how to recover.
        """
        try:
            return await gemini_chat(prompt, self.gemini_rotator)
        except Exception as exc:
            logger().error(f"Error calling Gemini API: {exc}")
            raise

    def _parse_gemini_response(self, response: str) -> Tuple[ExtractedData, float]:
        """Parse a model reply into structured data and a confidence score.

        The reply is expected to contain one JSON object and a
        ``CONFIDENCE: <number>`` marker, as requested by the prompts built in
        this class. Those prompts allow the model to answer ``null`` for any
        empty field, so ``null`` lists, vital signs, and notes are treated
        as "nothing extracted" instead of aborting the whole parse.

        Args:
            response: Raw text returned by the model.

        Returns:
            Tuple of (ExtractedData, confidence_score). The confidence
            defaults to 0.5 when no marker is present and is clamped to the
            0.0-1.0 range. If the reply has no JSON object the data is empty
            and the parsed confidence is kept; if parsing fails the data is
            empty and the confidence is 0.0.
        """

        def as_list(value: Any) -> List[Any]:
            # None -> [], a lone item -> [item], lists pass through unchanged.
            if value is None:
                return []
            if isinstance(value, list):
                return value
            return [value]

        try:
            # Extract confidence score. The pattern accepts only a well-formed
            # number, so a sentence-ending period ("CONFIDENCE: 0.85.") is not
            # captured and cannot break float().
            confidence = 0.5  # Default confidence
            confidence_match = re.search(r'CONFIDENCE:\s*([0-9]+(?:\.[0-9]+)?|\.[0-9]+)', response)
            if confidence_match:
                # The prompt asks for a 0.0-1.0 score; keep callers inside it.
                confidence = min(max(float(confidence_match.group(1)), 0.0), 1.0)

            # Locate the JSON object in the response
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if not json_match:
                logger().warning("No JSON found in Gemini response")
                return ExtractedData(), confidence

            # raw_decode reads exactly one JSON value and ignores whatever
            # follows it, so a stray "}" in trailing text is harmless.
            data, _ = json.JSONDecoder().raw_decode(response[json_match.start():])

            # Parse medications
            medications = [
                Medication(
                    name=med_data.get('name') or '',
                    dosage=med_data.get('dosage'),
                    frequency=med_data.get('frequency'),
                    duration=med_data.get('duration')
                )
                for med_data in as_list(data.get('medications'))
                if isinstance(med_data, dict)
            ]

            # Parse vital signs (only when at least one value is present)
            vital_signs_data = data.get('vital_signs')
            vital_signs = None
            if isinstance(vital_signs_data, dict) and any(vital_signs_data.values()):
                vital_signs = VitalSigns(
                    blood_pressure=vital_signs_data.get('blood_pressure'),
                    heart_rate=vital_signs_data.get('heart_rate'),
                    temperature=vital_signs_data.get('temperature'),
                    respiratory_rate=vital_signs_data.get('respiratory_rate'),
                    oxygen_saturation=vital_signs_data.get('oxygen_saturation')
                )

            # Parse lab results
            lab_results = []
            for lab_data in as_list(data.get('lab_results')):
                if not isinstance(lab_data, dict):
                    continue
                lab_value = lab_data.get('value')
                lab_results.append(LabResult(
                    test_name=lab_data.get('test_name') or '',
                    # Explicit None check so a legitimate 0 is not discarded.
                    value='' if lab_value is None else lab_value,
                    unit=lab_data.get('unit'),
                    reference_range=lab_data.get('reference_range')
                ))

            # Notes, with the document overview (if any) appended
            notes = data.get('notes') or ''
            overview = data.get('overview')
            if overview:
                notes = f"{notes}\n\nDocument Overview: {overview}"

            # Create ExtractedData object
            extracted_data = ExtractedData(
                diagnosis=as_list(data.get('diagnosis')),
                symptoms=as_list(data.get('symptoms')),
                medications=medications,
                vital_signs=vital_signs,
                lab_results=lab_results,
                procedures=as_list(data.get('procedures')),
                notes=notes
            )

            return extracted_data, confidence

        except json.JSONDecodeError as e:
            logger().error(f"Error parsing JSON from Gemini response: {e}")
            return ExtractedData(), 0.0
        except Exception as e:
            logger().error(f"Error parsing Gemini response: {e}")
            return ExtractedData(), 0.0

    def extract_medications_from_text(self, text: str) -> List[str]:
        """Extract medication names from text using pattern matching."""
        # Common medication patterns
        medication_patterns = [
            r'\b(?:acetaminophen|tylenol|ibuprofen|advil|motrin|aspirin|naproxen|aleve)\b',
            r'\b(?:metformin|insulin|glipizide|metoprolol|lisinopril|amlodipine|atorvastatin|simvastatin)\b',
            r'\b(?:omeprazole|pantoprazole|ranitidine|famotidine|sertraline|fluoxetine|paroxetine)\b',
            r'\b(?:prednisone|hydrocortisone|dexamethasone|methylprednisolone)\b',
            r'\b(?:warfarin|heparin|clopidogrel|aspirin)\b',
            r'\b(?:furosemide|hydrochlorothiazide|spironolactone|triamterene)\b'
        ]

        medications = set()
        for pattern in medication_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            medications.update(matches)

        return list(medications)

    def extract_vital_signs_from_text(self, text: str) -> Dict[str, str]:
        """Extract vital signs from text using pattern matching."""
        vital_signs = {}

        # Blood pressure patterns
        bp_pattern = r'(?:blood pressure|bp|pressure)\s*:?\s*(\d{2,3}/\d{2,3})'
        bp_match = re.search(bp_pattern, text, re.IGNORECASE)
        if bp_match:
            vital_signs['blood_pressure'] = bp_match.group(1)

        # Heart rate patterns
        hr_pattern = r'(?:heart rate|hr|pulse)\s*:?\s*(\d{2,3})\s*(?:bpm|beats per minute)?'
        hr_match = re.search(hr_pattern, text, re.IGNORECASE)
        if hr_match:
            vital_signs['heart_rate'] = hr_match.group(1)

        # Temperature patterns
        temp_pattern = r'(?:temperature|temp|fever)\s*:?\s*(\d{2,3}(?:\.\d)?)\s*(?:°?[fc])?'
        temp_match = re.search(temp_pattern, text, re.IGNORECASE)
        if temp_match:
            vital_signs['temperature'] = temp_match.group(1)

        # Respiratory rate patterns
        rr_pattern = r'(?:respiratory rate|rr|breathing rate)\s*:?\s*(\d{1,2})\s*(?:breaths per minute|bpm)?'
        rr_match = re.search(rr_pattern, text, re.IGNORECASE)
        if rr_match:
            vital_signs['respiratory_rate'] = rr_match.group(1)

        # Oxygen saturation patterns
        o2_pattern = r'(?:oxygen saturation|o2 sat|spo2)\s*:?\s*(\d{2,3})\s*%?'
        o2_match = re.search(o2_pattern, text, re.IGNORECASE)
        if o2_match:
            vital_signs['oxygen_saturation'] = o2_match.group(1)

        return vital_signs

    async def analyze_document(self, file_content: bytes, filename: str, patient_context: Optional[Dict[str, Any]] = None) -> Tuple[ExtractedData, float]:
        """
        Extract structured medical data from an uploaded document.

        The MIME type is guessed from ``filename``; files whose type cannot be
        guessed are rejected with empty data. The file bytes are base64-encoded
        and embedded in the prompt, which is answered by the local LLM service
        when its model is loaded and by the Gemini API otherwise. This method
        never raises: failures are logged and reported as empty data with
        confidence 0.0.

        Args:
            file_content: The binary content of the uploaded file
            filename: The name of the uploaded file
            patient_context: Optional patient context information

        Returns:
            Tuple of (ExtractedData, confidence_score)
        """
        try:
            # Only the guessed type is needed; the encoding part is ignored.
            guessed_type = mimetypes.guess_type(filename)[0]
            if not guessed_type:
                logger().warning(f"Unknown file type for {filename}")
                return ExtractedData(), 0.0

            encoded_content = base64.b64encode(file_content).decode('utf-8')
            document_prompt = self._build_document_analysis_prompt(
                encoded_content, guessed_type, filename, patient_context
            )

            # Gemini is the fallback when no local model is loaded.
            if not local_llm_service.model_loaded:
                raw_reply = await self._call_gemini_api(document_prompt)
            else:
                raw_reply = local_llm_service.get_inference(prompt=document_prompt)

            parsed_data, score = self._parse_gemini_response(raw_reply)
            logger().info(f"Successfully analyzed document {filename} with confidence {score:.2f}")
        except Exception as exc:
            logger().error(f"Error analyzing document {filename}: {exc}")
            # Best-effort contract: empty result, zero confidence.
            return ExtractedData(), 0.0
        else:
            return parsed_data, score

    def _build_document_analysis_prompt(self, file_base64: str, mime_type: str, filename: str, patient_context: Optional[Dict[str, Any]] = None) -> str:
        """Build the prompt for Gemini AI to analyze medical documents."""

        context_info = ""
        if patient_context:
            context_info = f"""
Patient Context:
- Name: {patient_context.get('name', 'Unknown')}
- Age: {patient_context.get('age', 'Unknown')}
- Sex: {patient_context.get('sex', 'Unknown')}
- Current Medications: {', '.join(patient_context.get('medications', []))}
- Past Assessment Summary: {patient_context.get('past_assessment_summary', 'None')}
"""

        # Determine the content type for Gemini
        if mime_type.startswith('image/'):
            content_type = "image"
        elif mime_type == 'application/pdf':
            content_type = "pdf"
        elif mime_type in ['application/msword', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']:
            content_type = "document"
        else:
            content_type = "text"

        prompt = f"""You are a medical AI assistant specialized in analyzing medical documents and extracting structured clinical information.

{context_info}

Please analyze the following medical document and extract all relevant clinical information in the specified JSON format.

Document Information:
- Filename: {filename}
- Content Type: {content_type}
- MIME Type: {mime_type}

Document Content (Base64 encoded):
{file_base64}

Extract the following information and return ONLY a valid JSON object with this exact structure:

{{
    "overview": "Brief summary of the document content and main findings",
    "diagnosis": ["list of diagnoses mentioned or identified"],
    "symptoms": ["list of symptoms described"],
    "medications": [
        {{
            "name": "medication name",
            "dosage": "dosage if mentioned",
            "frequency": "frequency if mentioned",
            "duration": "duration if mentioned"
        }}
    ],
    "vital_signs": {{
        "blood_pressure": "value if mentioned",
        "heart_rate": "value if mentioned",
        "temperature": "value if mentioned",
        "respiratory_rate": "value if mentioned",
        "oxygen_saturation": "value if mentioned"
    }},
    "lab_results": [
        {{
            "test_name": "test name",
            "value": "test value",
            "unit": "unit if mentioned",
            "reference_range": "normal range if mentioned"
        }}
    ],
    "procedures": ["list of procedures mentioned or performed"],
    "notes": "additional clinical notes and observations"
}}

Guidelines for Document Analysis:
1. Carefully read and analyze the entire document content
2. Extract information that is explicitly mentioned or clearly documented
3. Use medical terminology appropriately and maintain accuracy
4. If a field has no relevant information, use an empty array [] or null
5. For medications, include all prescribed, recommended, or mentioned medications
6. Extract vital signs only if specific values are documented
7. Include lab results only if specific test values are provided
8. Be thorough but conservative - prioritize accuracy over completeness
9. For images, focus on visible text, charts, and medical data
10. For PDFs and documents, analyze all text content systematically
11. Return ONLY the JSON object, no additional text or explanation

Confidence Assessment:
After the JSON, provide a confidence score (0.0-1.0) based on:
- Document clarity and readability
- Specificity of medical information
- Presence of measurable values (vitals, lab results)
- Overall clinical relevance and completeness
- Document type and quality

Format: CONFIDENCE: 0.85

Return the JSON followed by the confidence score on a new line."""

        return prompt