File size: 6,558 Bytes
8a693e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
"""

PDF Extraction Module for Lab Reports

Extracts lab test names, values, and ranges from uploaded PDF files

"""

import pdfplumber
import re
from typing import Dict, List, Optional
from dataclasses import dataclass

@dataclass
class LabResult:
    """One lab measurement as read from a report.

    Every field is stored as a string; no numeric conversion is performed
    by this class.
    """
    # Name of the test as it appears in the report.
    test_name: str
    # Measured value, kept as text.
    value: str
    # Unit of measurement; may be an empty string.
    unit: str
    # Reference range text; may be an empty string.
    reference_range: str
    # One of 'normal', 'high', 'low' or 'unknown'.
    status: str

class LabReportExtractor:
    """Extract structured lab results from lab report PDFs.

    Every page is processed twice: the tables pdfplumber detects are read
    column-wise, and the page's plain text is scanned with the regular
    expressions in ``test_patterns``. Results from both passes are merged
    and de-duplicated on (lower-cased test name, value).

    Attributes:
        test_patterns: Regex source strings, one per family of test names.
            Each pattern captures four groups: test name, numeric value,
            optional unit and a (possibly empty) reference range.
    """

    # Alternations of test names recognised in free text.
    # NOTE: names are matched without word boundaries, so e.g. "LDL" is
    # also found inside "VLDL".
    _TEST_NAME_GROUPS = (
        r'(Hemoglobin|Hgb|Hb)',
        r'(WBC|White Blood Cell|Leukocyte)',
        r'(Glucose|Blood Sugar)',
        r'(Iron|Ferritin)',
        r'(Cholesterol|LDL|HDL)',
    )

    # Shared remainder of every text pattern.
    #   group 2: numeric value
    #   group 3: optional unit; the lookahead stops it from swallowing a
    #            following "Ref Range" label
    #   group 4: reference range; it may be empty so that a missing range
    #            never forces the regex to backtrack into the value or unit
    #            (which previously turned "Glucose: 95" into value "9",
    #            range "5")
    _PATTERN_TAIL = (
        r'\s*:?\s*([\d.]+)'
        r'\s*((?!Ref\.?\s*Range)[a-zA-Z/]+)?'
        r'\s*(?:Ref\.?\s*Range:?\s*)?'
        r'([\d.\-\s]*)'
    )

    def __init__(self):
        """Build the list of text patterns, one per test-name family."""
        self.test_patterns = [
            names + self._PATTERN_TAIL for names in self._TEST_NAME_GROUPS
        ]

    def extract_from_pdf(self, pdf_path: str) -> List["LabResult"]:
        """Extract lab results from a PDF file.

        Args:
            pdf_path: Path handed to ``pdfplumber.open``.

        Returns:
            De-duplicated results from all pages, tables first and then
            text matches, in page order.
        """
        results = []

        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Tables are the more structured source, so read them first.
                tables = page.extract_tables()
                if tables:
                    results.extend(self._parse_tables(tables))

                # The text pass always runs in addition to the table pass.
                # Guard against a page that yields no text at all (some
                # pdfplumber versions are reported to return None here).
                text = page.extract_text() or ''
                results.extend(self._parse_text(text))

        return self._deduplicate_results(results)

    @staticmethod
    def _cell(row: List, index: Optional[int]) -> str:
        """Return the stripped text of ``row[index]``, or '' if unavailable.

        '' is returned when ``index`` is None, when the row is too short,
        or when the cell itself is None.
        """
        if index is None or index >= len(row):
            return ''
        cell = row[index]
        return '' if cell is None else str(cell).strip()

    def _parse_tables(self, tables: List) -> List["LabResult"]:
        """Parse lab results from extracted tables.

        The first row of each table is treated as the header. Columns are
        located by keyword; a table without both a test-name column and a
        value column contributes nothing.

        Args:
            tables: Tables as lists of rows, each row a list of cells
                (strings or None).

        Returns:
            One LabResult per data row that has a non-empty test name and
            value.
        """
        results = []

        for table in tables:
            if not table or len(table) < 2:
                continue

            # Assume first row is header
            headers = [str(h).lower() if h else '' for h in table[0]]

            test_col = self._find_column(headers, ['test', 'name', 'component'])
            value_col = self._find_column(headers, ['value', 'result'])
            unit_col = self._find_column(headers, ['unit', 'units'])
            range_col = self._find_column(headers, ['range', 'reference', 'normal'])

            # Without both mandatory columns no row can yield a result.
            if test_col is None or value_col is None:
                continue

            for row in table[1:]:
                if not row:
                    continue

                # _cell tolerates short rows and None cells, so a ragged
                # row cannot raise IndexError on the unit or range column.
                test_name = self._cell(row, test_col)
                value = self._cell(row, value_col)
                if not (test_name and value):
                    continue

                ref_range = self._cell(row, range_col)
                results.append(LabResult(
                    test_name=test_name,
                    value=value,
                    unit=self._cell(row, unit_col),
                    reference_range=ref_range,
                    status=self._determine_status(value, ref_range)
                ))

        return results

    def _parse_text(self, text: str) -> List["LabResult"]:
        """Parse lab results from plain text using ``test_patterns``.

        Args:
            text: Page text; None or '' yields an empty list.

        Returns:
            One LabResult per regex match, in pattern order.
        """
        results = []
        if not text:
            return results

        for pattern in self.test_patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                groups = match.groups()
                # Patterns need at least a name group and a value group.
                if len(groups) < 2:
                    continue

                test_name, value = groups[0], groups[1]
                unit = groups[2] if len(groups) > 2 else None
                ref_range = groups[3] if len(groups) > 3 else None
                # The range group also matches whitespace and newlines;
                # keep only the meaningful part.
                ref_range = (ref_range or '').strip()

                results.append(LabResult(
                    test_name=test_name,
                    value=value,
                    unit=unit or '',
                    reference_range=ref_range,
                    status=self._determine_status(value, ref_range)
                ))

        return results

    def _find_column(self, headers: List[str], keywords: List[str]) -> Optional[int]:
        """Return the index of the first header containing any keyword.

        Matching is by substring; returns None when no header matches.
        """
        for index, header in enumerate(headers):
            if any(keyword in header for keyword in keywords):
                return index
        return None

    def _determine_status(self, value: str, ref_range: str) -> str:
        """Classify a value against a "low - high" reference range.

        Args:
            value: Measured value; thousands separators (',') are removed
                before conversion to float.
            ref_range: Text containing a range such as "12.0 - 16.0". A
                hyphen, en dash or em dash is accepted as the separator.

        Returns:
            'low', 'high' or 'normal' when both the value and the range can
            be parsed, otherwise 'unknown'. Never raises for None inputs
            (empty table cells arrive as None).
        """
        if value is None or ref_range is None:
            return 'unknown'

        bounds = re.search(
            r'([\d.]+)\s*[-\u2013\u2014]\s*([\d.]+)', str(ref_range)
        )
        if bounds is None:
            return 'unknown'

        try:
            val = float(str(value).replace(',', ''))
            low = float(bounds.group(1))
            high = float(bounds.group(2))
        except ValueError:
            # e.g. "abc" as a value or "1.2.3" as a bound
            return 'unknown'

        if val < low:
            return 'low'
        if val > high:
            return 'high'
        return 'normal'

    def _deduplicate_results(self, results: List["LabResult"]) -> List["LabResult"]:
        """Drop results whose (lower-cased test name, value) was already seen.

        The first occurrence wins and the original order is preserved.
        """
        seen = set()
        unique = []

        for result in results:
            key = (result.test_name.lower(), result.value)
            if key not in seen:
                seen.add(key)
                unique.append(result)

        return unique

# Example usage: print every result found in a sample report.
if __name__ == "__main__":
    for entry in LabReportExtractor().extract_from_pdf("sample_lab_report.pdf"):
        # First line: name, value, unit and status; second line: the range.
        print(f"{entry.test_name}: {entry.value} {entry.unit} [{entry.status}]")
        print(f"  Reference: {entry.reference_range}")