Spaces:
Sleeping
Sleeping
File size: 6,558 Bytes
"""
PDF Extraction Module for Lab Reports.

Extracts lab test names, values, and ranges from uploaded PDF files.
"""
import pdfplumber
import re
from typing import Dict, List, Optional
from dataclasses import dataclass
@dataclass
class LabResult:
    """Represents a single lab test result.

    Attributes:
        test_name: Name of the test as it appears in the report.
        value: Measured value, kept as the text found in the report.
        unit: Unit of measurement; '' when the report gives none.
        reference_range: Reference range text; '' when the report gives none.
        status: One of 'normal', 'high', 'low' or 'unknown'.
    """

    test_name: str
    value: str
    # The optional fields default to the same "missing" markers the extractor
    # uses, so a result can be built from just a name and a value.
    unit: str = ''
    reference_range: str = ''
    status: str = 'unknown'  # 'normal', 'high', 'low', 'unknown'
class LabReportExtractor:
    """Extract structured data from lab report PDFs.

    Every page is processed twice: its tables are parsed by column header,
    and its plain text is scanned with the regular expressions in
    ``test_patterns``. The combined results are de-duplicated on
    ``(test name, value)``.

    Attributes:
        test_patterns: Regular-expression strings, matched case-insensitively.
            Each built-in pattern has four capture groups, in order:
            test name, value, optional unit, reference range.
    """

    def __init__(self):
        # Common lab test patterns
        self.test_patterns = [
            r'(Hemoglobin|Hgb|Hb)\s*:?\s*([\d.]+)\s*([a-zA-Z/]+)?\s*(?:Ref\.?\s*Range:?\s*)?([\d.\-\s]+)',
            r'(WBC|White Blood Cell|Leukocyte)\s*:?\s*([\d.]+)\s*([a-zA-Z/]+)?\s*(?:Ref\.?\s*Range:?\s*)?([\d.\-\s]+)',
            r'(Glucose|Blood Sugar)\s*:?\s*([\d.]+)\s*([a-zA-Z/]+)?\s*(?:Ref\.?\s*Range:?\s*)?([\d.\-\s]+)',
            r'(Iron|Ferritin)\s*:?\s*([\d.]+)\s*([a-zA-Z/]+)?\s*(?:Ref\.?\s*Range:?\s*)?([\d.\-\s]+)',
            r'(Cholesterol|LDL|HDL)\s*:?\s*([\d.]+)\s*([a-zA-Z/]+)?\s*(?:Ref\.?\s*Range:?\s*)?([\d.\-\s]+)',
        ]

    def extract_from_pdf(self, pdf_path: str) -> "List[LabResult]":
        """Extract lab results from a PDF file.

        Args:
            pdf_path: Path of the PDF, passed to ``pdfplumber.open``.

        Returns:
            The de-duplicated results of all pages. Within a page, table
            results come before text-pattern results, so a table row wins
            over a text match with the same test name and value.
        """
        results = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Try to extract tables first (more structured)
                tables = page.extract_tables()
                if tables:
                    results.extend(self._parse_tables(tables))

                # Pattern matching always runs as well; duplicates of table
                # rows are removed below. A page without a text layer may
                # yield None instead of a string, so normalise to ''.
                text = page.extract_text() or ''
                results.extend(self._parse_text(text))

        # Remove duplicates
        return self._deduplicate_results(results)

    def _parse_tables(self, tables: List) -> "List[LabResult]":
        """Parse lab results from extracted tables.

        The first row of each table is treated as the header; columns are
        located by keyword. Tables lacking a test-name or value column, and
        rows lacking a test name or value, are skipped.

        Args:
            tables: Tables as lists of rows, each row a list of cells.
                A cell may be None.

        Returns:
            One LabResult per usable data row, in table and row order.
        """
        results = []
        for table in tables:
            if not table or len(table) < 2:
                continue

            # Assume first row is header
            headers = [str(h).lower() if h else '' for h in table[0]]

            # Find relevant columns
            test_col = self._find_column(headers, ['test', 'name', 'component'])
            value_col = self._find_column(headers, ['value', 'result'])
            unit_col = self._find_column(headers, ['unit', 'units'])
            range_col = self._find_column(headers, ['range', 'reference', 'normal'])

            # Without both columns no row of this table can produce a result.
            if test_col is None or value_col is None:
                continue

            # Parse data rows
            for row in table[1:]:
                if not row:
                    continue
                # _cell tolerates rows shorter than the header and None cells,
                # so ragged tables no longer raise IndexError.
                test_name = self._cell(row, test_col)
                value = self._cell(row, value_col)
                if not test_name or not value:
                    continue
                ref_range = self._cell(row, range_col)
                results.append(LabResult(
                    test_name=test_name,
                    value=value,
                    unit=self._cell(row, unit_col),
                    reference_range=ref_range,
                    status=self._determine_status(value, ref_range),
                ))
        return results

    @staticmethod
    def _cell(row: List, index: Optional[int]) -> str:
        """Return the stripped text of ``row[index]``, or '' if unavailable."""
        if index is None or index >= len(row):
            return ''
        cell = row[index]
        return '' if cell is None else str(cell).strip()

    def _parse_text(self, text: str) -> "List[LabResult]":
        """Parse lab results using the regex patterns in ``test_patterns``.

        Args:
            text: Plain text of a page. None or '' yields no results.

        Returns:
            One LabResult per pattern match, grouped by pattern in
            ``test_patterns`` order.
        """
        results = []
        if not text:
            return results

        for pattern in self.test_patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                groups = match.groups()
                if len(groups) < 2:
                    continue
                test_name = groups[0]
                value = groups[1]
                # The value group also accepts '.', so sentence punctuation
                # after a test name ("iron. Follow up") can match; a value
                # made only of dots is not a measurement.
                if not test_name or not value or not value.strip('.'):
                    continue
                unit = groups[2] if len(groups) > 2 else ''
                ref_range = groups[3] if len(groups) > 3 else ''
                # The range group may capture surrounding whitespace and
                # newlines; strip it as the table path does.
                ref_range = (ref_range or '').strip()
                results.append(LabResult(
                    test_name=test_name,
                    value=value,
                    unit=unit or '',
                    reference_range=ref_range,
                    status=self._determine_status(value, ref_range),
                ))
        return results

    def _find_column(self, headers: List[str], keywords: List[str]) -> Optional[int]:
        """Find column index by keywords.

        Args:
            headers: Header texts (callers pass them lower-cased).
            keywords: Substrings to look for.

        Returns:
            Index of the first header containing any keyword, or None when
            no header matches.
        """
        for index, header in enumerate(headers):
            if any(keyword in header for keyword in keywords):
                return index
        return None

    def _determine_status(self, value: str, ref_range: str) -> str:
        """Determine if a value is normal, high, or low.

        Args:
            value: Measured value as text; thousands separators (',') are
                ignored.
            ref_range: Text containing a ``low - high`` range. A hyphen,
                en dash or em dash is accepted as the separator.

        Returns:
            'low' when the value is below the range, 'high' when above it,
            'normal' when inside it (bounds included), and 'unknown' when
            either input is missing or cannot be parsed.
        """
        # Table cells can be None; there is nothing to compare then.
        if not isinstance(value, str) or not isinstance(ref_range, str):
            return 'unknown'

        # Parse reference range
        range_match = re.search(
            r'([\d.]+)\s*[-\u2013\u2014]\s*([\d.]+)', ref_range)
        if range_match is None:
            return 'unknown'

        try:
            val = float(value.replace(',', ''))
            low = float(range_match.group(1))
            high = float(range_match.group(2))
        except ValueError:
            return 'unknown'

        if val < low:
            return 'low'
        if val > high:
            return 'high'
        return 'normal'

    def _deduplicate_results(self, results: "List[LabResult]") -> "List[LabResult]":
        """Remove duplicate test results.

        Two results are duplicates when their lower-cased test name and
        their value are equal; the first occurrence is kept.

        Args:
            results: Results in extraction order.

        Returns:
            The unique results, in their original order.
        """
        seen = set()
        unique = []
        for result in results:
            key = (result.test_name.lower(), result.value)
            if key in seen:
                continue
            seen.add(key)
            unique.append(result)
        return unique
# Example usage
if __name__ == "__main__":
    import sys

    # Use the PDF path given on the command line; without one, fall back to
    # the sample file name.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "sample_lab_report.pdf"

    extractor = LabReportExtractor()
    results = extractor.extract_from_pdf(pdf_path)
    for result in results:
        print(f"{result.test_name}: {result.value} {result.unit} [{result.status}]")
        print(f" Reference: {result.reference_range}")