Update main.py
Browse files
main.py
CHANGED
|
@@ -1373,6 +1373,151 @@ def parse_reference_range(range_str: str):
|
|
| 1373 |
return None, None
|
| 1374 |
|
| 1375 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1376 |
def extract_lab_values_from_text(structured_text: str) -> List[Dict]:
|
| 1377 |
"""
|
| 1378 |
Extract test name, value, unit, and reference range from OCR structured text.
|
|
@@ -1709,18 +1854,34 @@ def get_medlineplus_info(slug: str, status: str) -> Dict:
|
|
| 1709 |
return {'url': url, 'description': ''}
|
| 1710 |
|
| 1711 |
|
| 1712 |
-
def check_lab_values(structured_text: str, table_data: Optional[Dict]) -> List[Dict]:
|
| 1713 |
"""
|
| 1714 |
Extract lab values from OCR output and check against reference ranges.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1715 |
Returns list of lab anomaly results.
|
| 1716 |
"""
|
| 1717 |
-
#
|
| 1718 |
extracted = []
|
| 1719 |
-
if
|
| 1720 |
-
extracted =
|
|
|
|
| 1721 |
|
| 1722 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1723 |
text_extracted = extract_lab_values_from_text(structured_text)
|
|
|
|
| 1724 |
|
| 1725 |
# Merge: add text-extracted values if test name not already found
|
| 1726 |
existing_names = {e['test_name'].lower() for e in extracted}
|
|
@@ -1995,7 +2156,7 @@ async def process_image(
|
|
| 1995 |
print(f"Found {len(interactions)} drug interactions")
|
| 1996 |
|
| 1997 |
# Check lab values against reference ranges
|
| 1998 |
-
lab_anomalies = check_lab_values(
|
| 1999 |
print(f"Found {len(lab_anomalies)} lab values ({sum(1 for a in lab_anomalies if a['status'] != 'normal')} abnormal)")
|
| 2000 |
|
| 2001 |
return {
|
|
|
|
| 1373 |
return None, None
|
| 1374 |
|
| 1375 |
|
def extract_lab_values_from_words(words_with_boxes: List[Dict]) -> List[Dict]:
    """
    Extract lab values using word positions from docTR.

    Groups words into rows by y-coordinate, then identifies columns
    (test name, value, unit, range) by x-position within each row.
    This is the most reliable method since it uses spatial layout.

    Args:
        words_with_boxes: list of dicts with 'word' (str) and 'bbox'
            (pair of (x, y) points in normalized page coordinates).

    Returns:
        List of dicts with keys: test_name, value, unit, ref_low,
        ref_high, ref_range_str, is_flagged_in_document.
    """
    extracted: List[Dict] = []
    if not words_with_boxes:
        return extracted

    # 1. Group words into rows by y-center (within tolerance)
    ROW_TOLERANCE = 0.015  # Words within 1.5% of page height = same row
    rows = []
    sorted_words = sorted(words_with_boxes, key=lambda w: (w['bbox'][0][1], w['bbox'][0][0]))

    current_row = []
    current_y = None

    for word_info in sorted_words:
        y_center = (word_info['bbox'][0][1] + word_info['bbox'][1][1]) / 2
        if current_y is None or abs(y_center - current_y) < ROW_TOLERANCE:
            current_row.append(word_info)
            if current_y is None:
                current_y = y_center
            else:
                current_y = (current_y + y_center) / 2  # Running average
        else:
            if current_row:
                rows.append(sorted(current_row, key=lambda w: w['bbox'][0][0]))
            current_row = [word_info]
            current_y = y_center

    if current_row:
        rows.append(sorted(current_row, key=lambda w: w['bbox'][0][0]))

    # 2. For each row, classify words into: test_name, value, unit, range
    UNITS = {'mg/dl', 'mmol/l', 'g/dl', 'u/l', 'miu/l', 'ng/dl', 'pg/ml',
             'ug/dl', 'ng/ml', 'fl', 'pg', '%', 'mm/hr', 'mg/l', 'mg/mmol',
             'ug/l', 'ml/min/1.73m2'}

    SKIP_WORDS = {'result', 'unit', 'ref.range', 'ref', 'range', 'reference',
                  'date', 'request', 'no', 'no:'}

    for row in rows:
        words_text = [w['word'] for w in row]
        row_str = ' '.join(words_text).lower()

        # Skip header rows
        if 'result' in row_str and ('unit' in row_str or 'ref' in row_str):
            continue
        if 'profile' in row_str and len(words_text) <= 3:
            continue
        if 'function' in row_str and len(words_text) <= 3:
            continue

        # Classify each word
        name_parts = []
        value = None
        unit = ''
        range_parts = []
        is_flagged = False
        in_range = False

        for w in row:
            word = w['word'].strip()
            word_lower = word.lower().strip('()')

            if not word:
                continue

            # Check if this starts/continues a range (in parentheses)
            if '(' in word or in_range:
                in_range = True
                range_parts.append(word)
                if ')' in word:
                    in_range = False
                continue

            # Check for flagged marker
            if word == '*':
                is_flagged = True
                continue

            # Check if it's a number (the result value).
            # BUGFIX: this must run BEFORE the unit/junk check below —
            # that check strips '/','.','1','3','7','m','2', so values
            # like "2.31" or "137" reduced to '' and were silently
            # dropped as junk instead of being captured as the value.
            cleaned_word = word.lstrip('*').strip()
            try:
                num = float(cleaned_word)
            except ValueError:
                pass
            else:
                if value is None:
                    value = num
                    if '*' in word:
                        is_flagged = True
                continue

            # Check if it's a known unit; the replace-chain also skips
            # OCR fragments like "mm" or "1.73m2" that are not values.
            if word_lower in UNITS or word_lower.replace('/', '').replace('.', '').replace('1', '').replace('3', '').replace('7', '').replace('m', '').replace('2', '') == '':
                if word_lower in UNITS:
                    unit = word
                continue

            # Check if unit with superscript like x10⁹/L or x10^9/L
            if 'x10' in word_lower or '10⁹' in word or '10¹²' in word:
                unit = word
                continue

            # Check if it's a skip word (column headers etc.)
            if word_lower in SKIP_WORDS:
                continue

            # Check if it's Chinese characters only — skip
            if all('\u4e00' <= c <= '\u9fff' or c in '()()' for c in word):
                continue

            # Otherwise it's part of the test name
            if any(c.isalpha() for c in word):
                name_parts.append(word)

        # Parse the range
        range_str = ' '.join(range_parts).strip('() ')
        ref_low, ref_high = parse_reference_range(range_str)

        test_name = ' '.join(name_parts).strip()

        # Validate: need at least a name, a value, and a range
        if test_name and value is not None and (ref_low is not None or ref_high is not None):
            # Filter out section headers that slipped through
            if test_name.upper() == test_name and len(test_name.split()) > 2:
                continue  # ALL CAPS multi-word = likely a section header

            extracted.append({
                'test_name': test_name,
                'value': value,
                'unit': unit,
                'ref_low': ref_low,
                'ref_high': ref_high,
                'ref_range_str': range_str,
                'is_flagged_in_document': is_flagged,
            })

    return extracted
+
|
| 1520 |
+
|
| 1521 |
def extract_lab_values_from_text(structured_text: str) -> List[Dict]:
|
| 1522 |
"""
|
| 1523 |
Extract test name, value, unit, and reference range from OCR structured text.
|
|
|
|
| 1854 |
return {'url': url, 'description': ''}
|
| 1855 |
|
| 1856 |
|
| 1857 |
+
def check_lab_values(structured_text: str, table_data: Optional[Dict], words_with_boxes: Optional[List[Dict]] = None) -> List[Dict]:
|
| 1858 |
"""
|
| 1859 |
Extract lab values from OCR output and check against reference ranges.
|
| 1860 |
+
Uses three extraction methods in priority order:
|
| 1861 |
+
1. Word-position-based (most reliable — uses spatial layout from docTR)
|
| 1862 |
+
2. Table-based (if table was detected)
|
| 1863 |
+
3. Text regex-based (fallback)
|
| 1864 |
Returns list of lab anomaly results.
|
| 1865 |
"""
|
| 1866 |
+
# Method 1: Word-position-based extraction (best for columnar lab reports)
|
| 1867 |
extracted = []
|
| 1868 |
+
if words_with_boxes:
|
| 1869 |
+
extracted = extract_lab_values_from_words(words_with_boxes)
|
| 1870 |
+
print(f"Lab extraction (word-position): found {len(extracted)} values")
|
| 1871 |
|
| 1872 |
+
# Method 2: Table-based extraction
|
| 1873 |
+
if table_data and table_data.get('is_table'):
|
| 1874 |
+
table_extracted = extract_lab_values_from_table(table_data)
|
| 1875 |
+
print(f"Lab extraction (table): found {len(table_extracted)} values")
|
| 1876 |
+
existing_names = {e['test_name'].lower() for e in extracted}
|
| 1877 |
+
for te in table_extracted:
|
| 1878 |
+
if te['test_name'].lower() not in existing_names:
|
| 1879 |
+
extracted.append(te)
|
| 1880 |
+
existing_names.add(te['test_name'].lower())
|
| 1881 |
+
|
| 1882 |
+
# Method 3: Text regex fallback
|
| 1883 |
text_extracted = extract_lab_values_from_text(structured_text)
|
| 1884 |
+
print(f"Lab extraction (text-regex): found {len(text_extracted)} values")
|
| 1885 |
|
| 1886 |
# Merge: add text-extracted values if test name not already found
|
| 1887 |
existing_names = {e['test_name'].lower() for e in extracted}
|
|
|
|
| 2156 |
print(f"Found {len(interactions)} drug interactions")
|
| 2157 |
|
| 2158 |
# Check lab values against reference ranges
|
| 2159 |
+
lab_anomalies = check_lab_values(structured_text, primary_table_data, words_with_boxes)
|
| 2160 |
print(f"Found {len(lab_anomalies)} lab values ({sum(1 for a in lab_anomalies if a['status'] != 'normal')} abnormal)")
|
| 2161 |
|
| 2162 |
return {
|