hkai20000 committed on
Commit
b9ce637
·
verified ·
1 Parent(s): 4530cde

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +167 -6
main.py CHANGED
@@ -1373,6 +1373,151 @@ def parse_reference_range(range_str: str):
1373
  return None, None
1374
 
1375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1376
  def extract_lab_values_from_text(structured_text: str) -> List[Dict]:
1377
  """
1378
  Extract test name, value, unit, and reference range from OCR structured text.
@@ -1709,18 +1854,34 @@ def get_medlineplus_info(slug: str, status: str) -> Dict:
1709
  return {'url': url, 'description': ''}
1710
 
1711
 
1712
- def check_lab_values(structured_text: str, table_data: Optional[Dict]) -> List[Dict]:
1713
  """
1714
  Extract lab values from OCR output and check against reference ranges.
 
 
 
 
1715
  Returns list of lab anomaly results.
1716
  """
1717
- # Extract from table first (more reliable), then text
1718
  extracted = []
1719
- if table_data and table_data.get('is_table'):
1720
- extracted = extract_lab_values_from_table(table_data)
 
1721
 
1722
- # Also try text extraction
 
 
 
 
 
 
 
 
 
 
1723
  text_extracted = extract_lab_values_from_text(structured_text)
 
1724
 
1725
  # Merge: add text-extracted values if test name not already found
1726
  existing_names = {e['test_name'].lower() for e in extracted}
@@ -1995,7 +2156,7 @@ async def process_image(
1995
  print(f"Found {len(interactions)} drug interactions")
1996
 
1997
  # Check lab values against reference ranges
1998
- lab_anomalies = check_lab_values(display_text, primary_table_data)
1999
  print(f"Found {len(lab_anomalies)} lab values ({sum(1 for a in lab_anomalies if a['status'] != 'normal')} abnormal)")
2000
 
2001
  return {
 
1373
  return None, None
1374
 
1375
 
1376
def extract_lab_values_from_words(words_with_boxes: List[Dict]) -> List[Dict]:
    """
    Extract lab values using word positions from docTR.

    Groups words into rows by y-coordinate, then walks each row left to
    right and classifies every word as part of the test name, the result
    value, the unit, or the (parenthesised) reference range.

    Args:
        words_with_boxes: One dict per OCR word. Each dict must provide
            'word' (the text) and 'bbox', which is indexed as
            bbox[0][0] (left x), bbox[0][1] (top y) and bbox[1][1]
            (bottom y). Coordinates are presumably normalised to 0..1
            (the row tolerance is expressed as a fraction of page
            height) -- confirm against the caller.

    Returns:
        A list of dicts with keys 'test_name', 'value', 'unit',
        'ref_low', 'ref_high', 'ref_range_str' and
        'is_flagged_in_document'. Only rows that yield a test name, a
        numeric value and at least one reference bound are returned.
        Empty or missing input yields an empty list.
    """
    import math  # local import: only needed for the finite-number guard

    extracted = []
    if not words_with_boxes:
        return extracted

    # 1. Group words into rows by y-center (within tolerance)
    ROW_TOLERANCE = 0.015  # Words within 1.5% of page height = same row
    rows = []
    sorted_words = sorted(
        words_with_boxes,
        key=lambda w: (w['bbox'][0][1], w['bbox'][0][0]),
    )

    current_row = []
    current_y = None

    for word_info in sorted_words:
        y_center = (word_info['bbox'][0][1] + word_info['bbox'][1][1]) / 2
        if current_y is None or abs(y_center - current_y) < ROW_TOLERANCE:
            current_row.append(word_info)
            if current_y is None:
                current_y = y_center
            else:
                # Blend the row's reference y toward the newest word so a
                # slightly skewed line is still followed across the page.
                current_y = (current_y + y_center) / 2
        else:
            if current_row:
                rows.append(sorted(current_row, key=lambda w: w['bbox'][0][0]))
            current_row = [word_info]
            current_y = y_center

    if current_row:
        rows.append(sorted(current_row, key=lambda w: w['bbox'][0][0]))

    # 2. For each row, classify words into: test_name, value, unit, range
    UNITS = {'mg/dl', 'mmol/l', 'g/dl', 'u/l', 'miu/l', 'ng/dl', 'pg/ml',
             'ug/dl', 'ng/ml', 'fl', 'pg', '%', 'mm/hr', 'mg/l', 'mg/mmol',
             'ug/l', 'ml/min/1.73m2'}

    # Characters that make up stray pieces of 'ml/min/1.73m2' when OCR
    # splits that unit into several words (e.g. 'm2', '1.73m2', '/').
    UNIT_FRAGMENT_CHARS = frozenset('/.1237m')

    SKIP_WORDS = {'result', 'unit', 'ref.range', 'ref', 'range', 'reference',
                  'date', 'request', 'no', 'no:'}

    for row in rows:
        words_text = [w['word'] for w in row]
        row_str = ' '.join(words_text).lower()

        # Skip header rows
        if 'result' in row_str and ('unit' in row_str or 'ref' in row_str):
            continue
        if 'profile' in row_str and len(words_text) <= 3:
            continue
        if 'function' in row_str and len(words_text) <= 3:
            continue

        # Classify each word
        name_parts = []
        value = None
        unit = ''
        range_parts = []
        is_flagged = False
        in_range = False

        for w in row:
            word = w['word'].strip()
            word_lower = word.lower().strip('()')

            if not word:
                continue

            # Check if this starts/continues a range (in parentheses)
            if '(' in word or in_range:
                in_range = True
                range_parts.append(word)
                if ')' in word:
                    in_range = False
                continue

            # Check for flagged marker
            if word == '*':
                is_flagged = True
                continue

            # Check if it's a known unit
            if word_lower in UNITS:
                unit = word
                continue

            # Check if unit with superscript like x10⁹/L or x10^9/L
            if 'x10' in word_lower or '10⁹' in word or '10¹²' in word:
                unit = word
                continue

            # Check if it's a number (the result value). This must run
            # BEFORE the unit-fragment filter below: results such as
            # '137' or '3.7' consist solely of fragment characters and
            # would otherwise be discarded. The flag marker may be glued
            # to either side of the number ('*5.2' or '5.2*').
            cleaned_word = word.strip('*').strip()
            try:
                num = float(cleaned_word)
            except ValueError:
                num = None
            if num is not None and math.isfinite(num):
                # Only the first number in a row is the result; later
                # numeric tokens outside parentheses are ignored.
                if value is None:
                    value = num
                    if '*' in word:
                        is_flagged = True
                continue

            # Drop leftover pieces of a split unit (e.g. 'm2', '1.73m2')
            if all(c in UNIT_FRAGMENT_CHARS for c in word_lower):
                continue

            # Check if it's a skip word
            if word_lower in SKIP_WORDS:
                continue

            # Check if it's Chinese characters only — skip
            if all('\u4e00' <= c <= '\u9fff' or c in '()()' for c in word):
                continue

            # Otherwise it's part of the test name
            if any(c.isalpha() for c in word):
                name_parts.append(word)

        # Parse the range
        range_str = ' '.join(range_parts).strip('() ')
        ref_low, ref_high = parse_reference_range(range_str)

        test_name = ' '.join(name_parts).strip()

        # Validate: need at least a name, a value, and a range
        if test_name and value is not None and (ref_low is not None or ref_high is not None):
            # Filter out section headers that slipped through
            if test_name.upper() == test_name and len(test_name.split()) > 2:
                continue  # ALL CAPS multi-word = likely a section header

            extracted.append({
                'test_name': test_name,
                'value': value,
                'unit': unit,
                'ref_low': ref_low,
                'ref_high': ref_high,
                'ref_range_str': range_str,
                'is_flagged_in_document': is_flagged,
            })

    return extracted
1519
+
1520
+
1521
  def extract_lab_values_from_text(structured_text: str) -> List[Dict]:
1522
  """
1523
  Extract test name, value, unit, and reference range from OCR structured text.
 
1854
  return {'url': url, 'description': ''}
1855
 
1856
 
1857
+ def check_lab_values(structured_text: str, table_data: Optional[Dict], words_with_boxes: Optional[List[Dict]] = None) -> List[Dict]:
1858
  """
1859
  Extract lab values from OCR output and check against reference ranges.
1860
+ Uses three extraction methods in priority order:
1861
+ 1. Word-position-based (most reliable — uses spatial layout from docTR)
1862
+ 2. Table-based (if table was detected)
1863
+ 3. Text regex-based (fallback)
1864
  Returns list of lab anomaly results.
1865
  """
1866
+ # Method 1: Word-position-based extraction (best for columnar lab reports)
1867
  extracted = []
1868
+ if words_with_boxes:
1869
+ extracted = extract_lab_values_from_words(words_with_boxes)
1870
+ print(f"Lab extraction (word-position): found {len(extracted)} values")
1871
 
1872
+ # Method 2: Table-based extraction
1873
+ if table_data and table_data.get('is_table'):
1874
+ table_extracted = extract_lab_values_from_table(table_data)
1875
+ print(f"Lab extraction (table): found {len(table_extracted)} values")
1876
+ existing_names = {e['test_name'].lower() for e in extracted}
1877
+ for te in table_extracted:
1878
+ if te['test_name'].lower() not in existing_names:
1879
+ extracted.append(te)
1880
+ existing_names.add(te['test_name'].lower())
1881
+
1882
+ # Method 3: Text regex fallback
1883
  text_extracted = extract_lab_values_from_text(structured_text)
1884
+ print(f"Lab extraction (text-regex): found {len(text_extracted)} values")
1885
 
1886
  # Merge: add text-extracted values if test name not already found
1887
  existing_names = {e['test_name'].lower() for e in extracted}
 
2156
  print(f"Found {len(interactions)} drug interactions")
2157
 
2158
  # Check lab values against reference ranges
2159
+ lab_anomalies = check_lab_values(structured_text, primary_table_data, words_with_boxes)
2160
  print(f"Found {len(lab_anomalies)} lab values ({sum(1 for a in lab_anomalies if a['status'] != 'normal')} abnormal)")
2161
 
2162
  return {