Update main.py
Browse files
main.py
CHANGED
|
@@ -1373,6 +1373,151 @@ def parse_reference_range(range_str: str):
|
|
| 1373 |
return None, None
|
| 1374 |
|
| 1375 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1376 |
def extract_lab_values_from_text(structured_text: str) -> List[Dict]:
|
| 1377 |
"""
|
| 1378 |
Extract test name, value, unit, and reference range from OCR structured text.
|
|
@@ -1709,18 +1854,34 @@ def get_medlineplus_info(slug: str, status: str) -> Dict:
|
|
| 1709 |
return {'url': url, 'description': ''}
|
| 1710 |
|
| 1711 |
|
| 1712 |
-
def check_lab_values(structured_text: str, table_data: Optional[Dict]) -> List[Dict]:
|
| 1713 |
"""
|
| 1714 |
Extract lab values from OCR output and check against reference ranges.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1715 |
Returns list of lab anomaly results.
|
| 1716 |
"""
|
| 1717 |
-
#
|
| 1718 |
extracted = []
|
| 1719 |
-
if
|
| 1720 |
-
extracted =
|
|
|
|
| 1721 |
|
| 1722 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1723 |
text_extracted = extract_lab_values_from_text(structured_text)
|
|
|
|
| 1724 |
|
| 1725 |
# Merge: add text-extracted values if test name not already found
|
| 1726 |
existing_names = {e['test_name'].lower() for e in extracted}
|
|
@@ -1995,7 +2156,7 @@ async def process_image(
|
|
| 1995 |
print(f"Found {len(interactions)} drug interactions")
|
| 1996 |
|
| 1997 |
# Check lab values against reference ranges
|
| 1998 |
-
lab_anomalies = check_lab_values(
|
| 1999 |
print(f"Found {len(lab_anomalies)} lab values ({sum(1 for a in lab_anomalies if a['status'] != 'normal')} abnormal)")
|
| 2000 |
|
| 2001 |
return {
|
|
|
|
| 1373 |
return None, None
|
| 1374 |
|
| 1375 |
|
def extract_lab_values_from_words(words_with_boxes: List[Dict]) -> List[Dict]:
    """
    Extract lab values using word positions from docTR.

    Groups words into rows by y-coordinate, then identifies columns
    (test name, value, unit, range) by x-position within each row.
    This is the most reliable method since it uses spatial layout.

    Args:
        words_with_boxes: list of dicts with 'word' (str) and 'bbox'
            (pair of (x, y) points in normalized page coordinates).

    Returns:
        List of dicts with keys: test_name, value, unit, ref_low,
        ref_high, ref_range_str, is_flagged_in_document.
    """
    extracted: List[Dict] = []
    if not words_with_boxes:
        return extracted

    # 1. Group words into rows by y-center (within tolerance)
    ROW_TOLERANCE = 0.015  # Words within 1.5% of page height = same row
    rows = []
    sorted_words = sorted(words_with_boxes, key=lambda w: (w['bbox'][0][1], w['bbox'][0][0]))

    current_row = []
    current_y = None

    for word_info in sorted_words:
        y_center = (word_info['bbox'][0][1] + word_info['bbox'][1][1]) / 2
        if current_y is None or abs(y_center - current_y) < ROW_TOLERANCE:
            current_row.append(word_info)
            if current_y is None:
                current_y = y_center
            else:
                current_y = (current_y + y_center) / 2  # Running average
        else:
            if current_row:
                rows.append(sorted(current_row, key=lambda w: w['bbox'][0][0]))
            current_row = [word_info]
            current_y = y_center

    if current_row:
        rows.append(sorted(current_row, key=lambda w: w['bbox'][0][0]))

    # 2. For each row, classify words into: test_name, value, unit, range
    UNITS = {'mg/dl', 'mmol/l', 'g/dl', 'u/l', 'miu/l', 'ng/dl', 'pg/ml',
             'ug/dl', 'ng/ml', 'fl', 'pg', '%', 'mm/hr', 'mg/l', 'mg/mmol',
             'ug/l', 'ml/min/1.73m2'}

    SKIP_WORDS = {'result', 'unit', 'ref.range', 'ref', 'range', 'reference',
                  'date', 'request', 'no', 'no:'}

    for row in rows:
        words_text = [w['word'] for w in row]
        row_str = ' '.join(words_text).lower()

        # Skip header rows
        if 'result' in row_str and ('unit' in row_str or 'ref' in row_str):
            continue
        if 'profile' in row_str and len(words_text) <= 3:
            continue
        if 'function' in row_str and len(words_text) <= 3:
            continue

        # Classify each word
        name_parts = []
        value = None
        unit = ''
        range_parts = []
        is_flagged = False
        in_range = False

        for w in row:
            word = w['word'].strip()
            word_lower = word.lower().strip('()')

            if not word:
                continue

            # Check if this starts/continues a range (in parentheses)
            if '(' in word or in_range:
                in_range = True
                range_parts.append(word)
                if ')' in word:
                    in_range = False
                continue

            # Check for flagged marker
            if word == '*':
                is_flagged = True
                continue

            # Check if it's a number (the result value).
            # BUGFIX: this must run BEFORE the unit/junk check below —
            # that check strips '/','.','1','3','7','m','2', so values
            # like "2.31" or "137" reduced to '' and were silently
            # dropped as junk instead of being captured as the value.
            cleaned_word = word.lstrip('*').strip()
            try:
                num = float(cleaned_word)
            except ValueError:
                pass
            else:
                if value is None:
                    value = num
                    if '*' in word:
                        is_flagged = True
                continue

            # Check if it's a known unit; the replace-chain also skips
            # OCR fragments like "mm" or "1.73m2" that are not values.
            if word_lower in UNITS or word_lower.replace('/', '').replace('.', '').replace('1', '').replace('3', '').replace('7', '').replace('m', '').replace('2', '') == '':
                if word_lower in UNITS:
                    unit = word
                continue

            # Check if unit with superscript like x10⁹/L or x10^9/L
            if 'x10' in word_lower or '10⁹' in word or '10¹²' in word:
                unit = word
                continue

            # Check if it's a skip word (column headers etc.)
            if word_lower in SKIP_WORDS:
                continue

            # Check if it's Chinese characters only — skip
            if all('\u4e00' <= c <= '\u9fff' or c in '()()' for c in word):
                continue

            # Otherwise it's part of the test name
            if any(c.isalpha() for c in word):
                name_parts.append(word)

        # Parse the range
        range_str = ' '.join(range_parts).strip('() ')
        ref_low, ref_high = parse_reference_range(range_str)

        test_name = ' '.join(name_parts).strip()

        # Validate: need at least a name, a value, and a range
        if test_name and value is not None and (ref_low is not None or ref_high is not None):
            # Filter out section headers that slipped through
            if test_name.upper() == test_name and len(test_name.split()) > 2:
                continue  # ALL CAPS multi-word = likely a section header

            extracted.append({
                'test_name': test_name,
                'value': value,
                'unit': unit,
                'ref_low': ref_low,
                'ref_high': ref_high,
                'ref_range_str': range_str,
                'is_flagged_in_document': is_flagged,
            })

    return extracted
+
|
| 1520 |
+
|
| 1521 |
def extract_lab_values_from_text(structured_text: str) -> List[Dict]:
|
| 1522 |
"""
|
| 1523 |
Extract test name, value, unit, and reference range from OCR structured text.
|
|
|
|
| 1854 |
return {'url': url, 'description': ''}
|
| 1855 |
|
| 1856 |
|
| 1857 |
+
def check_lab_values(structured_text: str, table_data: Optional[Dict], words_with_boxes: Optional[List[Dict]] = None) -> List[Dict]:
|
| 1858 |
"""
|
| 1859 |
Extract lab values from OCR output and check against reference ranges.
|
| 1860 |
+
Uses three extraction methods in priority order:
|
| 1861 |
+
1. Word-position-based (most reliable — uses spatial layout from docTR)
|
| 1862 |
+
2. Table-based (if table was detected)
|
| 1863 |
+
3. Text regex-based (fallback)
|
| 1864 |
Returns list of lab anomaly results.
|
| 1865 |
"""
|
| 1866 |
+
# Method 1: Word-position-based extraction (best for columnar lab reports)
|
| 1867 |
extracted = []
|
| 1868 |
+
if words_with_boxes:
|
| 1869 |
+
extracted = extract_lab_values_from_words(words_with_boxes)
|
| 1870 |
+
print(f"Lab extraction (word-position): found {len(extracted)} values")
|
| 1871 |
|
| 1872 |
+
# Method 2: Table-based extraction
|
| 1873 |
+
if table_data and table_data.get('is_table'):
|
| 1874 |
+
table_extracted = extract_lab_values_from_table(table_data)
|
| 1875 |
+
print(f"Lab extraction (table): found {len(table_extracted)} values")
|
| 1876 |
+
existing_names = {e['test_name'].lower() for e in extracted}
|
| 1877 |
+
for te in table_extracted:
|
| 1878 |
+
if te['test_name'].lower() not in existing_names:
|
| 1879 |
+
extracted.append(te)
|
| 1880 |
+
existing_names.add(te['test_name'].lower())
|
| 1881 |
+
|
| 1882 |
+
# Method 3: Text regex fallback
|
| 1883 |
text_extracted = extract_lab_values_from_text(structured_text)
|
| 1884 |
+
print(f"Lab extraction (text-regex): found {len(text_extracted)} values")
|
| 1885 |
|
| 1886 |
# Merge: add text-extracted values if test name not already found
|
| 1887 |
existing_names = {e['test_name'].lower() for e in extracted}
|
|
|
|
| 2156 |
print(f"Found {len(interactions)} drug interactions")
|
| 2157 |
|
| 2158 |
# Check lab values against reference ranges
|
| 2159 |
+
lab_anomalies = check_lab_values(structured_text, primary_table_data, words_with_boxes)
|
| 2160 |
print(f"Found {len(lab_anomalies)} lab values ({sum(1 for a in lab_anomalies if a['status'] != 'normal')} abnormal)")
|
| 2161 |
|
| 2162 |
return {
|