Spaces:

hkai20000
/

ocrAPP

Sleeping

App Files Files Community

hkai20000 committed on Feb 2

Commit

7dc006e

verified ·

1 Parent(s): 3f371b2

Update main.py

Browse files

Files changed (1) hide show

main.py +226 -4

main.py CHANGED Viewed

@@ -194,6 +194,211 @@ def basic_cleanup(text: str) -> str:
     text = " ".join(text.split())
     return text
 def extract_text_structured(result) -> str:
     """
     Extract text from docTR result preserving logical structure.
@@ -451,11 +656,22 @@ async def process_image(
         img_height, img_width = preprocessed_img.shape[:2]
         # Extract text and word bounding boxes
         structured_text = extract_text_structured(result)
         cleaned_text = basic_cleanup(structured_text)
         words_with_boxes = extract_words_with_boxes(result)
-        print(f"OCR Structured Text:\n{structured_text[:500]}...")
         print(f"Extracted {len(words_with_boxes)} words with bounding boxes")
         # Perform NER on cleaned text
@@ -485,14 +701,20 @@ async def process_image(
         print(f"Found {len(interactions)} drug interactions")
         return {
-            "structured_text": structured_text,
             "cleaned_text": cleaned_text,
             "medical_entities": entities_with_boxes,
-            "interactions": interactions,  # NEW: Drug interaction warnings
             "model_id": NER_MODELS[ner_model_id]["name"],
             "ocr_model": f"{det_arch} + {reco_arch}",
             "image_width": img_width,
-            "image_height": img_height
         }
     except Exception as e:

     text = " ".join(text.split())
     return text
+# --- TABLE DETECTION AND EXTRACTION ---
def detect_columns(words_data: list, min_gap_ratio: float = 0.03) -> list:
    """
    Detect column boundaries by clustering the left edges of words.

    The distinct ``'x'`` values are sorted and split into clusters wherever
    two neighbouring values are more than ``min_gap_ratio`` apart. Each
    cluster becomes one column.

    Args:
        words_data: Word dicts; only the ``'x'`` key is read.
            NOTE(review): values are presumably page-relative in [0, 1],
            since the boundaries are clamped to that range - confirm.
        min_gap_ratio: Minimum distance between neighbouring x values that
            starts a new column.

    Returns:
        ``[]`` for empty input, ``[(0, 1)]`` when fewer than two clusters
        are found, otherwise a list of ``(x_start, x_end)`` tuples. The
        ranges are contiguous and non-overlapping: each column ends exactly
        where the next one starts.
    """
    if not words_data:
        return []

    x_starts = sorted({w['x'] for w in words_data})
    if len(x_starts) < 2:
        return [(0, 1)]

    # Chain neighbouring x values into clusters, splitting on large gaps.
    clusters = [[x_starts[0]]]
    for prev_x, x in zip(x_starts, x_starts[1:]):
        if x - prev_x > min_gap_ratio:
            clusters.append([x])
        else:
            clusters[-1].append(x)

    if len(clusters) < 2:
        return [(0, 1)]

    # Each column starts slightly left of its first word. The pad is capped
    # at half of the gap to the previous cluster so a boundary can never
    # cross into the previous cluster, whatever min_gap_ratio is.
    pad = 0.01
    edges = [max(0, clusters[0][0] - pad)]
    for left, right in zip(clusters, clusters[1:]):
        gap = right[0] - left[-1]
        edges.append(min(1, max(0, right[0] - min(pad, gap / 2))))
    edges.append(1.0)

    # Consecutive edges form the columns, so neighbours share a boundary.
    return list(zip(edges, edges[1:]))
def detect_rows(words_data: list, y_tolerance: float = 0.015) -> list:
    """
    Detect row positions by clustering the top edges of words.

    The distinct ``'y'`` values are sorted and chained together: a value
    joins the current row when it is within ``y_tolerance`` of the previous
    value, otherwise it starts a new row.

    Args:
        words_data: Word dicts; only the ``'y'`` key is read.
        y_tolerance: Maximum distance between neighbouring y values that
            still belong to the same row.

    Returns:
        One value per row, in ascending order: the mean of the distinct y
        values in that row. ``[]`` for empty input.
    """
    if not words_data:
        return []

    ys = sorted({w['y'] for w in words_data})

    groups = [[ys[0]]]
    for prev_y, y in zip(ys, ys[1:]):
        if y - prev_y <= y_tolerance:
            groups[-1].append(y)
        else:
            groups.append([y])

    return [sum(group) / len(group) for group in groups]
def extract_table_structure(words_data: list) -> dict:
    """
    Arrange words into a row/column grid when the layout looks tabular.

    Columns come from ``detect_columns`` and rows from ``detect_rows``. The
    input is treated as a table only when at least two of each are found.

    Args:
        words_data: Word dicts; the ``'x'``, ``'y'`` and ``'text'`` keys
            are read.

    Returns:
        A dict with ``'is_table'``, ``'columns'``, ``'rows'`` and
        ``'cells'``. When ``'is_table'`` is true, ``'cells'`` is a list of
        rows, each a list of cell strings (one per column), and the dict
        also carries ``'num_columns'`` and ``'num_rows'``. Otherwise
        ``'cells'`` is empty.
    """
    if not words_data:
        return {'is_table': False, 'columns': [], 'rows': [], 'cells': []}

    columns = detect_columns(words_data)
    rows = detect_rows(words_data)

    if len(columns) < 2 or len(rows) < 2:
        return {'is_table': False, 'columns': columns, 'rows': rows, 'cells': []}

    grid = [[[] for _ in columns] for _ in rows]
    last_col = len(columns) - 1

    # Visit words left to right so the text inside each cell reads in order.
    for word in sorted(words_data, key=lambda w: w['x']):
        # Assign each word to exactly one row: the nearest row centre.
        # A fixed window around every centre can place a word in two
        # neighbouring rows, or in none.
        row_idx = min(range(len(rows)), key=lambda i: abs(rows[i] - word['y']))

        # First column whose half-open range contains the word. A word that
        # falls outside every range goes to the nearest edge column rather
        # than being dropped.
        fallback = 0 if word['x'] < columns[0][0] else last_col
        col_idx = next(
            (
                idx
                for idx, (col_start, col_end) in enumerate(columns)
                if col_start <= word['x'] < col_end
            ),
            fallback,
        )
        grid[row_idx][col_idx].append(word['text'])

    cells = [[' '.join(texts) for texts in grid_row] for grid_row in grid]

    return {
        'is_table': True,
        'columns': columns,
        'rows': rows,
        'cells': cells,
        'num_columns': len(columns),
        'num_rows': len(rows),
    }
def format_table_as_markdown(table_data: dict) -> str:
    """
    Format extracted table data as a Markdown table.

    The first row of ``table_data['cells']`` is treated as the header and
    is followed by a dashed separator line. Columns are padded to a common
    width (minimum 3 characters).

    Args:
        table_data: Dict with a truthy ``'is_table'`` flag and a ``'cells'``
            list of rows, each a list of cell strings.

    Returns:
        The Markdown table, or ``''`` when ``'is_table'`` is falsy, there
        are no cells, or the first row is empty.
    """
    cells = table_data.get('cells')
    if not table_data.get('is_table') or not cells:
        return ''

    # The first row defines the column count.
    num_cols = len(cells[0])
    if num_cols == 0:
        return ''

    normalized = []
    for row in cells:
        # A literal pipe or newline inside a cell would split the row, so
        # escape or flatten it before laying out the table.
        cleaned = [
            cell.replace('\n', ' ').replace('|', '\\|')
            for cell in row[:num_cols]
        ]
        # Pad short rows so every line has the same number of cells.
        cleaned.extend([''] * (num_cols - len(cleaned)))
        normalized.append(cleaned)

    col_widths = [
        max(3, *(len(row[i]) for row in normalized))
        for i in range(num_cols)
    ]

    lines = []
    for row_idx, row in enumerate(normalized):
        padded = [cell.ljust(width) for cell, width in zip(row, col_widths)]
        lines.append('| ' + ' | '.join(padded) + ' |')
        if row_idx == 0:
            # Header separator directly after the first row.
            lines.append('|' + '|'.join('-' * (w + 2) for w in col_widths) + '|')

    return '\n'.join(lines)
def extract_text_with_table_detection(result) -> tuple:
    """
    Extract text from a docTR result, preserving table structure if found.

    Every word on every page is collected with its bounding box and passed
    to ``extract_table_structure``. When a table is detected the text is
    the Markdown rendering from ``format_table_as_markdown``; otherwise the
    words are grouped into lines by vertical position.

    Args:
        result: Object exposing ``pages -> blocks -> lines -> words``, where
            each word has ``value`` and ``geometry``. ``geometry[0]`` and
            ``geometry[1]`` are read as the (x, y) of two opposite corners.

    Returns:
        ``(structured_text, table_data)``. ``table_data`` is the full dict
        from ``extract_table_structure`` when a table was detected, and
        ``{'is_table': False}`` otherwise. With no words the text is ``''``.
    """
    # NOTE(review): words from all pages are pooled into one coordinate
    # space; this looks intended for single-page input - confirm.
    all_words = []
    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                for word in line.words:
                    x_min = word.geometry[0][0]
                    y_min = word.geometry[0][1]
                    x_max = word.geometry[1][0]
                    y_max = word.geometry[1][1]
                    all_words.append({
                        'text': word.value,
                        'x': x_min,
                        'x_end': x_max,
                        'y': y_min,
                        'y_end': y_max,
                        'width': x_max - x_min,
                        'height': y_max - y_min,
                    })

    if not all_words:
        return '', {'is_table': False}

    table_data = extract_table_structure(all_words)
    if table_data['is_table']:
        return format_table_as_markdown(table_data), table_data

    # Plain-text fallback. Sort by exact vertical position, start a new line
    # whenever the gap to the previous word exceeds the tolerance, then
    # order each line left to right. Sorting by a rounded y bucket would
    # misorder words of one line that straddle a bucket boundary.
    y_tolerance = 0.02
    ordered = sorted(all_words, key=lambda w: (w['y'], w['x']))
    line_groups = [[ordered[0]]]
    for prev_word, word in zip(ordered, ordered[1:]):
        if abs(word['y'] - prev_word['y']) > y_tolerance:
            line_groups.append([word])
        else:
            line_groups[-1].append(word)

    lines = [
        ' '.join(w['text'] for w in sorted(group, key=lambda w: w['x']))
        for group in line_groups
    ]
    return '\n'.join(lines), {'is_table': False}
 def extract_text_structured(result) -> str:
     """
     Extract text from docTR result preserving logical structure.
         img_height, img_width = preprocessed_img.shape[:2]
         # Extract text and word bounding boxes
+        # Try table detection first
+        table_formatted_text, table_data = extract_text_with_table_detection(result)
+        # Also get the regular structured text for NER processing
         structured_text = extract_text_structured(result)
         cleaned_text = basic_cleanup(structured_text)
         words_with_boxes = extract_words_with_boxes(result)
+        # Use table-formatted text if table was detected
+        if table_data.get('is_table'):
+            display_text = table_formatted_text
+            print(f"Table detected with {table_data.get('num_columns', 0)} columns and {table_data.get('num_rows', 0)} rows")
+        else:
+            display_text = structured_text
+        print(f"OCR Structured Text:\n{display_text[:500]}...")
         print(f"Extracted {len(words_with_boxes)} words with bounding boxes")
         # Perform NER on cleaned text
         print(f"Found {len(interactions)} drug interactions")
         return {
+            "structured_text": display_text,  # Table-formatted if detected, otherwise regular
             "cleaned_text": cleaned_text,
             "medical_entities": entities_with_boxes,
+            "interactions": interactions,  # Drug interaction warnings
             "model_id": NER_MODELS[ner_model_id]["name"],
             "ocr_model": f"{det_arch} + {reco_arch}",
             "image_width": img_width,
+            "image_height": img_height,
+            "table_detected": table_data.get('is_table', False),
+            "table_data": {
+                "num_columns": table_data.get('num_columns', 0),
+                "num_rows": table_data.get('num_rows', 0),
+                "cells": table_data.get('cells', [])
+            } if table_data.get('is_table') else None
         }
     except Exception as e: