Spaces:

hkai20000
/

ocrAPP

Sleeping

App Files Files Community

hkai20000 committed on Feb 2

Commit

af2ef1c

verified ·

1 Parent(s): 7dc006e

Update main.py

Browse files

Files changed (1) hide show

main.py +115 -160

main.py CHANGED Viewed

@@ -4,12 +4,15 @@ from fastapi.middleware.cors import CORSMiddleware
 from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
 import cv2
 import numpy as np
 from PIL import Image
 import io
 import json
 import os
 from typing import Dict, Any, Optional, List
 app = FastAPI(title="ScanAssured OCR & NER API")
@@ -195,112 +198,100 @@ def basic_cleanup(text: str) -> str:
     return text
-# --- TABLE DETECTION AND EXTRACTION ---
-def detect_columns(words_data: list, min_gap_ratio: float = 0.03) -> list:
-    """
-    Detect column boundaries by analyzing gaps between words.
-    Returns list of column boundaries [(x_start, x_end), ...].
-    """
-    if not words_data:
-        return []
-    all_x_starts = sorted(set(w['x'] for w in words_data))
-    if len(all_x_starts) < 2:
-        return [(0, 1)]
-    x_clusters = []
-    current_cluster = [all_x_starts[0]]
-    for i in range(1, len(all_x_starts)):
-        gap = all_x_starts[i] - all_x_starts[i-1]
-        if gap > min_gap_ratio:
-            x_clusters.append(current_cluster)
-            current_cluster = [all_x_starts[i]]
-        else:
-            current_cluster.append(all_x_starts[i])
-    x_clusters.append(current_cluster)
-    if len(x_clusters) >= 2:
-        columns = []
-        for i, cluster in enumerate(x_clusters):
-            x_start = min(cluster) - 0.01
-            if i < len(x_clusters) - 1:
-                x_end = min(x_clusters[i + 1]) - 0.005
-            else:
-                x_end = 1.0
-            columns.append((max(0, x_start), min(1, x_end)))
-        return columns
-    return [(0, 1)]
-def detect_rows(words_data: list, y_tolerance: float = 0.015) -> list:
-    """Detect row boundaries by analyzing y-positions."""
-    if not words_data:
-        return []
-    y_positions = sorted(set(w['y'] for w in words_data))
-    if not y_positions:
-        return []
-    rows = []
-    current_row_ys = [y_positions[0]]
-    for i in range(1, len(y_positions)):
-        if y_positions[i] - y_positions[i-1] <= y_tolerance:
-            current_row_ys.append(y_positions[i])
-        else:
-            rows.append(sum(current_row_ys) / len(current_row_ys))
-            current_row_ys = [y_positions[i]]
-    rows.append(sum(current_row_ys) / len(current_row_ys))
-    return rows
-def extract_table_structure(words_data: list) -> dict:
     """
-    Extract table structure from words, returning rows and columns.
     """
-    if not words_data:
-        return {'is_table': False, 'columns': [], 'rows': [], 'cells': []}
-    columns = detect_columns(words_data)
-    rows = detect_rows(words_data)
-    is_table = len(columns) >= 2 and len(rows) >= 2
-    if not is_table:
-        return {'is_table': False, 'columns': columns, 'rows': rows, 'cells': []}
-    y_tolerance = 0.02
-    cells = []
-    for row_y in rows:
-        row_cells = [''] * len(columns)
-        row_words = [w for w in words_data if abs(w['y'] - row_y) <= y_tolerance]
-        for word in row_words:
-            for col_idx, (col_start, col_end) in enumerate(columns):
-                if col_start <= word['x'] < col_end:
-                    if row_cells[col_idx]:
-                        row_cells[col_idx] += ' ' + word['text']
-                    else:
-                        row_cells[col_idx] = word['text']
-                    break
-        cells.append(row_cells)
-    return {
-        'is_table': True,
-        'columns': columns,
-        'rows': rows,
-        'cells': cells,
-        'num_columns': len(columns),
-        'num_rows': len(rows)
-    }
 def format_table_as_markdown(table_data: dict) -> str:
@@ -312,22 +303,27 @@ def format_table_as_markdown(table_data: dict) -> str:
     if not cells:
         return ''
-    num_cols = len(cells[0]) if cells else 0
     if num_cols == 0:
         return ''
     lines = []
     col_widths = [3] * num_cols
     for row in cells:
-        for i, cell in enumerate(row):
             if i < num_cols:
-                col_widths[i] = max(col_widths[i], len(cell))
-    for row_idx, row in enumerate(cells):
         formatted_cells = []
         for i, cell in enumerate(row):
             if i < num_cols:
-                formatted_cells.append(cell.ljust(col_widths[i]))
         line = '| ' + ' | '.join(formatted_cells) + ' |'
         lines.append(line)
@@ -339,64 +335,18 @@ def format_table_as_markdown(table_data: dict) -> str:
     return '\n'.join(lines)
-def extract_text_with_table_detection(result) -> tuple:
     """
-    Extract text from docTR result, detecting and preserving table structure.
-    Returns (structured_text, table_data).
     """
-    all_words = []
-    for page in result.pages:
-        for block in page.blocks:
-            for line in block.lines:
-                for word in line.words:
-                    x_min = word.geometry[0][0]
-                    y_min = word.geometry[0][1]
-                    x_max = word.geometry[1][0]
-                    y_max = word.geometry[1][1]
-                    all_words.append({
-                        'text': word.value,
-                        'x': x_min,
-                        'x_end': x_max,
-                        'y': y_min,
-                        'y_end': y_max,
-                        'width': x_max - x_min,
-                        'height': y_max - y_min
-                    })
-    if not all_words:
-        return '', {'is_table': False}
-    table_data = extract_table_structure(all_words)
-    if table_data['is_table']:
         markdown_table = format_table_as_markdown(table_data)
         return markdown_table, table_data
     else:
-        all_words.sort(key=lambda w: (round(w['y'] * 50) / 50, w['x']))
-        lines = []
-        current_line = []
-        prev_y = -1
-        y_tolerance = 0.02
-        for word in all_words:
-            current_y = round(word['y'] * 50) / 50
-            if prev_y != -1 and abs(word['y'] - prev_y) > y_tolerance:
-                if current_line:
-                    lines.append(' '.join(w['text'] for w in current_line))
-                current_line = [word]
-            else:
-                current_line.append(word)
-            prev_y = word['y']
-        if current_line:
-            lines.append(' '.join(w['text'] for w in current_line))
-        return '\n'.join(lines), {'is_table': False}
 def extract_text_structured(result) -> str:
@@ -655,24 +605,29 @@ async def process_image(
         # Get image dimensions for frontend highlighting
         img_height, img_width = preprocessed_img.shape[:2]
-        # Extract text and word bounding boxes
-        # Try table detection first
-        table_formatted_text, table_data = extract_text_with_table_detection(result)
-        # Also get the regular structured text for NER processing
         structured_text = extract_text_structured(result)
         cleaned_text = basic_cleanup(structured_text)
         words_with_boxes = extract_words_with_boxes(result)
         # Use table-formatted text if table was detected
         if table_data.get('is_table'):
             display_text = table_formatted_text
             print(f"Table detected with {table_data.get('num_columns', 0)} columns and {table_data.get('num_rows', 0)} rows")
         else:
             display_text = structured_text
-        print(f"OCR Structured Text:\n{display_text[:500]}...")
-        print(f"Extracted {len(words_with_boxes)} words with bounding boxes")
         # Perform NER on cleaned text
         print("Running NER...")

 from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
+from img2table.document import Image as Img2TableImage
+from img2table.ocr import DocTR
 import cv2
 import numpy as np
 from PIL import Image
 import io
 import json
 import os
+import tempfile
 from typing import Dict, Any, Optional, List
 app = FastAPI(title="ScanAssured OCR & NER API")
     return text
+# --- TABLE DETECTION WITH IMG2TABLE ---
+# Cache for img2table OCR instance
+img2table_ocr_cache = {}
+def get_img2table_ocr():
+    """Get or create img2table DocTR OCR instance."""
+    if 'doctr' not in img2table_ocr_cache:
+        img2table_ocr_cache['doctr'] = DocTR()
+    return img2table_ocr_cache['doctr']
+def extract_tables_with_img2table(image_bytes: bytes, img_width: int, img_height: int) -> dict:
     """
+    Use img2table to detect and extract table structure from image.
+    Returns table data with properly structured cells.
     """
+    try:
+        # Save image to temp file (img2table needs file path)
+        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp_file:
+            tmp_file.write(image_bytes)
+            tmp_path = tmp_file.name
+        # Create img2table Image object
+        img2table_img = Img2TableImage(src=tmp_path)
+        # Get OCR instance
+        ocr = get_img2table_ocr()
+        # Extract tables with OCR
+        tables = img2table_img.extract_tables(
+            ocr=ocr,
+            implicit_rows=True,  # Detect rows even without horizontal lines
+            implicit_columns=True,  # Detect columns even without vertical lines
+            borderless_tables=True,  # Detect tables without borders
+            min_confidence=50  # Minimum OCR confidence
+        )
+        # Clean up temp file
+        try:
+            os.unlink(tmp_path)
+        except:
+            pass
+        if not tables:
+            return {'is_table': False, 'tables': []}
+        # Process all detected tables
+        all_tables = []
+        for table in tables:
+            # Get table content as list of lists
+            if hasattr(table, 'content'):
+                cells = []
+                for row in table.content:
+                    row_cells = []
+                    for cell in row:
+                        # Cell can be string or have value attribute
+                        if cell is None:
+                            row_cells.append('')
+                        elif isinstance(cell, str):
+                            row_cells.append(cell.strip())
+                        elif hasattr(cell, 'value'):
+                            row_cells.append(str(cell.value).strip() if cell.value else '')
+                        else:
+                            row_cells.append(str(cell).strip())
+                    cells.append(row_cells)
+                if cells and any(any(c for c in row) for row in cells):
+                    all_tables.append({
+                        'cells': cells,
+                        'num_rows': len(cells),
+                        'num_columns': len(cells[0]) if cells else 0
+                    })
+        if not all_tables:
+            return {'is_table': False, 'tables': []}
+        # Return the largest table (most cells) as primary
+        primary_table = max(all_tables, key=lambda t: t['num_rows'] * t['num_columns'])
+        return {
+            'is_table': True,
+            'cells': primary_table['cells'],
+            'num_rows': primary_table['num_rows'],
+            'num_columns': primary_table['num_columns'],
+            'tables': all_tables,
+            'total_tables': len(all_tables)
+        }
+    except Exception as e:
+        print(f"img2table extraction error: {e}")
+        import traceback
+        traceback.print_exc()
+        return {'is_table': False, 'error': str(e)}
 def format_table_as_markdown(table_data: dict) -> str:
     if not cells:
         return ''
+    num_cols = max(len(row) for row in cells) if cells else 0
     if num_cols == 0:
         return ''
     lines = []
     col_widths = [3] * num_cols
+    # Normalize rows to have same number of columns
+    normalized_cells = []
     for row in cells:
+        normalized_row = list(row) + [''] * (num_cols - len(row))
+        normalized_cells.append(normalized_row)
+        for i, cell in enumerate(normalized_row):
             if i < num_cols:
+                col_widths[i] = max(col_widths[i], len(str(cell)))
+    for row_idx, row in enumerate(normalized_cells):
         formatted_cells = []
         for i, cell in enumerate(row):
             if i < num_cols:
+                formatted_cells.append(str(cell).ljust(col_widths[i]))
         line = '| ' + ' | '.join(formatted_cells) + ' |'
         lines.append(line)
     return '\n'.join(lines)
+def extract_text_with_table_detection(image_bytes: bytes, img_width: int, img_height: int) -> tuple:
     """
+    Extract tables from image using img2table.
+    Returns (markdown_text, table_data).
     """
+    table_data = extract_tables_with_img2table(image_bytes, img_width, img_height)
+    if table_data.get('is_table'):
         markdown_table = format_table_as_markdown(table_data)
         return markdown_table, table_data
     else:
+        return '', {'is_table': False}
 def extract_text_structured(result) -> str:
         # Get image dimensions for frontend highlighting
         img_height, img_width = preprocessed_img.shape[:2]
+        # Extract text and word bounding boxes using docTR
         structured_text = extract_text_structured(result)
         cleaned_text = basic_cleanup(structured_text)
         words_with_boxes = extract_words_with_boxes(result)
+        print(f"OCR Structured Text:\n{structured_text[:500]}...")
+        print(f"Extracted {len(words_with_boxes)} words with bounding boxes")
+        # Try table detection with img2table
+        print("Running img2table for table detection...")
+        table_formatted_text, table_data = extract_text_with_table_detection(
+            img_bytes, img_width, img_height
+        )
         # Use table-formatted text if table was detected
         if table_data.get('is_table'):
             display_text = table_formatted_text
             print(f"Table detected with {table_data.get('num_columns', 0)} columns and {table_data.get('num_rows', 0)} rows")
+            if table_data.get('total_tables', 0) > 1:
+                print(f"Total tables found: {table_data.get('total_tables')}")
         else:
             display_text = structured_text
+            print("No table detected, using regular OCR text")
         # Perform NER on cleaned text
         print("Running NER...")