import cv2
import pytesseract
import numpy as np
import pandas as pd
from PIL import Image, ImageFile
from typing import List, Dict, Any

# Let Pillow load images whose files are cut short (e.g. interrupted
# downloads or damaged scans) instead of raising an IOError mid-decode.
ImageFile.LOAD_TRUNCATED_IMAGES = True

def load_local_image(path: str) -> np.ndarray:
    """Read an image from disk and return it as an OpenCV BGR array.

    Args:
        path: Filesystem path to the image file.

    Returns:
        ``np.ndarray`` in BGR channel order, ready for cv2 routines.
    """
    with Image.open(path) as pil_img:
        rgb = pil_img.convert("RGB")
    # cv2 expects BGR channel order, PIL produces RGB — swap channels.
    return cv2.cvtColor(np.array(rgb), cv2.COLOR_RGB2BGR)

def sort_contours(cnts, method="top-to-bottom"):
    """Sort contours by the position of their bounding boxes.

    Args:
        cnts: Sequence of OpenCV contours.
        method: One of "left-to-right", "right-to-left",
            "top-to-bottom" (default), "bottom-to-top".

    Returns:
        Tuple ``(sorted_cnts, sorted_bounding_boxes)`` where each bounding
        box is an ``(x, y, w, h)`` tuple. Both are empty when ``cnts`` is
        empty.
    """
    # Guard the empty case: zip(*[]) below would raise ValueError.
    if len(cnts) == 0:
        return [], []
    # Sort key: y coordinate (index 1) for vertical methods, x for horizontal.
    axis = 1 if method in ("top-to-bottom", "bottom-to-top") else 0
    reverse = method in ("right-to-left", "bottom-to-top")
    bounding_boxes = [cv2.boundingRect(c) for c in cnts]
    cnts, bounding_boxes = zip(*sorted(zip(cnts, bounding_boxes),
                                       key=lambda pair: pair[1][axis],
                                       reverse=reverse))
    return cnts, bounding_boxes

def extract_cells_from_grid(table_img: np.ndarray) -> pd.DataFrame:
    """Extract a ruled table's cells from an image and OCR each cell.

    Args:
        table_img: BGR image of the table (as from ``load_local_image``).

    Returns:
        ``pd.DataFrame`` with one row per detected table row; short rows
        are right-padded with '' so all rows have equal length. An empty
        DataFrame when no cell-sized contours are found.
    """
    gray = cv2.cvtColor(table_img, cv2.COLOR_BGR2GRAY)
    # Invert before Otsu so dark ruling lines become white foreground.
    _, binary = cv2.threshold(~gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    mask = _grid_line_mask(binary)
    cells = _ocr_cell_boxes(table_img, mask)
    grouped_rows = _group_cells_into_rows(cells)

    # Within each row, order cells left-to-right and keep just the text.
    table_data = [
        [cell['text'] for cell in sorted(row, key=lambda c: c['x'])]
        for row in grouped_rows
    ]
    if not table_data:
        return pd.DataFrame()
    # Pad ragged rows with '' so the DataFrame is rectangular.
    max_cols = max(len(row) for row in table_data)
    for row in table_data:
        row.extend([''] * (max_cols - len(row)))
    return pd.DataFrame(table_data)


def _grid_line_mask(binary: np.ndarray) -> np.ndarray:
    """Morphologically isolate horizontal + vertical ruling lines in *binary*."""
    height, width = binary.shape[:2]
    # max(1, ...) guards against a zero-size kernel on images under 15 px,
    # which would make cv2.getStructuringElement raise.
    h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, width // 15), 1))
    v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(1, height // 15)))
    # Erode-then-dilate (opening) keeps only long runs along each axis.
    horizontal = cv2.dilate(cv2.erode(binary.copy(), h_kernel), h_kernel)
    vertical = cv2.dilate(cv2.erode(binary.copy(), v_kernel), v_kernel)
    return cv2.add(horizontal, vertical)


def _ocr_cell_boxes(table_img: np.ndarray, mask: np.ndarray) -> List[Dict[str, Any]]:
    """Find cell-sized contours in *mask*, OCR each crop, sort by position."""
    contours, _ = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    cells = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w > 30 and h > 20:  # skip noise and thin line fragments
            crop = table_img[y:y+h, x:x+w]
            try:
                # --psm 7: treat the crop as a single line of text.
                text = pytesseract.image_to_string(crop, config='--psm 7').strip()
            except Exception:
                # OCR failure on one cell must not abort the whole table;
                # record the cell with empty text instead.
                text = ''
            cells.append({'x': x, 'y': y, 'w': w, 'h': h, 'text': text})
    # Reading order: top-to-bottom, then left-to-right.
    cells.sort(key=lambda cell: (cell['y'], cell['x']))
    return cells


def _group_cells_into_rows(cells: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
    """Group position-sorted cells into rows using a 20 px y-distance threshold."""
    rows: List[List[Dict[str, Any]]] = []
    current_row: List[Dict[str, Any]] = []
    current_y = None  # y anchor of the current row; None until the first cell
    for cell in cells:
        if current_y is None or abs(cell['y'] - current_y) > 20:
            # Cell starts a new row; flush the previous one if any.
            if current_row:
                rows.append(current_row)
            current_row = [cell]
            current_y = cell['y']
        else:
            current_row.append(cell)
    if current_row:
        rows.append(current_row)
    return rows

def extract_image_content(image_path: str) -> str:
    """OCR an image file; render an embedded grid table when one is detected.

    Args:
        image_path: Path to a local image file.

    Returns:
        The extracted text. When a grid table is detected, the rendered
        table precedes the raw OCR text. On any failure a bracketed
        "[Error ...]" string is returned instead of raising.
    """
    try:
        img = load_local_image(image_path)

        # Full-page OCR first; used both as output and as a table heuristic.
        text = pytesseract.image_to_string(img)

        # Heuristic: pipe characters, tab characters, or more than three
        # lines of output suggest tabular content. (The original code used
        # doubled escapes '\\t'/'\\n', which matched a literal backslash
        # instead of tab/newline — fixed here.)
        if '|' in text or '\t' in text or len(text.split('\n')) > 3:
            try:
                table_df = extract_cells_from_grid(img)
                if not table_df.empty:
                    table_text = "\n".join(" | ".join(row) for row in table_df.values)
                    return f"[Table detected]\n{table_text}\n\n[OCR Text]\n{text}"
            except Exception:
                # Table extraction is best-effort; fall back to plain OCR.
                pass

        return text.strip() if text.strip() else "[No text detected in image]"

    except Exception as e:
        return f"[Error processing image: {str(e)}]"