File size: 11,957 Bytes
0e09218
 
 
 
 
 
 
 
 
 
 
fe90fd9
 
 
 
 
 
 
 
 
0e09218
 
 
fe90fd9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e09218
 
fe90fd9
 
5655435
fe90fd9
 
 
 
0e09218
 
fe90fd9
 
 
 
 
 
 
af88408
 
fe90fd9
 
af88408
fe90fd9
 
af88408
 
fe90fd9
af88408
 
 
 
fe90fd9
af88408
fe90fd9
 
af88408
fe90fd9
af88408
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe90fd9
 
 
 
 
 
 
 
 
 
 
0e09218
fe90fd9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e09218
fe90fd9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e09218
fe90fd9
 
 
0e09218
fe90fd9
 
 
0e09218
fe90fd9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e09218
fe90fd9
 
 
 
0e09218
 
fe90fd9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e09218
 
 
fe90fd9
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
import gradio as gr
import easyocr
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import os
import requests
from pathlib import Path
import pandas as pd
import pytesseract
from pytesseract import Output
import traceback
import logging
import sys

# Logging: send timestamped INFO-and-above records to stdout
_stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[_stdout_handler],
)
logger = logging.getLogger(__name__)

# Download and cache the font file
def get_font():
    """Return the path of a local Roboto TTF file, downloading it on first use.

    The font is cached as ``Roboto-Regular.ttf`` in the current working
    directory; later calls reuse that file without touching the network.

    Returns:
        The font path as a string, or ``None`` when the font could not be
        obtained (network error, timeout, HTTP error status, or a failure
        writing the file). Errors are logged, never raised.
    """
    try:
        logger.info("Attempting to get font...")
        font_path = Path("Roboto-Regular.ttf")
        if font_path.exists():
            logger.info("Font already exists")
            return str(font_path)

        logger.info("Font not found, downloading...")
        font_url = "https://github.com/google/fonts/raw/main/apache/roboto/Roboto-Regular.ttf"
        # Bound the wait so a stalled connection cannot hang the OCR request.
        response = requests.get(font_url, timeout=30)
        # Without this check an HTTP error page would be saved as the font
        # and then be treated as a valid cached file on every later call.
        response.raise_for_status()
        font_path.write_bytes(response.content)
        logger.info("Font downloaded successfully")
        return str(font_path)
    except Exception as e:
        logger.error(f"Error in get_font: {str(e)}")
        logger.error(traceback.format_exc())
        return None

# Initialize the EasyOCR reader (French + English models, CPU only) once at
# import time. `reader` is bound to None first so the name always exists:
# if initialization fails, the error is logged and later code sees None
# instead of hitting a NameError on an undefined global.
reader = None
try:
    logger.info("Initializing EasyOCR Reader for French...")
    reader = easyocr.Reader(['fr', 'en'], gpu=False)  # gpu=False: run on CPU
    logger.info("EasyOCR Reader initialized successfully")
except Exception as e:
    logger.error(f"Error initializing EasyOCR: {str(e)}")
    logger.error(traceback.format_exc())

def _run_tesseract(table_image):
    """Run Tesseract on the image and return its confident, non-blank words.

    Returns an empty DataFrame when the Tesseract call itself fails, so the
    caller can still return the EasyOCR text. If only the filtering step
    fails, the frame is returned as far as it had been filtered.
    """
    logger.info("Running Pytesseract for table detection...")
    try:
        pytesseract_config = r'--oem 3 --psm 6 -l fra'  # French language
        logger.info(f"Pytesseract config: {pytesseract_config}")
        df = pytesseract.image_to_data(table_image, output_type=Output.DATAFRAME, config=pytesseract_config)
        logger.info(f"Pytesseract returned dataframe with shape: {df.shape}")
    except Exception as e:
        logger.error(f"Pytesseract error: {str(e)}")
        logger.error(traceback.format_exc())
        return pd.DataFrame()  # Empty dataframe to continue processing

    try:
        if not df.empty:
            logger.info("Filtering low-confidence text...")
            df = df.dropna(subset=['text'])
            # Whitespace-only tokens are not NaN, so dropna keeps them; they
            # would otherwise show up as empty table cells.
            df = df[df['text'].astype(str).str.strip() != '']
            logger.info(f"After dropna, dataframe shape: {df.shape}")
            if 'conf' in df.columns:
                # Coerce first so a non-numeric 'conf' column cannot make the
                # comparison raise; unparseable values become NaN and are dropped.
                confidence = pd.to_numeric(df['conf'], errors='coerce')
                df = df[confidence > 50]
                logger.info(f"After confidence filtering, dataframe shape: {df.shape}")
            else:
                logger.warning("No 'conf' column found in pytesseract output")
    except Exception as e:
        logger.error(f"Error filtering dataframe: {str(e)}")
        logger.error(traceback.format_exc())
    return df


def _group_words_into_rows(block_df, row_threshold=10):
    """Group the words of one text block into rows, each ordered left to right.

    A new row starts when a word's 'top' differs from the 'top' of the word
    that opened the current row by more than ``row_threshold`` pixels.
    """
    rows = []
    current_row = []  # (left, text) pairs of the row being built
    row_top = None
    for _, word in block_df.sort_values(['top', 'left']).iterrows():
        if row_top is None or abs(word['top'] - row_top) > row_threshold:
            if current_row:
                rows.append(current_row)
                current_row = []
            row_top = word['top']
        current_row.append((word['left'], word['text']))
    if current_row:
        rows.append(current_row)

    # Words on the same line rarely share the exact same 'top', so the
    # (top, left) sort above can interleave them; re-order each row by 'left'.
    return [
        [text for _, text in sorted(row, key=lambda cell: cell[0])]
        for row in rows
    ]


def _extract_tables(df):
    """Build one DataFrame per text block that looks like a table.

    A block is treated as a table when it holds more than four words that
    fall into more than one row. Failures are logged and never raised.
    """
    tables = []
    try:
        if df.empty or 'block_num' not in df.columns:
            return tables

        logger.info("Attempting to identify tables...")
        blocks = df['block_num'].unique()
        logger.info(f"Found {len(blocks)} text blocks")

        for block in blocks:
            logger.info(f"Processing block {block}")
            block_df = df[df['block_num'] == block]
            if len(block_df) <= 4:  # Assuming a table has at least a few cells
                continue
            logger.info(f"Block {block} has {len(block_df)} cells, might be a table")

            table_rows = _group_words_into_rows(block_df)
            logger.info(f"Extracted {len(table_rows)} rows from potential table")
            if len(table_rows) <= 1:
                continue

            try:
                # Pad rows to equal length so they form a rectangular frame
                max_cols = max(len(row) for row in table_rows)
                logger.info(f"Table has {max_cols} columns")
                padded_rows = [row + [''] * (max_cols - len(row)) for row in table_rows]
                table_df = pd.DataFrame(padded_rows)
                tables.append(table_df)
                logger.info(f"Successfully created table with shape {table_df.shape}")
            except Exception as e:
                logger.error(f"Error creating table DataFrame: {str(e)}")
                logger.error(traceback.format_exc())
    except Exception as e:
        logger.error(f"Error in table detection: {str(e)}")
        logger.error(traceback.format_exc())
    return tables


def _annotate_image(image, results):
    """Return a copy of the image with a box and label drawn per text region.

    Falls back to an unmodified copy of the input when annotation fails as a
    whole; a failure on a single region only skips that region.
    """
    try:
        logger.info("Creating annotated image...")
        pil_image = Image.fromarray(image)
        draw = ImageDraw.Draw(pil_image)

        logger.info("Loading font...")
        try:
            font_path = get_font()
            if font_path:
                font = ImageFont.truetype(font_path, size=20)
                logger.info("Font loaded successfully")
            else:
                logger.warning("Font path is None, using default font")
                font = ImageFont.load_default()
        except Exception as e:
            logger.error(f"Error loading font: {str(e)}")
            logger.error(traceback.format_exc())
            font = ImageFont.load_default()
            logger.info("Using default font instead")

        logger.info("Drawing annotation boxes...")
        for i, (bbox, text, confidence) in enumerate(results):
            try:
                # Use the axis-aligned bounds of all corner points: for a
                # skewed box, bbox[0]/bbox[2] alone may not be the min/max
                # corners, which would make rectangle() reject the coordinates.
                xs = [int(point[0]) for point in bbox]
                ys = [int(point[1]) for point in bbox]
                top_left = (min(xs), min(ys))
                bottom_right = (max(xs), max(ys))

                draw.rectangle([top_left, bottom_right], outline="red", width=3)

                text_with_conf = f"{text} ({confidence:.2f})"
                draw.text(top_left, text_with_conf, fill="blue", font=font)

                logger.info(f"Drew annotation for text region {i+1}")
            except Exception as e:
                logger.error(f"Error drawing annotation for region {i+1}: {str(e)}")
                continue

        annotated_image = np.array(pil_image)
        logger.info("Annotated image created successfully")
        return annotated_image
    except Exception as e:
        logger.error(f"Error creating annotated image: {str(e)}")
        logger.error(traceback.format_exc())
        return image.copy()  # Return original image if annotation fails


def ocr_extract_text_and_tables(image):
    """Extract text and table-like blocks from an image and annotate it.

    EasyOCR (the module-level ``reader``) supplies the plain text and the
    regions drawn on the annotated image; Tesseract word positions are used
    to rebuild table rows.

    Args:
        image: The uploaded image as a NumPy array, or ``None`` when nothing
            was uploaded. A four-channel array is converted from RGBA to RGB.

    Returns:
        A 3-tuple ``(text_output, tables_output, annotated_image)``:
        one line per detected text region with its confidence, the detected
        tables rendered as text (empty string when none were found), and the
        annotated image array. With no input the result is
        ``("No image provided", None, None)``; on an unexpected error it is
        ``("Error: ...", "Processing failed", None)``.
    """
    try:
        logger.info("Starting OCR extraction...")

        if image is None:
            logger.warning("No image provided")
            return "No image provided", None, None

        logger.info(f"Image shape: {image.shape}, dtype: {image.dtype}")

        # Convert to RGB if needed
        if len(image.shape) == 3 and image.shape[2] == 4:  # RGBA
            logger.info("Converting RGBA to RGB")
            image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)

        # Separate copy for Tesseract, taken before EasyOCR sees the array
        table_image = image.copy()

        # 1. General text with EasyOCR
        logger.info("Running EasyOCR text detection...")
        results = reader.readtext(image)
        logger.info(f"EasyOCR detected {len(results)} text regions")

        detected_text = []
        for i, (bbox, text, confidence) in enumerate(results):
            logger.info(f"Text region {i+1}: '{text}' with confidence {confidence:.2f}")
            detected_text.append(f"{text} (Confidence: {confidence:.2f})")

        # 2. Table detection from Tesseract word positions
        df = _run_tesseract(table_image)
        tables = _extract_tables(df)
        logger.info(f"Detected {len(tables)} tables")

        # 3. Annotated image built from the EasyOCR regions
        annotated_image = _annotate_image(image, results)

        text_output = "\n".join(detected_text)
        tables_output = "".join(
            f"Table {number}:\n" + table.to_string(index=False, header=False) + "\n\n"
            for number, table in enumerate(tables, start=1)
        )

        logger.info("OCR extraction completed successfully")
        return text_output, tables_output, annotated_image

    except Exception as e:
        error_msg = f"Unexpected error in OCR extraction: {str(e)}"
        logger.error(error_msg)
        logger.error(traceback.format_exc())
        return f"Error: {error_msg}", "Processing failed", None

# Build the Gradio UI: one image input, two text outputs and the annotated image.
# A construction failure is logged rather than raised.
try:
    logger.info("Creating Gradio interface...")
    _image_input = gr.Image(type="numpy", label="Upload Image")
    _result_outputs = [
        gr.Textbox(label="Extracted Text (French)", elem_classes=["output-text"]),
        gr.Textbox(label="Extracted Tables", elem_classes=["output-text"]),
        gr.Image(label="Annotated Image"),
    ]
    iface = gr.Interface(
        fn=ocr_extract_text_and_tables,
        inputs=_image_input,
        outputs=_result_outputs,
        title="French OCR & Table Extractor",
        description="Upload an image containing French text and tables for OCR processing. The system will detect and extract both regular text and tabular data.",
        examples=[],  # Example images can be listed here
        cache_examples=True,
    )
    logger.info("Gradio interface created successfully")
except Exception as build_error:
    logger.error(f"Error creating Gradio interface: {build_error}")
    logger.error(traceback.format_exc())

# Start the web UI only when this file is run as a script; launch errors are
# logged with their traceback instead of propagating.
if __name__ == "__main__":
    try:
        logger.info("Launching Gradio interface...")
        iface.launch()
        logger.info("Gradio interface launched successfully")
    except Exception as launch_error:
        logger.error(f"Error launching Gradio interface: {launch_error}")
        logger.error(traceback.format_exc())