GiantAnalytics committed on
Commit
fe90fd9
·
verified ·
1 Parent(s): b828227

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +237 -125
app.py CHANGED
@@ -9,146 +9,258 @@ from pathlib import Path
9
  import pandas as pd
10
  import pytesseract
11
  from pytesseract import Output
 
 
 
 
 
 
 
 
 
12
 
13
  # Download and cache the font file
14
  def get_font():
15
- font_path = Path("Roboto-Regular.ttf")
16
- if not font_path.exists():
17
- font_url = "https://github.com/google/fonts/raw/main/apache/roboto/Roboto-Regular.ttf"
18
- response = requests.get(font_url)
19
- font_path.write_bytes(response.content)
20
- return str(font_path)
 
 
 
 
 
 
 
 
 
 
21
 
22
  # Initialize EasyOCR Reader for French
23
- reader = easyocr.Reader(['fr'], gpu=True) # Set gpu=False if no GPU available
 
 
 
 
 
 
24
 
25
  def ocr_extract_text_and_tables(image):
26
- if image is None:
27
- return "No image provided", None, None
28
-
29
- # Convert to RGB if needed
30
- if len(image.shape) == 3 and image.shape[2] == 4: # RGBA
31
- image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
32
-
33
- # Create copy for table detection
34
- table_image = image.copy()
35
-
36
- # 1. First extract general text with EasyOCR
37
- results = reader.readtext(image)
38
-
39
- # Prepare text output and confidence scores
40
- detected_text = []
41
- for (_, text, confidence) in results:
42
- detected_text.append(f"{text} (Confidence: {confidence:.2f})")
43
-
44
- # 2. Use pytesseract for table detection and extraction
45
- # This approach uses pytesseract's data.frame output to identify potential tables
46
- pytesseract_config = r'--oem 3 --psm 6 -l fra' # French language
47
- df = pytesseract.image_to_data(table_image, output_type=Output.DATAFRAME, config=pytesseract_config)
48
-
49
- # Filter out low-confidence text
50
- df = df.dropna(subset=['text']).query('conf > 50')
51
-
52
- # Try to identify table structures based on alignment and spacing
53
- tables = []
54
-
55
- # Simple table detection: look for text that's aligned in columns with similar x-coordinates
56
- # Group by block_num which often separates tables
57
- blocks = df['block_num'].unique()
58
-
59
- for block in blocks:
60
- block_df = df[df['block_num'] == block]
61
- if len(block_df) > 4: # Assuming a table has at least a few cells
62
- # Sort by top-to-bottom (vertical position)
63
- sorted_df = block_df.sort_values(['top', 'left'])
64
-
65
- # Convert to pandas table format
66
- table_rows = []
67
- current_row = []
68
- last_top = -100
69
-
70
- for _, row in sorted_df.iterrows():
71
- # If we're on a new row (based on vertical position)
72
- if abs(row['top'] - last_top) > 10: # Threshold for new row
73
- if current_row:
74
- table_rows.append(current_row)
75
- current_row = []
76
- last_top = row['top']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
- current_row.append(row['text'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- # Add the last row
81
- if current_row:
82
- table_rows.append(current_row)
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
- # If we have multiple rows, we might have a table
85
- if len(table_rows) > 1:
86
- # Try to create a pandas DataFrame
87
  try:
88
- # Pad rows to have equal length
89
- max_cols = max(len(row) for row in table_rows)
90
- padded_rows = [row + [''] * (max_cols - len(row)) for row in table_rows]
91
 
92
- # Create DataFrame
93
- table_df = pd.DataFrame(padded_rows)
94
- # Add to tables list
95
- tables.append(table_df)
96
- except:
97
- pass
98
-
99
- # Create annotated image
100
- pil_image = Image.fromarray(image)
101
- draw = ImageDraw.Draw(pil_image)
102
-
103
- # Get font for annotation
104
- try:
105
- font = ImageFont.truetype(get_font(), size=20)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  except Exception as e:
107
- print(f"Error loading font: {e}")
108
- font = ImageFont.load_default()
109
-
110
- # Draw boxes and text for regular text detection
111
- for (bbox, text, confidence) in results:
112
- # Convert points to integers
113
- top_left = tuple(map(int, bbox[0]))
114
- bottom_right = tuple(map(int, bbox[2]))
115
-
116
- # Draw rectangle
117
- draw.rectangle([top_left, bottom_right], outline="red", width=3)
118
-
119
- # Draw text with confidence
120
- text_with_conf = f"{text} ({confidence:.2f})"
121
- draw.text(top_left, text_with_conf, fill="blue", font=font)
122
-
123
- # Convert back to numpy array
124
- annotated_image = np.array(pil_image)
125
-
126
- # Join detected text with proper formatting
127
- text_output = "\n".join(detected_text)
128
-
129
- # Format tables for display
130
- tables_output = ""
131
- for i, table in enumerate(tables):
132
- tables_output += f"Table {i+1}:\n"
133
- tables_output += table.to_string(index=False, header=False) + "\n\n"
134
-
135
- return text_output, tables_output, annotated_image
136
 
137
  # Create Gradio interface
138
- iface = gr.Interface(
139
- fn=ocr_extract_text_and_tables,
140
- inputs=gr.Image(type="numpy", label="Upload Image"),
141
- outputs=[
142
- gr.Textbox(label="Extracted Text (French)", elem_classes=["output-text"]),
143
- gr.Textbox(label="Extracted Tables", elem_classes=["output-text"]),
144
- gr.Image(label="Annotated Image")
145
- ],
146
- title="French OCR & Table Extractor",
147
- description="Upload an image containing French text and tables for OCR processing. The system will detect and extract both regular text and tabular data.",
148
- examples=[], # You can add example images here
149
- cache_examples=True
150
- )
 
 
 
 
 
 
151
 
152
  # Launch the interface
153
  if __name__ == "__main__":
154
- iface.launch()
 
 
 
 
 
 
 
9
  import pandas as pd
10
  import pytesseract
11
  from pytesseract import Output
12
+ import traceback
13
+ import logging
14
+ import sys
15
+
16
+ # Set up logging
17
+ logging.basicConfig(level=logging.INFO,
18
+ format='%(asctime)s - %(levelname)s - %(message)s',
19
+ handlers=[logging.StreamHandler(sys.stdout)])
20
+ logger = logging.getLogger(__name__)
21
 
22
  # Download and cache the font file
23
  def get_font():
24
+ try:
25
+ logger.info("Attempting to get font...")
26
+ font_path = Path("Roboto-Regular.ttf")
27
+ if not font_path.exists():
28
+ logger.info("Font not found, downloading...")
29
+ font_url = "https://github.com/google/fonts/raw/main/apache/roboto/Roboto-Regular.ttf"
30
+ response = requests.get(font_url)
31
+ font_path.write_bytes(response.content)
32
+ logger.info("Font downloaded successfully")
33
+ else:
34
+ logger.info("Font already exists")
35
+ return str(font_path)
36
+ except Exception as e:
37
+ logger.error(f"Error in get_font: {str(e)}")
38
+ logger.error(traceback.format_exc())
39
+ return None
40
 
41
  # Initialize EasyOCR Reader for French
42
+ try:
43
+ logger.info("Initializing EasyOCR Reader for French...")
44
+ reader = easyocr.Reader(['fr'], gpu=False) # Changed to False since you're on CPU
45
+ logger.info("EasyOCR Reader initialized successfully")
46
+ except Exception as e:
47
+ logger.error(f"Error initializing EasyOCR: {str(e)}")
48
+ logger.error(traceback.format_exc())
49
 
50
  def ocr_extract_text_and_tables(image):
51
+ try:
52
+ logger.info("Starting OCR extraction...")
53
+
54
+ if image is None:
55
+ logger.warning("No image provided")
56
+ return "No image provided", None, None
57
+
58
+ logger.info(f"Image shape: {image.shape}, dtype: {image.dtype}")
59
+
60
+ # Convert to RGB if needed
61
+ if len(image.shape) == 3 and image.shape[2] == 4: # RGBA
62
+ logger.info("Converting RGBA to RGB")
63
+ image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
64
+
65
+ # Create copy for table detection
66
+ table_image = image.copy()
67
+
68
+ # 1. First extract general text with EasyOCR
69
+ logger.info("Running EasyOCR text detection...")
70
+ results = reader.readtext(image)
71
+ logger.info(f"EasyOCR detected {len(results)} text regions")
72
+
73
+ # Prepare text output and confidence scores
74
+ detected_text = []
75
+ for i, (bbox, text, confidence) in enumerate(results):
76
+ logger.info(f"Text region {i+1}: '{text}' with confidence {confidence:.2f}")
77
+ detected_text.append(f"{text} (Confidence: {confidence:.2f})")
78
+
79
+ # 2. Use pytesseract for table detection and extraction
80
+ logger.info("Running Pytesseract for table detection...")
81
+ try:
82
+ pytesseract_config = r'--oem 3 --psm 6 -l fra' # French language
83
+ logger.info(f"Pytesseract config: {pytesseract_config}")
84
+ df = pytesseract.image_to_data(table_image, output_type=Output.DATAFRAME, config=pytesseract_config)
85
+ logger.info(f"Pytesseract returned dataframe with shape: {df.shape}")
86
+ except Exception as e:
87
+ logger.error(f"Pytesseract error: {str(e)}")
88
+ logger.error(traceback.format_exc())
89
+ df = pd.DataFrame() # Empty dataframe to continue processing
90
+
91
+ # Filter out low-confidence text
92
+ try:
93
+ if not df.empty:
94
+ logger.info("Filtering low-confidence text...")
95
+ df = df.dropna(subset=['text'])
96
+ logger.info(f"After dropna, dataframe shape: {df.shape}")
97
+ if 'conf' in df.columns:
98
+ df = df.query('conf > 50')
99
+ logger.info(f"After confidence filtering, dataframe shape: {df.shape}")
100
+ else:
101
+ logger.warning("No 'conf' column found in pytesseract output")
102
+ except Exception as e:
103
+ logger.error(f"Error filtering dataframe: {str(e)}")
104
+ logger.error(traceback.format_exc())
105
+
106
+ # Try to identify table structures based on alignment and spacing
107
+ tables = []
108
+
109
+ try:
110
+ if not df.empty and 'block_num' in df.columns:
111
+ logger.info("Attempting to identify tables...")
112
+ # Simple table detection: look for text that's aligned in columns with similar x-coordinates
113
+ # Group by block_num which often separates tables
114
+ blocks = df['block_num'].unique()
115
+ logger.info(f"Found {len(blocks)} text blocks")
116
 
117
+ for block in blocks:
118
+ logger.info(f"Processing block {block}")
119
+ block_df = df[df['block_num'] == block]
120
+ if len(block_df) > 4: # Assuming a table has at least a few cells
121
+ logger.info(f"Block {block} has {len(block_df)} cells, might be a table")
122
+ # Sort by top-to-bottom (vertical position)
123
+ sorted_df = block_df.sort_values(['top', 'left'])
124
+
125
+ # Convert to pandas table format
126
+ table_rows = []
127
+ current_row = []
128
+ last_top = -100
129
+
130
+ for _, row in sorted_df.iterrows():
131
+ # If we're on a new row (based on vertical position)
132
+ if abs(row['top'] - last_top) > 10: # Threshold for new row
133
+ if current_row:
134
+ table_rows.append(current_row)
135
+ current_row = []
136
+ last_top = row['top']
137
+
138
+ current_row.append(row['text'])
139
+
140
+ # Add the last row
141
+ if current_row:
142
+ table_rows.append(current_row)
143
+
144
+ logger.info(f"Extracted {len(table_rows)} rows from potential table")
145
+
146
+ # If we have multiple rows, we might have a table
147
+ if len(table_rows) > 1:
148
+ # Try to create a pandas DataFrame
149
+ try:
150
+ # Pad rows to have equal length
151
+ max_cols = max(len(row) for row in table_rows)
152
+ logger.info(f"Table has {max_cols} columns")
153
+ padded_rows = [row + [''] * (max_cols - len(row)) for row in table_rows]
154
+
155
+ # Create DataFrame
156
+ table_df = pd.DataFrame(padded_rows)
157
+ # Add to tables list
158
+ tables.append(table_df)
159
+ logger.info(f"Successfully created table with shape {table_df.shape}")
160
+ except Exception as e:
161
+ logger.error(f"Error creating table DataFrame: {str(e)}")
162
+ logger.error(traceback.format_exc())
163
+ except Exception as e:
164
+ logger.error(f"Error in table detection: {str(e)}")
165
+ logger.error(traceback.format_exc())
166
+
167
+ logger.info(f"Detected {len(tables)} tables")
168
+
169
+ # Create annotated image
170
+ try:
171
+ logger.info("Creating annotated image...")
172
+ pil_image = Image.fromarray(image)
173
+ draw = ImageDraw.Draw(pil_image)
174
 
175
+ # Get font for annotation
176
+ logger.info("Loading font...")
177
+ try:
178
+ font_path = get_font()
179
+ if font_path:
180
+ font = ImageFont.truetype(font_path, size=20)
181
+ logger.info("Font loaded successfully")
182
+ else:
183
+ logger.warning("Font path is None, using default font")
184
+ font = ImageFont.load_default()
185
+ except Exception as e:
186
+ logger.error(f"Error loading font: {str(e)}")
187
+ logger.error(traceback.format_exc())
188
+ font = ImageFont.load_default()
189
+ logger.info("Using default font instead")
190
 
191
+ # Draw boxes and text for regular text detection
192
+ logger.info("Drawing annotation boxes...")
193
+ for i, (bbox, text, confidence) in enumerate(results):
194
  try:
195
+ # Convert points to integers
196
+ top_left = tuple(map(int, bbox[0]))
197
+ bottom_right = tuple(map(int, bbox[2]))
198
 
199
+ # Draw rectangle
200
+ draw.rectangle([top_left, bottom_right], outline="red", width=3)
201
+
202
+ # Draw text with confidence
203
+ text_with_conf = f"{text} ({confidence:.2f})"
204
+ draw.text(top_left, text_with_conf, fill="blue", font=font)
205
+
206
+ logger.info(f"Drew annotation for text region {i+1}")
207
+ except Exception as e:
208
+ logger.error(f"Error drawing annotation for region {i+1}: {str(e)}")
209
+ continue
210
+
211
+ # Convert back to numpy array
212
+ annotated_image = np.array(pil_image)
213
+ logger.info("Annotated image created successfully")
214
+ except Exception as e:
215
+ logger.error(f"Error creating annotated image: {str(e)}")
216
+ logger.error(traceback.format_exc())
217
+ annotated_image = image.copy() # Return original image if annotation fails
218
+
219
+ # Join detected text with proper formatting
220
+ text_output = "\n".join(detected_text)
221
+
222
+ # Format tables for display
223
+ tables_output = ""
224
+ for i, table in enumerate(tables):
225
+ tables_output += f"Table {i+1}:\n"
226
+ tables_output += table.to_string(index=False, header=False) + "\n\n"
227
+
228
+ logger.info("OCR extraction completed successfully")
229
+ return text_output, tables_output, annotated_image
230
+
231
  except Exception as e:
232
+ error_msg = f"Unexpected error in OCR extraction: {str(e)}"
233
+ logger.error(error_msg)
234
+ logger.error(traceback.format_exc())
235
+ return f"Error: {error_msg}", "Processing failed", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
  # Create Gradio interface
238
+ try:
239
+ logger.info("Creating Gradio interface...")
240
+ iface = gr.Interface(
241
+ fn=ocr_extract_text_and_tables,
242
+ inputs=gr.Image(type="numpy", label="Upload Image"),
243
+ outputs=[
244
+ gr.Textbox(label="Extracted Text (French)", elem_classes=["output-text"]),
245
+ gr.Textbox(label="Extracted Tables", elem_classes=["output-text"]),
246
+ gr.Image(label="Annotated Image")
247
+ ],
248
+ title="French OCR & Table Extractor",
249
+ description="Upload an image containing French text and tables for OCR processing. The system will detect and extract both regular text and tabular data.",
250
+ examples=[], # You can add example images here
251
+ cache_examples=True
252
+ )
253
+ logger.info("Gradio interface created successfully")
254
+ except Exception as e:
255
+ logger.error(f"Error creating Gradio interface: {str(e)}")
256
+ logger.error(traceback.format_exc())
257
 
258
  # Launch the interface
259
  if __name__ == "__main__":
260
+ try:
261
+ logger.info("Launching Gradio interface...")
262
+ iface.launch()
263
+ logger.info("Gradio interface launched successfully")
264
+ except Exception as e:
265
+ logger.error(f"Error launching Gradio interface: {str(e)}")
266
+ logger.error(traceback.format_exc())