KarthiEz committed on
Commit
df31586
·
verified ·
1 Parent(s): 44aba4c

Upload 4 files

Browse files
Files changed (4) hide show
  1. README_HF.md +62 -0
  2. app.py +578 -0
  3. app_gradio.py +155 -0
  4. requirements_hf.txt +21 -0
README_HF.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Invoice Extraction with Layout Preservation
3
+ emoji: 📄
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 4.0.0
8
+ app_file: app_gradio.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # Invoice Extraction with Layout Preservation
14
+
15
+ Extract text from invoice images while preserving the original layout and formatting using advanced OCR technology.
16
+
17
+ ## Features
18
+
19
+ - ✅ **Precise Text Extraction** - Uses PP-OCRv5 (latest OCR engine)
20
+ - ✅ **Table Recognition** - Advanced table recognition with cell-level accuracy
21
+ - ✅ **Layout Preservation** - Maintains original document layout and spacing
22
+ - ✅ **Smart Spacing** - Intelligent spacing detection between text elements
23
+ - ✅ **Column Alignment** - Proper column alignment for tables and multi-column layouts
24
+
25
+ ## How to Use
26
+
27
+ 1. Upload an invoice image (JPG, PNG, or other image formats)
28
+ 2. Click "Extract Text"
29
+ 3. View the extracted text with preserved layout in the output box
30
+ 4. Copy the text for further use
31
+
32
+ ## Technology Stack
33
+
34
+ - **PaddlePaddle 3.2.2** - Deep learning framework
35
+ - **PPStructureV3** - Document structure analysis
36
+ - **PP-OCRv5** - Latest OCR engine for text recognition
37
+ - **Gradio** - Web interface
38
+
39
+ ## Performance
40
+
41
+ - First run: Models are downloaded and initialized (~30-60 seconds)
42
+ - Subsequent runs: Fast processing using cached models
43
+ - Model source check: Disabled for faster startup
44
+
45
+ ## Use Cases
46
+
47
+ - Invoice processing and data extraction
48
+ - Document digitization
49
+ - Automated data entry
50
+ - Financial document analysis
51
+ - Receipt processing
52
+
53
+ ## Limitations
54
+
55
+ - Best results with clear, high-resolution images
56
+ - Works best with English text (can be extended to other languages)
57
+ - Complex layouts may require manual review
58
+
59
+ ## License
60
+
61
+ MIT License
62
+
app.py ADDED
@@ -0,0 +1,578 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Advanced Document Extraction with Layout Preservation
3
+ Using PaddlePaddle 3.2.2 + PPStructureV3 + PP-OCRv5
4
+ Latest technologies for precise layout preservation
5
+ """
6
+
7
+ import os
8
+ import time
9
+
10
+ # CRITICAL: Set environment variables BEFORE any other imports
11
+ # This must be done before importing paddleocr to disable connectivity checks
12
+ os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = '1' # Use '1' for True
13
+ os.environ['DISABLE_MODEL_SOURCE_CHECK'] = '1' # Also set this for compatibility
14
+
15
+ # Suppress warnings about connectivity checks
16
+ import warnings
17
+ warnings.filterwarnings('ignore', message='.*Checking connectivity.*')
18
+ warnings.filterwarnings('ignore', message='.*model hoster.*')
19
+
20
+ import cv2
21
+ from paddleocr import PPStructureV3
22
+ from pathlib import Path
23
+ import json
24
+ from typing import List, Dict, Any
25
+ import numpy as np
26
+ from html.parser import HTMLParser
27
+
28
+
29
class TableHTMLParser(HTMLParser):
    """Parse an HTML ``<table>`` fragment into a list of text rows.

    After ``feed()``, ``self.rows`` holds one list of cell strings per
    ``<tr>``; any row still open when parsing ends remains in
    ``self.current_row`` (callers flush it themselves if needed).
    """

    def __init__(self):
        super().__init__()
        self.rows = []          # completed rows: list[list[str]]
        self.current_row = []   # cells of the <tr> being parsed
        self.in_cell = False    # True while inside a <td>/<th>
        self.current_cell = []  # text fragments of the current cell

    def handle_starttag(self, tag, attrs):
        if tag == 'tr':
            # A new row implicitly closes an unfinished previous one.
            if self.current_row:
                self.rows.append(self.current_row)
                self.current_row = []
        elif tag in ['td', 'th']:
            self.in_cell = True
            self.current_cell = []

    def handle_endtag(self, tag):
        if tag in ['td', 'th']:
            cell_text = ' '.join(self.current_cell).strip()
            self.current_row.append(cell_text)
            self.in_cell = False
            self.current_cell = []
        elif tag == 'tr':
            if self.current_row:
                self.rows.append(self.current_row)
                self.current_row = []

    def handle_data(self, data):
        # BUGFIX: only keep non-empty fragments. Whitespace-only data chunks
        # (e.g. between nested tags inside a cell) used to be appended as ''
        # and produced doubled interior spaces after ' '.join().
        if self.in_cell:
            fragment = data.strip()
            if fragment:
                self.current_cell.append(fragment)
61
+
62
+
63
def calculate_spacing(gap_pixels: float, PIXELS_PER_CHAR: int) -> int:
    """Map a horizontal pixel gap to a number of separating spaces.

    Tiny gaps collapse to a single space, medium gaps to one or two, and
    large gaps keep their proportional width so columns stay aligned.
    """
    if gap_pixels >= 30:
        # Large gap: preserve the exact column distance.
        return int(gap_pixels / PIXELS_PER_CHAR)
    if gap_pixels >= 10:
        # Medium gap: one base space plus one extra per ~20px.
        return 1 + int(gap_pixels / 20)
    # Very small gap: a single separating space.
    return 1
74
+
75
+
76
def format_text_with_layout(result: List[Dict[str, Any]], img_height: int, img_width: int) -> str:
    """
    Format extracted text preserving exact spatial layout.

    PPStructureV3 returns a list with one dict containing:
    - parsing_res_list: List of parsed regions with label, bbox, content
    - table_res_list: List of tables with cell_box_list, pred_html
    - overall_ocr_res: OCR results with rec_texts, rec_polys, rec_boxes

    Args:
        result: Raw PPStructureV3 prediction output (list of page dicts).
        img_height: Source image height in pixels (kept for interface
            stability; not used by the current algorithm).
        img_width: Source image width in pixels; determines output line width.

    Returns:
        Plain text where character columns approximate the original pixel
        positions; empty string when ``result`` is empty or malformed.
    """
    PIXELS_PER_CHAR = 5
    MAX_LINE_WIDTH = int(img_width / PIXELS_PER_CHAR) + 400

    all_text_elements = []
    table_bboxes = []

    # PPStructureV3 returns a list with one dict
    if not result or not isinstance(result[0], dict):
        return ""

    page_data = result[0]
    parsing_res_list = page_data.get('parsing_res_list', [])
    table_res_list = page_data.get('table_res_list', [])
    overall_ocr_res = page_data.get('overall_ocr_res', {})

    # Extract precise OCR coordinates
    ocr_boxes = []
    ocr_texts = []
    if overall_ocr_res:
        # Handle both numpy array and list formats
        rec_boxes = overall_ocr_res.get('rec_boxes', [])
        rec_texts_list = overall_ocr_res.get('rec_texts', [])

        if isinstance(rec_boxes, np.ndarray):
            ocr_boxes = rec_boxes.tolist()
        else:
            ocr_boxes = rec_boxes if rec_boxes else []

        ocr_texts = rec_texts_list if rec_texts_list else []

    # First pass: identify table regions from parsing_res_list
    for region in parsing_res_list:
        # Handle both dict and LayoutBlock object
        if isinstance(region, dict):
            region_type = region.get('label', '')
            bbox = region.get('bbox', [])
        else:
            # LayoutBlock object - access attributes directly
            region_type = getattr(region, 'label', '')
            bbox = getattr(region, 'bbox', [])

        # Store table bounding boxes (in document order)
        if region_type == 'table':
            if len(bbox) >= 4:
                table_bboxes.append((bbox[0], bbox[1], bbox[2], bbox[3]))

    # Process table regions from table_res_list with precise cell positions
    for table_idx, table_res in enumerate(table_res_list):
        # BUGFIX: the previous implementation re-scanned parsing_res_list per
        # table and always took the FIRST region labelled 'table', so every
        # table shared one bbox. Pair each table with its own bbox instead
        # (table_bboxes was collected above in the same document order —
        # assumes table_res_list follows that order; TODO confirm upstream).
        table_bbox = table_bboxes[table_idx] if table_idx < len(table_bboxes) else None

        if not table_bbox or len(table_bbox) < 4:
            continue

        # Extract cell_box_list - precise bounding boxes for each table cell
        cell_box_list = table_res.get('cell_box_list', [])
        pred_html = table_res.get('pred_html', '')
        table_ocr_pred = table_res.get('table_ocr_pred', {})
        table_rec_texts = table_ocr_pred.get('rec_texts', [])

        # Convert cell_box_list to list if it's a numpy array
        if isinstance(cell_box_list, np.ndarray):
            cell_box_list = cell_box_list.tolist()

        # Parse HTML to get cell structure
        if pred_html and len(cell_box_list) > 0:
            try:
                parser = TableHTMLParser()
                parser.feed(pred_html)
                # Flush a trailing row left open by unterminated HTML.
                if parser.current_row:
                    parser.rows.append(parser.current_row)

                # Match HTML cells with cell_box_list
                # cell_box_list contains [x1, y1, x2, y2] for each cell in row-major order
                cell_idx = 0

                for row_idx, row in enumerate(parser.rows):
                    for col_idx, cell_text in enumerate(row):
                        if cell_idx < len(cell_box_list):
                            # Get precise cell bounding box
                            cell_box = cell_box_list[cell_idx]

                            # Handle both list and numpy array formats
                            if isinstance(cell_box, np.ndarray):
                                cell_box = cell_box.tolist()

                            if len(cell_box) >= 4:
                                cx1, cy1, cx2, cy2 = cell_box[0], cell_box[1], cell_box[2], cell_box[3]

                                # Prefer table-specific OCR text over the HTML cell text
                                cell_text_final = cell_text
                                if cell_idx < len(table_rec_texts) and table_rec_texts[cell_idx]:
                                    cell_text_final = table_rec_texts[cell_idx]

                                # Use center Y for positioning
                                cell_center_y = (cy1 + cy2) / 2

                                all_text_elements.append({
                                    'y': int(cell_center_y),
                                    'x': int(cx1),
                                    'x2': int(cx2),
                                    'y2': int(cy2),
                                    'text': cell_text_final.strip() if cell_text_final else '',
                                    'type': 'table_cell',
                                    'is_table': True,
                                    'row_idx': row_idx,
                                    'col_idx': col_idx
                                })
                            cell_idx += 1
            except Exception as e:
                print(f"Warning: Table parsing error: {e}")
                import traceback
                traceback.print_exc()

    # Process non-table text using precise OCR coordinates from overall_ocr_res.
    # Filter out OCR boxes that fall within table regions to avoid duplicates.
    if ocr_boxes and ocr_texts:
        for ocr_box, ocr_text in zip(ocr_boxes, ocr_texts):
            if not ocr_text or not ocr_text.strip():
                continue

            # Handle both list and numpy array formats
            if isinstance(ocr_box, np.ndarray):
                ocr_box = ocr_box.tolist()

            if len(ocr_box) >= 4:
                ox1, oy1, ox2, oy2 = ocr_box[0], ocr_box[1], ocr_box[2], ocr_box[3]

                # Check if this OCR box is inside a table region
                in_table = False
                for tx1, ty1, tx2, ty2 in table_bboxes:
                    # An OCR box counts as "inside" when its center is within the table
                    center_x = (ox1 + ox2) / 2
                    center_y = (oy1 + oy2) / 2
                    if tx1 <= center_x <= tx2 and ty1 <= center_y <= ty2:
                        in_table = True
                        break

                # Only add if not in table (table cells already processed)
                if not in_table:
                    # Use center Y for positioning
                    center_y = (oy1 + oy2) / 2

                    all_text_elements.append({
                        'y': int(center_y),
                        'x': int(ox1),
                        'x2': int(ox2),
                        'y2': int(oy2),
                        'text': ocr_text.strip(),
                        'type': 'text',
                        'is_table': False
                    })

    # Group text elements by Y position (row clustering)
    Y_TOLERANCE_BASE = 10
    Y_TOLERANCE_TABLE = 20  # Reduced for better row grouping

    # Separate table cells and non-table elements
    table_cells = [e for e in all_text_elements if e.get('is_table', False)]
    non_table_elements = [e for e in all_text_elements if not e.get('is_table', False)]

    lines_dict = {}

    # Group table cells by row using actual Y-coordinates with improved clustering
    if table_cells:
        # Sort by Y, then by X for consistent ordering
        table_cells_sorted = sorted(table_cells, key=lambda x: (x['y'], x['x']))

        # Use row_idx if available (from HTML parsing), otherwise cluster by Y
        table_rows = []
        if table_cells_sorted and 'row_idx' in table_cells_sorted[0]:
            # Group by row_idx first
            row_groups = {}
            for cell in table_cells_sorted:
                row_idx = cell.get('row_idx', 0)
                if row_idx not in row_groups:
                    row_groups[row_idx] = []
                row_groups[row_idx].append(cell)

            # Convert to list and sort by row_idx
            for row_idx in sorted(row_groups.keys()):
                row_cells = row_groups[row_idx]
                # Sort cells within row by X (col_idx if available)
                row_cells.sort(key=lambda x: (x.get('col_idx', 0), x['x']))
                table_rows.append(row_cells)
        else:
            # Fallback: cluster by Y-coordinate
            current_row = [table_cells_sorted[0]]
            current_row_y = table_cells_sorted[0]['y']

            for cell in table_cells_sorted[1:]:
                cell_y = cell['y']
                if abs(cell_y - current_row_y) <= Y_TOLERANCE_TABLE:
                    current_row.append(cell)
                    # Use median Y for better row representation
                    current_row_y = sorted([c['y'] for c in current_row])[len(current_row) // 2]
                else:
                    # Sort current row by X before adding
                    current_row.sort(key=lambda x: x['x'])
                    table_rows.append(current_row)
                    current_row = [cell]
                    current_row_y = cell_y

            if current_row:
                current_row.sort(key=lambda x: x['x'])
                table_rows.append(current_row)

        # Add table rows to lines_dict using median Y
        for row_cells in table_rows:
            if row_cells:
                # Use median Y for row representation
                row_ys = [cell['y'] for cell in row_cells]
                median_y = sorted(row_ys)[len(row_ys) // 2]
                if median_y not in lines_dict:
                    lines_dict[median_y] = []
                lines_dict[median_y].extend(row_cells)

    # Group non-table elements by Y position
    for elem in non_table_elements:
        y_pos = elem['y']
        matched_line = None

        # Find closest existing line within tolerance
        for existing_y in lines_dict.keys():
            if abs(existing_y - y_pos) <= Y_TOLERANCE_BASE:
                matched_line = existing_y
                break

        if matched_line is None:
            matched_line = y_pos

        if matched_line not in lines_dict:
            lines_dict[matched_line] = []
        lines_dict[matched_line].append(elem)

    # Build formatted output with precise positioning and smart spacing
    formatted_lines = []
    sorted_y_positions = sorted(lines_dict.keys())
    last_y = None

    for y_pos in sorted_y_positions:
        items = lines_dict[y_pos]
        items.sort(key=lambda x: x['x'])

        # Add blank lines for vertical spacing
        if last_y is not None:
            gap = y_pos - last_y
            if gap > 30:
                blank_lines = min(3, int(gap / 40))
                for _ in range(blank_lines):
                    formatted_lines.append('')

        # Build line with precise character positioning and smart spacing
        line_array = [' '] * MAX_LINE_WIDTH

        prev_x2 = None  # Track end position of previous text element

        for item_idx, item in enumerate(items):
            x_pos = item['x']
            x2_pos = item.get('x2', x_pos)
            text = item['text'].strip()
            if not text:
                continue

            is_table_cell = item.get('is_table', False)
            char_col = int(x_pos / PIXELS_PER_CHAR)
            char_col = max(0, min(char_col, MAX_LINE_WIDTH - len(text) - 1))

            # Calculate spacing from previous element
            if prev_x2 is not None and item_idx > 0:
                gap_pixels = x_pos - prev_x2
                if gap_pixels > 0:
                    spaces_to_add = calculate_spacing(gap_pixels, PIXELS_PER_CHAR)
                    # Ensure we don't overwrite existing text
                    prev_char_col_end = int(prev_x2 / PIXELS_PER_CHAR)
                    if char_col > prev_char_col_end:
                        # Add spaces between elements
                        for s in range(min(spaces_to_add, char_col - prev_char_col_end)):
                            space_pos = prev_char_col_end + s
                            if space_pos < MAX_LINE_WIDTH and line_array[space_pos] == ' ':
                                line_array[space_pos] = ' '

            # Place text at calculated position
            for i, char in enumerate(text):
                pos = char_col + i
                if pos < MAX_LINE_WIDTH:
                    if is_table_cell:
                        # For table cells, overwrite to ensure proper alignment
                        line_array[pos] = char
                    elif line_array[pos] == ' ':
                        # For non-table text, only place if position is empty
                        line_array[pos] = char

            prev_x2 = x2_pos

        # Convert to string
        line_str = ''.join(line_array).rstrip()
        if line_str.strip():
            formatted_lines.append(line_str)

        last_y = y_pos

    return '\n'.join(formatted_lines)
400
+
401
+
402
# Global engine cache to avoid reinitializing on multiple runs
_engine_cache = None


def main() -> None:
    """Run the full extraction pipeline on a hard-coded invoice image.

    Steps: initialize (or reuse) the PPStructureV3 engine, run structure
    analysis on ``test_invoice2.jpg``, format the result with
    ``format_text_with_layout``, print it, and save text + JSON outputs
    under ``./output_results``. All failures are reported to stdout and
    abort the run early.
    """
    global _engine_cache

    # Start total timer
    total_start = time.time()

    # Configuration (hard-coded input image and output directory)
    img_path = 'test_invoice2.jpg'
    save_folder = './output_results'

    # Create output directory
    Path(save_folder).mkdir(exist_ok=True)

    # Check if image exists
    if not os.path.exists(img_path):
        print(f"Error: Image file '{img_path}' not found!")
        return

    # Initialize PPStructureV3 with optimized settings (reuse if already initialized)
    print("=" * 80)
    print("Initializing PPStructureV3 with PaddlePaddle 3.2.2")
    print("Using PP-OCRv5 (latest OCR engine)")
    print("=" * 80)

    # Verify the env var set at module import time actually took effect
    check_disabled = os.environ.get('PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK', 'False')
    if check_disabled in ('1', 'True', 'true', 'TRUE'):
        print("Model source check: DISABLED (fast mode)")
    else:
        print("WARNING: Model source check may still be enabled!")

    print("\nInitializing models (this may take a moment on first run)...\n")

    start_init = time.time()
    try:
        if _engine_cache is None:
            structure_engine = PPStructureV3(
                lang='en',
                ocr_version='PP-OCRv5',
                use_table_recognition=True,
                use_chart_recognition=False,  # Disable for invoices
                use_formula_recognition=False,  # Disable for invoices
                use_seal_recognition=False,  # Disable for invoices
                use_region_detection=False,  # Disable for faster processing
            )
            _engine_cache = structure_engine
            init_time = time.time() - start_init
            print(f"[OK] PPStructureV3 initialized successfully ({init_time:.1f}s)\n")
        else:
            structure_engine = _engine_cache
            print("[OK] Using cached PPStructureV3 engine (0.0s)\n")
    except Exception as e:
        print(f"Error initializing PPStructureV3: {e}")
        return

    # Read image (only to report its dimensions; predict() gets the path)
    print(f"Processing image: {img_path}")
    img = cv2.imread(img_path)
    if img is None:
        print(f"Error: Could not read image '{img_path}'")
        return

    img_height, img_width = img.shape[:2]
    print(f"Image dimensions: {img_width} x {img_height} pixels\n")

    # Run inference
    print("Running document structure analysis...")
    print("Using:")
    print(" - PP-OCRv5 for text recognition")
    print(" - Advanced table recognition with cell detection")
    print(" - Layout preservation with precise coordinates\n")

    start_inference = time.time()
    try:
        result = structure_engine.predict(
            img_path,  # Use file path for better compatibility
            use_table_recognition=True,
            use_ocr_results_with_table_cells=True,
            use_e2e_wireless_table_rec_model=True,
            use_table_orientation_classify=True,
            use_chart_recognition=False,  # Disable for invoices
            use_formula_recognition=False,  # Disable for invoices
            use_seal_recognition=False,  # Disable for invoices
        )

        inference_time = time.time() - start_inference
        print(f"[OK] Analysis complete! ({inference_time:.1f}s)\n")

        # Extract parsing results for a human-readable detection summary
        if result and isinstance(result[0], dict):
            page_data = result[0]
            parsing_res_list = page_data.get('parsing_res_list', [])
            table_res_list = page_data.get('table_res_list', [])

            print(f"Detection Results:")
            print(f" Total regions detected: {len(parsing_res_list)}\n")

            for i, region in enumerate(parsing_res_list):
                # Handle both dict and LayoutBlock object
                if isinstance(region, dict):
                    region_type = region.get('label', 'unknown')
                    bbox = region.get('bbox', [])
                else:
                    # LayoutBlock object - access attributes directly
                    region_type = getattr(region, 'label', 'unknown')
                    bbox = getattr(region, 'bbox', [])

                print(f" Region {i}: type={region_type}, bbox={bbox}")

                if region_type == 'table':
                    print(f" -> Table detected with HTML structure")

            print(f"\n Tables detected: {len(table_res_list)}\n")
            print("-" * 80 + "\n")

    except Exception as e:
        print(f"Error during inference: {e}")
        import traceback
        traceback.print_exc()
        return

    # Format text with layout preservation
    print("Formatting text with layout preservation...")
    start_format = time.time()
    try:
        layout_preserved_text = format_text_with_layout(result, img_height, img_width)
        format_time = time.time() - start_format
        print(f"[OK] Layout formatting complete! ({format_time:.1f}s)\n")
    except Exception as e:
        print(f"Error formatting layout: {e}")
        import traceback
        traceback.print_exc()
        return

    # Display output
    print("=" * 80)
    print("EXTRACTED TEXT (LAYOUT PRESERVED)")
    print("=" * 80 + "\n")
    print(layout_preserved_text)
    print("\n" + "=" * 80 + "\n")

    # Save results (plain text + raw JSON dump of the prediction)
    output_layout_file = os.path.join(save_folder, f"{Path(img_path).stem}_layout_preserved.txt")
    output_json_file = os.path.join(save_folder, f"{Path(img_path).stem}_result.json")

    try:
        with open(output_layout_file, 'w', encoding='utf-8') as f:
            f.write(layout_preserved_text)

        def json_serial(obj):
            # Best-effort serializer for non-JSON types in the raw result
            # (falls back to str(), so output is lossy for exotic objects).
            if hasattr(obj, '__dict__'):
                return obj.__dict__
            elif isinstance(obj, (list, tuple)):
                return list(obj)
            return str(obj)

        with open(output_json_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False, default=json_serial)

        print("Results saved:")
        print(f" [OK] Layout-preserved text: {output_layout_file}")
        print(f" [OK] JSON result: {output_json_file}")

        total_time = time.time() - total_start
        print(f"\n[OK] Extraction complete! (Total time: {total_time:.1f}s)")

    except Exception as e:
        print(f"Error saving results: {e}")


if __name__ == '__main__':
    main()
app_gradio.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face Spaces - Invoice Extraction with Layout Preservation
3
+ Gradio interface for document extraction using PaddlePaddle PPStructureV3
4
+ """
5
+
6
+ import os
7
+ import time
8
+ import tempfile
9
+ from pathlib import Path
10
+
11
+ # CRITICAL: Set environment variables BEFORE any other imports
12
+ os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = '1'
13
+ os.environ['DISABLE_MODEL_SOURCE_CHECK'] = '1'
14
+
15
+ import warnings
16
+ warnings.filterwarnings('ignore', message='.*Checking connectivity.*')
17
+ warnings.filterwarnings('ignore', message='.*model hoster.*')
18
+
19
+ import gradio as gr
20
+ import cv2
21
+ from paddleocr import PPStructureV3
22
+ import numpy as np
23
+
24
+ # Import the layout formatting function from app.py
25
+ from app import format_text_with_layout
26
+
27
+ # Global engine cache
28
+ _engine_cache = None
29
+
30
+
31
def initialize_engine():
    """Return the module-wide PPStructureV3 engine, creating it on first use.

    The engine is cached in the module-level ``_engine_cache`` so repeated
    requests reuse the already-loaded models.
    """
    global _engine_cache
    if _engine_cache is not None:
        return _engine_cache

    print("Initializing PPStructureV3...")
    engine = PPStructureV3(
        lang='en',
        ocr_version='PP-OCRv5',
        use_table_recognition=True,
        use_chart_recognition=False,
        use_formula_recognition=False,
        use_seal_recognition=False,
        use_region_detection=False,
    )
    print("Engine initialized!")
    _engine_cache = engine
    return _engine_cache
47
+
48
+
49
def process_invoice(image):
    """OCR an uploaded invoice image and return layout-preserved text.

    ``image`` is the RGB numpy array Gradio provides (or None when nothing
    was uploaded). Any failure is reported as a user-facing error string
    rather than raised.
    """
    if image is None:
        return "Please upload an image file."

    try:
        engine = initialize_engine()

        # Persist the upload as a JPEG on disk; the engine takes a file path.
        with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as handle:
            temp_image_path = handle.name
            cv2.imwrite(temp_image_path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR))

        try:
            height, width = image.shape[:2]

            prediction = engine.predict(
                temp_image_path,
                use_table_recognition=True,
                use_ocr_results_with_table_cells=True,
                use_e2e_wireless_table_rec_model=True,
                use_table_orientation_classify=True,
                use_chart_recognition=False,
                use_formula_recognition=False,
                use_seal_recognition=False,
            )

            return format_text_with_layout(prediction, height, width)

        finally:
            # Always remove the temp file, even when inference fails.
            if os.path.exists(temp_image_path):
                os.unlink(temp_image_path)

    except Exception as e:
        return f"Error processing image: {str(e)}\n\nPlease try again or check if the image is a valid invoice document."
91
+
92
+
93
# Create Gradio interface: image upload on the left, extracted text on the
# right, with process_invoice wired to the "Extract Text" button.
with gr.Blocks(title="Invoice Extraction with Layout Preservation") as demo:
    gr.Markdown("""
    # 📄 Invoice Extraction with Layout Preservation

    Extract text from invoice images while preserving the original layout and formatting.

    **Features:**
    - ✅ Precise text extraction using PP-OCRv5
    - ✅ Table recognition with cell-level accuracy
    - ✅ Layout preservation matching original document
    - ✅ Smart spacing and column alignment

    **How to use:**
    1. Upload an invoice image (JPG, PNG, etc.)
    2. Click "Extract Text"
    3. View the extracted text with preserved layout
    """)

    with gr.Row():
        with gr.Column():
            # Input side: numpy-typed image so process_invoice gets an RGB array
            image_input = gr.Image(
                label="Upload Invoice Image",
                type="numpy",
                height=400
            )
            extract_btn = gr.Button("Extract Text", variant="primary", size="lg")

        with gr.Column():
            # Output side: monospace-friendly textbox with copy support
            text_output = gr.Textbox(
                label="Extracted Text (Layout Preserved)",
                lines=30,
                max_lines=50,
                show_copy_button=True
            )

    # Examples (currently empty placeholder list)
    gr.Examples(
        examples=[],
        inputs=image_input,
        label="Example Invoices (add your examples here)"
    )

    # Process function: button click runs OCR and fills the textbox
    extract_btn.click(
        fn=process_invoice,
        inputs=image_input,
        outputs=text_output
    )

    gr.Markdown("""
    ---
    **Powered by:**
    - PaddlePaddle 3.2.2
    - PPStructureV3
    - PP-OCRv5

    **Note:** First run may take longer as models are downloaded and initialized.
    """)

if __name__ == "__main__":
    # Bind to all interfaces on the port Hugging Face Spaces expects.
    demo.launch(server_name="0.0.0.0", server_port=7860)
155
+
requirements_hf.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hugging Face Spaces Requirements
2
+ # Optimized for deployment
3
+
4
+ # PaddlePaddle and PaddleOCR
5
+ paddlepaddle==3.2.2
6
+ paddleocr>=3.3.2
7
+
8
+ # Image processing
9
+ opencv-python-headless>=4.8.0
10
+ Pillow>=10.0.0
11
+
12
+ # Core dependencies
13
+ numpy>=1.21,<2.0
14
+
15
+ # Gradio for web interface
16
+ gradio>=4.0.0
17
+
18
+ # Utilities (optional, can be removed if not needed)
19
+ python-docx>=0.8.11
20
+ openpyxl>=3.0.0
21
+