Upload 4 files
Browse files- README_HF.md +62 -0
- app.py +578 -0
- app_gradio.py +155 -0
- requirements_hf.txt +21 -0
README_HF.md
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Invoice Extraction with Layout Preservation
|
| 3 |
+
emoji: 📄
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 4.0.0
|
| 8 |
+
app_file: app_gradio.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Invoice Extraction with Layout Preservation
|
| 14 |
+
|
| 15 |
+
Extract text from invoice images while preserving the original layout and formatting using advanced OCR technology.
|
| 16 |
+
|
| 17 |
+
## Features
|
| 18 |
+
|
| 19 |
+
- ✅ **Precise Text Extraction** - Uses PP-OCRv5 (latest OCR engine)
|
| 20 |
+
- ✅ **Table Recognition** - Advanced table recognition with cell-level accuracy
|
| 21 |
+
- ✅ **Layout Preservation** - Maintains original document layout and spacing
|
| 22 |
+
- ✅ **Smart Spacing** - Intelligent spacing detection between text elements
|
| 23 |
+
- ✅ **Column Alignment** - Proper column alignment for tables and multi-column layouts
|
| 24 |
+
|
| 25 |
+
## How to Use
|
| 26 |
+
|
| 27 |
+
1. Upload an invoice image (JPG, PNG, or other image formats)
|
| 28 |
+
2. Click "Extract Text"
|
| 29 |
+
3. View the extracted text with preserved layout in the output box
|
| 30 |
+
4. Copy the text for further use
|
| 31 |
+
|
| 32 |
+
## Technology Stack
|
| 33 |
+
|
| 34 |
+
- **PaddlePaddle 3.2.2** - Deep learning framework
|
| 35 |
+
- **PPStructureV3** - Document structure analysis
|
| 36 |
+
- **PP-OCRv5** - Latest OCR engine for text recognition
|
| 37 |
+
- **Gradio** - Web interface
|
| 38 |
+
|
| 39 |
+
## Performance
|
| 40 |
+
|
| 41 |
+
- First run: Models are downloaded and initialized (~30-60 seconds)
|
| 42 |
+
- Subsequent runs: Fast processing using cached models
|
| 43 |
+
- Model source check: Disabled for faster startup
|
| 44 |
+
|
| 45 |
+
## Use Cases
|
| 46 |
+
|
| 47 |
+
- Invoice processing and data extraction
|
| 48 |
+
- Document digitization
|
| 49 |
+
- Automated data entry
|
| 50 |
+
- Financial document analysis
|
| 51 |
+
- Receipt processing
|
| 52 |
+
|
| 53 |
+
## Limitations
|
| 54 |
+
|
| 55 |
+
- Best results with clear, high-resolution images
|
| 56 |
+
- Works best with English text (can be extended to other languages)
|
| 57 |
+
- Complex layouts may require manual review
|
| 58 |
+
|
| 59 |
+
## License
|
| 60 |
+
|
| 61 |
+
MIT License
|
| 62 |
+
|
app.py
ADDED
|
@@ -0,0 +1,578 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Advanced Document Extraction with Layout Preservation
|
| 3 |
+
Using PaddlePaddle 3.2.2 + PPStructureV3 + PP-OCRv5
|
| 4 |
+
Latest technologies for precise layout preservation
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import time
|
| 9 |
+
|
| 10 |
+
# CRITICAL: Set environment variables BEFORE any other imports
|
| 11 |
+
# This must be done before importing paddleocr to disable connectivity checks
|
| 12 |
+
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = '1' # Use '1' for True
|
| 13 |
+
os.environ['DISABLE_MODEL_SOURCE_CHECK'] = '1' # Also set this for compatibility
|
| 14 |
+
|
| 15 |
+
# Suppress warnings about connectivity checks
|
| 16 |
+
import warnings
|
| 17 |
+
warnings.filterwarnings('ignore', message='.*Checking connectivity.*')
|
| 18 |
+
warnings.filterwarnings('ignore', message='.*model hoster.*')
|
| 19 |
+
|
| 20 |
+
import cv2
|
| 21 |
+
from paddleocr import PPStructureV3
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
import json
|
| 24 |
+
from typing import List, Dict, Any
|
| 25 |
+
import numpy as np
|
| 26 |
+
from html.parser import HTMLParser
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class TableHTMLParser(HTMLParser):
    """Parse an HTML table into a list of rows of cell strings.

    After ``feed()``, ``rows`` holds one list per completed ``<tr>``, each
    containing the text of its ``<td>``/``<th>`` cells in document order.
    A row still open when the input ends (no closing ``</tr>``) remains in
    ``current_row`` — callers flush it themselves (see the call site in
    ``format_text_with_layout``).
    """

    def __init__(self):
        super().__init__()
        self.rows = []           # completed rows: list[list[str]]
        self.current_row = []    # cells of the row currently being parsed
        self.in_cell = False     # True while inside a <td>/<th>
        self.current_cell = []   # text fragments of the current cell

    def handle_starttag(self, tag, attrs):
        if tag == 'tr':
            # A new <tr> implicitly closes any unfinished row (tolerates
            # malformed HTML that omits </tr>).
            if self.current_row:
                self.rows.append(self.current_row)
                self.current_row = []
        elif tag in ['td', 'th']:
            self.in_cell = True
            self.current_cell = []

    def handle_endtag(self, tag):
        if tag in ['td', 'th']:
            cell_text = ' '.join(self.current_cell).strip()
            self.current_row.append(cell_text)
            self.in_cell = False
            self.current_cell = []
        elif tag == 'tr':
            if self.current_row:
                self.rows.append(self.current_row)
                self.current_row = []

    def handle_data(self, data):
        if self.in_cell:
            # Fix: skip whitespace-only data nodes. Previously they were
            # appended as '' and ' '.join() then produced doubled interior
            # spaces (e.g. '<td>a <i> </i>b</td>' -> 'a  b').
            text = data.strip()
            if text:
                self.current_cell.append(text)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def calculate_spacing(gap_pixels: float, PIXELS_PER_CHAR: int) -> int:
    """Map a horizontal pixel gap to a number of space characters.

    Small gaps collapse to a single space, medium gaps get one or two
    spaces, and large gaps are reproduced proportionally so that wide
    column separation in the source image survives into the text output.
    """
    if gap_pixels >= 30:
        # Large gap: preserve the exact spacing at the configured density.
        return int(gap_pixels / PIXELS_PER_CHAR)
    if gap_pixels >= 10:
        # Medium gap: one space plus a small proportional bump (1-2 total).
        return 1 + int(gap_pixels / 20)
    # Very small gap: a single separating space.
    return 1
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def format_text_with_layout(result: List[Dict[str, Any]], img_height: int, img_width: int) -> str:
    """
    Format extracted text preserving exact spatial layout.

    Maps every detected text element (table cells and free text) onto a
    character grid: the Y coordinate selects the output line and the X
    coordinate selects the column, at PIXELS_PER_CHAR horizontal density.

    PPStructureV3 returns a list with one dict containing:
    - parsing_res_list: List of parsed regions with label, bbox, content
    - table_res_list: List of tables with cell_box_list, pred_html
    - overall_ocr_res: OCR results with rec_texts, rec_polys, rec_boxes

    :param result: prediction output of ``PPStructureV3.predict`` for one page
    :param img_height: source image height in pixels (currently unused here)
    :param img_width: source image width in pixels; sets the line width
    :return: multi-line string with layout-preserving spacing ('' if no result)
    """
    # One output character column per this many image pixels; +400 gives
    # headroom so clamped placements near the right edge do not truncate.
    PIXELS_PER_CHAR = 5
    MAX_LINE_WIDTH = int(img_width / PIXELS_PER_CHAR) + 400

    all_text_elements = []
    table_bboxes = []

    # PPStructureV3 returns a list with one dict per page
    if not result or not isinstance(result[0], dict):
        return ""

    page_data = result[0]
    parsing_res_list = page_data.get('parsing_res_list', [])
    table_res_list = page_data.get('table_res_list', [])
    overall_ocr_res = page_data.get('overall_ocr_res', {})

    # Extract precise OCR coordinates (boxes are [x1, y1, x2, y2])
    ocr_boxes = []
    ocr_texts = []
    if overall_ocr_res:
        # Handle both numpy array and list formats
        rec_boxes = overall_ocr_res.get('rec_boxes', [])
        rec_texts_list = overall_ocr_res.get('rec_texts', [])

        if isinstance(rec_boxes, np.ndarray):
            ocr_boxes = rec_boxes.tolist()
        else:
            ocr_boxes = rec_boxes if rec_boxes else []

        ocr_texts = rec_texts_list if rec_texts_list else []

    # First pass: identify table regions from parsing_res_list so that
    # free-text OCR boxes falling inside a table can be skipped later.
    for region in parsing_res_list:
        # Handle both dict and LayoutBlock object
        if isinstance(region, dict):
            region_type = region.get('label', '')
            bbox = region.get('bbox', [])
        else:
            # LayoutBlock object - access attributes directly
            region_type = getattr(region, 'label', '')
            bbox = getattr(region, 'bbox', [])

        # Store table bounding boxes
        if region_type == 'table':
            if len(bbox) >= 4:
                table_bboxes.append((bbox[0], bbox[1], bbox[2], bbox[3]))

    # Process table regions from table_res_list with precise cell positions
    for table_idx, table_res in enumerate(table_res_list):
        # Get table bounding box from parsing_res_list.
        # NOTE(review): this inner loop always breaks on the FIRST 'table'
        # region regardless of table_idx, so with multiple tables every
        # table_res is paired with the same bbox — confirm intended.
        table_bbox = None
        for region in parsing_res_list:
            # Handle both dict and LayoutBlock object
            if isinstance(region, dict):
                region_label = region.get('label', '')
                region_bbox = region.get('bbox', [])
            else:
                region_label = getattr(region, 'label', '')
                region_bbox = getattr(region, 'bbox', [])

            if region_label == 'table':
                table_bbox = region_bbox
                break

        if not table_bbox or len(table_bbox) < 4:
            continue

        # Extract cell_box_list - precise bounding boxes for each table cell
        cell_box_list = table_res.get('cell_box_list', [])
        pred_html = table_res.get('pred_html', '')
        table_ocr_pred = table_res.get('table_ocr_pred', {})
        table_rec_texts = table_ocr_pred.get('rec_texts', [])

        # Convert cell_box_list to list if it's a numpy array
        if isinstance(cell_box_list, np.ndarray):
            cell_box_list = cell_box_list.tolist()

        # Parse HTML to get cell structure
        if pred_html and len(cell_box_list) > 0:
            try:
                parser = TableHTMLParser()
                parser.feed(pred_html)
                # Flush a trailing row left open by malformed HTML
                if parser.current_row:
                    parser.rows.append(parser.current_row)

                # Match HTML cells with cell_box_list
                # cell_box_list contains [x1, y1, x2, y2] for each cell in row-major order
                cell_idx = 0

                for row_idx, row in enumerate(parser.rows):
                    for col_idx, cell_text in enumerate(row):
                        if cell_idx < len(cell_box_list):
                            # Get precise cell bounding box
                            cell_box = cell_box_list[cell_idx]

                            # Handle both list and numpy array formats
                            if isinstance(cell_box, np.ndarray):
                                cell_box = cell_box.tolist()

                            if len(cell_box) >= 4:
                                cx1, cy1, cx2, cy2 = cell_box[0], cell_box[1], cell_box[2], cell_box[3]

                                # Get text from table_rec_texts if available, otherwise use HTML cell text
                                cell_text_final = cell_text
                                if cell_idx < len(table_rec_texts) and table_rec_texts[cell_idx]:
                                    cell_text_final = table_rec_texts[cell_idx]

                                # Handle multi-line cells by checking if text spans multiple lines
                                # Use center Y for positioning
                                cell_center_y = (cy1 + cy2) / 2

                                all_text_elements.append({
                                    'y': int(cell_center_y),
                                    'x': int(cx1),
                                    'x2': int(cx2),
                                    'y2': int(cy2),
                                    'text': cell_text_final.strip() if cell_text_final else '',
                                    'type': 'table_cell',
                                    'is_table': True,
                                    'row_idx': row_idx,
                                    'col_idx': col_idx
                                })
                            cell_idx += 1
            except Exception as e:
                print(f"Warning: Table parsing error: {e}")
                import traceback
                traceback.print_exc()

    # Process non-table text using precise OCR coordinates from overall_ocr_res
    # Filter out OCR boxes that fall within table regions to avoid duplicates
    if ocr_boxes and ocr_texts:
        for ocr_idx, (ocr_box, ocr_text) in enumerate(zip(ocr_boxes, ocr_texts)):
            if not ocr_text or not ocr_text.strip():
                continue

            # Handle both list and numpy array formats
            if isinstance(ocr_box, np.ndarray):
                ocr_box = ocr_box.tolist()

            if len(ocr_box) >= 4:
                ox1, oy1, ox2, oy2 = ocr_box[0], ocr_box[1], ocr_box[2], ocr_box[3]

                # Check if this OCR box is inside a table region
                in_table = False
                for tx1, ty1, tx2, ty2 in table_bboxes:
                    # Check if OCR box center or significant portion is within table
                    center_x = (ox1 + ox2) / 2
                    center_y = (oy1 + oy2) / 2
                    if tx1 <= center_x <= tx2 and ty1 <= center_y <= ty2:
                        in_table = True
                        break

                # Only add if not in table (table cells already processed)
                if not in_table:
                    # Use center Y for positioning
                    center_y = (oy1 + oy2) / 2

                    all_text_elements.append({
                        'y': int(center_y),
                        'x': int(ox1),
                        'x2': int(ox2),
                        'y2': int(oy2),
                        'text': ocr_text.strip(),
                        'type': 'text',
                        'is_table': False
                    })

    # Group text elements by Y position (row clustering)
    Y_TOLERANCE_BASE = 10
    Y_TOLERANCE_TABLE = 20  # Reduced for better row grouping

    # Separate table cells and non-table elements
    table_cells = [e for e in all_text_elements if e.get('is_table', False)]
    non_table_elements = [e for e in all_text_elements if not e.get('is_table', False)]

    # Maps a representative Y coordinate -> elements rendered on that line
    lines_dict = {}

    # Group table cells by row using actual Y-coordinates with improved clustering
    if table_cells:
        # Sort by Y, then by X for consistent ordering
        table_cells_sorted = sorted(table_cells, key=lambda x: (x['y'], x['x']))

        # Use row_idx if available (from HTML parsing), otherwise cluster by Y
        table_rows = []
        if table_cells_sorted and 'row_idx' in table_cells_sorted[0]:
            # Group by row_idx first
            row_groups = {}
            for cell in table_cells_sorted:
                row_idx = cell.get('row_idx', 0)
                if row_idx not in row_groups:
                    row_groups[row_idx] = []
                row_groups[row_idx].append(cell)

            # Convert to list and sort by row_idx
            for row_idx in sorted(row_groups.keys()):
                row_cells = row_groups[row_idx]
                # Sort cells within row by X (col_idx if available)
                row_cells.sort(key=lambda x: (x.get('col_idx', 0), x['x']))
                table_rows.append(row_cells)
        else:
            # Fallback: cluster by Y-coordinate
            current_row = [table_cells_sorted[0]]
            current_row_y = table_cells_sorted[0]['y']

            for cell in table_cells_sorted[1:]:
                cell_y = cell['y']
                if abs(cell_y - current_row_y) <= Y_TOLERANCE_TABLE:
                    current_row.append(cell)
                    # Use median Y for better row representation
                    current_row_y = sorted([c['y'] for c in current_row])[len(current_row) // 2]
                else:
                    # Sort current row by X before adding
                    current_row.sort(key=lambda x: x['x'])
                    table_rows.append(current_row)
                    current_row = [cell]
                    current_row_y = cell_y

            if current_row:
                current_row.sort(key=lambda x: x['x'])
                table_rows.append(current_row)

        # Add table rows to lines_dict using median Y
        for row_cells in table_rows:
            if row_cells:
                # Use median Y for row representation
                row_ys = [cell['y'] for cell in row_cells]
                median_y = sorted(row_ys)[len(row_ys) // 2]
                if median_y not in lines_dict:
                    lines_dict[median_y] = []
                lines_dict[median_y].extend(row_cells)

    # Group non-table elements by Y position
    for elem in non_table_elements:
        y_pos = elem['y']
        matched_line = None

        # Find closest existing line within tolerance.
        # NOTE(review): dict iteration order is insertion order, so this
        # finds the first line within tolerance, not necessarily the closest.
        for existing_y in lines_dict.keys():
            if abs(existing_y - y_pos) <= Y_TOLERANCE_BASE:
                matched_line = existing_y
                break

        if matched_line is None:
            matched_line = y_pos

        if matched_line not in lines_dict:
            lines_dict[matched_line] = []
        lines_dict[matched_line].append(elem)

    # Build formatted output with precise positioning and smart spacing
    formatted_lines = []
    sorted_y_positions = sorted(lines_dict.keys())
    last_y = None

    for y_pos in sorted_y_positions:
        items = lines_dict[y_pos]
        items.sort(key=lambda x: x['x'])

        # Add blank lines for vertical spacing (capped at 3)
        if last_y is not None:
            gap = y_pos - last_y
            if gap > 30:
                blank_lines = min(3, int(gap / 40))
                for _ in range(blank_lines):
                    formatted_lines.append('')

        # Build line with precise character positioning and smart spacing
        line_array = [' '] * MAX_LINE_WIDTH

        prev_x2 = None  # Track end position of previous text element

        for item_idx, item in enumerate(items):
            x_pos = item['x']
            x2_pos = item.get('x2', x_pos)
            text = item['text'].strip()
            if not text:
                continue

            is_table_cell = item.get('is_table', False)
            # Clamp target column so the text fits inside the line buffer
            char_col = int(x_pos / PIXELS_PER_CHAR)
            char_col = max(0, min(char_col, MAX_LINE_WIDTH - len(text) - 1))

            # Calculate spacing from previous element
            if prev_x2 is not None and item_idx > 0:
                gap_pixels = x_pos - prev_x2
                if gap_pixels > 0:
                    spaces_to_add = calculate_spacing(gap_pixels, PIXELS_PER_CHAR)
                    # Ensure we don't overwrite existing text
                    prev_char_col_end = int(prev_x2 / PIXELS_PER_CHAR)
                    if char_col > prev_char_col_end:
                        # Add spaces between elements.
                        # NOTE(review): this writes ' ' into cells that are
                        # already ' ' — effectively a no-op; spacing actually
                        # comes from char_col placement below. Confirm intent.
                        for s in range(min(spaces_to_add, char_col - prev_char_col_end)):
                            space_pos = prev_char_col_end + s
                            if space_pos < MAX_LINE_WIDTH and line_array[space_pos] == ' ':
                                line_array[space_pos] = ' '

            # Place text at calculated position
            for i, char in enumerate(text):
                pos = char_col + i
                if pos < MAX_LINE_WIDTH:
                    if is_table_cell:
                        # For table cells, overwrite to ensure proper alignment
                        line_array[pos] = char
                    elif line_array[pos] == ' ':
                        # For non-table text, only place if position is empty
                        line_array[pos] = char

            prev_x2 = x2_pos

        # Convert to string, dropping trailing padding
        line_str = ''.join(line_array).rstrip()
        if line_str.strip():
            formatted_lines.append(line_str)

        last_y = y_pos

    return '\n'.join(formatted_lines)
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
# Global engine cache to avoid reinitializing on multiple runs
|
| 403 |
+
_engine_cache = None
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
def main():
    """Run the full extraction pipeline on a hard-coded sample image.

    Steps: initialize (or reuse) the PPStructureV3 engine, run structure
    analysis on ``test_invoice2.jpg``, format the result with layout
    preservation, print it, and save both the text and raw JSON under
    ``./output_results``. Each stage is timed and errors abort early.
    """
    global _engine_cache

    # Start total timer
    total_start = time.time()

    # Configuration
    img_path = 'test_invoice2.jpg'
    save_folder = './output_results'

    # Create output directory
    Path(save_folder).mkdir(exist_ok=True)

    # Check if image exists
    if not os.path.exists(img_path):
        print(f"Error: Image file '{img_path}' not found!")
        return

    # Initialize PPStructureV3 with optimized settings (reuse if already initialized)
    print("=" * 80)
    print("Initializing PPStructureV3 with PaddlePaddle 3.2.2")
    print("Using PP-OCRv5 (latest OCR engine)")
    print("=" * 80)

    # Verify environment variable is set (it is exported at module import
    # time, before paddleocr is imported, to skip the online hoster check)
    check_disabled = os.environ.get('PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK', 'False')
    if check_disabled in ('1', 'True', 'true', 'TRUE'):
        print("Model source check: DISABLED (fast mode)")
    else:
        print("WARNING: Model source check may still be enabled!")

    print("\nInitializing models (this may take a moment on first run)...\n")

    start_init = time.time()
    try:
        if _engine_cache is None:
            structure_engine = PPStructureV3(
                lang='en',
                ocr_version='PP-OCRv5',
                use_table_recognition=True,
                use_chart_recognition=False,  # Disable for invoices
                use_formula_recognition=False,  # Disable for invoices
                use_seal_recognition=False,  # Disable for invoices
                use_region_detection=False,  # Disable for faster processing
            )
            _engine_cache = structure_engine
            init_time = time.time() - start_init
            print(f"[OK] PPStructureV3 initialized successfully ({init_time:.1f}s)\n")
        else:
            structure_engine = _engine_cache
            print("[OK] Using cached PPStructureV3 engine (0.0s)\n")
    except Exception as e:
        print(f"Error initializing PPStructureV3: {e}")
        return

    # Read image (only to report dimensions; predict() takes the file path)
    print(f"Processing image: {img_path}")
    img = cv2.imread(img_path)
    if img is None:
        print(f"Error: Could not read image '{img_path}'")
        return

    img_height, img_width = img.shape[:2]
    print(f"Image dimensions: {img_width} x {img_height} pixels\n")

    # Run inference
    print("Running document structure analysis...")
    print("Using:")
    print("  - PP-OCRv5 for text recognition")
    print("  - Advanced table recognition with cell detection")
    print("  - Layout preservation with precise coordinates\n")

    start_inference = time.time()
    try:
        result = structure_engine.predict(
            img_path,  # Use file path for better compatibility
            use_table_recognition=True,
            use_ocr_results_with_table_cells=True,
            use_e2e_wireless_table_rec_model=True,
            use_table_orientation_classify=True,
            use_chart_recognition=False,  # Disable for invoices
            use_formula_recognition=False,  # Disable for invoices
            use_seal_recognition=False,  # Disable for invoices
        )

        inference_time = time.time() - start_inference
        print(f"[OK] Analysis complete! ({inference_time:.1f}s)\n")

        # Extract parsing results for a human-readable detection summary
        if result and isinstance(result[0], dict):
            page_data = result[0]
            parsing_res_list = page_data.get('parsing_res_list', [])
            table_res_list = page_data.get('table_res_list', [])

            print(f"Detection Results:")
            print(f"  Total regions detected: {len(parsing_res_list)}\n")

            for i, region in enumerate(parsing_res_list):
                # Handle both dict and LayoutBlock object
                if isinstance(region, dict):
                    region_type = region.get('label', 'unknown')
                    bbox = region.get('bbox', [])
                else:
                    # LayoutBlock object - access attributes directly
                    region_type = getattr(region, 'label', 'unknown')
                    bbox = getattr(region, 'bbox', [])

                print(f"  Region {i}: type={region_type}, bbox={bbox}")

                if region_type == 'table':
                    print(f"    -> Table detected with HTML structure")

            print(f"\n  Tables detected: {len(table_res_list)}\n")
            print("-" * 80 + "\n")

    except Exception as e:
        print(f"Error during inference: {e}")
        import traceback
        traceback.print_exc()
        return

    # Format text with layout preservation
    print("Formatting text with layout preservation...")
    start_format = time.time()
    try:
        layout_preserved_text = format_text_with_layout(result, img_height, img_width)
        format_time = time.time() - start_format
        print(f"[OK] Layout formatting complete! ({format_time:.1f}s)\n")
    except Exception as e:
        print(f"Error formatting layout: {e}")
        import traceback
        traceback.print_exc()
        return

    # Display output
    print("=" * 80)
    print("EXTRACTED TEXT (LAYOUT PRESERVED)")
    print("=" * 80 + "\n")
    print(layout_preserved_text)
    print("\n" + "=" * 80 + "\n")

    # Save results
    output_layout_file = os.path.join(save_folder, f"{Path(img_path).stem}_layout_preserved.txt")
    output_json_file = os.path.join(save_folder, f"{Path(img_path).stem}_result.json")

    try:
        with open(output_layout_file, 'w', encoding='utf-8') as f:
            f.write(layout_preserved_text)

        # Fallback serializer: PPStructureV3 result objects are not JSON
        # serializable, so dump their __dict__ or a string representation
        def json_serial(obj):
            if hasattr(obj, '__dict__'):
                return obj.__dict__
            elif isinstance(obj, (list, tuple)):
                return list(obj)
            return str(obj)

        with open(output_json_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False, default=json_serial)

        print("Results saved:")
        print(f"  [OK] Layout-preserved text: {output_layout_file}")
        print(f"  [OK] JSON result: {output_json_file}")

        total_time = time.time() - total_start
        print(f"\n[OK] Extraction complete! (Total time: {total_time:.1f}s)")

    except Exception as e:
        print(f"Error saving results: {e}")


if __name__ == '__main__':
    main()
|
app_gradio.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Hugging Face Spaces - Invoice Extraction with Layout Preservation
|
| 3 |
+
Gradio interface for document extraction using PaddlePaddle PPStructureV3
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import time
|
| 8 |
+
import tempfile
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
# CRITICAL: Set environment variables BEFORE any other imports
|
| 12 |
+
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = '1'
|
| 13 |
+
os.environ['DISABLE_MODEL_SOURCE_CHECK'] = '1'
|
| 14 |
+
|
| 15 |
+
import warnings
|
| 16 |
+
warnings.filterwarnings('ignore', message='.*Checking connectivity.*')
|
| 17 |
+
warnings.filterwarnings('ignore', message='.*model hoster.*')
|
| 18 |
+
|
| 19 |
+
import gradio as gr
|
| 20 |
+
import cv2
|
| 21 |
+
from paddleocr import PPStructureV3
|
| 22 |
+
import numpy as np
|
| 23 |
+
|
| 24 |
+
# Import the layout formatting function from app.py
|
| 25 |
+
from app import format_text_with_layout
|
| 26 |
+
|
| 27 |
+
# Global engine cache
|
| 28 |
+
_engine_cache = None
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def initialize_engine():
    """Return the process-wide PPStructureV3 engine, creating it on first call.

    The engine is cached in the module-level ``_engine_cache`` so the model
    weights are loaded only once per process; subsequent calls are cheap.
    """
    global _engine_cache

    # Fast path: engine already built.
    if _engine_cache is not None:
        return _engine_cache

    print("Initializing PPStructureV3...")
    # Text + table recognition only; every other sub-pipeline is disabled
    # to keep startup and inference fast.
    _engine_cache = PPStructureV3(
        lang='en',
        ocr_version='PP-OCRv5',
        use_table_recognition=True,
        use_chart_recognition=False,
        use_formula_recognition=False,
        use_seal_recognition=False,
        use_region_detection=False,
    )
    print("Engine initialized!")
    return _engine_cache
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def process_invoice(image):
    """Run OCR on an uploaded invoice image and return layout-preserved text.

    Args:
        image: RGB image as a numpy array (Gradio ``type="numpy"``), or
            ``None`` when the button is clicked without an upload.

    Returns:
        str: The extracted text with layout preserved, or a human-readable
        error message — errors are returned rather than raised so Gradio
        can display them in the output textbox.
    """
    if image is None:
        return "Please upload an image file."

    try:
        engine = initialize_engine()

        # Create the temp file name first and close the handle before
        # writing: writing while the NamedTemporaryFile handle is still open
        # fails on Windows, where the open handle locks the file.
        with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp_file:
            tmp_path = tmp_file.name

        try:
            # Gradio delivers RGB; OpenCV writes BGR. imwrite returns False
            # on failure instead of raising, so check it explicitly —
            # otherwise OCR would silently run on an empty file.
            if not cv2.imwrite(tmp_path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR)):
                raise IOError(f"Failed to write temporary image to {tmp_path}")

            # Original dimensions drive the layout reconstruction.
            img_height, img_width = image.shape[:2]

            # Run inference: text + table recognition only, with cell-level
            # OCR merging; chart/formula/seal sub-pipelines stay off.
            result = engine.predict(
                tmp_path,
                use_table_recognition=True,
                use_ocr_results_with_table_cells=True,
                use_e2e_wireless_table_rec_model=True,
                use_table_orientation_classify=True,
                use_chart_recognition=False,
                use_formula_recognition=False,
                use_seal_recognition=False,
            )

            # Re-space and align the OCR output to mirror the source layout.
            return format_text_with_layout(result, img_height, img_width)

        finally:
            # Always remove the temp image, even if the write or inference
            # failed (the original leaked the file on a write failure).
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)

    except Exception as e:
        return f"Error processing image: {str(e)}\n\nPlease try again or check if the image is a valid invoice document."
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
# Create Gradio interface: two-column layout — upload + button on the left,
# extracted text on the right. Built at import time so Spaces can serve it.
with gr.Blocks(title="Invoice Extraction with Layout Preservation") as demo:
    # Header / usage instructions shown above the controls.
    gr.Markdown("""
    # 📄 Invoice Extraction with Layout Preservation

    Extract text from invoice images while preserving the original layout and formatting.

    **Features:**
    - ✅ Precise text extraction using PP-OCRv5
    - ✅ Table recognition with cell-level accuracy
    - ✅ Layout preservation matching original document
    - ✅ Smart spacing and column alignment

    **How to use:**
    1. Upload an invoice image (JPG, PNG, etc.)
    2. Click "Extract Text"
    3. View the extracted text with preserved layout
    """)

    with gr.Row():
        with gr.Column():
            # type="numpy" delivers the upload as an RGB numpy array,
            # which is what process_invoice expects.
            image_input = gr.Image(
                label="Upload Invoice Image",
                type="numpy",
                height=400
            )
            extract_btn = gr.Button("Extract Text", variant="primary", size="lg")

        with gr.Column():
            # Monospace-friendly textbox; copy button lets users grab the
            # layout-preserved text directly.
            text_output = gr.Textbox(
                label="Extracted Text (Layout Preserved)",
                lines=30,
                max_lines=50,
                show_copy_button=True
            )

    # Examples gallery — intentionally empty; populate with sample invoice
    # paths when available.
    gr.Examples(
        examples=[],
        inputs=image_input,
        label="Example Invoices (add your examples here)"
    )

    # Wire the button to the OCR pipeline.
    extract_btn.click(
        fn=process_invoice,
        inputs=image_input,
        outputs=text_output
    )

    # Footer with attribution and first-run expectations.
    gr.Markdown("""
    ---
    **Powered by:**
    - PaddlePaddle 3.2.2
    - PPStructureV3
    - PP-OCRv5

    **Note:** First run may take longer as models are downloaded and initialized.
    """)

if __name__ == "__main__":
    # Bind on all interfaces at port 7860 — the port Hugging Face Spaces
    # expects a Gradio app to listen on.
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
| 155 |
+
|
requirements_hf.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hugging Face Spaces Requirements
|
| 2 |
+
# Optimized for deployment
|
| 3 |
+
|
| 4 |
+
# PaddlePaddle and PaddleOCR
|
| 5 |
+
paddlepaddle==3.2.2
|
| 6 |
+
paddleocr>=3.3.2
|
| 7 |
+
|
| 8 |
+
# Image processing
|
| 9 |
+
opencv-python-headless>=4.8.0
|
| 10 |
+
Pillow>=10.0.0
|
| 11 |
+
|
| 12 |
+
# Core dependencies
|
| 13 |
+
numpy>=1.21,<2.0
|
| 14 |
+
|
| 15 |
+
# Gradio for web interface
|
| 16 |
+
gradio>=4.0.0
|
| 17 |
+
|
| 18 |
+
# Utilities (optional, can be removed if not needed)
|
| 19 |
+
python-docx>=0.8.11
|
| 20 |
+
openpyxl>=3.0.0
|
| 21 |
+
|