KarthiEz committed on
Commit
df31586
·
verified ·
1 Parent(s): 44aba4c

Upload 4 files

Browse files
Files changed (4) hide show
  1. README_HF.md +62 -0
  2. app.py +578 -0
  3. app_gradio.py +155 -0
  4. requirements_hf.txt +21 -0
README_HF.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Invoice Extraction with Layout Preservation
3
+ emoji: 📄
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 4.0.0
8
+ app_file: app_gradio.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # Invoice Extraction with Layout Preservation
14
+
15
+ Extract text from invoice images while preserving the original layout and formatting using advanced OCR technology.
16
+
17
+ ## Features
18
+
19
+ - ✅ **Precise Text Extraction** - Uses PP-OCRv5 (latest OCR engine)
20
+ - ✅ **Table Recognition** - Advanced table recognition with cell-level accuracy
21
+ - ✅ **Layout Preservation** - Maintains original document layout and spacing
22
+ - ✅ **Smart Spacing** - Intelligent spacing detection between text elements
23
+ - ✅ **Column Alignment** - Proper column alignment for tables and multi-column layouts
24
+
25
+ ## How to Use
26
+
27
+ 1. Upload an invoice image (JPG, PNG, or other image formats)
28
+ 2. Click "Extract Text"
29
+ 3. View the extracted text with preserved layout in the output box
30
+ 4. Copy the text for further use
31
+
32
+ ## Technology Stack
33
+
34
+ - **PaddlePaddle 3.2.2** - Deep learning framework
35
+ - **PPStructureV3** - Document structure analysis
36
+ - **PP-OCRv5** - Latest OCR engine for text recognition
37
+ - **Gradio** - Web interface
38
+
39
+ ## Performance
40
+
41
+ - First run: Models are downloaded and initialized (~30-60 seconds)
42
+ - Subsequent runs: Fast processing using cached models
43
+ - Model source check: Disabled for faster startup
44
+
45
+ ## Use Cases
46
+
47
+ - Invoice processing and data extraction
48
+ - Document digitization
49
+ - Automated data entry
50
+ - Financial document analysis
51
+ - Receipt processing
52
+
53
+ ## Limitations
54
+
55
+ - Best results with clear, high-resolution images
56
+ - Works best with English text (can be extended to other languages)
57
+ - Complex layouts may require manual review
58
+
59
+ ## License
60
+
61
+ MIT License
62
+
app.py ADDED
@@ -0,0 +1,578 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Advanced Document Extraction with Layout Preservation
3
+ Using PaddlePaddle 3.2.2 + PPStructureV3 + PP-OCRv5
4
+ Latest technologies for precise layout preservation
5
+ """
6
+
7
+ import os
8
+ import time
9
+
10
+ # CRITICAL: Set environment variables BEFORE any other imports
11
+ # This must be done before importing paddleocr to disable connectivity checks
12
+ os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = '1' # Use '1' for True
13
+ os.environ['DISABLE_MODEL_SOURCE_CHECK'] = '1' # Also set this for compatibility
14
+
15
+ # Suppress warnings about connectivity checks
16
+ import warnings
17
+ warnings.filterwarnings('ignore', message='.*Checking connectivity.*')
18
+ warnings.filterwarnings('ignore', message='.*model hoster.*')
19
+
20
+ import cv2
21
+ from paddleocr import PPStructureV3
22
+ from pathlib import Path
23
+ import json
24
+ from typing import List, Dict, Any
25
+ import numpy as np
26
+ from html.parser import HTMLParser
27
+
28
+
29
class TableHTMLParser(HTMLParser):
    """Parse an HTML ``<table>`` fragment into a list of text rows.

    After ``feed()``, ``self.rows`` holds one list of cell strings per
    ``<tr>``; any row still open when parsing ends remains in
    ``self.current_row`` (callers flush it themselves if needed).
    """

    def __init__(self):
        super().__init__()
        self.rows = []          # completed rows: list[list[str]]
        self.current_row = []   # cells of the <tr> being parsed
        self.in_cell = False    # True while inside a <td>/<th>
        self.current_cell = []  # text fragments of the current cell

    def handle_starttag(self, tag, attrs):
        if tag == 'tr':
            # A new row implicitly closes an unfinished previous one.
            if self.current_row:
                self.rows.append(self.current_row)
                self.current_row = []
        elif tag in ['td', 'th']:
            self.in_cell = True
            self.current_cell = []

    def handle_endtag(self, tag):
        if tag in ['td', 'th']:
            cell_text = ' '.join(self.current_cell).strip()
            self.current_row.append(cell_text)
            self.in_cell = False
            self.current_cell = []
        elif tag == 'tr':
            if self.current_row:
                self.rows.append(self.current_row)
                self.current_row = []

    def handle_data(self, data):
        # BUGFIX: only keep non-empty fragments. Whitespace-only data chunks
        # (e.g. between nested tags inside a cell) used to be appended as ''
        # and produced doubled interior spaces after ' '.join().
        if self.in_cell:
            fragment = data.strip()
            if fragment:
                self.current_cell.append(fragment)
61
+
62
+
63
def calculate_spacing(gap_pixels: float, PIXELS_PER_CHAR: int) -> int:
    """Map a horizontal pixel gap to a number of separating spaces.

    Tiny gaps collapse to a single space, medium gaps to one or two, and
    large gaps keep their proportional width so columns stay aligned.
    """
    if gap_pixels >= 30:
        # Large gap: preserve the exact column distance.
        return int(gap_pixels / PIXELS_PER_CHAR)
    if gap_pixels >= 10:
        # Medium gap: one base space plus one extra per ~20px.
        return 1 + int(gap_pixels / 20)
    # Very small gap: a single separating space.
    return 1
74
+
75
+
76
def format_text_with_layout(result: List[Dict[str, Any]], img_height: int, img_width: int) -> str:
    """
    Format extracted text preserving exact spatial layout.

    PPStructureV3 returns a list with one dict containing:
    - parsing_res_list: List of parsed regions with label, bbox, content
    - table_res_list: List of tables with cell_box_list, pred_html
    - overall_ocr_res: OCR results with rec_texts, rec_polys, rec_boxes

    Args:
        result: Raw PPStructureV3 prediction output (list of page dicts).
        img_height: Source image height in pixels (kept for interface
            stability; not used by the current algorithm).
        img_width: Source image width in pixels; determines output line width.

    Returns:
        Plain text where character columns approximate the original pixel
        positions; empty string when ``result`` is empty or malformed.
    """
    PIXELS_PER_CHAR = 5
    MAX_LINE_WIDTH = int(img_width / PIXELS_PER_CHAR) + 400

    all_text_elements = []
    table_bboxes = []

    # PPStructureV3 returns a list with one dict
    if not result or not isinstance(result[0], dict):
        return ""

    page_data = result[0]
    parsing_res_list = page_data.get('parsing_res_list', [])
    table_res_list = page_data.get('table_res_list', [])
    overall_ocr_res = page_data.get('overall_ocr_res', {})

    # Extract precise OCR coordinates
    ocr_boxes = []
    ocr_texts = []
    if overall_ocr_res:
        # Handle both numpy array and list formats
        rec_boxes = overall_ocr_res.get('rec_boxes', [])
        rec_texts_list = overall_ocr_res.get('rec_texts', [])

        if isinstance(rec_boxes, np.ndarray):
            ocr_boxes = rec_boxes.tolist()
        else:
            ocr_boxes = rec_boxes if rec_boxes else []

        ocr_texts = rec_texts_list if rec_texts_list else []

    # First pass: identify table regions from parsing_res_list
    for region in parsing_res_list:
        # Handle both dict and LayoutBlock object
        if isinstance(region, dict):
            region_type = region.get('label', '')
            bbox = region.get('bbox', [])
        else:
            # LayoutBlock object - access attributes directly
            region_type = getattr(region, 'label', '')
            bbox = getattr(region, 'bbox', [])

        # Store table bounding boxes (in document order)
        if region_type == 'table':
            if len(bbox) >= 4:
                table_bboxes.append((bbox[0], bbox[1], bbox[2], bbox[3]))

    # Process table regions from table_res_list with precise cell positions
    for table_idx, table_res in enumerate(table_res_list):
        # BUGFIX: the previous implementation re-scanned parsing_res_list per
        # table and always took the FIRST region labelled 'table', so every
        # table shared one bbox. Pair each table with its own bbox instead
        # (table_bboxes was collected above in the same document order —
        # assumes table_res_list follows that order; TODO confirm upstream).
        table_bbox = table_bboxes[table_idx] if table_idx < len(table_bboxes) else None

        if not table_bbox or len(table_bbox) < 4:
            continue

        # Extract cell_box_list - precise bounding boxes for each table cell
        cell_box_list = table_res.get('cell_box_list', [])
        pred_html = table_res.get('pred_html', '')
        table_ocr_pred = table_res.get('table_ocr_pred', {})
        table_rec_texts = table_ocr_pred.get('rec_texts', [])

        # Convert cell_box_list to list if it's a numpy array
        if isinstance(cell_box_list, np.ndarray):
            cell_box_list = cell_box_list.tolist()

        # Parse HTML to get cell structure
        if pred_html and len(cell_box_list) > 0:
            try:
                parser = TableHTMLParser()
                parser.feed(pred_html)
                # Flush a trailing row left open by unterminated HTML.
                if parser.current_row:
                    parser.rows.append(parser.current_row)

                # Match HTML cells with cell_box_list
                # cell_box_list contains [x1, y1, x2, y2] for each cell in row-major order
                cell_idx = 0

                for row_idx, row in enumerate(parser.rows):
                    for col_idx, cell_text in enumerate(row):
                        if cell_idx < len(cell_box_list):
                            # Get precise cell bounding box
                            cell_box = cell_box_list[cell_idx]

                            # Handle both list and numpy array formats
                            if isinstance(cell_box, np.ndarray):
                                cell_box = cell_box.tolist()

                            if len(cell_box) >= 4:
                                cx1, cy1, cx2, cy2 = cell_box[0], cell_box[1], cell_box[2], cell_box[3]

                                # Prefer table-specific OCR text over the HTML cell text
                                cell_text_final = cell_text
                                if cell_idx < len(table_rec_texts) and table_rec_texts[cell_idx]:
                                    cell_text_final = table_rec_texts[cell_idx]

                                # Use center Y for positioning
                                cell_center_y = (cy1 + cy2) / 2

                                all_text_elements.append({
                                    'y': int(cell_center_y),
                                    'x': int(cx1),
                                    'x2': int(cx2),
                                    'y2': int(cy2),
                                    'text': cell_text_final.strip() if cell_text_final else '',
                                    'type': 'table_cell',
                                    'is_table': True,
                                    'row_idx': row_idx,
                                    'col_idx': col_idx
                                })
                            cell_idx += 1
            except Exception as e:
                print(f"Warning: Table parsing error: {e}")
                import traceback
                traceback.print_exc()

    # Process non-table text using precise OCR coordinates from overall_ocr_res.
    # Filter out OCR boxes that fall within table regions to avoid duplicates.
    if ocr_boxes and ocr_texts:
        for ocr_box, ocr_text in zip(ocr_boxes, ocr_texts):
            if not ocr_text or not ocr_text.strip():
                continue

            # Handle both list and numpy array formats
            if isinstance(ocr_box, np.ndarray):
                ocr_box = ocr_box.tolist()

            if len(ocr_box) >= 4:
                ox1, oy1, ox2, oy2 = ocr_box[0], ocr_box[1], ocr_box[2], ocr_box[3]

                # Check if this OCR box is inside a table region
                in_table = False
                for tx1, ty1, tx2, ty2 in table_bboxes:
                    # An OCR box counts as "inside" when its center is within the table
                    center_x = (ox1 + ox2) / 2
                    center_y = (oy1 + oy2) / 2
                    if tx1 <= center_x <= tx2 and ty1 <= center_y <= ty2:
                        in_table = True
                        break

                # Only add if not in table (table cells already processed)
                if not in_table:
                    # Use center Y for positioning
                    center_y = (oy1 + oy2) / 2

                    all_text_elements.append({
                        'y': int(center_y),
                        'x': int(ox1),
                        'x2': int(ox2),
                        'y2': int(oy2),
                        'text': ocr_text.strip(),
                        'type': 'text',
                        'is_table': False
                    })

    # Group text elements by Y position (row clustering)
    Y_TOLERANCE_BASE = 10
    Y_TOLERANCE_TABLE = 20  # Reduced for better row grouping

    # Separate table cells and non-table elements
    table_cells = [e for e in all_text_elements if e.get('is_table', False)]
    non_table_elements = [e for e in all_text_elements if not e.get('is_table', False)]

    lines_dict = {}

    # Group table cells by row using actual Y-coordinates with improved clustering
    if table_cells:
        # Sort by Y, then by X for consistent ordering
        table_cells_sorted = sorted(table_cells, key=lambda x: (x['y'], x['x']))

        # Use row_idx if available (from HTML parsing), otherwise cluster by Y
        table_rows = []
        if table_cells_sorted and 'row_idx' in table_cells_sorted[0]:
            # Group by row_idx first
            row_groups = {}
            for cell in table_cells_sorted:
                row_idx = cell.get('row_idx', 0)
                if row_idx not in row_groups:
                    row_groups[row_idx] = []
                row_groups[row_idx].append(cell)

            # Convert to list and sort by row_idx
            for row_idx in sorted(row_groups.keys()):
                row_cells = row_groups[row_idx]
                # Sort cells within row by X (col_idx if available)
                row_cells.sort(key=lambda x: (x.get('col_idx', 0), x['x']))
                table_rows.append(row_cells)
        else:
            # Fallback: cluster by Y-coordinate
            current_row = [table_cells_sorted[0]]
            current_row_y = table_cells_sorted[0]['y']

            for cell in table_cells_sorted[1:]:
                cell_y = cell['y']
                if abs(cell_y - current_row_y) <= Y_TOLERANCE_TABLE:
                    current_row.append(cell)
                    # Use median Y for better row representation
                    current_row_y = sorted([c['y'] for c in current_row])[len(current_row) // 2]
                else:
                    # Sort current row by X before adding
                    current_row.sort(key=lambda x: x['x'])
                    table_rows.append(current_row)
                    current_row = [cell]
                    current_row_y = cell_y

            if current_row:
                current_row.sort(key=lambda x: x['x'])
                table_rows.append(current_row)

        # Add table rows to lines_dict using median Y
        for row_cells in table_rows:
            if row_cells:
                # Use median Y for row representation
                row_ys = [cell['y'] for cell in row_cells]
                median_y = sorted(row_ys)[len(row_ys) // 2]
                if median_y not in lines_dict:
                    lines_dict[median_y] = []
                lines_dict[median_y].extend(row_cells)

    # Group non-table elements by Y position
    for elem in non_table_elements:
        y_pos = elem['y']
        matched_line = None

        # Find closest existing line within tolerance
        for existing_y in lines_dict.keys():
            if abs(existing_y - y_pos) <= Y_TOLERANCE_BASE:
                matched_line = existing_y
                break

        if matched_line is None:
            matched_line = y_pos

        if matched_line not in lines_dict:
            lines_dict[matched_line] = []
        lines_dict[matched_line].append(elem)

    # Build formatted output with precise positioning and smart spacing
    formatted_lines = []
    sorted_y_positions = sorted(lines_dict.keys())
    last_y = None

    for y_pos in sorted_y_positions:
        items = lines_dict[y_pos]
        items.sort(key=lambda x: x['x'])

        # Add blank lines for vertical spacing
        if last_y is not None:
            gap = y_pos - last_y
            if gap > 30:
                blank_lines = min(3, int(gap / 40))
                for _ in range(blank_lines):
                    formatted_lines.append('')

        # Build line with precise character positioning and smart spacing
        line_array = [' '] * MAX_LINE_WIDTH

        prev_x2 = None  # Track end position of previous text element

        for item_idx, item in enumerate(items):
            x_pos = item['x']
            x2_pos = item.get('x2', x_pos)
            text = item['text'].strip()
            if not text:
                continue

            is_table_cell = item.get('is_table', False)
            char_col = int(x_pos / PIXELS_PER_CHAR)
            char_col = max(0, min(char_col, MAX_LINE_WIDTH - len(text) - 1))

            # Calculate spacing from previous element
            if prev_x2 is not None and item_idx > 0:
                gap_pixels = x_pos - prev_x2
                if gap_pixels > 0:
                    spaces_to_add = calculate_spacing(gap_pixels, PIXELS_PER_CHAR)
                    # Ensure we don't overwrite existing text
                    prev_char_col_end = int(prev_x2 / PIXELS_PER_CHAR)
                    if char_col > prev_char_col_end:
                        # Add spaces between elements
                        for s in range(min(spaces_to_add, char_col - prev_char_col_end)):
                            space_pos = prev_char_col_end + s
                            if space_pos < MAX_LINE_WIDTH and line_array[space_pos] == ' ':
                                line_array[space_pos] = ' '

            # Place text at calculated position
            for i, char in enumerate(text):
                pos = char_col + i
                if pos < MAX_LINE_WIDTH:
                    if is_table_cell:
                        # For table cells, overwrite to ensure proper alignment
                        line_array[pos] = char
                    elif line_array[pos] == ' ':
                        # For non-table text, only place if position is empty
                        line_array[pos] = char

            prev_x2 = x2_pos

        # Convert to string
        line_str = ''.join(line_array).rstrip()
        if line_str.strip():
            formatted_lines.append(line_str)

        last_y = y_pos

    return '\n'.join(formatted_lines)
400
+
401
+
402
# Global engine cache to avoid reinitializing on multiple runs
_engine_cache = None


def main() -> None:
    """Run the full extraction pipeline on a hard-coded invoice image.

    Steps: initialize (or reuse) the PPStructureV3 engine, run structure
    analysis on ``test_invoice2.jpg``, format the result with
    ``format_text_with_layout``, print it, and save text + JSON outputs
    under ``./output_results``. All failures are reported to stdout and
    abort the run early.
    """
    global _engine_cache

    # Start total timer
    total_start = time.time()

    # Configuration (hard-coded input image and output directory)
    img_path = 'test_invoice2.jpg'
    save_folder = './output_results'

    # Create output directory
    Path(save_folder).mkdir(exist_ok=True)

    # Check if image exists
    if not os.path.exists(img_path):
        print(f"Error: Image file '{img_path}' not found!")
        return

    # Initialize PPStructureV3 with optimized settings (reuse if already initialized)
    print("=" * 80)
    print("Initializing PPStructureV3 with PaddlePaddle 3.2.2")
    print("Using PP-OCRv5 (latest OCR engine)")
    print("=" * 80)

    # Verify the env var set at module import time actually took effect
    check_disabled = os.environ.get('PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK', 'False')
    if check_disabled in ('1', 'True', 'true', 'TRUE'):
        print("Model source check: DISABLED (fast mode)")
    else:
        print("WARNING: Model source check may still be enabled!")

    print("\nInitializing models (this may take a moment on first run)...\n")

    start_init = time.time()
    try:
        if _engine_cache is None:
            structure_engine = PPStructureV3(
                lang='en',
                ocr_version='PP-OCRv5',
                use_table_recognition=True,
                use_chart_recognition=False,  # Disable for invoices
                use_formula_recognition=False,  # Disable for invoices
                use_seal_recognition=False,  # Disable for invoices
                use_region_detection=False,  # Disable for faster processing
            )
            _engine_cache = structure_engine
            init_time = time.time() - start_init
            print(f"[OK] PPStructureV3 initialized successfully ({init_time:.1f}s)\n")
        else:
            structure_engine = _engine_cache
            print("[OK] Using cached PPStructureV3 engine (0.0s)\n")
    except Exception as e:
        print(f"Error initializing PPStructureV3: {e}")
        return

    # Read image (only to report its dimensions; predict() gets the path)
    print(f"Processing image: {img_path}")
    img = cv2.imread(img_path)
    if img is None:
        print(f"Error: Could not read image '{img_path}'")
        return

    img_height, img_width = img.shape[:2]
    print(f"Image dimensions: {img_width} x {img_height} pixels\n")

    # Run inference
    print("Running document structure analysis...")
    print("Using:")
    print(" - PP-OCRv5 for text recognition")
    print(" - Advanced table recognition with cell detection")
    print(" - Layout preservation with precise coordinates\n")

    start_inference = time.time()
    try:
        result = structure_engine.predict(
            img_path,  # Use file path for better compatibility
            use_table_recognition=True,
            use_ocr_results_with_table_cells=True,
            use_e2e_wireless_table_rec_model=True,
            use_table_orientation_classify=True,
            use_chart_recognition=False,  # Disable for invoices
            use_formula_recognition=False,  # Disable for invoices
            use_seal_recognition=False,  # Disable for invoices
        )

        inference_time = time.time() - start_inference
        print(f"[OK] Analysis complete! ({inference_time:.1f}s)\n")

        # Extract parsing results for a human-readable detection summary
        if result and isinstance(result[0], dict):
            page_data = result[0]
            parsing_res_list = page_data.get('parsing_res_list', [])
            table_res_list = page_data.get('table_res_list', [])

            print(f"Detection Results:")
            print(f" Total regions detected: {len(parsing_res_list)}\n")

            for i, region in enumerate(parsing_res_list):
                # Handle both dict and LayoutBlock object
                if isinstance(region, dict):
                    region_type = region.get('label', 'unknown')
                    bbox = region.get('bbox', [])
                else:
                    # LayoutBlock object - access attributes directly
                    region_type = getattr(region, 'label', 'unknown')
                    bbox = getattr(region, 'bbox', [])

                print(f" Region {i}: type={region_type}, bbox={bbox}")

                if region_type == 'table':
                    print(f" -> Table detected with HTML structure")

            print(f"\n Tables detected: {len(table_res_list)}\n")
            print("-" * 80 + "\n")

    except Exception as e:
        print(f"Error during inference: {e}")
        import traceback
        traceback.print_exc()
        return

    # Format text with layout preservation
    print("Formatting text with layout preservation...")
    start_format = time.time()
    try:
        layout_preserved_text = format_text_with_layout(result, img_height, img_width)
        format_time = time.time() - start_format
        print(f"[OK] Layout formatting complete! ({format_time:.1f}s)\n")
    except Exception as e:
        print(f"Error formatting layout: {e}")
        import traceback
        traceback.print_exc()
        return

    # Display output
    print("=" * 80)
    print("EXTRACTED TEXT (LAYOUT PRESERVED)")
    print("=" * 80 + "\n")
    print(layout_preserved_text)
    print("\n" + "=" * 80 + "\n")

    # Save results (plain text + raw JSON dump of the prediction)
    output_layout_file = os.path.join(save_folder, f"{Path(img_path).stem}_layout_preserved.txt")
    output_json_file = os.path.join(save_folder, f"{Path(img_path).stem}_result.json")

    try:
        with open(output_layout_file, 'w', encoding='utf-8') as f:
            f.write(layout_preserved_text)

        def json_serial(obj):
            # Best-effort serializer for non-JSON types in the raw result
            # (falls back to str(), so output is lossy for exotic objects).
            if hasattr(obj, '__dict__'):
                return obj.__dict__
            elif isinstance(obj, (list, tuple)):
                return list(obj)
            return str(obj)

        with open(output_json_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False, default=json_serial)

        print("Results saved:")
        print(f" [OK] Layout-preserved text: {output_layout_file}")
        print(f" [OK] JSON result: {output_json_file}")

        total_time = time.time() - total_start
        print(f"\n[OK] Extraction complete! (Total time: {total_time:.1f}s)")

    except Exception as e:
        print(f"Error saving results: {e}")


if __name__ == '__main__':
    main()
app_gradio.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face Spaces - Invoice Extraction with Layout Preservation
3
+ Gradio interface for document extraction using PaddlePaddle PPStructureV3
4
+ """
5
+
6
+ import os
7
+ import time
8
+ import tempfile
9
+ from pathlib import Path
10
+
11
+ # CRITICAL: Set environment variables BEFORE any other imports
12
+ os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = '1'
13
+ os.environ['DISABLE_MODEL_SOURCE_CHECK'] = '1'
14
+
15
+ import warnings
16
+ warnings.filterwarnings('ignore', message='.*Checking connectivity.*')
17
+ warnings.filterwarnings('ignore', message='.*model hoster.*')
18
+
19
+ import gradio as gr
20
+ import cv2
21
+ from paddleocr import PPStructureV3
22
+ import numpy as np
23
+
24
+ # Import the layout formatting function from app.py
25
+ from app import format_text_with_layout
26
+
27
+ # Global engine cache
28
+ _engine_cache = None
29
+
30
+
31
def initialize_engine():
    """Return the module-wide PPStructureV3 engine, creating it on first use.

    The engine is cached in the module-level ``_engine_cache`` so repeated
    requests reuse the already-loaded models.
    """
    global _engine_cache
    if _engine_cache is not None:
        return _engine_cache

    print("Initializing PPStructureV3...")
    engine = PPStructureV3(
        lang='en',
        ocr_version='PP-OCRv5',
        use_table_recognition=True,
        use_chart_recognition=False,
        use_formula_recognition=False,
        use_seal_recognition=False,
        use_region_detection=False,
    )
    print("Engine initialized!")
    _engine_cache = engine
    return _engine_cache
47
+
48
+
49
def process_invoice(image):
    """OCR an uploaded invoice image and return layout-preserved text.

    ``image`` is the RGB numpy array Gradio provides (or None when nothing
    was uploaded). Any failure is reported as a user-facing error string
    rather than raised.
    """
    if image is None:
        return "Please upload an image file."

    try:
        engine = initialize_engine()

        # Persist the upload as a JPEG on disk; the engine takes a file path.
        with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as handle:
            temp_image_path = handle.name
            cv2.imwrite(temp_image_path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR))

        try:
            height, width = image.shape[:2]

            prediction = engine.predict(
                temp_image_path,
                use_table_recognition=True,
                use_ocr_results_with_table_cells=True,
                use_e2e_wireless_table_rec_model=True,
                use_table_orientation_classify=True,
                use_chart_recognition=False,
                use_formula_recognition=False,
                use_seal_recognition=False,
            )

            return format_text_with_layout(prediction, height, width)

        finally:
            # Always remove the temp file, even when inference fails.
            if os.path.exists(temp_image_path):
                os.unlink(temp_image_path)

    except Exception as e:
        return f"Error processing image: {str(e)}\n\nPlease try again or check if the image is a valid invoice document."
91
+
92
+
93
# Create Gradio interface: image upload on the left, extracted text on the
# right, with process_invoice wired to the "Extract Text" button.
with gr.Blocks(title="Invoice Extraction with Layout Preservation") as demo:
    gr.Markdown("""
    # 📄 Invoice Extraction with Layout Preservation

    Extract text from invoice images while preserving the original layout and formatting.

    **Features:**
    - ✅ Precise text extraction using PP-OCRv5
    - ✅ Table recognition with cell-level accuracy
    - ✅ Layout preservation matching original document
    - ✅ Smart spacing and column alignment

    **How to use:**
    1. Upload an invoice image (JPG, PNG, etc.)
    2. Click "Extract Text"
    3. View the extracted text with preserved layout
    """)

    with gr.Row():
        with gr.Column():
            # Input side: numpy-typed image so process_invoice gets an RGB array
            image_input = gr.Image(
                label="Upload Invoice Image",
                type="numpy",
                height=400
            )
            extract_btn = gr.Button("Extract Text", variant="primary", size="lg")

        with gr.Column():
            # Output side: monospace-friendly textbox with copy support
            text_output = gr.Textbox(
                label="Extracted Text (Layout Preserved)",
                lines=30,
                max_lines=50,
                show_copy_button=True
            )

    # Examples (currently empty placeholder list)
    gr.Examples(
        examples=[],
        inputs=image_input,
        label="Example Invoices (add your examples here)"
    )

    # Process function: button click runs OCR and fills the textbox
    extract_btn.click(
        fn=process_invoice,
        inputs=image_input,
        outputs=text_output
    )

    gr.Markdown("""
    ---
    **Powered by:**
    - PaddlePaddle 3.2.2
    - PPStructureV3
    - PP-OCRv5

    **Note:** First run may take longer as models are downloaded and initialized.
    """)

if __name__ == "__main__":
    # Bind to all interfaces on the port Hugging Face Spaces expects.
    demo.launch(server_name="0.0.0.0", server_port=7860)
155
+
requirements_hf.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hugging Face Spaces Requirements
2
+ # Optimized for deployment
3
+
4
+ # PaddlePaddle and PaddleOCR
5
+ paddlepaddle==3.2.2
6
+ paddleocr>=3.3.2
7
+
8
+ # Image processing
9
+ opencv-python-headless>=4.8.0
10
+ Pillow>=10.0.0
11
+
12
+ # Core dependencies
13
+ numpy>=1.21,<2.0
14
+
15
+ # Gradio for web interface
16
+ gradio>=4.0.0
17
+
18
+ # Utilities (optional, can be removed if not needed)
19
+ python-docx>=0.8.11
20
+ openpyxl>=3.0.0
21
+