Spaces:

quantumbit
/

invoice_extractor

Paused

App Files Files Community

github-actions[bot] committed on Feb 6

Commit

b2ccdfc

1 Parent(s): 3679ff9

Sync from GitHub: 142da7f9640159d16a85a2917851324f6b7e8d54

Browse files

Files changed (6) hide show

.gitignore +3 -1
Dockerfile +13 -1
app.py +14 -8
config.py +5 -0
inference.py +51 -8
utils/image_enhancer.py +233 -0

.gitignore CHANGED Viewed

@@ -37,4 +37,6 @@ frontend/.env.local
 test*
 executable.py
 client_example.py
-Docs

 test*
 executable.py
 client_example.py
+Docs
+realesrgan

Dockerfile CHANGED Viewed

@@ -2,7 +2,7 @@ FROM python:3.10-slim
 WORKDIR /app
-# Install system dependencies including Node.js
 RUN apt-get update && apt-get install -y \
     git \
     libgl1 \
@@ -12,10 +12,22 @@ RUN apt-get update && apt-get install -y \
     libxrender-dev \
     libgomp1 \
     curl \
     && curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
     && apt-get install -y nodejs \
     && rm -rf /var/lib/apt/lists/*
 # Copy requirements first for better caching
 COPY requirements.txt .

 WORKDIR /app
+# Install system dependencies including Node.js and tools for Real-ESRGAN
 RUN apt-get update && apt-get install -y \
     git \
     libgl1 \
     libxrender-dev \
     libgomp1 \
     curl \
+    wget \
+    unzip \
+    libvulkan1 \
+    libvulkan-dev \
     && curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
     && apt-get install -y nodejs \
     && rm -rf /var/lib/apt/lists/*
+# Download and setup Real-ESRGAN-ncnn-vulkan for image enhancement
+RUN mkdir -p /app/utils/realesrgan && \
+    cd /app/utils/realesrgan && \
+    wget https://github.com/xinntao/Real-ESRGAN-ncnn-vulkan/releases/download/v0.2.0/realesrgan-ncnn-vulkan-v0.2.0-ubuntu.zip && \
+    unzip realesrgan-ncnn-vulkan-v0.2.0-ubuntu.zip && \
+    rm realesrgan-ncnn-vulkan-v0.2.0-ubuntu.zip && \
+    chmod +x realesrgan-ncnn-vulkan
 # Copy requirements first for better caching
 COPY requirements.txt .

app.py CHANGED Viewed

@@ -98,7 +98,8 @@ async def health_check():
 @app.post("/extract")
 async def extract_invoice(
     file: UploadFile = File(..., description="Invoice image file (JPG, PNG, JPEG)"),
-    doc_id: Optional[str] = Form(None, description="Optional document identifier")
 ):
     """
     Extract information from invoice image
@@ -106,6 +107,7 @@ async def extract_invoice(
     **Parameters:**
     - **file**: Invoice image file (required)
     - **doc_id**: Optional document identifier (auto-generated from filename if not provided)
     **Returns:**
     - JSON with extracted fields, confidence scores, and metadata
@@ -170,8 +172,8 @@ async def extract_invoice(
         if doc_id is None:
             doc_id = os.path.splitext(file.filename)[0]
-        # Process invoice
-        result = InferenceProcessor.process_invoice(temp_file, doc_id)
         # Add total request time (includes file I/O)
         result['total_request_time_sec'] = round(time.time() - request_start, 2)
@@ -199,7 +201,8 @@ async def extract_invoice(
 @app.post("/process-invoice")
 async def process_invoice(
-    file: UploadFile = File(..., description="Invoice image file")
 ):
     """
     Process a single invoice and return extracted information
@@ -207,6 +210,7 @@ async def process_invoice(
     **Parameters:**
     - **file**: Invoice image file (required)
     **Returns:**
     - JSON with extracted_text, signature_coords, stamp_coords
@@ -237,8 +241,8 @@ async def process_invoice(
         # Use filename as doc_id
         doc_id = os.path.splitext(file.filename)[0] if file.filename else "invoice"
-        # Process invoice
-        result = InferenceProcessor.process_invoice(temp_file, doc_id)
         # Extract fields from result
         fields = result.get("fields", {})
@@ -303,13 +307,15 @@ async def process_invoice(
 @app.post("/extract_batch")
 async def extract_batch(
-    files: list[UploadFile] = File(..., description="Multiple invoice images")
 ):
     """
     Extract information from multiple invoice images
     **Parameters:**
     - **files**: List of invoice image files
     **Returns:**
     - JSON array with results for each invoice
@@ -344,7 +350,7 @@ async def extract_batch(
             # Process
             try:
                 doc_id = os.path.splitext(file.filename)[0]
-                result = InferenceProcessor.process_invoice(temp_file, doc_id)
                 results.append(result)
             except Exception as e:
                 results.append({

 @app.post("/extract")
 async def extract_invoice(
     file: UploadFile = File(..., description="Invoice image file (JPG, PNG, JPEG)"),
+    doc_id: Optional[str] = Form(None, description="Optional document identifier"),
+    enhance: Optional[bool] = Form(None, description="Enable image enhancement (default: True)")
 ):
     """
     Extract information from invoice image
     **Parameters:**
     - **file**: Invoice image file (required)
     - **doc_id**: Optional document identifier (auto-generated from filename if not provided)
+    - **enhance**: Enable image enhancement for blurry images (default: True)
     **Returns:**
     - JSON with extracted fields, confidence scores, and metadata
         if doc_id is None:
             doc_id = os.path.splitext(file.filename)[0]
+        # Process invoice (with optional enhancement)
+        result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance=enhance)
         # Add total request time (includes file I/O)
         result['total_request_time_sec'] = round(time.time() - request_start, 2)
 @app.post("/process-invoice")
 async def process_invoice(
+    file: UploadFile = File(..., description="Invoice image file"),
+    enhance: Optional[bool] = Form(None, description="Enable image enhancement (default: True)")
 ):
     """
     Process a single invoice and return extracted information
     **Parameters:**
     - **file**: Invoice image file (required)
+    - **enhance**: Enable image enhancement for blurry images (default: True)
     **Returns:**
     - JSON with extracted_text, signature_coords, stamp_coords
         # Use filename as doc_id
         doc_id = os.path.splitext(file.filename)[0] if file.filename else "invoice"
+        # Process invoice (with optional enhancement)
+        result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance=enhance)
         # Extract fields from result
         fields = result.get("fields", {})
 @app.post("/extract_batch")
 async def extract_batch(
+    files: list[UploadFile] = File(..., description="Multiple invoice images"),
+    enhance: Optional[bool] = Form(None, description="Enable image enhancement (default: True)")
 ):
     """
     Extract information from multiple invoice images
     **Parameters:**
     - **files**: List of invoice image files
+    - **enhance**: Enable image enhancement for blurry images (default: True)
     **Returns:**
     - JSON array with results for each invoice
             # Process
             try:
                 doc_id = os.path.splitext(file.filename)[0]
+                result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance=enhance)
                 results.append(result)
             except Exception as e:
                 results.append({

config.py CHANGED Viewed

@@ -26,6 +26,11 @@ QUANTIZATION_CONFIG = {
 # Image processing settings
 MAX_IMAGE_SIZE = 800  # Maximum dimension for resizing
 # Detection thresholds
 YOLO_CONFIDENCE_THRESHOLD = 0.25

 # Image processing settings
 MAX_IMAGE_SIZE = 800  # Maximum dimension for resizing
+# Image Enhancement Settings (Real-ESRGAN)
+ENABLE_IMAGE_ENHANCEMENT = True  # Enable/disable image enhancement
+ENHANCEMENT_SCALE = 2  # Upscaling factor (2, 3, or 4)
+ENHANCEMENT_MODEL = "realesrgan-x4plus"  # Model: realesrgan-x4plus, realesrgan-x4plus-anime, realesrnet-x4plus
 # Detection thresholds
 YOLO_CONFIDENCE_THRESHOLD = 0.25

inference.py CHANGED Viewed

@@ -7,6 +7,7 @@ import time
 import json
 import codecs
 import re
 from PIL import Image
 from qwen_vl_utils import process_vision_info
 from typing import Dict, Tuple
@@ -15,9 +16,13 @@ from config import (
     MAX_IMAGE_SIZE,
     HP_VALID_RANGE,
     ASSET_COST_VALID_RANGE,
-    COST_PER_GPU_HOUR
 )
 from model_manager import model_manager
 EXTRACTION_PROMPT = """
@@ -64,11 +69,48 @@ class InferenceProcessor:
     """Handles VLM inference, validation, and result processing"""
     @staticmethod
-    def preprocess_image(image_path: str) -> Image.Image:
-        """Load and resize image if needed"""
-        image = Image.open(image_path).convert("RGB")
-        # Resize if too large
         if max(image.size) > MAX_IMAGE_SIZE:
             ratio = MAX_IMAGE_SIZE / max(image.size)
             new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
@@ -284,13 +326,14 @@ class InferenceProcessor:
         return validated, field_confidence, warnings
     @staticmethod
-    def process_invoice(image_path: str, doc_id: str = None) -> Dict:
         """
         Complete invoice processing pipeline
         Args:
             image_path: Path to invoice image
             doc_id: Document identifier (optional)
         Returns:
             dict: Complete JSON output with all fields
@@ -303,9 +346,9 @@ class InferenceProcessor:
             import os
             doc_id = os.path.splitext(os.path.basename(image_path))[0]
-        # Step 1: Preprocess image
         t1 = time.time()
-        image = InferenceProcessor.preprocess_image(image_path)
         timing_breakdown['image_preprocessing'] = round(time.time() - t1, 3)
         # Step 2: YOLO Detection

 import json
 import codecs
 import re
+import os
 from PIL import Image
 from qwen_vl_utils import process_vision_info
 from typing import Dict, Tuple
     MAX_IMAGE_SIZE,
     HP_VALID_RANGE,
     ASSET_COST_VALID_RANGE,
+    COST_PER_GPU_HOUR,
+    ENABLE_IMAGE_ENHANCEMENT,
+    ENHANCEMENT_SCALE,
+    ENHANCEMENT_MODEL
 )
 from model_manager import model_manager
+from utils.image_enhancer import get_enhancer
 EXTRACTION_PROMPT = """
     """Handles VLM inference, validation, and result processing"""
     @staticmethod
+    def preprocess_image(image_path: str, enhance: bool = None) -> Image.Image:
+        """Load, enhance (optional), and resize image if needed
+        Args:
+            image_path: Path to input image
+            enhance: Whether to enhance image quality before processing (None=use config default)
+        Returns:
+            Preprocessed PIL Image ready for VLM inference
+        """
+        # Use config default if not specified
+        if enhance is None:
+            enhance = ENABLE_IMAGE_ENHANCEMENT
+        # Step 1: Enhance image if enabled
+        enhanced_path = image_path
+        cleanup_enhanced = False
+        if enhance:
+            try:
+                enhancer = get_enhancer()
+                enhanced_path = enhancer.enhance_image(
+                    image_path,
+                    scale=ENHANCEMENT_SCALE,
+                    model_name=ENHANCEMENT_MODEL
+                )
+                cleanup_enhanced = (enhanced_path != image_path)
+            except Exception as e:
+                print(f"⚠️ Enhancement failed: {str(e)}, using original image")
+                enhanced_path = image_path
+        # Step 2: Load image
+        image = Image.open(enhanced_path).convert("RGB")
+        # Cleanup enhanced temp file if created
+        if cleanup_enhanced:
+            try:
+                os.unlink(enhanced_path)
+            except:
+                pass
+        # Step 3: Resize if too large
         if max(image.size) > MAX_IMAGE_SIZE:
             ratio = MAX_IMAGE_SIZE / max(image.size)
             new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
         return validated, field_confidence, warnings
     @staticmethod
+    def process_invoice(image_path: str, doc_id: str = None, enhance: bool = None) -> Dict:
         """
         Complete invoice processing pipeline
         Args:
             image_path: Path to invoice image
             doc_id: Document identifier (optional)
+            enhance: Whether to enhance image (None=use config default)
         Returns:
             dict: Complete JSON output with all fields
             import os
             doc_id = os.path.splitext(os.path.basename(image_path))[0]
+        # Step 1: Preprocess image (with optional enhancement)
         t1 = time.time()
+        image = InferenceProcessor.preprocess_image(image_path, enhance=enhance)
         timing_breakdown['image_preprocessing'] = round(time.time() - t1, 3)
         # Step 2: YOLO Detection

utils/image_enhancer.py ADDED Viewed

	@@ -0,0 +1,233 @@

+"""
+Image Enhancement Utility using Real-ESRGAN-ncnn-vulkan
+Enhances blurry/low-quality images before VLM processing
+"""
+import os
+import subprocess
+import tempfile
+import zipfile
+import shutil
+from pathlib import Path
+from PIL import Image
+import urllib.request
+import platform
+class ImageEnhancer:
+    """Handles image enhancement using Real-ESRGAN-ncnn-vulkan"""
+    # Download URLs for Windows executable
+    REALESRGAN_VERSION = "v0.2.0"
+    REALESRGAN_WINDOWS_URL = f"https://github.com/xinntao/Real-ESRGAN-ncnn-vulkan/releases/download/{REALESRGAN_VERSION}/realesrgan-ncnn-vulkan-v0.2.0-windows.zip"
+    def __init__(self, base_dir: str = None):
+        """Initialize image enhancer
+        Args:
+            base_dir: Base directory for storing executable and models
+        """
+        if base_dir is None:
+            base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        self.base_dir = Path(base_dir)
+        self.enhancer_dir = self.base_dir / "utils" / "realesrgan"
+        self.executable_path = None
+        self.models_path = None
+        self.is_available = False
+        # Initialize enhancer
+        self._setup_enhancer()
+    def _setup_enhancer(self):
+        """Setup Real-ESRGAN enhancer (download if needed)"""
+        try:
+            # Check if already exists
+            if self._check_existing_installation():
+                print("✅ Real-ESRGAN enhancer already installed")
+                self.is_available = True
+                return
+            # Download and setup
+            print("📥 Downloading Real-ESRGAN enhancer...")
+            self._download_and_extract()
+            if self._check_existing_installation():
+                print("✅ Real-ESRGAN enhancer installed successfully")
+                self.is_available = True
+            else:
+                print("⚠️ Real-ESRGAN enhancer setup incomplete")
+                self.is_available = False
+        except Exception as e:
+            print(f"⚠️ Failed to setup Real-ESRGAN enhancer: {str(e)}")
+            print("   Image enhancement will be skipped")
+            self.is_available = False
+    def _check_existing_installation(self) -> bool:
+        """Check if Real-ESRGAN is already installed"""
+        if not self.enhancer_dir.exists():
+            return False
+        # Look for executable
+        exe_name = "realesrgan-ncnn-vulkan.exe" if platform.system() == "Windows" else "realesrgan-ncnn-vulkan"
+        possible_paths = [
+            self.enhancer_dir / exe_name,
+            self.enhancer_dir / "realesrgan-ncnn-vulkan" / exe_name,
+        ]
+        for path in possible_paths:
+            if path.exists():
+                self.executable_path = path
+                # Look for models directory
+                models_dir = path.parent / "models"
+                if models_dir.exists():
+                    self.models_path = models_dir
+                    return True
+        return False
+    def _download_and_extract(self):
+        """Download and extract Real-ESRGAN executable"""
+        if platform.system() != "Windows":
+            print("⚠️ Auto-download only supported on Windows. Please manually install Real-ESRGAN-ncnn-vulkan")
+            return
+        # Create directory
+        self.enhancer_dir.mkdir(parents=True, exist_ok=True)
+        # Download file
+        zip_path = self.enhancer_dir / "realesrgan.zip"
+        print(f"   Downloading from {self.REALESRGAN_WINDOWS_URL}...")
+        try:
+            urllib.request.urlretrieve(self.REALESRGAN_WINDOWS_URL, zip_path)
+        except Exception as e:
+            print(f"   Download failed: {str(e)}")
+            return
+        # Extract
+        print("   Extracting files...")
+        try:
+            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+                zip_ref.extractall(self.enhancer_dir)
+        except Exception as e:
+            print(f"   Extraction failed: {str(e)}")
+            return
+        # Cleanup zip file
+        zip_path.unlink()
+        print("   Setup complete!")
+    def enhance_image(self, image_path: str, scale: int = 2, model_name: str = "realesrgan-x4plus") -> str:
+        """Enhance image using Real-ESRGAN
+        Args:
+            image_path: Path to input image
+            scale: Upscale ratio (2, 3, or 4)
+            model_name: Model to use (realesrgan-x4plus, realesrgan-x4plus-anime, realesrnet-x4plus)
+        Returns:
+            Path to enhanced image
+        """
+        if not self.is_available:
+            print("⚠️ Enhancement not available, using original image")
+            return image_path
+        # Create temporary output file
+        input_path = Path(image_path)
+        output_path = input_path.parent / f"{input_path.stem}_enhanced{input_path.suffix}"
+        try:
+            # Build command
+            cmd = [
+                str(self.executable_path),
+                "-i", str(image_path),
+                "-o", str(output_path),
+                "-n", model_name,
+                "-s", str(scale),
+                "-f", "png"  # Output format
+            ]
+            # Add model path if available
+            if self.models_path:
+                cmd.extend(["-m", str(self.models_path)])
+            # Run enhancement
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=30,
+                creationflags=subprocess.CREATE_NO_WINDOW if platform.system() == "Windows" else 0
+            )
+            if result.returncode == 0 and output_path.exists():
+                print(f"✨ Image enhanced successfully (scale={scale}x)")
+                return str(output_path)
+            else:
+                if result.stderr:
+                    print(f"⚠️ Enhancement failed: {result.stderr}")
+                print("   Using original image")
+                return image_path
+        except subprocess.TimeoutExpired:
+            print("⚠️ Enhancement timeout, using original image")
+            return image_path
+        except Exception as e:
+            print(f"⚠️ Enhancement error: {str(e)}, using original image")
+            return image_path
+    def enhance_pil_image(self, pil_image: Image.Image, scale: int = 2, model_name: str = "realesrgan-x4plus") -> Image.Image:
+        """Enhance PIL Image object
+        Args:
+            pil_image: PIL Image object
+            scale: Upscale ratio (2, 3, or 4)
+            model_name: Model to use
+        Returns:
+            Enhanced PIL Image object
+        """
+        if not self.is_available:
+            return pil_image
+        # Save to temporary file
+        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_input:
+            temp_input_path = temp_input.name
+            pil_image.save(temp_input_path, "PNG")
+        try:
+            # Enhance
+            enhanced_path = self.enhance_image(temp_input_path, scale, model_name)
+            # Load enhanced image
+            if enhanced_path != temp_input_path:
+                enhanced_image = Image.open(enhanced_path).convert("RGB")
+                # Cleanup enhanced temp file
+                try:
+                    os.unlink(enhanced_path)
+                except:
+                    pass
+                return enhanced_image
+            else:
+                return pil_image
+        finally:
+            # Cleanup input temp file
+            try:
+                os.unlink(temp_input_path)
+            except:
+                pass
+# Global enhancer instance
+_enhancer_instance = None
+def get_enhancer() -> ImageEnhancer:
+    """Get global enhancer instance"""
+    global _enhancer_instance
+    if _enhancer_instance is None:
+        _enhancer_instance = ImageEnhancer()
+    return _enhancer_instance