github-actions[bot] committed on
Commit
b2ccdfc
·
1 Parent(s): 3679ff9

Sync from GitHub: 142da7f9640159d16a85a2917851324f6b7e8d54

Browse files
Files changed (6) hide show
  1. .gitignore +3 -1
  2. Dockerfile +13 -1
  3. app.py +14 -8
  4. config.py +5 -0
  5. inference.py +51 -8
  6. utils/image_enhancer.py +233 -0
.gitignore CHANGED
@@ -37,4 +37,6 @@ frontend/.env.local
37
  test*
38
  executable.py
39
  client_example.py
40
- Docs
 
 
 
37
  test*
38
  executable.py
39
  client_example.py
40
+ Docs
41
+
42
+ realesrgan
Dockerfile CHANGED
@@ -2,7 +2,7 @@ FROM python:3.10-slim
2
 
3
  WORKDIR /app
4
 
5
- # Install system dependencies including Node.js
6
  RUN apt-get update && apt-get install -y \
7
  git \
8
  libgl1 \
@@ -12,10 +12,22 @@ RUN apt-get update && apt-get install -y \
12
  libxrender-dev \
13
  libgomp1 \
14
  curl \
 
 
 
 
15
  && curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
16
  && apt-get install -y nodejs \
17
  && rm -rf /var/lib/apt/lists/*
18
 
 
 
 
 
 
 
 
 
19
  # Copy requirements first for better caching
20
  COPY requirements.txt .
21
 
 
2
 
3
  WORKDIR /app
4
 
5
+ # Install system dependencies including Node.js and tools for Real-ESRGAN
6
  RUN apt-get update && apt-get install -y \
7
  git \
8
  libgl1 \
 
12
  libxrender-dev \
13
  libgomp1 \
14
  curl \
15
+ wget \
16
+ unzip \
17
+ libvulkan1 \
18
+ libvulkan-dev \
19
  && curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
20
  && apt-get install -y nodejs \
21
  && rm -rf /var/lib/apt/lists/*
22
 
23
+ # Download and setup Real-ESRGAN-ncnn-vulkan for image enhancement
24
+ RUN mkdir -p /app/utils/realesrgan && \
25
+ cd /app/utils/realesrgan && \
26
+ wget https://github.com/xinntao/Real-ESRGAN-ncnn-vulkan/releases/download/v0.2.0/realesrgan-ncnn-vulkan-v0.2.0-ubuntu.zip && \
27
+ unzip realesrgan-ncnn-vulkan-v0.2.0-ubuntu.zip && \
28
+ rm realesrgan-ncnn-vulkan-v0.2.0-ubuntu.zip && \
29
+ chmod +x realesrgan-ncnn-vulkan
30
+
31
  # Copy requirements first for better caching
32
  COPY requirements.txt .
33
 
app.py CHANGED
@@ -98,7 +98,8 @@ async def health_check():
98
  @app.post("/extract")
99
  async def extract_invoice(
100
  file: UploadFile = File(..., description="Invoice image file (JPG, PNG, JPEG)"),
101
- doc_id: Optional[str] = Form(None, description="Optional document identifier")
 
102
  ):
103
  """
104
  Extract information from invoice image
@@ -106,6 +107,7 @@ async def extract_invoice(
106
  **Parameters:**
107
  - **file**: Invoice image file (required)
108
  - **doc_id**: Optional document identifier (auto-generated from filename if not provided)
 
109
 
110
  **Returns:**
111
  - JSON with extracted fields, confidence scores, and metadata
@@ -170,8 +172,8 @@ async def extract_invoice(
170
  if doc_id is None:
171
  doc_id = os.path.splitext(file.filename)[0]
172
 
173
- # Process invoice
174
- result = InferenceProcessor.process_invoice(temp_file, doc_id)
175
 
176
  # Add total request time (includes file I/O)
177
  result['total_request_time_sec'] = round(time.time() - request_start, 2)
@@ -199,7 +201,8 @@ async def extract_invoice(
199
 
200
  @app.post("/process-invoice")
201
  async def process_invoice(
202
- file: UploadFile = File(..., description="Invoice image file")
 
203
  ):
204
  """
205
  Process a single invoice and return extracted information
@@ -207,6 +210,7 @@ async def process_invoice(
207
 
208
  **Parameters:**
209
  - **file**: Invoice image file (required)
 
210
 
211
  **Returns:**
212
  - JSON with extracted_text, signature_coords, stamp_coords
@@ -237,8 +241,8 @@ async def process_invoice(
237
  # Use filename as doc_id
238
  doc_id = os.path.splitext(file.filename)[0] if file.filename else "invoice"
239
 
240
- # Process invoice
241
- result = InferenceProcessor.process_invoice(temp_file, doc_id)
242
 
243
  # Extract fields from result
244
  fields = result.get("fields", {})
@@ -303,13 +307,15 @@ async def process_invoice(
303
 
304
  @app.post("/extract_batch")
305
  async def extract_batch(
306
- files: list[UploadFile] = File(..., description="Multiple invoice images")
 
307
  ):
308
  """
309
  Extract information from multiple invoice images
310
 
311
  **Parameters:**
312
  - **files**: List of invoice image files
 
313
 
314
  **Returns:**
315
  - JSON array with results for each invoice
@@ -344,7 +350,7 @@ async def extract_batch(
344
  # Process
345
  try:
346
  doc_id = os.path.splitext(file.filename)[0]
347
- result = InferenceProcessor.process_invoice(temp_file, doc_id)
348
  results.append(result)
349
  except Exception as e:
350
  results.append({
 
98
  @app.post("/extract")
99
  async def extract_invoice(
100
  file: UploadFile = File(..., description="Invoice image file (JPG, PNG, JPEG)"),
101
+ doc_id: Optional[str] = Form(None, description="Optional document identifier"),
102
+ enhance: Optional[bool] = Form(None, description="Enable image enhancement (default: True)")
103
  ):
104
  """
105
  Extract information from invoice image
 
107
  **Parameters:**
108
  - **file**: Invoice image file (required)
109
  - **doc_id**: Optional document identifier (auto-generated from filename if not provided)
110
+ - **enhance**: Enable image enhancement for blurry images (default: True)
111
 
112
  **Returns:**
113
  - JSON with extracted fields, confidence scores, and metadata
 
172
  if doc_id is None:
173
  doc_id = os.path.splitext(file.filename)[0]
174
 
175
+ # Process invoice (with optional enhancement)
176
+ result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance=enhance)
177
 
178
  # Add total request time (includes file I/O)
179
  result['total_request_time_sec'] = round(time.time() - request_start, 2)
 
201
 
202
  @app.post("/process-invoice")
203
  async def process_invoice(
204
+ file: UploadFile = File(..., description="Invoice image file"),
205
+ enhance: Optional[bool] = Form(None, description="Enable image enhancement (default: True)")
206
  ):
207
  """
208
  Process a single invoice and return extracted information
 
210
 
211
  **Parameters:**
212
  - **file**: Invoice image file (required)
213
+ - **enhance**: Enable image enhancement for blurry images (default: True)
214
 
215
  **Returns:**
216
  - JSON with extracted_text, signature_coords, stamp_coords
 
241
  # Use filename as doc_id
242
  doc_id = os.path.splitext(file.filename)[0] if file.filename else "invoice"
243
 
244
+ # Process invoice (with optional enhancement)
245
+ result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance=enhance)
246
 
247
  # Extract fields from result
248
  fields = result.get("fields", {})
 
307
 
308
  @app.post("/extract_batch")
309
  async def extract_batch(
310
+ files: list[UploadFile] = File(..., description="Multiple invoice images"),
311
+ enhance: Optional[bool] = Form(None, description="Enable image enhancement (default: True)")
312
  ):
313
  """
314
  Extract information from multiple invoice images
315
 
316
  **Parameters:**
317
  - **files**: List of invoice image files
318
+ - **enhance**: Enable image enhancement for blurry images (default: True)
319
 
320
  **Returns:**
321
  - JSON array with results for each invoice
 
350
  # Process
351
  try:
352
  doc_id = os.path.splitext(file.filename)[0]
353
+ result = InferenceProcessor.process_invoice(temp_file, doc_id, enhance=enhance)
354
  results.append(result)
355
  except Exception as e:
356
  results.append({
config.py CHANGED
@@ -26,6 +26,11 @@ QUANTIZATION_CONFIG = {
26
  # Image processing settings
27
  MAX_IMAGE_SIZE = 800 # Maximum dimension for resizing
28
 
 
 
 
 
 
29
  # Detection thresholds
30
  YOLO_CONFIDENCE_THRESHOLD = 0.25
31
 
 
26
  # Image processing settings
27
  MAX_IMAGE_SIZE = 800 # Maximum dimension for resizing
28
 
29
+ # Image Enhancement Settings (Real-ESRGAN)
30
+ ENABLE_IMAGE_ENHANCEMENT = True # Enable/disable image enhancement
31
+ ENHANCEMENT_SCALE = 2 # Upscaling factor (2, 3, or 4)
32
+ ENHANCEMENT_MODEL = "realesrgan-x4plus" # Model: realesrgan-x4plus, realesrgan-x4plus-anime, realesrnet-x4plus
33
+
34
  # Detection thresholds
35
  YOLO_CONFIDENCE_THRESHOLD = 0.25
36
 
inference.py CHANGED
@@ -7,6 +7,7 @@ import time
7
  import json
8
  import codecs
9
  import re
 
10
  from PIL import Image
11
  from qwen_vl_utils import process_vision_info
12
  from typing import Dict, Tuple
@@ -15,9 +16,13 @@ from config import (
15
  MAX_IMAGE_SIZE,
16
  HP_VALID_RANGE,
17
  ASSET_COST_VALID_RANGE,
18
- COST_PER_GPU_HOUR
 
 
 
19
  )
20
  from model_manager import model_manager
 
21
 
22
 
23
  EXTRACTION_PROMPT = """
@@ -64,11 +69,48 @@ class InferenceProcessor:
64
  """Handles VLM inference, validation, and result processing"""
65
 
66
  @staticmethod
67
- def preprocess_image(image_path: str) -> Image.Image:
68
- """Load and resize image if needed"""
69
- image = Image.open(image_path).convert("RGB")
70
 
71
- # Resize if too large
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  if max(image.size) > MAX_IMAGE_SIZE:
73
  ratio = MAX_IMAGE_SIZE / max(image.size)
74
  new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
@@ -284,13 +326,14 @@ class InferenceProcessor:
284
  return validated, field_confidence, warnings
285
 
286
  @staticmethod
287
- def process_invoice(image_path: str, doc_id: str = None) -> Dict:
288
  """
289
  Complete invoice processing pipeline
290
 
291
  Args:
292
  image_path: Path to invoice image
293
  doc_id: Document identifier (optional)
 
294
 
295
  Returns:
296
  dict: Complete JSON output with all fields
@@ -303,9 +346,9 @@ class InferenceProcessor:
303
  import os
304
  doc_id = os.path.splitext(os.path.basename(image_path))[0]
305
 
306
- # Step 1: Preprocess image
307
  t1 = time.time()
308
- image = InferenceProcessor.preprocess_image(image_path)
309
  timing_breakdown['image_preprocessing'] = round(time.time() - t1, 3)
310
 
311
  # Step 2: YOLO Detection
 
7
  import json
8
  import codecs
9
  import re
10
+ import os
11
  from PIL import Image
12
  from qwen_vl_utils import process_vision_info
13
  from typing import Dict, Tuple
 
16
  MAX_IMAGE_SIZE,
17
  HP_VALID_RANGE,
18
  ASSET_COST_VALID_RANGE,
19
+ COST_PER_GPU_HOUR,
20
+ ENABLE_IMAGE_ENHANCEMENT,
21
+ ENHANCEMENT_SCALE,
22
+ ENHANCEMENT_MODEL
23
  )
24
  from model_manager import model_manager
25
+ from utils.image_enhancer import get_enhancer
26
 
27
 
28
  EXTRACTION_PROMPT = """
 
69
  """Handles VLM inference, validation, and result processing"""
70
 
71
  @staticmethod
72
+ def preprocess_image(image_path: str, enhance: bool = None) -> Image.Image:
73
+ """Load, enhance (optional), and resize image if needed
 
74
 
75
+ Args:
76
+ image_path: Path to input image
77
+ enhance: Whether to enhance image quality before processing (None=use config default)
78
+
79
+ Returns:
80
+ Preprocessed PIL Image ready for VLM inference
81
+ """
82
+ # Use config default if not specified
83
+ if enhance is None:
84
+ enhance = ENABLE_IMAGE_ENHANCEMENT
85
+
86
+ # Step 1: Enhance image if enabled
87
+ enhanced_path = image_path
88
+ cleanup_enhanced = False
89
+
90
+ if enhance:
91
+ try:
92
+ enhancer = get_enhancer()
93
+ enhanced_path = enhancer.enhance_image(
94
+ image_path,
95
+ scale=ENHANCEMENT_SCALE,
96
+ model_name=ENHANCEMENT_MODEL
97
+ )
98
+ cleanup_enhanced = (enhanced_path != image_path)
99
+ except Exception as e:
100
+ print(f"⚠️ Enhancement failed: {str(e)}, using original image")
101
+ enhanced_path = image_path
102
+
103
+ # Step 2: Load image
104
+ image = Image.open(enhanced_path).convert("RGB")
105
+
106
+ # Cleanup enhanced temp file if created
107
+ if cleanup_enhanced:
108
+ try:
109
+ os.unlink(enhanced_path)
110
+ except:
111
+ pass
112
+
113
+ # Step 3: Resize if too large
114
  if max(image.size) > MAX_IMAGE_SIZE:
115
  ratio = MAX_IMAGE_SIZE / max(image.size)
116
  new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
 
326
  return validated, field_confidence, warnings
327
 
328
  @staticmethod
329
+ def process_invoice(image_path: str, doc_id: str = None, enhance: bool = None) -> Dict:
330
  """
331
  Complete invoice processing pipeline
332
 
333
  Args:
334
  image_path: Path to invoice image
335
  doc_id: Document identifier (optional)
336
+ enhance: Whether to enhance image (None=use config default)
337
 
338
  Returns:
339
  dict: Complete JSON output with all fields
 
346
  import os
347
  doc_id = os.path.splitext(os.path.basename(image_path))[0]
348
 
349
+ # Step 1: Preprocess image (with optional enhancement)
350
  t1 = time.time()
351
+ image = InferenceProcessor.preprocess_image(image_path, enhance=enhance)
352
  timing_breakdown['image_preprocessing'] = round(time.time() - t1, 3)
353
 
354
  # Step 2: YOLO Detection
utils/image_enhancer.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Image Enhancement Utility using Real-ESRGAN-ncnn-vulkan
3
+ Enhances blurry/low-quality images before VLM processing
4
+ """
5
+
6
+ import os
7
+ import subprocess
8
+ import tempfile
9
+ import zipfile
10
+ import shutil
11
+ from pathlib import Path
12
+ from PIL import Image
13
+ import urllib.request
14
+ import platform
15
+
16
+
17
class ImageEnhancer:
    """Wrap the Real-ESRGAN-ncnn-vulkan command-line tool for image upscaling.

    On construction the enhancer looks for an existing installation under
    ``<base_dir>/utils/realesrgan``. If none is found it attempts a download,
    which is only implemented for Windows. Every failure is reported with
    ``print`` and leaves ``is_available`` False, in which case the enhance
    methods return their input unchanged instead of raising.
    """

    # Release tag and Windows download URL (auto-download is Windows-only).
    # The archive name embeds the same tag, so it is derived from the constant.
    REALESRGAN_VERSION = "v0.2.0"
    REALESRGAN_WINDOWS_URL = (
        "https://github.com/xinntao/Real-ESRGAN-ncnn-vulkan/releases/download/"
        f"{REALESRGAN_VERSION}/realesrgan-ncnn-vulkan-{REALESRGAN_VERSION}-windows.zip"
    )

    def __init__(self, base_dir: str = None):
        """Initialize image enhancer

        Args:
            base_dir: Base directory for storing executable and models.
                Defaults to the parent of the directory containing this file.
        """
        if base_dir is None:
            base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

        self.base_dir = Path(base_dir)
        self.enhancer_dir = self.base_dir / "utils" / "realesrgan"
        self.executable_path = None
        self.models_path = None
        self.is_available = False

        # Locate (or download) the executable; never raises.
        self._setup_enhancer()

    @staticmethod
    def _remove_quietly(path) -> None:
        """Best-effort delete of *path*; a missing or locked file is ignored."""
        try:
            os.unlink(path)
        except OSError:
            # Cleanup must never mask the result of the operation it follows.
            pass

    def _setup_enhancer(self):
        """Setup Real-ESRGAN enhancer (download if needed).

        Sets ``is_available`` according to whether an installation was found.
        Any exception is caught and reported so construction cannot fail.
        """
        try:
            if self._check_existing_installation():
                print("✅ Real-ESRGAN enhancer already installed")
                self.is_available = True
                return

            print("📥 Downloading Real-ESRGAN enhancer...")
            self._download_and_extract()

            if self._check_existing_installation():
                print("✅ Real-ESRGAN enhancer installed successfully")
                self.is_available = True
            else:
                print("⚠️ Real-ESRGAN enhancer setup incomplete")
                self.is_available = False

        except Exception as e:
            print(f"⚠️ Failed to setup Real-ESRGAN enhancer: {str(e)}")
            print(" Image enhancement will be skipped")
            self.is_available = False

    def _check_existing_installation(self) -> bool:
        """Check if Real-ESRGAN is already installed.

        On success stores the executable in ``executable_path`` and, when a
        sibling ``models`` directory exists, stores it in ``models_path``.

        Returns:
            True when an executable file was found, False otherwise.
        """
        if not self.enhancer_dir.is_dir():
            return False

        exe_name = "realesrgan-ncnn-vulkan.exe" if platform.system() == "Windows" else "realesrgan-ncnn-vulkan"
        candidates = (
            self.enhancer_dir / exe_name,
            self.enhancer_dir / "realesrgan-ncnn-vulkan" / exe_name,
        )

        for candidate in candidates:
            # is_file(), not exists(): off Windows the first candidate has the
            # same name as the nested folder of the second, and a directory
            # must not be mistaken for the executable.
            if candidate.is_file():
                self.executable_path = candidate
                models_dir = candidate.parent / "models"
                self.models_path = models_dir if models_dir.is_dir() else None
                return True

        return False

    def _download_and_extract(self):
        """Download and extract Real-ESRGAN executable (Windows only).

        Failures are printed and swallowed; the caller re-checks the
        installation afterwards. The downloaded archive is removed whether or
        not extraction succeeded.
        """
        if platform.system() != "Windows":
            print("⚠️ Auto-download only supported on Windows. Please manually install Real-ESRGAN-ncnn-vulkan")
            return

        self.enhancer_dir.mkdir(parents=True, exist_ok=True)

        zip_path = self.enhancer_dir / "realesrgan.zip"
        print(f" Downloading from {self.REALESRGAN_WINDOWS_URL}...")

        try:
            # NOTE(review): the archive is fetched without checksum
            # verification; consider pinning a digest.
            try:
                urllib.request.urlretrieve(self.REALESRGAN_WINDOWS_URL, zip_path)
            except Exception as e:
                print(f" Download failed: {str(e)}")
                return

            print(" Extracting files...")
            try:
                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(self.enhancer_dir)
            except Exception as e:
                print(f" Extraction failed: {str(e)}")
                return
        finally:
            # Also covers partial downloads and corrupt archives.
            self._remove_quietly(zip_path)

        print(" Setup complete!")

    def enhance_image(self, image_path: str, scale: int = 2,
                      model_name: str = "realesrgan-x4plus",
                      timeout: float = 30) -> str:
        """Enhance image using Real-ESRGAN

        The result is written next to the input as
        ``<stem>_enhanced<suffix>``. Any file already at that path is treated
        as scratch output and is removed when enhancement fails.

        Args:
            image_path: Path to input image
            scale: Upscale ratio (2, 3, or 4)
            model_name: Model to use (realesrgan-x4plus, realesrgan-x4plus-anime, realesrnet-x4plus)
            timeout: Seconds to wait for the external process before giving up

        Returns:
            Path to enhanced image, or ``image_path`` unchanged when the
            enhancer is unavailable, fails, or times out.
        """
        if not self.is_available:
            print("⚠️ Enhancement not available, using original image")
            return image_path

        input_path = Path(image_path)
        output_path = input_path.parent / f"{input_path.stem}_enhanced{input_path.suffix}"

        cmd = [
            str(self.executable_path),
            "-i", str(image_path),
            "-o", str(output_path),
            "-n", model_name,
            "-s", str(scale),
            "-f", "png"  # Output format
        ]
        if self.models_path:
            cmd.extend(["-m", str(self.models_path)])

        succeeded = False
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
                creationflags=subprocess.CREATE_NO_WINDOW if platform.system() == "Windows" else 0
            )

            if result.returncode == 0 and output_path.exists():
                succeeded = True
                print(f"✨ Image enhanced successfully (scale={scale}x)")
                return str(output_path)

            if result.stderr:
                print(f"⚠️ Enhancement failed: {result.stderr}")
            print(" Using original image")
            return image_path

        except subprocess.TimeoutExpired:
            print("⚠️ Enhancement timeout, using original image")
            return image_path
        except Exception as e:
            print(f"⚠️ Enhancement error: {str(e)}, using original image")
            return image_path
        finally:
            # A killed or failed run can leave a truncated output file behind;
            # callers only clean up the path returned on success.
            if not succeeded:
                self._remove_quietly(output_path)

    def enhance_pil_image(self, pil_image: "Image.Image", scale: int = 2,
                          model_name: str = "realesrgan-x4plus") -> "Image.Image":
        """Enhance PIL Image object

        The image is round-tripped through temporary PNG files because the
        enhancer is an external executable. Both temporary files are removed
        before returning, including when saving or loading raises.

        Args:
            pil_image: PIL Image object
            scale: Upscale ratio (2, 3, or 4)
            model_name: Model to use

        Returns:
            Enhanced PIL Image object converted to RGB, or ``pil_image``
            itself when enhancement is unavailable or failed.
        """
        if not self.is_available:
            return pil_image

        # Reserve a unique path, then close the handle before writing to it.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_input:
            temp_input_path = temp_input.name

        try:
            pil_image.save(temp_input_path, "PNG")
            enhanced_path = self.enhance_image(temp_input_path, scale, model_name)

            # enhance_image returns its input path to signal "not enhanced".
            if enhanced_path == temp_input_path:
                return pil_image

            try:
                # convert() produces an in-memory copy, so the file handle can
                # be closed (and the file deleted) right after.
                with Image.open(enhanced_path) as enhanced_file:
                    return enhanced_file.convert("RGB")
            finally:
                self._remove_quietly(enhanced_path)
        finally:
            self._remove_quietly(temp_input_path)
223
+
224
+
225
# Module-wide ImageEnhancer, created the first time get_enhancer() is called
_enhancer_instance = None


def get_enhancer() -> ImageEnhancer:
    """Return the shared ImageEnhancer, constructing it on first use."""
    global _enhancer_instance
    if _enhancer_instance is not None:
        return _enhancer_instance
    _enhancer_instance = ImageEnhancer()
    return _enhancer_instance