Spaces:
Sleeping
Sleeping
Alfonso Velasco
committed on
Commit
·
259596e
1
Parent(s):
c9e5fd6
fix chunk
Browse files
app.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
from fastapi import FastAPI, HTTPException
|
| 2 |
from pydantic import BaseModel
|
| 3 |
-
from typing import Dict, Any, List
|
| 4 |
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
|
| 5 |
import torch
|
| 6 |
from PIL import Image
|
|
@@ -9,6 +9,7 @@ import base64
|
|
| 9 |
import fitz # PyMuPDF
|
| 10 |
import tempfile
|
| 11 |
import os
|
|
|
|
| 12 |
|
| 13 |
# Fix the OMP_NUM_THREADS issue
|
| 14 |
os.environ['OMP_NUM_THREADS'] = '1'
|
|
@@ -73,7 +74,7 @@ async def extract_document(request: DocumentRequest):
|
|
| 73 |
print(f"Error in extract_document: {error_details}")
|
| 74 |
raise HTTPException(status_code=500, detail=str(e))
|
| 75 |
|
| 76 |
-
def process_image_chunk(image: Image.Image) -> List[Dict]:
|
| 77 |
"""
|
| 78 |
Process a single image or image chunk and return extractions with coordinates
|
| 79 |
relative to the chunk (0,0 at top-left of chunk).
|
|
@@ -90,7 +91,7 @@ def process_image_chunk(image: Image.Image) -> List[Dict]:
|
|
| 90 |
image,
|
| 91 |
truncation=True,
|
| 92 |
padding="max_length",
|
| 93 |
-
max_length=
|
| 94 |
return_tensors="pt"
|
| 95 |
)
|
| 96 |
except Exception as e:
|
|
@@ -98,11 +99,11 @@ def process_image_chunk(image: Image.Image) -> List[Dict]:
|
|
| 98 |
try:
|
| 99 |
encoding = processor(
|
| 100 |
image,
|
| 101 |
-
text=[""] *
|
| 102 |
-
boxes=[[0, 0, 0, 0]] *
|
| 103 |
truncation=True,
|
| 104 |
padding="max_length",
|
| 105 |
-
max_length=
|
| 106 |
return_tensors="pt"
|
| 107 |
)
|
| 108 |
except Exception as e2:
|
|
@@ -198,6 +199,79 @@ def process_image_chunk(image: Image.Image) -> List[Dict]:
|
|
| 198 |
|
| 199 |
return results
|
| 200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
def process_pdf(pdf_bytes, split_wide: bool = True):
|
| 202 |
"""Process PDF document, optionally splitting wide pages into chunks"""
|
| 203 |
all_results = []
|
|
@@ -212,102 +286,86 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
|
|
| 212 |
os.unlink(tmp_file.name)
|
| 213 |
raise HTTPException(status_code=400, detail=f"Failed to open PDF: {str(e)}")
|
| 214 |
|
|
|
|
| 215 |
RENDER_SCALE = 2.0
|
| 216 |
-
MAX_WIDTH =
|
| 217 |
-
|
| 218 |
|
| 219 |
for page_num in range(len(pdf_document)):
|
| 220 |
try:
|
| 221 |
page = pdf_document[page_num]
|
| 222 |
page_rect = page.rect
|
| 223 |
-
page_width = page_rect.width
|
| 224 |
-
page_height = page_rect.height
|
| 225 |
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
|
| 230 |
pix = page.get_pixmap(matrix=mat)
|
| 231 |
img_data = pix.tobytes("png")
|
| 232 |
full_image = Image.open(io.BytesIO(img_data)).convert("RGB")
|
| 233 |
-
|
|
|
|
|
|
|
| 234 |
|
| 235 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
page_results = []
|
| 238 |
|
| 239 |
-
#
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
step_size = MAX_WIDTH // 2 # Fallback
|
| 247 |
-
|
| 248 |
-
num_chunks = max(1, ((img_width - OVERLAP) + step_size - 1) // step_size)
|
| 249 |
|
| 250 |
-
|
|
|
|
| 251 |
|
| 252 |
-
for chunk_idx in
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
end_x = min(start_x + MAX_WIDTH, img_width)
|
| 256 |
-
|
| 257 |
-
# Ensure chunk has valid dimensions
|
| 258 |
-
if end_x <= start_x:
|
| 259 |
-
print(f" Skipping invalid chunk {chunk_idx + 1}: start_x={start_x}, end_x={end_x}")
|
| 260 |
-
continue
|
| 261 |
-
|
| 262 |
-
chunk_actual_width = end_x - start_x
|
| 263 |
-
|
| 264 |
-
# Skip chunks that are too narrow
|
| 265 |
-
if chunk_actual_width < 100:
|
| 266 |
-
print(f" Skipping narrow chunk {chunk_idx + 1}: width={chunk_actual_width}")
|
| 267 |
-
continue
|
| 268 |
|
| 269 |
-
|
|
|
|
|
|
|
| 270 |
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
|
| 275 |
-
#
|
| 276 |
-
|
| 277 |
-
print(f" Chunk actual size: {verify_width}x{verify_height}")
|
| 278 |
|
| 279 |
-
#
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
bbox = result['bbox']
|
| 286 |
-
|
| 287 |
-
# Add chunk offset (in rendered image pixels)
|
| 288 |
-
bbox['x'] += start_x
|
| 289 |
-
# y stays the same (no vertical splitting)
|
| 290 |
-
|
| 291 |
-
# Now scale from rendered image pixels to PDF points
|
| 292 |
-
bbox['x'] = bbox['x'] / RENDER_SCALE
|
| 293 |
-
bbox['y'] = bbox['y'] / RENDER_SCALE
|
| 294 |
-
bbox['width'] = bbox['width'] / RENDER_SCALE
|
| 295 |
-
bbox['height'] = bbox['height'] / RENDER_SCALE
|
| 296 |
-
|
| 297 |
-
page_results.extend(chunk_results)
|
| 298 |
-
|
| 299 |
-
except Exception as e:
|
| 300 |
-
print(f" Error processing chunk {chunk_idx + 1}: {e}")
|
| 301 |
-
import traceback
|
| 302 |
-
traceback.print_exc()
|
| 303 |
-
continue
|
| 304 |
|
| 305 |
-
|
| 306 |
|
| 307 |
else:
|
| 308 |
-
# Process full page
|
| 309 |
-
print("Processing full page without splitting")
|
| 310 |
-
chunk_results = process_image_chunk(full_image)
|
| 311 |
|
| 312 |
# Scale coordinates from rendered image pixels to PDF points
|
| 313 |
for result in chunk_results:
|
|
@@ -318,35 +376,24 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
|
|
| 318 |
bbox['height'] = bbox['height'] / RENDER_SCALE
|
| 319 |
|
| 320 |
page_results = chunk_results
|
|
|
|
| 321 |
|
| 322 |
-
#
|
| 323 |
-
unique_results =
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
DEDUP_TOLERANCE = 5 # pixels tolerance for deduplication
|
| 327 |
-
|
| 328 |
-
for result in page_results:
|
| 329 |
-
bbox = result['bbox']
|
| 330 |
-
box_tuple = (
|
| 331 |
-
round(bbox['x'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE,
|
| 332 |
-
round(bbox['y'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE,
|
| 333 |
-
round(bbox['width'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE,
|
| 334 |
-
round(bbox['height'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE
|
| 335 |
-
)
|
| 336 |
-
|
| 337 |
-
if box_tuple not in seen_boxes:
|
| 338 |
-
seen_boxes.add(box_tuple)
|
| 339 |
-
unique_results.append(result)
|
| 340 |
-
|
| 341 |
-
print(f" After deduplication: {len(unique_results)} unique extractions")
|
| 342 |
|
|
|
|
| 343 |
all_results.append({
|
| 344 |
"page": page_num + 1,
|
| 345 |
"page_dimensions": {
|
| 346 |
-
"width":
|
| 347 |
-
"height":
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
},
|
| 349 |
-
"rotation":
|
| 350 |
"extractions": unique_results
|
| 351 |
})
|
| 352 |
|
|
@@ -358,6 +405,7 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
|
|
| 358 |
all_results.append({
|
| 359 |
"page": page_num + 1,
|
| 360 |
"page_dimensions": {"width": 0, "height": 0},
|
|
|
|
| 361 |
"rotation": 0,
|
| 362 |
"extractions": [],
|
| 363 |
"error": str(e)
|
|
@@ -372,15 +420,87 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
|
|
| 372 |
"pages": all_results
|
| 373 |
}
|
| 374 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
def process_image(image_bytes):
|
| 376 |
"""Process single image"""
|
| 377 |
image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
|
| 378 |
img_width, img_height = image.size
|
| 379 |
|
| 380 |
-
|
| 381 |
-
results = process_image_chunk(image)
|
| 382 |
|
| 383 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
|
| 385 |
return {
|
| 386 |
"document_type": "image",
|
|
@@ -389,4 +509,8 @@ def process_image(image_bytes):
|
|
| 389 |
"height": img_height
|
| 390 |
},
|
| 391 |
"extractions": results
|
| 392 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from fastapi import FastAPI, HTTPException
|
| 2 |
from pydantic import BaseModel
|
| 3 |
+
from typing import Dict, Any, List, Tuple, Optional
|
| 4 |
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
|
| 5 |
import torch
|
| 6 |
from PIL import Image
|
|
|
|
| 9 |
import fitz # PyMuPDF
|
| 10 |
import tempfile
|
| 11 |
import os
|
| 12 |
+
import math
|
| 13 |
|
| 14 |
# Fix the OMP_NUM_THREADS issue
|
| 15 |
os.environ['OMP_NUM_THREADS'] = '1'
|
|
|
|
| 74 |
print(f"Error in extract_document: {error_details}")
|
| 75 |
raise HTTPException(status_code=500, detail=str(e))
|
| 76 |
|
| 77 |
+
def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]:
|
| 78 |
"""
|
| 79 |
Process a single image or image chunk and return extractions with coordinates
|
| 80 |
relative to the chunk (0,0 at top-left of chunk).
|
|
|
|
| 91 |
image,
|
| 92 |
truncation=True,
|
| 93 |
padding="max_length",
|
| 94 |
+
max_length=max_tokens,
|
| 95 |
return_tensors="pt"
|
| 96 |
)
|
| 97 |
except Exception as e:
|
|
|
|
| 99 |
try:
|
| 100 |
encoding = processor(
|
| 101 |
image,
|
| 102 |
+
text=[""] * max_tokens,
|
| 103 |
+
boxes=[[0, 0, 0, 0]] * max_tokens,
|
| 104 |
truncation=True,
|
| 105 |
padding="max_length",
|
| 106 |
+
max_length=max_tokens,
|
| 107 |
return_tensors="pt"
|
| 108 |
)
|
| 109 |
except Exception as e2:
|
|
|
|
| 199 |
|
| 200 |
return results
|
| 201 |
|
| 202 |
+
def should_split_page(rendered_width: int, rendered_height: int,
                      original_rotation: int, max_width: int) -> Tuple[bool, str]:
    """
    Decide whether a rendered page image should be split into chunks.

    Args:
        rendered_width: Width of the rendered page image, in pixels.
        rendered_height: Height of the rendered page image, in pixels.
        original_rotation: Page rotation in degrees. Kept for interface
            compatibility; the renderer has already applied the rotation,
            so only the rendered dimensions are inspected here.
        max_width: Maximum chunk width (pixels) the model can handle.

    Returns:
        Tuple ``(should_split, split_direction)`` where *split_direction*
        is ``"horizontal"`` or ``"none"``.
    """
    # Guard against a degenerate zero-height image.
    aspect_ratio = rendered_width / rendered_height if rendered_height > 0 else 1

    # Portrait or square-ish pages never need splitting.
    if aspect_ratio <= 1.3:
        return False, "none"

    # Any landscape page wider than the model limit gets split horizontally.
    # (A previous revision branched on aspect_ratio > 1.8 here, but both
    # branches returned the same value, so the extra branch was dead code.)
    if rendered_width > max_width:
        return True, "horizontal"

    return False, "none"
|
| 227 |
+
|
| 228 |
+
def split_image_intelligently(image: "Image.Image", max_width: int, overlap_ratio: float = 0.15) -> "List[Tuple[Image.Image, int]]":
    """
    Split an image into overlapping horizontal chunks.

    Args:
        image: Source image (only ``.size`` and ``.crop`` are used).
        max_width: Maximum width of a single chunk, in pixels.
        overlap_ratio: Fraction of ``max_width`` that adjacent chunks share,
            so text straddling a cut point appears fully in one chunk.

    Returns:
        List of ``(chunk_image, x_offset)`` tuples; *x_offset* is the
        chunk's left edge within the source image, in pixels.
    """
    img_width, img_height = image.size
    chunks = []

    # Overlap and effective step size in pixels.
    overlap_pixels = int(max_width * overlap_ratio)
    step_size = max_width - overlap_pixels
    if step_size <= 0:
        step_size = max_width // 2  # Fallback for pathological overlap_ratio >= 1

    # At least one chunk, even when img_width <= overlap_pixels (a previous
    # revision could compute num_chunks <= 0 here and silently drop the page).
    num_chunks = max(1, math.ceil((img_width - overlap_pixels) / step_size))

    # If we'd only need 2 chunks and the second would be very small, just use
    # two equal chunks instead.
    if num_chunks == 2:
        second_chunk_width = img_width - step_size
        if second_chunk_width < max_width * 0.6:  # Second chunk under 60% of max
            # Split into two equal chunks with overlap.
            chunk_width = (img_width + overlap_pixels) // 2
            chunks.append((image.crop((0, 0, chunk_width, img_height)), 0))
            chunks.append((image.crop((img_width - chunk_width, 0, img_width, img_height)),
                           img_width - chunk_width))
            return chunks

    # Standard overlapping chunks.
    for i in range(num_chunks):
        start_x = i * step_size
        end_x = min(start_x + max_width, img_width)

        if end_x - start_x < max_width * 0.3:
            # Narrow tail: instead of dropping it (which loses any pixels the
            # previous chunk's overlap does not reach), re-anchor the chunk so
            # it ends at the image edge with as much width as available.
            start_x = max(0, end_x - max_width)
            if chunks and chunks[-1][1] == start_x:
                # Would duplicate the previous chunk exactly; nothing new.
                continue

        chunk = image.crop((start_x, 0, end_x, img_height))
        chunks.append((chunk, start_x))

        print(f"  Chunk {i+1}/{num_chunks}: x={start_x}-{end_x} (width={end_x-start_x})")

    return chunks
|
| 274 |
+
|
| 275 |
def process_pdf(pdf_bytes, split_wide: bool = True):
|
| 276 |
"""Process PDF document, optionally splitting wide pages into chunks"""
|
| 277 |
all_results = []
|
|
|
|
| 286 |
os.unlink(tmp_file.name)
|
| 287 |
raise HTTPException(status_code=400, detail=f"Failed to open PDF: {str(e)}")
|
| 288 |
|
| 289 |
+
# Configuration
|
| 290 |
RENDER_SCALE = 2.0
|
| 291 |
+
MAX_WIDTH = 2000 # Increased for better quality
|
| 292 |
+
MAX_TOKENS = 768 # Increased token limit for complex documents
|
| 293 |
|
| 294 |
for page_num in range(len(pdf_document)):
|
| 295 |
try:
|
| 296 |
page = pdf_document[page_num]
|
| 297 |
page_rect = page.rect
|
|
|
|
|
|
|
| 298 |
|
| 299 |
+
# Original page dimensions before any rotation
|
| 300 |
+
original_width = page_rect.width
|
| 301 |
+
original_height = page_rect.height
|
| 302 |
+
original_rotation = page.rotation
|
| 303 |
|
| 304 |
+
print(f"\nPage {page_num + 1}:")
|
| 305 |
+
print(f" Original dimensions: {original_width}x{original_height}")
|
| 306 |
+
print(f" Rotation: {original_rotation}°")
|
| 307 |
+
|
| 308 |
+
# Render page - PyMuPDF automatically applies rotation
|
| 309 |
mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
|
| 310 |
pix = page.get_pixmap(matrix=mat)
|
| 311 |
img_data = pix.tobytes("png")
|
| 312 |
full_image = Image.open(io.BytesIO(img_data)).convert("RGB")
|
| 313 |
+
rendered_width, rendered_height = full_image.size
|
| 314 |
+
|
| 315 |
+
print(f" Rendered dimensions: {rendered_width}x{rendered_height}")
|
| 316 |
|
| 317 |
+
# Determine effective dimensions after rotation for coordinate mapping
|
| 318 |
+
if original_rotation in [90, 270]:
|
| 319 |
+
# Page has been rotated, so effective dimensions are swapped
|
| 320 |
+
effective_pdf_width = original_height
|
| 321 |
+
effective_pdf_height = original_width
|
| 322 |
+
else:
|
| 323 |
+
effective_pdf_width = original_width
|
| 324 |
+
effective_pdf_height = original_height
|
| 325 |
+
|
| 326 |
+
print(f" Effective PDF dimensions: {effective_pdf_width}x{effective_pdf_height}")
|
| 327 |
|
| 328 |
page_results = []
|
| 329 |
|
| 330 |
+
# Decide if we need to split
|
| 331 |
+
should_split, split_direction = should_split_page(
|
| 332 |
+
rendered_width, rendered_height, original_rotation, MAX_WIDTH
|
| 333 |
+
)
|
| 334 |
+
|
| 335 |
+
if split_wide and should_split:
|
| 336 |
+
print(f" Splitting page ({split_direction})...")
|
|
|
|
|
|
|
|
|
|
| 337 |
|
| 338 |
+
chunks = split_image_intelligently(full_image, MAX_WIDTH, overlap_ratio=0.2)
|
| 339 |
+
print(f" Created {len(chunks)} chunks")
|
| 340 |
|
| 341 |
+
for chunk_idx, (chunk_image, x_offset) in enumerate(chunks):
|
| 342 |
+
chunk_width, chunk_height = chunk_image.size
|
| 343 |
+
print(f" Processing chunk {chunk_idx + 1}: offset={x_offset}, size={chunk_width}x{chunk_height}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
|
| 345 |
+
# Process chunk with increased token limit
|
| 346 |
+
chunk_results = process_image_chunk(chunk_image, max_tokens=MAX_TOKENS)
|
| 347 |
+
print(f" Extracted {len(chunk_results)} items")
|
| 348 |
|
| 349 |
+
# Transform chunk-relative coordinates to full page coordinates
|
| 350 |
+
for result in chunk_results:
|
| 351 |
+
bbox = result['bbox']
|
| 352 |
|
| 353 |
+
# Add chunk offset (in rendered image pixels)
|
| 354 |
+
bbox['x'] += x_offset
|
|
|
|
| 355 |
|
| 356 |
+
# Scale from rendered image pixels to PDF points
|
| 357 |
+
# Use effective dimensions for proper scaling
|
| 358 |
+
bbox['x'] = bbox['x'] / RENDER_SCALE
|
| 359 |
+
bbox['y'] = bbox['y'] / RENDER_SCALE
|
| 360 |
+
bbox['width'] = bbox['width'] / RENDER_SCALE
|
| 361 |
+
bbox['height'] = bbox['height'] / RENDER_SCALE
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 362 |
|
| 363 |
+
page_results.extend(chunk_results)
|
| 364 |
|
| 365 |
else:
|
| 366 |
+
# Process full page without splitting
|
| 367 |
+
print(" Processing full page without splitting...")
|
| 368 |
+
chunk_results = process_image_chunk(full_image, max_tokens=MAX_TOKENS)
|
| 369 |
|
| 370 |
# Scale coordinates from rendered image pixels to PDF points
|
| 371 |
for result in chunk_results:
|
|
|
|
| 376 |
bbox['height'] = bbox['height'] / RENDER_SCALE
|
| 377 |
|
| 378 |
page_results = chunk_results
|
| 379 |
+
print(f" Extracted {len(chunk_results)} items")
|
| 380 |
|
| 381 |
+
# Enhanced deduplication with spatial clustering
|
| 382 |
+
unique_results = deduplicate_results(page_results)
|
| 383 |
+
print(f" After deduplication: {len(unique_results)} unique items")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
|
| 385 |
+
# Return results with both original and effective dimensions
|
| 386 |
all_results.append({
|
| 387 |
"page": page_num + 1,
|
| 388 |
"page_dimensions": {
|
| 389 |
+
"width": original_width,
|
| 390 |
+
"height": original_height
|
| 391 |
+
},
|
| 392 |
+
"effective_dimensions": {
|
| 393 |
+
"width": effective_pdf_width,
|
| 394 |
+
"height": effective_pdf_height
|
| 395 |
},
|
| 396 |
+
"rotation": original_rotation,
|
| 397 |
"extractions": unique_results
|
| 398 |
})
|
| 399 |
|
|
|
|
| 405 |
all_results.append({
|
| 406 |
"page": page_num + 1,
|
| 407 |
"page_dimensions": {"width": 0, "height": 0},
|
| 408 |
+
"effective_dimensions": {"width": 0, "height": 0},
|
| 409 |
"rotation": 0,
|
| 410 |
"extractions": [],
|
| 411 |
"error": str(e)
|
|
|
|
| 420 |
"pages": all_results
|
| 421 |
}
|
| 422 |
|
| 423 |
+
def deduplicate_results(results: List[Dict], tolerance: float = 10.0) -> List[Dict]:
    """
    Collapse near-duplicate extractions produced by overlapping chunks.

    Two boxes are treated as duplicates when their centers lie within
    ``tolerance`` PDF points of each other and their widths and heights
    agree within roughly +/-30%. From each duplicate cluster, the
    extraction with the longest text is kept.
    """
    if not results:
        return []

    def _center(box: Dict):
        # Geometric center of a bbox dict.
        return box['x'] + box['width'] / 2, box['y'] + box['height'] / 2

    kept: List[Dict] = []
    consumed = set()  # indices already absorbed into some cluster

    for idx, candidate in enumerate(results):
        if idx in consumed:
            continue

        ref_box = candidate['bbox']
        ref_cx, ref_cy = _center(ref_box)

        cluster = [candidate]
        member_indices = {idx}

        # Scan only the later, not-yet-consumed entries for duplicates.
        for later_idx in range(idx + 1, len(results)):
            if later_idx in consumed:
                continue

            other_box = results[later_idx]['bbox']
            other_cx, other_cy = _center(other_box)

            # Euclidean distance between box centers.
            dist = math.sqrt((ref_cx - other_cx) ** 2 + (ref_cy - other_cy) ** 2)
            if dist >= tolerance:
                continue

            # Sizes must roughly agree before we call it a duplicate.
            w_ratio = ref_box['width'] / other_box['width'] if other_box['width'] > 0 else 1
            h_ratio = ref_box['height'] / other_box['height'] if other_box['height'] > 0 else 1
            if 0.7 < w_ratio < 1.3 and 0.7 < h_ratio < 1.3:
                cluster.append(results[later_idx])
                member_indices.add(later_idx)

        # Keep the richest (longest-text) member of the cluster.
        kept.append(max(cluster, key=lambda entry: len(entry.get('text', ''))))
        consumed.update(member_indices)

    return kept
|
| 472 |
+
|
| 473 |
def process_image(image_bytes):
|
| 474 |
"""Process single image"""
|
| 475 |
image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
|
| 476 |
img_width, img_height = image.size
|
| 477 |
|
| 478 |
+
print(f"Processing single image: {img_width}x{img_height}")
|
|
|
|
| 479 |
|
| 480 |
+
# Check if image should be split
|
| 481 |
+
should_split, _ = should_split_page(img_width, img_height, 0, 2000)
|
| 482 |
+
|
| 483 |
+
if should_split:
|
| 484 |
+
print(" Image is wide, splitting into chunks...")
|
| 485 |
+
chunks = split_image_intelligently(image, 2000, overlap_ratio=0.2)
|
| 486 |
+
|
| 487 |
+
all_results = []
|
| 488 |
+
for chunk_idx, (chunk_image, x_offset) in enumerate(chunks):
|
| 489 |
+
chunk_results = process_image_chunk(chunk_image, max_tokens=768)
|
| 490 |
+
|
| 491 |
+
# Adjust coordinates for chunk offset
|
| 492 |
+
for result in chunk_results:
|
| 493 |
+
result['bbox']['x'] += x_offset
|
| 494 |
+
|
| 495 |
+
all_results.extend(chunk_results)
|
| 496 |
+
|
| 497 |
+
# Deduplicate
|
| 498 |
+
results = deduplicate_results(all_results)
|
| 499 |
+
else:
|
| 500 |
+
# Process the image as-is
|
| 501 |
+
results = process_image_chunk(image, max_tokens=768)
|
| 502 |
+
|
| 503 |
+
print(f" Total extractions: {len(results)}")
|
| 504 |
|
| 505 |
return {
|
| 506 |
"document_type": "image",
|
|
|
|
| 509 |
"height": img_height
|
| 510 |
},
|
| 511 |
"extractions": results
|
| 512 |
+
}
|
| 513 |
+
|
| 514 |
+
# Local development entry point: serve the FastAPI app with uvicorn.
# Port 7860 is the port Hugging Face Spaces expects the app to listen on.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
|