Spaces:
Sleeping
Sleeping
Alfonso Velasco
committed on
Commit
·
50304f8
1
Parent(s):
0f430a1
fix chunk
Browse files
app.py
CHANGED
|
@@ -11,7 +11,6 @@ import tempfile
|
|
| 11 |
import os
|
| 12 |
import math
|
| 13 |
|
| 14 |
-
# Fix the OMP_NUM_THREADS issue
|
| 15 |
os.environ['OMP_NUM_THREADS'] = '1'
|
| 16 |
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
|
| 17 |
|
|
@@ -72,10 +71,7 @@ async def extract_document(request: DocumentRequest):
|
|
| 72 |
raise HTTPException(status_code=500, detail=str(e))
|
| 73 |
|
| 74 |
def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]:
|
| 75 |
-
"""
|
| 76 |
-
Process a single image or image chunk and return extractions with coordinates
|
| 77 |
-
relative to the chunk (0,0 at top-left of chunk).
|
| 78 |
-
"""
|
| 79 |
img_width, img_height = image.size
|
| 80 |
|
| 81 |
if img_width < 1 or img_height < 1:
|
|
@@ -121,7 +117,6 @@ def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]
|
|
| 121 |
except RuntimeError as e:
|
| 122 |
if "CUDA" in str(e):
|
| 123 |
print(f"CUDA error encountered: {e}")
|
| 124 |
-
print("Falling back to CPU...")
|
| 125 |
encoding = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in encoding.items()}
|
| 126 |
model.cpu()
|
| 127 |
with torch.no_grad():
|
|
@@ -149,15 +144,12 @@ def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]
|
|
| 149 |
for idx, (token, box) in enumerate(zip(tokens, boxes)):
|
| 150 |
try:
|
| 151 |
if token not in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>', '<pad>']:
|
| 152 |
-
x_norm = box
|
| 153 |
-
y_norm = box[1]
|
| 154 |
-
x2_norm = box[2]
|
| 155 |
-
y2_norm = box[3]
|
| 156 |
|
| 157 |
if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
|
| 158 |
continue
|
| 159 |
|
| 160 |
-
# Convert normalized coordinates to
|
| 161 |
x = (x_norm / 1000.0) * img_width
|
| 162 |
y = (y_norm / 1000.0) * img_height
|
| 163 |
x2 = (x2_norm / 1000.0) * img_width
|
|
@@ -191,26 +183,15 @@ def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]
|
|
| 191 |
|
| 192 |
return results
|
| 193 |
|
| 194 |
-
def should_split_page(rendered_width: int, rendered_height: int,
|
| 195 |
-
|
| 196 |
-
"""
|
| 197 |
-
Determine if a page should be split and in which direction.
|
| 198 |
-
For rotated pages, we check against the RENDERED dimensions.
|
| 199 |
-
"""
|
| 200 |
if rendered_width > max_width:
|
| 201 |
return (True, "horizontal")
|
| 202 |
-
|
| 203 |
return (False, None)
|
| 204 |
|
| 205 |
def split_image_intelligently(image: Image.Image, max_width: int,
|
| 206 |
overlap_ratio: float = 0.1) -> List[Tuple[Image.Image, int]]:
|
| 207 |
-
"""
|
| 208 |
-
Split image into overlapping chunks along the width.
|
| 209 |
-
|
| 210 |
-
Returns:
|
| 211 |
-
List of (chunk_image, x_offset) tuples where x_offset is the pixel position
|
| 212 |
-
in the RENDERED image where this chunk starts.
|
| 213 |
-
"""
|
| 214 |
img_width, img_height = image.size
|
| 215 |
|
| 216 |
if img_width <= max_width:
|
|
@@ -242,13 +223,11 @@ def split_image_intelligently(image: Image.Image, max_width: int,
|
|
| 242 |
|
| 243 |
def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
|
| 244 |
"""
|
| 245 |
-
Process PDF
|
| 246 |
|
| 247 |
-
KEY
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
- We split based on rendered dimensions
|
| 251 |
-
- Coordinates in results should be in the EFFECTIVE coordinate space
|
| 252 |
"""
|
| 253 |
RENDER_SCALE = 3.0
|
| 254 |
MAX_WIDTH = 2000 # Maximum width for a chunk in rendered pixels
|
|
@@ -276,7 +255,7 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
|
|
| 276 |
print(f" Original dimensions: {original_width}x{original_height}")
|
| 277 |
print(f" Rotation: {original_rotation}°")
|
| 278 |
|
| 279 |
-
#
|
| 280 |
if original_rotation in [90, 270]:
|
| 281 |
effective_pdf_width = original_height
|
| 282 |
effective_pdf_height = original_width
|
|
@@ -286,25 +265,41 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
|
|
| 286 |
|
| 287 |
print(f" Effective PDF dimensions (after rotation): {effective_pdf_width}x{effective_pdf_height}")
|
| 288 |
|
| 289 |
-
# Render page - PyMuPDF
|
| 290 |
mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
|
| 291 |
pix = page.get_pixmap(matrix=mat)
|
| 292 |
img_data = pix.tobytes("png")
|
| 293 |
full_image = Image.open(io.BytesIO(img_data)).convert("RGB")
|
| 294 |
rendered_width, rendered_height = full_image.size
|
| 295 |
|
| 296 |
-
print(f"
|
| 297 |
|
| 298 |
-
#
|
| 299 |
expected_rendered_width = effective_pdf_width * RENDER_SCALE
|
| 300 |
expected_rendered_height = effective_pdf_height * RENDER_SCALE
|
| 301 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
|
| 303 |
page_results = []
|
| 304 |
|
| 305 |
-
# Decide if we need to split
|
| 306 |
should_split_decision, split_direction = should_split_page(
|
| 307 |
-
rendered_width, rendered_height,
|
| 308 |
)
|
| 309 |
|
| 310 |
if split_wide and should_split_decision:
|
|
@@ -320,28 +315,26 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
|
|
| 320 |
chunk_results = process_image_chunk(chunk_image, max_tokens=MAX_TOKENS)
|
| 321 |
print(f" Extracted {len(chunk_results)} items from chunk {chunk_idx + 1}")
|
| 322 |
|
| 323 |
-
if chunk_results:
|
| 324 |
print(f" Sample items from chunk {chunk_idx + 1}:")
|
| 325 |
for i, item in enumerate(chunk_results[:3]):
|
| 326 |
print(f" Item {i+1}: text='{item['text']}', chunk_x={item['bbox']['x']:.1f}px")
|
| 327 |
|
| 328 |
-
# Transform coordinates
|
| 329 |
-
# 1. Add x_offset to move from chunk coordinates to full rendered image coordinates
|
| 330 |
-
# 2. Divide by RENDER_SCALE to convert to PDF points in effective coordinate space
|
| 331 |
for result in chunk_results:
|
| 332 |
bbox = result['bbox']
|
| 333 |
|
| 334 |
-
# Step 1: Chunk coordinates ->
|
| 335 |
-
|
| 336 |
-
|
| 337 |
|
| 338 |
-
# Step 2: Rendered coordinates -> PDF points
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
|
|
|
| 343 |
|
| 344 |
-
# Update bbox with PDF coordinates
|
| 345 |
bbox['x'] = pdf_x
|
| 346 |
bbox['y'] = pdf_y
|
| 347 |
bbox['width'] = pdf_width
|
|
@@ -349,7 +342,7 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
|
|
| 349 |
|
| 350 |
# Debug first item
|
| 351 |
if result == chunk_results[0]:
|
| 352 |
-
print(f"
|
| 353 |
|
| 354 |
page_results.extend(chunk_results)
|
| 355 |
|
|
@@ -360,13 +353,12 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
|
|
| 360 |
print(" Processing full page without splitting...")
|
| 361 |
chunk_results = process_image_chunk(full_image, max_tokens=MAX_TOKENS)
|
| 362 |
|
| 363 |
-
# Scale coordinates from rendered image pixels to PDF points
|
| 364 |
for result in chunk_results:
|
| 365 |
bbox = result['bbox']
|
| 366 |
-
bbox['x'] = bbox['x'] / RENDER_SCALE
|
| 367 |
-
bbox['y'] = bbox['y'] / RENDER_SCALE
|
| 368 |
-
bbox['width'] = bbox['width'] / RENDER_SCALE
|
| 369 |
-
bbox['height'] = bbox['height'] / RENDER_SCALE
|
| 370 |
|
| 371 |
page_results = chunk_results
|
| 372 |
print(f" Extracted {len(chunk_results)} items")
|
|
@@ -383,13 +375,13 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
|
|
| 383 |
print(f" X: {min(x_coords):.1f} to {max(x_coords):.1f} (effective width: {effective_pdf_width:.1f})")
|
| 384 |
print(f" Y: {min(y_coords):.1f} to {max(y_coords):.1f} (effective height: {effective_pdf_height:.1f})")
|
| 385 |
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
|
|
|
| 391 |
|
| 392 |
-
# Return results with proper dimensions
|
| 393 |
all_results.append({
|
| 394 |
"page": page_num + 1,
|
| 395 |
"page_dimensions": {
|
|
@@ -427,10 +419,7 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
|
|
| 427 |
}
|
| 428 |
|
| 429 |
def deduplicate_results(results: List[Dict], tolerance: float = 10.0) -> List[Dict]:
|
| 430 |
-
"""
|
| 431 |
-
Remove duplicate extractions using spatial clustering.
|
| 432 |
-
Tolerance is in PDF points.
|
| 433 |
-
"""
|
| 434 |
if not results:
|
| 435 |
return []
|
| 436 |
|
|
@@ -479,7 +468,7 @@ def process_image(image_bytes):
|
|
| 479 |
|
| 480 |
print(f"Processing single image: {img_width}x{img_height}")
|
| 481 |
|
| 482 |
-
should_split_decision, _ = should_split_page(img_width, img_height,
|
| 483 |
|
| 484 |
if should_split_decision:
|
| 485 |
print(" Image is wide, splitting into chunks...")
|
|
|
|
| 11 |
import os
|
| 12 |
import math
|
| 13 |
|
|
|
|
| 14 |
os.environ['OMP_NUM_THREADS'] = '1'
|
| 15 |
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
|
| 16 |
|
|
|
|
| 71 |
raise HTTPException(status_code=500, detail=str(e))
|
| 72 |
|
| 73 |
def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]:
|
| 74 |
+
"""Process a single image chunk and return extractions."""
|
|
|
|
|
|
|
|
|
|
| 75 |
img_width, img_height = image.size
|
| 76 |
|
| 77 |
if img_width < 1 or img_height < 1:
|
|
|
|
| 117 |
except RuntimeError as e:
|
| 118 |
if "CUDA" in str(e):
|
| 119 |
print(f"CUDA error encountered: {e}")
|
|
|
|
| 120 |
encoding = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in encoding.items()}
|
| 121 |
model.cpu()
|
| 122 |
with torch.no_grad():
|
|
|
|
| 144 |
for idx, (token, box) in enumerate(zip(tokens, boxes)):
|
| 145 |
try:
|
| 146 |
if token not in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>', '<pad>']:
|
| 147 |
+
x_norm, y_norm, x2_norm, y2_norm = box
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
|
| 150 |
continue
|
| 151 |
|
| 152 |
+
# Convert normalized coordinates to pixel coordinates
|
| 153 |
x = (x_norm / 1000.0) * img_width
|
| 154 |
y = (y_norm / 1000.0) * img_height
|
| 155 |
x2 = (x2_norm / 1000.0) * img_width
|
|
|
|
| 183 |
|
| 184 |
return results
|
| 185 |
|
| 186 |
+
def should_split_page(rendered_width: int, rendered_height: int, max_width: int) -> Tuple[bool, str]:
|
| 187 |
+
"""Determine if a page should be split based on rendered dimensions."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
if rendered_width > max_width:
|
| 189 |
return (True, "horizontal")
|
|
|
|
| 190 |
return (False, None)
|
| 191 |
|
| 192 |
def split_image_intelligently(image: Image.Image, max_width: int,
|
| 193 |
overlap_ratio: float = 0.1) -> List[Tuple[Image.Image, int]]:
|
| 194 |
+
"""Split image into overlapping chunks along the width."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
img_width, img_height = image.size
|
| 196 |
|
| 197 |
if img_width <= max_width:
|
|
|
|
| 223 |
|
| 224 |
def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
|
| 225 |
"""
|
| 226 |
+
Process PDF with proper handling of rotated pages.
|
| 227 |
|
| 228 |
+
KEY FIX: We now work with ACTUAL rendered dimensions instead of assuming
|
| 229 |
+
they match the effective dimensions. We map coordinates based on the
|
| 230 |
+
actual render, then transform them to the effective coordinate space.
|
|
|
|
|
|
|
| 231 |
"""
|
| 232 |
RENDER_SCALE = 3.0
|
| 233 |
MAX_WIDTH = 2000 # Maximum width for a chunk in rendered pixels
|
|
|
|
| 255 |
print(f" Original dimensions: {original_width}x{original_height}")
|
| 256 |
print(f" Rotation: {original_rotation}°")
|
| 257 |
|
| 258 |
+
# Determine effective dimensions (what the page looks like when properly oriented)
|
| 259 |
if original_rotation in [90, 270]:
|
| 260 |
effective_pdf_width = original_height
|
| 261 |
effective_pdf_height = original_width
|
|
|
|
| 265 |
|
| 266 |
print(f" Effective PDF dimensions (after rotation): {effective_pdf_width}x{effective_pdf_height}")
|
| 267 |
|
| 268 |
+
# Render the page - PyMuPDF may not rotate it as expected
|
| 269 |
mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
|
| 270 |
pix = page.get_pixmap(matrix=mat)
|
| 271 |
img_data = pix.tobytes("png")
|
| 272 |
full_image = Image.open(io.BytesIO(img_data)).convert("RGB")
|
| 273 |
rendered_width, rendered_height = full_image.size
|
| 274 |
|
| 275 |
+
print(f" Actual rendered dimensions: {rendered_width}x{rendered_height}")
|
| 276 |
|
| 277 |
+
# Detect if dimensions don't match expectations
|
| 278 |
expected_rendered_width = effective_pdf_width * RENDER_SCALE
|
| 279 |
expected_rendered_height = effective_pdf_height * RENDER_SCALE
|
| 280 |
+
|
| 281 |
+
dimensions_swapped = False
|
| 282 |
+
if (abs(rendered_width - expected_rendered_height) < 10 and
|
| 283 |
+
abs(rendered_height - expected_rendered_width) < 10):
|
| 284 |
+
print(f" ⚠️ Dimensions are swapped! Rotating image 90° to match expected orientation.")
|
| 285 |
+
# Rotate the image to match expected orientation
|
| 286 |
+
full_image = full_image.rotate(-90, expand=True)
|
| 287 |
+
rendered_width, rendered_height = full_image.size
|
| 288 |
+
print(f" After rotation: {rendered_width}x{rendered_height}")
|
| 289 |
+
dimensions_swapped = True
|
| 290 |
+
|
| 291 |
+
# Calculate the scale factor from rendered pixels to effective PDF points
|
| 292 |
+
# This handles any discrepancies between expected and actual rendering
|
| 293 |
+
scale_x = rendered_width / (effective_pdf_width * RENDER_SCALE)
|
| 294 |
+
scale_y = rendered_height / (effective_pdf_height * RENDER_SCALE)
|
| 295 |
+
|
| 296 |
+
print(f" Scale factors: x={scale_x:.4f}, y={scale_y:.4f}")
|
| 297 |
|
| 298 |
page_results = []
|
| 299 |
|
| 300 |
+
# Decide if we need to split
|
| 301 |
should_split_decision, split_direction = should_split_page(
|
| 302 |
+
rendered_width, rendered_height, MAX_WIDTH
|
| 303 |
)
|
| 304 |
|
| 305 |
if split_wide and should_split_decision:
|
|
|
|
| 315 |
chunk_results = process_image_chunk(chunk_image, max_tokens=MAX_TOKENS)
|
| 316 |
print(f" Extracted {len(chunk_results)} items from chunk {chunk_idx + 1}")
|
| 317 |
|
| 318 |
+
if chunk_results and chunk_idx < 2:
|
| 319 |
print(f" Sample items from chunk {chunk_idx + 1}:")
|
| 320 |
for i, item in enumerate(chunk_results[:3]):
|
| 321 |
print(f" Item {i+1}: text='{item['text']}', chunk_x={item['bbox']['x']:.1f}px")
|
| 322 |
|
| 323 |
+
# Transform coordinates from chunk space to PDF effective space
|
|
|
|
|
|
|
| 324 |
for result in chunk_results:
|
| 325 |
bbox = result['bbox']
|
| 326 |
|
| 327 |
+
# Step 1: Chunk coordinates -> Full rendered image coordinates
|
| 328 |
+
rendered_x = bbox['x'] + x_offset
|
| 329 |
+
rendered_y = bbox['y']
|
| 330 |
|
| 331 |
+
# Step 2: Rendered coordinates -> PDF points in effective space
|
| 332 |
+
# Account for the actual render scale and any dimension swapping
|
| 333 |
+
pdf_x = rendered_x / (RENDER_SCALE * scale_x)
|
| 334 |
+
pdf_y = rendered_y / (RENDER_SCALE * scale_y)
|
| 335 |
+
pdf_width = bbox['width'] / (RENDER_SCALE * scale_x)
|
| 336 |
+
pdf_height = bbox['height'] / (RENDER_SCALE * scale_y)
|
| 337 |
|
|
|
|
| 338 |
bbox['x'] = pdf_x
|
| 339 |
bbox['y'] = pdf_y
|
| 340 |
bbox['width'] = pdf_width
|
|
|
|
| 342 |
|
| 343 |
# Debug first item
|
| 344 |
if result == chunk_results[0]:
|
| 345 |
+
print(f" Transform: chunk_x={bbox['x'] - pdf_x + rendered_x - x_offset:.1f}px + offset={x_offset}px = rendered_x={rendered_x:.1f}px → pdf_x={pdf_x:.1f}pts")
|
| 346 |
|
| 347 |
page_results.extend(chunk_results)
|
| 348 |
|
|
|
|
| 353 |
print(" Processing full page without splitting...")
|
| 354 |
chunk_results = process_image_chunk(full_image, max_tokens=MAX_TOKENS)
|
| 355 |
|
|
|
|
| 356 |
for result in chunk_results:
|
| 357 |
bbox = result['bbox']
|
| 358 |
+
bbox['x'] = bbox['x'] / (RENDER_SCALE * scale_x)
|
| 359 |
+
bbox['y'] = bbox['y'] / (RENDER_SCALE * scale_y)
|
| 360 |
+
bbox['width'] = bbox['width'] / (RENDER_SCALE * scale_x)
|
| 361 |
+
bbox['height'] = bbox['height'] / (RENDER_SCALE * scale_y)
|
| 362 |
|
| 363 |
page_results = chunk_results
|
| 364 |
print(f" Extracted {len(chunk_results)} items")
|
|
|
|
| 375 |
print(f" X: {min(x_coords):.1f} to {max(x_coords):.1f} (effective width: {effective_pdf_width:.1f})")
|
| 376 |
print(f" Y: {min(y_coords):.1f} to {max(y_coords):.1f} (effective height: {effective_pdf_height:.1f})")
|
| 377 |
|
| 378 |
+
if max(x_coords) > effective_pdf_width + 10:
|
| 379 |
+
print(f" ⚠️ WARNING: Some X coordinates still exceed effective page width!")
|
| 380 |
+
elif max(x_coords) > effective_pdf_width:
|
| 381 |
+
print(f" ℹ️ Note: Max X slightly exceeds width (likely edge items), but within tolerance")
|
| 382 |
+
else:
|
| 383 |
+
print(f" ✓ All coordinates within expected bounds")
|
| 384 |
|
|
|
|
| 385 |
all_results.append({
|
| 386 |
"page": page_num + 1,
|
| 387 |
"page_dimensions": {
|
|
|
|
| 419 |
}
|
| 420 |
|
| 421 |
def deduplicate_results(results: List[Dict], tolerance: float = 10.0) -> List[Dict]:
|
| 422 |
+
"""Remove duplicate extractions using spatial clustering."""
|
|
|
|
|
|
|
|
|
|
| 423 |
if not results:
|
| 424 |
return []
|
| 425 |
|
|
|
|
| 468 |
|
| 469 |
print(f"Processing single image: {img_width}x{img_height}")
|
| 470 |
|
| 471 |
+
should_split_decision, _ = should_split_page(img_width, img_height, 2000)
|
| 472 |
|
| 473 |
if should_split_decision:
|
| 474 |
print(" Image is wide, splitting into chunks...")
|