Alfonso Velasco committed on
Commit
50304f8
·
1 Parent(s): 0f430a1
Files changed (1) hide show
  1. app.py +57 -68
app.py CHANGED
@@ -11,7 +11,6 @@ import tempfile
11
  import os
12
  import math
13
 
14
- # Fix the OMP_NUM_THREADS issue
15
  os.environ['OMP_NUM_THREADS'] = '1'
16
  os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
17
 
@@ -72,10 +71,7 @@ async def extract_document(request: DocumentRequest):
72
  raise HTTPException(status_code=500, detail=str(e))
73
 
74
  def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]:
75
- """
76
- Process a single image or image chunk and return extractions with coordinates
77
- relative to the chunk (0,0 at top-left of chunk).
78
- """
79
  img_width, img_height = image.size
80
 
81
  if img_width < 1 or img_height < 1:
@@ -121,7 +117,6 @@ def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]
121
  except RuntimeError as e:
122
  if "CUDA" in str(e):
123
  print(f"CUDA error encountered: {e}")
124
- print("Falling back to CPU...")
125
  encoding = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in encoding.items()}
126
  model.cpu()
127
  with torch.no_grad():
@@ -149,15 +144,12 @@ def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]
149
  for idx, (token, box) in enumerate(zip(tokens, boxes)):
150
  try:
151
  if token not in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>', '<pad>']:
152
- x_norm = box[0]
153
- y_norm = box[1]
154
- x2_norm = box[2]
155
- y2_norm = box[3]
156
 
157
  if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
158
  continue
159
 
160
- # Convert normalized coordinates to chunk pixel coordinates
161
  x = (x_norm / 1000.0) * img_width
162
  y = (y_norm / 1000.0) * img_height
163
  x2 = (x2_norm / 1000.0) * img_width
@@ -191,26 +183,15 @@ def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]
191
 
192
  return results
193
 
194
- def should_split_page(rendered_width: int, rendered_height: int,
195
- original_rotation: int, max_width: int) -> Tuple[bool, str]:
196
- """
197
- Determine if a page should be split and in which direction.
198
- For rotated pages, we check against the RENDERED dimensions.
199
- """
200
  if rendered_width > max_width:
201
  return (True, "horizontal")
202
-
203
  return (False, None)
204
 
205
  def split_image_intelligently(image: Image.Image, max_width: int,
206
  overlap_ratio: float = 0.1) -> List[Tuple[Image.Image, int]]:
207
- """
208
- Split image into overlapping chunks along the width.
209
-
210
- Returns:
211
- List of (chunk_image, x_offset) tuples where x_offset is the pixel position
212
- in the RENDERED image where this chunk starts.
213
- """
214
  img_width, img_height = image.size
215
 
216
  if img_width <= max_width:
@@ -242,13 +223,11 @@ def split_image_intelligently(image: Image.Image, max_width: int,
242
 
243
  def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
244
  """
245
- Process PDF and extract structured content with proper coordinate handling for rotated pages.
246
 
247
- KEY INSIGHT: For rotated pages, PyMuPDF renders them already rotated. So:
248
- - A page with rotation=270° and original size 1224x792 gets rendered as if it were 792x1224
249
- - The rendered image dimensions match the "effective" dimensions
250
- - We split based on rendered dimensions
251
- - Coordinates in results should be in the EFFECTIVE coordinate space
252
  """
253
  RENDER_SCALE = 3.0
254
  MAX_WIDTH = 2000 # Maximum width for a chunk in rendered pixels
@@ -276,7 +255,7 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
276
  print(f" Original dimensions: {original_width}x{original_height}")
277
  print(f" Rotation: {original_rotation}°")
278
 
279
- # CRITICAL: Determine effective dimensions (what the page looks like after rotation)
280
  if original_rotation in [90, 270]:
281
  effective_pdf_width = original_height
282
  effective_pdf_height = original_width
@@ -286,25 +265,41 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
286
 
287
  print(f" Effective PDF dimensions (after rotation): {effective_pdf_width}x{effective_pdf_height}")
288
 
289
- # Render page - PyMuPDF automatically handles rotation
290
  mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
291
  pix = page.get_pixmap(matrix=mat)
292
  img_data = pix.tobytes("png")
293
  full_image = Image.open(io.BytesIO(img_data)).convert("RGB")
294
  rendered_width, rendered_height = full_image.size
295
 
296
- print(f" Rendered dimensions: {rendered_width}x{rendered_height}")
297
 
298
- # Verify: rendered dimensions should match effective dimensions * scale
299
  expected_rendered_width = effective_pdf_width * RENDER_SCALE
300
  expected_rendered_height = effective_pdf_height * RENDER_SCALE
301
- print(f" Expected rendered: {expected_rendered_width}x{expected_rendered_height}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
  page_results = []
304
 
305
- # Decide if we need to split based on RENDERED dimensions
306
  should_split_decision, split_direction = should_split_page(
307
- rendered_width, rendered_height, original_rotation, MAX_WIDTH
308
  )
309
 
310
  if split_wide and should_split_decision:
@@ -320,28 +315,26 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
320
  chunk_results = process_image_chunk(chunk_image, max_tokens=MAX_TOKENS)
321
  print(f" Extracted {len(chunk_results)} items from chunk {chunk_idx + 1}")
322
 
323
- if chunk_results:
324
  print(f" Sample items from chunk {chunk_idx + 1}:")
325
  for i, item in enumerate(chunk_results[:3]):
326
  print(f" Item {i+1}: text='{item['text']}', chunk_x={item['bbox']['x']:.1f}px")
327
 
328
- # Transform coordinates:
329
- # 1. Add x_offset to move from chunk coordinates to full rendered image coordinates
330
- # 2. Divide by RENDER_SCALE to convert to PDF points in effective coordinate space
331
  for result in chunk_results:
332
  bbox = result['bbox']
333
 
334
- # Step 1: Chunk coordinates -> Rendered image coordinates
335
- chunk_x = bbox['x']
336
- rendered_x = chunk_x + x_offset
337
 
338
- # Step 2: Rendered coordinates -> PDF points (effective coordinate space)
339
- pdf_x = rendered_x / RENDER_SCALE
340
- pdf_y = bbox['y'] / RENDER_SCALE
341
- pdf_width = bbox['width'] / RENDER_SCALE
342
- pdf_height = bbox['height'] / RENDER_SCALE
 
343
 
344
- # Update bbox with PDF coordinates
345
  bbox['x'] = pdf_x
346
  bbox['y'] = pdf_y
347
  bbox['width'] = pdf_width
@@ -349,7 +342,7 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
349
 
350
  # Debug first item
351
  if result == chunk_results[0]:
352
- print(f" Coordinate transform: chunk_x={chunk_x:.1f}px + offset={x_offset}px = rendered_x={rendered_x:.1f}px → pdf_x={pdf_x:.1f}pts")
353
 
354
  page_results.extend(chunk_results)
355
 
@@ -360,13 +353,12 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
360
  print(" Processing full page without splitting...")
361
  chunk_results = process_image_chunk(full_image, max_tokens=MAX_TOKENS)
362
 
363
- # Scale coordinates from rendered image pixels to PDF points
364
  for result in chunk_results:
365
  bbox = result['bbox']
366
- bbox['x'] = bbox['x'] / RENDER_SCALE
367
- bbox['y'] = bbox['y'] / RENDER_SCALE
368
- bbox['width'] = bbox['width'] / RENDER_SCALE
369
- bbox['height'] = bbox['height'] / RENDER_SCALE
370
 
371
  page_results = chunk_results
372
  print(f" Extracted {len(chunk_results)} items")
@@ -383,13 +375,13 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
383
  print(f" X: {min(x_coords):.1f} to {max(x_coords):.1f} (effective width: {effective_pdf_width:.1f})")
384
  print(f" Y: {min(y_coords):.1f} to {max(y_coords):.1f} (effective height: {effective_pdf_height:.1f})")
385
 
386
- # Warn if coordinates exceed page dimensions
387
- if max(x_coords) > effective_pdf_width:
388
- print(f" WARNING: Some X coordinates exceed effective page width!")
389
- if max(y_coords) > effective_pdf_height:
390
- print(f" WARNING: Some Y coordinates exceed effective page height!")
 
391
 
392
- # Return results with proper dimensions
393
  all_results.append({
394
  "page": page_num + 1,
395
  "page_dimensions": {
@@ -427,10 +419,7 @@ def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
427
  }
428
 
429
  def deduplicate_results(results: List[Dict], tolerance: float = 10.0) -> List[Dict]:
430
- """
431
- Remove duplicate extractions using spatial clustering.
432
- Tolerance is in PDF points.
433
- """
434
  if not results:
435
  return []
436
 
@@ -479,7 +468,7 @@ def process_image(image_bytes):
479
 
480
  print(f"Processing single image: {img_width}x{img_height}")
481
 
482
- should_split_decision, _ = should_split_page(img_width, img_height, 0, 2000)
483
 
484
  if should_split_decision:
485
  print(" Image is wide, splitting into chunks...")
 
11
  import os
12
  import math
13
 
 
14
  os.environ['OMP_NUM_THREADS'] = '1'
15
  os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
16
 
 
71
  raise HTTPException(status_code=500, detail=str(e))
72
 
73
  def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]:
74
+ """Process a single image chunk and return extractions."""
 
 
 
75
  img_width, img_height = image.size
76
 
77
  if img_width < 1 or img_height < 1:
 
117
  except RuntimeError as e:
118
  if "CUDA" in str(e):
119
  print(f"CUDA error encountered: {e}")
 
120
  encoding = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in encoding.items()}
121
  model.cpu()
122
  with torch.no_grad():
 
144
  for idx, (token, box) in enumerate(zip(tokens, boxes)):
145
  try:
146
  if token not in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>', '<pad>']:
147
+ x_norm, y_norm, x2_norm, y2_norm = box
 
 
 
148
 
149
  if x_norm == 0 and y_norm == 0 and x2_norm == 0 and y2_norm == 0:
150
  continue
151
 
152
+ # Convert normalized coordinates to pixel coordinates
153
  x = (x_norm / 1000.0) * img_width
154
  y = (y_norm / 1000.0) * img_height
155
  x2 = (x2_norm / 1000.0) * img_width
 
183
 
184
  return results
185
 
186
+ def should_split_page(rendered_width: int, rendered_height: int, max_width: int) -> Tuple[bool, str]:
187
+ """Determine if a page should be split based on rendered dimensions."""
 
 
 
 
188
  if rendered_width > max_width:
189
  return (True, "horizontal")
 
190
  return (False, None)
191
 
192
  def split_image_intelligently(image: Image.Image, max_width: int,
193
  overlap_ratio: float = 0.1) -> List[Tuple[Image.Image, int]]:
194
+ """Split image into overlapping chunks along the width."""
 
 
 
 
 
 
195
  img_width, img_height = image.size
196
 
197
  if img_width <= max_width:
 
223
 
224
  def process_pdf(pdf_bytes: bytes, split_wide: bool = True):
225
  """
226
+ Process PDF with proper handling of rotated pages.
227
 
228
+ KEY FIX: We now work with ACTUAL rendered dimensions instead of assuming
229
+ they match the effective dimensions. We map coordinates based on the
230
+ actual render, then transform them to the effective coordinate space.
 
 
231
  """
232
  RENDER_SCALE = 3.0
233
  MAX_WIDTH = 2000 # Maximum width for a chunk in rendered pixels
 
255
  print(f" Original dimensions: {original_width}x{original_height}")
256
  print(f" Rotation: {original_rotation}°")
257
 
258
+ # Determine effective dimensions (what the page looks like when properly oriented)
259
  if original_rotation in [90, 270]:
260
  effective_pdf_width = original_height
261
  effective_pdf_height = original_width
 
265
 
266
  print(f" Effective PDF dimensions (after rotation): {effective_pdf_width}x{effective_pdf_height}")
267
 
268
+ # Render the page - PyMuPDF may not rotate it as expected
269
  mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
270
  pix = page.get_pixmap(matrix=mat)
271
  img_data = pix.tobytes("png")
272
  full_image = Image.open(io.BytesIO(img_data)).convert("RGB")
273
  rendered_width, rendered_height = full_image.size
274
 
275
+ print(f" Actual rendered dimensions: {rendered_width}x{rendered_height}")
276
 
277
+ # Detect if dimensions don't match expectations
278
  expected_rendered_width = effective_pdf_width * RENDER_SCALE
279
  expected_rendered_height = effective_pdf_height * RENDER_SCALE
280
+
281
+ dimensions_swapped = False
282
+ if (abs(rendered_width - expected_rendered_height) < 10 and
283
+ abs(rendered_height - expected_rendered_width) < 10):
284
+ print(f" ⚠️ Dimensions are swapped! Rotating image 90° to match expected orientation.")
285
+ # Rotate the image to match expected orientation
286
+ full_image = full_image.rotate(-90, expand=True)
287
+ rendered_width, rendered_height = full_image.size
288
+ print(f" After rotation: {rendered_width}x{rendered_height}")
289
+ dimensions_swapped = True
290
+
291
+ # Calculate the scale factor from rendered pixels to effective PDF points
292
+ # This handles any discrepancies between expected and actual rendering
293
+ scale_x = rendered_width / (effective_pdf_width * RENDER_SCALE)
294
+ scale_y = rendered_height / (effective_pdf_height * RENDER_SCALE)
295
+
296
+ print(f" Scale factors: x={scale_x:.4f}, y={scale_y:.4f}")
297
 
298
  page_results = []
299
 
300
+ # Decide if we need to split
301
  should_split_decision, split_direction = should_split_page(
302
+ rendered_width, rendered_height, MAX_WIDTH
303
  )
304
 
305
  if split_wide and should_split_decision:
 
315
  chunk_results = process_image_chunk(chunk_image, max_tokens=MAX_TOKENS)
316
  print(f" Extracted {len(chunk_results)} items from chunk {chunk_idx + 1}")
317
 
318
+ if chunk_results and chunk_idx < 2:
319
  print(f" Sample items from chunk {chunk_idx + 1}:")
320
  for i, item in enumerate(chunk_results[:3]):
321
  print(f" Item {i+1}: text='{item['text']}', chunk_x={item['bbox']['x']:.1f}px")
322
 
323
+ # Transform coordinates from chunk space to PDF effective space
 
 
324
  for result in chunk_results:
325
  bbox = result['bbox']
326
 
327
+ # Step 1: Chunk coordinates -> Full rendered image coordinates
328
+ rendered_x = bbox['x'] + x_offset
329
+ rendered_y = bbox['y']
330
 
331
+ # Step 2: Rendered coordinates -> PDF points in effective space
332
+ # Account for the actual render scale and any dimension swapping
333
+ pdf_x = rendered_x / (RENDER_SCALE * scale_x)
334
+ pdf_y = rendered_y / (RENDER_SCALE * scale_y)
335
+ pdf_width = bbox['width'] / (RENDER_SCALE * scale_x)
336
+ pdf_height = bbox['height'] / (RENDER_SCALE * scale_y)
337
 
 
338
  bbox['x'] = pdf_x
339
  bbox['y'] = pdf_y
340
  bbox['width'] = pdf_width
 
342
 
343
  # Debug first item
344
  if result == chunk_results[0]:
345
+ print(f" Transform: chunk_x={bbox['x'] - pdf_x + rendered_x - x_offset:.1f}px + offset={x_offset}px = rendered_x={rendered_x:.1f}px → pdf_x={pdf_x:.1f}pts")
346
 
347
  page_results.extend(chunk_results)
348
 
 
353
  print(" Processing full page without splitting...")
354
  chunk_results = process_image_chunk(full_image, max_tokens=MAX_TOKENS)
355
 
 
356
  for result in chunk_results:
357
  bbox = result['bbox']
358
+ bbox['x'] = bbox['x'] / (RENDER_SCALE * scale_x)
359
+ bbox['y'] = bbox['y'] / (RENDER_SCALE * scale_y)
360
+ bbox['width'] = bbox['width'] / (RENDER_SCALE * scale_x)
361
+ bbox['height'] = bbox['height'] / (RENDER_SCALE * scale_y)
362
 
363
  page_results = chunk_results
364
  print(f" Extracted {len(chunk_results)} items")
 
375
  print(f" X: {min(x_coords):.1f} to {max(x_coords):.1f} (effective width: {effective_pdf_width:.1f})")
376
  print(f" Y: {min(y_coords):.1f} to {max(y_coords):.1f} (effective height: {effective_pdf_height:.1f})")
377
 
378
+ if max(x_coords) > effective_pdf_width + 10:
379
+ print(f" ⚠️ WARNING: Some X coordinates still exceed effective page width!")
380
+ elif max(x_coords) > effective_pdf_width:
381
+ print(f" ℹ️ Note: Max X slightly exceeds width (likely edge items), but within tolerance")
382
+ else:
383
+ print(f" ✓ All coordinates within expected bounds")
384
 
 
385
  all_results.append({
386
  "page": page_num + 1,
387
  "page_dimensions": {
 
419
  }
420
 
421
  def deduplicate_results(results: List[Dict], tolerance: float = 10.0) -> List[Dict]:
422
+ """Remove duplicate extractions using spatial clustering."""
 
 
 
423
  if not results:
424
  return []
425
 
 
468
 
469
  print(f"Processing single image: {img_width}x{img_height}")
470
 
471
+ should_split_decision, _ = should_split_page(img_width, img_height, 2000)
472
 
473
  if should_split_decision:
474
  print(" Image is wide, splitting into chunks...")