Alfonso Velasco committed on
Commit
259596e
·
1 Parent(s): c9e5fd6
Files changed (1) hide show
  1. app.py +228 -104
app.py CHANGED
@@ -1,6 +1,6 @@
1
  from fastapi import FastAPI, HTTPException
2
  from pydantic import BaseModel
3
- from typing import Dict, Any, List
4
  from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
5
  import torch
6
  from PIL import Image
@@ -9,6 +9,7 @@ import base64
9
  import fitz # PyMuPDF
10
  import tempfile
11
  import os
 
12
 
13
  # Fix the OMP_NUM_THREADS issue
14
  os.environ['OMP_NUM_THREADS'] = '1'
@@ -73,7 +74,7 @@ async def extract_document(request: DocumentRequest):
73
  print(f"Error in extract_document: {error_details}")
74
  raise HTTPException(status_code=500, detail=str(e))
75
 
76
- def process_image_chunk(image: Image.Image) -> List[Dict]:
77
  """
78
  Process a single image or image chunk and return extractions with coordinates
79
  relative to the chunk (0,0 at top-left of chunk).
@@ -90,7 +91,7 @@ def process_image_chunk(image: Image.Image) -> List[Dict]:
90
  image,
91
  truncation=True,
92
  padding="max_length",
93
- max_length=512, # Reduced from 1024 for better stability
94
  return_tensors="pt"
95
  )
96
  except Exception as e:
@@ -98,11 +99,11 @@ def process_image_chunk(image: Image.Image) -> List[Dict]:
98
  try:
99
  encoding = processor(
100
  image,
101
- text=[""] * 512,
102
- boxes=[[0, 0, 0, 0]] * 512,
103
  truncation=True,
104
  padding="max_length",
105
- max_length=512,
106
  return_tensors="pt"
107
  )
108
  except Exception as e2:
@@ -198,6 +199,79 @@ def process_image_chunk(image: Image.Image) -> List[Dict]:
198
 
199
  return results
200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  def process_pdf(pdf_bytes, split_wide: bool = True):
202
  """Process PDF document, optionally splitting wide pages into chunks"""
203
  all_results = []
@@ -212,102 +286,86 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
212
  os.unlink(tmp_file.name)
213
  raise HTTPException(status_code=400, detail=f"Failed to open PDF: {str(e)}")
214
 
 
215
  RENDER_SCALE = 2.0
216
- MAX_WIDTH = 1800 # Reduced from 2000 for better stability
217
- OVERLAP = 150 # Reduced overlap
218
 
219
  for page_num in range(len(pdf_document)):
220
  try:
221
  page = pdf_document[page_num]
222
  page_rect = page.rect
223
- page_width = page_rect.width
224
- page_height = page_rect.height
225
 
226
- print(f"Page {page_num + 1}: {page_width}x{page_height}, rotation={page.rotation}°")
 
 
 
227
 
228
- # Render page
 
 
 
 
229
  mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
230
  pix = page.get_pixmap(matrix=mat)
231
  img_data = pix.tobytes("png")
232
  full_image = Image.open(io.BytesIO(img_data)).convert("RGB")
233
- img_width, img_height = full_image.size
 
 
234
 
235
- print(f"Rendered image: {img_width}x{img_height}")
 
 
 
 
 
 
 
 
 
236
 
237
  page_results = []
238
 
239
- # Check if page is too wide and should be split
240
- if split_wide and img_width > MAX_WIDTH:
241
- print(f"Page is wide ({img_width}px), splitting into chunks...")
242
-
243
- # Calculate proper number of chunks with safer logic
244
- step_size = MAX_WIDTH - OVERLAP
245
- if step_size <= 0:
246
- step_size = MAX_WIDTH // 2 # Fallback
247
-
248
- num_chunks = max(1, ((img_width - OVERLAP) + step_size - 1) // step_size)
249
 
250
- print(f"Will create {num_chunks} chunks with step size {step_size}")
 
251
 
252
- for chunk_idx in range(num_chunks):
253
- # Calculate chunk boundaries in rendered image pixels
254
- start_x = chunk_idx * step_size
255
- end_x = min(start_x + MAX_WIDTH, img_width)
256
-
257
- # Ensure chunk has valid dimensions
258
- if end_x <= start_x:
259
- print(f" Skipping invalid chunk {chunk_idx + 1}: start_x={start_x}, end_x={end_x}")
260
- continue
261
-
262
- chunk_actual_width = end_x - start_x
263
-
264
- # Skip chunks that are too narrow
265
- if chunk_actual_width < 100:
266
- print(f" Skipping narrow chunk {chunk_idx + 1}: width={chunk_actual_width}")
267
- continue
268
 
269
- print(f" Processing chunk {chunk_idx + 1}/{num_chunks}: x={start_x}-{end_x} (width={chunk_actual_width})")
 
 
270
 
271
- try:
272
- # Crop chunk from rendered image
273
- chunk = full_image.crop((start_x, 0, end_x, img_height))
274
 
275
- # Verify chunk dimensions
276
- verify_width, verify_height = chunk.size
277
- print(f" Chunk actual size: {verify_width}x{verify_height}")
278
 
279
- # Process chunk (returns coordinates relative to chunk)
280
- chunk_results = process_image_chunk(chunk)
281
- print(f" Extracted {len(chunk_results)} items from chunk")
282
-
283
- # Transform chunk-relative coordinates to full page coordinates
284
- for result in chunk_results:
285
- bbox = result['bbox']
286
-
287
- # Add chunk offset (in rendered image pixels)
288
- bbox['x'] += start_x
289
- # y stays the same (no vertical splitting)
290
-
291
- # Now scale from rendered image pixels to PDF points
292
- bbox['x'] = bbox['x'] / RENDER_SCALE
293
- bbox['y'] = bbox['y'] / RENDER_SCALE
294
- bbox['width'] = bbox['width'] / RENDER_SCALE
295
- bbox['height'] = bbox['height'] / RENDER_SCALE
296
-
297
- page_results.extend(chunk_results)
298
-
299
- except Exception as e:
300
- print(f" Error processing chunk {chunk_idx + 1}: {e}")
301
- import traceback
302
- traceback.print_exc()
303
- continue
304
 
305
- print(f" Total extractions from all chunks: {len(page_results)}")
306
 
307
  else:
308
- # Process full page (no splitting needed)
309
- print("Processing full page without splitting")
310
- chunk_results = process_image_chunk(full_image)
311
 
312
  # Scale coordinates from rendered image pixels to PDF points
313
  for result in chunk_results:
@@ -318,35 +376,24 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
318
  bbox['height'] = bbox['height'] / RENDER_SCALE
319
 
320
  page_results = chunk_results
 
321
 
322
- # Remove duplicates from overlapping chunks
323
- unique_results = []
324
- seen_boxes = set()
325
-
326
- DEDUP_TOLERANCE = 5 # pixels tolerance for deduplication
327
-
328
- for result in page_results:
329
- bbox = result['bbox']
330
- box_tuple = (
331
- round(bbox['x'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE,
332
- round(bbox['y'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE,
333
- round(bbox['width'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE,
334
- round(bbox['height'] / DEDUP_TOLERANCE) * DEDUP_TOLERANCE
335
- )
336
-
337
- if box_tuple not in seen_boxes:
338
- seen_boxes.add(box_tuple)
339
- unique_results.append(result)
340
-
341
- print(f" After deduplication: {len(unique_results)} unique extractions")
342
 
 
343
  all_results.append({
344
  "page": page_num + 1,
345
  "page_dimensions": {
346
- "width": page_width,
347
- "height": page_height
 
 
 
 
348
  },
349
- "rotation": page.rotation,
350
  "extractions": unique_results
351
  })
352
 
@@ -358,6 +405,7 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
358
  all_results.append({
359
  "page": page_num + 1,
360
  "page_dimensions": {"width": 0, "height": 0},
 
361
  "rotation": 0,
362
  "extractions": [],
363
  "error": str(e)
@@ -372,15 +420,87 @@ def process_pdf(pdf_bytes, split_wide: bool = True):
372
  "pages": all_results
373
  }
374
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  def process_image(image_bytes):
376
  """Process single image"""
377
  image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
378
  img_width, img_height = image.size
379
 
380
- # Process the image
381
- results = process_image_chunk(image)
382
 
383
- # Coordinates are already in image pixels, no scaling needed for standalone images
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
 
385
  return {
386
  "document_type": "image",
@@ -389,4 +509,8 @@ def process_image(image_bytes):
389
  "height": img_height
390
  },
391
  "extractions": results
392
- }
 
 
 
 
 
1
  from fastapi import FastAPI, HTTPException
2
  from pydantic import BaseModel
3
+ from typing import Dict, Any, List, Tuple, Optional
4
  from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
5
  import torch
6
  from PIL import Image
 
9
  import fitz # PyMuPDF
10
  import tempfile
11
  import os
12
+ import math
13
 
14
  # Fix the OMP_NUM_THREADS issue
15
  os.environ['OMP_NUM_THREADS'] = '1'
 
74
  print(f"Error in extract_document: {error_details}")
75
  raise HTTPException(status_code=500, detail=str(e))
76
 
77
+ def process_image_chunk(image: Image.Image, max_tokens: int = 512) -> List[Dict]:
78
  """
79
  Process a single image or image chunk and return extractions with coordinates
80
  relative to the chunk (0,0 at top-left of chunk).
 
91
  image,
92
  truncation=True,
93
  padding="max_length",
94
+ max_length=max_tokens,
95
  return_tensors="pt"
96
  )
97
  except Exception as e:
 
99
  try:
100
  encoding = processor(
101
  image,
102
+ text=[""] * max_tokens,
103
+ boxes=[[0, 0, 0, 0]] * max_tokens,
104
  truncation=True,
105
  padding="max_length",
106
+ max_length=max_tokens,
107
  return_tensors="pt"
108
  )
109
  except Exception as e2:
 
199
 
200
  return results
201
 
202
def should_split_page(rendered_width: int, rendered_height: int,
                      original_rotation: int, max_width: int) -> Tuple[bool, str]:
    """
    Decide whether a rendered page image should be split into chunks.

    Args:
        rendered_width: Width of the rendered page image, in pixels.
        rendered_height: Height of the rendered page image, in pixels.
        original_rotation: Page rotation in degrees. Unused here: rotation is
            already applied by the renderer, so the rendered dimensions are
            checked directly.
        max_width: Maximum chunk width in pixels before splitting is required.

    Returns:
        (should_split, split_direction) where split_direction is "horizontal"
        or "none".
    """
    # Guard against a degenerate zero-height image.
    aspect_ratio = rendered_width / rendered_height if rendered_height > 0 else 1

    # Portrait or roughly square pages are never split.
    if aspect_ratio <= 1.3:
        return False, "none"

    # Wide landscape pages that exceed the width budget are split horizontally.
    # NOTE: the previous version branched on aspect_ratio > 1.8 here, but both
    # branches returned (True, "horizontal"), so the check collapses to a
    # single return.
    if rendered_width > max_width:
        return True, "horizontal"

    return False, "none"
227
+
228
def split_image_intelligently(image: "Image.Image", max_width: int, overlap_ratio: float = 0.15) -> List[Tuple["Image.Image", int]]:
    """
    Split a wide image into horizontally overlapping chunks.

    Args:
        image: Source image (only ``.size`` and ``.crop`` are used).
        max_width: Maximum width of each chunk, in pixels.
        overlap_ratio: Fraction of max_width shared between adjacent chunks.

    Returns:
        List of (chunk_image, x_offset) tuples, where x_offset is the chunk's
        left edge in the source image's pixel coordinates.
    """
    img_width, img_height = image.size
    chunks = []

    # An image that already fits in one chunk needs no splitting at all.
    # (Previously, narrow images could fall through to the sliver filter
    # below and come back with NO chunks, silently losing the whole page.)
    if img_width <= max_width:
        return [(image, 0)]

    # Overlap between adjacent chunks, in pixels.
    overlap_pixels = int(max_width * overlap_ratio)

    # Horizontal distance between the left edges of consecutive chunks.
    step_size = max_width - overlap_pixels
    if step_size <= 0:
        step_size = max_width // 2  # fallback for pathological overlap ratios

    # Number of chunks needed to cover the full width. Clamp to at least 1:
    # math.ceil can yield 0 (or less) when img_width <= overlap_pixels.
    num_chunks = max(1, math.ceil((img_width - overlap_pixels) / step_size))

    # If we'd only need 2 chunks and the second would be very small, use two
    # equal-width chunks with overlap instead.
    if num_chunks == 2:
        second_chunk_width = img_width - step_size
        if second_chunk_width < max_width * 0.6:  # second chunk under 60% of max
            chunk_width = (img_width + overlap_pixels) // 2
            chunks.append((image.crop((0, 0, chunk_width, img_height)), 0))
            chunks.append((image.crop((img_width - chunk_width, 0, img_width, img_height)),
                           img_width - chunk_width))
            return chunks

    # Standard overlapping chunks.
    for i in range(num_chunks):
        start_x = i * step_size
        end_x = min(start_x + max_width, img_width)

        # Skip tiny slivers at the right edge; their content is already
        # covered by the previous chunk's overlap.
        if end_x - start_x < max_width * 0.3:
            continue

        chunk = image.crop((start_x, 0, end_x, img_height))
        chunks.append((chunk, start_x))

        print(f"    Chunk {i + 1}/{num_chunks}: x={start_x}-{end_x} (width={end_x - start_x})")

    return chunks
274
+
275
  def process_pdf(pdf_bytes, split_wide: bool = True):
276
  """Process PDF document, optionally splitting wide pages into chunks"""
277
  all_results = []
 
286
  os.unlink(tmp_file.name)
287
  raise HTTPException(status_code=400, detail=f"Failed to open PDF: {str(e)}")
288
 
289
+ # Configuration
290
  RENDER_SCALE = 2.0
291
+ MAX_WIDTH = 2000 # Increased for better quality
292
+ MAX_TOKENS = 768 # Increased token limit for complex documents
293
 
294
  for page_num in range(len(pdf_document)):
295
  try:
296
  page = pdf_document[page_num]
297
  page_rect = page.rect
 
 
298
 
299
+ # Original page dimensions before any rotation
300
+ original_width = page_rect.width
301
+ original_height = page_rect.height
302
+ original_rotation = page.rotation
303
 
304
+ print(f"\nPage {page_num + 1}:")
305
+ print(f" Original dimensions: {original_width}x{original_height}")
306
+ print(f" Rotation: {original_rotation}°")
307
+
308
+ # Render page - PyMuPDF automatically applies rotation
309
  mat = fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
310
  pix = page.get_pixmap(matrix=mat)
311
  img_data = pix.tobytes("png")
312
  full_image = Image.open(io.BytesIO(img_data)).convert("RGB")
313
+ rendered_width, rendered_height = full_image.size
314
+
315
+ print(f" Rendered dimensions: {rendered_width}x{rendered_height}")
316
 
317
+ # Determine effective dimensions after rotation for coordinate mapping
318
+ if original_rotation in [90, 270]:
319
+ # Page has been rotated, so effective dimensions are swapped
320
+ effective_pdf_width = original_height
321
+ effective_pdf_height = original_width
322
+ else:
323
+ effective_pdf_width = original_width
324
+ effective_pdf_height = original_height
325
+
326
+ print(f" Effective PDF dimensions: {effective_pdf_width}x{effective_pdf_height}")
327
 
328
  page_results = []
329
 
330
+ # Decide if we need to split
331
+ should_split, split_direction = should_split_page(
332
+ rendered_width, rendered_height, original_rotation, MAX_WIDTH
333
+ )
334
+
335
+ if split_wide and should_split:
336
+ print(f" Splitting page ({split_direction})...")
 
 
 
337
 
338
+ chunks = split_image_intelligently(full_image, MAX_WIDTH, overlap_ratio=0.2)
339
+ print(f" Created {len(chunks)} chunks")
340
 
341
+ for chunk_idx, (chunk_image, x_offset) in enumerate(chunks):
342
+ chunk_width, chunk_height = chunk_image.size
343
+ print(f" Processing chunk {chunk_idx + 1}: offset={x_offset}, size={chunk_width}x{chunk_height}")
 
 
 
 
 
 
 
 
 
 
 
 
 
344
 
345
+ # Process chunk with increased token limit
346
+ chunk_results = process_image_chunk(chunk_image, max_tokens=MAX_TOKENS)
347
+ print(f" Extracted {len(chunk_results)} items")
348
 
349
+ # Transform chunk-relative coordinates to full page coordinates
350
+ for result in chunk_results:
351
+ bbox = result['bbox']
352
 
353
+ # Add chunk offset (in rendered image pixels)
354
+ bbox['x'] += x_offset
 
355
 
356
+ # Scale from rendered image pixels to PDF points
357
+ # Use effective dimensions for proper scaling
358
+ bbox['x'] = bbox['x'] / RENDER_SCALE
359
+ bbox['y'] = bbox['y'] / RENDER_SCALE
360
+ bbox['width'] = bbox['width'] / RENDER_SCALE
361
+ bbox['height'] = bbox['height'] / RENDER_SCALE
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
 
363
+ page_results.extend(chunk_results)
364
 
365
  else:
366
+ # Process full page without splitting
367
+ print(" Processing full page without splitting...")
368
+ chunk_results = process_image_chunk(full_image, max_tokens=MAX_TOKENS)
369
 
370
  # Scale coordinates from rendered image pixels to PDF points
371
  for result in chunk_results:
 
376
  bbox['height'] = bbox['height'] / RENDER_SCALE
377
 
378
  page_results = chunk_results
379
+ print(f" Extracted {len(chunk_results)} items")
380
 
381
+ # Enhanced deduplication with spatial clustering
382
+ unique_results = deduplicate_results(page_results)
383
+ print(f" After deduplication: {len(unique_results)} unique items")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
 
385
+ # Return results with both original and effective dimensions
386
  all_results.append({
387
  "page": page_num + 1,
388
  "page_dimensions": {
389
+ "width": original_width,
390
+ "height": original_height
391
+ },
392
+ "effective_dimensions": {
393
+ "width": effective_pdf_width,
394
+ "height": effective_pdf_height
395
  },
396
+ "rotation": original_rotation,
397
  "extractions": unique_results
398
  })
399
 
 
405
  all_results.append({
406
  "page": page_num + 1,
407
  "page_dimensions": {"width": 0, "height": 0},
408
+ "effective_dimensions": {"width": 0, "height": 0},
409
  "rotation": 0,
410
  "extractions": [],
411
  "error": str(e)
 
420
  "pages": all_results
421
  }
422
 
423
def deduplicate_results(results: List[Dict], tolerance: float = 10.0) -> List[Dict]:
    """
    Collapse near-duplicate extractions produced by overlapping chunks.

    Two extractions are treated as duplicates when their bbox centers lie
    within ``tolerance`` (PDF points) of each other and their widths and
    heights agree within 30%. From each duplicate cluster, the extraction
    with the longest text is kept.
    """
    if not results:
        return []

    def _center(box: Dict) -> Tuple[float, float]:
        # Midpoint of a bbox dict keyed by x/y/width/height.
        return box['x'] + box['width'] / 2, box['y'] + box['height'] / 2

    kept: List[Dict] = []
    consumed = set()

    for idx, anchor in enumerate(results):
        if idx in consumed:
            continue

        abox = anchor['bbox']
        ax, ay = _center(abox)

        # Gather every later, still-unclaimed extraction close to the anchor.
        group = [anchor]
        group_indices = {idx}

        for jdx in range(idx + 1, len(results)):
            if jdx in consumed:
                continue

            cbox = results[jdx]['bbox']
            cx, cy = _center(cbox)

            # Centers must be within tolerance (Euclidean distance).
            if math.hypot(ax - cx, ay - cy) >= tolerance:
                continue

            # Sizes must roughly match; zero-sized dims default to ratio 1.
            ratio_w = abox['width'] / cbox['width'] if cbox['width'] > 0 else 1
            ratio_h = abox['height'] / cbox['height'] if cbox['height'] > 0 else 1

            if 0.7 < ratio_w < 1.3 and 0.7 < ratio_h < 1.3:
                group.append(results[jdx])
                group_indices.add(jdx)

        # Keep the richest extraction of the cluster (longest text wins).
        kept.append(max(group, key=lambda item: len(item.get('text', ''))))
        consumed.update(group_indices)

    return kept
472
+
473
def process_image(image_bytes):
    """
    Process a single standalone image.

    Wide images are split into overlapping chunks (mirroring the PDF page
    handling) and the per-chunk extractions are merged and deduplicated.
    Coordinates are returned in the image's own pixel space — no scaling
    is applied for standalone images.

    Args:
        image_bytes: Raw encoded image bytes (any format PIL can open).

    Returns:
        Dict with document_type, image_dimensions, and extractions.
    """
    # Named once instead of repeating the literals 2000 / 768 inline;
    # keep these in sync with MAX_WIDTH / MAX_TOKENS in process_pdf.
    MAX_WIDTH = 2000
    MAX_TOKENS = 768

    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    img_width, img_height = image.size

    print(f"Processing single image: {img_width}x{img_height}")

    # Check if image should be split (rotation 0: standalone images are
    # never pre-rotated).
    should_split, _ = should_split_page(img_width, img_height, 0, MAX_WIDTH)

    if should_split:
        print("  Image is wide, splitting into chunks...")
        chunks = split_image_intelligently(image, MAX_WIDTH, overlap_ratio=0.2)

        all_results = []
        for chunk_idx, (chunk_image, x_offset) in enumerate(chunks):
            chunk_results = process_image_chunk(chunk_image, max_tokens=MAX_TOKENS)

            # Shift chunk-relative x coordinates back into full-image space.
            for result in chunk_results:
                result['bbox']['x'] += x_offset

            all_results.extend(chunk_results)

        # Overlapping chunks see the same content twice; deduplicate.
        results = deduplicate_results(all_results)
    else:
        # Process the image as-is
        results = process_image_chunk(image, max_tokens=MAX_TOKENS)

    print(f"  Total extractions: {len(results)}")

    return {
        "document_type": "image",
        "image_dimensions": {
            "width": img_width,
            "height": img_height
        },
        "extractions": results
    }
513
+
514
+ if __name__ == "__main__":
515
+ import uvicorn
516
+ uvicorn.run(app, host="0.0.0.0", port=7860)