Alfonso Velasco committed on
Commit
7f73aee
·
1 Parent(s): 2d25a78
Files changed (1) hide show
  1. app.py +51 -13
app.py CHANGED
@@ -263,9 +263,17 @@ async def extract_image(request: ImageRequest):
263
  print(f"Using annotated image for cropping: {img_for_crop_width}x{img_for_crop_height} (original: {img_width}x{img_height})")
264
  image_for_cropping = annotated_image_bytes
265
 
266
- # Re-parse coordinates using the annotated image dimensions
267
- extractions = parse_deepseek_result(result_text, img_for_crop_width, img_for_crop_height, request.base_size)
268
- print(f"Re-parsed {len(extractions)} extractions for annotated image dimensions")
 
 
 
 
 
 
 
 
269
  except Exception as e:
270
  print(f"⚠ Could not use annotated image for cropping: {e}, falling back to original")
271
 
@@ -569,7 +577,7 @@ def simplify_extractions_for_layout(extractions: List[Dict]) -> List[Dict]:
569
 
570
  return simplified
571
 
572
- def parse_deepseek_result(result: Any, img_width: int, img_height: int, base_size: int = 1024) -> List[Dict]:
573
  """
574
  Parse the DeepSeek-OCR result to extract text and bounding boxes.
575
 
@@ -579,6 +587,14 @@ def parse_deepseek_result(result: Any, img_width: int, img_height: int, base_siz
579
 
580
  The bounding boxes are in DeepSeek's coordinate space (based on base_size),
581
  so we need to scale them to the actual image dimensions.
 
 
 
 
 
 
 
 
582
  """
583
  import re
584
 
@@ -591,11 +607,15 @@ def parse_deepseek_result(result: Any, img_width: int, img_height: int, base_siz
591
  # DeepSeek-OCR appears to use a square coordinate space (base_size x base_size)
592
  # regardless of the actual image aspect ratio
593
  # So coordinates are always in the range [0, base_size] for both x and y
594
- scale_x = img_width / base_size
595
- scale_y = img_height / base_size
596
-
597
- print(f"Image dimensions: {img_width}x{img_height}, base_size: {base_size}")
598
- print(f"Coordinate space: {base_size}x{base_size}, scale_x: {scale_x:.2f}, scale_y: {scale_y:.2f}")
 
 
 
 
599
 
600
  # Pattern to match: <|ref|>TYPE<|/ref|><|det|>[[x1, y1, x2, y2]]<|/det|>
601
  pattern = r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>\[\[([\d, ]+)\]\]<\|/det\|>'
@@ -619,6 +639,17 @@ def parse_deepseek_result(result: Any, img_width: int, img_height: int, base_siz
619
  x2_scaled = int(x2 * scale_x)
620
  y2_scaled = int(y2 * scale_y)
621
 
 
 
 
 
 
 
 
 
 
 
 
622
  # Ensure coordinates are within image bounds
623
  x1_scaled = max(0, min(x1_scaled, img_width))
624
  y1_scaled = max(0, min(y1_scaled, img_height))
@@ -730,13 +761,20 @@ async def extract_simple(request: ImageRequest):
730
  img_for_crop_width, img_for_crop_height = test_img.size
731
  print(f"Using annotated image for cropping: {img_for_crop_width}x{img_for_crop_height}")
732
  image_for_cropping = annotated_image_bytes
733
- # Re-parse with annotated image dimensions
734
- extractions = parse_deepseek_result(result_text, img_for_crop_width, img_for_crop_height, request.base_size)
 
 
 
 
 
 
 
735
  except Exception as e:
736
  print(f"Could not use annotated image: {e}")
737
- extractions = parse_deepseek_result(result_text, img_width, img_height, request.base_size)
738
  else:
739
- extractions = parse_deepseek_result(result_text, img_width, img_height, request.base_size)
740
 
741
  patches_by_type = extract_patches_by_type(image_for_cropping, extractions)
742
 
 
263
  print(f"Using annotated image for cropping: {img_for_crop_width}x{img_for_crop_height} (original: {img_width}x{img_height})")
264
  image_for_cropping = annotated_image_bytes
265
 
266
+ # Re-parse coordinates for annotated image dimensions
267
+ # and add 200px padding around each box to avoid cutoff
268
+ extractions = parse_deepseek_result(
269
+ result_text,
270
+ img_for_crop_width,
271
+ img_for_crop_height,
272
+ request.base_size,
273
+ scale_coords=True, # Scale from base_size to annotated image size
274
+ padding=200 # Add 200px padding around each box
275
+ )
276
+ print(f"✓ Re-parsed {len(extractions)} extractions with 200px padding for annotated image")
277
  except Exception as e:
278
  print(f"⚠ Could not use annotated image for cropping: {e}, falling back to original")
279
 
 
577
 
578
  return simplified
579
 
580
+ def parse_deepseek_result(result: Any, img_width: int, img_height: int, base_size: int = 1024, scale_coords: bool = True, padding: int = 0) -> List[Dict]:
581
  """
582
  Parse the DeepSeek-OCR result to extract text and bounding boxes.
583
 
 
587
 
588
  The bounding boxes are in DeepSeek's coordinate space (based on base_size),
589
  so we need to scale them to the actual image dimensions.
590
+
591
+ Args:
592
+ result: The model output text
593
+ img_width: Target image width
594
+ img_height: Target image height
595
+ base_size: Model's coordinate space size (usually 1024)
596
+ scale_coords: Whether to scale coordinates (False if already in target space)
597
+ padding: Pixels to add around each bounding box (while keeping in bounds)
598
  """
599
  import re
600
 
 
607
  # DeepSeek-OCR appears to use a square coordinate space (base_size x base_size)
608
  # regardless of the actual image aspect ratio
609
  # So coordinates are always in the range [0, base_size] for both x and y
610
+ if scale_coords:
611
+ scale_x = img_width / base_size
612
+ scale_y = img_height / base_size
613
+ print(f"Image dimensions: {img_width}x{img_height}, base_size: {base_size}")
614
+ print(f"Coordinate space: {base_size}x{base_size}, scale_x: {scale_x:.2f}, scale_y: {scale_y:.2f}")
615
+ else:
616
+ scale_x = 1.0
617
+ scale_y = 1.0
618
+ print(f"Using coordinates as-is (no scaling) for image: {img_width}x{img_height}")
619
 
620
  # Pattern to match: <|ref|>TYPE<|/ref|><|det|>[[x1, y1, x2, y2]]<|/det|>
621
  pattern = r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>\[\[([\d, ]+)\]\]<\|/det\|>'
 
639
  x2_scaled = int(x2 * scale_x)
640
  y2_scaled = int(y2 * scale_y)
641
 
642
+ # Add padding around bounding box (before bounds checking)
643
+ if padding > 0:
644
+ original_x1, original_y1, original_x2, original_y2 = x1_scaled, y1_scaled, x2_scaled, y2_scaled
645
+ x1_scaled -= padding
646
+ y1_scaled -= padding
647
+ x2_scaled += padding
648
+ y2_scaled += padding
649
+ # Log first box padding for debugging
650
+ if i == 0:
651
+ print(f" Padding applied: {padding}px around boxes (e.g., box 0: {original_x1},{original_y1},{original_x2},{original_y2} -> {x1_scaled},{y1_scaled},{x2_scaled},{y2_scaled})")
652
+
653
  # Ensure coordinates are within image bounds
654
  x1_scaled = max(0, min(x1_scaled, img_width))
655
  y1_scaled = max(0, min(y1_scaled, img_height))
 
761
  img_for_crop_width, img_for_crop_height = test_img.size
762
  print(f"Using annotated image for cropping: {img_for_crop_width}x{img_for_crop_height}")
763
  image_for_cropping = annotated_image_bytes
764
+ # Re-parse with annotated image dimensions and 200px padding
765
+ extractions = parse_deepseek_result(
766
+ result_text,
767
+ img_for_crop_width,
768
+ img_for_crop_height,
769
+ request.base_size,
770
+ scale_coords=True,
771
+ padding=200
772
+ )
773
  except Exception as e:
774
  print(f"Could not use annotated image: {e}")
775
+ extractions = parse_deepseek_result(result_text, img_width, img_height, request.base_size, padding=200)
776
  else:
777
+ extractions = parse_deepseek_result(result_text, img_width, img_height, request.base_size, padding=200)
778
 
779
  patches_by_type = extract_patches_by_type(image_for_cropping, extractions)
780