Spaces:

alfonsovelp
/

deepseek-ocr

Sleeping

App Files Files Community

Alfonso Velasco committed on Oct 25, 2025

Commit

7f73aee

1 Parent(s): 2d25a78

lab

Browse files

Files changed (1) hide show

app.py +51 -13

app.py CHANGED Viewed

@@ -263,9 +263,17 @@ async def extract_image(request: ImageRequest):
                     print(f"Using annotated image for cropping: {img_for_crop_width}x{img_for_crop_height} (original: {img_width}x{img_height})")
                     image_for_cropping = annotated_image_bytes
-                    # Re-parse coordinates using the annotated image dimensions
-                    extractions = parse_deepseek_result(result_text, img_for_crop_width, img_for_crop_height, request.base_size)
-                    print(f"Re-parsed {len(extractions)} extractions for annotated image dimensions")
                 except Exception as e:
                     print(f"⚠ Could not use annotated image for cropping: {e}, falling back to original")
@@ -569,7 +577,7 @@ def simplify_extractions_for_layout(extractions: List[Dict]) -> List[Dict]:
     return simplified
-def parse_deepseek_result(result: Any, img_width: int, img_height: int, base_size: int = 1024) -> List[Dict]:
     """
     Parse the DeepSeek-OCR result to extract text and bounding boxes.
@@ -579,6 +587,14 @@ def parse_deepseek_result(result: Any, img_width: int, img_height: int, base_siz
     The bounding boxes are in DeepSeek's coordinate space (based on base_size),
     so we need to scale them to the actual image dimensions.
     """
     import re
@@ -591,11 +607,15 @@ def parse_deepseek_result(result: Any, img_width: int, img_height: int, base_siz
     # DeepSeek-OCR appears to use a square coordinate space (base_size x base_size)
     # regardless of the actual image aspect ratio
     # So coordinates are always in the range [0, base_size] for both x and y
-    scale_x = img_width / base_size
-    scale_y = img_height / base_size
-    print(f"Image dimensions: {img_width}x{img_height}, base_size: {base_size}")
-    print(f"Coordinate space: {base_size}x{base_size}, scale_x: {scale_x:.2f}, scale_y: {scale_y:.2f}")
     # Pattern to match: <|ref|>TYPE<|/ref|><|det|>[[x1, y1, x2, y2]]<|/det|>
     pattern = r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>\[\[([\d, ]+)\]\]<\|/det\|>'
@@ -619,6 +639,17 @@ def parse_deepseek_result(result: Any, img_width: int, img_height: int, base_siz
                 x2_scaled = int(x2 * scale_x)
                 y2_scaled = int(y2 * scale_y)
                 # Ensure coordinates are within image bounds
                 x1_scaled = max(0, min(x1_scaled, img_width))
                 y1_scaled = max(0, min(y1_scaled, img_height))
@@ -730,13 +761,20 @@ async def extract_simple(request: ImageRequest):
                     img_for_crop_width, img_for_crop_height = test_img.size
                     print(f"Using annotated image for cropping: {img_for_crop_width}x{img_for_crop_height}")
                     image_for_cropping = annotated_image_bytes
-                    # Re-parse with annotated image dimensions
-                    extractions = parse_deepseek_result(result_text, img_for_crop_width, img_for_crop_height, request.base_size)
                 except Exception as e:
                     print(f"Could not use annotated image: {e}")
-                    extractions = parse_deepseek_result(result_text, img_width, img_height, request.base_size)
             else:
-                extractions = parse_deepseek_result(result_text, img_width, img_height, request.base_size)
             patches_by_type = extract_patches_by_type(image_for_cropping, extractions)

                     print(f"Using annotated image for cropping: {img_for_crop_width}x{img_for_crop_height} (original: {img_width}x{img_height})")
                     image_for_cropping = annotated_image_bytes
+                    # Re-parse coordinates for annotated image dimensions
+                    # and add 200px padding around each box to avoid cutoff
+                    extractions = parse_deepseek_result(
+                        result_text,
+                        img_for_crop_width,
+                        img_for_crop_height,
+                        request.base_size,
+                        scale_coords=True,  # Scale from base_size to annotated image size
+                        padding=200  # Add 200px padding around each box
+                    )
+                    print(f"✓ Re-parsed {len(extractions)} extractions with 200px padding for annotated image")
                 except Exception as e:
                     print(f"⚠ Could not use annotated image for cropping: {e}, falling back to original")
     return simplified
+def parse_deepseek_result(result: Any, img_width: int, img_height: int, base_size: int = 1024, scale_coords: bool = True, padding: int = 0) -> List[Dict]:
     """
     Parse the DeepSeek-OCR result to extract text and bounding boxes.
     The bounding boxes are in DeepSeek's coordinate space (based on base_size),
     so we need to scale them to the actual image dimensions.
+    Args:
+        result: The model output text
+        img_width: Target image width
+        img_height: Target image height
+        base_size: Model's coordinate space size (usually 1024)
+        scale_coords: Whether to scale coordinates (False if already in target space)
+        padding: Pixels to add around each bounding box (while keeping in bounds)
     """
     import re
     # DeepSeek-OCR appears to use a square coordinate space (base_size x base_size)
     # regardless of the actual image aspect ratio
     # So coordinates are always in the range [0, base_size] for both x and y
+    if scale_coords:
+        scale_x = img_width / base_size
+        scale_y = img_height / base_size
+        print(f"Image dimensions: {img_width}x{img_height}, base_size: {base_size}")
+        print(f"Coordinate space: {base_size}x{base_size}, scale_x: {scale_x:.2f}, scale_y: {scale_y:.2f}")
+    else:
+        scale_x = 1.0
+        scale_y = 1.0
+        print(f"Using coordinates as-is (no scaling) for image: {img_width}x{img_height}")
     # Pattern to match: <|ref|>TYPE<|/ref|><|det|>[[x1, y1, x2, y2]]<|/det|>
     pattern = r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>\[\[([\d, ]+)\]\]<\|/det\|>'
                 x2_scaled = int(x2 * scale_x)
                 y2_scaled = int(y2 * scale_y)
+                # Add padding around bounding box (before bounds checking)
+                if padding > 0:
+                    original_x1, original_y1, original_x2, original_y2 = x1_scaled, y1_scaled, x2_scaled, y2_scaled
+                    x1_scaled -= padding
+                    y1_scaled -= padding
+                    x2_scaled += padding
+                    y2_scaled += padding
+                    # Log first box padding for debugging
+                    if i == 0:
+                        print(f"  Padding applied: {padding}px around boxes (e.g., box 0: {original_x1},{original_y1},{original_x2},{original_y2} -> {x1_scaled},{y1_scaled},{x2_scaled},{y2_scaled})")
                 # Ensure coordinates are within image bounds
                 x1_scaled = max(0, min(x1_scaled, img_width))
                 y1_scaled = max(0, min(y1_scaled, img_height))
                     img_for_crop_width, img_for_crop_height = test_img.size
                     print(f"Using annotated image for cropping: {img_for_crop_width}x{img_for_crop_height}")
                     image_for_cropping = annotated_image_bytes
+                    # Re-parse with annotated image dimensions and 200px padding
+                    extractions = parse_deepseek_result(
+                        result_text,
+                        img_for_crop_width,
+                        img_for_crop_height,
+                        request.base_size,
+                        scale_coords=True,
+                        padding=200
+                    )
                 except Exception as e:
                     print(f"Could not use annotated image: {e}")
+                    extractions = parse_deepseek_result(result_text, img_width, img_height, request.base_size, padding=200)
             else:
+                extractions = parse_deepseek_result(result_text, img_width, img_height, request.base_size, padding=200)
             patches_by_type = extract_patches_by_type(image_for_cropping, extractions)