Spaces:
Sleeping
Sleeping
Alfonso Velasco committed on
Commit ·
7f73aee
1
Parent(s): 2d25a78
lab
Browse files
app.py
CHANGED
|
@@ -263,9 +263,17 @@ async def extract_image(request: ImageRequest):
|
|
| 263 |
print(f"Using annotated image for cropping: {img_for_crop_width}x{img_for_crop_height} (original: {img_width}x{img_height})")
|
| 264 |
image_for_cropping = annotated_image_bytes
|
| 265 |
|
| 266 |
-
# Re-parse coordinates
|
| 267 |
-
|
| 268 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
except Exception as e:
|
| 270 |
print(f"⚠ Could not use annotated image for cropping: {e}, falling back to original")
|
| 271 |
|
|
@@ -569,7 +577,7 @@ def simplify_extractions_for_layout(extractions: List[Dict]) -> List[Dict]:
|
|
| 569 |
|
| 570 |
return simplified
|
| 571 |
|
| 572 |
-
def parse_deepseek_result(result: Any, img_width: int, img_height: int, base_size: int = 1024) -> List[Dict]:
|
| 573 |
"""
|
| 574 |
Parse the DeepSeek-OCR result to extract text and bounding boxes.
|
| 575 |
|
|
@@ -579,6 +587,14 @@ def parse_deepseek_result(result: Any, img_width: int, img_height: int, base_siz
|
|
| 579 |
|
| 580 |
The bounding boxes are in DeepSeek's coordinate space (based on base_size),
|
| 581 |
so we need to scale them to the actual image dimensions.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 582 |
"""
|
| 583 |
import re
|
| 584 |
|
|
@@ -591,11 +607,15 @@ def parse_deepseek_result(result: Any, img_width: int, img_height: int, base_siz
|
|
| 591 |
# DeepSeek-OCR appears to use a square coordinate space (base_size x base_size)
|
| 592 |
# regardless of the actual image aspect ratio
|
| 593 |
# So coordinates are always in the range [0, base_size] for both x and y
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 599 |
|
| 600 |
# Pattern to match: <|ref|>TYPE<|/ref|><|det|>[[x1, y1, x2, y2]]<|/det|>
|
| 601 |
pattern = r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>\[\[([\d, ]+)\]\]<\|/det\|>'
|
|
@@ -619,6 +639,17 @@ def parse_deepseek_result(result: Any, img_width: int, img_height: int, base_siz
|
|
| 619 |
x2_scaled = int(x2 * scale_x)
|
| 620 |
y2_scaled = int(y2 * scale_y)
|
| 621 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 622 |
# Ensure coordinates are within image bounds
|
| 623 |
x1_scaled = max(0, min(x1_scaled, img_width))
|
| 624 |
y1_scaled = max(0, min(y1_scaled, img_height))
|
|
@@ -730,13 +761,20 @@ async def extract_simple(request: ImageRequest):
|
|
| 730 |
img_for_crop_width, img_for_crop_height = test_img.size
|
| 731 |
print(f"Using annotated image for cropping: {img_for_crop_width}x{img_for_crop_height}")
|
| 732 |
image_for_cropping = annotated_image_bytes
|
| 733 |
-
# Re-parse with annotated image dimensions
|
| 734 |
-
extractions = parse_deepseek_result(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 735 |
except Exception as e:
|
| 736 |
print(f"Could not use annotated image: {e}")
|
| 737 |
-
extractions = parse_deepseek_result(result_text, img_width, img_height, request.base_size)
|
| 738 |
else:
|
| 739 |
-
extractions = parse_deepseek_result(result_text, img_width, img_height, request.base_size)
|
| 740 |
|
| 741 |
patches_by_type = extract_patches_by_type(image_for_cropping, extractions)
|
| 742 |
|
|
|
|
| 263 |
print(f"Using annotated image for cropping: {img_for_crop_width}x{img_for_crop_height} (original: {img_width}x{img_height})")
|
| 264 |
image_for_cropping = annotated_image_bytes
|
| 265 |
|
| 266 |
+
# Re-parse coordinates for annotated image dimensions
|
| 267 |
+
# and add 200px padding around each box to avoid cutoff
|
| 268 |
+
extractions = parse_deepseek_result(
|
| 269 |
+
result_text,
|
| 270 |
+
img_for_crop_width,
|
| 271 |
+
img_for_crop_height,
|
| 272 |
+
request.base_size,
|
| 273 |
+
scale_coords=True, # Scale from base_size to annotated image size
|
| 274 |
+
padding=200 # Add 200px padding around each box
|
| 275 |
+
)
|
| 276 |
+
print(f"✓ Re-parsed {len(extractions)} extractions with 200px padding for annotated image")
|
| 277 |
except Exception as e:
|
| 278 |
print(f"⚠ Could not use annotated image for cropping: {e}, falling back to original")
|
| 279 |
|
|
|
|
| 577 |
|
| 578 |
return simplified
|
| 579 |
|
| 580 |
+
def parse_deepseek_result(result: Any, img_width: int, img_height: int, base_size: int = 1024, scale_coords: bool = True, padding: int = 0) -> List[Dict]:
|
| 581 |
"""
|
| 582 |
Parse the DeepSeek-OCR result to extract text and bounding boxes.
|
| 583 |
|
|
|
|
| 587 |
|
| 588 |
The bounding boxes are in DeepSeek's coordinate space (based on base_size),
|
| 589 |
so we need to scale them to the actual image dimensions.
|
| 590 |
+
|
| 591 |
+
Args:
|
| 592 |
+
result: The model output text
|
| 593 |
+
img_width: Target image width
|
| 594 |
+
img_height: Target image height
|
| 595 |
+
base_size: Model's coordinate space size (usually 1024)
|
| 596 |
+
scale_coords: Whether to scale coordinates (False if already in target space)
|
| 597 |
+
padding: Pixels to add around each bounding box (while keeping in bounds)
|
| 598 |
"""
|
| 599 |
import re
|
| 600 |
|
|
|
|
| 607 |
# DeepSeek-OCR appears to use a square coordinate space (base_size x base_size)
|
| 608 |
# regardless of the actual image aspect ratio
|
| 609 |
# So coordinates are always in the range [0, base_size] for both x and y
|
| 610 |
+
if scale_coords:
|
| 611 |
+
scale_x = img_width / base_size
|
| 612 |
+
scale_y = img_height / base_size
|
| 613 |
+
print(f"Image dimensions: {img_width}x{img_height}, base_size: {base_size}")
|
| 614 |
+
print(f"Coordinate space: {base_size}x{base_size}, scale_x: {scale_x:.2f}, scale_y: {scale_y:.2f}")
|
| 615 |
+
else:
|
| 616 |
+
scale_x = 1.0
|
| 617 |
+
scale_y = 1.0
|
| 618 |
+
print(f"Using coordinates as-is (no scaling) for image: {img_width}x{img_height}")
|
| 619 |
|
| 620 |
# Pattern to match: <|ref|>TYPE<|/ref|><|det|>[[x1, y1, x2, y2]]<|/det|>
|
| 621 |
pattern = r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>\[\[([\d, ]+)\]\]<\|/det\|>'
|
|
|
|
| 639 |
x2_scaled = int(x2 * scale_x)
|
| 640 |
y2_scaled = int(y2 * scale_y)
|
| 641 |
|
| 642 |
+
# Add padding around bounding box (before bounds checking)
|
| 643 |
+
if padding > 0:
|
| 644 |
+
original_x1, original_y1, original_x2, original_y2 = x1_scaled, y1_scaled, x2_scaled, y2_scaled
|
| 645 |
+
x1_scaled -= padding
|
| 646 |
+
y1_scaled -= padding
|
| 647 |
+
x2_scaled += padding
|
| 648 |
+
y2_scaled += padding
|
| 649 |
+
# Log first box padding for debugging
|
| 650 |
+
if i == 0:
|
| 651 |
+
print(f" Padding applied: {padding}px around boxes (e.g., box 0: {original_x1},{original_y1},{original_x2},{original_y2} -> {x1_scaled},{y1_scaled},{x2_scaled},{y2_scaled})")
|
| 652 |
+
|
| 653 |
# Ensure coordinates are within image bounds
|
| 654 |
x1_scaled = max(0, min(x1_scaled, img_width))
|
| 655 |
y1_scaled = max(0, min(y1_scaled, img_height))
|
|
|
|
| 761 |
img_for_crop_width, img_for_crop_height = test_img.size
|
| 762 |
print(f"Using annotated image for cropping: {img_for_crop_width}x{img_for_crop_height}")
|
| 763 |
image_for_cropping = annotated_image_bytes
|
| 764 |
+
# Re-parse with annotated image dimensions and 200px padding
|
| 765 |
+
extractions = parse_deepseek_result(
|
| 766 |
+
result_text,
|
| 767 |
+
img_for_crop_width,
|
| 768 |
+
img_for_crop_height,
|
| 769 |
+
request.base_size,
|
| 770 |
+
scale_coords=True,
|
| 771 |
+
padding=200
|
| 772 |
+
)
|
| 773 |
except Exception as e:
|
| 774 |
print(f"Could not use annotated image: {e}")
|
| 775 |
+
extractions = parse_deepseek_result(result_text, img_width, img_height, request.base_size, padding=200)
|
| 776 |
else:
|
| 777 |
+
extractions = parse_deepseek_result(result_text, img_width, img_height, request.base_size, padding=200)
|
| 778 |
|
| 779 |
patches_by_type = extract_patches_by_type(image_for_cropping, extractions)
|
| 780 |
|