Yaz Hobooti
committed on
Commit
·
93954b8
1
Parent(s):
0b957fa
Fix coordinate mapping for text-based spell checking
Browse files
- Add image_size parameter to find_misspell_boxes_from_text()
- Convert PDF coordinates to image coordinates using scale factors
- Pass image dimensions from compare_pdfs() to spell checking function
- Add debug output to track spell check results
- Ensure spell checking boxes align with rasterized image coordinates
- pdf_comparator.py +54 -8
pdf_comparator.py
CHANGED
|
@@ -338,11 +338,29 @@ def extract_pdf_text(path: str, max_pages: int = 5) -> List[str]:
|
|
| 338 |
except Exception:
|
| 339 |
return []
|
| 340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
def find_misspell_boxes_from_text(
|
| 342 |
pdf_path: str,
|
| 343 |
*,
|
| 344 |
extra_allow: Optional[Iterable[str]] = None,
|
| 345 |
-
max_pages: int = 5
|
|
|
|
| 346 |
) -> List[Box]:
|
| 347 |
"""Find misspellings by analyzing extracted PDF text directly with coordinate mapping"""
|
| 348 |
if not (HAS_SPELLCHECK and HAS_PYMUPDF):
|
|
@@ -388,12 +406,35 @@ def find_misspell_boxes_from_text(
|
|
| 388 |
# If this span has misspellings, create a box for it
|
| 389 |
if has_misspelling:
|
| 390 |
bbox = span["bbox"] # [x0, y0, x1, y1]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 391 |
boxes.append(Box(
|
| 392 |
-
y1=
|
| 393 |
-
x1=
|
| 394 |
-
y2=
|
| 395 |
-
x2=
|
| 396 |
-
area=(
|
| 397 |
))
|
| 398 |
|
| 399 |
doc.close()
|
|
@@ -645,8 +686,13 @@ def compare_pdfs(file_a, file_b):
|
|
| 645 |
|
| 646 |
# Run all analysis features with defaults
|
| 647 |
# Use text-based spell checking instead of OCR for better accuracy
|
| 648 |
-
|
| 649 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 650 |
|
| 651 |
if HAS_BARCODE:
|
| 652 |
bar_a, info_a = find_barcode_boxes_and_info(a)
|
|
|
|
| 338 |
except Exception:
|
| 339 |
return []
|
| 340 |
|
| 341 |
+
def convert_pdf_to_image_coords(pdf_bbox, pdf_page_size, image_size, page_num=0, page_height=1000):
    """Convert a bounding box from PDF page coordinates to image pixel coordinates.

    The box is scaled independently on each axis by the ratio of the image
    size to the PDF page size, truncated to integers, and then shifted down
    by ``page_num * page_height`` so that boxes from later pages land at the
    right vertical offset when pages are stacked into one tall image.

    Args:
        pdf_bbox: Sequence ``(x0, y0, x1, y1)`` in PDF page units.
        pdf_page_size: ``(width, height)`` of the PDF page, same units as
            ``pdf_bbox``. Both values must be positive.
        image_size: ``(width, height)`` in pixels of the image one page
            maps onto.
        page_num: Zero-based page index used for the vertical offset.
        page_height: Vertical stride, in pixels, between stacked pages.
            The default of 1000 is a fixed constant; pass the real per-page
            image height when ``page_num`` is non-zero.

    Returns:
        Tuple ``(x1, y1, x2, y2)`` of integer image coordinates.

    Raises:
        ValueError: If the PDF page width or height is not positive.
    """
    pdf_width, pdf_height = pdf_page_size
    img_width, img_height = image_size

    # A zero-sized page would otherwise surface as a bare ZeroDivisionError,
    # and a negative one would silently mirror the coordinates.
    if pdf_width <= 0 or pdf_height <= 0:
        raise ValueError(
            f"PDF page size must be positive, got {pdf_width}x{pdf_height}"
        )

    # Scale factors
    scale_x = img_width / pdf_width
    scale_y = img_height / pdf_height

    # Vertical shift for pages stacked below the first one
    y_offset = page_num * page_height

    # Convert PDF coordinates to image coordinates
    x1 = int(pdf_bbox[0] * scale_x)
    y1 = int(pdf_bbox[1] * scale_y) + y_offset
    x2 = int(pdf_bbox[2] * scale_x)
    y2 = int(pdf_bbox[3] * scale_y) + y_offset

    return x1, y1, x2, y2
|
| 357 |
+
|
| 358 |
def find_misspell_boxes_from_text(
|
| 359 |
pdf_path: str,
|
| 360 |
*,
|
| 361 |
extra_allow: Optional[Iterable[str]] = None,
|
| 362 |
+
max_pages: int = 5,
|
| 363 |
+
image_size: Optional[Tuple[int, int]] = None
|
| 364 |
) -> List[Box]:
|
| 365 |
"""Find misspellings by analyzing extracted PDF text directly with coordinate mapping"""
|
| 366 |
if not (HAS_SPELLCHECK and HAS_PYMUPDF):
|
|
|
|
| 406 |
# If this span has misspellings, create a box for it
|
| 407 |
if has_misspelling:
|
| 408 |
bbox = span["bbox"] # [x0, y0, x1, y1]
|
| 409 |
+
|
| 410 |
+
# Get page dimensions for coordinate conversion
|
| 411 |
+
page_rect = page.rect
|
| 412 |
+
pdf_width = page_rect.width
|
| 413 |
+
pdf_height = page_rect.height
|
| 414 |
+
|
| 415 |
+
if image_size:
|
| 416 |
+
img_width, img_height = image_size
|
| 417 |
+
# Convert PDF coordinates to image coordinates
|
| 418 |
+
scale_x = img_width / pdf_width
|
| 419 |
+
scale_y = img_height / pdf_height
|
| 420 |
+
|
| 421 |
+
x1 = int(bbox[0] * scale_x)
|
| 422 |
+
y1 = int(bbox[1] * scale_y) + (page_num * img_height)
|
| 423 |
+
x2 = int(bbox[2] * scale_x)
|
| 424 |
+
y2 = int(bbox[3] * scale_y) + (page_num * img_height)
|
| 425 |
+
else:
|
| 426 |
+
# Use PDF coordinates directly (fallback)
|
| 427 |
+
x1 = int(bbox[0])
|
| 428 |
+
y1 = int(bbox[1]) + (page_num * 1000)
|
| 429 |
+
x2 = int(bbox[2])
|
| 430 |
+
y2 = int(bbox[3]) + (page_num * 1000)
|
| 431 |
+
|
| 432 |
boxes.append(Box(
|
| 433 |
+
y1=y1,
|
| 434 |
+
x1=x1,
|
| 435 |
+
y2=y2,
|
| 436 |
+
x2=x2,
|
| 437 |
+
area=(x2 - x1) * (y2 - y1)
|
| 438 |
))
|
| 439 |
|
| 440 |
doc.close()
|
|
|
|
| 686 |
|
| 687 |
# Run all analysis features with defaults
|
| 688 |
# Use text-based spell checking instead of OCR for better accuracy
|
| 689 |
+
# Pass image dimensions for proper coordinate mapping
|
| 690 |
+
image_size = (a.width, a.height)
|
| 691 |
+
misspell_a = find_misspell_boxes_from_text(file_a.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []
|
| 692 |
+
misspell_b = find_misspell_boxes_from_text(file_b.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []
|
| 693 |
+
|
| 694 |
+
# Debug: Print spell check results
|
| 695 |
+
print(f"Spell check results - A: {len(misspell_a)} boxes, B: {len(misspell_b)} boxes")
|
| 696 |
|
| 697 |
if HAS_BARCODE:
|
| 698 |
bar_a, info_a = find_barcode_boxes_and_info(a)
|