Yaz Hobooti committed on
Commit
93954b8
·
1 Parent(s): 0b957fa

Fix coordinate mapping for text-based spell checking

Browse files

- Add image_size parameter to find_misspell_boxes_from_text()
- Convert PDF coordinates to image coordinates using scale factors
- Pass image dimensions from compare_pdfs() to spell checking function
- Add debug output to track spell check results
- Ensure spell checking boxes align with rasterized image coordinates

Files changed (1) hide show
  1. pdf_comparator.py +54 -8
pdf_comparator.py CHANGED
@@ -338,11 +338,29 @@ def extract_pdf_text(path: str, max_pages: int = 5) -> List[str]:
338
  except Exception:
339
  return []
340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  def find_misspell_boxes_from_text(
342
  pdf_path: str,
343
  *,
344
  extra_allow: Optional[Iterable[str]] = None,
345
- max_pages: int = 5
 
346
  ) -> List[Box]:
347
  """Find misspellings by analyzing extracted PDF text directly with coordinate mapping"""
348
  if not (HAS_SPELLCHECK and HAS_PYMUPDF):
@@ -388,12 +406,35 @@ def find_misspell_boxes_from_text(
388
  # If this span has misspellings, create a box for it
389
  if has_misspelling:
390
  bbox = span["bbox"] # [x0, y0, x1, y1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
  boxes.append(Box(
392
- y1=bbox[1], # y0
393
- x1=bbox[0], # x0
394
- y2=bbox[3], # y1
395
- x2=bbox[2], # x1
396
- area=(bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
397
  ))
398
 
399
  doc.close()
@@ -645,8 +686,13 @@ def compare_pdfs(file_a, file_b):
645
 
646
  # Run all analysis features with defaults
647
  # Use text-based spell checking instead of OCR for better accuracy
648
- misspell_a = find_misspell_boxes_from_text(file_a.name) if HAS_SPELLCHECK and HAS_PYMUPDF else []
649
- misspell_b = find_misspell_boxes_from_text(file_b.name) if HAS_SPELLCHECK and HAS_PYMUPDF else []
 
 
 
 
 
650
 
651
  if HAS_BARCODE:
652
  bar_a, info_a = find_barcode_boxes_and_info(a)
 
338
  except Exception:
339
  return []
340
 
341
def convert_pdf_to_image_coords(pdf_bbox, pdf_page_size, image_size, page_num=0, page_height=1000):
    """Map a bounding box from PDF page space into image pixel space.

    Args:
        pdf_bbox: Indexable ``[x0, y0, x1, y1]`` box in PDF page units.
        pdf_page_size: ``(width, height)`` of the PDF page.
        image_size: ``(width, height)`` of the target image.
        page_num: Page index used to shift the box vertically.
        page_height: Vertical shift applied per page index.

    Returns:
        Tuple ``(x1, y1, x2, y2)``. Each scaled value is truncated with
        ``int()``; both y values are then offset by
        ``page_num * page_height``.

    Raises:
        ZeroDivisionError: If either PDF page dimension is zero.
    """
    page_w, page_h = pdf_page_size
    image_w, image_h = image_size

    # Per-axis ratio between image size and PDF page size.
    ratio_x = image_w / page_w
    ratio_y = image_h / page_h

    def to_pixels(value, ratio):
        # int() truncates toward zero, so boxes are never rounded outward.
        return int(value * ratio)

    # NOTE(review): presumably pages are stacked vertically in the target
    # image, hence the y-only shift -- confirm against the caller.
    vertical_shift = page_num * page_height

    left = to_pixels(pdf_bbox[0], ratio_x)
    top = to_pixels(pdf_bbox[1], ratio_y) + vertical_shift
    right = to_pixels(pdf_bbox[2], ratio_x)
    bottom = to_pixels(pdf_bbox[3], ratio_y) + vertical_shift

    return left, top, right, bottom
357
+
358
  def find_misspell_boxes_from_text(
359
  pdf_path: str,
360
  *,
361
  extra_allow: Optional[Iterable[str]] = None,
362
+ max_pages: int = 5,
363
+ image_size: Optional[Tuple[int, int]] = None
364
  ) -> List[Box]:
365
  """Find misspellings by analyzing extracted PDF text directly with coordinate mapping"""
366
  if not (HAS_SPELLCHECK and HAS_PYMUPDF):
 
406
  # If this span has misspellings, create a box for it
407
  if has_misspelling:
408
  bbox = span["bbox"] # [x0, y0, x1, y1]
409
+
410
+ # Get page dimensions for coordinate conversion
411
+ page_rect = page.rect
412
+ pdf_width = page_rect.width
413
+ pdf_height = page_rect.height
414
+
415
+ if image_size:
416
+ img_width, img_height = image_size
417
+ # Convert PDF coordinates to image coordinates
418
+ scale_x = img_width / pdf_width
419
+ scale_y = img_height / pdf_height
420
+
421
+ x1 = int(bbox[0] * scale_x)
422
+ y1 = int(bbox[1] * scale_y) + (page_num * img_height)
423
+ x2 = int(bbox[2] * scale_x)
424
+ y2 = int(bbox[3] * scale_y) + (page_num * img_height)
425
+ else:
426
+ # Use PDF coordinates directly (fallback)
427
+ x1 = int(bbox[0])
428
+ y1 = int(bbox[1]) + (page_num * 1000)
429
+ x2 = int(bbox[2])
430
+ y2 = int(bbox[3]) + (page_num * 1000)
431
+
432
  boxes.append(Box(
433
+ y1=y1,
434
+ x1=x1,
435
+ y2=y2,
436
+ x2=x2,
437
+ area=(x2 - x1) * (y2 - y1)
438
  ))
439
 
440
  doc.close()
 
686
 
687
  # Run all analysis features with defaults
688
  # Use text-based spell checking instead of OCR for better accuracy
689
+ # Pass image dimensions for proper coordinate mapping
690
+ image_size = (a.width, a.height)
691
+ misspell_a = find_misspell_boxes_from_text(file_a.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []
692
+ misspell_b = find_misspell_boxes_from_text(file_b.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []
693
+
694
+ # Debug: Print spell check results
695
+ print(f"Spell check results - A: {len(misspell_a)} boxes, B: {len(misspell_b)} boxes")
696
 
697
  if HAS_BARCODE:
698
  bar_a, info_a = find_barcode_boxes_and_info(a)