Fix coordinate mapping for text-based spell checking

- Add image_size parameter to find_misspell_boxes_from_text()
- Convert PDF coordinates to image coordinates using scale factors
- Pass image dimensions from compare_pdfs() to spell checking function
- Add debug output to track spell check results
- Ensure spell checking boxes align with rasterized image coordinates

Files changed (1)

pdf_comparator.py +54 -8

pdf_comparator.py (changed)

@@ -338,11 +338,29 @@ def extract_pdf_text(path: str, max_pages: int = 5) -> List[str]:
     except Exception:
         return []
 def find_misspell_boxes_from_text(
     pdf_path: str,
     *,
     extra_allow: Optional[Iterable[str]] = None,
-    max_pages: int = 5
 ) -> List[Box]:
     """Find misspellings by analyzing extracted PDF text directly with coordinate mapping"""
     if not (HAS_SPELLCHECK and HAS_PYMUPDF):
@@ -388,12 +406,35 @@ def find_misspell_boxes_from_text(
                         # If this span has misspellings, create a box for it
                         if has_misspelling:
                             bbox = span["bbox"]  # [x0, y0, x1, y1]
                             boxes.append(Box(
-                                y1=bbox[1],       # y0
-                                x1=bbox[0],      # x0
-                                y2=bbox[3],      # y1
-                                x2=bbox[2],      # x1
-                                area=(bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
                             ))
         doc.close()
@@ -645,8 +686,13 @@ def compare_pdfs(file_a, file_b):
         # Run all analysis features with defaults
         # Use text-based spell checking instead of OCR for better accuracy
-        misspell_a = find_misspell_boxes_from_text(file_a.name) if HAS_SPELLCHECK and HAS_PYMUPDF else []
-        misspell_b = find_misspell_boxes_from_text(file_b.name) if HAS_SPELLCHECK and HAS_PYMUPDF else []
         if HAS_BARCODE:
             bar_a, info_a = find_barcode_boxes_and_info(a)

     except Exception:
         return []
+def convert_pdf_to_image_coords(pdf_bbox, pdf_page_size, image_size, page_num=0, page_height=1000):
+    """Scale a PDF-space bounding box into integer image coordinates.
+
+    Args:
+        pdf_bbox: Indexable box read as [x0, y0, x1, y1], expressed in the
+            same units as ``pdf_page_size``.
+        pdf_page_size: ``(width, height)`` of the PDF page.
+        image_size: ``(width, height)`` of the target image.
+        page_num: Page index; multiplied by ``page_height`` to form the
+            vertical offset added to both y values.
+        page_height: Vertical offset contributed per page. It is added after
+            scaling and is not itself scaled. Presumably the pixel height of
+            one rasterized page in a vertically stacked image -- TODO confirm
+            against the caller.
+
+    Returns:
+        Tuple ``(x1, y1, x2, y2)`` of ints in image coordinates.
+
+    Raises:
+        ZeroDivisionError: If the PDF page width or height is zero.
+    """
+    pdf_width, pdf_height = pdf_page_size
+    img_width, img_height = image_size
+    # Per-axis scale factors: image units per PDF unit
+    scale_x = img_width / pdf_width
+    scale_y = img_height / pdf_height
+    # Scale each corner, truncate with int(), then shift y by the per-page offset
+    x1 = int(pdf_bbox[0] * scale_x)
+    y1 = int(pdf_bbox[1] * scale_y) + (page_num * page_height)
+    x2 = int(pdf_bbox[2] * scale_x)
+    y2 = int(pdf_bbox[3] * scale_y) + (page_num * page_height)
+    return x1, y1, x2, y2
 def find_misspell_boxes_from_text(
     pdf_path: str,
     *,
     extra_allow: Optional[Iterable[str]] = None,
+    max_pages: int = 5,
+    image_size: Optional[Tuple[int, int]] = None
 ) -> List[Box]:
     """Find misspellings by analyzing extracted PDF text directly with coordinate mapping"""
     if not (HAS_SPELLCHECK and HAS_PYMUPDF):
                         # If this span has misspellings, create a box for it
                         if has_misspelling:
                             bbox = span["bbox"]  # [x0, y0, x1, y1]
+                            # Get page dimensions for coordinate conversion
+                            page_rect = page.rect
+                            pdf_width = page_rect.width
+                            pdf_height = page_rect.height
+                            if image_size:
+                                img_width, img_height = image_size
+                                # Convert PDF coordinates to image coordinates
+                                scale_x = img_width / pdf_width
+                                scale_y = img_height / pdf_height
+                                x1 = int(bbox[0] * scale_x)
+                                y1 = int(bbox[1] * scale_y) + (page_num * img_height)
+                                x2 = int(bbox[2] * scale_x)
+                                y2 = int(bbox[3] * scale_y) + (page_num * img_height)
+                            else:
+                                # Use PDF coordinates directly (fallback)
+                                x1 = int(bbox[0])
+                                y1 = int(bbox[1]) + (page_num * 1000)
+                                x2 = int(bbox[2])
+                                y2 = int(bbox[3]) + (page_num * 1000)
                             boxes.append(Box(
+                                y1=y1,
+                                x1=x1,
+                                y2=y2,
+                                x2=x2,
+                                area=(x2 - x1) * (y2 - y1)
                             ))
         doc.close()
         # Run all analysis features with defaults
         # Use text-based spell checking instead of OCR for better accuracy
+        # Pass image dimensions for proper coordinate mapping
+        image_size = (a.width, a.height)
+        misspell_a = find_misspell_boxes_from_text(file_a.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []
+        misspell_b = find_misspell_boxes_from_text(file_b.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []
+        # Debug: Print spell check results
+        print(f"Spell check results - A: {len(misspell_a)} boxes, B: {len(misspell_b)} boxes")
         if HAS_BARCODE:
             bar_a, info_a = find_barcode_boxes_and_info(a)