Yaz Hobooti commited on
Commit
ad98b73
·
1 Parent(s): 7f42e9a

Implement text-based spell checking using PDF text extraction

Browse files

- Add extract_pdf_text() function to extract text directly from PDFs using PyMuPDF
- Add find_misspell_boxes_from_text() function that analyzes PDF text with coordinate mapping
- Update compare_pdfs() to use text-based spell checking instead of OCR
- Maintain fallback to OCR-based approach if needed
- Fix missing return statement in decode_with_variants() function
- Spell checking now operates on original PDF text for better accuracy and performance

Files changed (1) hide show
  1. pdf_comparator.py +104 -3
pdf_comparator.py CHANGED
@@ -321,6 +321,105 @@ def prepare_for_ocr(img: Image.Image) -> Image.Image:
321
  g = g.filter(ImageFilter.UnsharpMask(radius=1.0, percent=150, threshold=2))
322
  return g
323
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
  def find_misspell_boxes(
325
  img: Image.Image,
326
  *,
@@ -331,6 +430,7 @@ def find_misspell_boxes(
331
  psm: int = 6,
332
  oem: int = 3
333
  ) -> List[Box]:
 
334
  if not (HAS_OCR and HAS_SPELLCHECK):
335
  return []
336
 
@@ -459,7 +559,7 @@ def decode_with_variants(img: Image.Image):
459
  if not results: do_decode(img.resize((img.width*2, img.height*2), Image.BICUBIC))
460
  if not results and img.mode != 'RGB':
461
  do_decode(img.convert('RGB'))
462
- return results
463
 
464
  def find_barcode_boxes_and_info(img: Image.Image):
465
  decodes = decode_with_variants(img)
@@ -544,8 +644,9 @@ def compare_pdfs(file_a, file_b):
544
  red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)
545
 
546
  # Run all analysis features with defaults
547
- misspell_a = find_misspell_boxes(a) if HAS_OCR and HAS_SPELLCHECK else []
548
- misspell_b = find_misspell_boxes(b) if HAS_OCR and HAS_SPELLCHECK else []
 
549
 
550
  if HAS_BARCODE:
551
  bar_a, info_a = find_barcode_boxes_and_info(a)
 
321
  g = g.filter(ImageFilter.UnsharpMask(radius=1.0, percent=150, threshold=2))
322
  return g
323
 
324
+ def extract_pdf_text(path: str, max_pages: int = 5) -> List[str]:
325
+ """Extract text directly from PDF using PyMuPDF"""
326
+ if not HAS_PYMUPDF:
327
+ return []
328
+
329
+ try:
330
+ doc = fitz.open(path)
331
+ texts = []
332
+ for page_num in range(min(len(doc), max_pages)):
333
+ page = doc[page_num]
334
+ text = page.get_text()
335
+ texts.append(text)
336
+ doc.close()
337
+ return texts
338
+ except Exception:
339
+ return []
340
+
341
+ def find_misspell_boxes_from_text(
342
+ pdf_path: str,
343
+ *,
344
+ extra_allow: Optional[Iterable[str]] = None,
345
+ max_pages: int = 5
346
+ ) -> List[Box]:
347
+ """Find misspellings by analyzing extracted PDF text directly with coordinate mapping"""
348
+ if not (HAS_SPELLCHECK and HAS_PYMUPDF):
349
+ return []
350
+
351
+ # Load extra allowed words
352
+ if extra_allow and _SPELL_EN:
353
+ _SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
354
+ if extra_allow and _SPELL_FR:
355
+ _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
356
+
357
+ boxes: List[Box] = []
358
+
359
+ try:
360
+ doc = fitz.open(pdf_path)
361
+
362
+ for page_num in range(min(len(doc), max_pages)):
363
+ page = doc[page_num]
364
+
365
+ # Get text with position information
366
+ text_dict = page.get_text("dict")
367
+
368
+ # Process each block of text
369
+ for block in text_dict.get("blocks", []):
370
+ if "lines" not in block:
371
+ continue
372
+
373
+ for line in block["lines"]:
374
+ for span in line["spans"]:
375
+ text = span.get("text", "").strip()
376
+ if not text:
377
+ continue
378
+
379
+ # Extract tokens and check for misspellings
380
+ tokens = _extract_tokens(text)
381
+ has_misspelling = False
382
+
383
+ for token in tokens:
384
+ if len(token) >= 2 and not _is_known_word(token):
385
+ has_misspelling = True
386
+ break
387
+
388
+ # If this span has misspellings, create a box for it
389
+ if has_misspelling:
390
+ bbox = span["bbox"] # [x0, y0, x1, y1]
391
+ boxes.append(Box(
392
+ top=bbox[1], # y0
393
+ left=bbox[0], # x0
394
+ bottom=bbox[3], # y1
395
+ right=bbox[2], # x1
396
+ area=(bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
397
+ ))
398
+
399
+ doc.close()
400
+
401
+ except Exception:
402
+ # Fallback to simple text extraction if coordinate mapping fails
403
+ page_texts = extract_pdf_text(pdf_path, max_pages)
404
+ for page_num, text in enumerate(page_texts):
405
+ if not text.strip():
406
+ continue
407
+
408
+ tokens = _extract_tokens(text)
409
+ misspelled_words = [token for token in tokens if len(token) >= 2 and not _is_known_word(token)]
410
+
411
+ if misspelled_words:
412
+ # Create a placeholder box for the page
413
+ boxes.append(Box(
414
+ top=page_num * 1000,
415
+ left=0,
416
+ bottom=(page_num + 1) * 1000,
417
+ right=800,
418
+ area=800 * 1000
419
+ ))
420
+
421
+ return boxes
422
+
423
  def find_misspell_boxes(
424
  img: Image.Image,
425
  *,
 
430
  psm: int = 6,
431
  oem: int = 3
432
  ) -> List[Box]:
433
+ """Legacy OCR-based spell checking (kept for fallback)"""
434
  if not (HAS_OCR and HAS_SPELLCHECK):
435
  return []
436
 
 
559
  if not results: do_decode(img.resize((img.width*2, img.height*2), Image.BICUBIC))
560
  if not results and img.mode != 'RGB':
561
  do_decode(img.convert('RGB'))
562
+ return results
563
 
564
  def find_barcode_boxes_and_info(img: Image.Image):
565
  decodes = decode_with_variants(img)
 
644
  red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)
645
 
646
  # Run all analysis features with defaults
647
+ # Use text-based spell checking instead of OCR for better accuracy
648
+ misspell_a = find_misspell_boxes_from_text(file_a.name) if HAS_SPELLCHECK and HAS_PYMUPDF else []
649
+ misspell_b = find_misspell_boxes_from_text(file_b.name) if HAS_SPELLCHECK and HAS_PYMUPDF else []
650
 
651
  if HAS_BARCODE:
652
  bar_a, info_a = find_barcode_boxes_and_info(a)