"""EN/FR bilingual label-PDF comparator.

Validates that both PDFs contain the "50 Carroll" address, OCRs each page
with several preprocessing strategies (tiny fonts, inverted/white-on-dark
text, per-colour-channel thresholding, edge enhancement), spell-checks the
extracted text against English and French dictionaries, decodes barcodes and
QR codes, and highlights visual differences with red boxes on annotated
images saved under ``static/results/<session_id>/``.
"""

import os
import json
import shutil
import tempfile
import unicodedata

import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import pytesseract
from pdf2image import convert_from_path
from pyzbar.pyzbar import decode
from spellchecker import SpellChecker
import nltk
from skimage.metrics import structural_similarity as ssim
from skimage import color

# Domain whitelist for spell checking: tokens here are never reported as
# misspelled and are seeded into both spell-checker dictionaries.
DOMAIN_WHITELIST = {
    # units / abbreviations
    "mg", "mg/g", "ml", "g", "thc", "cbd", "tcm", "mct",
    # common packaging terms / bilingual words you expect
    "gouttes", "tennir", "net", "zoom",
    # brand or proper names you want to ignore completely
    "purified", "brands", "tytann", "dome", "drops",
}
# lowercase everything in whitelist for comparisons
DOMAIN_WHITELIST = {w.lower() for w in DOMAIN_WHITELIST}

# Prefer the third-party `regex` module (Unicode \p{...} classes); fall back
# to the stdlib `re` with an ASCII-only token pattern.
# FIX: the original file also did an unconditional `import regex as re` at the
# top of the module, which raised ImportError before this guard could run,
# making the fallback dead code.
try:
    import regex as _re
    _USE_REGEX = True
except ImportError:
    import re as _re
    _USE_REGEX = False

# A token is a letter followed by at least one more letter/apostrophe/hyphen.
TOKEN_PATTERN = (
    r"(?:\p{L})(?:[\p{L}'-]{1,})" if _USE_REGEX else r"[A-Za-z][A-Za-z'-]{1,}"
)


class PDFComparator:
    """Compare two label PDFs for text, spelling, barcode and colour issues."""

    def __init__(self):
        # Initialize spell checkers for English and French
        self.english_spellchecker = SpellChecker(language='en')
        self.french_spellchecker = SpellChecker(language='fr')

        # Seed the domain whitelist into both dictionaries so whitelisted
        # tokens are treated as known words.
        for w in DOMAIN_WHITELIST:
            self.english_spellchecker.word_frequency.add(w)
            self.french_spellchecker.word_frequency.add(w)

        # Ensure the NLTK sentence tokenizer data is available.
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

    def enhance_image_for_tiny_fonts(self, image):
        """Enhance a BGR image specifically for tiny-font OCR.

        Pipeline: grayscale -> CLAHE contrast boost -> bilateral denoise ->
        unsharp mask -> adaptive threshold -> morphological close.
        Returns a binary image, or the original image on failure.
        """
        try:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
            enhanced = clahe.apply(gray)
            denoised = cv2.bilateralFilter(enhanced, 9, 75, 75)
            # Unsharp mask: subtract a Gaussian-blurred copy to sharpen edges.
            gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
            unsharp_mask = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)
            thresh = cv2.adaptiveThreshold(
                unsharp_mask, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY, 11, 2)
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
            cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
            return cleaned
        except Exception as e:
            print(f"Error enhancing image for tiny fonts: {str(e)}")
            return image

    def create_inverted_image(self, image):
        """Create an inverted binary image for white text on dark backgrounds."""
        try:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            inverted = cv2.bitwise_not(gray)
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            enhanced = clahe.apply(inverted)
            _, thresh = cv2.threshold(
                enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            return thresh
        except Exception as e:
            print(f"Error creating inverted image: {str(e)}")
            return image

    def extract_color_channels(self, image):
        """OCR each of several colour channels (R, G, B, V, L) separately.

        Text printed in a single ink colour often thresholds cleanly in one
        channel even when it is invisible in grayscale. Returns a list of
        non-empty OCR strings (possibly empty on failure).
        """
        try:
            b, g, r = cv2.split(image)
            hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
            h, s, v = cv2.split(hsv)
            lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
            l_chan, a, b_lab = cv2.split(lab)

            texts = []
            for channel in (r, g, b, v, l_chan):
                _, thresh = cv2.threshold(
                    channel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
                text = pytesseract.image_to_string(
                    thresh, config='--oem 3 --psm 6')
                if text.strip():
                    texts.append(text)
            return texts
        except Exception as e:
            print(f"Error extracting color channels: {str(e)}")
            return []

    def create_edge_enhanced_image(self, image):
        """Create an edge-enhanced (Canny + dilate + invert) image for OCR."""
        try:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            edges = cv2.Canny(gray, 50, 150)
            kernel = np.ones((2, 2), np.uint8)
            dilated = cv2.dilate(edges, kernel, iterations=1)
            # Invert so text strokes are dark on light, as Tesseract prefers.
            inverted = cv2.bitwise_not(dilated)
            return inverted
        except Exception as e:
            print(f"Error creating edge-enhanced image: {str(e)}")
            return image

    def ocr_with_multiple_configs(self, image):
        """Run OCR with several Tesseract configs; return the longest result.

        Length of stripped text is used as a cheap proxy for OCR quality.
        """
        configs = [
            '--oem 3 --psm 6',   # uniform block of text
            '--oem 3 --psm 8',   # single word
            '--oem 3 --psm 13',  # raw line
            '--oem 1 --psm 6',   # LSTM engine + uniform block
            '--oem 3 --psm 3',   # fully automatic page segmentation
        ]
        best_text = ""
        best_length = 0
        for config in configs:
            try:
                text = pytesseract.image_to_string(image, config=config)
                if len(text.strip()) > best_length:
                    best_text = text
                    best_length = len(text.strip())
            except Exception as e:
                print(f"OCR config {config} failed: {str(e)}")
                continue
        return best_text

    def extract_multi_color_text(self, image):
        """Extract text using every preprocessing method, concatenated.

        Runs tiny-font enhancement, inversion, per-channel thresholding and
        edge enhancement, then joins all non-empty results with spaces.
        """
        texts = []

        # Method 1: standard dark-on-light text
        enhanced = self.enhance_image_for_tiny_fonts(image)
        text1 = self.ocr_with_multiple_configs(enhanced)
        if text1.strip():
            texts.append(text1)

        # Method 2: inverted text (white on dark)
        inverted = self.create_inverted_image(image)
        text2 = self.ocr_with_multiple_configs(inverted)
        if text2.strip():
            texts.append(text2)

        # Method 3: colour-channel separation
        texts.extend(self.extract_color_channels(image))

        # Method 4: edge-enhanced
        edge_enhanced = self.create_edge_enhanced_image(image)
        text4 = self.ocr_with_multiple_configs(edge_enhanced)
        if text4.strip():
            texts.append(text4)

        return " ".join(texts)

    def validate_pdf(self, pdf_path):
        """Return True iff some page of the PDF contains '50 Carroll'.

        Tries several DPI settings and both enhanced and plain OCR. Matching
        is case-insensitive and tolerates a missing space ("50Carroll").
        Raises a wrapped Exception on unrecoverable errors.
        """
        # FIX: the original pattern list duplicated "50 carroll" and was
        # case-sensitive; a lowercase substring check covers all variants.
        needles = ("50 carroll", "50carroll")
        try:
            for dpi in (200, 300, 400):
                try:
                    images = convert_from_path(pdf_path, dpi=dpi)
                    for page_num, image in enumerate(images):
                        opencv_image = cv2.cvtColor(
                            np.array(image), cv2.COLOR_RGB2BGR)

                        text = self.extract_multi_color_text(opencv_image)
                        lowered = text.lower()
                        if any(n in lowered for n in needles):
                            return True

                        # Fallback: plain single-config OCR.
                        standard_text = pytesseract.image_to_string(
                            opencv_image, config='--oem 3 --psm 6').lower()
                        if any(n in standard_text for n in needles):
                            return True
                except Exception as e:
                    print(f"DPI {dpi} failed: {str(e)}")
                    continue
            return False
        except Exception as e:
            raise Exception(f"Error validating PDF: {str(e)}")

    def extract_text_from_pdf(self, pdf_path):
        """Extract per-page text and images from a PDF via enhanced OCR.

        Returns a list of dicts: {'page': 1-based index, 'text': OCR text,
        'image': PIL image of the page}. Raises a wrapped Exception on error.
        """
        try:
            # Higher DPI for better text extraction.
            images = convert_from_path(pdf_path, dpi=300)
            all_text = []
            for page_num, image in enumerate(images):
                opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

                text = self.extract_multi_color_text(opencv_image)
                # Fallback to standard OCR if enhanced extraction is empty.
                if not text.strip():
                    text = pytesseract.image_to_string(
                        opencv_image, config='--oem 3 --psm 6')

                all_text.append({
                    'page': page_num + 1,
                    'text': text,
                    'image': image,
                })
            return all_text
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}")

    def _likely_french(self, token: str) -> bool:
        """Heuristic: a non-ASCII letter (é, è, ç, …) suggests French.

        FIX: the original used regex character-class intersection
        ([\\p{Letter}&&...]), which the `regex` module only honours in V1
        mode — in default V0 mode the '&' is a literal, so the test was
        wrong. A plain ord() check is equivalent and dependency-free.
        """
        return any(c.isalpha() and ord(c) > 127 for c in token)

    @staticmethod
    def _top_suggestions(checker, word, limit=3):
        """Return up to `limit` suggestions, tolerating candidates() == None.

        FIX: recent pyspellchecker versions return None (not an empty set)
        when no candidates exist; list(None) raised TypeError.
        """
        cands = checker.candidates(word)
        return list(cands)[:limit] if cands else []

    def check_spelling(self, text):
        """Robust EN/FR spell check of OCR text.

        - Unicode-aware tokens (keeps accents)
        - Normalizes curly quotes
        - Heuristic per-token language (accented => FR; else EN)
        - Flags a token when unknown in its likely language, or unknown in
          both languages.
        Returns a list of issue dicts with 'word', 'lang' and suggestions.
        """
        try:
            text = unicodedata.normalize("NFKC", text)
            # FIX: the original replace() calls had mojibake'd curly quotes
            # that broke the source; use explicit Unicode escapes.
            text = (text.replace("\u2019", "'")
                        .replace("\u2018", "'")
                        .replace("\u201c", '"')
                        .replace("\u201d", '"'))

            tokens = _re.findall(
                TOKEN_PATTERN, text,
                flags=_re.UNICODE if _USE_REGEX else 0)

            issues = []
            for raw in tokens:
                t = raw.lower()
                # Skip very short tokens, short ALL-CAPS acronyms, and
                # whitelisted terms.
                if len(t) < 3:
                    continue
                if raw.isupper() and len(raw) <= 3:
                    continue
                if t in DOMAIN_WHITELIST:
                    continue

                miss_en = t in self.english_spellchecker.unknown([t])
                miss_fr = t in self.french_spellchecker.unknown([t])
                use_fr = self._likely_french(raw)

                # Prefer the likely language, but also flag tokens unknown
                # in both languages.
                if ((use_fr and miss_fr)
                        or ((not use_fr) and miss_en)
                        or (miss_en and miss_fr)):
                    issues.append({
                        "word": raw,
                        "lang": "fr" if use_fr else "en",
                        "suggestions_en": self._top_suggestions(
                            self.english_spellchecker, t),
                        "suggestions_fr": self._top_suggestions(
                            self.french_spellchecker, t),
                    })
            return issues
        except Exception as e:
            print(f"Error checking spelling: {e}")
            return []

    def annotate_spelling_errors_on_image(self, pil_image, misspelled):
        """Draw a red box around each misspelled word via Tesseract word boxes.

        `misspelled` is the list of dicts (with 'word' keys) produced by
        check_spelling(). Returns the (mutated) PIL image.
        """
        if not misspelled:
            return pil_image

        def _norm(s: str) -> str:
            # FIX: explicit \u2019 escape for the curly apostrophe (the
            # original source had a mojibake'd literal here).
            return (unicodedata.normalize("NFKC", s)
                    .replace("\u2019", "'")
                    .strip(".,:;!?)(")
                    .lower())

        miss_set = {_norm(m["word"]) for m in misspelled}
        img = pil_image
        try:
            data = pytesseract.image_to_data(
                img,
                lang="eng+fra",
                config="--oem 3 --psm 6",
                output_type=pytesseract.Output.DICT,
            )
        except Exception as e:
            print("image_to_data failed:", e)
            return img

        draw = ImageDraw.Draw(img)
        for i in range(len(data.get("text", []))):
            word = (data["text"][i] or "").strip()
            if not word:
                continue
            clean = _norm(word)
            if clean and clean in miss_set:
                x, y = data["left"][i], data["top"][i]
                w, h = data["width"][i], data["height"][i]
                draw.rectangle([x, y, x + w, y + h], outline="red", width=4)
        return img

    def detect_barcodes_qr_codes(self, image):
        """Detect and decode barcodes / QR codes on a PIL page image.

        Returns a list of {'type', 'data', 'rect'} dicts (empty on failure).
        """
        try:
            opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
            barcodes = []
            for obj in decode(opencv_image):
                barcodes.append({
                    'type': obj.type,
                    'data': obj.data.decode('utf-8'),
                    'rect': obj.rect,
                })
            return barcodes
        except Exception as e:
            print(f"Error detecting barcodes: {str(e)}")
            return []

    def compare_colors(self, image1, image2):
        """Locate visual differences between two page images via SSIM.

        Both images are resized to their common minimum dimensions, compared
        in grayscale, and contours of the thresholded difference map larger
        than 100 px² are returned as {'x','y','width','height','area'} dicts.
        """
        try:
            img1 = np.array(image1)
            img2 = np.array(image2)

            # Resize both to the smaller common dimensions.
            height = min(img1.shape[0], img2.shape[0])
            width = min(img1.shape[1], img2.shape[1])
            img1_resized = cv2.resize(img1, (width, height))
            img2_resized = cv2.resize(img2, (width, height))

            gray1 = cv2.cvtColor(img1_resized, cv2.COLOR_RGB2GRAY)
            gray2 = cv2.cvtColor(img2_resized, cv2.COLOR_RGB2GRAY)

            # full=True also returns the per-pixel similarity map.
            (score, diff) = ssim(gray1, gray2, full=True)
            diff = (diff * 255).astype("uint8")
            # Low similarity => difference; invert so differences are white.
            thresh = cv2.threshold(
                diff, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]

            contours, _ = cv2.findContours(
                thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            color_differences = []
            for contour in contours:
                area = cv2.contourArea(contour)
                if area > 100:  # filter out tiny noise regions
                    x, y, w, h = cv2.boundingRect(contour)
                    color_differences.append({
                        'x': x, 'y': y,
                        'width': w, 'height': h,
                        'area': area,
                    })
            return color_differences
        except Exception as e:
            print(f"Error comparing colors: {str(e)}")
            return []

    def create_annotated_image(self, image, differences, output_path):
        """Save a copy of `image` with red boxes drawn around `differences`."""
        try:
            annotated_image = image.copy()
            draw = ImageDraw.Draw(annotated_image)
            for diff in differences:
                x, y = diff['x'], diff['y']
                w, h = diff['width'], diff['height']
                draw.rectangle([x, y, x + w, y + h], outline='red', width=3)
            annotated_image.save(output_path)
        except Exception as e:
            print(f"Error creating annotated image: {str(e)}")

    def compare_pdfs(self, pdf1_path, pdf2_path, session_id):
        """Main comparison entry point.

        Validates both PDFs, then for each common page (zip truncates to the
        shorter document) runs spell checking, spelling-annotated image
        generation, barcode/QR detection and colour comparison. Results and
        annotated PNGs are written under static/results/<session_id>/.
        Raises Exception("INVALID DOCUMENT") when validation fails.
        """
        try:
            # Validate both PDFs contain "50 Carroll".
            if not self.validate_pdf(pdf1_path):
                raise Exception("INVALID DOCUMENT")
            if not self.validate_pdf(pdf2_path):
                raise Exception("INVALID DOCUMENT")

            pdf1_data = self.extract_text_from_pdf(pdf1_path)
            pdf2_data = self.extract_text_from_pdf(pdf2_path)

            results = {
                'session_id': session_id,
                'validation': {
                    'pdf1_valid': True,
                    'pdf2_valid': True,
                    'validation_text': '50 Carroll',
                },
                'text_comparison': [],
                'spelling_issues': [],
                'barcodes_qr_codes': [],
                'color_differences': [],
                'annotated_images': [],
            }

            # NOTE: zip() stops at the shorter PDF; extra pages are ignored.
            for i, (page1, page2) in enumerate(zip(pdf1_data, pdf2_data)):
                page_results = {
                    'page': i + 1,
                    'text_differences': [],
                    'spelling_issues_pdf1': [],
                    'spelling_issues_pdf2': [],
                    'barcodes_pdf1': [],
                    'barcodes_pdf2': [],
                    'color_differences': [],
                }

                # Spell check both PDFs' page text.
                page_results['spelling_issues_pdf1'] = self.check_spelling(
                    page1['text'])
                page_results['spelling_issues_pdf2'] = self.check_spelling(
                    page2['text'])

                # Spelling-only annotated images (one box per error).
                spell_dir = f'static/results/{session_id}'
                os.makedirs(spell_dir, exist_ok=True)
                spell_img1 = self.annotate_spelling_errors_on_image(
                    page1['image'].copy(),
                    page_results['spelling_issues_pdf1'])
                spell_img2 = self.annotate_spelling_errors_on_image(
                    page2['image'].copy(),
                    page_results['spelling_issues_pdf2'])
                spell_img1.save(f'{spell_dir}/page_{i+1}_pdf1_spelling.png')
                spell_img2.save(f'{spell_dir}/page_{i+1}_pdf2_spelling.png')

                # Barcodes / QR codes.
                page_results['barcodes_pdf1'] = self.detect_barcodes_qr_codes(
                    page1['image'])
                page_results['barcodes_pdf2'] = self.detect_barcodes_qr_codes(
                    page2['image'])

                # Colour comparison and annotated difference images.
                color_diffs = self.compare_colors(
                    page1['image'], page2['image'])
                page_results['color_differences'] = color_diffs

                if color_diffs:
                    output_dir = f'static/results/{session_id}'
                    os.makedirs(output_dir, exist_ok=True)
                    annotated_path1 = (
                        f'{output_dir}/page_{i+1}_pdf1_annotated.png')
                    annotated_path2 = (
                        f'{output_dir}/page_{i+1}_pdf2_annotated.png')
                    self.create_annotated_image(
                        page1['image'], color_diffs, annotated_path1)
                    self.create_annotated_image(
                        page2['image'], color_diffs, annotated_path2)
                    page_results['annotated_images'] = {
                        'pdf1': f'results/{session_id}/page_{i+1}_pdf1_annotated.png',
                        'pdf2': f'results/{session_id}/page_{i+1}_pdf2_annotated.png',
                        'pdf1_spelling': f'results/{session_id}/page_{i+1}_pdf1_spelling.png',
                        'pdf2_spelling': f'results/{session_id}/page_{i+1}_pdf2_spelling.png',
                    }
                else:
                    # No colour differences: still expose the spelling images.
                    page_results['annotated_images'] = {
                        'pdf1_spelling': f'results/{session_id}/page_{i+1}_pdf1_spelling.png',
                        'pdf2_spelling': f'results/{session_id}/page_{i+1}_pdf2_spelling.png',
                    }

                # Summarize spelling issues as a text difference entry.
                if (page_results['spelling_issues_pdf1']
                        or page_results['spelling_issues_pdf2']):
                    page_results['text_differences'].append({
                        'type': 'spelling',
                        'pdf1_issues': len(page_results['spelling_issues_pdf1']),
                        'pdf2_issues': len(page_results['spelling_issues_pdf2']),
                        'details': {
                            'pdf1': [issue['word'] for issue in
                                     page_results['spelling_issues_pdf1']],
                            'pdf2': [issue['word'] for issue in
                                     page_results['spelling_issues_pdf2']],
                        },
                    })

                results['text_comparison'].append(page_results)

            # Aggregate spelling issues across pages.
            all_spelling_issues = []
            for page in results['text_comparison']:
                all_spelling_issues.extend(page['spelling_issues_pdf1'])
                all_spelling_issues.extend(page['spelling_issues_pdf2'])
            results['spelling_issues'] = all_spelling_issues

            # Aggregate barcodes and QR codes across pages.
            all_barcodes = []
            for page in results['text_comparison']:
                all_barcodes.extend(page['barcodes_pdf1'])
                all_barcodes.extend(page['barcodes_pdf2'])
            results['barcodes_qr_codes'] = all_barcodes

            return results
        except Exception as e:
            raise Exception(f"Error comparing PDFs: {str(e)}")