# ProofCheck / pdf_comparator.py
# Author: Yaz Hobooti
# Last change (commit e7a28e8): Increase PDF resolution: DPI from 300 to 600,
# scaling factors improved for better OCR and barcode detection.
# (File size at that revision: 22.8 kB)
import os
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import pytesseract
from pdf2image import convert_from_path
from pyzbar.pyzbar import decode
from spellchecker import SpellChecker
import nltk
from skimage.metrics import structural_similarity as ssim
from skimage import color
import json
import tempfile
import shutil
import unicodedata
import regex as re
# Domain-specific whitelist for spell checking (units, packaging terms,
# brand names).  All entries are lowercase so membership tests can simply
# use token.lower(); the previous version listed "tytann"/"dome"/"drops"
# twice and re-lowered an already-lowercase set.
DOMAIN_WHITELIST = {
    # units / abbreviations
    "mg", "mg/g", "ml", "g", "thc", "cbd", "tcm", "mct",
    # common packaging terms / bilingual words you expect
    "gouttes", "tennir", "net", "zoom", "drops",
    # brand or proper names you want to ignore completely
    "purified", "brands", "tytann", "dome",
}
# Safe import for regex with fallback
try:
import regex as _re
_USE_REGEX = True
except ImportError:
import re as _re
_USE_REGEX = False
TOKEN_PATTERN = r"(?:\p{L})(?:[\p{L}'-]{1,})" if _USE_REGEX else r"[A-Za-z][A-Za-z'-]{1,}"
class PDFComparator:
    def __init__(self):
        """Set up English and French spell checkers (seeded with the domain
        whitelist) and make sure the NLTK punkt tokenizer data is present."""
        self.english_spellchecker = SpellChecker(language='en')
        self.french_spellchecker = SpellChecker(language='fr')
        # Teach both checkers the domain vocabulary so units, packaging
        # terms and brand names are never flagged as misspellings.
        for term in DOMAIN_WHITELIST:
            self.english_spellchecker.word_frequency.add(term)
            self.french_spellchecker.word_frequency.add(term)
        # Download the punkt tokenizer only when it is not already installed.
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')
def enhance_image_for_tiny_fonts(self, image):
"""Enhance image specifically for tiny font OCR"""
try:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
enhanced = clahe.apply(gray)
denoised = cv2.bilateralFilter(enhanced, 9, 75, 75)
gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
unsharp_mask = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)
thresh = cv2.adaptiveThreshold(unsharp_mask, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
return cleaned
except Exception as e:
print(f"Error enhancing image for tiny fonts: {str(e)}")
return image
def create_inverted_image(self, image):
"""Create inverted image for white text on dark backgrounds"""
try:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
inverted = cv2.bitwise_not(gray)
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
enhanced = clahe.apply(inverted)
_, thresh = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
return thresh
except Exception as e:
print(f"Error creating inverted image: {str(e)}")
return image
def extract_color_channels(self, image):
"""Extract text from different color channels"""
try:
# RGB channels
b, g, r = cv2.split(image)
# HSV channels
hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
h, s, v = cv2.split(hsv)
# LAB channels
lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
l, a, b_lab = cv2.split(lab)
channels = [r, g, b, v, l]
texts = []
for channel in channels:
_, thresh = cv2.threshold(channel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
text = pytesseract.image_to_string(thresh, config='--oem 3 --psm 6')
if text.strip():
texts.append(text)
return texts
except Exception as e:
print(f"Error extracting color channels: {str(e)}")
return []
def create_edge_enhanced_image(self, image):
"""Create edge-enhanced image for text detection"""
try:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
edges = cv2.Canny(gray, 50, 150)
kernel = np.ones((2,2), np.uint8)
dilated = cv2.dilate(edges, kernel, iterations=1)
inverted = cv2.bitwise_not(dilated)
return inverted
except Exception as e:
print(f"Error creating edge-enhanced image: {str(e)}")
return image
def ocr_with_multiple_configs(self, image):
"""Run OCR with multiple configurations and return best result"""
configs = [
'--oem 3 --psm 6', # Uniform block of text
'--oem 3 --psm 8', # Single word
'--oem 3 --psm 13', # Raw line
'--oem 1 --psm 6', # LSTM + Uniform block
'--oem 3 --psm 3', # Fully automatic page segmentation
]
best_text = ""
best_length = 0
for config in configs:
try:
text = pytesseract.image_to_string(image, config=config)
if len(text.strip()) > best_length:
best_text = text
best_length = len(text.strip())
except Exception as e:
print(f"OCR config {config} failed: {str(e)}")
continue
return best_text
def extract_multi_color_text(self, image):
"""Extract text using multiple preprocessing methods"""
texts = []
# Method 1: Standard black text
enhanced = self.enhance_image_for_tiny_fonts(image)
text1 = self.ocr_with_multiple_configs(enhanced)
if text1.strip():
texts.append(text1)
# Method 2: Inverted text (white on dark)
inverted = self.create_inverted_image(image)
text2 = self.ocr_with_multiple_configs(inverted)
if text2.strip():
texts.append(text2)
# Method 3: Color channel separation
color_texts = self.extract_color_channels(image)
texts.extend(color_texts)
# Method 4: Edge-enhanced
edge_enhanced = self.create_edge_enhanced_image(image)
text4 = self.ocr_with_multiple_configs(edge_enhanced)
if text4.strip():
texts.append(text4)
# Combine all texts and return the best one
combined_text = " ".join(texts)
return combined_text
def validate_pdf(self, pdf_path):
"""Validate that PDF contains '50 Carroll' using enhanced OCR"""
try:
# Multiple DPI settings for better detection
dpi_settings = [200, 300, 400]
for dpi in dpi_settings:
try:
images = convert_from_path(pdf_path, dpi=dpi)
for page_num, image in enumerate(images):
# Convert PIL image to OpenCV format
opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
# Enhanced text extraction
text = self.extract_multi_color_text(opencv_image)
# Check for "50 Carroll" with multiple patterns
patterns = ["50 Carroll", "50 carroll", "50Carroll", "50 carroll"]
for pattern in patterns:
if pattern in text:
return True
# Also try standard OCR as fallback
standard_text = pytesseract.image_to_string(opencv_image, config='--oem 3 --psm 6')
for pattern in patterns:
if pattern in standard_text:
return True
except Exception as e:
print(f"DPI {dpi} failed: {str(e)}")
continue
return False
except Exception as e:
raise Exception(f"Error validating PDF: {str(e)}")
def extract_text_from_pdf(self, pdf_path):
"""Extract text from PDF using enhanced OCR"""
try:
# Use higher DPI for better text extraction
images = convert_from_path(pdf_path, dpi=300)
all_text = []
for page_num, image in enumerate(images):
opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
# Enhanced text extraction
text = self.extract_multi_color_text(opencv_image)
# Fallback to standard OCR if enhanced extraction is empty
if not text.strip():
text = pytesseract.image_to_string(opencv_image, config='--oem 3 --psm 6')
all_text.append({
'page': page_num + 1,
'text': text,
'image': image
})
return all_text
except Exception as e:
raise Exception(f"Error extracting text from PDF: {str(e)}")
def _likely_french(self, token: str) -> bool:
"""Helper function to guess if a token is likely French"""
if _USE_REGEX:
# any Latin letter outside ASCII => probably FR (é, è, ç…)
return bool(_re.search(r"[\p{Letter}&&\p{Latin}&&[^A-Za-z]]", token))
# fallback: any non-ascii letter
return any((not ('a' <= c.lower() <= 'z')) and c.isalpha() for c in token)
def check_spelling(self, text):
"""
Robust EN/FR spell check:
- Unicode-aware tokens (keeps accents)
- Normalizes curly quotes/ligatures
- Heuristic per-token language (accented => FR; else EN)
- Flags if unknown in its likely language (not both)
"""
try:
text = unicodedata.normalize("NFKC", text)
text = text.replace("'", "'").replace(""", '"').replace(""", '"')
tokens = _re.findall(TOKEN_PATTERN, text, flags=_re.UNICODE if _USE_REGEX else 0)
issues = []
for raw in tokens:
t = raw.lower()
# skip very short, short ALL-CAPS acronyms, and whitelisted terms
if len(t) < 3:
continue
if raw.isupper() and len(raw) <= 3: # Changed from <=5 to <=3
continue
if t in DOMAIN_WHITELIST:
continue
miss_en = t in self.english_spellchecker.unknown([t])
miss_fr = t in self.french_spellchecker.unknown([t])
use_fr = self._likely_french(raw)
# Prefer the likely language, but fall back to "either language unknown"
if (use_fr and miss_fr) or ((not use_fr) and miss_en) or (miss_en and miss_fr):
issues.append({
"word": raw,
"lang": "fr" if use_fr else "en",
"suggestions_en": list(self.english_spellchecker.candidates(t))[:3],
"suggestions_fr": list(self.french_spellchecker.candidates(t))[:3],
})
return issues
except Exception as e:
print(f"Error checking spelling: {e}")
return []
def annotate_spelling_errors_on_image(self, pil_image, misspelled):
"""
Draw one red rectangle around each misspelled token using Tesseract word boxes.
'misspelled' must be a list of dicts with 'word' keys (from check_spelling).
"""
if not misspelled:
return pil_image
def _norm(s: str) -> str:
return unicodedata.normalize("NFKC", s).replace("'","'").strip(".,:;!?)(").lower()
miss_set = {_norm(m["word"]) for m in misspelled}
img = pil_image
try:
data = pytesseract.image_to_data(
img,
lang="eng+fra", # Added lang parameter
config="--oem 3 --psm 6",
output_type=pytesseract.Output.DICT,
)
except Exception as e:
print("image_to_data failed:", e)
return img
draw = ImageDraw.Draw(img)
n = len(data.get("text", []))
for i in range(n):
word = (data["text"][i] or "").strip()
if not word:
continue
clean = _norm(word) # Used _norm function
if clean and clean in miss_set:
x, y, w, h = data["left"][i], data["top"][i], data["width"][i], data["height"][i]
draw.rectangle([x, y, x + w, y + h], outline="red", width=4)
return img
def detect_barcodes_qr_codes(self, image):
"""Detect and decode barcodes and QR codes"""
try:
# Convert PIL image to OpenCV format
opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
# Decode barcodes and QR codes
decoded_objects = decode(opencv_image)
barcodes = []
for obj in decoded_objects:
barcode_info = {
'type': obj.type,
'data': obj.data.decode('utf-8'),
'rect': obj.rect
}
barcodes.append(barcode_info)
return barcodes
except Exception as e:
print(f"Error detecting barcodes: {str(e)}")
return []
def compare_colors(self, image1, image2):
"""Compare colors between two images and return differences"""
try:
# Convert images to same size
img1 = np.array(image1)
img2 = np.array(image2)
# Resize images to same dimensions
height = min(img1.shape[0], img2.shape[0])
width = min(img1.shape[1], img2.shape[1])
img1_resized = cv2.resize(img1, (width, height))
img2_resized = cv2.resize(img2, (width, height))
# Convert to grayscale for comparison
gray1 = cv2.cvtColor(img1_resized, cv2.COLOR_RGB2GRAY)
gray2 = cv2.cvtColor(img2_resized, cv2.COLOR_RGB2GRAY)
# Calculate structural similarity
(score, diff) = ssim(gray1, gray2, full=True)
# Convert difference to binary mask
diff = (diff * 255).astype("uint8")
thresh = cv2.threshold(diff, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
# Find contours of differences
contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
color_differences = []
for contour in contours:
if cv2.contourArea(contour) > 100: # Filter small differences
x, y, w, h = cv2.boundingRect(contour)
color_differences.append({
'x': x,
'y': y,
'width': w,
'height': h,
'area': cv2.contourArea(contour)
})
return color_differences
except Exception as e:
print(f"Error comparing colors: {str(e)}")
return []
def create_annotated_image(self, image, differences, output_path):
"""Create annotated image with red boxes around differences"""
try:
# Create a copy of the image
annotated_image = image.copy()
draw = ImageDraw.Draw(annotated_image)
# Draw red rectangles around differences
for diff in differences:
x, y, w, h = diff['x'], diff['y'], diff['width'], diff['height']
draw.rectangle([x, y, x + w, y + h], outline='red', width=3)
# Save annotated image
annotated_image.save(output_path)
except Exception as e:
print(f"Error creating annotated image: {str(e)}")
    def compare_pdfs(self, pdf1_path, pdf2_path, session_id):
        """Run the full proof-comparison pipeline on two PDFs.

        Per page pair: spell-checks both OCR texts, saves spelling-error
        annotation images, detects barcodes/QR codes, computes visual
        differences, and (when differences exist) saves color-difference
        annotation images.  Result images are written under
        static/results/<session_id>/.

        Args:
            pdf1_path: path of the first PDF.
            pdf2_path: path of the second PDF.
            session_id: namespace for the result-image directory.

        Returns:
            dict: validation info, per-page 'text_comparison' entries, and
            aggregated 'spelling_issues' / 'barcodes_qr_codes' lists.

        Raises:
            Exception: on validation failure or any pipeline error.
            NOTE(review): the "INVALID DOCUMENT" raise below is inside the
            outer try, so callers actually receive it re-wrapped as
            "Error comparing PDFs: INVALID DOCUMENT".
        """
        try:
            # Validate both PDFs contain the "50 Carroll" marker text.
            if not self.validate_pdf(pdf1_path):
                raise Exception("INVALID DOCUMENT")
            if not self.validate_pdf(pdf2_path):
                raise Exception("INVALID DOCUMENT")
            # Extract text and page images from both PDFs.
            pdf1_data = self.extract_text_from_pdf(pdf1_path)
            pdf2_data = self.extract_text_from_pdf(pdf2_path)
            # Initialize results skeleton.
            # NOTE(review): top-level 'color_differences' and
            # 'annotated_images' are initialized but never populated; the
            # per-page data lives inside each 'text_comparison' entry.
            results = {
                'session_id': session_id,
                'validation': {
                    'pdf1_valid': True,
                    'pdf2_valid': True,
                    'validation_text': '50 Carroll'
                },
                'text_comparison': [],
                'spelling_issues': [],
                'barcodes_qr_codes': [],
                'color_differences': [],
                'annotated_images': []
            }
            # Compare text and check spelling page-by-page.
            # NOTE(review): zip() silently truncates to the shorter PDF, so a
            # page-count mismatch between the two documents is not reported.
            for i, (page1, page2) in enumerate(zip(pdf1_data, pdf2_data)):
                page_results = {
                    'page': i + 1,
                    'text_differences': [],
                    'spelling_issues_pdf1': [],
                    'spelling_issues_pdf2': [],
                    'barcodes_pdf1': [],
                    'barcodes_pdf2': [],
                    'color_differences': []
                }
                # Check spelling for both PDFs' OCR text.
                page_results['spelling_issues_pdf1'] = self.check_spelling(page1['text'])
                page_results['spelling_issues_pdf2'] = self.check_spelling(page2['text'])
                # Create spelling-only annotated images (one red box per
                # flagged token); saved unconditionally for every page.
                spell_dir = f'static/results/{session_id}'
                os.makedirs(spell_dir, exist_ok=True)
                # Copies protect the extracted page images: the annotator
                # draws on the image it is given in place.
                spell_img1 = page1['image'].copy()
                spell_img2 = page2['image'].copy()
                spell_img1 = self.annotate_spelling_errors_on_image(spell_img1, page_results['spelling_issues_pdf1'])
                spell_img2 = self.annotate_spelling_errors_on_image(spell_img2, page_results['spelling_issues_pdf2'])
                spell_path1 = f'{spell_dir}/page_{i+1}_pdf1_spelling.png'
                spell_path2 = f'{spell_dir}/page_{i+1}_pdf2_spelling.png'
                spell_img1.save(spell_path1)
                spell_img2.save(spell_path2)
                # Detect barcodes and QR codes on both page images.
                page_results['barcodes_pdf1'] = self.detect_barcodes_qr_codes(page1['image'])
                page_results['barcodes_pdf2'] = self.detect_barcodes_qr_codes(page2['image'])
                # Compare the pages visually (SSIM-based difference boxes).
                color_diffs = self.compare_colors(page1['image'], page2['image'])
                page_results['color_differences'] = color_diffs
                # Create color-difference annotated images only when there
                # is at least one difference box to draw.
                if color_diffs:
                    output_dir = f'static/results/{session_id}'
                    os.makedirs(output_dir, exist_ok=True)
                    annotated_path1 = f'{output_dir}/page_{i+1}_pdf1_annotated.png'
                    annotated_path2 = f'{output_dir}/page_{i+1}_pdf2_annotated.png'
                    # NOTE(review): the same pdf1-vs-pdf2 difference boxes
                    # are drawn onto both images.
                    self.create_annotated_image(page1['image'], color_diffs, annotated_path1)
                    self.create_annotated_image(page2['image'], color_diffs, annotated_path2)
                    # Paths are relative to 'static/' — presumably for a web
                    # front-end URL; confirm against the serving code.
                    page_results['annotated_images'] = {
                        'pdf1': f'results/{session_id}/page_{i+1}_pdf1_annotated.png',
                        'pdf2': f'results/{session_id}/page_{i+1}_pdf2_annotated.png',
                        'pdf1_spelling': f'results/{session_id}/page_{i+1}_pdf1_spelling.png',
                        'pdf2_spelling': f'results/{session_id}/page_{i+1}_pdf2_spelling.png'
                    }
                else:
                    # No color differences: expose only the spelling images
                    # (which were saved above regardless).
                    page_results['annotated_images'] = {
                        'pdf1_spelling': f'results/{session_id}/page_{i+1}_pdf1_spelling.png',
                        'pdf2_spelling': f'results/{session_id}/page_{i+1}_pdf2_spelling.png'
                    }
                # Add a spelling-issues summary entry to text differences.
                if page_results['spelling_issues_pdf1'] or page_results['spelling_issues_pdf2']:
                    page_results['text_differences'].append({
                        'type': 'spelling',
                        'pdf1_issues': len(page_results['spelling_issues_pdf1']),
                        'pdf2_issues': len(page_results['spelling_issues_pdf2']),
                        'details': {
                            'pdf1': [issue['word'] for issue in page_results['spelling_issues_pdf1']],
                            'pdf2': [issue['word'] for issue in page_results['spelling_issues_pdf2']]
                        }
                    })
                results['text_comparison'].append(page_results)
            # Aggregate spelling issues across all pages (both PDFs).
            all_spelling_issues = []
            for page in results['text_comparison']:
                all_spelling_issues.extend(page['spelling_issues_pdf1'])
                all_spelling_issues.extend(page['spelling_issues_pdf2'])
            results['spelling_issues'] = all_spelling_issues
            # Aggregate barcodes and QR codes across all pages (both PDFs).
            all_barcodes = []
            for page in results['text_comparison']:
                all_barcodes.extend(page['barcodes_pdf1'])
                all_barcodes.extend(page['barcodes_pdf2'])
            results['barcodes_qr_codes'] = all_barcodes
            return results
        except Exception as e:
            raise Exception(f"Error comparing PDFs: {str(e)}")