ProofCheck / pdf_comparator.py

Yaz Hobooti

Fix PDF processing: add poppler dev package and better error handling

bae9f7f 4 months ago

15.6 kB

	#!/usr/bin/env python3
	"""
	Gradio PDF Comparison Tool
	Upload two PDF files and get comprehensive analysis including differences, OCR, barcodes, and CMYK analysis.
	"""

	import os, sys, re, csv, json, io
	from dataclasses import dataclass
	from typing import List, Tuple, Optional
	import tempfile

	import numpy as np
	from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError
	from pdf2image import convert_from_path
	from skimage.measure import label, regionprops
	from skimage.morphology import dilation, rectangle
	import gradio as gr

	# Optional features: each dependency is probed once at import time. When the
	# import fails the name is bound to None and the matching HAS_* flag is False.
	try:
	    import pytesseract
	except Exception:
	    pytesseract = None
	HAS_OCR = pytesseract is not None

	try:
	    from spellchecker import SpellChecker
	except Exception:
	    SpellChecker = None
	HAS_SPELLCHECK = SpellChecker is not None

	try:
	    from pyzbar.pyzbar import decode as zbar_decode
	except Exception:
	    zbar_decode = None
	HAS_BARCODE = zbar_decode is not None

	# -------------------- Core Data --------------------
	@dataclass
	class Box:
	    """Axis-aligned rectangle in pixel coordinates plus an area value.

	    Field order is (y1, x1, y2, x2, area): the row coordinate comes before the
	    column coordinate for each corner.
	    """

	    y1: int  # top edge (row)
	    x1: int  # left edge (column)
	    y2: int  # bottom edge (row)
	    x2: int  # right edge (column)
	    area: int  # area supplied by whoever builds the box

	# -------------------- Helpers ----------------------
	def _is_pdf(path: str) -> bool:
	    """Return True when the extension of *path* is ``.pdf`` in any letter case."""
	    _, extension = os.path.splitext(path.lower())
	    return extension == ".pdf"

	def load_first_page(path: str, dpi: int = 300) -> Image.Image:
	    """Load *path* as an RGB image, rendering only page 1 when it is a PDF.

	    PDFs are rendered with ``convert_from_path`` at *dpi*: first with
	    ``poppler_path="/usr/bin"``, then, if that attempt raises, again without an
	    explicit poppler path. Any other file is opened directly with PIL.

	    Raises:
	        ValueError: when both PDF rendering attempts fail; the message carries
	            the text of both underlying errors.
	    """
	    if not _is_pdf(path):
	        return Image.open(path).convert("RGB")

	    def render(**poppler_options):
	        # Render just the first page; an empty result counts as a failure.
	        pages = convert_from_path(path, dpi=dpi, first_page=1, last_page=1, **poppler_options)
	        if not pages:
	            raise ValueError(f"No pages in PDF: {path}")
	        return pages[0].convert("RGB")

	    try:
	        # Try with poppler_path explicitly set
	        return render(poppler_path="/usr/bin")
	    except Exception as e1:
	        try:
	            # Fallback: try without explicit poppler_path
	            return render()
	        except Exception as e2:
	            raise ValueError(f"Failed to convert PDF to image. Error 1: {str(e1)}. Error 2: {str(e2)}. Make sure poppler-utils is installed.")

	def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
	    """Crop both images to a common size so they can be compared pixel by pixel.

	    Images whose sizes already agree are returned untouched. Otherwise each is
	    cropped, anchored at (0, 0), to the smaller width and the smaller height.
	    """
	    if a.size != b.size:
	        shared_width = min(a.width, b.width)
	        shared_height = min(a.height, b.height)
	        region = (0, 0, shared_width, shared_height)
	        a, b = a.crop(region), b.crop(region)
	    return a, b

	def difference_map(a: Image.Image, b: Image.Image) -> Image.Image:
	    """Return the image produced by ``ImageChops.difference`` for *a* and *b*."""
	    delta = ImageChops.difference(a, b)
	    return delta

	def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int = 25) -> List[Box]:
	    """Find bounding boxes of the connected regions in a difference image.

	    A pixel counts as changed when its largest channel value is at least
	    *threshold*. The changed mask is dilated with a 3x3 rectangle, labelled
	    with ``connectivity=2``, and regions whose area is below *min_area* are
	    dropped.

	    Returns:
	        One ``Box`` per remaining region, built from the region's bounding box
	        and its area.
	    """
	    channels = np.asarray(diff_img).astype(np.uint16)
	    # Collapse the channel axis, keeping the strongest difference per pixel.
	    strongest = channels.max(axis=2).astype(np.uint8)
	    changed = (strongest >= threshold).astype(np.uint8)
	    changed = dilation(changed, rectangle(3, 3))
	    regions = regionprops(label(changed, connectivity=2))
	    return [
	        Box(*region.bbox, int(region.area))
	        for region in regions
	        if region.area >= min_area
	    ]

	def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box], green_boxes: List[Box] = None,
	                     width: int = 3, red_labels: List[int] = None) -> Image.Image:
	    """Return a copy of *img* with red, cyan and green box outlines drawn on it.

	    Drawing order is: red outlines, red labels, cyan outlines, green outlines.
	    When *red_labels* is non-empty every red box gets a small white tag with
	    its label; boxes beyond the end of *red_labels* fall back to their
	    1-based position. *img* itself is not modified.
	    """
	    canvas = img.copy()
	    pen = ImageDraw.Draw(canvas)

	    def outline(boxes, colour):
	        # Thickness comes from stacking *width* rectangles, each one pixel
	        # larger on every side than the previous one.
	        for box in boxes:
	            for grow in range(width):
	                pen.rectangle(
	                    [box.x1 - grow, box.y1 - grow, box.x2 + grow, box.y2 + grow],
	                    outline=colour,
	                )

	    # red (diff)
	    outline(red_boxes, (255, 0, 0))

	    # labels for red boxes
	    if red_labels:
	        for position, box in enumerate(red_boxes):
	            if position < len(red_labels):
	                caption = str(red_labels[position])
	            else:
	                caption = str(position + 1)
	            text_x = max(0, box.x1 + 3)
	            text_y = max(0, box.y1 + 3)
	            pen.rectangle([text_x - 2, text_y - 2, text_x + 14, text_y + 14], fill=(255, 255, 255))
	            pen.text((text_x, text_y), caption, fill=(0, 0, 0))

	    # cyan (misspellings)
	    outline(cyan_boxes, (0, 255, 255))

	    # green (barcodes)
	    if green_boxes:
	        outline(green_boxes, (0, 255, 0))

	    return canvas

	def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
	    """Return a copy of *a* in which every pixel that differs from *b* is pure red."""
	    painted = np.array(a)  # writable copy of the first image's pixels
	    reference = np.asarray(b)
	    # A pixel differs when any of its channels differs.
	    differs = np.any(painted != reference, axis=2)
	    painted[differs] = [255, 0, 0]
	    return Image.fromarray(painted)

	# -------------------- OCR + Spellcheck -------------
	def normalize_token(token: str) -> str:
	    """Drop everything except ASCII letters and apostrophes, then lowercase."""
	    return re.sub(r"[^A-Za-z']", "", token).lower()

	def find_misspell_boxes(img: Image.Image) -> List[Box]:
	    """Return boxes around OCR'd words that the spell checker does not know.

	    Returns an empty list when OCR or spell checking is unavailable, or when
	    creating the spell checker or running OCR raises. Words are normalised
	    with ``normalize_token``; tokens shorter than two characters and boxes
	    with a non-positive width or height are skipped.
	    """
	    if not HAS_OCR or not HAS_SPELLCHECK:
	        return []
	    try:
	        spell = SpellChecker()
	        data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
	    except Exception:
	        return []

	    flagged: List[Box] = []
	    for i, raw_text in enumerate(data.get("text", [])):
	        if not raw_text:
	            continue
	        word = normalize_token(raw_text)
	        if len(word) < 2 or word in spell:
	            continue
	        left = data.get("left", [0])[i]
	        top = data.get("top", [0])[i]
	        box_w = data.get("width", [0])[i]
	        box_h = data.get("height", [0])[i]
	        if box_w <= 0 or box_h <= 0:
	            continue
	        flagged.append(Box(top, left, top + box_h, left + box_w, box_w * box_h))
	    return flagged

	# -------------------- Barcode / QR -----------------
	def ean_like_checksum_ok(digits: str) -> bool:
	    """Check the trailing check digit of an 8-, 12- or 13-digit code.

	    Args:
	        digits: candidate code; must consist only of decimal digits.

	    Returns:
	        False when *digits* is empty or contains a non-decimal character.
	        True when the length is not 8, 12 or 13 (no checksum is applied to
	        other lengths). Otherwise True only if the last digit matches the
	        checksum computed from the preceding digits.
	    """
	    # isdecimal() (not isdigit()) so that characters such as superscript
	    # digits, which int() cannot convert, are rejected instead of raising.
	    if not digits.isdecimal():
	        return False
	    if len(digits) not in (8, 12, 13):
	        return True
	    *body, check = (int(c) for c in digits)
	    # Weights alternate 3, 1, 3, ... starting from the digit next to the check
	    # digit; this yields the same left-to-right weights for all three lengths.
	    weighted = sum(
	        digit * (3 if offset % 2 == 0 else 1)
	        for offset, digit in enumerate(reversed(body))
	    )
	    return (10 - weighted % 10) % 10 == check

	def validate_symbology(symbology: str, data: bytes) -> bool:
	    """Run a basic plausibility check on a decoded barcode payload.

	    Args:
	        symbology: symbology name as reported by the decoder; compared
	            case-insensitively, and None is treated as an empty name.
	        data: payload as bytes (decoded as UTF-8, undecodable bytes ignored).
	            A payload that is already a ``str`` is accepted as-is.

	    Returns:
	        For EAN-13 / EAN-8 / UPC-A names: the result of
	        ``ean_like_checksum_ok`` on the payload with non-digits removed.
	        For every other symbology: True when the payload text is non-empty.
	        False when the payload cannot be turned into text.
	    """
	    if isinstance(data, str):
	        # Already text: nothing to decode.
	        text = data
	    else:
	        try:
	            text = data.decode('utf-8', errors='ignore')
	        except Exception:
	            return False
	    sym = (symbology or '').upper()
	    if sym in ("EAN13", "EAN-13", "EAN8", "EAN-8", "UPCA", "UPC-A"):
	        return ean_like_checksum_ok(re.sub(r"\D", "", text))
	    # QR codes and all remaining symbologies only need a non-empty payload.
	    return len(text) > 0

	def boxes_from_rect(x: int, y: int, w: int, h: int) -> Box:
	    """Build a ``Box`` from a left/top/width/height rectangle; area is ``w * h``."""
	    bottom = y + h
	    right = x + w
	    return Box(y1=y, x1=x, y2=bottom, x2=right, area=w * h)

	def decode_with_variants(img: Image.Image):
	    """Decode barcodes from *img*, retrying on altered copies until one succeeds.

	    Variants are tried in order and only while nothing has been decoded yet:
	    the image as given, a grayscale ('L') copy, a copy upscaled to twice the
	    width and height with bicubic resampling, and finally an RGB copy when the
	    image is not already RGB.

	    Returns:
	        The list of decoder results, or an empty list when barcode support is
	        unavailable or nothing was decoded.
	    """
	    if not HAS_BARCODE:
	        return []
	    results = []

	    def do_decode(pil_img):
	        # Best effort: a variant the decoder fails on is simply skipped.
	        try:
	            dec = zbar_decode(pil_img)
	        except Exception:
	            return
	        if dec:
	            results.extend(dec)

	    do_decode(img)
	    if not results:
	        do_decode(img.convert('L'))
	    if not results:
	        # NOTE(review): rectangles decoded from this upscaled copy are
	        # presumably in upscaled coordinates; confirm and rescale if callers
	        # draw them on the original image.
	        do_decode(img.resize((img.width * 2, img.height * 2), Image.BICUBIC))
	    if not results and img.mode != 'RGB':
	        do_decode(img.convert('RGB'))
	    return results

	def find_barcode_boxes_and_info(img: Image.Image):
	    """Decode barcodes in *img* and describe each one.

	    Returns:
	        A ``(boxes, infos)`` pair of equal length. ``boxes`` holds one ``Box``
	        per decoded code; ``infos`` holds a dict per code with the keys
	        ``type``, ``data``, ``left``, ``top``, ``width``, ``height`` and
	        ``valid``.
	    """
	    boxes: List[Box] = []
	    infos = []
	    for hit in decode_with_variants(img):
	        area = hit.rect
	        boxes.append(boxes_from_rect(area.left, area.top, area.width, area.height))
	        is_valid = validate_symbology(hit.type, hit.data)
	        payload = hit.data
	        # Byte payloads are decoded leniently; anything else is stringified.
	        if isinstance(payload, (bytes, bytearray)):
	            payload_text = payload.decode('utf-8', errors='ignore')
	        else:
	            payload_text = str(payload)
	        infos.append({
	            'type': hit.type,
	            'data': payload_text,
	            'left': area.left,
	            'top': area.top,
	            'width': area.width,
	            'height': area.height,
	            'valid': bool(is_valid),
	        })
	    return boxes, infos

	# -------------------- CMYK Panel -------------------
	def rgb_to_cmyk_array(img: Image.Image) -> np.ndarray:
	    """Convert *img* to CMYK mode and return its pixels as a float32 array."""
	    cmyk_image = img.convert('CMYK')
	    return np.asarray(cmyk_image).astype(np.float32)  # 0..255

	def avg_cmyk_in_box(cmyk_arr: np.ndarray, box: Box) -> Tuple[float,float,float,float]:
	    """Average the four channels of *cmyk_arr* inside *box*, as percentages.

	    The box is clamped to the array bounds first. Each channel mean is scaled
	    by 100/255 and rounded to one decimal. An empty clamped box yields
	    ``(0.0, 0.0, 0.0, 0.0)``.
	    """
	    n_rows, n_cols = cmyk_arr.shape[0], cmyk_arr.shape[1]
	    top, bottom = max(0, box.y1), min(n_rows, box.y2)
	    left, right = max(0, box.x1), min(n_cols, box.x2)
	    if bottom <= top or right <= left:
	        return (0.0, 0.0, 0.0, 0.0)
	    patch = cmyk_arr[top:bottom, left:right, :]
	    channel_means = patch.reshape(-1, 4).mean(axis=0)
	    return tuple(float(round(v * 100.0 / 255.0, 1)) for v in channel_means)

	def compute_cmyk_diffs(a_img: Image.Image, b_img: Image.Image, red_boxes: List[Box]):
	    """Compare average CMYK values of both images inside every box.

	    Returns:
	        A list with one dict per box, in input order, holding ``idx`` (1-based
	        position), ``A`` and ``B`` (the per-image averages) and ``Delta``
	        (B minus A per channel, rounded to one decimal).
	    """
	    a_cmyk = rgb_to_cmyk_array(a_img)
	    b_cmyk = rgb_to_cmyk_array(b_img)
	    entries = []
	    for number, region in enumerate(red_boxes, start=1):
	        before = avg_cmyk_in_box(a_cmyk, region)
	        after = avg_cmyk_in_box(b_cmyk, region)
	        change = tuple(
	            round(after_value - before_value, 1)
	            for before_value, after_value in zip(before, after)
	        )
	        entries.append({'idx': number, 'A': before, 'B': after, 'Delta': change})
	    return entries

	def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns', panel_width: int = 260) -> Image.Image:
	    """Return *base* with a text panel of CMYK entries attached on its right.

	    The result is *panel_width* pixels wider than *base*. The panel shows
	    *title*, then either a "no differing regions" note or, per entry, four
	    lines (index, A values, B values, delta). Entries stop being drawn once
	    the text cursor gets within 40 pixels of the bottom edge.
	    """
	    w, h = base.size
	    out = Image.new('RGB', (w + panel_width, h), (255, 255, 255))
	    out.paste(base, (0, 0))
	    out.paste(Image.new('RGB', (panel_width, h), (245, 245, 245)), (w, 0))
	    d = ImageDraw.Draw(out)

	    x0 = w + 8
	    y = 8
	    d.text((x0, y), title, fill=(0, 0, 0))
	    y += 18

	    if not entries:
	        d.text((x0, y), 'No differing regions', fill=(80, 80, 80))
	        return out

	    black = (0, 0, 0)
	    for e in entries:
	        idx = e['idx']
	        aC, aM, aY, aK = e['A']
	        bC, bM, bY, bK = e['B']
	        dC, dM, dY, dK = e['Delta']
	        # (text, colour, vertical advance) for each line of this entry.
	        lines = (
	            (f"#{idx}", black, 14),
	            (f"A: C {aC}% M {aM}% Y {aY}% K {aK}%", black, 14),
	            (f"B: C {bC}% M {bM}% Y {bY}% K {bK}%", black, 14),
	            (f"Δ: C {dC}% M {dM}% Y {dY}% K {dK}%", (120, 0, 0), 18),
	        )
	        for text, colour, advance in lines:
	            d.text((x0, y), text, fill=colour)
	            y += advance
	        if y > h - 40:
	            break
	    return out

	# -------------------- Gradio Interface -----------------
	def _upload_path(upload):
	    """Return the filesystem path of an upload given as a path or as an object with ``.name``."""
	    if isinstance(upload, (str, os.PathLike)):
	        return os.fspath(upload)
	    return upload.name


	def _barcode_rows(infos):
	    """Flatten barcode info dicts into rows of type, data, left, top, width, height, valid."""
	    columns = (
	        ('type', ''), ('data', ''), ('left', 0), ('top', 0),
	        ('width', 0), ('height', 0), ('valid', False),
	    )
	    return [[info.get(key, default) for key, default in columns] for info in infos]


	def compare_pdfs(file_a, file_b):
	    """Main comparison function for Gradio interface

	    Args:
	        file_a: first upload, either a path or an object whose ``.name`` is
	            the path; None when nothing was uploaded.
	        file_b: second upload, same forms as *file_a*.

	    Returns:
	        A 6-tuple ``(overlay, a_disp, b_disp, status, codes_a, codes_b)``:
	        the pixel-difference overlay, both annotated images, a status text and
	        the barcode table rows for each file. When an upload is missing or any
	        step raises, the three images are None, the status carries the
	        message, and both row lists are empty.
	    """
	    try:
	        if file_a is None or file_b is None:
	            return None, None, None, "❌ Please upload both PDF files to compare", [], []

	        # Load images with default settings
	        a = load_first_page(_upload_path(file_a), dpi=300)
	        b = load_first_page(_upload_path(file_b), dpi=300)

	        # Match sizes
	        a, b = match_sizes(a, b)

	        # Find differences with default settings
	        diff = difference_map(a, b)
	        red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)

	        # Each helper returns empty results on its own when its optional
	        # dependency is unavailable, so no feature check is needed here.
	        misspell_a = find_misspell_boxes(a)
	        misspell_b = find_misspell_boxes(b)
	        bar_a, info_a = find_barcode_boxes_and_info(a)
	        bar_b, info_b = find_barcode_boxes_and_info(b)

	        # Always enable CMYK analysis
	        cmyk_entries = compute_cmyk_diffs(a, b, red_boxes)
	        labels = [e['idx'] for e in cmyk_entries]

	        # Create visualizations with default box width
	        a_boxed_core = draw_boxes_multi(a, red_boxes, misspell_a, bar_a, width=3, red_labels=labels)
	        b_boxed_core = draw_boxes_multi(b, red_boxes, misspell_b, bar_b, width=3, red_labels=labels)

	        # Always show CMYK panel
	        a_disp = draw_cmyk_panel(a_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
	        b_disp = draw_cmyk_panel(b_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')

	        # Create pixel difference overlay
	        overlay = make_red_overlay(a, b)

	        # Create status message
	        status = f"""
	📊 Analysis Complete!
	- Difference regions found: {len(red_boxes)}
	- Misspellings detected: A: {len(misspell_a)}, B: {len(misspell_b)}
	- Barcodes found: A: {len(bar_a)}, B: {len(bar_b)}
	- Image dimensions: {a.width} × {a.height} pixels

	Legend:
	- 🔴 Red boxes: Visual differences
	- 🔵 Cyan boxes: Spelling errors
	- 🟢 Green boxes: Barcodes/QR codes
	"""

	        # Prepare barcode data for tables
	        codes_a = _barcode_rows(info_a)
	        codes_b = _barcode_rows(info_b)

	        return overlay, a_disp, b_disp, status, codes_a, codes_b

	    except Exception as e:
	        # UI boundary: report the failure in the status field instead of raising.
	        error_msg = f"❌ Error: {str(e)}"
	        return None, None, None, error_msg, [], []

	# -------------------- Gradio App -------------------
	def create_demo():
	    """Build the Gradio Blocks app and return it without launching.

	    The page holds an intro text, two PDF upload fields, a compare button, a
	    status area, the difference overlay, the two annotated images, two barcode
	    tables and a closing instructions text. The button click runs
	    ``compare_pdfs`` on the two uploads and fills the six output components.
	    """
	    intro_text = """
	# 🔍 Advanced PDF Comparison Tool

	Upload two PDF files to get comprehensive analysis including:
	- Visual differences with bounding boxes
	- OCR and spell checking
	- Barcode/QR code detection
	- CMYK color analysis
	"""
	    help_text = """
	### 📝 Instructions:
	1. Upload two PDF files
	2. Click "Compare PDF Files"
	3. View results with comprehensive analysis

	### 🎨 Color Legend:
	- 🔴 Red boxes: Visual differences between files
	- 🔵 Cyan boxes: Potential spelling errors (OCR)
	- 🟢 Green boxes: Detected barcodes/QR codes
	- 📊 Side panel: CMYK color analysis for print workflows
	"""

	    def barcode_table(table_label):
	        # Read-only results table; created inside whichever layout context is active.
	        return gr.Dataframe(
	            headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
	            label=table_label,
	            interactive=False
	        )

	    with gr.Blocks(title="PDF Comparison Tool", theme=gr.themes.Soft()) as demo:
	        gr.Markdown(intro_text)

	        with gr.Row():
	            with gr.Column():
	                file_a = gr.File(label="📄 PDF A (Reference)", file_types=[".pdf"])
	                file_b = gr.File(label="📄 PDF B (Comparison)", file_types=[".pdf"])

	        compare_btn = gr.Button("🔍 Compare PDF Files", variant="primary", size="lg")

	        status_md = gr.Markdown("")

	        with gr.Row():
	            overlay_img = gr.Image(label="🔴 Pixel Differences (Red = Different)", type="pil")

	        with gr.Row():
	            img_a = gr.Image(label="📄 File A with Analysis", type="pil")
	            img_b = gr.Image(label="📄 File B with Analysis", type="pil")

	        gr.Markdown("### 📊 Barcode Detection Results")
	        with gr.Row():
	            codes_a_df = barcode_table("Barcodes in File A")
	            codes_b_df = barcode_table("Barcodes in File B")

	        # Event handlers: output order matches the tuple returned by compare_pdfs.
	        result_components = [overlay_img, img_a, img_b, status_md, codes_a_df, codes_b_df]
	        compare_btn.click(
	            fn=compare_pdfs,
	            inputs=[file_a, file_b],
	            outputs=result_components
	        )

	        gr.Markdown(help_text)

	    return demo

	if __name__ == "__main__":
	    # Script entry point: build the interface, then start the server.
	    launch_options = {
	        "server_name": "0.0.0.0",  # Allow external access
	        "share": True,  # Set to True to create a public link
	        "show_error": True,
	    }
	    demo = create_demo()
	    demo.launch(**launch_options)