ProofCheck / pdf_comparator.py

Yaz Hobooti

Implement text-based spell checking using PDF text extraction

ad98b73 4 months ago

27.5 kB

	#!/usr/bin/env python3
	"""
	Gradio PDF Comparison Tool
	Upload two PDF files and get comprehensive analysis including differences, OCR, barcodes, and CMYK analysis.
	"""

	import os, sys, re, csv, json, io
	from dataclasses import dataclass
	from typing import List, Tuple, Optional, Iterable
	import tempfile
	import unicodedata

	import numpy as np
	from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError
	from pdf2image import convert_from_path
	from skimage.measure import label, regionprops
	from skimage.morphology import dilation, rectangle
	import gradio as gr

	# Alternative PDF backend (PyMuPDF). Optional: the flag records whether it imported.
	try:
	    import fitz  # PyMuPDF
	except Exception:
	    fitz = None
	    HAS_PYMUPDF = False
	else:
	    HAS_PYMUPDF = True

	# Optional features. Each HAS_* flag mirrors the outcome of its import, and
	# the imported name is bound to None when the import failed.
	try:
	    import pytesseract
	except Exception:
	    pytesseract = None
	    HAS_OCR = False
	else:
	    HAS_OCR = True

	try:
	    from spellchecker import SpellChecker
	except Exception:
	    SpellChecker = None
	    HAS_SPELLCHECK = False
	else:
	    HAS_SPELLCHECK = True

	# The third-party `regex` module is preferred; the stdlib `re` module is
	# bound to the same name `re` when it is unavailable.
	try:
	    import regex as re
	except Exception:
	    import re
	    HAS_REGEX = False
	else:
	    HAS_REGEX = True

	try:
	    from pyzbar.pyzbar import decode as zbar_decode
	except Exception:
	    zbar_decode = None
	    HAS_BARCODE = False
	else:
	    HAS_BARCODE = True

	# -------------------- Core Data --------------------
	@dataclass
	class Box:
	    """Axis-aligned rectangle in image coordinates, stored row-first.

	    Positional order is (y1, x1, y2, x2, area); the drawing code in this
	    file reads (x1, y1) as the top-left corner and (x2, y2) as the
	    bottom-right corner.
	    """
	    y1: int  # top edge (row)
	    x1: int  # left edge (column)
	    y2: int  # bottom edge (row)
	    x2: int  # right edge (column)
	    area: int  # size reported by whoever built the box

	# ---- spell/tokenization helpers & caches ----
	# A word is a run of letters, optionally joined by apostrophes or hyphens.
	# \p{Letter} is only understood by the `regex` module, so the stdlib
	# fallback uses an ASCII-only character class.
	_WORD_RE = (
	    re.compile(r"\p{Letter}+(?:['\-]\p{Letter}+)*", re.UNICODE)
	    if HAS_REGEX
	    else re.compile(r"[A-Za-z]+(?:['\-][A-Za-z]+)*")
	)

	# Both dictionaries stay None when the spell checker package is missing;
	# only the French one is allowed to fail on its own.
	_SPELL_EN = None
	_SPELL_FR = None
	if HAS_SPELLCHECK:
	    _SPELL_EN = SpellChecker(language="en")
	    try:
	        _SPELL_FR = SpellChecker(language="fr")
	    except Exception:
	        _SPELL_FR = None

	# Project/brand vocabulary that must never be flagged as a misspelling.
	_DOMAIN_ALLOWLIST = {
	    "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
	    "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid"
	}
	_DOMAIN_ALLOWLIST_LOWER = set(map(str.lower, _DOMAIN_ALLOWLIST))

	# Teach every available dictionary the allowlisted words.
	if _SPELL_EN:
	    _SPELL_EN.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
	if _SPELL_FR:
	    _SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)

	def _normalize_text(s: str) -> str:
	    """Return *s* in NFC form with typographic apostrophes made ASCII.

	    The right single quotation mark (U+2019) is replaced by "'" so that
	    words such as "l'eau" tokenize the same way whichever apostrophe the
	    document uses. Leading and trailing whitespace is stripped.
	    """
	    s = unicodedata.normalize("NFC", s)
	    # Written as an escape so the character cannot be silently turned into
	    # a plain quote again (which would make this replace a no-op).
	    return s.replace("\u2019", "'").strip()

	def _extract_tokens(raw: str):
	    """Return every word-like token of *raw* (None or "" yields no tokens)."""
	    cleaned = _normalize_text(raw if raw else "")
	    return _WORD_RE.findall(cleaned)

	def _looks_like_acronym(tok: str) -> bool:
	    """Return True for an all-uppercase token of 2 to 6 characters."""
	    if not tok.isupper():
	        return False
	    return 2 <= len(tok) <= 6

	def _has_digits(tok: str) -> bool:
	    """Return True when at least one character of *tok* is a digit."""
	    for ch in tok:
	        if ch.isdigit():
	            return True
	    return False

	def _is_known_word(tok: str) -> bool:
	    """Return True when *tok* should not be reported as a misspelling.

	    A token is accepted when it is allowlisted, looks like an acronym,
	    contains a digit, is a hyphenated compound whose parts are ALL
	    accepted, or is known to the English or French dictionary.
	    """
	    lowered = tok.lower()
	    if lowered in _DOMAIN_ALLOWLIST_LOWER:
	        return True
	    if _looks_like_acronym(tok) or _has_digits(tok):
	        return True

	    # Hyphenated compound: every part must be acceptable on its own.
	    if '-' in tok and all(_is_known_word(piece) for piece in tok.split('-')):
	        return True

	    # English is consulted first, then French; a missing dictionary is skipped.
	    for checker in (_SPELL_EN, _SPELL_FR):
	        if checker and not checker.unknown([lowered]):
	            return True
	    return False

	# (optional) keep a compatibility shim so any other code calling normalize_token() won't break
	def normalize_token(token: str) -> str:
	    """Compatibility shim: lowercase first word found in *token*, or ""."""
	    words = _extract_tokens(token)
	    if not words:
	        return ""
	    return words[0].lower()

	# -------------------- Helpers ----------------------
	def _is_pdf(path: str) -> bool:
	    """Return True when the extension of *path* is ".pdf", ignoring case."""
	    _stem, extension = os.path.splitext(path.lower())
	    return extension == ".pdf"

	def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
	    """Rasterise the first *max_pages* pages of *path* as RGB images.

	    PDFs are converted with pdf2image, trying several poppler locations and
	    finally pdf2image's own default lookup; if every attempt fails, PyMuPDF
	    is used when it is installed. Any other file is opened as a single image.

	    Args:
	        path: File to load; treated as a PDF when its extension is ".pdf".
	        dpi: Rendering resolution for PDF pages.
	        max_pages: Upper bound on the number of pages rendered.

	    Returns:
	        One RGB image per rendered page (a one-element list for non-PDFs).

	    Raises:
	        ValueError: If no backend could render the PDF, or it has no pages.
	    """
	    if not _is_pdf(path):
	        return [Image.open(path).convert("RGB")]

	    # None means "let pdf2image locate poppler itself" and is tried last.
	    poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
	    pdf2image_error: Optional[Exception] = None

	    for poppler_path in poppler_paths:
	        options = {"dpi": dpi, "first_page": 1, "last_page": max_pages}
	        if poppler_path:
	            options["poppler_path"] = poppler_path
	        try:
	            imgs = convert_from_path(path, **options)
	        except Exception as exc:
	            # Keep the real failure so the final message can report it.
	            pdf2image_error = exc
	            continue
	        if imgs:
	            return [img.convert("RGB") for img in imgs]

	    pdf2image_reason = pdf2image_error if pdf2image_error is not None else "no pages returned"

	    if not HAS_PYMUPDF:
	        raise ValueError(
	            "Failed to convert PDF to image with all poppler paths. "
	            f"Last error: {pdf2image_reason}. PyMuPDF not available as fallback."
	        )

	    # Fallback to PyMuPDF if pdf2image fails
	    try:
	        doc = fitz.open(path)
	        try:
	            zoom = dpi / 72  # PDF user space is 72 units per inch
	            matrix = fitz.Matrix(zoom, zoom)
	            pages = []
	            for page_num in range(min(len(doc), max_pages)):
	                pix = doc[page_num].get_pixmap(matrix=matrix)
	                rendered = Image.open(io.BytesIO(pix.tobytes("ppm")))
	                pages.append(rendered.convert("RGB"))
	        finally:
	            # Release the document even when rendering a page fails.
	            doc.close()
	    except Exception as e:
	        raise ValueError(
	            "Failed to convert PDF with both pdf2image and PyMuPDF. "
	            f"pdf2image error: {pdf2image_reason}. PyMuPDF error: {str(e)}"
	        ) from e

	    if not pages:
	        raise ValueError(f"No pages in PDF: {path}")
	    return pages

	def combine_pages_vertically(pages: List[Image.Image], spacing: int = 20) -> Image.Image:
	    """Stack *pages* top to bottom on a white canvas.

	    Pages are separated by *spacing* pixels and centred horizontally on a
	    canvas as wide as the widest page. A single page is returned as is.

	    Raises:
	        ValueError: If *pages* is empty.
	    """
	    if not pages:
	        raise ValueError("No pages to combine")
	    if len(pages) == 1:
	        return pages[0]

	    canvas_width = max(pg.width for pg in pages)
	    gaps = spacing * (len(pages) - 1)
	    canvas_height = sum(pg.height for pg in pages) + gaps
	    canvas = Image.new('RGB', (canvas_width, canvas_height), (255, 255, 255))

	    top = 0
	    for pg in pages:
	        left = (canvas_width - pg.width) // 2  # centre narrower pages
	        canvas.paste(pg, (left, top))
	        top += pg.height + spacing

	    return canvas

	def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
	    """Crop both images, from the top-left corner, to their common size."""
	    if a.size != b.size:
	        common_w = min(a.width, b.width)
	        common_h = min(a.height, b.height)
	        region = (0, 0, common_w, common_h)
	        return a.crop(region), b.crop(region)
	    return a, b

	def difference_map(a: Image.Image, b: Image.Image) -> Image.Image:
	    """Return the image produced by ``ImageChops.difference`` for *a* and *b*."""
	    delta = ImageChops.difference(a, b)
	    return delta

	def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int = 25) -> List[Box]:
	    """Return bounding boxes of the connected regions that differ.

	    A pixel counts as changed when its largest channel value in *diff_img*
	    is at least *threshold*. The mask is dilated with a 3x3 footprint,
	    labelled with 8-connectivity, and regions smaller than *min_area*
	    pixels are dropped.
	    """
	    channels = np.asarray(diff_img).astype(np.uint16)
	    strongest = channels.max(axis=2).astype(np.uint8)
	    changed = (strongest >= threshold).astype(np.uint8)
	    changed = dilation(changed, rectangle(3, 3))
	    regions = regionprops(label(changed, connectivity=2))
	    # region.bbox is (min_row, min_col, max_row, max_col), matching Box's order.
	    return [
	        Box(*region.bbox, int(region.area))
	        for region in regions
	        if not region.area < min_area
	    ]

	def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box], green_boxes: List[Box] = None,
	                     width: int = 3, red_labels: List[int] = None) -> Image.Image:
	    """Return a copy of *img* with colour-coded rectangles drawn on it.

	    Red boxes mark differences (optionally numbered from *red_labels*),
	    cyan boxes mark misspellings and green boxes mark barcodes. *width*
	    is the border thickness in pixels.
	    """
	    annotated = img.copy()
	    pen = ImageDraw.Draw(annotated)

	    def outline(boxes, colour):
	        # Each pass draws a rectangle one pixel further out, thickening the border.
	        for bx in boxes:
	            for grow in range(width):
	                pen.rectangle([bx.x1 - grow, bx.y1 - grow, bx.x2 + grow, bx.y2 + grow], outline=colour)

	    # red (diff)
	    outline(red_boxes, (255, 0, 0))

	    # Number tags on the red boxes; boxes beyond the label list get their position.
	    if red_labels:
	        for pos, bx in enumerate(red_boxes):
	            caption = str(red_labels[pos]) if pos < len(red_labels) else str(pos + 1)
	            tx = max(0, bx.x1 + 3)
	            ty = max(0, bx.y1 + 3)
	            pen.rectangle([tx - 2, ty - 2, tx + 14, ty + 14], fill=(255, 255, 255))
	            pen.text((tx, ty), caption, fill=(0, 0, 0))

	    # cyan (misspellings)
	    outline(cyan_boxes, (0, 255, 255))

	    # green (barcodes)
	    if green_boxes:
	        outline(green_boxes, (0, 255, 0))
	    return annotated

	def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
	    """Return a copy of *a* in which every pixel that differs from *b* is pure red."""
	    base = np.asarray(a).copy()
	    other = np.asarray(b)
	    differs = np.any(base != other, axis=2)  # True where any channel differs
	    base[differs] = [255, 0, 0]
	    return Image.fromarray(base)

	# -------------------- OCR + Spellcheck -------------
	from typing import List, Iterable, Optional
	from PIL import Image
	import unicodedata

	# regex, pytesseract and pyspellchecker are optional. Importing them
	# unconditionally here would crash the whole module when one is missing,
	# so each import is guarded and its HAS_* flag reflects the real outcome
	# instead of defaulting to True.
	try:
	    import regex as re
	    HAS_REGEX = True
	except Exception:
	    import re
	    HAS_REGEX = False

	try:
	    import pytesseract
	    HAS_OCR = True
	except Exception:
	    pytesseract = None
	    HAS_OCR = False

	try:
	    from spellchecker import SpellChecker
	    HAS_SPELLCHECK = True
	except Exception:
	    SpellChecker = None
	    HAS_SPELLCHECK = False

	# ---- spell/tokenization helpers & caches ----
	# Words are runs of letters joined by apostrophes (ASCII or typographic)
	# or hyphens. \p{Letter} needs the `regex` module; stdlib `re` falls back
	# to an ASCII-only character class.
	if HAS_REGEX:
	    _WORD_RE = re.compile(r"\p{Letter}+(?:[’'\-]\p{Letter}+)*", re.UNICODE)
	else:
	    _WORD_RE = re.compile(r"[A-Za-z]+(?:[’'\-][A-Za-z]+)*")

	# Dictionaries are None when unavailable; the French one may fail on its own.
	_SPELL_EN = None
	_SPELL_FR = None
	if HAS_SPELLCHECK:
	    _SPELL_EN = SpellChecker(language="en")
	    try:
	        _SPELL_FR = SpellChecker(language="fr")
	    except Exception:
	        _SPELL_FR = None

	# Project/brand vocabulary that must never be flagged as a misspelling.
	_DOMAIN_ALLOWLIST = {
	    "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
	    "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid"
	}
	_DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}

	if _SPELL_EN:
	    _SPELL_EN.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
	if _SPELL_FR:
	    _SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)

	def _normalize_text(s: str) -> str:
	    """NFC-normalise *s*, turn the typographic apostrophe into "'" and strip it."""
	    composed = unicodedata.normalize("NFC", s)
	    straightened = composed.replace("’", "'")
	    return straightened.strip()

	def _extract_tokens(raw: str):
	    """Return the word tokens found in *raw*; a falsy *raw* is treated as ""."""
	    return _WORD_RE.findall(_normalize_text(raw or ""))

	def _looks_like_acronym(tok: str) -> bool:
	    """Tell whether *tok* is uppercase and between 2 and 6 characters long."""
	    length_ok = 2 <= len(tok) <= 6
	    return tok.isupper() and length_ok

	def _has_digits(tok: str) -> bool:
	    """Tell whether any character of *tok* is a digit."""
	    return any(map(str.isdigit, tok))

	# (optional) keep a compatibility shim so any other code calling normalize_token() won't break
	def normalize_token(token: str) -> str:
	    """Compatibility shim returning the first word of *token*, lowercased.

	    Returns "" when *token* contains no word at all.
	    """
	    for word in _extract_tokens(token):
	        return word.lower()
	    return ""

	def _get_available_tesseract_langs():
	    """Pick the Tesseract language string to use.

	    Returns "eng+fra" when both are installed, "eng" when only English is,
	    otherwise the first language reported; "eng" is also the answer when
	    nothing is reported or the query fails.
	    """
	    try:
	        installed = pytesseract.get_languages()
	        if 'eng' in installed:
	            return "eng+fra" if 'fra' in installed else "eng"
	        if installed:
	            return installed[0]
	        return "eng"
	    except Exception:
	        return "eng"

	def prepare_for_ocr(img: Image.Image) -> Image.Image:
	    """Return a greyscale, contrast-stretched and sharpened copy of *img* for OCR."""
	    from PIL import ImageOps, ImageFilter
	    sharpen = ImageFilter.UnsharpMask(radius=1.0, percent=150, threshold=2)
	    grey = ImageOps.autocontrast(img.convert("L"))
	    return grey.filter(sharpen)

	def extract_pdf_text(path: str, max_pages: int = 5) -> List[str]:
	    """Extract text directly from PDF using PyMuPDF.

	    Args:
	        path: PDF file to read.
	        max_pages: Upper bound on the number of pages read.

	    Returns:
	        One string per page read. An empty list when PyMuPDF is not
	        installed or the document cannot be opened or read (best effort:
	        failures are not raised).
	    """
	    if not HAS_PYMUPDF:
	        return []

	    try:
	        doc = fitz.open(path)
	    except Exception:
	        return []

	    try:
	        return [doc[page_num].get_text() for page_num in range(min(len(doc), max_pages))]
	    except Exception:
	        return []
	    finally:
	        # Close the document even when reading a page fails.
	        doc.close()

	def find_misspell_boxes_from_text(
	    pdf_path: str,
	    *,
	    extra_allow: Optional[Iterable[str]] = None,
	    max_pages: int = 5
	) -> List[Box]:
	    """Find misspellings by spell-checking the text embedded in a PDF.

	    Every text span containing at least one unknown word of two or more
	    letters yields one box taken from the span's bounding box. If the
	    positional extraction fails, one fixed-size placeholder box is returned
	    per page that contains an unknown word.

	    Args:
	        pdf_path: PDF file to analyse.
	        extra_allow: Additional words to accept; they are added to the
	            shared dictionaries and therefore stay accepted afterwards.
	        max_pages: Upper bound on the number of pages analysed.

	    Returns:
	        The boxes found; empty when spell checking or PyMuPDF is missing.

	    NOTE(review): span boxes are used exactly as PyMuPDF reports them
	    (rounded to integers), per page. They are presumably in PDF points
	    rather than pixels of a rasterised page - confirm before drawing them
	    on an image rendered at another resolution or on stacked pages.
	    """
	    if not (HAS_SPELLCHECK and HAS_PYMUPDF):
	        return []

	    # Materialise once: extra_allow may be a one-shot iterator, and it has
	    # to feed two dictionaries.
	    extra_words = [w.lower() for w in extra_allow] if extra_allow else []
	    if extra_words:
	        for checker in (_SPELL_EN, _SPELL_FR):
	            if checker:
	                checker.word_frequency.load_words(extra_words)

	    def has_misspelling(text: str) -> bool:
	        # Single letters are ignored; any other unknown token flags the text.
	        return any(
	            len(token) >= 2 and not _is_known_word(token)
	            for token in _extract_tokens(text)
	        )

	    boxes: List[Box] = []

	    try:
	        doc = fitz.open(pdf_path)
	        try:
	            for page_num in range(min(len(doc), max_pages)):
	                # Get text with position information
	                text_dict = doc[page_num].get_text("dict")

	                for block in text_dict.get("blocks", []):
	                    # Blocks without a "lines" entry carry no text spans.
	                    for line in block.get("lines", []):
	                        for span in line["spans"]:
	                            text = span.get("text", "").strip()
	                            if not text or not has_misspelling(text):
	                                continue

	                            x0, y0, x1, y1 = span["bbox"]
	                            # Box fields are ints and its order is (y1, x1, y2, x2, area).
	                            top, left = int(round(y0)), int(round(x0))
	                            bottom, right = int(round(y1)), int(round(x1))
	                            boxes.append(Box(top, left, bottom, right, (right - left) * (bottom - top)))
	        finally:
	            doc.close()

	    except Exception:
	        # Fallback to simple text extraction if coordinate mapping fails.
	        # Partial results are dropped so a page is never reported twice.
	        boxes = []
	        for page_num, text in enumerate(extract_pdf_text(pdf_path, max_pages)):
	            if text.strip() and has_misspelling(text):
	                # Create a placeholder box for the page
	                boxes.append(Box(page_num * 1000, 0, (page_num + 1) * 1000, 800, 800 * 1000))

	    return boxes

	def find_misspell_boxes(
	    img: Image.Image,
	    *,
	    min_conf: int = 60,
	    lang: Optional[str] = None,
	    extra_allow: Optional[Iterable[str]] = None,
	    dpi: int = 300,
	    psm: int = 6,
	    oem: int = 3
	) -> List[Box]:
	    """Legacy OCR-based spell checking (kept for fallback).

	    Runs Tesseract on *img* and returns one box per recognised word whose
	    confidence is at least *min_conf* and that contains an unknown token of
	    two or more letters.

	    Args:
	        img: Page image to analyse.
	        min_conf: Minimum Tesseract confidence for a word to be considered.
	        lang: Tesseract language string; auto-detected ("eng+fra" when both
	            are installed, otherwise "eng") when None.
	        extra_allow: Additional words to accept; they are added to the
	            shared dictionaries and therefore stay accepted afterwards.
	        dpi: Resolution announced to Tesseract.
	        psm: Tesseract page segmentation mode.
	        oem: Tesseract OCR engine mode.

	    Returns:
	        Boxes in the coordinate system of the *img* that was passed in;
	        empty when OCR or spell checking is unavailable or OCR fails.
	    """
	    if not (HAS_OCR and HAS_SPELLCHECK):
	        return []

	    # Auto-detect language if not provided
	    if lang is None:
	        try:
	            avail = set(pytesseract.get_languages(config="") or [])
	        except Exception:
	            avail = {"eng"}
	        lang = "eng+fra" if {"eng", "fra"}.issubset(avail) else "eng"

	    # Light upscale if the image is small (heuristic). The factor is kept
	    # so results can be mapped back to the caller's coordinates.
	    scale = 1
	    if img.width < 1600:
	        scale = 2
	        img = img.resize((img.width * scale, img.height * scale), Image.LANCZOS)

	    # Prepare image for better OCR
	    img = prepare_for_ocr(img)

	    # Materialise once: extra_allow may be a one-shot iterator, and it has
	    # to feed two dictionaries.
	    extra_words = [w.lower() for w in extra_allow] if extra_allow else []

	    try:
	        if extra_words:
	            for checker in (_SPELL_EN, _SPELL_FR):
	                if checker:
	                    checker.word_frequency.load_words(extra_words)

	        # Build a config that sets an explicit DPI and keeps spaces
	        config = f"--psm {psm} --oem {oem} -c preserve_interword_spaces=1 -c user_defined_dpi={dpi}"

	        data = pytesseract.image_to_data(
	            img,
	            lang=lang,
	            config=config,
	            output_type=pytesseract.Output.DICT,
	        )
	    except Exception:
	        return []

	    boxes: List[Box] = []

	    for i, raw in enumerate(data.get("text", [])):
	        if not raw:
	            continue

	        # confidence filter; an unparsable or missing value counts as -1
	        try:
	            conf = int(float(data.get("conf", ["-1"])[i]))
	        except Exception:
	            conf = -1
	        if conf < min_conf:
	            continue

	        tokens = _extract_tokens(raw)
	        if not tokens:
	            continue

	        # flag the box if ANY token in it looks misspelled
	        if all(len(tok) < 2 or _is_known_word(tok) for tok in tokens):
	            continue

	        left = data.get("left", [0])[i]
	        top = data.get("top", [0])[i]
	        width = data.get("width", [0])[i]
	        height = data.get("height", [0])[i]
	        if width <= 0 or height <= 0:
	            continue

	        # Undo the upscale: floor the near edges and ceil the far edges so
	        # the box still covers the whole word. With scale == 1 this is the
	        # OCR rectangle unchanged.
	        y1 = top // scale
	        x1 = left // scale
	        y2 = -(-(top + height) // scale)
	        x2 = -(-(left + width) // scale)
	        boxes.append(Box(y1, x1, y2, x2, (y2 - y1) * (x2 - x1)))

	    return boxes


	# -------------------- Barcode / QR -----------------
	def ean_like_checksum_ok(digits: str) -> bool:
	    """Check the mod-10 check digit of an EAN-8, UPC-A or EAN-13 number.

	    Args:
	        digits: Candidate number; only the ASCII digits 0-9 are accepted.

	    Returns:
	        False when *digits* is empty or contains anything but 0-9.
	        True when the length is not 8, 12 or 13 (no checksum rule is
	        applied to other lengths). Otherwise, whether the last digit equals
	        the check digit computed from the preceding ones.
	    """
	    # str.isdigit() alone also accepts characters such as "²" that int()
	    # rejects, so the input is restricted to ASCII first.
	    if not (digits.isascii() and digits.isdigit()):
	        return False
	    if len(digits) not in (8, 12, 13):
	        return True

	    *body, check = (int(c) for c in digits)
	    # For all three lengths the digit next to the check digit has weight 3
	    # and the weights alternate 3, 1, 3, ... moving left.
	    weighted = sum(d * (3 if i % 2 == 0 else 1) for i, d in enumerate(reversed(body)))
	    return (10 - weighted % 10) % 10 == check

	def validate_symbology(symbology: str, data: bytes) -> bool:
	    """Return whether a decoded barcode payload looks valid for its type.

	    EAN/UPC symbologies are checked through ``ean_like_checksum_ok`` on the
	    digits of the payload; every other symbology (QR codes included) only
	    needs a non-empty payload. A payload that cannot be decoded is invalid.
	    """
	    try:
	        text = data.decode('utf-8', errors='ignore')
	    except Exception:
	        return False

	    kind = (symbology or '').upper()
	    if kind in ("EAN13", "EAN-13", "EAN8", "EAN-8", "UPCA", "UPC-A"):
	        digits_only = re.sub(r"\D", "", text)
	        return ean_like_checksum_ok(digits_only)
	    # QR codes and all remaining symbologies share the same rule.
	    return len(text) > 0

	def boxes_from_rect(x: int, y: int, w: int, h: int) -> Box:
	    """Convert a (left, top, width, height) rectangle into a Box."""
	    bottom = y + h
	    right = x + w
	    return Box(y, x, bottom, right, w * h)

	def decode_with_variants(img: Image.Image):
	    """Decode barcodes in *img*, retrying on progressively altered copies.

	    The image is tried as is, then in greyscale, then upscaled 2x, then
	    (only for non-RGB images) converted to RGB. The first variant that
	    yields any result wins.

	    Returns:
	        The decoder's results for the first successful variant; an empty
	        list when barcode support is missing or nothing was decoded.
	    """
	    if not HAS_BARCODE:
	        return []

	    def attempt(candidate):
	        # Decoder failures are treated as "nothing found" (best effort).
	        try:
	            return list(zbar_decode(candidate) or [])
	        except Exception:
	            return []

	    # Variants are built lazily so that no copy is made once one succeeds.
	    variants = [
	        lambda: img,
	        lambda: img.convert('L'),
	        lambda: img.resize((img.width * 2, img.height * 2), Image.BICUBIC),
	    ]
	    if img.mode != 'RGB':
	        variants.append(lambda: img.convert('RGB'))

	    for make_variant in variants:
	        results = attempt(make_variant())
	        if results:
	            return results
	    return []

	def find_barcode_boxes_and_info(img: Image.Image):
	    """Locate barcodes in *img*.

	    Returns:
	        A pair ``(boxes, infos)``: one Box per decoded symbol and, in the
	        same order, a dict with its type, text, rectangle and validity.
	    """
	    boxes: List[Box] = []
	    infos = []
	    for hit in decode_with_variants(img):
	        area = hit.rect
	        boxes.append(boxes_from_rect(area.left, area.top, area.width, area.height))
	        is_valid = validate_symbology(hit.type, hit.data)

	        # Payloads are shown as text: bytes are decoded, anything else is str()'d.
	        if isinstance(hit.data, (bytes, bytearray)):
	            payload = hit.data.decode('utf-8', errors='ignore')
	        else:
	            payload = str(hit.data)

	        infos.append({
	            'type': hit.type,
	            'data': payload,
	            'left': area.left, 'top': area.top, 'width': area.width, 'height': area.height,
	            'valid': bool(is_valid)
	        })
	    return boxes, infos

	# -------------------- CMYK Panel -------------------
	def rgb_to_cmyk_array(img: Image.Image) -> np.ndarray:
	    """Return *img* converted to CMYK as a float32 array (values 0..255)."""
	    cmyk = img.convert('CMYK')
	    return np.asarray(cmyk).astype(np.float32)

	def avg_cmyk_in_box(cmyk_arr: np.ndarray, box: Box) -> Tuple[float,float,float,float]:
	    """Return the mean C, M, Y, K of *cmyk_arr* inside *box*, in percent.

	    The box is clipped to the array; a box with no overlap gives all zeros.
	    Values are scaled from 0..255 to 0..100 and rounded to one decimal.
	    """
	    rows, cols = cmyk_arr.shape[0], cmyk_arr.shape[1]
	    top, bottom = max(0, box.y1), min(rows, box.y2)
	    left, right = max(0, box.x1), min(cols, box.x2)
	    if bottom <= top or right <= left:
	        return (0.0, 0.0, 0.0, 0.0)
	    pixels = cmyk_arr[top:bottom, left:right, :].reshape(-1, 4)
	    channel_means = pixels.mean(axis=0)
	    return tuple(float(round(m * 100.0 / 255.0, 1)) for m in channel_means)

	def compute_cmyk_diffs(a_img: Image.Image, b_img: Image.Image, red_boxes: List[Box]):
	    """Compare the average CMYK of both images inside every box.

	    Returns:
	        One dict per box with keys 'idx' (1-based position), 'A', 'B'
	        (CMYK percentages of each image) and 'Delta' (B minus A, rounded
	        to one decimal).
	    """
	    cmyk_a = rgb_to_cmyk_array(a_img)
	    cmyk_b = rgb_to_cmyk_array(b_img)

	    def describe(number, box):
	        in_a = avg_cmyk_in_box(cmyk_a, box)
	        in_b = avg_cmyk_in_box(cmyk_b, box)
	        change = tuple(round(vb - va, 1) for va, vb in zip(in_a, in_b))
	        return {'idx': number, 'A': in_a, 'B': in_b, 'Delta': change}

	    return [describe(number, box) for number, box in enumerate(red_boxes, start=1)]

	def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns', panel_width: int = 260) -> Image.Image:
	    """Return *base* with a text panel of CMYK values attached on its right.

	    Each entry prints its number, the A and B values and the delta. Output
	    stops once the next entry would start less than 40 px from the bottom.
	    """
	    w, h = base.size
	    canvas = Image.new('RGB', (w + panel_width, h), (255, 255, 255))
	    canvas.paste(base, (0, 0))
	    canvas.paste(Image.new('RGB', (panel_width, h), (245, 245, 245)), (w, 0))

	    pen = ImageDraw.Draw(canvas)
	    x0 = w + 8
	    y = 8
	    pen.text((x0, y), title, fill=(0, 0, 0))
	    y += 18

	    if not entries:
	        pen.text((x0, y), 'No differing regions', fill=(80, 80, 80))
	        return canvas

	    black, dark_red = (0, 0, 0), (120, 0, 0)
	    for entry in entries:
	        idx = entry['idx']
	        aC, aM, aY, aK = entry['A']
	        bC, bM, bY, bK = entry['B']
	        dC, dM, dY, dK = entry['Delta']
	        # (text, colour, vertical advance after the line)
	        rows = [
	            (f"#{idx}", black, 14),
	            (f"A: C {aC}% M {aM}% Y {aY}% K {aK}%", black, 14),
	            (f"B: C {bC}% M {bM}% Y {bY}% K {bK}%", black, 14),
	            (f"Delta: C {dC}% M {dM}% Y {dY}% K {dK}%", dark_red, 18),
	        ]
	        for text, colour, advance in rows:
	            pen.text((x0, y), text, fill=colour)
	            y += advance
	        if y > h - 40:
	            break
	    return canvas

	# -------------------- Gradio Interface -----------------
	def compare_pdfs(file_a, file_b):
	    """Main comparison function for Gradio interface.

	    Args:
	        file_a: Reference upload, either an object exposing the file path
	            as ``.name`` or the path itself as a string.
	        file_b: Upload to compare, same forms as *file_a*.

	    Returns:
	        A 6-tuple ``(overlay, image_a, image_b, status, codes_a, codes_b)``:
	        the pixel-difference overlay, both annotated images, a status
	        message and the barcode table rows of each file. On any failure the
	        three images are None, the status carries the error text and both
	        tables are empty.
	    """
	    try:
	        if file_a is None or file_b is None:
	            return None, None, None, "❌ Please upload both PDF files to compare", [], []

	        # Uploads may arrive as file objects (path in .name) or as plain
	        # path strings, depending on how the File component is configured.
	        path_a = getattr(file_a, "name", file_a)
	        path_b = getattr(file_b, "name", file_b)

	        # Load images with multiple pages support
	        pages_a = load_pdf_pages(path_a, dpi=400, max_pages=5)
	        pages_b = load_pdf_pages(path_b, dpi=400, max_pages=5)

	        # Combine pages into single images for comparison
	        a = combine_pages_vertically(pages_a)
	        b = combine_pages_vertically(pages_b)

	        # Match sizes
	        a, b = match_sizes(a, b)

	        # Find differences with default settings
	        diff = difference_map(a, b)
	        red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)

	        # Use text-based spell checking instead of OCR for better accuracy.
	        # NOTE(review): these boxes come from the PDF text layer, not from
	        # the rasterised image - confirm they line up with the 400-dpi
	        # stacked image they are drawn on below.
	        can_spellcheck = HAS_SPELLCHECK and HAS_PYMUPDF
	        misspell_a = find_misspell_boxes_from_text(path_a) if can_spellcheck else []
	        misspell_b = find_misspell_boxes_from_text(path_b) if can_spellcheck else []

	        if HAS_BARCODE:
	            bar_a, info_a = find_barcode_boxes_and_info(a)
	            bar_b, info_b = find_barcode_boxes_and_info(b)
	        else:
	            bar_a, info_a = [], []
	            bar_b, info_b = [], []

	        # Always enable CMYK analysis
	        cmyk_entries = compute_cmyk_diffs(a, b, red_boxes)
	        labels = [e['idx'] for e in cmyk_entries]

	        # Create visualizations with default box width
	        a_boxed_core = draw_boxes_multi(a, red_boxes, misspell_a, bar_a, width=3, red_labels=labels)
	        b_boxed_core = draw_boxes_multi(b, red_boxes, misspell_b, bar_b, width=3, red_labels=labels)

	        # Always show CMYK panel
	        a_disp = draw_cmyk_panel(a_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
	        b_disp = draw_cmyk_panel(b_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')

	        # Create pixel difference overlay
	        overlay = make_red_overlay(a, b)

	        # Create status message (starts and ends with a newline)
	        status = "\n".join([
	            "",
	            "📊 Analysis Complete!",
	            f"- Pages processed: A: {len(pages_a)}, B: {len(pages_b)}",
	            f"- Difference regions found: {len(red_boxes)}",
	            f"- Misspellings detected: A: {len(misspell_a)}, B: {len(misspell_b)}",
	            f"- Barcodes found: A: {len(bar_a)}, B: {len(bar_b)}",
	            f"- Combined image dimensions: {a.width} × {a.height} pixels",
	            "",
	            "Legend:",
	            "- 🔴 Red boxes: Visual differences",
	            "- 🔵 Cyan boxes: Spelling errors",
	            "- 🟢 Green boxes: Barcodes/QR codes",
	            "",
	        ])

	        # Prepare barcode data for tables
	        def as_row(code):
	            return [code.get('type', ''), code.get('data', ''), code.get('left', 0), code.get('top', 0),
	                    code.get('width', 0), code.get('height', 0), code.get('valid', False)]

	        codes_a = [as_row(c) for c in info_a]
	        codes_b = [as_row(c) for c in info_b]

	        return overlay, a_disp, b_disp, status, codes_a, codes_b

	    except Exception as e:
	        # Top-level UI boundary: report the failure instead of raising.
	        error_msg = f"❌ Error: {str(e)}"
	        return None, None, None, error_msg, [], []

	# -------------------- Gradio App -------------------
	def create_demo():
	    """Build and return the Gradio Blocks app for the comparison tool.

	    The layout is: intro text, two file inputs, a compare button, a status
	    area, the difference overlay, both annotated images, two barcode
	    tables and usage notes. The button runs ``compare_pdfs``.
	    """
	    intro_text = "\n".join([
	        "",
	        "# 🔍 Advanced PDF Comparison Tool",
	        "",
	        "Upload two PDF files to get comprehensive analysis including:",
	        "- Multi-page PDF support (up to 5 pages per document)",
	        "- Visual differences with bounding boxes",
	        "- OCR and spell checking",
	        "- Barcode/QR code detection",
	        "- CMYK color analysis",
	        "",
	    ])
	    help_text = "\n".join([
	        "",
	        "### 📝 Instructions:",
	        "1. Upload two PDF files",
	        "2. Click \"Compare PDF Files\"",
	        "3. View results with comprehensive analysis",
	        "",
	        "### 🎨 Color Legend:",
	        "- 🔴 Red boxes: Visual differences between files",
	        "- 🔵 Cyan boxes: Potential spelling errors (OCR)",
	        "- 🟢 Green boxes: Detected barcodes/QR codes",
	        "- 📊 Side panel: CMYK color analysis for print workflows",
	        "",
	    ])
	    barcode_columns = ["Type", "Data", "Left", "Top", "Width", "Height", "Valid"]

	    with gr.Blocks(title="PDF Comparison Tool", theme=gr.themes.Soft()) as demo:
	        gr.Markdown(intro_text)

	        with gr.Row():
	            with gr.Column():
	                file_a = gr.File(label="📄 PDF A (Reference)", file_types=[".pdf"])
	                file_b = gr.File(label="📄 PDF B (Comparison)", file_types=[".pdf"])

	        compare_btn = gr.Button("🔍 Compare PDF Files", variant="primary", size="lg")

	        status_md = gr.Markdown("")

	        with gr.Row():
	            overlay_img = gr.Image(label="🔴 Pixel Differences (Red = Different)", type="pil")

	        with gr.Row():
	            img_a = gr.Image(label="📄 File A with Analysis", type="pil")
	            img_b = gr.Image(label="📄 File B with Analysis", type="pil")

	        gr.Markdown("### 📊 Barcode Detection Results")
	        with gr.Row():
	            # Each table gets its own copy of the header list.
	            codes_a_df = gr.Dataframe(
	                headers=list(barcode_columns),
	                label="Barcodes in File A",
	                interactive=False
	            )
	            codes_b_df = gr.Dataframe(
	                headers=list(barcode_columns),
	                label="Barcodes in File B",
	                interactive=False
	            )

	        # Wire the button: output order matches compare_pdfs' return tuple.
	        compare_btn.click(
	            fn=compare_pdfs,
	            inputs=[file_a, file_b],
	            outputs=[overlay_img, img_a, img_b, status_md, codes_a_df, codes_b_df]
	        )

	        gr.Markdown(help_text)

	    return demo

	if __name__ == "__main__":
	    # Script entry point: build the UI and start serving it.
	    demo = create_demo()
	    demo.launch(
	        show_error=True,
	        server_name="0.0.0.0",  # listen on every network interface
	        share=True,  # also request a public share link
	    )