#!/usr/bin/env python3
"""
Gradio PDF Comparison Tool
Upload two PDF files and get comprehensive analysis including differences, OCR, barcodes, and CMYK analysis.
"""

import csv
import io
import json
import os
import sys
import tempfile
import unicodedata
from dataclasses import dataclass
from typing import Iterable, List, Optional, Tuple

import gradio as gr
import numpy as np
from PIL import Image, ImageChops, ImageDraw, ImageFilter, ImageOps, UnidentifiedImageError
from pdf2image import convert_from_path
from skimage.measure import label, regionprops
from skimage.morphology import dilation, rectangle

# Alternative PDF processing
try:
    import fitz  # PyMuPDF
    HAS_PYMUPDF = True
except Exception:
    fitz = None
    HAS_PYMUPDF = False

# Optional features
try:
    import pytesseract
    HAS_OCR = True
except Exception:
    pytesseract = None
    HAS_OCR = False

try:
    from spellchecker import SpellChecker
    HAS_SPELLCHECK = True
except Exception:
    SpellChecker = None
    HAS_SPELLCHECK = False

# `regex` understands \p{Letter}; the stdlib `re` is the fallback.
try:
    import regex as re
    HAS_REGEX = True
except Exception:
    import re
    HAS_REGEX = False

try:
    from pyzbar.pyzbar import decode as zbar_decode
    HAS_BARCODE = True
except Exception:
    zbar_decode = None
    HAS_BARCODE = False


# -------------------- Core Data --------------------
@dataclass
class Box:
    """Axis-aligned rectangle in pixel coordinates plus its area."""

    y1: int
    x1: int
    y2: int
    x2: int
    area: int


# ---- spell/tokenization helpers & caches ----
# These are defined exactly once and guarded by the optional-import flags so
# the module still imports when regex / pyspellchecker are not installed.
if HAS_REGEX:
    _WORD_RE = re.compile(r"\p{Letter}+(?:[’'\-]\p{Letter}+)*", re.UNICODE)
else:
    _WORD_RE = re.compile(r"[A-Za-z]+(?:['\-][A-Za-z]+)*")

if HAS_SPELLCHECK:
    _SPELL_EN = SpellChecker(language="en")
    try:
        _SPELL_FR = SpellChecker(language="fr")
    except Exception:
        # French dictionary unavailable: fall back to English only.
        _SPELL_FR = None
else:
    _SPELL_EN = None
    _SPELL_FR = None

_DOMAIN_ALLOWLIST = {
    "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
    "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid",
}
_DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}

if _SPELL_EN:
    _SPELL_EN.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
if _SPELL_FR:
    _SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)


def _normalize_text(s: str) -> str:
    """NFC-normalize *s*, turn curly apostrophes into straight ones, and strip."""
    s = unicodedata.normalize("NFC", s)
    return s.replace("’", "'").strip()


def _extract_tokens(raw: str):
    """Return the word tokens matched by ``_WORD_RE`` in *raw* (``None`` -> no tokens)."""
    return _WORD_RE.findall(_normalize_text(raw or ""))


def _looks_like_acronym(tok: str) -> bool:
    """True for all-uppercase tokens of length 2 to 6."""
    return tok.isupper() and 2 <= len(tok) <= 6


def _has_digits(tok: str) -> bool:
    """True when *tok* contains at least one digit character."""
    return any(ch.isdigit() for ch in tok)


def _is_known_word(tok: str) -> bool:
    """Return True when *tok* should not be flagged as a misspelling.

    A token is accepted when it is in the domain allowlist, looks like an
    acronym, contains digits, is a hyphenated compound whose parts are all
    accepted, or is known to the English or French spell checker.
    """
    t = tok.lower()
    if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
        return True

    # Hyphenated words: accepted only when EVERY part is accepted.
    if '-' in tok and all(_is_known_word(part) for part in tok.split('-')):
        return True

    if _SPELL_EN and not _SPELL_EN.unknown([t]):  # known in EN
        return True
    if _SPELL_FR and not _SPELL_FR.unknown([t]):  # known in FR
        return True
    return False


# (optional) keep a compatibility shim so any other code calling normalize_token() won't break
def normalize_token(token: str) -> str:
    """Return the first word token of *token*, lowercased, or "" if there is none."""
    toks = _extract_tokens(token)
    return toks[0].lower() if toks else ""


# -------------------- Helpers ----------------------
def _is_pdf(path: str) -> bool:
    """True when *path* has a ``.pdf`` extension (case-insensitive)."""
    return os.path.splitext(path.lower())[1] == ".pdf"


def _render_with_pymupdf(path: str, dpi: int, max_pages: int) -> List[Image.Image]:
    """Render up to *max_pages* pages of *path* with PyMuPDF as RGB images."""
    doc = fitz.open(path)
    try:
        mat = fitz.Matrix(dpi / 72, dpi / 72)  # Scale factor for DPI
        pages = []
        for page_num in range(min(len(doc), max_pages)):
            pix = doc[page_num].get_pixmap(matrix=mat)
            img = Image.open(io.BytesIO(pix.tobytes("ppm")))
            pages.append(img.convert("RGB"))
        return pages
    finally:
        # Close the document even when rendering a page fails.
        doc.close()


def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
    """Load *path* as a list of RGB page images.

    PDFs are rendered with pdf2image, trying several poppler locations, and
    then with PyMuPDF if every pdf2image attempt fails. Any other file is
    opened as a single image.

    Raises:
        ValueError: if the PDF cannot be rendered by any backend, or the
            PyMuPDF fallback yields no pages.
    """
    if not _is_pdf(path):
        return [Image.open(path).convert("RGB")]

    # Try pdf2image with multiple poppler paths first (None = rely on PATH).
    last_error = None
    for poppler_path in ("/usr/bin", "/usr/local/bin", "/bin", None):
        extra = {"poppler_path": poppler_path} if poppler_path else {}
        try:
            imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages, **extra)
        except Exception as e:
            last_error = e  # remembered so the final message shows the real cause
            continue
        if imgs:
            return [img.convert("RGB") for img in imgs]

    pdf2image_error = "no pages returned" if last_error is None else str(last_error)

    # Fallback to PyMuPDF if pdf2image fails
    if not HAS_PYMUPDF:
        raise ValueError(
            f"Failed to convert PDF to image with all poppler paths. "
            f"Last error: {pdf2image_error}. PyMuPDF not available as fallback."
        )
    try:
        pages = _render_with_pymupdf(path, dpi, max_pages)
    except Exception as e:
        raise ValueError(
            f"Failed to convert PDF with both pdf2image and PyMuPDF. "
            f"pdf2image error: {pdf2image_error}. PyMuPDF error: {str(e)}"
        ) from e
    if not pages:
        raise ValueError(f"No pages in PDF: {path}")
    return pages


def combine_pages_vertically(pages: List[Image.Image], spacing: int = 20) -> Image.Image:
    """Combine multiple pages into a single vertical image"""
    if not pages:
        raise ValueError("No pages to combine")
    if len(pages) == 1:
        return pages[0]

    max_width = max(page.width for page in pages)
    total_height = sum(page.height for page in pages) + spacing * (len(pages) - 1)

    combined = Image.new('RGB', (max_width, total_height), (255, 255, 255))
    y_offset = 0
    for page in pages:
        # Center the page horizontally if it's narrower than max_width
        x_offset = (max_width - page.width) // 2
        combined.paste(page, (x_offset, y_offset))
        y_offset += page.height + spacing
    return combined


def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
    """Crop both images (from the top-left corner) to their common width and height."""
    if a.size == b.size:
        return a, b
    w, h = min(a.width, b.width), min(a.height, b.height)
    return a.crop((0, 0, w, h)), b.crop((0, 0, w, h))


def difference_map(a: Image.Image, b: Image.Image) -> Image.Image:
    """Return the per-pixel absolute difference of *a* and *b*."""
    return ImageChops.difference(a, b)


def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int = 25) -> List[Box]:
    """Return bounding boxes of connected regions of *diff_img* at or above *threshold*.

    The per-pixel maximum over the channel axis is thresholded, dilated with
    a 3x3 footprint, and labeled with 8-connectivity; regions smaller than
    *min_area* pixels are dropped.
    """
    arr = np.asarray(diff_img).astype(np.uint16)
    gray = arr.max(axis=2).astype(np.uint8)
    mask = (gray >= threshold).astype(np.uint8)
    mask = dilation(mask, rectangle(3, 3))
    labeled = label(mask, connectivity=2)

    out: List[Box] = []
    for region in regionprops(labeled):
        if region.area < min_area:
            continue
        minr, minc, maxr, maxc = region.bbox
        out.append(Box(minr, minc, maxr, maxc, int(region.area)))
    return out


def _draw_outlines(d: ImageDraw.ImageDraw, boxes: Iterable[Box], color, width: int) -> None:
    """Outline every box in *color*, *width* pixels thick (growing outward)."""
    for b in boxes:
        for w in range(width):
            d.rectangle([b.x1 - w, b.y1 - w, b.x2 + w, b.y2 + w], outline=color)


def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box],
                     green_boxes: Optional[List[Box]] = None, width: int = 3,
                     red_labels: Optional[List[int]] = None) -> Image.Image:
    """Return a copy of *img* with red, cyan, and green boxes drawn on it.

    When *red_labels* is given, each red box also gets a small white tag
    showing its label (or its 1-based index when the label list is shorter
    than the box list).
    """
    out = img.copy()
    d = ImageDraw.Draw(out)

    # red (diff)
    _draw_outlines(d, red_boxes, (255, 0, 0), width)

    # labels for red boxes
    if red_labels:
        for idx, b in enumerate(red_boxes):
            # Named `text` so the skimage `label` import is not shadowed.
            text = str(red_labels[idx]) if idx < len(red_labels) else str(idx + 1)
            tx = max(0, b.x1 + 3)
            ty = max(0, b.y1 + 3)
            d.rectangle([tx - 2, ty - 2, tx + 14, ty + 14], fill=(255, 255, 255))
            d.text((tx, ty), text, fill=(0, 0, 0))

    # cyan (misspellings)
    _draw_outlines(d, cyan_boxes, (0, 255, 255), width)

    # green (barcodes)
    if green_boxes:
        _draw_outlines(d, green_boxes, (0, 255, 0), width)
    return out


def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
    """Return a copy of *a* with every pixel that differs from *b* painted pure red."""
    A = np.asarray(a).copy()
    B = np.asarray(b)
    mask = np.any(A != B, axis=2)
    A[mask] = [255, 0, 0]
    return Image.fromarray(A)


# -------------------- OCR + Spellcheck -------------
def _get_available_tesseract_langs():
    """Get available Tesseract languages"""
    try:
        langs = pytesseract.get_languages()
    except Exception:
        return "eng"
    if 'eng' in langs and 'fra' in langs:
        return "eng+fra"
    if 'eng' in langs:
        return "eng"
    if langs:
        return langs[0]
    return "eng"


def prepare_for_ocr(img: Image.Image) -> Image.Image:
    """Prepare image for better OCR results"""
    g = img.convert("L")
    g = ImageOps.autocontrast(g)
    return g.filter(ImageFilter.UnsharpMask(radius=1.0, percent=150, threshold=2))


def find_misspell_boxes(
    img: Image.Image,
    *,
    min_conf: int = 60,
    lang: Optional[str] = None,
    extra_allow: Optional[Iterable[str]] = None,
    dpi: int = 300,
    psm: int = 6,
    oem: int = 3
) -> List[Box]:
    """OCR *img* and return boxes around words that look misspelled.

    Args:
        img: page image to analyse.
        min_conf: OCR results below this confidence are ignored.
        lang: Tesseract language string; auto-detected when ``None``.
        extra_allow: additional words to treat as correctly spelled. They
            are loaded into the module-level spell checkers.
        dpi, psm, oem: passed through to Tesseract via its config string.

    Returns:
        Boxes in the coordinate space of the *img* that was passed in.
        Returns an empty list when OCR or spell checking is unavailable,
        or when the OCR call fails.
    """
    if not (HAS_OCR and HAS_SPELLCHECK):
        return []

    # Auto-detect language if not provided
    if lang is None:
        try:
            avail = set(pytesseract.get_languages(config="") or [])
        except Exception:
            avail = {"eng"}
        lang = "eng+fra" if {"eng", "fra"}.issubset(avail) else "eng"

    # OPTIONAL: light upscale if the image is small (heuristic)
    # target width ~ 2500–3000 px for letter-sized pages
    scale = 1
    if img.width < 1600:
        scale = 2
        img = img.resize((img.width * scale, img.height * scale), Image.LANCZOS)

    # Prepare image for better OCR
    img = prepare_for_ocr(img)

    try:
        if extra_allow:
            # Materialize once: extra_allow may be a one-shot iterator and
            # is needed for both dictionaries.
            extra_words = [w.lower() for w in extra_allow]
            if _SPELL_EN:
                _SPELL_EN.word_frequency.load_words(extra_words)
            if _SPELL_FR:
                _SPELL_FR.word_frequency.load_words(extra_words)

        # Build a config that sets an explicit DPI and keeps spaces
        config = f"--psm {psm} --oem {oem} -c preserve_interword_spaces=1 -c user_defined_dpi={dpi}"
        data = pytesseract.image_to_data(
            img,
            lang=lang,
            config=config,
            output_type=pytesseract.Output.DICT,
        )
    except Exception:
        # Best effort: OCR failure means "no misspellings reported".
        return []

    confs = data.get("conf", [])
    boxes: List[Box] = []
    for i, raw in enumerate(data.get("text", [])):
        if not raw:
            continue

        # confidence filter
        try:
            conf = int(float(confs[i]))
        except (IndexError, TypeError, ValueError):
            conf = -1
        if conf < min_conf:
            continue

        tokens = _extract_tokens(raw)
        if not tokens:
            continue

        # flag the box if ANY token in it looks misspelled
        if all(_is_known_word(tok) or len(tok) < 2 for tok in tokens):
            continue

        try:
            left, top, width, height = (
                data[key][i] for key in ("left", "top", "width", "height")
            )
        except (KeyError, IndexError):
            continue
        if width <= 0 or height <= 0:
            continue

        # Map OCR coordinates back to the caller's image: floor the top-left
        # corner and ceil the bottom-right one so the word stays covered.
        y1 = top // scale
        x1 = left // scale
        y2 = -(-(top + height) // scale)
        x2 = -(-(left + width) // scale)
        boxes.append(Box(y1, x1, y2, x2, (y2 - y1) * (x2 - x1)))

    return boxes


# -------------------- Barcode / QR -----------------
def ean_like_checksum_ok(digits: str) -> bool:
    """Validate the check digit of an 8-, 12-, or 13-digit code.

    Non-digit input is rejected; digit strings of any other length are
    accepted without a checksum test.
    """
    if not digits.isdigit():
        return False
    if len(digits) not in (8, 12, 13):
        return True
    *body, check = (int(c) for c in digits)
    # Counting from the digit next to the check digit, weights alternate
    # 3, 1, 3, ... for all three supported lengths.
    total = sum(d * (3 if k % 2 == 0 else 1) for k, d in enumerate(reversed(body)))
    return (10 - (total % 10)) % 10 == check


def validate_symbology(symbology: str, data: bytes) -> bool:
    """Return True when *data* is plausible for *symbology*.

    EAN/UPC payloads must pass ``ean_like_checksum_ok``; every other
    symbology only needs a non-empty payload.
    """
    try:
        text = data.decode('utf-8', errors='ignore') if isinstance(data, (bytes, bytearray)) else str(data)
    except Exception:
        return False
    sym = (symbology or '').upper()
    if sym in ("EAN13", "EAN-13", "EAN8", "EAN-8", "UPCA", "UPC-A"):
        return ean_like_checksum_ok(re.sub(r"\D", "", text))
    # QR codes and all remaining symbologies: any non-empty payload is valid.
    return len(text) > 0


def boxes_from_rect(x: int, y: int, w: int, h: int) -> Box:
    """Build a ``Box`` from a left/top/width/height rectangle."""
    return Box(y, x, y + h, x + w, w * h)


def _rescale_decoded(dec, factor: int):
    """Return *dec* with its rect (and polygon, if present) divided by *factor*.

    Relies on pyzbar results being namedtuples (``_replace``).
    """
    r = dec.rect
    changes = {
        'rect': r._replace(
            left=r.left // factor,
            top=r.top // factor,
            width=r.width // factor,
            height=r.height // factor,
        )
    }
    polygon = getattr(dec, 'polygon', None)
    if polygon:
        changes['polygon'] = [p._replace(x=p.x // factor, y=p.y // factor) for p in polygon]
    return dec._replace(**changes)


def decode_with_variants(img: Image.Image):
    """Decode barcodes in *img*, retrying on grayscale, 2x, and RGB variants.

    Variants are tried in order and only while nothing has been found.
    Results always use the coordinate space of the original *img*.
    """
    if not HAS_BARCODE:
        return []

    def attempt(candidate, factor=1):
        try:
            found = list(zbar_decode(candidate) or [])
            if factor != 1:
                # Decoded on an enlarged copy: map coordinates back.
                found = [_rescale_decoded(d, factor) for d in found]
            return found
        except Exception:
            # Best effort: a failing variant simply contributes nothing.
            return []

    results = attempt(img)
    if not results:
        results = attempt(img.convert('L'))
    if not results:
        results = attempt(img.resize((img.width * 2, img.height * 2), Image.BICUBIC), factor=2)
    if not results and img.mode != 'RGB':
        results = attempt(img.convert('RGB'))
    return results


def find_barcode_boxes_and_info(img: Image.Image):
    """Return ``(boxes, infos)`` for every barcode decoded in *img*.

    Each info dict carries the keys ``type``, ``data``, ``left``, ``top``,
    ``width``, ``height`` and ``valid``.
    """
    boxes: List[Box] = []
    infos = []
    for d in decode_with_variants(img):
        rect = d.rect
        boxes.append(boxes_from_rect(rect.left, rect.top, rect.width, rect.height))
        if isinstance(d.data, (bytes, bytearray)):
            payload = d.data.decode('utf-8', errors='ignore')
        else:
            payload = str(d.data)
        infos.append({
            'type': d.type,
            'data': payload,
            'left': rect.left,
            'top': rect.top,
            'width': rect.width,
            'height': rect.height,
            'valid': bool(validate_symbology(d.type, d.data)),
        })
    return boxes, infos


# -------------------- CMYK Panel -------------------
def rgb_to_cmyk_array(img: Image.Image) -> np.ndarray:
    """Convert *img* to CMYK and return it as a float32 array (values 0..255)."""
    return np.asarray(img.convert('CMYK')).astype(np.float32)  # 0..255


def avg_cmyk_in_box(cmyk_arr: np.ndarray, box: Box) -> Tuple[float, float, float, float]:
    """Return the mean C, M, Y, K inside *box* as percentages (one decimal).

    The box is clipped to the array; an empty intersection yields zeros.
    """
    y1, y2 = max(0, box.y1), min(cmyk_arr.shape[0], box.y2)
    x1, x2 = max(0, box.x1), min(cmyk_arr.shape[1], box.x2)
    if y2 <= y1 or x2 <= x1:
        return (0.0, 0.0, 0.0, 0.0)
    region = cmyk_arr[y1:y2, x1:x2, :]
    mean_vals = region.reshape(-1, 4).mean(axis=0)
    return tuple(float(round(v * 100.0 / 255.0, 1)) for v in mean_vals)


def compute_cmyk_diffs(a_img: Image.Image, b_img: Image.Image, red_boxes: List[Box]):
    """Return one ``{'idx', 'A', 'B', 'Delta'}`` entry per box (idx is 1-based).

    ``Delta`` is B minus A per channel, rounded to one decimal.
    """
    a_cmyk = rgb_to_cmyk_array(a_img)
    b_cmyk = rgb_to_cmyk_array(b_img)
    entries = []
    for idx, bx in enumerate(red_boxes, start=1):
        a_vals = avg_cmyk_in_box(a_cmyk, bx)
        b_vals = avg_cmyk_in_box(b_cmyk, bx)
        delta = tuple(round(bv - av, 1) for av, bv in zip(a_vals, b_vals))
        entries.append({'idx': idx, 'A': a_vals, 'B': b_vals, 'Delta': delta})
    return entries


def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns', panel_width: int = 260) -> Image.Image:
    """Return *base* with a text panel listing *entries* attached on the right.

    Entries that would run past the bottom of the panel are omitted.
    """
    w, h = base.size
    panel = Image.new('RGB', (panel_width, h), (245, 245, 245))
    out = Image.new('RGB', (w + panel_width, h), (255, 255, 255))
    out.paste(base, (0, 0))
    out.paste(panel, (w, 0))

    d = ImageDraw.Draw(out)
    x0 = w + 8
    y = 8
    d.text((x0, y), title, fill=(0, 0, 0))
    y += 18

    if not entries:
        d.text((x0, y), 'No differing regions', fill=(80, 80, 80))
        return out

    for e in entries:
        idx = e['idx']
        aC, aM, aY, aK = e['A']
        bC, bM, bY, bK = e['B']
        dC, dM, dY, dK = e['Delta']
        d.text((x0, y), f"#{idx}", fill=(0, 0, 0))
        y += 14
        d.text((x0, y), f"A: C {aC}% M {aM}% Y {aY}% K {aK}%", fill=(0, 0, 0))
        y += 14
        d.text((x0, y), f"B: C {bC}% M {bM}% Y {bY}% K {bK}%", fill=(0, 0, 0))
        y += 14
        d.text((x0, y), f"Delta: C {dC}% M {dM}% Y {dY}% K {dK}%", fill=(120, 0, 0))
        y += 18
        if y > h - 40:
            break
    return out


# -------------------- Gradio Interface -----------------
def _upload_path(upload) -> str:
    """Return the filesystem path of a Gradio upload.

    Accepts both objects exposing ``.name`` and plain path strings.
    """
    return str(getattr(upload, "name", upload))


def _barcode_rows(infos):
    """Flatten barcode info dicts into rows for the results table."""
    return [
        [c.get('type', ''), c.get('data', ''), c.get('left', 0), c.get('top', 0),
         c.get('width', 0), c.get('height', 0), c.get('valid', False)]
        for c in infos
    ]


def compare_pdfs(file_a, file_b):
    """Main comparison function for Gradio interface

    Returns a 6-tuple ``(overlay, a_disp, b_disp, status, codes_a, codes_b)``.
    On any failure the three images are ``None``, the status carries the
    error message, and both tables are empty.
    """
    try:
        if file_a is None or file_b is None:
            return None, None, None, "❌ Please upload both PDF files to compare", [], []

        # Load images with multiple pages support
        pages_a = load_pdf_pages(_upload_path(file_a), dpi=400, max_pages=5)
        pages_b = load_pdf_pages(_upload_path(file_b), dpi=400, max_pages=5)

        # Combine pages into single images for comparison
        a = combine_pages_vertically(pages_a)
        b = combine_pages_vertically(pages_b)

        # Match sizes
        a, b = match_sizes(a, b)

        # Find differences with default settings
        diff = difference_map(a, b)
        red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)

        # Run all analysis features with defaults
        misspell_a = find_misspell_boxes(a) if HAS_OCR and HAS_SPELLCHECK else []
        misspell_b = find_misspell_boxes(b) if HAS_OCR and HAS_SPELLCHECK else []

        if HAS_BARCODE:
            bar_a, info_a = find_barcode_boxes_and_info(a)
            bar_b, info_b = find_barcode_boxes_and_info(b)
        else:
            bar_a, info_a = [], []
            bar_b, info_b = [], []

        # Always enable CMYK analysis
        cmyk_entries = compute_cmyk_diffs(a, b, red_boxes)
        labels = [e['idx'] for e in cmyk_entries]

        # Create visualizations with default box width
        a_boxed_core = draw_boxes_multi(a, red_boxes, misspell_a, bar_a, width=3, red_labels=labels)
        b_boxed_core = draw_boxes_multi(b, red_boxes, misspell_b, bar_b, width=3, red_labels=labels)

        # Always show CMYK panel
        a_disp = draw_cmyk_panel(a_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
        b_disp = draw_cmyk_panel(b_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')

        # Create pixel difference overlay
        overlay = make_red_overlay(a, b)

        # Create status message (joined line by line so the Markdown list
        # structure does not depend on source indentation)
        status = "\n".join([
            "",
            "📊 **Analysis Complete!**",
            f"- **Pages processed:** A: {len(pages_a)}, B: {len(pages_b)}",
            f"- **Difference regions found:** {len(red_boxes)}",
            f"- **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}",
            f"- **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}",
            f"- **Combined image dimensions:** {a.width} × {a.height} pixels",
            "",
            "**Legend:**",
            "- 🔴 Red boxes: Visual differences",
            "- 🔵 Cyan boxes: Spelling errors",
            "- 🟢 Green boxes: Barcodes/QR codes",
            "",
        ])

        # Prepare barcode data for tables
        return overlay, a_disp, b_disp, status, _barcode_rows(info_a), _barcode_rows(info_b)

    except Exception as e:
        error_msg = f"❌ **Error:** {str(e)}"
        return None, None, None, error_msg, [], []


# -------------------- Gradio App -------------------
_INTRO_MD = """
# 🔍 Advanced PDF Comparison Tool

Upload two PDF files to get comprehensive analysis including:
- **Multi-page PDF support** (up to 5 pages per document)
- **Visual differences** with bounding boxes
- **OCR and spell checking**
- **Barcode/QR code detection**
- **CMYK color analysis**
"""

_HELP_MD = """
### 📝 Instructions:
1. Upload two PDF files
2. Click "Compare PDF Files"
3. View results with comprehensive analysis

### 🎨 Color Legend:
- **🔴 Red boxes:** Visual differences between files
- **🔵 Cyan boxes:** Potential spelling errors (OCR)
- **🟢 Green boxes:** Detected barcodes/QR codes
- **📊 Side panel:** CMYK color analysis for print workflows
"""

_BARCODE_HEADERS = ["Type", "Data", "Left", "Top", "Width", "Height", "Valid"]


def create_demo():
    """Build and return the Gradio Blocks app wired to ``compare_pdfs``."""
    with gr.Blocks(title="PDF Comparison Tool", theme=gr.themes.Soft()) as demo:
        gr.Markdown(_INTRO_MD)

        with gr.Row():
            with gr.Column():
                file_a = gr.File(label="📄 PDF A (Reference)", file_types=[".pdf"])
                file_b = gr.File(label="📄 PDF B (Comparison)", file_types=[".pdf"])

        compare_btn = gr.Button("🔍 Compare PDF Files", variant="primary", size="lg")

        status_md = gr.Markdown("")

        with gr.Row():
            overlay_img = gr.Image(label="🔴 Pixel Differences (Red = Different)", type="pil")

        with gr.Row():
            img_a = gr.Image(label="📄 File A with Analysis", type="pil")
            img_b = gr.Image(label="📄 File B with Analysis", type="pil")

        gr.Markdown("### 📊 Barcode Detection Results")
        with gr.Row():
            codes_a_df = gr.Dataframe(
                headers=list(_BARCODE_HEADERS),
                label="Barcodes in File A",
                interactive=False
            )
            codes_b_df = gr.Dataframe(
                headers=list(_BARCODE_HEADERS),
                label="Barcodes in File B",
                interactive=False
            )

        # Event handlers
        compare_btn.click(
            fn=compare_pdfs,
            inputs=[file_a, file_b],
            outputs=[overlay_img, img_a, img_b, status_md, codes_a_df, codes_b_df]
        )

        gr.Markdown(_HELP_MD)

    return demo


if __name__ == "__main__":
    demo = create_demo()
    demo.launch(
        server_name="0.0.0.0",  # Allow external access
        share=True,  # Set to True to create a public link
        show_error=True
    )