#!/usr/bin/env python3
"""
Gradio PDF Comparison Tool
Upload two PDF files and get comprehensive analysis including differences, OCR, barcodes, and CMYK analysis.
"""

import os
import sys
import re
import csv
import json
import io
from dataclasses import dataclass
from typing import List, Tuple, Optional
import tempfile

import numpy as np
from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError
from pdf2image import convert_from_path
from skimage.measure import label, regionprops
from skimage.morphology import dilation, rectangle
import gradio as gr

# Optional features
try:
    import pytesseract
    HAS_OCR = True
except Exception:
    pytesseract = None
    HAS_OCR = False

try:
    from spellchecker import SpellChecker
    HAS_SPELLCHECK = True
except Exception:
    SpellChecker = None
    HAS_SPELLCHECK = False

try:
    from pyzbar.pyzbar import decode as zbar_decode
    HAS_BARCODE = True
except Exception:
    zbar_decode = None
    HAS_BARCODE = False


# -------------------- Core Data --------------------
@dataclass
class Box:
    """Axis-aligned pixel rectangle; field order is (y1, x1, y2, x2, area)."""

    y1: int  # top edge (row)
    x1: int  # left edge (column)
    y2: int  # bottom edge (row)
    x2: int  # right edge (column)
    area: int  # pixel count supplied by whoever built the box


# -------------------- Helpers ----------------------
def _is_pdf(path: str) -> bool:
    """Return True when *path* has a ``.pdf`` extension (case-insensitive)."""
    _, ext = os.path.splitext(path.lower())
    return ext == ".pdf"


def _render_first_pdf_page(path: str, dpi: int, **poppler_kwargs) -> Image.Image:
    """Rasterize page 1 of the PDF at *path*; extra kwargs go to ``convert_from_path``."""
    pages = convert_from_path(path, dpi=dpi, first_page=1, last_page=1, **poppler_kwargs)
    if not pages:
        raise ValueError(f"No pages in PDF: {path}")
    return pages[0].convert("RGB")


def load_first_page(path: str, dpi: int = 300) -> Image.Image:
    """Load *path* as an RGB image.

    PDFs are rasterized (first page only) at *dpi*; any other file is opened
    directly with Pillow.

    Raises:
        ValueError: if both PDF conversion attempts fail.
    """
    if not _is_pdf(path):
        # Use a context manager so the underlying file handle is released;
        # convert() returns an independent, fully loaded image.
        with Image.open(path) as src:
            return src.convert("RGB")

    try:
        # Try with poppler_path explicitly set
        return _render_first_pdf_page(path, dpi, poppler_path="/usr/bin")
    except Exception as e1:
        try:
            # Fallback: try without explicit poppler_path
            return _render_first_pdf_page(path, dpi)
        except Exception as e2:
            raise ValueError(
                f"Failed to convert PDF to image. Error 1: {str(e1)}. Error 2: {str(e2)}. "
                "Make sure poppler-utils is installed."
            ) from e2


def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
    """Crop both images (from the top-left corner) to their common width/height."""
    if a.size == b.size:
        return a, b
    common = (0, 0, min(a.width, b.width), min(a.height, b.height))
    return a.crop(common), b.crop(common)


def difference_map(a: Image.Image, b: Image.Image) -> Image.Image:
    """Return the per-pixel absolute difference of two same-sized images."""
    return ImageChops.difference(a, b)


def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int = 25) -> List[Box]:
    """Locate connected regions of *diff_img* whose difference is >= *threshold*.

    The mask is dilated with a 3x3 footprint before labelling, so nearby
    differing pixels merge into one region. Regions whose (dilated) area is
    below *min_area* are dropped.
    """
    arr = np.asarray(diff_img)
    # Strongest channel difference per pixel; a single-channel image is used as is.
    gray = arr.max(axis=2) if arr.ndim == 3 else arr
    mask = (gray >= threshold).astype(np.uint8)
    mask = dilation(mask, rectangle(3, 3))
    labeled = label(mask, connectivity=2)

    boxes: List[Box] = []
    for region in regionprops(labeled):
        if region.area < min_area:
            continue
        min_row, min_col, max_row, max_col = region.bbox
        boxes.append(Box(min_row, min_col, max_row, max_col, int(region.area)))
    return boxes


def draw_boxes_multi(
    img: Image.Image,
    red_boxes: List[Box],
    cyan_boxes: List[Box],
    green_boxes: Optional[List[Box]] = None,
    width: int = 3,
    red_labels: Optional[List[int]] = None,
) -> Image.Image:
    """Return a copy of *img* with outlined boxes drawn on it.

    Red boxes mark differences, cyan boxes misspellings and green boxes
    barcodes. When *red_labels* is given, each red box also gets a small
    numbered tag in its top-left corner; boxes beyond the end of
    *red_labels* fall back to their 1-based position.
    """
    out = img.copy()
    d = ImageDraw.Draw(out)

    def outline(boxes, color):
        # Thickness is built from `width` concentric 1px rectangles growing outward.
        for b in boxes:
            for grow in range(width):
                d.rectangle([b.x1 - grow, b.y1 - grow, b.x2 + grow, b.y2 + grow], outline=color)

    # red (diff)
    outline(red_boxes, (255, 0, 0))

    # labels for red boxes
    if red_labels:
        for idx, b in enumerate(red_boxes):
            # Named `tag` so it does not shadow the module-level skimage `label`.
            tag = str(red_labels[idx]) if idx < len(red_labels) else str(idx + 1)
            tx = max(0, b.x1 + 3)
            ty = max(0, b.y1 + 3)
            d.rectangle([tx - 2, ty - 2, tx + 14, ty + 14], fill=(255, 255, 255))
            d.text((tx, ty), tag, fill=(0, 0, 0))

    # cyan (misspellings)
    outline(cyan_boxes, (0, 255, 255))

    # green (barcodes)
    if green_boxes:
        outline(green_boxes, (0, 255, 0))
    return out


def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
    """Return a copy of *a* where every pixel that differs from *b* is pure red."""
    base = np.asarray(a).copy()
    other = np.asarray(b)
    changed = np.any(base != other, axis=2)
    base[changed] = [255, 0, 0]
    return Image.fromarray(base)


# -------------------- OCR + Spellcheck -------------
def normalize_token(token: str) -> str:
    """Strip everything except ASCII letters and apostrophes, then lowercase."""
    return re.sub(r"[^A-Za-z']", "", token).lower()


def find_misspell_boxes(img: Image.Image) -> List[Box]:
    """OCR *img* and return a box for every word the spell checker does not know.

    Returns an empty list when OCR or spell checking is unavailable, or when
    the OCR call itself fails (best effort).
    """
    if not (HAS_OCR and HAS_SPELLCHECK):
        return []
    try:
        spell = SpellChecker()
        data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
    except Exception:
        return []

    texts = data.get("text", [])
    # A missing geometry column degrades to zeros (and is then skipped below)
    # instead of raising IndexError part-way through the loop.
    zeros = [0] * len(texts)
    columns = zip(
        texts,
        data.get("left", zeros),
        data.get("top", zeros),
        data.get("width", zeros),
        data.get("height", zeros),
    )

    boxes: List[Box] = []
    for text, left, top, width, height in columns:
        if not text:
            continue
        token = normalize_token(text)
        if len(token) < 2 or token in spell:
            continue
        if width <= 0 or height <= 0:
            continue
        boxes.append(Box(top, left, top + height, left + width, width * height))
    return boxes


# -------------------- Barcode / QR -----------------
def ean_like_checksum_ok(digits: str) -> bool:
    """Validate the mod-10 check digit of an 8-, 12- or 13-digit code.

    Non-numeric input is rejected. Numeric strings of any other length are
    accepted without a check (returns True), matching the original contract.
    """
    # isdecimal() rather than isdigit(): characters such as "²" satisfy
    # isdigit() but make int() raise.
    if not digits.isdecimal():
        return False
    if len(digits) not in (8, 12, 13):
        return True

    *body, check = (int(c) for c in digits)
    # For all three lengths the digit next to the check digit has weight 3 and
    # the weights alternate 3,1,3,... moving left.
    weighted = sum(d * (3 if k % 2 == 0 else 1) for k, d in enumerate(reversed(body)))
    return (10 - weighted % 10) % 10 == check


def validate_symbology(symbology: str, data: bytes) -> bool:
    """Return True when *data* looks valid for the given barcode *symbology*.

    EAN/UPC payloads must pass the check-digit test; every other symbology
    (including QR codes) only needs a non-empty payload. *data* may be bytes
    (decoded as UTF-8, undecodable bytes ignored) or an already decoded str;
    anything else is treated as invalid.
    """
    if isinstance(data, (bytes, bytearray)):
        text = data.decode('utf-8', errors='ignore')
    elif isinstance(data, str):
        text = data
    else:
        return False

    sym = (symbology or '').upper()
    if sym in ("EAN13", "EAN-13", "EAN8", "EAN-8", "UPCA", "UPC-A"):
        return ean_like_checksum_ok(re.sub(r"\D", "", text))
    # QR codes and all remaining symbologies: any non-empty payload is accepted.
    return len(text) > 0


def boxes_from_rect(x: int, y: int, w: int, h: int) -> Box:
    """Convert an (x, y, width, height) rectangle into a Box."""
    return Box(y, x, y + h, x + w, w * h)


def _rescale_decoded(decoded, factor: int):
    """Map a result decoded on an image upscaled by *factor* back to original pixels."""
    try:
        rect = decoded.rect
        changes = {
            "rect": rect._replace(
                left=rect.left // factor,
                top=rect.top // factor,
                width=rect.width // factor,
                height=rect.height // factor,
            )
        }
        polygon = getattr(decoded, "polygon", None)
        if polygon:
            changes["polygon"] = [
                p._replace(x=p.x // factor, y=p.y // factor) for p in polygon
            ]
        return decoded._replace(**changes)
    except AttributeError:
        # NOTE(review): assumes pyzbar results are namedtuples (so _replace
        # exists); if not, the result is returned unscaled as before.
        return decoded


def decode_with_variants(img: Image.Image):
    """Try to decode barcodes from *img*, retrying on progressively altered copies.

    Order of attempts: the image as given, a grayscale copy, a 2x bicubic
    upscale, and finally an RGB copy (only if *img* is not already RGB). The
    first attempt that yields anything wins. Coordinates of results found on
    the upscaled copy are divided by 2 so that they refer to *img*.
    Returns an empty list when barcode support is unavailable.
    """
    if not HAS_BARCODE:
        return []

    def attempt(pil_img, scale=1):
        # Decoder failures are deliberately non-fatal: the next variant is tried.
        try:
            found = zbar_decode(pil_img)
        except Exception:
            return []
        if not found:
            return []
        if scale == 1:
            return list(found)
        return [_rescale_decoded(item, scale) for item in found]

    results = attempt(img)
    if not results:
        results = attempt(img.convert('L'))
    if not results:
        doubled = img.resize((img.width * 2, img.height * 2), Image.BICUBIC)
        results = attempt(doubled, scale=2)
    if not results and img.mode != 'RGB':
        results = attempt(img.convert('RGB'))
    return results


def find_barcode_boxes_and_info(img: Image.Image):
    """Decode barcodes in *img*.

    Returns:
        (boxes, infos): a Box per barcode and a parallel list of dicts with
        the keys 'type', 'data', 'left', 'top', 'width', 'height', 'valid'.
    """
    boxes: List[Box] = []
    infos = []
    for d in decode_with_variants(img):
        rect = d.rect
        boxes.append(boxes_from_rect(rect.left, rect.top, rect.width, rect.height))
        if isinstance(d.data, (bytes, bytearray)):
            payload = d.data.decode('utf-8', errors='ignore')
        else:
            payload = str(d.data)
        infos.append({
            'type': d.type,
            'data': payload,
            'left': rect.left,
            'top': rect.top,
            'width': rect.width,
            'height': rect.height,
            'valid': bool(validate_symbology(d.type, d.data)),
        })
    return boxes, infos


# -------------------- CMYK Panel -------------------
def rgb_to_cmyk_array(img: Image.Image) -> np.ndarray:
    """Return *img* converted to CMYK as a float32 array with values 0..255."""
    # NOTE(review): Pillow's built-in RGB->CMYK mode conversion is believed to
    # do no black generation (K stays 0) — confirm before relying on K values.
    return np.asarray(img.convert('CMYK')).astype(np.float32)  # 0..255


def avg_cmyk_in_box(cmyk_arr: np.ndarray, box: Box) -> Tuple[float, float, float, float]:
    """Return the mean (C, M, Y, K) inside *box* as percentages rounded to 0.1.

    The box is clipped to the array bounds; an empty intersection yields zeros.
    """
    y1, y2 = max(0, box.y1), min(cmyk_arr.shape[0], box.y2)
    x1, x2 = max(0, box.x1), min(cmyk_arr.shape[1], box.x2)
    if y2 <= y1 or x2 <= x1:
        return (0.0, 0.0, 0.0, 0.0)

    region = cmyk_arr[y1:y2, x1:x2, :]
    mean_vals = region.reshape(-1, 4).mean(axis=0)
    # Convert to a Python float BEFORE rounding: rounding a float32 scalar and
    # widening it afterwards yields values like 50.20000076293945.
    return tuple(round(float(v) * 100.0 / 255.0, 1) for v in mean_vals)


def compute_cmyk_diffs(a_img: Image.Image, b_img: Image.Image, red_boxes: List[Box]):
    """Compute average CMYK in each box for both images.

    Returns a list of dicts with 'idx' (1-based), 'A', 'B' and 'Delta'
    (B minus A, rounded to 0.1).
    """
    a_cmyk = rgb_to_cmyk_array(a_img)
    b_cmyk = rgb_to_cmyk_array(b_img)
    entries = []
    for number, bx in enumerate(red_boxes, start=1):
        a_vals = avg_cmyk_in_box(a_cmyk, bx)
        b_vals = avg_cmyk_in_box(b_cmyk, bx)
        delta = tuple(round(b_c - a_c, 1) for a_c, b_c in zip(a_vals, b_vals))
        entries.append({'idx': number, 'A': a_vals, 'B': b_vals, 'Delta': delta})
    return entries


def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns',
                    panel_width: int = 260) -> Image.Image:
    """Return *base* with a text panel of CMYK entries attached on its right side.

    Entries that would not fit in the panel height are silently omitted.
    """
    w, h = base.size
    panel = Image.new('RGB', (panel_width, h), (245, 245, 245))
    out = Image.new('RGB', (w + panel_width, h), (255, 255, 255))
    out.paste(base, (0, 0))
    out.paste(panel, (w, 0))

    d = ImageDraw.Draw(out)
    x0 = w + 8
    y = 8
    d.text((x0, y), title, fill=(0, 0, 0))
    y += 18

    if not entries:
        d.text((x0, y), 'No differing regions', fill=(80, 80, 80))
        return out

    for e in entries:
        idx = e['idx']
        aC, aM, aY, aK = e['A']
        bC, bM, bY, bK = e['B']
        dC, dM, dY, dK = e['Delta']
        d.text((x0, y), f"#{idx}", fill=(0, 0, 0))
        y += 14
        d.text((x0, y), f"A: C {aC}% M {aM}% Y {aY}% K {aK}%", fill=(0, 0, 0))
        y += 14
        d.text((x0, y), f"B: C {bC}% M {bM}% Y {bY}% K {bK}%", fill=(0, 0, 0))
        y += 14
        d.text((x0, y), f"Δ: C {dC}% M {dM}% Y {dY}% K {dK}%", fill=(120, 0, 0))
        y += 18
        # Stop once there is no room left for another entry.
        if y > h - 40:
            break
    return out


# -------------------- Gradio Interface -----------------
def _upload_path(upload) -> str:
    """Return the filesystem path of a Gradio upload.

    Depending on the Gradio version/configuration the File component hands the
    callback either a plain path or a file-like wrapper exposing ``.name``.
    """
    if isinstance(upload, (str, os.PathLike)):
        return os.fspath(upload)
    return upload.name


def _barcode_rows(infos):
    """Flatten barcode info dicts into rows matching the results table headers."""
    return [
        [c.get('type', ''), c.get('data', ''), c.get('left', 0), c.get('top', 0),
         c.get('width', 0), c.get('height', 0), c.get('valid', False)]
        for c in infos
    ]


def compare_pdfs(file_a, file_b):
    """Main comparison function for Gradio interface.

    Returns a 6-tuple: (overlay image, annotated A, annotated B, status
    markdown, barcode rows for A, barcode rows for B). On any failure the
    images are None and the status carries the error message.
    """
    try:
        if file_a is None or file_b is None:
            return None, None, None, "❌ Please upload both PDF files to compare", [], []

        # Load images with default settings
        a = load_first_page(_upload_path(file_a), dpi=300)
        b = load_first_page(_upload_path(file_b), dpi=300)

        # Match sizes
        a, b = match_sizes(a, b)

        # Find differences with default settings
        diff = difference_map(a, b)
        red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)

        # Run all analysis features with defaults; each helper returns empty
        # results by itself when its optional dependency is missing.
        misspell_a = find_misspell_boxes(a)
        misspell_b = find_misspell_boxes(b)
        bar_a, info_a = find_barcode_boxes_and_info(a)
        bar_b, info_b = find_barcode_boxes_and_info(b)

        # Always enable CMYK analysis
        cmyk_entries = compute_cmyk_diffs(a, b, red_boxes)
        labels = [e['idx'] for e in cmyk_entries]

        # Create visualizations with default box width
        a_boxed_core = draw_boxes_multi(a, red_boxes, misspell_a, bar_a, width=3, red_labels=labels)
        b_boxed_core = draw_boxes_multi(b, red_boxes, misspell_b, bar_b, width=3, red_labels=labels)

        # Always show CMYK panel
        a_disp = draw_cmyk_panel(a_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
        b_disp = draw_cmyk_panel(b_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')

        # Create pixel difference overlay
        overlay = make_red_overlay(a, b)

        # Create status message
        status = (
            "\n"
            "📊 **Analysis Complete!**\n"
            "\n"
            f"- **Difference regions found:** {len(red_boxes)}\n"
            f"- **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}\n"
            f"- **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}\n"
            f"- **Image dimensions:** {a.width} × {a.height} pixels\n"
            "\n"
            "**Legend:**\n"
            "- 🔴 Red boxes: Visual differences\n"
            "- 🔵 Cyan boxes: Spelling errors\n"
            "- 🟢 Green boxes: Barcodes/QR codes\n"
        )

        # Prepare barcode data for tables
        codes_a = _barcode_rows(info_a)
        codes_b = _barcode_rows(info_b)

        return overlay, a_disp, b_disp, status, codes_a, codes_b

    except Exception as e:
        # UI boundary: report the failure in the status panel instead of crashing.
        error_msg = f"❌ **Error:** {str(e)}"
        return None, None, None, error_msg, [], []


# -------------------- Gradio App -------------------
def create_demo():
    """Build and return the Gradio Blocks UI for the comparison tool."""
    # NOTE(review): the nesting of the layout containers below was reconstructed
    # from a whitespace-collapsed source — confirm against the intended layout.
    barcode_headers = ["Type", "Data", "Left", "Top", "Width", "Height", "Valid"]

    with gr.Blocks(title="PDF Comparison Tool", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🔍 Advanced PDF Comparison Tool

        Upload two PDF files to get comprehensive analysis including:
        - **Visual differences** with bounding boxes
        - **OCR and spell checking**
        - **Barcode/QR code detection**
        - **CMYK color analysis**
        """)

        with gr.Row():
            with gr.Column():
                file_a = gr.File(label="📄 PDF A (Reference)", file_types=[".pdf"])
                file_b = gr.File(label="📄 PDF B (Comparison)", file_types=[".pdf"])
                compare_btn = gr.Button("🔍 Compare PDF Files", variant="primary", size="lg")

        status_md = gr.Markdown("")

        with gr.Row():
            overlay_img = gr.Image(label="🔴 Pixel Differences (Red = Different)", type="pil")

        with gr.Row():
            img_a = gr.Image(label="📄 File A with Analysis", type="pil")
            img_b = gr.Image(label="📄 File B with Analysis", type="pil")

        gr.Markdown("### 📊 Barcode Detection Results")
        with gr.Row():
            codes_a_df = gr.Dataframe(
                headers=list(barcode_headers),
                label="Barcodes in File A",
                interactive=False
            )
            codes_b_df = gr.Dataframe(
                headers=list(barcode_headers),
                label="Barcodes in File B",
                interactive=False
            )

        # Event handlers
        compare_btn.click(
            fn=compare_pdfs,
            inputs=[file_a, file_b],
            outputs=[overlay_img, img_a, img_b, status_md, codes_a_df, codes_b_df]
        )

        gr.Markdown("""
        ### 📝 Instructions:
        1. Upload two PDF files
        2. Click "Compare PDF Files"
        3. View results with comprehensive analysis

        ### 🎨 Color Legend:
        - **🔴 Red boxes:** Visual differences between files
        - **🔵 Cyan boxes:** Potential spelling errors (OCR)
        - **🟢 Green boxes:** Detected barcodes/QR codes
        - **📊 Side panel:** CMYK color analysis for print workflows
        """)

    return demo


if __name__ == "__main__":
    demo = create_demo()
    demo.launch(
        server_name="0.0.0.0",  # Allow external access
        share=True,  # Set to True to create a public link
        show_error=True
    )