| |
| """ |
| Gradio PDF Comparison Tool |
| Upload two PDF files and get comprehensive analysis including differences, OCR, barcodes, and CMYK analysis. |
| """ |
|
|
| import os, sys, re, csv, json, io |
| from dataclasses import dataclass |
| from typing import List, Tuple, Optional, Iterable |
| import tempfile |
| import unicodedata |
|
|
| import numpy as np |
| from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError |
| from pdf2image import convert_from_path |
| from skimage.measure import label, regionprops |
| from skimage.morphology import dilation, rectangle |
| import gradio as gr |
|
|
| |
| try: |
| import fitz |
| HAS_PYMUPDF = True |
| except Exception: |
| fitz = None |
| HAS_PYMUPDF = False |
|
|
| |
| try: |
| import pytesseract |
| HAS_OCR = True |
| except Exception: |
| pytesseract = None |
| HAS_OCR = False |
|
|
| try: |
| from spellchecker import SpellChecker |
| HAS_SPELLCHECK = True |
| except Exception: |
| SpellChecker = None |
| HAS_SPELLCHECK = False |
|
|
| try: |
| import regex as re |
| HAS_REGEX = True |
| except Exception: |
| import re |
| HAS_REGEX = False |
|
|
| try: |
| from pyzbar.pyzbar import decode as zbar_decode |
| HAS_BARCODE = True |
| except Exception: |
| zbar_decode = None |
| HAS_BARCODE = False |
|
|
| |
@dataclass
class Box:
    """Axis-aligned bounding box in image pixel coordinates (row/col order)."""
    y1: int    # top row
    x1: int    # left column
    y2: int    # bottom row
    x2: int    # right column
    area: int  # region area in pixels
|
|
| |
# Word-matching regex: the third-party `regex` module supports Unicode
# property classes (\p{Letter}) so accented/French letters match; the stdlib
# `re` fallback is ASCII-only. Both allow internal apostrophes/hyphens
# (e.g. "l'eau", "mix-up").
if HAS_REGEX:
    _WORD_RE = re.compile(r"\p{Letter}+(?:['\-]\p{Letter}+)*", re.UNICODE)
else:
    _WORD_RE = re.compile(r"[A-Za-z]+(?:['\-][A-Za-z]+)*")

# English is the primary dictionary; French is optional (its wordlist may be
# absent from the spellchecker installation).
if HAS_SPELLCHECK:
    _SPELL_EN = SpellChecker(language="en")
    try:
        _SPELL_FR = SpellChecker(language="fr")
    except Exception:
        _SPELL_FR = None
else:
    _SPELL_EN = None
    _SPELL_FR = None

# Domain/brand terms that must never be flagged as misspellings.
_DOMAIN_ALLOWLIST = {
    "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
    "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid"
}
_DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}

# Seed the dictionaries with the allowlist so dictionary lookups succeed too.
if _SPELL_EN:
    _SPELL_EN.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
if _SPELL_FR:
    _SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
|
|
| def _normalize_text(s: str) -> str: |
| s = unicodedata.normalize("NFC", s) |
| return s.replace("'", "'").strip() |
|
|
def _extract_tokens(raw: str):
    """Return the word tokens of *raw* (None treated as empty) after normalization."""
    cleaned = _normalize_text(raw or "")
    return _WORD_RE.findall(cleaned)
|
|
| def _looks_like_acronym(tok: str) -> bool: |
| return tok.isupper() and 2 <= len(tok) <= 6 |
|
|
| def _has_digits(tok: str) -> bool: |
| return any(ch.isdigit() for ch in tok) |
|
|
def _is_known_word(tok: str) -> bool:
    """Return True if *tok* should NOT be flagged as a misspelling.

    A token is "known" when it is allowlisted, looks like an acronym, contains
    digits (part numbers, codes), is a hyphenated compound of known parts, or
    appears in the English or French dictionary.
    """
    t = tok.lower()
    if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
        return True

    # Hyphenated compounds: accept when every part is itself known.
    if '-' in tok:
        parts = tok.split('-')
        if all(_is_known_word(part) for part in parts):
            return True

    # SpellChecker.unknown() returns the subset NOT in the dictionary,
    # so an empty result means the word is known.
    if _SPELL_EN and not _SPELL_EN.unknown([t]):
        return True
    if _SPELL_FR and not _SPELL_FR.unknown([t]):
        return True
    return False
|
|
| |
def normalize_token(token: str) -> str:
    """Lower-cased first word token of *token*, or "" when nothing matches."""
    tokens = _extract_tokens(token)
    if not tokens:
        return ""
    return tokens[0].lower()
|
|
| |
| def _is_pdf(path: str) -> bool: |
| return os.path.splitext(path.lower())[1] == ".pdf" |
|
|
def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
    """Rasterize up to *max_pages* pages of a PDF (or open a plain image file).

    Tries pdf2image with several common poppler locations (then PATH), and
    falls back to PyMuPDF rendering when poppler is unavailable. All pages are
    returned as RGB PIL images.

    Raises:
        ValueError: when the PDF cannot be converted by any backend.
    """
    if not _is_pdf(path):
        # Non-PDF input: treat it as a regular image file.
        return [Image.open(path).convert("RGB")]

    # BUGFIX: the original discarded the caught exceptions (hard-coding
    # "poppler not found" in its messages) and ended with an unreachable
    # raise. Keep the last real error for diagnostics instead.
    last_error: Optional[Exception] = None
    for poppler_path in ("/usr/bin", "/usr/local/bin", "/bin", None):
        try:
            kwargs = {"poppler_path": poppler_path} if poppler_path else {}
            imgs = convert_from_path(path, dpi=dpi, first_page=1,
                                     last_page=max_pages, **kwargs)
            if imgs:
                return [img.convert("RGB") for img in imgs]
        except Exception as e:
            last_error = e

    # pdf2image failed everywhere -> render with PyMuPDF if available.
    if not HAS_PYMUPDF:
        raise ValueError(
            "Failed to convert PDF to image with all poppler paths. "
            f"Last error: {last_error}. PyMuPDF not available as fallback."
        )
    try:
        doc = fitz.open(path)
        pages = []
        for page_num in range(min(len(doc), max_pages)):
            page = doc[page_num]
            # 72 points per inch is the PDF unit; scale to the requested DPI.
            mat = fitz.Matrix(dpi / 72, dpi / 72)
            pix = page.get_pixmap(matrix=mat)
            img = Image.open(io.BytesIO(pix.tobytes("ppm")))
            pages.append(img.convert("RGB"))
        doc.close()
        return pages
    except Exception as e:
        raise ValueError(
            "Failed to convert PDF with both pdf2image and PyMuPDF. "
            f"pdf2image error: {last_error}. PyMuPDF error: {e}"
        )
|
|
def combine_pages_vertically(pages: List[Image.Image], spacing: int = 20) -> Image.Image:
    """Combine multiple pages into a single vertical image.

    Pages are stacked top-to-bottom on a white canvas, centered horizontally,
    with *spacing* blank pixels between consecutive pages.
    """
    if not pages:
        raise ValueError("No pages to combine")
    if len(pages) == 1:
        return pages[0]

    canvas_w = max(p.width for p in pages)
    canvas_h = sum(p.height for p in pages) + spacing * (len(pages) - 1)
    canvas = Image.new('RGB', (canvas_w, canvas_h), (255, 255, 255))

    y = 0
    for p in pages:
        canvas.paste(p, ((canvas_w - p.width) // 2, y))
        y += p.height + spacing

    return canvas
|
|
def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
    """Crop both images (anchored at the top-left) to their common size."""
    if a.size == b.size:
        return a, b
    common = (0, 0, min(a.width, b.width), min(a.height, b.height))
    return a.crop(common), b.crop(common)
|
|
def difference_map(a: Image.Image, b: Image.Image) -> Image.Image:
    """Per-pixel absolute difference of two same-sized images."""
    return ImageChops.difference(a, b)
|
|
def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int = 25) -> List[Box]:
    """Turn a difference image into bounding boxes of changed regions.

    A pixel counts as "different" when the max over its channels reaches
    *threshold*. The binary mask is dilated with a 3x3 rectangle to merge
    near-adjacent speckles, then connected components with fewer than
    *min_area* pixels are discarded.
    """
    channels = np.asarray(diff_img).astype(np.uint16)
    gray = channels.max(axis=2).astype(np.uint8)
    mask = dilation((gray >= threshold).astype(np.uint8), rectangle(3, 3))
    boxes: List[Box] = []
    for region in regionprops(label(mask, connectivity=2)):
        if region.area >= min_area:
            top, left, bottom, right = region.bbox
            boxes.append(Box(top, left, bottom, right, int(region.area)))
    return boxes
|
|
def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box],
                     green_boxes: Optional[List[Box]] = None,
                     width: int = 3, red_labels: Optional[List[int]] = None) -> Image.Image:
    """Return a copy of *img* with colored box outlines drawn on it.

    red_boxes:   visual differences, optionally numbered via *red_labels*
    cyan_boxes:  suspected misspellings
    green_boxes: barcodes / QR codes
    *width* concentric rectangles are drawn to fake a thick outline (PIL's
    rectangle outline is one pixel wide).
    """
    out = img.copy()
    d = ImageDraw.Draw(out)

    def _outline(box: Box, color) -> None:
        # Draw `width` nested rectangles for a thicker border.
        for w in range(width):
            d.rectangle([box.x1 - w, box.y1 - w, box.x2 + w, box.y2 + w], outline=color)

    for b in red_boxes:
        _outline(b, (255, 0, 0))

    if red_labels:
        for idx, b in enumerate(red_boxes):
            # Caller-supplied label when present, else a 1-based index.
            # (Renamed from `label` to avoid shadowing skimage's label().)
            text = str(red_labels[idx]) if idx < len(red_labels) else str(idx + 1)
            tx = max(0, b.x1 + 3)
            ty = max(0, b.y1 + 3)
            # White chip behind the text so it stays readable on any content.
            d.rectangle([tx - 2, ty - 2, tx + 14, ty + 14], fill=(255, 255, 255))
            d.text((tx, ty), text, fill=(0, 0, 0))

    for b in cyan_boxes:
        _outline(b, (0, 255, 255))

    for b in green_boxes or []:
        _outline(b, (0, 255, 0))

    return out
|
|
def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
    """Copy of *a* with every pixel that differs from *b* painted pure red."""
    base = np.asarray(a).copy()
    other = np.asarray(b)
    differs = np.any(base != other, axis=2)
    base[differs] = [255, 0, 0]
    return Image.fromarray(base)
|
|
| |
| from typing import List, Iterable, Optional |
| from PIL import Image |
| import unicodedata |
| import regex as re |
| import pytesseract |
| from spellchecker import SpellChecker |
|
|
| |
# Defensive re-declaration: ensure the capability flags exist before they are
# referenced below, even if the guarded imports earlier were edited away.
try:
    HAS_OCR
except NameError:
    HAS_OCR = True
try:
    HAS_SPELLCHECK
except NameError:
    HAS_SPELLCHECK = True
|
|
| |
# NOTE(review): this section re-creates the word regex, spellcheckers and
# allowlist defined earlier in the file, overriding the guarded versions.
# BUGFIX: the 'β' previously inside the character class was mojibake of the
# typographic apostrophe (U+2019); use the real character so words like
# "l’eau" tokenize as a single token.
_WORD_RE = re.compile(r"\p{Letter}+(?:[\u2019'\-]\p{Letter}+)*", re.UNICODE)

_SPELL_EN = SpellChecker(language="en")
_SPELL_FR = SpellChecker(language="fr")

# Domain/brand terms that must never be flagged as misspellings.
_DOMAIN_ALLOWLIST = {
    "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
    "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid"
}
_SPELL_EN.word_frequency.load_words(w.lower() for w in _DOMAIN_ALLOWLIST)
_SPELL_FR.word_frequency.load_words(w.lower() for w in _DOMAIN_ALLOWLIST)
|
|
| def _normalize_text(s: str) -> str: |
| s = unicodedata.normalize("NFC", s) |
| return s.replace("β", "'").strip() |
|
|
def _extract_tokens(raw: str):
    """Word tokens of *raw* (None treated as empty) after normalization."""
    return _WORD_RE.findall(_normalize_text(raw or ""))
|
|
| def _looks_like_acronym(tok: str) -> bool: |
| return tok.isupper() and 2 <= len(tok) <= 6 |
|
|
| def _has_digits(tok: str) -> bool: |
| return any(ch.isdigit() for ch in tok) |
|
|
| |
def normalize_token(token: str) -> str:
    """Lower-cased first word token of *token*; "" when nothing matches."""
    try:
        return _extract_tokens(token)[0].lower()
    except IndexError:
        return ""
|
|
def _get_available_tesseract_langs():
    """Best available Tesseract language pack: eng+fra > eng > first found > eng."""
    try:
        installed = pytesseract.get_languages()
    except Exception:
        return "eng"
    if 'eng' in installed and 'fra' in installed:
        return "eng+fra"
    if 'eng' in installed:
        return "eng"
    return installed[0] if installed else "eng"
|
|
def prepare_for_ocr(img: Image.Image) -> Image.Image:
    """Grayscale + autocontrast + unsharp mask to improve Tesseract accuracy."""
    from PIL import ImageOps, ImageFilter
    prepared = ImageOps.autocontrast(img.convert("L"))
    return prepared.filter(ImageFilter.UnsharpMask(radius=1.0, percent=150, threshold=2))
|
|
def extract_pdf_text(path: str, max_pages: int = 5) -> List[str]:
    """Text of the first *max_pages* pages via PyMuPDF; [] on any failure."""
    if not HAS_PYMUPDF:
        return []
    try:
        doc = fitz.open(path)
        texts = [doc[i].get_text() for i in range(min(len(doc), max_pages))]
        doc.close()
        return texts
    except Exception:
        return []
|
|
def convert_pdf_to_image_coords(pdf_bbox, pdf_page_size, image_size, page_num=0, page_height=1000):
    """Map a PDF-space bbox to pixel coordinates in a vertically-stacked image.

    The x/y scale factors come from the PDF page size vs the rendered image
    size; *page_num* * *page_height* shifts y into the stacked image.
    Returns (x1, y1, x2, y2) as ints.
    """
    pdf_w, pdf_h = pdf_page_size
    img_w, img_h = image_size
    sx = img_w / pdf_w
    sy = img_h / pdf_h
    y_shift = page_num * page_height
    return (
        int(pdf_bbox[0] * sx),
        int(pdf_bbox[1] * sy) + y_shift,
        int(pdf_bbox[2] * sx),
        int(pdf_bbox[3] * sy) + y_shift,
    )
|
|
def find_misspell_boxes_from_text(
    pdf_path: str,
    *,
    extra_allow: Optional[Iterable[str]] = None,
    max_pages: int = 5,
    image_size: Optional[Tuple[int, int]] = None
) -> List[Box]:
    """Find misspellings by analyzing extracted PDF text directly with coordinate mapping.

    Walks PyMuPDF's text dict (blocks -> lines -> spans) and emits one Box per
    span containing at least one unknown token. With *image_size* given, PDF
    coordinates are scaled to that size and each page's boxes are shifted down
    by the image height; otherwise raw PDF points with a fixed 1000-unit page
    stride are used. *extra_allow* words are added to both dictionaries first.
    Returns [] when spellchecker/PyMuPDF support is unavailable.
    """
    if not (HAS_SPELLCHECK and HAS_PYMUPDF):
        return []

    # Teach the dictionaries any caller-specific vocabulary up front.
    if extra_allow and _SPELL_EN:
        _SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
    if extra_allow and _SPELL_FR:
        _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)

    boxes: List[Box] = []

    try:
        doc = fitz.open(pdf_path)

        for page_num in range(min(len(doc), max_pages)):
            page = doc[page_num]

            # "dict" output preserves per-span bounding boxes.
            text_dict = page.get_text("dict")

            for block in text_dict.get("blocks", []):
                if "lines" not in block:
                    continue  # image blocks carry no text

                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span.get("text", "").strip()
                        if not text:
                            continue

                        # A single unknown token flags the whole span.
                        tokens = _extract_tokens(text)
                        has_misspelling = False

                        for token in tokens:
                            if len(token) >= 2 and not _is_known_word(token):
                                has_misspelling = True
                                break

                        if has_misspelling:
                            bbox = span["bbox"]  # (x0, y0, x1, y1) in PDF points

                            page_rect = page.rect
                            pdf_width = page_rect.width
                            pdf_height = page_rect.height

                            if image_size:
                                img_width, img_height = image_size
                                # NOTE(review): scales against the FULL stacked
                                # image height and offsets by it per page —
                                # verify this matches combine_pages_vertically's
                                # layout, which also inserts inter-page spacing.
                                scale_x = img_width / pdf_width
                                scale_y = img_height / pdf_height

                                x1 = int(bbox[0] * scale_x)
                                y1 = int(bbox[1] * scale_y) + (page_num * img_height)
                                x2 = int(bbox[2] * scale_x)
                                y2 = int(bbox[3] * scale_y) + (page_num * img_height)
                            else:
                                # No render size known: raw PDF points with an
                                # arbitrary 1000pt vertical page stride.
                                x1 = int(bbox[0])
                                y1 = int(bbox[1]) + (page_num * 1000)
                                x2 = int(bbox[2])
                                y2 = int(bbox[3]) + (page_num * 1000)

                            boxes.append(Box(
                                y1=y1,
                                x1=x1,
                                y2=y2,
                                x2=x2,
                                area=(x2 - x1) * (y2 - y1)
                            ))

        doc.close()

    except Exception:
        # Fallback: plain-text extraction; flag whole pages (coarse 800x1000
        # placeholder boxes) that contain any unknown token.
        page_texts = extract_pdf_text(pdf_path, max_pages)
        for page_num, text in enumerate(page_texts):
            if not text.strip():
                continue

            tokens = _extract_tokens(text)
            misspelled_words = [token for token in tokens if len(token) >= 2 and not _is_known_word(token)]

            if misspelled_words:
                boxes.append(Box(
                    y1=page_num * 1000,
                    x1=0,
                    y2=(page_num + 1) * 1000,
                    x2=800,
                    area=800 * 1000
                ))

    return boxes
|
|
def find_misspell_boxes(
    img: Image.Image,
    *,
    min_conf: int = 60,
    lang: Optional[str] = None,
    extra_allow: Optional[Iterable[str]] = None,
    dpi: int = 300,
    psm: int = 6,
    oem: int = 3
) -> List[Box]:
    """Legacy OCR-based spell checking (kept for fallback).

    Runs Tesseract word-by-word over *img* and returns a Box for every
    confident word containing an unknown token. Returns [] when OCR or
    spellcheck support is missing, or on any Tesseract failure.
    """
    if not (HAS_OCR and HAS_SPELLCHECK):
        return []

    # Pick eng+fra when both language packs are installed, else English only.
    if lang is None:
        try:
            avail = set(pytesseract.get_languages(config="") or [])
        except Exception:
            avail = {"eng"}
        lang = "eng+fra" if {"eng","fra"}.issubset(avail) else "eng"

    # Small renders OCR poorly — upscale 2x below 1600px width.
    # NOTE(review): the returned boxes are then in the UPSCALED image's
    # coordinates, not the caller's original image — confirm callers expect this.
    if img.width < 1600:
        scale = 2
        img = img.resize((img.width*scale, img.height*scale), Image.LANCZOS)

    img = prepare_for_ocr(img)

    try:
        if extra_allow and _SPELL_EN:
            _SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
        if extra_allow and _SPELL_FR:
            _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)

        # psm 6 = assume a single uniform block of text; oem 3 = default engine.
        config = f"--psm {psm} --oem {oem} -c preserve_interword_spaces=1 -c user_defined_dpi={dpi}"

        data = pytesseract.image_to_data(
            img,
            lang=lang,
            config=config,
            output_type=pytesseract.Output.DICT,
        )
    except Exception:
        return []

    n = len(data.get("text", [])) or 0
    boxes: List[Box] = []

    for i in range(n):
        raw = data["text"][i]
        if not raw:
            continue

        # Tesseract reports confidence as a string; -1 means "no word here".
        conf_str = data.get("conf", ["-1"])[i]
        try:
            conf = int(float(conf_str))
        except Exception:
            conf = -1
        if conf < min_conf:
            continue

        tokens = _extract_tokens(raw)
        if not tokens:
            continue

        # Skip the word when every token is known (or too short to judge).
        if all(_is_known_word(tok) or len(tok) < 2 for tok in tokens):
            continue

        left = data.get("left", [0])[i]
        top = data.get("top", [0])[i]
        width = data.get("width", [0])[i]
        height = data.get("height",[0])[i]
        if width <= 0 or height <= 0:
            continue

        # Box takes (y1, x1, y2, x2, area).
        boxes.append(Box(top, left, top + height, left + width, width * height))

    return boxes
|
|
|
|
| |
| from typing import List, Tuple, Optional |
| from PIL import Image, ImageOps |
| import io, regex as re |
|
|
| try: |
| from pyzbar.pyzbar import decode as zbar_decode, ZBarSymbol |
| HAS_BARCODE = True |
| except Exception: |
| HAS_BARCODE = False |
| ZBarSymbol = None |
|
|
| try: |
| import fitz |
| HAS_PYMUPDF = True |
| except Exception: |
| HAS_PYMUPDF = False |
|
|
| try: |
| from pylibdmtx.pylibdmtx import decode as dmtx_decode |
| HAS_DMTX = True |
| except Exception: |
| HAS_DMTX = False |
|
|
| |
|
|
def _binarize(pil_img: Image.Image) -> Image.Image:
    """Hard-threshold to black/white (cutoff 140) to sharpen barcode bars."""
    gray = ImageOps.autocontrast(ImageOps.grayscale(pil_img))
    return gray.point(lambda px: 255 if px > 140 else 0, mode='1').convert('L')
|
|
def _decode_pyzbar(img: Image.Image) -> list:
    """Decode barcodes with pyzbar, escalating through image variants.

    Ladder: original image -> grayscale and binarized variants -> each variant
    rotated 90/180/270 -> 2x bicubic upscale (only when the image is under
    1600px). Returns the first non-empty pyzbar result list, else [].
    """
    if not HAS_BARCODE:
        return []
    # Restrict to the symbologies this tool cares about when available.
    symbols = [ZBarSymbol.QRCODE, ZBarSymbol.EAN13, ZBarSymbol.EAN8, ZBarSymbol.UPCA, ZBarSymbol.CODE128] if ZBarSymbol else None
    res = zbar_decode(img, symbols=symbols) if symbols else zbar_decode(img)
    if res:
        return res

    variants = [ImageOps.grayscale(img), _binarize(img)]
    for v in variants:
        res = zbar_decode(v, symbols=symbols) if symbols else zbar_decode(v)
        if res: return res
        for angle in (90, 180, 270):
            r = v.rotate(angle, expand=True)
            res = zbar_decode(r, symbols=symbols) if symbols else zbar_decode(r)
            if res: return res
    w, h = img.size
    if max(w, h) < 1600:
        # Image.Resampling exists in Pillow >= 9.1; fall back to the legacy
        # BICUBIC constant on older versions.
        try:
            from PIL import Image as _PIL
            u = img.resize((w*2, h*2), resample=_PIL.Resampling.BICUBIC)
        except Exception:
            u = img.resize((w*2, h*2), resample=Image.BICUBIC)
        res = zbar_decode(u, symbols=symbols) if symbols else zbar_decode(u)
        if res: return res
    return []
|
|
def _decode_datamatrix(img: Image.Image) -> list:
    """Decode Data Matrix codes via pylibdmtx, normalized to pyzbar's shape.

    Results are wrapped in ad-hoc classes exposing .type, .data and a .rect
    with left/top/width/height so callers can treat them like pyzbar hits.
    Returns [] when pylibdmtx is unavailable or decoding fails.
    """
    if not HAS_DMTX:
        return []
    try:
        res = dmtx_decode(ImageOps.grayscale(img))

        outs = []
        for r in res:
            rect = r.rect
            # type(...) builds a throwaway class mimicking pyzbar's result API.
            outs.append(type("DM", (), {
                "type": "DATAMATRIX",
                "data": r.data,
                "rect": type("R", (), {"left": rect.left, "top": rect.top, "width": rect.width, "height": rect.height})
            }))
        return outs
    except Exception:
        return []
|
|
def _decode_all(img: Image.Image) -> list:
    """pyzbar first; fall back to Data Matrix decoding when it finds nothing."""
    hits = _decode_pyzbar(img)
    if hits:
        return hits
    return _decode_datamatrix(img) or hits
|
|
def _pix_to_pil(pix) -> Image.Image:
    """Convert a PyMuPDF Pixmap to a PIL image (alpha dropped, grayscaled).

    Grayscale helps zbar's bar detection; if this pixmap's colorspace cannot
    be converted the original is used as-is.
    """
    if pix.alpha:
        pix = fitz.Pixmap(pix, 0)  # strip the alpha channel

    try:
        pix = fitz.Pixmap(fitz.csGRAY, pix)
    except Exception:
        pass
    return Image.open(io.BytesIO(pix.tobytes("ppm")))
|
|
def find_barcode_boxes_and_info_from_pdf(pdf_path: str, *, max_pages: int = 5, dpi: int = 600) -> Tuple[List["Box"], List[dict]]:
    """Render each page at high DPI + scan embedded images. Return (boxes, infos).

    Each decoded symbol yields a Box and an info dict (type, payload, rect,
    1-based page number). Returns ([], []) without PyMuPDF or on any error.

    NOTE(review): boxes are in the coordinates of whatever image was decoded —
    the *dpi*-rendered page for page hits, or the embedded image's own pixels
    for XObject hits — with no per-page vertical offset. Confirm callers that
    draw these on a differently-scaled stacked image account for that.
    """
    if not HAS_PYMUPDF:
        return [], []
    boxes: List["Box"] = []
    infos: List[dict] = []
    try:
        doc = fitz.open(pdf_path)
        n_pages = min(len(doc), max_pages)
        scale = dpi / 72.0  # PDF points are 1/72 inch
        mat = fitz.Matrix(scale, scale)
        for page_idx in range(n_pages):
            page = doc[page_idx]

            # Pass 1: decode the full rendered page.
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = _pix_to_pil(pix)
            decs = _decode_all(img)

            # Pass 2: decode each embedded raster image at native resolution.
            for xref, *_rest in page.get_images(full=True):
                try:
                    ipix = fitz.Pixmap(doc, xref)
                    pil = _pix_to_pil(ipix)
                    decs += _decode_all(pil)
                except Exception:
                    pass

            for d in decs:
                rect = d.rect
                left, top, width, height = int(rect.left), int(rect.top), int(rect.width), int(rect.height)
                boxes.append(Box(top, left, top + height, left + width, width * height))
                # Payload may be bytes (pyzbar/pylibdmtx) or already a string.
                try:
                    payload = d.data.decode("utf-8", errors="ignore") if isinstance(d.data, (bytes, bytearray)) else str(d.data)
                except Exception:
                    payload = ""
                infos.append({
                    "type": getattr(d, "type", "UNKNOWN"),
                    "data": payload,
                    "left": left, "top": top, "width": width, "height": height,
                    "page": page_idx + 1,
                })
        doc.close()
    except Exception:
        return [], []
    return boxes, infos
|
|
|
|
|
|
| |
def rgb_to_cmyk_array(img: Image.Image) -> np.ndarray:
    """Image as a float32 (H, W, 4) CMYK array, channel values 0-255
    (PIL's built-in RGB->CMYK conversion)."""
    return np.asarray(img.convert('CMYK')).astype(np.float32)
|
|
def avg_cmyk_in_box(cmyk_arr: np.ndarray, box: Box) -> Tuple[float,float,float,float]:
    """Mean CMYK of *box* within *cmyk_arr*, as percentages rounded to 0.1.

    The box is clamped to the array bounds; a degenerate (empty) box yields
    all zeros.
    """
    top, bottom = max(0, box.y1), min(cmyk_arr.shape[0], box.y2)
    left, right = max(0, box.x1), min(cmyk_arr.shape[1], box.x2)
    if bottom <= top or right <= left:
        return (0.0, 0.0, 0.0, 0.0)
    means = cmyk_arr[top:bottom, left:right, :].reshape(-1, 4).mean(axis=0)
    # Channel values are 0-255; rescale to 0-100%.
    return tuple(float(round(v * 100.0 / 255.0, 1)) for v in means)
|
|
def compute_cmyk_diffs(a_img: Image.Image, b_img: Image.Image, red_boxes: List[Box]):
    """Per-box average CMYK for A and B plus the (B - A) delta, 1-indexed."""
    cmyk_a = rgb_to_cmyk_array(a_img)
    cmyk_b = rgb_to_cmyk_array(b_img)
    results = []
    for idx, box in enumerate(red_boxes, start=1):
        vals_a = avg_cmyk_in_box(cmyk_a, box)
        vals_b = avg_cmyk_in_box(cmyk_b, box)
        results.append({
            'idx': idx,
            'A': vals_a,
            'B': vals_b,
            'Delta': tuple(round(vb - va, 1) for va, vb in zip(vals_a, vals_b)),
        })
    return results
|
|
def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns', panel_width: int = 260) -> Image.Image:
    """Append a light-gray side panel listing per-box CMYK values.

    *entries* is compute_cmyk_diffs() output; each gets four text lines
    (#idx, A values, B values, Delta). Rendering stops once the panel runs
    out of vertical room.
    """
    w,h = base.size
    panel = Image.new('RGB', (panel_width, h), (245,245,245))
    out = Image.new('RGB', (w+panel_width, h), (255,255,255))
    out.paste(base, (0,0)); out.paste(panel, (w,0))
    d = ImageDraw.Draw(out)
    x0 = w + 8; y = 8  # text cursor inside the panel
    d.text((x0, y), title, fill=(0,0,0)); y += 18
    if not entries:
        d.text((x0, y), 'No differing regions', fill=(80,80,80))
        return out
    for e in entries:
        idx = e['idx']; aC,aM,aY,aK = e['A']; bC,bM,bY,bK = e['B']; dC,dM,dY,dK = e['Delta']
        d.text((x0, y), f"#{idx}", fill=(0,0,0)); y += 14
        d.text((x0, y), f"A: C {aC}% M {aM}% Y {aY}% K {aK}%", fill=(0,0,0)); y += 14
        d.text((x0, y), f"B: C {bC}% M {bM}% Y {bY}% K {bK}%", fill=(0,0,0)); y += 14
        d.text((x0, y), f"Delta: C {dC}% M {dM}% Y {dY}% K {dK}%", fill=(120,0,0)); y += 18
        if y > h - 40: break  # not enough room for another 4-line entry
    return out
|
|
| |
def compare_pdfs(file_a, file_b):
    """Gradio callback: run the full comparison of two uploaded PDFs.

    Returns (overlay_image, annotated_a, annotated_b, status_markdown,
    barcode_rows_a, barcode_rows_b). On failure the images are None and the
    status carries the error text.
    """
    try:
        if file_a is None or file_b is None:
            return None, None, None, "β Please upload both PDF files to compare", [], []

        # Rasterize up to 5 pages per document and stack them vertically.
        pages_a = load_pdf_pages(file_a.name, dpi=400, max_pages=5)
        pages_b = load_pdf_pages(file_b.name, dpi=400, max_pages=5)
        a = combine_pages_vertically(pages_a)
        b = combine_pages_vertically(pages_b)

        # Crop both to the shared size so pixel comparison is well-defined.
        a, b = match_sizes(a, b)

        # Visual differences (red boxes).
        diff = difference_map(a, b)
        red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)

        # Spell check straight from the PDF text layer (cyan boxes).
        image_size = (a.width, a.height)
        misspell_a = find_misspell_boxes_from_text(file_a.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []
        misspell_b = find_misspell_boxes_from_text(file_b.name, image_size=image_size) if HAS_SPELLCHECK and HAS_PYMUPDF else []
        print(f"Spell check results - A: {len(misspell_a)} boxes, B: {len(misspell_b)} boxes")

        # Barcode detection (green boxes).
        # BUGFIX: the previous code passed an unsupported image_size= kwarg
        # (TypeError) and, without PyMuPDF, called an undefined
        # find_barcode_boxes_and_info() (NameError). Both paths raised at
        # runtime; detection is now simply skipped without PyMuPDF.
        if HAS_BARCODE and HAS_PYMUPDF:
            bar_a, info_a = find_barcode_boxes_and_info_from_pdf(file_a.name)
            bar_b, info_b = find_barcode_boxes_and_info_from_pdf(file_b.name)
            print(f"Barcode detection results - A: {len(bar_a)} codes, B: {len(bar_b)} codes")
        else:
            bar_a, info_a = [], []
            bar_b, info_b = [], []

        # CMYK analysis of each differing region; idx values number the boxes.
        cmyk_entries = compute_cmyk_diffs(a, b, red_boxes)
        labels = [e['idx'] for e in cmyk_entries]

        # Annotate both images with all box categories.
        a_boxed_core = draw_boxes_multi(a, red_boxes, misspell_a, bar_a, width=3, red_labels=labels)
        b_boxed_core = draw_boxes_multi(b, red_boxes, misspell_b, bar_b, width=3, red_labels=labels)

        a_disp = draw_cmyk_panel(a_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
        b_disp = draw_cmyk_panel(b_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')

        overlay = make_red_overlay(a, b)

        status = f"""
π **Analysis Complete!**
- **Pages processed:** A: {len(pages_a)}, B: {len(pages_b)}
- **Difference regions found:** {len(red_boxes)}
- **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}
- **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}
- **Combined image dimensions:** {a.width} Γ {a.height} pixels

**Legend:**
- π΄ Red boxes: Visual differences
- π΅ Cyan boxes: Spelling errors
- π’ Green boxes: Barcodes/QR codes
"""

        # Flatten barcode dicts into Dataframe rows.
        codes_a = [[c.get('type',''), c.get('data',''), c.get('left',0), c.get('top',0),
                    c.get('width',0), c.get('height',0), c.get('valid', False)] for c in info_a]
        codes_b = [[c.get('type',''), c.get('data',''), c.get('left',0), c.get('top',0),
                    c.get('width',0), c.get('height',0), c.get('valid', False)] for c in info_b]

        return overlay, a_disp, b_disp, status, codes_a, codes_b

    except Exception as e:
        error_msg = f"β **Error:** {str(e)}"
        return None, None, None, error_msg, [], []
|
|
| |
def create_demo():
    """Build and return the Gradio Blocks UI (not launched here)."""
    with gr.Blocks(title="PDF Comparison Tool", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
# π Advanced PDF Comparison Tool

Upload two PDF files to get comprehensive analysis including:
- **Multi-page PDF support** (up to 5 pages per document)
- **Visual differences** with bounding boxes
- **OCR and spell checking**
- **Barcode/QR code detection**
- **CMYK color analysis**
""")

        # File inputs: A is the reference, B is compared against it.
        with gr.Row():
            with gr.Column():
                file_a = gr.File(label="π PDF A (Reference)", file_types=[".pdf"])
                file_b = gr.File(label="π PDF B (Comparison)", file_types=[".pdf"])

        compare_btn = gr.Button("π Compare PDF Files", variant="primary", size="lg")

        # Markdown summary filled in by compare_pdfs.
        status_md = gr.Markdown("")

        with gr.Row():
            overlay_img = gr.Image(label="π΄ Pixel Differences (Red = Different)", type="pil")

        with gr.Row():
            img_a = gr.Image(label="π File A with Analysis", type="pil")
            img_b = gr.Image(label="π File B with Analysis", type="pil")

        # One row per decoded barcode, matching compare_pdfs' codes_a/codes_b.
        gr.Markdown("### π Barcode Detection Results")
        with gr.Row():
            codes_a_df = gr.Dataframe(
                headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
                label="Barcodes in File A",
                interactive=False
            )
            codes_b_df = gr.Dataframe(
                headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
                label="Barcodes in File B",
                interactive=False
            )

        # Wire the button to the comparison pipeline.
        compare_btn.click(
            fn=compare_pdfs,
            inputs=[file_a, file_b],
            outputs=[overlay_img, img_a, img_b, status_md, codes_a_df, codes_b_df]
        )

        gr.Markdown("""
### π Instructions:
1. Upload two PDF files
2. Click "Compare PDF Files"
3. View results with comprehensive analysis

### π¨ Color Legend:
- **π΄ Red boxes:** Visual differences between files
- **π΅ Cyan boxes:** Potential spelling errors (OCR)
- **π’ Green boxes:** Detected barcodes/QR codes
- **π Side panel:** CMYK color analysis for print workflows
""")

    return demo
|
|
def _binarize(pil_img: Image.Image) -> Image.Image:
    """Create a binarized (black/white) version of the image for better barcode detection"""
    gray = ImageOps.autocontrast(ImageOps.grayscale(pil_img))
    return gray.point(lambda px: 255 if px > 140 else 0, mode='1').convert('L')
|
|
def _decode_once(img: Image.Image):
    """Single pyzbar pass restricted to the common barcode symbologies."""
    if not HAS_BARCODE:
        return []
    wanted = [ZBarSymbol.QRCODE, ZBarSymbol.EAN13, ZBarSymbol.EAN8,
              ZBarSymbol.UPCA, ZBarSymbol.CODE128]
    return zbar_decode(img, symbols=wanted)
|
|
def debug_scan_pdf(pdf_path: str, outdir: str = "barcode_debug", max_pages=2):
    """
    Debug function to scan PDF at multiple DPIs and variants to diagnose barcode detection issues.

    This function:
    - Renders pages at 600/900/1200 DPI
    - Tries grayscale, binarized, and rotated versions
    - Scans embedded images (XObjects)
    - Prints what it finds and writes debug PNGs
    - Helps identify if barcodes are too thin/low resolution

    Usage:
        debug_scan_pdf("your.pdf", outdir="barcode_debug", max_pages=2)
    """
    if not (HAS_BARCODE and HAS_PYMUPDF):
        print("ERROR: Missing dependencies (pyzbar or PyMuPDF)")
        return

    os.makedirs(outdir, exist_ok=True)
    doc = fitz.open(pdf_path)

    for dpi in (600, 900, 1200):
        scale = dpi / 72.0  # PDF points are 1/72 inch
        mat = fitz.Matrix(scale, scale)
        print(f"\n=== DPI {dpi} ===")

        for p in range(min(len(doc), max_pages)):
            page = doc[p]
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.open(io.BytesIO(pix.tobytes("ppm")))
            img.save(f"{outdir}/page{p+1}_{dpi}.png")

            # Decode the page render in three variants.
            variants = [
                ("orig", img),
                ("gray", ImageOps.grayscale(img)),
                ("bin", _binarize(img)),
            ]
            found = []

            for tag, v in variants:
                r = _decode_once(v)
                if r:
                    found.extend((tag, rr.type, rr.data) for rr in r)
                else:
                    # No hit upright: try the three other orientations.
                    for angle in (90, 180, 270):
                        rr = _decode_once(v.rotate(angle, expand=True))
                        if rr:
                            found.extend((f"{tag}_rot{angle}", rri.type, rri.data) for rri in rr)
                            break

            print(f"Page {p+1}: {len(found)} hits at DPI {dpi} -> {found}")

            # Also decode each embedded raster image at native resolution.
            imgs = page.get_images(full=True)
            for ix, (xref, *_) in enumerate(imgs):
                try:
                    ipix = fitz.Pixmap(doc, xref)
                    if ipix.alpha:
                        ipix = fitz.Pixmap(ipix, 0)
                    pil = Image.open(io.BytesIO(ipix.tobytes("ppm")))
                    pil.save(f"{outdir}/page{p+1}_embed{ix+1}.png")
                    rr = _decode_once(pil) or _decode_once(_binarize(pil))
                    if rr:
                        print(f"  Embedded image {ix+1}: {[(r.type, r.data) for r in rr]}")
                except Exception as e:
                    print("  Embedded image error:", e)

    doc.close()
    print(f"\nDebug images saved to: {outdir}/")
    print("Open the PNGs and zoom in to check bar width. If narrow bars are <2px at 600 DPI, you need 900-1200 DPI.")
|
|
# Launch the Gradio app when run as a script. server_name 0.0.0.0 listens on
# all interfaces; share=True additionally opens a public gradio.live tunnel.
if __name__ == "__main__":
    demo = create_demo()
    demo.launch(
        server_name="0.0.0.0",
        share=True,
        show_error=True
    )
|
|