Yaz Hobooti
Improve OCR performance: add image preprocessing, higher DPI, better Tesseract config
e88aad6
| #!/usr/bin/env python3 | |
| """ | |
| Gradio PDF Comparison Tool | |
| Upload two PDF files and get comprehensive analysis including differences, OCR, barcodes, and CMYK analysis. | |
| """ | |
| import os, sys, re, csv, json, io | |
| from dataclasses import dataclass | |
| from typing import List, Tuple, Optional, Iterable | |
| import tempfile | |
| import unicodedata | |
| import numpy as np | |
| from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError | |
| from pdf2image import convert_from_path | |
| from skimage.measure import label, regionprops | |
| from skimage.morphology import dilation, rectangle | |
| import gradio as gr | |
| # Alternative PDF processing | |
| try: | |
| import fitz # PyMuPDF | |
| HAS_PYMUPDF = True | |
| except Exception: | |
| fitz = None | |
| HAS_PYMUPDF = False | |
| # Optional features | |
| try: | |
| import pytesseract | |
| HAS_OCR = True | |
| except Exception: | |
| pytesseract = None | |
| HAS_OCR = False | |
| try: | |
| from spellchecker import SpellChecker | |
| HAS_SPELLCHECK = True | |
| except Exception: | |
| SpellChecker = None | |
| HAS_SPELLCHECK = False | |
| try: | |
| import regex as re | |
| HAS_REGEX = True | |
| except Exception: | |
| import re | |
| HAS_REGEX = False | |
| try: | |
| from pyzbar.pyzbar import decode as zbar_decode | |
| HAS_BARCODE = True | |
| except Exception: | |
| zbar_decode = None | |
| HAS_BARCODE = False | |
| # -------------------- Core Data -------------------- | |
@dataclass
class Box:
    """Axis-aligned pixel rectangle plus an area value.

    The ``@dataclass`` decorator is required: every call site in this file
    builds boxes positionally as ``Box(y1, x1, y2, x2, area)``, which a plain
    class with bare annotations would reject with ``TypeError``.
    """

    y1: int  # top edge (row index)
    x1: int  # left edge (column index)
    y2: int  # bottom edge (row index)
    x2: int  # right edge (column index)
    area: int  # area in pixels, as supplied by whoever builds the box
| # ---- spell/tokenization helpers & caches ---- | |
# Word tokenizer: runs of letters, optionally joined by apostrophes/hyphens.
if HAS_REGEX:
    _WORD_RE = re.compile(r"\p{Letter}+(?:['\-]\p{Letter}+)*", re.UNICODE)
else:
    # Stdlib ``re`` has no \p{...} classes. [^\W\d_] is its Unicode-aware
    # stand-in for "a letter", so accented words (the French dictionary is
    # supported below) are not cut off at the first non-ASCII character.
    _WORD_RE = re.compile(r"[^\W\d_]+(?:['\-][^\W\d_]+)*")

# Spell checkers are optional; both stay None when pyspellchecker is missing.
if HAS_SPELLCHECK:
    _SPELL_EN = SpellChecker(language="en")
    try:
        _SPELL_FR = SpellChecker(language="fr")
    except Exception:
        # French dictionary unavailable: fall back to English only.
        _SPELL_FR = None
else:
    _SPELL_EN = None
    _SPELL_FR = None

# Project/brand vocabulary that must never be flagged as a misspelling.
_DOMAIN_ALLOWLIST = {
    "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
    "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid"
}
_DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}

# Teach each available dictionary the allowlisted words.
if _SPELL_EN:
    _SPELL_EN.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
if _SPELL_FR:
    _SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
| def _normalize_text(s: str) -> str: | |
| s = unicodedata.normalize("NFC", s) | |
| return s.replace("'", "'").strip() | |
def _extract_tokens(raw: str):
    """Return the list of word tokens found in *raw*.

    A falsy *raw* (``None`` or ``""``) is treated as the empty string.
    """
    cleaned = _normalize_text(raw if raw else "")
    return _WORD_RE.findall(cleaned)
| def _looks_like_acronym(tok: str) -> bool: | |
| return tok.isupper() and 2 <= len(tok) <= 6 | |
| def _has_digits(tok: str) -> bool: | |
| return any(ch.isdigit() for ch in tok) | |
def _is_known_word(tok: str) -> bool:
    """Return True when *tok* should not be reported as a misspelling.

    A token is accepted when it is allowlisted, looks like an acronym,
    contains a digit, is a hyphenated compound whose parts are ALL accepted,
    or is known to the English or French dictionary (when loaded).
    """
    lowered = tok.lower()
    if lowered in _DOMAIN_ALLOWLIST_LOWER:
        return True
    if _looks_like_acronym(tok) or _has_digits(tok):
        return True
    # Hyphenated compound: every component must pass on its own.
    if "-" in tok and all(_is_known_word(piece) for piece in tok.split("-")):
        return True
    # English first, then French; a checker that is None is skipped.
    for checker in (_SPELL_EN, _SPELL_FR):
        if checker and not checker.unknown([lowered]):
            return True
    return False
| # (optional) keep a compatibility shim so any other code calling normalize_token() won't break | |
def normalize_token(token: str) -> str:
    """Compatibility shim: lowercase first word token of *token*, or ``""``
    when it contains none.
    """
    found = _extract_tokens(token)
    if not found:
        return ""
    return found[0].lower()
| # -------------------- Helpers ---------------------- | |
| def _is_pdf(path: str) -> bool: | |
| return os.path.splitext(path.lower())[1] == ".pdf" | |
def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
    """Rasterize the first *max_pages* pages of *path* as RGB images.

    PDFs are rendered with pdf2image, probing several poppler locations, and
    fall back to PyMuPDF when that fails. A non-PDF path is opened as a
    single image.

    Args:
        path: File to load.
        dpi: Render resolution for PDF pages.
        max_pages: Upper bound on the number of pages rendered.

    Returns:
        One RGB image per rendered page (always at least one).

    Raises:
        ValueError: If no backend could render the PDF, or it has no pages.
    """
    if not _is_pdf(path):
        return [Image.open(path).convert("RGB")]

    # Remember the most recent pdf2image failure so the final error message
    # reports what actually went wrong instead of a canned guess.
    pdf2image_error = None
    # None means "let pdf2image locate poppler on PATH".
    for poppler_path in ("/usr/bin", "/usr/local/bin", "/bin", None):
        extra = {"poppler_path": poppler_path} if poppler_path else {}
        try:
            imgs = convert_from_path(
                path, dpi=dpi, first_page=1, last_page=max_pages, **extra
            )
            if imgs:
                return [img.convert("RGB") for img in imgs]
        except Exception as exc:
            pdf2image_error = exc
    pdf2image_reason = pdf2image_error or "no pages returned"

    if not HAS_PYMUPDF:
        raise ValueError(
            "Failed to convert PDF to image with all poppler paths. "
            f"Last error: {pdf2image_reason}. PyMuPDF not available as fallback."
        )

    try:
        doc = fitz.open(path)
        try:
            zoom = dpi / 72  # PDF user space is 72 units per inch
            mat = fitz.Matrix(zoom, zoom)
            pages = []
            for page_num in range(min(len(doc), max_pages)):
                pix = doc[page_num].get_pixmap(matrix=mat)
                img = Image.open(io.BytesIO(pix.tobytes("ppm")))
                pages.append(img.convert("RGB"))
        finally:
            # Release the document even when rendering a page fails.
            doc.close()
    except Exception as exc:
        raise ValueError(
            "Failed to convert PDF with both pdf2image and PyMuPDF. "
            f"pdf2image error: {pdf2image_reason}. PyMuPDF error: {exc}"
        ) from exc

    if not pages:
        raise ValueError(f"No pages in PDF: {path}")
    return pages
def combine_pages_vertically(pages: List[Image.Image], spacing: int = 20) -> Image.Image:
    """Stack *pages* top-to-bottom on a white canvas.

    A single page is returned as-is. Narrower pages are centered
    horizontally, and *spacing* blank pixels separate consecutive pages.

    Raises:
        ValueError: If *pages* is empty.
    """
    if not pages:
        raise ValueError("No pages to combine")
    if len(pages) == 1:
        return pages[0]

    canvas_w = max(p.width for p in pages)
    canvas_h = sum(p.height for p in pages) + spacing * (len(pages) - 1)
    canvas = Image.new('RGB', (canvas_w, canvas_h), (255, 255, 255))

    top = 0
    for p in pages:
        left = (canvas_w - p.width) // 2
        canvas.paste(p, (left, top))
        top += p.height + spacing
    return canvas
def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
    """Return *a* and *b* cropped from the top-left corner to their common
    width and height; equal-sized inputs are returned untouched.
    """
    if a.size != b.size:
        common = (0, 0, min(a.width, b.width), min(a.height, b.height))
        a, b = a.crop(common), b.crop(common)
    return a, b
def difference_map(a: Image.Image, b: Image.Image) -> Image.Image:
    """Return the ``ImageChops.difference`` image of *a* and *b*."""
    delta = ImageChops.difference(a, b)
    return delta
def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int = 25) -> List[Box]:
    """Return bounding boxes of connected regions of *diff_img* that differ.

    A pixel counts as changed when its largest channel value is at least
    *threshold*. The mask is dilated with a 3x3 footprint before labelling,
    and regions smaller than *min_area* pixels are dropped.
    """
    channels = np.asarray(diff_img).astype(np.uint16)
    strongest = channels.max(axis=2).astype(np.uint8)
    changed = (strongest >= threshold).astype(np.uint8)
    changed = dilation(changed, rectangle(3, 3))
    regions = regionprops(label(changed, connectivity=2))
    # bbox is (min_row, min_col, max_row, max_col), matching Box's field order.
    return [Box(*r.bbox, int(r.area)) for r in regions if not r.area < min_area]
def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box], green_boxes: List[Box] = None,
                     width: int = 3, red_labels: List[int] = None) -> Image.Image:
    """Return a copy of *img* with outlined boxes drawn on it.

    Red boxes mark differences (optionally numbered from *red_labels*), cyan
    boxes mark misspellings, and green boxes mark barcodes. *width* is the
    outline thickness in pixels.
    """
    out = img.copy()
    d = ImageDraw.Draw(out)

    def outline(boxes, color):
        # A thick outline is drawn as `width` nested 1-px rectangles.
        for bx in boxes:
            for grow in range(width):
                d.rectangle(
                    [bx.x1 - grow, bx.y1 - grow, bx.x2 + grow, bx.y2 + grow],
                    outline=color,
                )

    outline(red_boxes, (255, 0, 0))

    if red_labels:
        for pos, bx in enumerate(red_boxes):
            # Fall back to the 1-based position when labels run out.
            caption = str(red_labels[pos]) if pos < len(red_labels) else str(pos + 1)
            tx = max(0, bx.x1 + 3)
            ty = max(0, bx.y1 + 3)
            d.rectangle([tx - 2, ty - 2, tx + 14, ty + 14], fill=(255, 255, 255))
            d.text((tx, ty), caption, fill=(0, 0, 0))

    outline(cyan_boxes, (0, 255, 255))
    if green_boxes:
        outline(green_boxes, (0, 255, 0))
    return out
def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
    """Return a copy of *a* in which every pixel that differs from *b* in any
    channel is painted pure red.
    """
    base = np.array(a)  # writable copy of a's pixels
    other = np.asarray(b)
    differs = (base != other).any(axis=2)
    base[differs] = [255, 0, 0]
    return Image.fromarray(base)
| # -------------------- OCR + Spellcheck ------------- | |
from typing import List, Iterable, Optional
import unicodedata

from PIL import Image

# Optional dependencies. Each import is guarded so that a missing package
# switches the matching feature off instead of aborting the module import.
try:
    import regex as re
    HAS_REGEX = True
except Exception:
    import re
    HAS_REGEX = False
try:
    import pytesseract
    HAS_OCR = True
except Exception:
    pytesseract = None
    HAS_OCR = False
try:
    from spellchecker import SpellChecker
    HAS_SPELLCHECK = True
except Exception:
    SpellChecker = None
    HAS_SPELLCHECK = False

# ---- spell/tokenization helpers & caches ----
# Word tokenizer: letter runs optionally joined by an apostrophe (straight
# or typographic U+2019) or a hyphen.
if HAS_REGEX:
    _WORD_RE = re.compile(r"\p{Letter}+(?:[\u2019'\-]\p{Letter}+)*", re.UNICODE)
else:
    # Stdlib ``re`` lacks \p{...}; [^\W\d_] is its Unicode-aware letter class.
    _WORD_RE = re.compile(r"[^\W\d_]+(?:[\u2019'\-][^\W\d_]+)*")

# Project/brand vocabulary that must never be flagged as a misspelling.
_DOMAIN_ALLOWLIST = {
    "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
    "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid"
}
_DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}

# Dictionaries stay None when pyspellchecker (or the French word list) is
# unavailable; callers already test them for truthiness before use.
_SPELL_EN = None
_SPELL_FR = None
if HAS_SPELLCHECK:
    _SPELL_EN = SpellChecker(language="en")
    try:
        _SPELL_FR = SpellChecker(language="fr")
    except Exception:
        _SPELL_FR = None
if _SPELL_EN:
    _SPELL_EN.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
if _SPELL_FR:
    _SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
| def _normalize_text(s: str) -> str: | |
| s = unicodedata.normalize("NFC", s) | |
| return s.replace("β", "'").strip() | |
def _extract_tokens(raw: str):
    """Tokenize *raw* into words; ``None`` or ``""`` is handled as empty text."""
    text = raw or ""
    normalized = _normalize_text(text)
    return _WORD_RE.findall(normalized)
| def _looks_like_acronym(tok: str) -> bool: | |
| return tok.isupper() and 2 <= len(tok) <= 6 | |
| def _has_digits(tok: str) -> bool: | |
| return any(ch.isdigit() for ch in tok) | |
| # (optional) keep a compatibility shim so any other code calling normalize_token() won't break | |
def normalize_token(token: str) -> str:
    """Compatibility shim for older callers.

    Returns the first word token of *token* in lowercase, or an empty string
    when no word token is present.
    """
    for first in _extract_tokens(token):
        return first.lower()
    return ""
| def _get_available_tesseract_langs(): | |
| """Get available Tesseract languages""" | |
| try: | |
| langs = pytesseract.get_languages() | |
| if 'eng' in langs and 'fra' in langs: | |
| return "eng+fra" | |
| elif 'eng' in langs: | |
| return "eng" | |
| elif langs: | |
| return langs[0] | |
| else: | |
| return "eng" | |
| except Exception: | |
| return "eng" | |
def prepare_for_ocr(img: Image.Image) -> Image.Image:
    """Return a grayscale, auto-contrasted, unsharp-masked copy of *img* for OCR."""
    from PIL import ImageFilter, ImageOps

    sharpen = ImageFilter.UnsharpMask(radius=1.0, percent=150, threshold=2)
    stretched = ImageOps.autocontrast(img.convert("L"))
    return stretched.filter(sharpen)
def find_misspell_boxes(
    img: Image.Image,
    *,
    min_conf: int = 60,
    lang: Optional[str] = None,
    extra_allow: Optional[Iterable[str]] = None,
    dpi: int = 300,
    psm: int = 6,
    oem: int = 3
) -> List[Box]:
    """OCR *img* and return boxes around words that look misspelled.

    Args:
        img: Page image to analyse.
        min_conf: Minimum Tesseract confidence for a word to be considered.
        lang: Tesseract language string; when None, "eng+fra" is used if
            both are installed, otherwise "eng".
        extra_allow: Additional words to accept. They are loaded into the
            module-level dictionaries, so they persist across calls.
        dpi: Value passed to Tesseract as ``user_defined_dpi``.
        psm: Tesseract page segmentation mode.
        oem: Tesseract OCR engine mode.

    Returns:
        Boxes in the coordinate space of the image passed in. Empty when
        OCR or spell checking is unavailable, or when OCR fails.
    """
    if not (HAS_OCR and HAS_SPELLCHECK):
        return []

    # Auto-detect language if not provided
    if lang is None:
        try:
            avail = set(pytesseract.get_languages(config="") or [])
        except Exception:
            avail = {"eng"}
        lang = "eng+fra" if {"eng", "fra"}.issubset(avail) else "eng"

    # Light upscale for small images (heuristic). The factor is remembered
    # so OCR coordinates can be mapped back onto the caller's image.
    scale = 1
    if img.width < 1600:
        scale = 2
        img = img.resize((img.width * scale, img.height * scale), Image.LANCZOS)

    # Prepare image for better OCR
    img = prepare_for_ocr(img)

    try:
        if extra_allow:
            # Materialize once: a one-shot iterable would otherwise be
            # exhausted by the first dictionary and never reach the second.
            extra_words = [w.lower() for w in extra_allow]
            for checker in (_SPELL_EN, _SPELL_FR):
                if checker:
                    checker.word_frequency.load_words(extra_words)

        # Build a config that sets an explicit DPI and keeps spaces
        config = f"--psm {psm} --oem {oem} -c preserve_interword_spaces=1 -c user_defined_dpi={dpi}"
        data = pytesseract.image_to_data(
            img,
            lang=lang,
            config=config,
            output_type=pytesseract.Output.DICT,
        )
    except Exception:
        return []

    boxes: List[Box] = []
    for i, raw in enumerate(data.get("text", [])):
        if not raw:
            continue

        # Confidence filter; an unparsable or missing value counts as -1.
        try:
            conf = int(float(data.get("conf", ["-1"])[i]))
        except Exception:
            conf = -1
        if conf < min_conf:
            continue

        tokens = _extract_tokens(raw)
        if not tokens:
            continue

        # Flag the box if ANY token of 2+ characters looks misspelled.
        if all(_is_known_word(tok) or len(tok) < 2 for tok in tokens):
            continue

        left = data.get("left", [0])[i]
        top = data.get("top", [0])[i]
        width = data.get("width", [0])[i]
        height = data.get("height", [0])[i]
        if width <= 0 or height <= 0:
            continue

        # Undo the upscale: floor the near edges and ceil the far edges so
        # the box still fully covers the word. With scale == 1 this is the
        # rectangle Tesseract reported.
        x1 = left // scale
        y1 = top // scale
        x2 = -((-(left + width)) // scale)
        y2 = -((-(top + height)) // scale)
        boxes.append(Box(y1, x1, y2, x2, (x2 - x1) * (y2 - y1)))
    return boxes
| # -------------------- Barcode / QR ----------------- | |
def ean_like_checksum_ok(digits: str) -> bool:
    """Check the trailing check digit of an 8-, 12- or 13-digit code.

    Non-numeric input is rejected. Numeric strings of any other length are
    accepted without a check.
    """
    if not digits.isdigit():
        return False
    if len(digits) not in (8, 12, 13):
        return True

    *body, check = (int(ch) for ch in digits)
    # For all three lengths the weights alternate 3, 1, 3, ... when counted
    # from the digit just left of the check digit.
    weighted = sum(
        value * (3 if pos % 2 == 0 else 1)
        for pos, value in enumerate(reversed(body))
    )
    return (10 - (weighted % 10)) % 10 == check
def validate_symbology(symbology: str, data: bytes) -> bool:
    """Return True when the decoded payload is plausible for *symbology*.

    EAN/UPC payloads must pass the check-digit test on their digits. Every
    other symbology, QR included, only needs a non-empty payload. Data that
    cannot be decoded yields False.
    """
    try:
        text = data.decode('utf-8', errors='ignore')
    except Exception:
        return False

    kind = (symbology or '').upper()
    if kind in {"EAN13", "EAN-13", "EAN8", "EAN-8", "UPCA", "UPC-A"}:
        digits_only = re.sub(r"\D", "", text)
        return ean_like_checksum_ok(digits_only)
    return bool(text)
def boxes_from_rect(x: int, y: int, w: int, h: int) -> Box:
    """Convert an (x, y, width, height) rectangle into a Box."""
    bottom = y + h
    right = x + w
    return Box(y, x, bottom, right, w * h)
def decode_with_variants(img: Image.Image):
    """Decode barcodes in *img*, retrying on altered copies until one hits.

    The image is tried as given, then as grayscale, then upscaled 2x, and
    finally converted to RGB if it is not already. Decoder errors are
    ignored. Returns ``[]`` when barcode support is unavailable.
    """
    if not HAS_BARCODE:
        return []

    found = []

    def attempt(candidate):
        try:
            hits = zbar_decode(candidate)
        except Exception:
            return
        if hits:
            found.extend(hits)

    # Built lazily so later variants are only created when actually needed.
    variants = (
        lambda: img,
        lambda: img.convert('L'),
        lambda: img.resize((img.width * 2, img.height * 2), Image.BICUBIC),
    )
    for make_variant in variants:
        if found:
            break
        attempt(make_variant())

    if not found and img.mode != 'RGB':
        attempt(img.convert('RGB'))
    return found
def find_barcode_boxes_and_info(img: Image.Image):
    """Return ``(boxes, infos)`` for every barcode decoded in *img*.

    The two lists are parallel: each info dict holds the symbology type,
    the decoded text, the rectangle, and a validity flag for the box at
    the same index.
    """
    boxes: List[Box] = []
    infos = []
    for hit in decode_with_variants(img):
        r = hit.rect
        payload = hit.data
        boxes.append(boxes_from_rect(r.left, r.top, r.width, r.height))
        is_valid = validate_symbology(hit.type, payload)
        if isinstance(payload, (bytes, bytearray)):
            shown = payload.decode('utf-8', errors='ignore')
        else:
            shown = str(payload)
        infos.append({
            'type': hit.type,
            'data': shown,
            'left': r.left,
            'top': r.top,
            'width': r.width,
            'height': r.height,
            'valid': bool(is_valid),
        })
    return boxes, infos
| # -------------------- CMYK Panel ------------------- | |
def rgb_to_cmyk_array(img: Image.Image) -> np.ndarray:
    """Return *img* converted to CMYK as a float32 array (values 0..255)."""
    cmyk_img = img.convert('CMYK')
    return np.asarray(cmyk_img).astype(np.float32)
def avg_cmyk_in_box(cmyk_arr: np.ndarray, box: Box) -> Tuple[float,float,float,float]:
    """Return the mean C, M, Y, K of *cmyk_arr* inside *box*, as percentages
    rounded to one decimal.

    The box is clipped to the array bounds; an empty intersection yields
    all zeros.
    """
    n_rows, n_cols = cmyk_arr.shape[0], cmyk_arr.shape[1]
    top, bottom = max(0, box.y1), min(n_rows, box.y2)
    left, right = max(0, box.x1), min(n_cols, box.x2)
    if bottom <= top or right <= left:
        return (0.0, 0.0, 0.0, 0.0)

    channel_means = cmyk_arr[top:bottom, left:right, :].reshape(-1, 4).mean(axis=0)
    # Scale the 0..255 channel means to 0..100 percent.
    return tuple(float(round(v * 100.0 / 255.0, 1)) for v in channel_means)
def compute_cmyk_diffs(a_img: Image.Image, b_img: Image.Image, red_boxes: List[Box]):
    """Return one dict per box with the mean CMYK of each image and B - A.

    Each entry has a 1-based ``'idx'``, the ``'A'`` and ``'B'`` percentage
    tuples, and ``'Delta'`` rounded to one decimal.
    """
    cmyk_a = rgb_to_cmyk_array(a_img)
    cmyk_b = rgb_to_cmyk_array(b_img)

    entries = []
    for number, region in enumerate(red_boxes, start=1):
        mean_a = avg_cmyk_in_box(cmyk_a, region)
        mean_b = avg_cmyk_in_box(cmyk_b, region)
        change = tuple(round(mean_b[ch] - mean_a[ch], 1) for ch in (0, 1, 2, 3))
        entries.append({'idx': number, 'A': mean_a, 'B': mean_b, 'Delta': change})
    return entries
def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns', panel_width: int = 260) -> Image.Image:
    """Return *base* with a text panel of CMYK values attached on the right.

    Each entry is printed as four lines (index, A, B, Delta). Printing stops
    once the text cursor gets within 40 px of the bottom edge.
    """
    w, h = base.size
    out = Image.new('RGB', (w + panel_width, h), (255, 255, 255))
    out.paste(base, (0, 0))
    out.paste(Image.new('RGB', (panel_width, h), (245, 245, 245)), (w, 0))

    d = ImageDraw.Draw(out)
    text_x = w + 8
    cursor_y = 8
    d.text((text_x, cursor_y), title, fill=(0, 0, 0))
    cursor_y += 18

    if not entries:
        d.text((text_x, cursor_y), 'No differing regions', fill=(80, 80, 80))
        return out

    black = (0, 0, 0)
    for e in entries:
        aC, aM, aY, aK = e['A']
        bC, bM, bY, bK = e['B']
        dC, dM, dY, dK = e['Delta']
        # (text, colour, vertical advance after the line)
        rows = (
            (f"#{e['idx']}", black, 14),
            (f"A: C {aC}% M {aM}% Y {aY}% K {aK}%", black, 14),
            (f"B: C {bC}% M {bM}% Y {bY}% K {bK}%", black, 14),
            (f"Delta: C {dC}% M {dM}% Y {dY}% K {dK}%", (120, 0, 0), 18),
        )
        for text, colour, advance in rows:
            d.text((text_x, cursor_y), text, fill=colour)
            cursor_y += advance
        if cursor_y > h - 40:
            break
    return out
| # -------------------- Gradio Interface ----------------- | |
def compare_pdfs(file_a, file_b):
    """Main comparison function for the Gradio interface.

    Args:
        file_a: Upload for document A, either an object exposing ``.name``
            (a file path) or a plain path string.
        file_b: Upload for document B, same forms as *file_a*.

    Returns:
        A 6-tuple ``(overlay, a_disp, b_disp, status, codes_a, codes_b)``:
        the pixel-difference overlay, both annotated images, a Markdown
        status message, and the barcode table rows for each file. On a
        missing upload or any error the three images are ``None``, the
        status carries the message, and both tables are empty.
    """
    try:
        if file_a is None or file_b is None:
            return None, None, None, "❌ Please upload both PDF files to compare", [], []

        # NOTE(review): depending on version/configuration, Gradio may pass
        # a plain path string instead of a temp-file wrapper; accept both.
        path_a = file_a if isinstance(file_a, str) else file_a.name
        path_b = file_b if isinstance(file_b, str) else file_b.name

        # Load images with multiple pages support
        pages_a = load_pdf_pages(path_a, dpi=400, max_pages=5)
        pages_b = load_pdf_pages(path_b, dpi=400, max_pages=5)

        # Combine pages into single images and bring them to a common size
        a = combine_pages_vertically(pages_a)
        b = combine_pages_vertically(pages_b)
        a, b = match_sizes(a, b)

        # Find differences with default settings
        diff = difference_map(a, b)
        red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)

        # Run all analysis features with defaults
        misspell_a = find_misspell_boxes(a) if HAS_OCR and HAS_SPELLCHECK else []
        misspell_b = find_misspell_boxes(b) if HAS_OCR and HAS_SPELLCHECK else []

        if HAS_BARCODE:
            bar_a, info_a = find_barcode_boxes_and_info(a)
            bar_b, info_b = find_barcode_boxes_and_info(b)
        else:
            bar_a, info_a = [], []
            bar_b, info_b = [], []

        # Always enable CMYK analysis
        cmyk_entries = compute_cmyk_diffs(a, b, red_boxes)
        labels = [e['idx'] for e in cmyk_entries]

        # Create visualizations with default box width
        a_boxed_core = draw_boxes_multi(a, red_boxes, misspell_a, bar_a, width=3, red_labels=labels)
        b_boxed_core = draw_boxes_multi(b, red_boxes, misspell_b, bar_b, width=3, red_labels=labels)

        # Always show CMYK panel
        a_disp = draw_cmyk_panel(a_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
        b_disp = draw_cmyk_panel(b_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')

        # Create pixel difference overlay
        overlay = make_red_overlay(a, b)

        # Status message (Markdown). Joined from lines so the source
        # indentation never leaks into the rendered text.
        status = "\n".join([
            "**Analysis Complete!**",
            "",
            f"- **Pages processed:** A: {len(pages_a)}, B: {len(pages_b)}",
            f"- **Difference regions found:** {len(red_boxes)}",
            f"- **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}",
            f"- **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}",
            f"- **Combined image dimensions:** {a.width} × {a.height} pixels",
            "",
            "**Legend:**",
            "",
            "- 🔴 Red boxes: Visual differences",
            "- 🔵 Cyan boxes: Spelling errors",
            "- 🟢 Green boxes: Barcodes/QR codes",
        ])

        def table_rows(infos):
            # One row per barcode, in the column order of the result tables.
            return [
                [c.get('type', ''), c.get('data', ''), c.get('left', 0), c.get('top', 0),
                 c.get('width', 0), c.get('height', 0), c.get('valid', False)]
                for c in infos
            ]

        return overlay, a_disp, b_disp, status, table_rows(info_a), table_rows(info_b)
    except Exception as e:
        # UI boundary: report the failure in the status area instead of
        # letting the exception escape into Gradio.
        error_msg = f"❌ **Error:** {str(e)}"
        return None, None, None, error_msg, [], []
| # -------------------- Gradio App ------------------- | |
def create_demo():
    """Build and return the Gradio Blocks app for the comparison tool.

    The layout has two PDF upload fields and a compare button, a status
    area, the pixel-difference overlay, both annotated documents, and one
    barcode table per file. The button is wired to ``compare_pdfs``.
    """
    barcode_headers = ["Type", "Data", "Left", "Top", "Width", "Height", "Valid"]

    with gr.Blocks(title="PDF Comparison Tool", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # Advanced PDF Comparison Tool
        Upload two PDF files to get comprehensive analysis including:
        - **Multi-page PDF support** (up to 5 pages per document)
        - **Visual differences** with bounding boxes
        - **OCR and spell checking**
        - **Barcode/QR code detection**
        - **CMYK color analysis**
        """)

        with gr.Row():
            with gr.Column():
                file_a = gr.File(label="PDF A (Reference)", file_types=[".pdf"])
                file_b = gr.File(label="PDF B (Comparison)", file_types=[".pdf"])
                compare_btn = gr.Button("Compare PDF Files", variant="primary", size="lg")

        status_md = gr.Markdown("")

        with gr.Row():
            overlay_img = gr.Image(label="🔴 Pixel Differences (Red = Different)", type="pil")

        with gr.Row():
            img_a = gr.Image(label="File A with Analysis", type="pil")
            img_b = gr.Image(label="File B with Analysis", type="pil")

        gr.Markdown("### Barcode Detection Results")
        with gr.Row():
            codes_a_df = gr.Dataframe(
                headers=list(barcode_headers),
                label="Barcodes in File A",
                interactive=False
            )
            codes_b_df = gr.Dataframe(
                headers=list(barcode_headers),
                label="Barcodes in File B",
                interactive=False
            )

        # Event handlers: outputs are listed in the order compare_pdfs
        # returns them.
        compare_btn.click(
            fn=compare_pdfs,
            inputs=[file_a, file_b],
            outputs=[overlay_img, img_a, img_b, status_md, codes_a_df, codes_b_df]
        )

        gr.Markdown("""
        ### Instructions:
        1. Upload two PDF files
        2. Click "Compare PDF Files"
        3. View results with comprehensive analysis

        ### 🎨 Color Legend:
        - **🔴 Red boxes:** Visual differences between files
        - **🔵 Cyan boxes:** Potential spelling errors (OCR)
        - **🟢 Green boxes:** Detected barcodes/QR codes
        - **Side panel:** CMYK color analysis for print workflows
        """)
    return demo
if __name__ == "__main__":
    # Script entry point: build the UI and start the server.
    launch_options = {
        "server_name": "0.0.0.0",  # Allow external access
        "share": True,  # True requests a public link
        "show_error": True,
    }
    demo = create_demo()
    demo.launch(**launch_options)