|
|
|
|
|
""" |
|
|
Gradio PDF Comparison Tool |
|
|
Upload two PDF files and get comprehensive analysis including differences, OCR, barcodes, and CMYK analysis. |
|
|
""" |
|
|
|
|
|
import os, sys, re, csv, json, io |
|
|
from dataclasses import dataclass |
|
|
from typing import List, Tuple, Optional |
|
|
import tempfile |
|
|
|
|
|
import numpy as np |
|
|
from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError |
|
|
from pdf2image import convert_from_path |
|
|
from skimage.measure import label, regionprops |
|
|
from skimage.morphology import dilation, rectangle |
|
|
import gradio as gr |
|
|
|
|
|
|
|
|
try: |
|
|
import pytesseract |
|
|
HAS_OCR = True |
|
|
except Exception: |
|
|
pytesseract = None |
|
|
HAS_OCR = False |
|
|
|
|
|
try: |
|
|
from spellchecker import SpellChecker |
|
|
HAS_SPELLCHECK = True |
|
|
except Exception: |
|
|
SpellChecker = None |
|
|
HAS_SPELLCHECK = False |
|
|
|
|
|
try: |
|
|
from pyzbar.pyzbar import decode as zbar_decode |
|
|
HAS_BARCODE = True |
|
|
except Exception: |
|
|
zbar_decode = None |
|
|
HAS_BARCODE = False |
|
|
|
|
|
|
|
|
@dataclass
class Box:
    """Axis-aligned pixel rectangle together with the size of the region it bounds.

    Positional order is (y1, x1, y2, x2, area), i.e. row coordinates come
    before column coordinates.
    """

    # Top edge (row coordinate).
    y1: int
    # Left edge (column coordinate).
    x1: int
    # Bottom edge (row coordinate).
    y2: int
    # Right edge (column coordinate).
    x2: int
    # Region size in pixels as supplied by whoever builds the box.
    area: int
|
|
|
|
|
|
|
|
def _is_pdf(path: str) -> bool: |
|
|
return os.path.splitext(path.lower())[1] == ".pdf" |
|
|
|
|
|
def load_first_page(path: str, dpi: int = 300) -> Image.Image:
    """Load the first page of a PDF, or a plain image file, as an RGB image.

    Args:
        path: File to load. Files with a ``.pdf`` extension are rasterised
            with pdf2image; anything else is opened directly with PIL.
        dpi: Rasterisation resolution used for PDFs.

    Returns:
        The first page (or the image itself) converted to mode ``"RGB"``.

    Raises:
        ValueError: If both PDF conversion attempts fail. The message carries
            the text of both underlying errors.
    """
    if not _is_pdf(path):
        # Close the underlying file once the pixel data has been copied by
        # convert(); Image.open() alone keeps the handle open.
        with Image.open(path) as im:
            return im.convert("RGB")

    # First try an explicit poppler location, then fall back to whatever
    # pdf2image finds on its own.
    attempts = ({"poppler_path": "/usr/bin"}, {})
    errors = []
    for extra_kwargs in attempts:
        try:
            imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=1, **extra_kwargs)
            if not imgs:
                raise ValueError(f"No pages in PDF: {path}")
            return imgs[0].convert("RGB")
        except Exception as exc:
            # Remember the failure and move on to the next attempt.
            errors.append(exc)

    raise ValueError(
        f"Failed to convert PDF to image. Error 1: {errors[0]}. Error 2: {errors[1]}. "
        "Make sure poppler-utils is installed."
    ) from errors[-1]
|
|
|
|
|
def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
    """Crop both images to their common top-left area so their sizes agree.

    Images that already have the same size are returned untouched.
    """
    if a.size != b.size:
        common = (0, 0, min(a.width, b.width), min(a.height, b.height))
        a, b = a.crop(common), b.crop(common)
    return a, b
|
|
|
|
|
def difference_map(a: Image.Image, b: Image.Image) -> Image.Image:
    """Return the ``ImageChops.difference`` image of ``a`` and ``b``."""
    delta = ImageChops.difference(a, b)
    return delta
|
|
|
|
|
def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int = 25) -> List[Box]:
    """Locate connected regions of significant difference in a difference image.

    Args:
        diff_img: Difference image, e.g. the output of ``difference_map``.
            Multi-band and single-band images are both accepted.
        threshold: A pixel counts as different when its strongest channel
            value is at least this large.
        min_area: Regions with fewer pixels than this (measured after the
            3x3 dilation) are discarded.

    Returns:
        One ``Box`` per surviving region, holding its bounding box and its
        pixel count.
    """
    arr = np.asarray(diff_img)
    # Collapse colour channels to the strongest per-pixel difference; a
    # single-band image already is that map.
    strongest = arr.max(axis=2) if arr.ndim == 3 else arr
    mask = (strongest >= threshold).astype(np.uint8)
    # A 3x3 block of ones is the same footprint skimage's deprecated
    # ``rectangle(3, 3)`` helper returns; it merges nearby pixels into
    # one region.
    mask = dilation(mask, np.ones((3, 3), dtype=np.uint8))
    # connectivity=2 also joins diagonal neighbours.
    labeled = label(mask, connectivity=2)
    return [
        Box(*region.bbox, int(region.area))
        for region in regionprops(labeled)
        if region.area >= min_area
    ]
|
|
|
|
|
def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box],
                     green_boxes: Optional[List[Box]] = None,
                     width: int = 3, red_labels: Optional[List[int]] = None) -> Image.Image:
    """Return a copy of ``img`` with red, cyan and green box outlines drawn on it.

    Args:
        img: Image to annotate; it is not modified.
        red_boxes: Boxes outlined in red (and numbered when ``red_labels``
            is given).
        cyan_boxes: Boxes outlined in cyan.
        green_boxes: Optional boxes outlined in green.
        width: Outline thickness in pixels; the outline grows outwards from
            the box.
        red_labels: Optional labels for ``red_boxes``. A box without a
            matching entry is labelled with its 1-based position.

    Returns:
        The annotated copy.
    """
    out = img.copy()
    d = ImageDraw.Draw(out)

    def outline_all(boxes, color):
        """Draw a ``width``-pixel outline around every box in ``boxes``."""
        for b in boxes or ():
            for grow in range(width):
                d.rectangle([b.x1 - grow, b.y1 - grow, b.x2 + grow, b.y2 + grow], outline=color)

    outline_all(red_boxes, (255, 0, 0))

    if red_labels:
        for idx, b in enumerate(red_boxes):
            text = str(red_labels[idx]) if idx < len(red_labels) else str(idx + 1)
            tx = max(0, b.x1 + 3)
            ty = max(0, b.y1 + 3)
            # Size the white backing to the text so multi-digit labels stay
            # readable; never narrower than the historical 14 px.
            text_w = int(d.textlength(text)) + 1
            d.rectangle([tx - 2, ty - 2, tx + max(14, text_w + 2), ty + 14], fill=(255, 255, 255))
            d.text((tx, ty), text, fill=(0, 0, 0))

    outline_all(cyan_boxes, (0, 255, 255))
    outline_all(green_boxes, (0, 255, 0))
    return out
|
|
|
|
|
def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
    """Return a copy of ``a`` in which every pixel that differs from ``b`` is pure red."""
    canvas = np.asarray(a).copy()
    other = np.asarray(b)
    # A pixel is "changed" when at least one of its channels differs.
    changed = (canvas != other).any(axis=2)
    canvas[changed] = [255, 0, 0]
    return Image.fromarray(canvas)
|
|
|
|
|
|
|
|
def normalize_token(token: str) -> str:
    """Strip everything except ASCII letters and apostrophes, then lowercase."""
    return re.sub(r"[^A-Za-z']", "", token).lower()
|
|
|
|
|
def find_misspell_boxes(img: Image.Image) -> List[Box]:
    """Return boxes around OCR'd words that the spell checker does not know.

    The feature is best-effort: when OCR or spell checking is unavailable,
    or OCR fails, an empty list is returned instead of raising.

    Args:
        img: Image to run OCR on.

    Returns:
        One ``Box`` per unknown word of at least two letters, built from the
        word's OCR bounding rectangle.
    """
    if not (HAS_OCR and HAS_SPELLCHECK):
        return []
    try:
        spell = SpellChecker()
        data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
    except Exception:
        # Deliberate best-effort: a broken OCR setup must not abort the
        # whole comparison.
        return []

    try:
        words = zip(data["text"], data["left"], data["top"], data["width"], data["height"])
    except KeyError:
        # Without text or geometry columns there is nothing to box.
        return []

    boxes: List[Box] = []
    for text, left, top, width, height in words:
        if not text:
            continue
        token = normalize_token(text)
        if len(token) < 2 or token in spell:
            continue
        if width <= 0 or height <= 0:
            continue
        boxes.append(Box(top, left, top + height, left + width, width * height))
    return boxes
|
|
|
|
|
|
|
|
def ean_like_checksum_ok(digits: str) -> bool:
    """Check the trailing check digit of an EAN-8, UPC-A (12) or EAN-13 number.

    Args:
        digits: The full code, check digit included.

    Returns:
        False when ``digits`` is empty or contains anything but decimal
        digits. True for digit strings of any other length (no checksum rule
        is applied to them). Otherwise, whether the last digit matches the
        checksum of the preceding ones.
    """
    # isdecimal() (unlike isdigit()) rejects characters such as superscript
    # digits that int() cannot convert.
    if not digits.isdecimal():
        return False
    if len(digits) not in (8, 12, 13):
        return True

    *body, check = (int(c) for c in digits)
    # All three formats weight the digit next to the check digit by 3 and
    # alternate 3, 1, 3, ... moving left.
    total = sum(
        value * (3 if pos % 2 == 0 else 1)
        for pos, value in enumerate(reversed(body))
    )
    return (10 - total % 10) % 10 == check
|
|
|
|
|
def validate_symbology(symbology: str, data: bytes) -> bool:
    """Decide whether a decoded barcode payload looks valid for its symbology.

    Args:
        symbology: Symbology name as reported by the decoder; compared
            case-insensitively. ``None`` is treated as an empty name.
        data: Decoded payload. ``bytes``/``bytearray`` are decoded as UTF-8
            (undecodable bytes are dropped); a ``str`` is used as is.

    Returns:
        For EAN-13 / EAN-8 / UPC-A names: the result of
        ``ean_like_checksum_ok`` on the payload's digits. For every other
        symbology: whether the payload text is non-empty. Payloads of any
        other type are reported as invalid.
    """
    if isinstance(data, (bytes, bytearray)):
        text = data.decode('utf-8', errors='ignore')
    elif isinstance(data, str):
        text = data
    else:
        return False

    sym = (symbology or '').upper()
    if sym in ("EAN13", "EAN-13", "EAN8", "EAN-8", "UPCA", "UPC-A"):
        return ean_like_checksum_ok(re.sub(r"\D", "", text))
    # QR codes and all remaining symbologies only need a non-empty payload.
    return len(text) > 0
|
|
|
|
|
def boxes_from_rect(x: int, y: int, w: int, h: int) -> Box:
    """Convert a (left, top, width, height) rectangle into a ``Box``."""
    top, left = y, x
    bottom, right = top + h, left + w
    return Box(top, left, bottom, right, w * h)
|
|
|
|
|
def decode_with_variants(img: Image.Image):
    """Decode barcodes from ``img``, retrying on altered copies until one succeeds.

    Variants are tried in order: the image itself, a grayscale copy, a 2x
    bicubic upscale, and (for non-RGB input) an RGB copy. The results of the
    first variant that yields anything are returned; an empty list means
    nothing was found or barcode support is unavailable.
    """
    if not HAS_BARCODE:
        return []

    def attempt(candidate):
        """Run the decoder once; a decoder failure counts as 'nothing found'."""
        try:
            return list(zbar_decode(candidate) or [])
        except Exception:
            return []

    found = attempt(img)
    if not found:
        found = attempt(img.convert('L'))
    if not found:
        doubled = (img.width*2, img.height*2)
        found = attempt(img.resize(doubled, Image.BICUBIC))
    if not found and img.mode != 'RGB':
        found = attempt(img.convert('RGB'))
    return found
|
|
|
|
|
def find_barcode_boxes_and_info(img: Image.Image):
    """Detect barcodes in ``img`` and describe each one.

    Returns:
        A pair ``(boxes, infos)`` of equal length: ``boxes`` holds one
        ``Box`` per decoded symbol, ``infos`` one dict per symbol with the
        keys ``type``, ``data``, ``left``, ``top``, ``width``, ``height``
        and ``valid``.
    """
    boxes: List[Box] = []
    infos = []
    for hit in decode_with_variants(img):
        r = hit.rect
        payload = hit.data
        boxes.append(boxes_from_rect(r.left, r.top, r.width, r.height))
        is_valid = validate_symbology(hit.type, payload)
        # Present the payload as text whatever type the decoder delivered.
        if isinstance(payload, (bytes, bytearray)):
            shown = payload.decode('utf-8', errors='ignore')
        else:
            shown = str(payload)
        infos.append({
            'type': hit.type,
            'data': shown,
            'left': r.left,
            'top': r.top,
            'width': r.width,
            'height': r.height,
            'valid': bool(is_valid),
        })
    return boxes, infos
|
|
|
|
|
|
|
|
def rgb_to_cmyk_array(img: Image.Image) -> np.ndarray:
    """Convert ``img`` to PIL's CMYK mode and return its pixels as a float32 array."""
    cmyk = img.convert('CMYK')
    channels = np.asarray(cmyk)
    return channels.astype(np.float32)
|
|
|
|
|
def avg_cmyk_in_box(cmyk_arr: np.ndarray, box: Box) -> Tuple[float,float,float,float]:
    """Average the four CMYK channels inside ``box`` as percentages.

    Args:
        cmyk_arr: Array indexed as ``[row, column, channel]`` with four
            channels, e.g. the output of ``rgb_to_cmyk_array``.
        box: Region to average; it is clipped to the array bounds.

    Returns:
        ``(C, M, Y, K)`` as plain Python floats rounded to one decimal,
        scaled so that 255 maps to 100.0. An empty (or fully out-of-bounds)
        box yields all zeros.
    """
    rows, cols = cmyk_arr.shape[:2]
    y1, y2 = max(0, box.y1), min(rows, box.y2)
    x1, x2 = max(0, box.x1), min(cols, box.x2)
    if y2 <= y1 or x2 <= x1:
        return (0.0, 0.0, 0.0, 0.0)

    region = cmyk_arr[y1:y2, x1:x2, :]
    mean_vals = region.reshape(-1, 4).mean(axis=0, dtype=np.float64)
    # Convert to a Python float *before* rounding: rounding a float32 and
    # widening it afterwards yields values such as 12.199999809265137.
    return tuple(round(float(v) * 100.0 / 255.0, 1) for v in mean_vals)
|
|
|
|
|
def compute_cmyk_diffs(a_img: Image.Image, b_img: Image.Image, red_boxes: List[Box]):
    """Compare the average CMYK values of two images inside each box.

    Returns:
        One dict per box, in order, with the keys ``idx`` (1-based box
        number), ``A`` and ``B`` (average CMYK percentages in each image)
        and ``Delta`` (``B - A`` per channel, rounded to one decimal).
    """
    a_cmyk = rgb_to_cmyk_array(a_img)
    b_cmyk = rgb_to_cmyk_array(b_img)

    def describe(number, bx):
        before = avg_cmyk_in_box(a_cmyk, bx)
        after = avg_cmyk_in_box(b_cmyk, bx)
        change = tuple(round(new - old, 1) for old, new in zip(before, after))
        return {'idx': number, 'A': before, 'B': after, 'Delta': change}

    return [describe(number, bx) for number, bx in enumerate(red_boxes, start=1)]
|
|
|
|
|
def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns', panel_width: int = 260) -> Image.Image:
    """Return ``base`` with a light-grey text panel of CMYK values attached on the right.

    Args:
        base: Image placed on the left; it is not modified.
        entries: Sequence of dicts with the keys ``idx``, ``A``, ``B`` and
            ``Delta`` (each of the last three a 4-tuple of C, M, Y, K
            percentages), as produced by ``compute_cmyk_diffs``.
        title: Heading drawn at the top of the panel.
        panel_width: Width of the panel in pixels.

    Returns:
        A new RGB image of size ``(base.width + panel_width, base.height)``.
        Entries that no longer fit vertically are omitted.
    """
    w, h = base.size
    # Start from the panel colour and paste the page over the left part.
    out = Image.new('RGB', (w + panel_width, h), (245, 245, 245))
    out.paste(base, (0, 0))
    d = ImageDraw.Draw(out)

    x0 = w + 8
    y = 8
    d.text((x0, y), title, fill=(0, 0, 0))
    y += 18
    if not entries:
        d.text((x0, y), 'No differing regions', fill=(80, 80, 80))
        return out

    def cmyk_line(tag, values):
        """Format one 'tag: C .. M .. Y .. K ..' line with one decimal each."""
        c, m, yel, k = values
        return f"{tag}: C {c:.1f}% M {m:.1f}% Y {yel:.1f}% K {k:.1f}%"

    for e in entries:
        d.text((x0, y), f"#{e['idx']}", fill=(0, 0, 0))
        y += 14
        d.text((x0, y), cmyk_line("A", e['A']), fill=(0, 0, 0))
        y += 14
        d.text((x0, y), cmyk_line("B", e['B']), fill=(0, 0, 0))
        y += 14
        # ASCII label on purpose: PIL's bitmap default font cannot encode
        # characters outside Latin-1.
        d.text((x0, y), cmyk_line("Delta", e['Delta']), fill=(120, 0, 0))
        y += 18
        if y > h - 40:
            # Out of vertical space; remaining entries are not drawn.
            break
    return out
|
|
|
|
|
|
|
|
def compare_pdfs(file_a, file_b):
    """Compare the first pages of two uploaded files (Gradio click handler).

    Args:
        file_a: Reference upload. Either a filesystem path string or an
            object exposing the path as ``.name``; ``None`` when nothing
            was uploaded.
        file_b: Comparison upload, same conventions as ``file_a``.

    Returns:
        A 6-tuple ``(overlay, a_disp, b_disp, status, codes_a, codes_b)``:
        the red pixel-difference overlay, both annotated pages with the
        CMYK side panel, a Markdown status text, and one list of barcode
        table rows per file. On missing input or any failure the three
        images are ``None``, the status holds the message and both row
        lists are empty.
    """
    if file_a is None or file_b is None:
        return None, None, None, "❌ Please upload both PDF files to compare", [], []

    try:
        a = load_first_page(_uploaded_path(file_a), dpi=300)
        b = load_first_page(_uploaded_path(file_b), dpi=300)
        a, b = match_sizes(a, b)

        # Visual differences.
        diff = difference_map(a, b)
        red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)

        # OCR spell check and barcode detection; both return empty results
        # on their own when the optional dependency is missing.
        misspell_a = find_misspell_boxes(a)
        misspell_b = find_misspell_boxes(b)
        bar_a, info_a = find_barcode_boxes_and_info(a)
        bar_b, info_b = find_barcode_boxes_and_info(b)

        # Colour analysis of the differing regions.
        cmyk_entries = compute_cmyk_diffs(a, b, red_boxes)
        labels = [e['idx'] for e in cmyk_entries]

        a_boxed_core = draw_boxes_multi(a, red_boxes, misspell_a, bar_a, width=3, red_labels=labels)
        b_boxed_core = draw_boxes_multi(b, red_boxes, misspell_b, bar_b, width=3, red_labels=labels)
        a_disp = draw_cmyk_panel(a_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
        b_disp = draw_cmyk_panel(b_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')

        overlay = make_red_overlay(a, b)

        status = "\n".join([
            "📊 **Analysis Complete!**",
            f"- **Difference regions found:** {len(red_boxes)}",
            f"- **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}",
            f"- **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}",
            f"- **Image dimensions:** {a.width} × {a.height} pixels",
            "",
            "**Legend:**",
            "- 🔴 Red boxes: Visual differences",
            "- 🔵 Cyan boxes: Spelling errors",
            "- 🟢 Green boxes: Barcodes/QR codes",
        ])

        return overlay, a_disp, b_disp, status, _barcode_rows(info_a), _barcode_rows(info_b)

    except Exception as e:
        # UI boundary: report the failure in the status area instead of
        # letting the handler raise.
        return None, None, None, f"❌ **Error:** {e}", [], []


def _uploaded_path(upload) -> str:
    """Return the filesystem path of an upload given as a path string or an object with ``.name``."""
    return upload if isinstance(upload, str) else upload.name


def _barcode_rows(infos):
    """Turn barcode info dicts into rows ordered like the result table's headers."""
    return [
        [c.get('type', ''), c.get('data', ''), c.get('left', 0), c.get('top', 0),
         c.get('width', 0), c.get('height', 0), c.get('valid', False)]
        for c in infos
    ]
|
|
|
|
|
|
|
|
def create_demo():
    """Build the Gradio Blocks UI and wire the compare button to ``compare_pdfs``.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    barcode_headers = ["Type", "Data", "Left", "Top", "Width", "Height", "Valid"]

    with gr.Blocks(title="PDF Comparison Tool", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🔍 Advanced PDF Comparison Tool

        Upload two PDF files to get comprehensive analysis including:
        - **Visual differences** with bounding boxes
        - **OCR and spell checking**
        - **Barcode/QR code detection**
        - **CMYK color analysis**
        """)

        with gr.Row():
            with gr.Column():
                file_a = gr.File(label="📄 PDF A (Reference)", file_types=[".pdf"])
                file_b = gr.File(label="📄 PDF B (Comparison)", file_types=[".pdf"])

        compare_btn = gr.Button("🔍 Compare PDF Files", variant="primary", size="lg")

        status_md = gr.Markdown("")

        with gr.Row():
            overlay_img = gr.Image(label="🔴 Pixel Differences (Red = Different)", type="pil")

        with gr.Row():
            img_a = gr.Image(label="📄 File A with Analysis", type="pil")
            img_b = gr.Image(label="📄 File B with Analysis", type="pil")

        gr.Markdown("### 📊 Barcode Detection Results")
        with gr.Row():
            codes_a_df = gr.Dataframe(
                headers=barcode_headers,
                label="Barcodes in File A",
                interactive=False,
            )
            codes_b_df = gr.Dataframe(
                headers=barcode_headers,
                label="Barcodes in File B",
                interactive=False,
            )

        # Output order must match the tuple returned by compare_pdfs.
        compare_btn.click(
            fn=compare_pdfs,
            inputs=[file_a, file_b],
            outputs=[overlay_img, img_a, img_b, status_md, codes_a_df, codes_b_df],
        )

        gr.Markdown("""
        ### 📋 Instructions:
        1. Upload two PDF files
        2. Click "Compare PDF Files"
        3. View results with comprehensive analysis

        ### 🎨 Color Legend:
        - **🔴 Red boxes:** Visual differences between files
        - **🔵 Cyan boxes:** Potential spelling errors (OCR)
        - **🟢 Green boxes:** Detected barcodes/QR codes
        - **📊 Side panel:** CMYK color analysis for print workflows
        """)

    return demo
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the UI, then start the Gradio server.
    launch_options = {
        "server_name": "0.0.0.0",
        "share": True,
        "show_error": True,
    }
    demo = create_demo()
    demo.launch(**launch_options)
|
|
|