ProofCheck / pdf_comparator.py
Yaz Hobooti
Fix PDF processing: add poppler dev package and better error handling
bae9f7f
raw
history blame
15.6 kB
#!/usr/bin/env python3
"""
Gradio PDF Comparison Tool
Upload two PDF files and get a comprehensive analysis of their first pages, including visual differences, OCR spell checking, barcode detection, and CMYK analysis.
"""
import os, sys, re, csv, json, io
from dataclasses import dataclass
from typing import List, Tuple, Optional
import tempfile
import numpy as np
from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError
from pdf2image import convert_from_path
from skimage.measure import label, regionprops
from skimage.morphology import dilation, rectangle
import gradio as gr
# Optional features: every dependency below is probed once at import time.
# When the import fails for any reason the name is bound to None, and the
# matching HAS_* flag simply records whether the name is usable.
try:
    import pytesseract
except Exception:
    pytesseract = None
HAS_OCR = pytesseract is not None

try:
    from spellchecker import SpellChecker
except Exception:
    SpellChecker = None
HAS_SPELLCHECK = SpellChecker is not None

try:
    from pyzbar.pyzbar import decode as zbar_decode
except Exception:
    zbar_decode = None
HAS_BARCODE = zbar_decode is not None
# -------------------- Core Data --------------------
@dataclass
class Box:
    """Rectangle in image pixel coordinates, stored row-first.

    Positional construction order is ``(y1, x1, y2, x2, area)``.
    """

    # Top edge (row index).
    y1: int
    # Left edge (column index).
    x1: int
    # Bottom edge (row index).
    y2: int
    # Right edge (column index).
    x2: int
    # Area supplied by whoever built the box (region pixel count or w * h).
    area: int
# -------------------- Helpers ----------------------
def _is_pdf(path: str) -> bool:
return os.path.splitext(path.lower())[1] == ".pdf"
def load_first_page(path: str, dpi: int = 300) -> Image.Image:
    """Load *path* as an RGB image, rasterising page 1 when it is a PDF.

    Args:
        path: File to load. Anything that is not a ``.pdf`` is opened
            directly as an image.
        dpi: Rasterisation resolution passed to ``convert_from_path``.

    Returns:
        The first page (or the image itself) converted to mode ``"RGB"``.

    Raises:
        ValueError: When both PDF conversion attempts fail. The second
            failure is chained as ``__cause__`` and both messages are
            included in the text.
    """
    if not _is_pdf(path):
        # Close the underlying file handle once the pixels have been copied.
        with Image.open(path) as source:
            return source.convert("RGB")

    def render(**poppler_options):
        # Rasterise only page 1; an empty result is treated as a failure.
        pages = convert_from_path(
            path, dpi=dpi, first_page=1, last_page=1, **poppler_options
        )
        if not pages:
            raise ValueError(f"No pages in PDF: {path}")
        return pages[0].convert("RGB")

    try:
        # First attempt: poppler binaries at the explicit system location.
        return render(poppler_path="/usr/bin")
    except Exception as e1:
        try:
            # Fallback: let pdf2image locate poppler on its own.
            return render()
        except Exception as e2:
            raise ValueError(
                f"Failed to convert PDF to image. Error 1: {str(e1)}. Error 2: {str(e2)}. Make sure poppler-utils is installed."
            ) from e2
def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
    """Crop both images to their shared top-left area so they have equal size.

    Images that already have the same size are returned untouched.
    """
    if a.size != b.size:
        shared_area = (0, 0, min(a.width, b.width), min(a.height, b.height))
        a, b = a.crop(shared_area), b.crop(shared_area)
    return a, b
def difference_map(a: Image.Image, b: Image.Image) -> Image.Image:
    """Return the ``ImageChops.difference`` image of *a* and *b*."""
    delta = ImageChops.difference(a, b)
    return delta
def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int = 25) -> List[Box]:
    """Turn a difference image into bounding boxes of changed regions.

    A pixel counts as changed when its largest channel value is at least
    *threshold*. The changed mask is dilated with a 3x3 footprint, labelled
    with ``connectivity=2``, and every labelled region whose area is at
    least *min_area* becomes a ``Box``.
    """
    channels = np.asarray(diff_img).astype(np.uint16)
    strongest = channels.max(axis=2).astype(np.uint8)
    changed = (strongest >= threshold).astype(np.uint8)
    # Grow the mask slightly so nearby changed pixels merge into one region.
    grown = dilation(changed, rectangle(3, 3))
    regions = regionprops(label(grown, connectivity=2))
    # region.bbox is (min_row, min_col, max_row, max_col), matching Box order.
    return [
        Box(*region.bbox, int(region.area))
        for region in regions
        if region.area >= min_area
    ]
def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box], green_boxes: List[Box] = None,
                     width: int = 3, red_labels: List[int] = None) -> Image.Image:
    """Return a copy of *img* with colour-coded rectangles drawn on it.

    Red boxes mark differences (optionally numbered from *red_labels*),
    cyan boxes mark misspellings and green boxes mark barcodes. Each outline
    is drawn *width* times, growing outwards by one pixel per pass.
    """
    canvas = img.copy()
    pen = ImageDraw.Draw(canvas)

    def outline_all(boxes, colour):
        # One 1px rectangle per pass, each one pixel larger than the last.
        for box in boxes:
            for grow in range(width):
                pen.rectangle(
                    [box.x1 - grow, box.y1 - grow, box.x2 + grow, box.y2 + grow],
                    outline=colour,
                )

    # red (diff)
    outline_all(red_boxes, (255, 0, 0))

    # labels for red boxes
    if red_labels:
        for position, box in enumerate(red_boxes):
            if position < len(red_labels):
                caption = str(red_labels[position])
            else:
                # Fall back to the 1-based position when labels run out.
                caption = str(position + 1)
            text_x = max(0, box.x1 + 3)
            text_y = max(0, box.y1 + 3)
            # White backing square keeps the caption legible on any content.
            pen.rectangle([text_x - 2, text_y - 2, text_x + 14, text_y + 14], fill=(255, 255, 255))
            pen.text((text_x, text_y), caption, fill=(0, 0, 0))

    # cyan (misspellings)
    outline_all(cyan_boxes, (0, 255, 255))

    # green (barcodes)
    if green_boxes:
        outline_all(green_boxes, (0, 255, 0))

    return canvas
def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
    """Return a copy of *a* with every pixel that differs from *b* painted red."""
    reference = np.asarray(b)
    painted = np.asarray(a).copy()
    # A pixel differs when any of its channels differs.
    differs = np.any(painted != reference, axis=2)
    painted[differs] = [255, 0, 0]
    return Image.fromarray(painted)
# -------------------- OCR + Spellcheck -------------
def normalize_token(token: str) -> str:
    """Strip everything except ASCII letters and apostrophes, then lowercase."""
    letters_only = re.compile(r"[^A-Za-z']").sub("", token)
    return letters_only.lower()
def find_misspell_boxes(img: Image.Image) -> List[Box]:
    """OCR *img* and return a box around every word the spell checker rejects.

    Returns an empty list when OCR or spell checking is unavailable, or when
    creating the checker / running OCR raises.
    """
    if not HAS_OCR or not HAS_SPELLCHECK:
        return []
    try:
        checker = SpellChecker()
        ocr = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
    except Exception:
        return []

    flagged: List[Box] = []
    for idx in range(len(ocr.get("text", []))):
        raw = ocr["text"][idx]
        word = normalize_token(raw) if raw else ""
        # Skip blanks, single letters, and words the dictionary knows.
        if len(word) < 2 or word in checker:
            continue
        left, top, box_w, box_h = (
            ocr.get(field, [0])[idx] for field in ("left", "top", "width", "height")
        )
        if box_w <= 0 or box_h <= 0:
            continue
        flagged.append(Box(top, left, top + box_h, left + box_w, box_w * box_h))
    return flagged
# -------------------- Barcode / QR -----------------
def ean_like_checksum_ok(digits: str) -> bool:
    """Check the trailing check digit of an EAN-8, UPC-A or EAN-13 number.

    Args:
        digits: Candidate number as a string.

    Returns:
        ``False`` when *digits* is empty or contains anything other than
        ASCII digits. ``True`` when the length is not 8, 12 or 13 (no
        checksum rule is applied to other lengths). Otherwise whether the
        last digit matches the mod-10 check digit of the preceding ones.
    """
    # isdigit() alone also accepts characters such as superscripts that
    # int() cannot parse, so restrict the input to ASCII first.
    if not (digits.isascii() and digits.isdigit()):
        return False
    if len(digits) not in (8, 12, 13):
        return True

    *body, check = (int(ch) for ch in digits)
    # All three formats weight the body 3, 1, 3, ... starting from its
    # rightmost digit; the left-to-right pattern only differs because the
    # body length is odd (7, 11) or even (12).
    weighted = sum(
        value * (3 if offset % 2 == 0 else 1)
        for offset, value in enumerate(reversed(body))
    )
    return (10 - weighted % 10) % 10 == check
def validate_symbology(symbology: str, data: bytes) -> bool:
    """Decide whether a decoded barcode payload looks valid for its type.

    Args:
        symbology: Symbology name; compared case-insensitively. ``None`` or
            empty is treated as an unknown type.
        data: Payload as ``bytes``/``bytearray`` (decoded as UTF-8, invalid
            bytes dropped) or as an already-decoded ``str``.

    Returns:
        For EAN-13 / EAN-8 / UPC-A names, the result of
        ``ean_like_checksum_ok`` on the payload's digits. For every other
        symbology, whether the payload text is non-empty. ``False`` when
        *data* is neither bytes-like nor a string.
    """
    if isinstance(data, (bytes, bytearray)):
        text = data.decode('utf-8', errors='ignore')
    elif isinstance(data, str):
        # Accept text payloads too, matching how callers in this file
        # already tolerate non-bytes data when reporting it.
        text = data
    else:
        return False

    sym = (symbology or '').upper()
    if sym in ("EAN13", "EAN-13", "EAN8", "EAN-8", "UPCA", "UPC-A"):
        return ean_like_checksum_ok(re.sub(r"\D", "", text))
    # QR codes and all remaining symbologies only need a non-empty payload.
    return len(text) > 0
def boxes_from_rect(x: int, y: int, w: int, h: int) -> Box:
    """Build a ``Box`` from a left/top/width/height rectangle."""
    bottom = y + h
    right = x + w
    return Box(y1=y, x1=x, y2=bottom, x2=right, area=w * h)
def decode_with_variants(img: Image.Image):
    """Decode barcodes in *img*, retrying on pre-processed variants.

    Attempts, in order, stop at the first one that finds anything: the
    image as given, a grayscale copy, a 2x bicubic upscale, and (only when
    the image is not already RGB) an RGB copy.

    Returns:
        The decoder's result objects, or ``[]`` when barcode support is
        unavailable or nothing was found. Results found on the upscaled
        copy have their ``rect`` (and ``polygon`` when present) divided by
        the scale factor so coordinates always refer to *img*.
    """
    if not HAS_BARCODE:
        return []

    def rescale(symbol, factor):
        # Results are assumed to be namedtuples (as pyzbar returns), so
        # _replace yields an adjusted copy without mutating the original.
        rect = symbol.rect
        changes = {
            "rect": rect._replace(
                left=rect.left // factor,
                top=rect.top // factor,
                width=rect.width // factor,
                height=rect.height // factor,
            )
        }
        corners = getattr(symbol, "polygon", None)
        if corners:
            changes["polygon"] = [
                pt._replace(x=pt.x // factor, y=pt.y // factor) for pt in corners
            ]
        return symbol._replace(**changes)

    def attempt(candidate, factor=1):
        # Decoding is best-effort: a failing variant just contributes nothing.
        try:
            found = list(zbar_decode(candidate) or [])
        except Exception:
            return []
        if factor == 1:
            return found
        try:
            return [rescale(symbol, factor) for symbol in found]
        except (AttributeError, TypeError):
            # Unexpected result shape: keep the raw results rather than lose them.
            return found

    results = attempt(img)
    if not results:
        results = attempt(img.convert('L'))
    if not results:
        upscaled = img.resize((img.width * 2, img.height * 2), Image.BICUBIC)
        results = attempt(upscaled, factor=2)
    if not results and img.mode != 'RGB':
        results = attempt(img.convert('RGB'))
    return results
def find_barcode_boxes_and_info(img: Image.Image):
    """Locate barcodes in *img*.

    Returns:
        ``(boxes, infos)``: one ``Box`` per decoded symbol, and a parallel
        list of dicts with keys ``type``, ``data``, ``left``, ``top``,
        ``width``, ``height`` and ``valid``.
    """
    boxes: List[Box] = []
    infos = []
    for symbol in decode_with_variants(img):
        area = symbol.rect
        payload = symbol.data
        boxes.append(boxes_from_rect(area.left, area.top, area.width, area.height))
        is_valid = validate_symbology(symbol.type, payload)
        if isinstance(payload, (bytes, bytearray)):
            payload_text = payload.decode('utf-8', errors='ignore')
        else:
            payload_text = str(payload)
        infos.append({
            'type': symbol.type,
            'data': payload_text,
            'left': area.left,
            'top': area.top,
            'width': area.width,
            'height': area.height,
            'valid': bool(is_valid),
        })
    return boxes, infos
# -------------------- CMYK Panel -------------------
def rgb_to_cmyk_array(img: Image.Image) -> np.ndarray:
    """Convert *img* to CMYK and return it as a float32 array (values 0..255).

    NOTE(review): this relies on Pillow's built-in mode conversion, with no
    colour profile applied here - confirm the K channel is meaningful for
    the intended print workflow.
    """
    cmyk_image = img.convert('CMYK')
    return np.asarray(cmyk_image).astype(np.float32)
def avg_cmyk_in_box(cmyk_arr: np.ndarray, box: Box) -> Tuple[float,float,float,float]:
    """Average each CMYK channel inside *box*, as percentages with one decimal.

    The box is clipped to the array bounds; a box with no remaining area
    yields ``(0.0, 0.0, 0.0, 0.0)``.
    """
    rows, cols = cmyk_arr.shape[0], cmyk_arr.shape[1]
    top, bottom = max(0, box.y1), min(rows, box.y2)
    left, right = max(0, box.x1), min(cols, box.x2)
    if bottom <= top or right <= left:
        return (0.0, 0.0, 0.0, 0.0)
    channel_means = cmyk_arr[top:bottom, left:right, :].reshape(-1, 4).mean(axis=0)
    # Scale 0..255 channel values to 0..100 percent.
    return tuple(float(round(mean * 100.0 / 255.0, 1)) for mean in channel_means)
def compute_cmyk_diffs(a_img: Image.Image, b_img: Image.Image, red_boxes: List[Box]):
    """Compare average CMYK values of both images inside every diff box.

    Returns:
        A list of dicts with keys ``idx`` (1-based box number), ``A``, ``B``
        (per-image CMYK percentages) and ``Delta`` (``B - A`` per channel,
        rounded to one decimal).
    """
    cmyk_a = rgb_to_cmyk_array(a_img)
    cmyk_b = rgb_to_cmyk_array(b_img)
    entries = []
    for number, region in enumerate(red_boxes, start=1):
        before = avg_cmyk_in_box(cmyk_a, region)
        after = avg_cmyk_in_box(cmyk_b, region)
        change = tuple(round(new - old, 1) for old, new in zip(before, after))
        entries.append({'idx': number, 'A': before, 'B': after, 'Delta': change})
    return entries
def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns', panel_width: int = 260) -> Image.Image:
    """Return *base* with a text panel of CMYK values attached on the right.

    Args:
        base: Image to extend.
        entries: Dicts with keys ``idx``, ``A``, ``B`` and ``Delta``, where
            the last three are 4-tuples of C, M, Y, K values.
        title: Heading drawn at the top of the panel.
        panel_width: Width in pixels of the added panel.

    Returns:
        A new RGB image, ``panel_width`` pixels wider than *base*. Entries
        that would not fit in the image height are silently left out.
    """
    w, h = base.size
    panel = Image.new('RGB', (panel_width, h), (245, 245, 245))
    out = Image.new('RGB', (w + panel_width, h), (255, 255, 255))
    out.paste(base, (0, 0))
    out.paste(panel, (w, 0))
    d = ImageDraw.Draw(out)

    x0 = w + 8
    y = 8
    d.text((x0, y), title, fill=(0, 0, 0))
    y += 18

    if not entries:
        d.text((x0, y), 'No differing regions', fill=(80, 80, 80))
        return out

    for e in entries:
        idx = e['idx']
        aC, aM, aY, aK = e['A']
        bC, bM, bY, bK = e['B']
        dC, dM, dY, dK = e['Delta']

        d.text((x0, y), f"#{idx}", fill=(0, 0, 0))
        y += 14
        d.text((x0, y), f"A: C {aC}% M {aM}% Y {aY}% K {aK}%", fill=(0, 0, 0))
        y += 14
        d.text((x0, y), f"B: C {bC}% M {bM}% Y {bY}% K {bK}%", fill=(0, 0, 0))
        y += 14

        delta_values = f": C {dC}% M {dM}% Y {dY}% K {dK}%"
        try:
            # U+0394 is the Greek capital delta; the escape keeps this
            # source line ASCII so the label cannot be mis-encoded again.
            d.text((x0, y), "\u0394" + delta_values, fill=(120, 0, 0))
        except UnicodeEncodeError:
            # NOTE(review): some default fonts are believed to accept only
            # Latin-1 text; fall back to an ASCII label in that case.
            d.text((x0, y), "Delta" + delta_values, fill=(120, 0, 0))
        y += 18

        # Stop before running off the bottom of the image.
        if y > h - 40:
            break
    return out
# -------------------- Gradio Interface -----------------
def compare_pdfs(file_a, file_b):
    """Main comparison function for the Gradio interface.

    Args:
        file_a: Reference upload. Either an object with a ``name`` attribute
            holding the file path, or the path itself as a string.
        file_b: Comparison upload, same forms as *file_a*.

    Returns:
        A 6-tuple ``(overlay, a_disp, b_disp, status, codes_a, codes_b)``:
        the red pixel-difference overlay, both annotated images with their
        CMYK panel, a Markdown status string, and one table (list of rows)
        of barcode details per file. On missing input or on any error the
        three images are ``None``, the tables are empty, and *status* holds
        the message.
    """
    try:
        if file_a is None or file_b is None:
            return None, None, None, "❌ Please upload both PDF files to compare", [], []

        # Accept both upload objects (with .name) and plain path strings.
        path_a = getattr(file_a, "name", file_a)
        path_b = getattr(file_b, "name", file_b)

        # Load images with default settings
        a = load_first_page(path_a, dpi=300)
        b = load_first_page(path_b, dpi=300)

        # Match sizes
        a, b = match_sizes(a, b)

        # Find differences with default settings
        diff = difference_map(a, b)
        red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)

        # Both helpers return empty results on their own when the optional
        # dependency they need is unavailable, so no flag checks are needed.
        misspell_a = find_misspell_boxes(a)
        misspell_b = find_misspell_boxes(b)
        bar_a, info_a = find_barcode_boxes_and_info(a)
        bar_b, info_b = find_barcode_boxes_and_info(b)

        # Always enable CMYK analysis
        cmyk_entries = compute_cmyk_diffs(a, b, red_boxes)
        labels = [e['idx'] for e in cmyk_entries]

        # Create visualizations with default box width
        a_boxed_core = draw_boxes_multi(a, red_boxes, misspell_a, bar_a, width=3, red_labels=labels)
        b_boxed_core = draw_boxes_multi(b, red_boxes, misspell_b, bar_b, width=3, red_labels=labels)

        # Always show CMYK panel
        a_disp = draw_cmyk_panel(a_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
        b_disp = draw_cmyk_panel(b_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')

        # Create pixel difference overlay
        overlay = make_red_overlay(a, b)

        # Status message, built line by line so source indentation can never
        # leak into the rendered Markdown.
        status = "\n".join([
            "",
            "📊 **Analysis Complete!**",
            "",
            f"- **Difference regions found:** {len(red_boxes)}",
            f"- **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}",
            f"- **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}",
            f"- **Image dimensions:** {a.width} × {a.height} pixels",
            "",
            "**Legend:**",
            "",
            "- 🔴 Red boxes: Visual differences",
            "- 🔵 Cyan boxes: Spelling errors",
            "- 🟢 Green boxes: Barcodes/QR codes",
            "",
        ])

        def table_rows(infos):
            # One row per barcode, in the column order of the result tables.
            return [
                [c.get('type', ''), c.get('data', ''), c.get('left', 0), c.get('top', 0),
                 c.get('width', 0), c.get('height', 0), c.get('valid', False)]
                for c in infos
            ]

        return overlay, a_disp, b_disp, status, table_rows(info_a), table_rows(info_b)

    except Exception as e:
        # UI boundary: show the message to the user, but keep the full
        # traceback in the server log instead of discarding it.
        import traceback
        traceback.print_exc()
        error_msg = f"❌ **Error:** {str(e)}"
        return None, None, None, error_msg, [], []
# -------------------- Gradio App -------------------
def create_demo():
    """Build and return the Gradio Blocks UI for the comparison tool.

    The page holds two PDF upload fields and a compare button, a status
    area, the pixel-difference overlay, both annotated pages, and one
    barcode table per file. Clicking the button runs ``compare_pdfs``.
    """
    barcode_headers = ["Type", "Data", "Left", "Top", "Width", "Height", "Valid"]

    with gr.Blocks(title="PDF Comparison Tool", theme=gr.themes.Soft()) as demo:
        # Markdown is assembled from explicit lines so that code indentation
        # never ends up inside the rendered text.
        gr.Markdown(
            "# 🔍 Advanced PDF Comparison Tool\n"
            "\n"
            "Upload two PDF files to get comprehensive analysis including:\n"
            "\n"
            "- **Visual differences** with bounding boxes\n"
            "- **OCR and spell checking**\n"
            "- **Barcode/QR code detection**\n"
            "- **CMYK color analysis**\n"
        )

        with gr.Row():
            with gr.Column():
                file_a = gr.File(label="📄 PDF A (Reference)", file_types=[".pdf"])
                file_b = gr.File(label="📄 PDF B (Comparison)", file_types=[".pdf"])
                compare_btn = gr.Button("🔍 Compare PDF Files", variant="primary", size="lg")

        status_md = gr.Markdown("")

        with gr.Row():
            overlay_img = gr.Image(label="🔴 Pixel Differences (Red = Different)", type="pil")

        with gr.Row():
            img_a = gr.Image(label="📄 File A with Analysis", type="pil")
            img_b = gr.Image(label="📄 File B with Analysis", type="pil")

        gr.Markdown("### 📊 Barcode Detection Results")
        with gr.Row():
            codes_a_df = gr.Dataframe(
                headers=barcode_headers,
                label="Barcodes in File A",
                interactive=False,
            )
            codes_b_df = gr.Dataframe(
                headers=barcode_headers,
                label="Barcodes in File B",
                interactive=False,
            )

        # Event handlers
        compare_btn.click(
            fn=compare_pdfs,
            inputs=[file_a, file_b],
            outputs=[overlay_img, img_a, img_b, status_md, codes_a_df, codes_b_df],
        )

        gr.Markdown(
            "### 📝 Instructions:\n"
            "\n"
            "1. Upload two PDF files\n"
            "2. Click \"Compare PDF Files\"\n"
            "3. View results with comprehensive analysis\n"
            "\n"
            "### 🎨 Color Legend:\n"
            "\n"
            "- **🔴 Red boxes:** Visual differences between files\n"
            "- **🔵 Cyan boxes:** Potential spelling errors (OCR)\n"
            "- **🟢 Green boxes:** Detected barcodes/QR codes\n"
            "- **📊 Side panel:** CMYK color analysis for print workflows\n"
        )

    return demo
if __name__ == "__main__":
    # Launch settings, gathered in one place:
    #   server_name "0.0.0.0" - allow external access
    #   share True            - create a public link
    #   show_error True       - passed through to launch() unchanged
    launch_options = {
        "server_name": "0.0.0.0",
        "share": True,
        "show_error": True,
    }
    demo = create_demo()
    demo.launch(**launch_options)