ProofCheck / pdf_comparator.py
Yaz Hobooti
Fix Unicode encoding error: replace Δ with Delta
4899a48
raw
history blame
16.5 kB
#!/usr/bin/env python3
"""
Gradio PDF Comparison Tool
Upload two PDF files and get comprehensive analysis including differences, OCR, barcodes, and CMYK analysis.
"""
import os, sys, re, csv, json, io
from dataclasses import dataclass
from typing import List, Tuple, Optional
import tempfile
import numpy as np
from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError
from pdf2image import convert_from_path
from skimage.measure import label, regionprops
from skimage.morphology import dilation, rectangle
import gradio as gr
# Alternative PDF processing
# Every optional dependency below follows the same pattern: try the import,
# and on any failure bind the name to None and set the matching HAS_* flag to
# False so the rest of the module can test the flag before using the feature.
try:
    import fitz # PyMuPDF
    HAS_PYMUPDF = True
except Exception:
    fitz = None
    HAS_PYMUPDF = False
# Optional features
# OCR (pytesseract)
try:
    import pytesseract
    HAS_OCR = True
except Exception:
    pytesseract = None
    HAS_OCR = False
# Spell checking (pyspellchecker)
try:
    from spellchecker import SpellChecker
    HAS_SPELLCHECK = True
except Exception:
    SpellChecker = None
    HAS_SPELLCHECK = False
# Barcode / QR decoding (pyzbar)
try:
    from pyzbar.pyzbar import decode as zbar_decode
    HAS_BARCODE = True
except Exception:
    zbar_decode = None
    HAS_BARCODE = False
# -------------------- Core Data --------------------
@dataclass
class Box:
    """Axis-aligned rectangle in pixel coordinates plus an area value.

    Field order is (y1, x1, y2, x2, area); positional construction
    elsewhere in this module relies on that order.
    """

    y1: int  # top edge (row)
    x1: int  # left edge (column)
    y2: int  # bottom edge (row)
    x2: int  # right edge (column)
    area: int  # area value supplied by whoever builds the box
# -------------------- Helpers ----------------------
def _is_pdf(path: str) -> bool:
return os.path.splitext(path.lower())[1] == ".pdf"
def load_first_page(path: str, dpi: int = 300) -> Image.Image:
    """Load the first page of a PDF, or a plain image file, as an RGB image.

    For PDFs, pdf2image is tried first with several candidate poppler
    locations (and finally with no explicit location). If none of those
    attempts yields a page and PyMuPDF is available, PyMuPDF renders the page
    instead. Any other file is opened directly with Pillow.

    Args:
        path: File system path of the PDF or image.
        dpi: Rendering resolution used for PDF pages.

    Returns:
        The first page (or the image) converted to mode "RGB".

    Raises:
        ValueError: If the PDF could not be rendered by any backend, or if it
            contains no pages. The message carries the underlying error text.
    """
    if not _is_pdf(path):
        # The context manager releases the file handle once the RGB copy
        # (a fully loaded, independent image) has been produced.
        with Image.open(path) as opened:
            return opened.convert("RGB")

    # Remember the most recent pdf2image failure so it can be reported
    # instead of a guessed cause.
    pdf2image_error: Optional[Exception] = None
    for poppler_path in ("/usr/bin", "/usr/local/bin", "/bin", None):
        extra = {"poppler_path": poppler_path} if poppler_path else {}
        try:
            pages = convert_from_path(path, dpi=dpi, first_page=1, last_page=1, **extra)
            if pages:
                return pages[0].convert("RGB")
        except Exception as exc:  # best effort: fall through to the next candidate
            pdf2image_error = exc

    pdf2image_reason = (
        str(pdf2image_error) if pdf2image_error is not None else "no pages returned"
    )

    if not HAS_PYMUPDF:
        raise ValueError(
            "Failed to convert PDF to image with all poppler paths. "
            f"Last error: {pdf2image_reason}. PyMuPDF not available as fallback."
        ) from pdf2image_error

    try:
        doc = fitz.open(path)
        try:
            if len(doc) > 0:
                # PyMuPDF renders at 72 dpi by default; scale to the requested dpi.
                zoom = dpi / 72
                pix = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                with Image.open(io.BytesIO(pix.tobytes("ppm"))) as rendered:
                    return rendered.convert("RGB")
        finally:
            # Always release the document, including when rendering fails.
            doc.close()
    except Exception as exc:
        raise ValueError(
            "Failed to convert PDF with both pdf2image and PyMuPDF. "
            f"pdf2image error: {pdf2image_reason}. PyMuPDF error: {exc}"
        ) from exc

    # Reached only when PyMuPDF opened the document but it has zero pages.
    raise ValueError(f"No pages in PDF: {path}")
def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
    """Crop both images to their common top-left region.

    Images that already share a size are returned untouched; otherwise each is
    cropped to (min width, min height), anchored at the origin.
    """
    if a.size != b.size:
        shared_region = (0, 0, min(a.width, b.width), min(a.height, b.height))
        a, b = a.crop(shared_region), b.crop(shared_region)
    return a, b
def difference_map(a: Image.Image, b: Image.Image) -> Image.Image:
    """Return the difference image of *a* and *b* via ``ImageChops.difference``."""
    delta_image = ImageChops.difference(a, b)
    return delta_image
def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int = 25) -> List[Box]:
    """Locate connected regions of a difference image that exceed a threshold.

    Args:
        diff_img: Difference image; it is reduced over its third (channel)
            axis, so a multi-channel image is expected.
        threshold: Minimum per-pixel channel maximum counted as "different".
        min_area: Regions whose labelled area is below this value are dropped.

    Returns:
        One ``Box`` per surviving region, built from the region's bounding box
        and its (dilated) area.
    """
    channels = np.asarray(diff_img).astype(np.uint16)
    strongest = channels.max(axis=2).astype(np.uint8)
    changed = (strongest >= threshold).astype(np.uint8)
    # Grow the mask with a 3x3 footprint before labelling.
    grown = dilation(changed, rectangle(3, 3))
    regions = regionprops(label(grown, connectivity=2))

    found: List[Box] = []
    for region in regions:
        if region.area >= min_area:
            top, left, bottom, right = region.bbox
            found.append(Box(top, left, bottom, right, int(region.area)))
    return found
def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box], green_boxes: List[Box] = None,
                     width: int = 3, red_labels: List[int] = None) -> Image.Image:
    """Return a copy of *img* with three colour-coded sets of box outlines.

    Drawing order is: red outlines, red labels, cyan outlines, green outlines.

    Args:
        img: Image to annotate; it is copied, not modified.
        red_boxes: Boxes outlined in red (differences).
        cyan_boxes: Boxes outlined in cyan (misspellings).
        green_boxes: Optional boxes outlined in green (barcodes).
        width: Number of concentric one-pixel outlines per box.
        red_labels: Optional labels for the red boxes; a box without a
            matching label falls back to its 1-based position.
    """
    canvas = img.copy()
    pen = ImageDraw.Draw(canvas)

    def outline_all(boxes, colour):
        # Thickness is emulated by stacking outlines that grow outwards.
        for bx in boxes:
            for grow in range(width):
                pen.rectangle(
                    [bx.x1 - grow, bx.y1 - grow, bx.x2 + grow, bx.y2 + grow],
                    outline=colour,
                )

    # red (diff)
    outline_all(red_boxes, (255, 0, 0))

    # labels for red boxes
    if red_labels:
        for position, bx in enumerate(red_boxes):
            if position < len(red_labels):
                caption = str(red_labels[position])
            else:
                caption = str(position + 1)
            text_x = max(0, bx.x1 + 3)
            text_y = max(0, bx.y1 + 3)
            # White backing square keeps the caption readable.
            pen.rectangle([text_x - 2, text_y - 2, text_x + 14, text_y + 14], fill=(255, 255, 255))
            pen.text((text_x, text_y), caption, fill=(0, 0, 0))

    # cyan (misspellings)
    outline_all(cyan_boxes, (0, 255, 255))

    # green (barcodes)
    if green_boxes:
        outline_all(green_boxes, (0, 255, 0))

    return canvas
def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
    """Return a copy of *a* where every pixel that differs from *b* is pure red."""
    painted = np.array(a)  # np.array copies, so *a* itself is left untouched
    reference = np.asarray(b)
    differs = (painted != reference).any(axis=2)
    painted[differs] = (255, 0, 0)
    return Image.fromarray(painted)
# -------------------- OCR + Spellcheck -------------
def normalize_token(token: str) -> str:
    """Strip everything except ASCII letters and apostrophes, then lowercase."""
    return re.sub(r"[^A-Za-z']", "", token).lower()
def find_misspell_boxes(img: Image.Image) -> List[Box]:
    """Run OCR over *img* and return boxes around words the spell checker rejects.

    Returns an empty list when OCR or spell checking is unavailable, or when
    setting up the checker / running OCR raises. Tokens shorter than two
    characters after normalisation, and OCR entries with a non-positive width
    or height, are ignored.
    """
    if not HAS_OCR or not HAS_SPELLCHECK:
        return []
    try:
        checker = SpellChecker()
        ocr = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
    except Exception:
        return []

    flagged: List[Box] = []
    for idx in range(len(ocr.get("text", []))):
        raw = ocr["text"][idx]
        if not raw:
            continue
        word = normalize_token(raw)
        if len(word) < 2 or word in checker:
            continue
        x = ocr.get("left", [0])[idx]
        y = ocr.get("top", [0])[idx]
        w = ocr.get("width", [0])[idx]
        h = ocr.get("height", [0])[idx]
        if w > 0 and h > 0:
            flagged.append(Box(y, x, y + h, x + w, w * h))
    return flagged
# -------------------- Barcode / QR -----------------
def ean_like_checksum_ok(digits: str) -> bool:
    """Check the trailing check digit of an 8-, 12- or 13-digit code.

    The last digit is compared with the modulo-10 check digit computed from
    the preceding digits, using weights that alternate 3, 1, 3, ... starting
    from the digit immediately left of the check digit.

    Args:
        digits: The code as a string of decimal digits.

    Returns:
        False if *digits* is empty or contains a non-decimal character.
        True if the length is not 8, 12 or 13 (nothing to verify).
        Otherwise whether the check digit matches.
    """
    # isdecimal() (unlike isdigit()) only accepts characters that int() can
    # parse, so inputs such as superscript digits are rejected, not a crash.
    if not digits.isdecimal():
        return False
    if len(digits) not in (8, 12, 13):
        return True

    *body, check = (int(ch) for ch in digits)
    # Weighting from the right gives the same 3/1 pattern for all three
    # lengths, so no per-length branch is needed.
    weighted = sum(
        value * (3 if offset % 2 == 0 else 1)
        for offset, value in enumerate(reversed(body))
    )
    return (10 - weighted % 10) % 10 == check
def validate_symbology(symbology: str, data: bytes) -> bool:
    """Report whether a decoded barcode payload looks valid for its symbology.

    EAN-13 / EAN-8 / UPC-A payloads have their non-digit characters removed
    and are then checked with ``ean_like_checksum_ok``. Every other symbology
    (QR codes included) is considered valid when its text is non-empty.

    Args:
        symbology: Symbology name; compared case-insensitively. ``None`` or
            an empty string is treated as an unknown symbology.
        data: Payload as ``bytes``/``bytearray`` (decoded as UTF-8, ignoring
            undecodable bytes) or as an already-decoded ``str``.

    Returns:
        True if the payload passes the check; False otherwise, including when
        *data* is neither bytes-like nor a string.
    """
    if isinstance(data, (bytes, bytearray)):
        text = data.decode('utf-8', errors='ignore')
    elif isinstance(data, str):
        # Already text: accept it rather than rejecting a usable payload.
        text = data
    else:
        return False

    sym = (symbology or '').upper()
    if sym in ("EAN13", "EAN-13", "EAN8", "EAN-8", "UPCA", "UPC-A"):
        return ean_like_checksum_ok(re.sub(r"\D", "", text))
    # QR codes and all remaining symbologies only need a non-empty payload.
    return len(text) > 0
def boxes_from_rect(x: int, y: int, w: int, h: int) -> Box:
    """Convert a (left, top, width, height) rectangle into a ``Box``."""
    return Box(y1=y, x1=x, y2=y + h, x2=x + w, area=w * h)
def decode_with_variants(img: Image.Image):
    """Decode barcodes in *img*, retrying on altered copies until one succeeds.

    Variants are tried in this order and the first one that yields any result
    wins: the image as given, a grayscale copy, a 2x bicubic upscale, and,
    when the image is not already RGB, an RGB copy.

    Results found on the upscaled copy are mapped back to the coordinate
    system of *img*, so callers can draw their rectangles on the original.

    Returns:
        A list of pyzbar decode results; empty when barcode support is
        unavailable or nothing was decoded.
    """
    if not HAS_BARCODE:
        return []

    def attempt(candidate):
        # Decoding is best effort: a failure on one variant must not stop
        # the remaining variants from being tried.
        try:
            return list(zbar_decode(candidate) or [])
        except Exception:
            return []

    def to_original_coords(hit, factor):
        # pyzbar results are namedtuples; rebuild the geometry fields with
        # coordinates divided by the upscale factor.
        rect = hit.rect
        changes = {
            "rect": rect._replace(
                left=rect.left // factor,
                top=rect.top // factor,
                width=rect.width // factor,
                height=rect.height // factor,
            )
        }
        if "polygon" in getattr(hit, "_fields", ()):
            changes["polygon"] = [
                pt._replace(x=pt.x // factor, y=pt.y // factor) for pt in hit.polygon
            ]
        return hit._replace(**changes)

    results = attempt(img)
    if not results:
        results = attempt(img.convert('L'))
    if not results:
        upscale = 2
        enlarged = img.resize((img.width * upscale, img.height * upscale), Image.BICUBIC)
        results = [to_original_coords(hit, upscale) for hit in attempt(enlarged)]
    if not results and img.mode != 'RGB':
        results = attempt(img.convert('RGB'))
    return results
def find_barcode_boxes_and_info(img: Image.Image):
    """Decode barcodes in *img* and describe each one.

    Returns:
        A ``(boxes, infos)`` pair of equal length: ``boxes`` holds one ``Box``
        per decoded symbol, ``infos`` one dict per symbol with the keys
        ``type``, ``data``, ``left``, ``top``, ``width``, ``height`` and
        ``valid``.
    """
    found_boxes: List[Box] = []
    details = []
    for hit in decode_with_variants(img):
        area = hit.rect
        payload = hit.data
        found_boxes.append(boxes_from_rect(area.left, area.top, area.width, area.height))
        is_valid = validate_symbology(hit.type, payload)
        if isinstance(payload, (bytes, bytearray)):
            payload_text = payload.decode('utf-8', errors='ignore')
        else:
            payload_text = str(payload)
        details.append({
            'type': hit.type,
            'data': payload_text,
            'left': area.left,
            'top': area.top,
            'width': area.width,
            'height': area.height,
            'valid': bool(is_valid),
        })
    return found_boxes, details
# -------------------- CMYK Panel -------------------
def rgb_to_cmyk_array(img: Image.Image) -> np.ndarray:
    """Convert *img* to CMYK mode and return its pixels as a float32 array."""
    cmyk_image = img.convert('CMYK')
    return np.asarray(cmyk_image).astype(np.float32)  # 0..255
def avg_cmyk_in_box(cmyk_arr: np.ndarray, box: Box) -> Tuple[float,float,float,float]:
    """Average the four channels of *cmyk_arr* inside *box*, as percentages.

    The box is clipped to the array bounds first. Each channel mean is scaled
    from 0..255 to 0..100 and rounded to one decimal place. An empty region
    after clipping yields four zeros.
    """
    top = max(0, box.y1)
    bottom = min(cmyk_arr.shape[0], box.y2)
    left = max(0, box.x1)
    right = min(cmyk_arr.shape[1], box.x2)
    if bottom <= top or right <= left:
        return (0.0, 0.0, 0.0, 0.0)

    patch = cmyk_arr[top:bottom, left:right, :]
    channel_means = patch.reshape(-1, 4).mean(axis=0)
    return tuple(float(round(mean * 100.0 / 255.0, 1)) for mean in channel_means)
def compute_cmyk_diffs(a_img: Image.Image, b_img: Image.Image, red_boxes: List[Box]):
    """Compare the average CMYK values of two images inside each box.

    Returns:
        One dict per box with the keys ``idx`` (1-based position of the box),
        ``A`` and ``B`` (average CMYK percentages in each image) and ``Delta``
        (``B`` minus ``A`` per channel, rounded to one decimal place).
    """
    cmyk_a = rgb_to_cmyk_array(a_img)
    cmyk_b = rgb_to_cmyk_array(b_img)

    report = []
    for number, region in enumerate(red_boxes, start=1):
        in_a = avg_cmyk_in_box(cmyk_a, region)
        in_b = avg_cmyk_in_box(cmyk_b, region)
        change = tuple(round(in_b[ch] - in_a[ch], 1) for ch in range(4))
        report.append({'idx': number, 'A': in_a, 'B': in_b, 'Delta': change})
    return report
def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns', panel_width: int = 260) -> Image.Image:
    """Return *base* with a text panel of CMYK values attached on the right.

    Args:
        base: Image shown on the left of the result.
        entries: Dicts with the keys ``idx``, ``A``, ``B`` and ``Delta``,
            where the last three each unpack into four values.
        title: Heading written at the top of the panel.
        panel_width: Width of the panel in pixels.

    Returns:
        A new RGB image that is ``panel_width`` pixels wider than *base*.
        Entries stop being written once the text cursor gets within 40 pixels
        of the bottom edge.
    """
    base_w, base_h = base.size
    canvas = Image.new('RGB', (base_w + panel_width, base_h), (255, 255, 255))
    canvas.paste(base, (0, 0))
    canvas.paste(Image.new('RGB', (panel_width, base_h), (245, 245, 245)), (base_w, 0))

    pen = ImageDraw.Draw(canvas)
    text_x = base_w + 8
    cursor = 8
    pen.text((text_x, cursor), title, fill=(0, 0, 0))
    cursor += 18

    if not entries:
        pen.text((text_x, cursor), 'No differing regions', fill=(80, 80, 80))
        return canvas

    black = (0, 0, 0)
    for entry in entries:
        number = entry['idx']
        a_c, a_m, a_y, a_k = entry['A']
        b_c, b_m, b_y, b_k = entry['B']
        d_c, d_m, d_y, d_k = entry['Delta']
        # (text, colour, vertical advance after the line)
        rows = (
            (f"#{number}", black, 14),
            (f"A: C {a_c}% M {a_m}% Y {a_y}% K {a_k}%", black, 14),
            (f"B: C {b_c}% M {b_m}% Y {b_y}% K {b_k}%", black, 14),
            (f"Delta: C {d_c}% M {d_m}% Y {d_y}% K {d_k}%", (120, 0, 0), 18),
        )
        for text, colour, advance in rows:
            pen.text((text_x, cursor), text, fill=colour)
            cursor += advance
        if cursor > base_h - 40:
            break
    return canvas
# -------------------- Gradio Interface -----------------
def _upload_path(upload):
    """Return the file system path for an upload given as a path or file object."""
    if isinstance(upload, (str, os.PathLike)):
        return os.fspath(upload)
    return upload.name


def _barcode_rows(infos):
    """Flatten barcode info dicts into rows matching the results table columns."""
    return [
        [
            info.get('type', ''), info.get('data', ''),
            info.get('left', 0), info.get('top', 0),
            info.get('width', 0), info.get('height', 0),
            info.get('valid', False),
        ]
        for info in infos
    ]


def compare_pdfs(file_a, file_b):
    """Main comparison function for the Gradio interface.

    Renders the first page of both files, crops them to a common size, and
    runs difference detection, spell checking, barcode detection and CMYK
    analysis on them.

    Args:
        file_a: Reference upload; either a path (``str`` / ``os.PathLike``)
            or an object whose ``name`` attribute is the path.
        file_b: Comparison upload, in the same form as *file_a*.

    Returns:
        A 6-tuple ``(overlay, a_disp, b_disp, status, codes_a, codes_b)``:
        the red difference overlay, the two annotated images with the CMYK
        panel, a Markdown status message, and the barcode table rows for
        each file. When an input is missing or any step raises, the three
        images are ``None``, the status carries the message, and both tables
        are empty.
    """
    try:
        if file_a is None or file_b is None:
            return None, None, None, "❌ Please upload both PDF files to compare", [], []

        # Load images with default settings
        a = load_first_page(_upload_path(file_a), dpi=300)
        b = load_first_page(_upload_path(file_b), dpi=300)

        # Match sizes
        a, b = match_sizes(a, b)

        # Find differences with default settings
        diff = difference_map(a, b)
        red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)

        # Optional analyses; each helper returns empty results by itself when
        # its dependency is unavailable, so no feature flags are checked here.
        misspell_a = find_misspell_boxes(a)
        misspell_b = find_misspell_boxes(b)
        bar_a, info_a = find_barcode_boxes_and_info(a)
        bar_b, info_b = find_barcode_boxes_and_info(b)

        # Always enable CMYK analysis
        cmyk_entries = compute_cmyk_diffs(a, b, red_boxes)
        labels = [entry['idx'] for entry in cmyk_entries]

        # Create visualizations with default box width
        a_boxed_core = draw_boxes_multi(a, red_boxes, misspell_a, bar_a, width=3, red_labels=labels)
        b_boxed_core = draw_boxes_multi(b, red_boxes, misspell_b, bar_b, width=3, red_labels=labels)

        # Always show CMYK panel
        a_disp = draw_cmyk_panel(a_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
        b_disp = draw_cmyk_panel(b_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')

        # Create pixel difference overlay
        overlay = make_red_overlay(a, b)

        # Status message (Markdown). Blank lines keep the legend heading from
        # being absorbed into the preceding bullet list.
        status = "\n".join([
            "",
            "📊 **Analysis Complete!**",
            "",
            f"- **Difference regions found:** {len(red_boxes)}",
            f"- **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}",
            f"- **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}",
            f"- **Image dimensions:** {a.width} × {a.height} pixels",
            "",
            "**Legend:**",
            "",
            "- 🔴 Red boxes: Visual differences",
            "- 🔵 Cyan boxes: Spelling errors",
            "- 🟢 Green boxes: Barcodes/QR codes",
            "",
        ])

        # Prepare barcode data for tables
        return overlay, a_disp, b_disp, status, _barcode_rows(info_a), _barcode_rows(info_b)
    except Exception as e:
        # UI boundary: report the failure in the status area instead of raising.
        error_msg = f"❌ **Error:** {str(e)}"
        return None, None, None, error_msg, [], []
# -------------------- Gradio App -------------------
def create_demo():
    """Build and return the Gradio Blocks app for the comparison tool.

    The page contains two PDF upload fields, a compare button, a status area,
    the pixel-difference overlay, the two annotated pages, and one barcode
    results table per file. Clicking the button calls ``compare_pdfs`` with
    the two uploads and routes its six return values to those outputs.
    """
    # NOTE(review): the container nesting below was reconstructed; confirm the
    # intended Row/Column layout against the running app.
    intro_md = "\n".join([
        "# 🔍 Advanced PDF Comparison Tool",
        "",
        "Upload two PDF files to get comprehensive analysis including:",
        "",
        "- **Visual differences** with bounding boxes",
        "- **OCR and spell checking**",
        "- **Barcode/QR code detection**",
        "- **CMYK color analysis**",
    ])
    help_md = "\n".join([
        "### 📝 Instructions:",
        "",
        "1. Upload two PDF files",
        "2. Click \"Compare PDF Files\"",
        "3. View results with comprehensive analysis",
        "",
        "### 🎨 Color Legend:",
        "",
        "- **🔴 Red boxes:** Visual differences between files",
        "- **🔵 Cyan boxes:** Potential spelling errors (OCR)",
        "- **🟢 Green boxes:** Detected barcodes/QR codes",
        "- **📊 Side panel:** CMYK color analysis for print workflows",
    ])
    barcode_headers = ["Type", "Data", "Left", "Top", "Width", "Height", "Valid"]

    with gr.Blocks(title="PDF Comparison Tool", theme=gr.themes.Soft()) as demo:
        gr.Markdown(intro_md)

        with gr.Row():
            with gr.Column():
                file_a = gr.File(label="📄 PDF A (Reference)", file_types=[".pdf"])
                file_b = gr.File(label="📄 PDF B (Comparison)", file_types=[".pdf"])

        compare_btn = gr.Button("🔍 Compare PDF Files", variant="primary", size="lg")
        status_md = gr.Markdown("")

        with gr.Row():
            overlay_img = gr.Image(label="🔴 Pixel Differences (Red = Different)", type="pil")

        with gr.Row():
            img_a = gr.Image(label="📄 File A with Analysis", type="pil")
            img_b = gr.Image(label="📄 File B with Analysis", type="pil")

        gr.Markdown("### 📊 Barcode Detection Results")
        with gr.Row():
            codes_a_df = gr.Dataframe(
                headers=list(barcode_headers),
                label="Barcodes in File A",
                interactive=False,
            )
            codes_b_df = gr.Dataframe(
                headers=list(barcode_headers),
                label="Barcodes in File B",
                interactive=False,
            )

        # Event handlers
        compare_btn.click(
            fn=compare_pdfs,
            inputs=[file_a, file_b],
            outputs=[overlay_img, img_a, img_b, status_md, codes_a_df, codes_b_df],
        )

        gr.Markdown(help_md)

    return demo
if __name__ == "__main__":
    # Build the UI and start the Gradio server when the file is run as a script.
    demo = create_demo()
    demo.launch(
        server_name="0.0.0.0", # Allow external access
        share=True, # Set to True to create a public link
        show_error=True
    )