#!/usr/bin/env python3
"""
Gradio PDF Comparison Tool
Upload two PDF files and get comprehensive analysis including differences, OCR, barcodes, and CMYK analysis.
"""

import os, sys, re, csv, json, io
from dataclasses import dataclass
from typing import List, Tuple, Optional
import tempfile

import numpy as np
from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError
from pdf2image import convert_from_path
from skimage.measure import label, regionprops
from skimage.morphology import dilation, rectangle
import gradio as gr

# Optional features.
# Each dependency below is imported defensively: when the import fails for any
# reason, the imported name is bound to None and the matching HAS_* flag is set
# to False so the rest of the module can test for availability.
try:
    import pytesseract
    HAS_OCR = True
except Exception:
    pytesseract = None
    HAS_OCR = False

try:
    from spellchecker import SpellChecker
    HAS_SPELLCHECK = True
except Exception:
    SpellChecker = None
    HAS_SPELLCHECK = False

try:
    from pyzbar.pyzbar import decode as zbar_decode
    HAS_BARCODE = True
except Exception:
    zbar_decode = None
    HAS_BARCODE = False

# -------------------- Core Data --------------------
@dataclass
class Box:
    """Rectangle in pixel coordinates plus an area value.

    Fields are positional in the order ``(y1, x1, y2, x2, area)``.
    """

    y1: int
    x1: int
    y2: int
    x2: int
    area: int

# -------------------- Helpers ----------------------
def _is_pdf(path: str) -> bool:
    """Return True when the extension of *path* is ``.pdf``, ignoring case."""
    _, extension = os.path.splitext(path)
    return extension.lower() == ".pdf"

def load_first_page(path: str, dpi: int = 300) -> Image.Image:
    """Load *path* as an RGB image, rendering only page 1 when it is a PDF.

    Args:
        path: File to load. Paths with a ``.pdf`` extension are rendered with
            ``convert_from_path``; anything else is opened with ``Image.open``.
        dpi: Resolution passed to ``convert_from_path`` for PDFs.

    Returns:
        The image converted to mode ``"RGB"``.

    Raises:
        ValueError: when both PDF conversion attempts fail. The message carries
            the text of both underlying errors.
    """
    if not _is_pdf(path):
        # Close the underlying file explicitly; convert() returns an independent image.
        with Image.open(path) as opened:
            return opened.convert("RGB")

    # Attempt 1 pins poppler to /usr/bin; attempt 2 passes no poppler_path at all.
    attempts = ({"poppler_path": "/usr/bin"}, {})
    failures = []
    for extra_kwargs in attempts:
        try:
            pages = convert_from_path(path, dpi=dpi, first_page=1, last_page=1, **extra_kwargs)
            if not pages:
                raise ValueError(f"No pages in PDF: {path}")
            return pages[0].convert("RGB")
        except Exception as exc:
            # Remember the failure and fall through to the next attempt.
            failures.append(exc)

    e1, e2 = failures
    raise ValueError(
        f"Failed to convert PDF to image. Error 1: {str(e1)}. Error 2: {str(e2)}. Make sure poppler-utils is installed."
    ) from e2

def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
    """Return both images at a common size.

    Images that already share a size are returned untouched. Otherwise each is
    cropped from its top-left corner to the smaller width and smaller height
    of the pair.
    """
    if a.size != b.size:
        common_w = min(a.width, b.width)
        common_h = min(a.height, b.height)
        region = (0, 0, common_w, common_h)
        a, b = a.crop(region), b.crop(region)
    return a, b

def difference_map(a: Image.Image, b: Image.Image) -> Image.Image:
    """Return the result of ``ImageChops.difference`` applied to *a* and *b*."""
    delta = ImageChops.difference(a, b)
    return delta

def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int = 25) -> List[Box]:
    """Locate connected regions of a difference image and return their boxes.

    A pixel counts as changed when its largest channel value is at least
    *threshold*. The changed mask is dilated with a 3x3 rectangle, labelled
    with connectivity 2, and every region whose area is at least *min_area*
    becomes a ``Box`` built from the region's bounding box and area.
    """
    channels = np.asarray(diff_img).astype(np.uint16)
    strongest = channels.max(axis=2).astype(np.uint8)
    changed = (strongest >= threshold).astype(np.uint8)
    grown = dilation(changed, rectangle(3, 3))
    regions = regionprops(label(grown, connectivity=2))
    return [
        Box(*region.bbox, int(region.area))
        for region in regions
        if region.area >= min_area
    ]

def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box], green_boxes: List[Box] = None,
                     width: int = 3, red_labels: List[int] = None) -> Image.Image:
    """Return a copy of *img* with coloured outlines drawn for three box groups.

    Drawing order: red outlines, then the red-box labels (only when
    *red_labels* is non-empty), then cyan outlines, then green outlines (only
    when *green_boxes* is non-empty). Each outline is drawn *width* times,
    growing by one pixel per pass. A red box without a matching entry in
    *red_labels* is labelled with its 1-based position.
    """
    canvas = img.copy()
    pen = ImageDraw.Draw(canvas)

    def outline(boxes, colour):
        # Repeated rectangles, each one pixel larger, give the outline its thickness.
        for box in boxes:
            for grow in range(width):
                pen.rectangle(
                    [box.x1 - grow, box.y1 - grow, box.x2 + grow, box.y2 + grow],
                    outline=colour,
                )

    # red (diff)
    outline(red_boxes, (255, 0, 0))

    # labels for red boxes, on a small white patch inside the top-left corner
    if red_labels:
        for position, box in enumerate(red_boxes):
            if position < len(red_labels):
                caption = str(red_labels[position])
            else:
                caption = str(position + 1)
            text_x = max(0, box.x1 + 3)
            text_y = max(0, box.y1 + 3)
            pen.rectangle([text_x - 2, text_y - 2, text_x + 14, text_y + 14], fill=(255, 255, 255))
            pen.text((text_x, text_y), caption, fill=(0, 0, 0))

    # cyan (misspellings)
    outline(cyan_boxes, (0, 255, 255))

    # green (barcodes)
    if green_boxes:
        outline(green_boxes, (0, 255, 0))

    return canvas

def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
    """Return a copy of *a* in which every pixel that differs from *b* is pure red.

    A pixel differs when any of its channel values is unequal between the two
    images.
    """
    pixels = np.array(a)  # np.array copies, so *a* itself is never modified
    reference = np.asarray(b)
    differs = np.any(pixels != reference, axis=2)
    pixels[differs] = [255, 0, 0]
    return Image.fromarray(pixels)

# -------------------- OCR + Spellcheck -------------
def normalize_token(token: str) -> str:
    """Drop everything except ASCII letters and apostrophes, then lowercase."""
    return re.compile(r"[^A-Za-z']").sub("", token).lower()

def find_misspell_boxes(img: Image.Image) -> List[Box]:
    """Return a box for every OCR word that the spell checker does not know.

    Args:
        img: Image handed to ``pytesseract.image_to_data``.

    Returns:
        One ``Box`` per flagged word, built from the OCR word rectangle. The
        list is empty when OCR or spell checking is unavailable, or when the
        OCR / spell-check setup raises.

    Words are normalised with ``normalize_token`` first. Tokens shorter than
    two characters, and words with a non-positive width or height, are skipped.
    """
    if not (HAS_OCR and HAS_SPELLCHECK):
        return []
    try:
        spell = SpellChecker()
        data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
    except Exception:
        # Best effort: a failing OCR or dictionary backend must not abort the caller.
        return []

    texts = data.get("text", [])
    count = len(texts)

    def column(name):
        # A missing or short geometry column is replaced by zeros, which makes the
        # affected words fail the size check below instead of raising IndexError.
        values = data.get(name)
        if values is None or len(values) < count:
            return [0] * count
        return values

    boxes: List[Box] = []
    rows = zip(texts, column("left"), column("top"), column("width"), column("height"))
    for text, left, top, width, height in rows:
        if not text:
            continue
        token = normalize_token(text)
        if len(token) < 2:
            continue
        if token in spell:
            continue
        if width <= 0 or height <= 0:
            continue
        boxes.append(Box(top, left, top + height, left + width, width * height))
    return boxes

# -------------------- Barcode / QR -----------------
def ean_like_checksum_ok(digits: str) -> bool:
    """Check the trailing check digit of an 8-, 12- or 13-digit code.

    Returns False when *digits* is not all digits. Lengths other than 8, 12
    and 13 are not checked and return True. For the checked lengths, the body
    digits are weighted alternately, summed, and the result is compared with
    the last digit.
    """
    if not digits.isdigit():
        return False

    # (weight for even body index, weight for odd body index), keyed by total length
    weights_by_length = {8: (3, 1), 12: (3, 1), 13: (1, 3)}
    weights = weights_by_length.get(len(digits))
    if weights is None:
        return True

    *body, check = map(int, digits)
    total = sum(value * weights[index % 2] for index, value in enumerate(body))
    return (10 - total % 10) % 10 == check

def validate_symbology(symbology: str, data: bytes) -> bool:
    """Report whether a decoded payload looks valid for its symbology.

    The payload is decoded as UTF-8, ignoring bad bytes; any failure to decode
    returns False. EAN-13 / EAN-8 / UPC-A names (case-insensitive) are checked
    with ``ean_like_checksum_ok`` on the payload's digits only. Every other
    symbology, QR codes included, is valid when the decoded text is non-empty.
    """
    try:
        text = data.decode('utf-8', errors='ignore')
    except Exception:
        return False

    checksum_symbologies = ("EAN13", "EAN-13", "EAN8", "EAN-8", "UPCA", "UPC-A")
    name = (symbology or '').upper()
    if name in checksum_symbologies:
        digits_only = re.sub(r"\D", "", text)
        return ean_like_checksum_ok(digits_only)
    return len(text) > 0

def boxes_from_rect(x: int, y: int, w: int, h: int) -> Box:
    """Build a ``Box`` from a left/top/width/height rectangle; area is ``w * h``."""
    return Box(y1=y, x1=x, y2=y + h, x2=x + w, area=w * h)

def decode_with_variants(img: Image.Image):
    """Decode barcodes / QR codes in *img*, retrying on transformed copies.

    Attempts run in this order and stop as soon as one yields any result:
    the image as given, a greyscale (``'L'``) copy, a 2x bicubic upscale, and,
    only when the image is not already RGB, an RGB copy.

    Args:
        img: Image to scan.

    Returns:
        A list of the decoder's result objects. It is empty when the barcode
        backend is unavailable or nothing could be decoded.
    """
    if not HAS_BARCODE:
        return []

    results = []

    def do_decode(pil_img):
        # Best effort: a decoder failure on one variant must not stop later attempts.
        try:
            dec = zbar_decode(pil_img)
            if dec:
                results.extend(dec)
        except Exception:
            pass

    do_decode(img)
    if not results:
        do_decode(img.convert('L'))
    if not results:
        # NOTE(review): hits from this attempt presumably report coordinates in the
        # upscaled image's pixel space - confirm before drawing them on the original.
        do_decode(img.resize((img.width * 2, img.height * 2), Image.BICUBIC))
    if not results and img.mode != 'RGB':
        do_decode(img.convert('RGB'))
    return results
    
def find_barcode_boxes_and_info(img: Image.Image):
    """Decode the barcodes in *img* and describe each one.

    Returns:
        ``(boxes, infos)``: a ``Box`` per decoded symbol, and a parallel list
        of dicts with the keys ``type``, ``data`` (payload as text), ``left``,
        ``top``, ``width``, ``height`` and ``valid``.
    """
    boxes: List[Box] = []
    infos = []
    for symbol in decode_with_variants(img):
        area = symbol.rect
        payload = symbol.data
        boxes.append(boxes_from_rect(area.left, area.top, area.width, area.height))
        is_valid = validate_symbology(symbol.type, payload)
        if isinstance(payload, (bytes, bytearray)):
            text = payload.decode('utf-8', errors='ignore')
        else:
            text = str(payload)
        infos.append({
            'type': symbol.type,
            'data': text,
            'left': area.left,
            'top': area.top,
            'width': area.width,
            'height': area.height,
            'valid': bool(is_valid),
        })
    return boxes, infos

# -------------------- CMYK Panel -------------------
def rgb_to_cmyk_array(img: Image.Image) -> np.ndarray:
    """Convert *img* to mode ``'CMYK'`` and return its pixels as a float32 array."""
    cmyk_image = img.convert('CMYK')
    return np.asarray(cmyk_image).astype(np.float32)  # 0..255

def avg_cmyk_in_box(cmyk_arr: np.ndarray, box: Box) -> Tuple[float,float,float,float]:
    """Return the mean of each of the four channels inside *box*, as percentages.

    The box is clamped to the array bounds first; an empty clamped box gives
    four zeros. Each mean is scaled by 100/255 and rounded to one decimal.
    """
    rows, cols = cmyk_arr.shape[0], cmyk_arr.shape[1]
    top, bottom = max(0, box.y1), min(rows, box.y2)
    left, right = max(0, box.x1), min(cols, box.x2)
    if bottom <= top or right <= left:
        return (0.0, 0.0, 0.0, 0.0)

    pixels = cmyk_arr[top:bottom, left:right, :].reshape(-1, 4)
    channel_means = pixels.mean(axis=0)
    return tuple(float(round(mean * 100.0 / 255.0, 1)) for mean in channel_means)

def compute_cmyk_diffs(a_img: Image.Image, b_img: Image.Image, red_boxes: List[Box]):
    """Compare the average CMYK values of both images inside each box.

    Returns:
        A list with one dict per box: ``idx`` (1-based position), ``A`` and
        ``B`` (average CMYK tuples for each image) and ``Delta`` (``B - A``
        per channel, rounded to one decimal).
    """
    cmyk_a = rgb_to_cmyk_array(a_img)
    cmyk_b = rgb_to_cmyk_array(b_img)

    entries = []
    for number, box in enumerate(red_boxes, start=1):
        averages_a = avg_cmyk_in_box(cmyk_a, box)
        averages_b = avg_cmyk_in_box(cmyk_b, box)
        change = tuple(
            round(after - before, 1)
            for before, after in zip(averages_a, averages_b)
        )
        entries.append({'idx': number, 'A': averages_a, 'B': averages_b, 'Delta': change})
    return entries

def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns', panel_width: int = 260) -> Image.Image:
    """Return *base* with a text panel of CMYK values attached on its right.

    The result is *panel_width* pixels wider than *base*. The panel shows
    *title*, then four lines per entry (index, A values, B values, delta).
    Drawing stops once the text cursor comes within 40 pixels of the bottom
    edge. With no entries a single placeholder line is shown.
    """
    w, h = base.size
    canvas = Image.new('RGB', (w + panel_width, h), (255, 255, 255))
    canvas.paste(base, (0, 0))
    canvas.paste(Image.new('RGB', (panel_width, h), (245, 245, 245)), (w, 0))

    pen = ImageDraw.Draw(canvas)
    text_x = w + 8
    cursor = 8
    pen.text((text_x, cursor), title, fill=(0, 0, 0))
    cursor += 18

    if not entries:
        pen.text((text_x, cursor), 'No differing regions', fill=(80, 80, 80))
        return canvas

    black = (0, 0, 0)
    dark_red = (120, 0, 0)
    for entry in entries:
        idx = entry['idx']
        aC, aM, aY, aK = entry['A']
        bC, bM, bY, bK = entry['B']
        dC, dM, dY, dK = entry['Delta']
        # (text, colour, vertical advance after the line)
        rows = (
            (f"#{idx}", black, 14),
            (f"A: C {aC}% M {aM}% Y {aY}% K {aK}%", black, 14),
            (f"B: C {bC}% M {bM}% Y {bY}% K {bK}%", black, 14),
            (f"Δ: C {dC}% M {dM}% Y {dY}% K {dK}%", dark_red, 18),
        )
        for line, colour, advance in rows:
            pen.text((text_x, cursor), line, fill=colour)
            cursor += advance
        if cursor > h - 40:
            break
    return canvas

# -------------------- Gradio Interface -----------------
def compare_pdfs(file_a, file_b):
    """Main comparison function for the Gradio interface.

    Loads the first page of both uploads, finds differing regions, runs the
    optional OCR spell check and barcode detection, and renders the annotated
    images with a CMYK side panel.

    Args:
        file_a: Upload for file A, or None when nothing was uploaded.
        file_b: Upload for file B, or None when nothing was uploaded.

    Returns:
        A 6-tuple ``(overlay, a_disp, b_disp, status, codes_a, codes_b)``:
        the red pixel-difference overlay, the two annotated images, a Markdown
        status string, and one list of table rows per file describing the
        detected barcodes. On missing input or any error the three images are
        None, the status carries the message, and both row lists are empty.
    """
    try:
        if file_a is None or file_b is None:
            return None, None, None, "❌ Please upload both PDF files to compare", [], []

        # Accept both an upload object exposing ``.name`` and a plain path string.
        path_a = getattr(file_a, "name", file_a)
        path_b = getattr(file_b, "name", file_b)

        # Load images with default settings
        a = load_first_page(path_a, dpi=300)
        b = load_first_page(path_b, dpi=300)

        # Match sizes
        a, b = match_sizes(a, b)

        # Find differences with default settings
        diff = difference_map(a, b)
        red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)

        # Run all analysis features with defaults
        misspell_a = find_misspell_boxes(a) if HAS_OCR and HAS_SPELLCHECK else []
        misspell_b = find_misspell_boxes(b) if HAS_OCR and HAS_SPELLCHECK else []

        if HAS_BARCODE:
            bar_a, info_a = find_barcode_boxes_and_info(a)
            bar_b, info_b = find_barcode_boxes_and_info(b)
        else:
            bar_a, info_a = [], []
            bar_b, info_b = [], []

        # Always enable CMYK analysis
        cmyk_entries = compute_cmyk_diffs(a, b, red_boxes)
        labels = [e['idx'] for e in cmyk_entries]

        # Create visualizations with default box width
        a_boxed_core = draw_boxes_multi(a, red_boxes, misspell_a, bar_a, width=3, red_labels=labels)
        b_boxed_core = draw_boxes_multi(b, red_boxes, misspell_b, bar_b, width=3, red_labels=labels)

        # Always show CMYK panel
        a_disp = draw_cmyk_panel(a_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
        b_disp = draw_cmyk_panel(b_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')

        # Create pixel difference overlay
        overlay = make_red_overlay(a, b)

        # Create status message
        status = f"""
        📊 **Analysis Complete!**
        - **Difference regions found:** {len(red_boxes)}
        - **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}
        - **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}
        - **Image dimensions:** {a.width} × {a.height} pixels

        **Legend:**
        - 🔴 Red boxes: Visual differences
        - 🔵 Cyan boxes: Spelling errors
        - 🟢 Green boxes: Barcodes/QR codes
        """

        # Prepare barcode data for tables (same column order as the Dataframe headers)
        def as_rows(infos):
            return [[c.get('type', ''), c.get('data', ''), c.get('left', 0), c.get('top', 0),
                     c.get('width', 0), c.get('height', 0), c.get('valid', False)] for c in infos]

        codes_a = as_rows(info_a)
        codes_b = as_rows(info_b)

        return overlay, a_disp, b_disp, status, codes_a, codes_b

    except Exception as e:
        # UI boundary: report the failure in the status area instead of raising.
        error_msg = f"❌ **Error:** {str(e)}"
        return None, None, None, error_msg, [], []

# -------------------- Gradio App -------------------
def create_demo():
    """Build and return the Gradio Blocks UI for the comparison tool.

    The layout contains two file inputs, a compare button, a status area, the
    pixel-difference overlay, the two annotated images and two barcode tables.
    Clicking the button calls ``compare_pdfs`` with both uploads and routes its
    six return values to the output components in the order listed in
    ``outputs``.
    """
    with gr.Blocks(title="PDF Comparison Tool", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🔍 Advanced PDF Comparison Tool

        Upload two PDF files to get comprehensive analysis including:
        - **Visual differences** with bounding boxes
        - **OCR and spell checking**
        - **Barcode/QR code detection**
        - **CMYK color analysis**
        """)

        # Inputs: both uploads are restricted to .pdf files
        with gr.Row():
            with gr.Column():
                file_a = gr.File(label="📄 PDF A (Reference)", file_types=[".pdf"])
                file_b = gr.File(label="📄 PDF B (Comparison)", file_types=[".pdf"])

        compare_btn = gr.Button("🔍 Compare PDF Files", variant="primary", size="lg")

        # Receives the status / error text returned by compare_pdfs
        status_md = gr.Markdown("")

        with gr.Row():
            overlay_img = gr.Image(label="🔴 Pixel Differences (Red = Different)", type="pil")

        with gr.Row():
            img_a = gr.Image(label="📄 File A with Analysis", type="pil")
            img_b = gr.Image(label="📄 File B with Analysis", type="pil")

        gr.Markdown("### 📊 Barcode Detection Results")
        # Column order matches the rows built in compare_pdfs
        with gr.Row():
            codes_a_df = gr.Dataframe(
                headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
                label="Barcodes in File A",
                interactive=False
            )
            codes_b_df = gr.Dataframe(
                headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
                label="Barcodes in File B",
                interactive=False
            )

        # Event handlers
        compare_btn.click(
            fn=compare_pdfs,
            inputs=[file_a, file_b],
            outputs=[overlay_img, img_a, img_b, status_md, codes_a_df, codes_b_df]
        )

        gr.Markdown("""
        ### 📝 Instructions:
        1. Upload two PDF files
        2. Click "Compare PDF Files"
        3. View results with comprehensive analysis

        ### 🎨 Color Legend:
        - **🔴 Red boxes:** Visual differences between files
        - **🔵 Cyan boxes:** Potential spelling errors (OCR)
        - **🟢 Green boxes:** Detected barcodes/QR codes
        - **📊 Side panel:** CMYK color analysis for print workflows
        """)

    return demo

if __name__ == "__main__":
    # Script entry point: build the UI and start the Gradio server.
    demo = create_demo()
    demo.launch(
        server_name="0.0.0.0",  # Allow external access
        share=True,  # Creates a public link; set to False to avoid exposing the app publicly
        show_error=True
    )