#!/usr/bin/env python3
"""
Gradio PDF Comparison Tool

Upload two PDF files and get comprehensive analysis including differences,
OCR spell checking, barcode detection, and CMYK analysis.
"""
import os, sys, re, csv, json, io
from dataclasses import dataclass
from typing import List, Tuple, Optional, Iterable
import tempfile
import unicodedata

import numpy as np
from PIL import Image, ImageChops, ImageDraw, UnidentifiedImageError
from pdf2image import convert_from_path
from skimage.measure import label, regionprops
from skimage.morphology import dilation, rectangle
import gradio as gr

# Alternative PDF processing (optional)
try:
    import fitz  # PyMuPDF
    HAS_PYMUPDF = True
except Exception:
    fitz = None
    HAS_PYMUPDF = False

# Optional features
try:
    import pytesseract
    HAS_OCR = True
except Exception:
    pytesseract = None
    HAS_OCR = False

try:
    from spellchecker import SpellChecker
    HAS_SPELLCHECK = True
except Exception:
    SpellChecker = None
    HAS_SPELLCHECK = False

try:
    # 'regex' supports Unicode property classes (\p{Letter}); shadowing the
    # stdlib 're' binding here is deliberate.
    import regex as re
    HAS_REGEX = True
except Exception:
    import re
    HAS_REGEX = False

try:
    from pyzbar.pyzbar import decode as zbar_decode
    HAS_BARCODE = True
except Exception:
    zbar_decode = None
    HAS_BARCODE = False


# -------------------- Core Data --------------------
@dataclass
class Box:
    """Axis-aligned bounding box in image pixel coordinates (top-left origin)."""
    y1: int
    x1: int
    y2: int
    x2: int
    area: int


# ---- spell/tokenization helpers & caches ----
if HAS_REGEX:
    # Unicode-aware word matcher: letters with optional internal apostrophes/hyphens.
    _WORD_RE = re.compile(r"\b\p{Letter}+(?:['\-]\p{Letter}+)*\b", re.UNICODE)
else:
    # ASCII-only fallback when the 'regex' package is unavailable.
    _WORD_RE = re.compile(r"\b[A-Za-z]+(?:['\-][A-Za-z]+)*\b")

if HAS_SPELLCHECK:
    _SPELL_EN = SpellChecker(language="en")
    try:
        # The French dictionary may not be installed; degrade gracefully.
        _SPELL_FR = SpellChecker(language="fr")
    except Exception:
        _SPELL_FR = None
else:
    _SPELL_EN = None
    _SPELL_FR = None

# Domain/brand terms that must never be flagged as misspellings.
_DOMAIN_ALLOWLIST = {
    "Furry", "Fox", "Packaging", "Digitaljoint", "ProofCheck", "PDF",
    "SKU", "SKUs", "ISO", "G7", "WebCenter", "Hybrid",
}
_DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}
if _SPELL_EN:
    _SPELL_EN.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)
if _SPELL_FR:
    _SPELL_FR.word_frequency.load_words(_DOMAIN_ALLOWLIST_LOWER)


def _normalize_text(s: str) -> str:
    """Normalize text for better word extraction (NFC, straight quotes, squashed whitespace)."""
    if not s:
        return ""
    s = unicodedata.normalize("NFC", s)
    # Map curly apostrophes/quotes to the straight form the tokenizer expects.
    s = s.replace("\u2019", "'").replace("\u2018", "'")
    # Collapse runs of whitespace to a single space, then trim.
    s = re.sub(r"\s+", " ", s)
    return s.strip()


def _extract_tokens(raw: str) -> List[str]:
    """Extract word tokens, dropping anything too short or not word-like."""
    s = _normalize_text(raw or "")
    return [t for t in _WORD_RE.findall(s) if len(t) >= 2 and _is_likely_word(t)]


def _looks_like_acronym(tok: str) -> bool:
    """True for short all-caps tokens (e.g. ISO, SKU) that should never be flagged."""
    return tok.isupper() and 2 <= len(tok) <= 6


def _has_digits(tok: str) -> bool:
    """True if the token contains any digit (part numbers, codes, etc.)."""
    return any(ch.isdigit() for ch in tok)


def _is_likely_word(tok: str) -> bool:
    """Heuristic filter: does this token look like a real word rather than noise?

    Rejects tokens that are mostly non-letters, contain long consonant runs,
    or match common keyboard-mash patterns.
    """
    if len(tok) < 2:
        return False

    # Require at least 60% alphabetic characters.
    letter_count = sum(1 for c in tok if c.isalpha())
    if letter_count < len(tok) * 0.6:
        return False

    consonants = set('bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ')
    if len(tok) >= 4:
        # Count trigrams made entirely of consonants.  (Fix: the previous
        # version tested the whole 3-char substring for membership in a set
        # of single characters, which was always False — dead code.)
        consonant_clusters = sum(
            1 for i in range(len(tok) - 2)
            if all(c in consonants for c in tok[i:i + 3])
        )
        if consonant_clusters > len(tok) * 0.3:
            return False

    # Reject obvious keyboard-mash / sequential patterns.
    keyboard_patterns = [
        'qwerty', 'asdfgh', 'zxcvbn', 'qwertyuiop', 'asdfghjkl', 'zxcvbnm',
        'abcdef', 'bcdefg', 'cdefgh', 'defghi', 'efghij', 'fghijk',
        '123456', '234567', '345678', '456789', '567890'
    ]
    tok_lower = tok.lower()
    for pattern in keyboard_patterns:
        if pattern in tok_lower or tok_lower in pattern:
            return False
    return True


def _is_known_word(tok: str) -> bool:
    """True if the token should NOT be reported as a misspelling."""
    t = tok.lower()
    # Tokens that don't even look like words are treated as known so that
    # OCR noise is never reported as a spelling error.
    if not _is_likely_word(tok):
        return True
    if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
        return True
    # Hyphenated compounds: known if every part is known.
    if '-' in tok:
        if all(_is_known_word(part) for part in tok.split('-')):
            return True
    if _SPELL_EN and not _SPELL_EN.unknown([t]):  # known in EN
        return True
    if _SPELL_FR and not _SPELL_FR.unknown([t]):  # known in FR
        return True
    return False


def normalize_token(token: str) -> str:
    """Compatibility shim: lowercased first extracted token, or ''."""
    toks = _extract_tokens(token)
    return toks[0].lower() if toks else ""


# -------------------- Helpers ----------------------
def _is_pdf(path: str) -> bool:
    """True when the path has a .pdf extension (case-insensitive)."""
    return os.path.splitext(path.lower())[1] == ".pdf"


def load_pdf_pages(path: str, dpi: int = 400, max_pages: int = 5) -> List[Image.Image]:
    """Rasterize up to *max_pages* of a PDF (or open a plain image) as RGB PIL images.

    Tries pdf2image against several poppler install locations, then falls
    back to PyMuPDF.  Raises ValueError with the captured underlying errors
    when every strategy fails.
    """
    if not _is_pdf(path):
        return [Image.open(path).convert("RGB")]

    last_err: Optional[Exception] = None
    for poppler_path in ("/usr/bin", "/usr/local/bin", "/bin", None):
        try:
            kwargs = {"poppler_path": poppler_path} if poppler_path else {}
            imgs = convert_from_path(path, dpi=dpi, first_page=1,
                                     last_page=max_pages, **kwargs)
            if imgs:
                return [img.convert("RGB") for img in imgs]
        except Exception as e:
            last_err = e  # remember the real reason instead of a canned message

    if HAS_PYMUPDF:
        try:
            doc = fitz.open(path)
            pages: List[Image.Image] = []
            mat = fitz.Matrix(dpi / 72, dpi / 72)  # 72 pt/inch -> requested DPI
            for page_num in range(min(len(doc), max_pages)):
                pix = doc[page_num].get_pixmap(matrix=mat)
                pages.append(Image.open(io.BytesIO(pix.tobytes("ppm"))).convert("RGB"))
            doc.close()
            if pages:
                return pages
            raise ValueError(f"No pages in PDF: {path}")
        except Exception as e:
            raise ValueError(
                "Failed to convert PDF with both pdf2image and PyMuPDF. "
                f"pdf2image error: {last_err}. PyMuPDF error: {e}"
            )
    raise ValueError(
        "Failed to convert PDF to image with all poppler paths. "
        f"Last error: {last_err}. PyMuPDF not available as fallback."
    )


def combine_pages_vertically(pages: List[Image.Image], spacing: int = 20) -> Image.Image:
    """Stack pages top-to-bottom into one image, centering narrower pages."""
    if not pages:
        raise ValueError("No pages to combine")
    if len(pages) == 1:
        return pages[0]
    max_width = max(page.width for page in pages)
    total_height = sum(page.height for page in pages) + spacing * (len(pages) - 1)
    combined = Image.new('RGB', (max_width, total_height), (255, 255, 255))
    y_offset = 0
    for page in pages:
        x_offset = (max_width - page.width) // 2  # center horizontally
        combined.paste(page, (x_offset, y_offset))
        y_offset += page.height + spacing
    return combined


def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
    """Crop both images to their common top-left intersection so they align."""
    if a.size == b.size:
        return a, b
    w, h = min(a.width, b.width), min(a.height, b.height)
    return a.crop((0, 0, w, h)), b.crop((0, 0, w, h))


def difference_map(a: Image.Image, b: Image.Image) -> Image.Image:
    """Per-pixel absolute difference of two same-size images."""
    return ImageChops.difference(a, b)


def find_diff_boxes(diff_img: Image.Image, threshold: int = 12, min_area: int = 25) -> List[Box]:
    """Threshold a difference map and return bounding boxes of changed regions.

    The max across channels is thresholded, dilated to merge nearby pixels,
    then connected components below *min_area* are discarded.
    """
    arr = np.asarray(diff_img).astype(np.uint16)
    gray = arr.max(axis=2).astype(np.uint8)
    mask = (gray >= threshold).astype(np.uint8)
    mask = dilation(mask, rectangle(3, 3))  # bridge small gaps between diffs
    labeled = label(mask, connectivity=2)
    out: List[Box] = []
    for p in regionprops(labeled):
        if p.area < min_area:
            continue
        minr, minc, maxr, maxc = p.bbox
        out.append(Box(minr, minc, maxr, maxc, int(p.area)))
    return out


def draw_boxes_multi(img: Image.Image, red_boxes: List[Box], cyan_boxes: List[Box],
                     green_boxes: Optional[List[Box]] = None, width: int = 3) -> Image.Image:
    """Draw diff (red), misspelling (cyan), and barcode (green) boxes on a copy."""
    out = img.copy()
    d = ImageDraw.Draw(out)

    def _outline(boxes: List[Box], color: Tuple[int, int, int]) -> None:
        # Nested rectangles emulate line width for maximum Pillow compatibility.
        for b in boxes:
            for w in range(width):
                d.rectangle([b.x1 - w, b.y1 - w, b.x2 + w, b.y2 + w], outline=color)

    _outline(red_boxes, (255, 0, 0))      # visual differences
    _outline(cyan_boxes, (0, 255, 255))   # misspellings
    if green_boxes:
        _outline(green_boxes, (0, 255, 0))  # barcodes
    return out


def make_red_overlay(a: Image.Image, b: Image.Image) -> Image.Image:
    """Return image A with every pixel that differs from B painted pure red."""
    A = np.asarray(a).copy()
    B = np.asarray(b)
    mask = np.any(A != B, axis=2)
    A[mask] = [255, 0, 0]
    return Image.fromarray(A)


# -------------------- OCR + Spellcheck -------------
# NOTE: an earlier revision re-imported pytesseract/spellchecker/regex here
# UNCONDITIONALLY and redefined _WORD_RE/_SPELL_EN/_SPELL_FR, which crashed
# whenever an optional dependency was missing and clobbered the guarded
# setup above.  The guarded definitions above are authoritative.
# Register allow-listed domain terms with the spell checkers when present.
# (Fix: the previous unguarded calls crashed when pyspellchecker was absent.)
if _SPELL_EN:
    _SPELL_EN.word_frequency.load_words(w.lower() for w in _DOMAIN_ALLOWLIST)
if _SPELL_FR:
    _SPELL_FR.word_frequency.load_words(w.lower() for w in _DOMAIN_ALLOWLIST)

# NOTE: an earlier revision redefined _normalize_text/_extract_tokens/
# _looks_like_acronym/_has_digits/normalize_token here with weaker logic,
# silently clobbering the filtered versions defined earlier in this file.
# Those duplicates have been removed; the earlier definitions apply.


def _get_available_tesseract_langs() -> str:
    """Return the best available Tesseract language spec, preferring eng+fra."""
    try:
        langs = pytesseract.get_languages()
        if 'eng' in langs and 'fra' in langs:
            return "eng+fra"
        if 'eng' in langs:
            return "eng"
        return langs[0] if langs else "eng"
    except Exception:
        return "eng"


def prepare_for_ocr(img: Image.Image) -> Image.Image:
    """Grayscale + autocontrast + unsharp mask to improve OCR accuracy."""
    from PIL import ImageOps, ImageFilter
    g = img.convert("L")
    g = ImageOps.autocontrast(g)
    g = g.filter(ImageFilter.UnsharpMask(radius=1.0, percent=150, threshold=2))
    return g


def extract_pdf_text(path: str, max_pages: int = 5) -> List[str]:
    """Extract the text layer of up to *max_pages* pages via PyMuPDF.

    Returns [] when PyMuPDF is unavailable or extraction fails (best-effort).
    """
    if not HAS_PYMUPDF:
        return []
    try:
        doc = fitz.open(path)
        texts = [doc[i].get_text() for i in range(min(len(doc), max_pages))]
        doc.close()
        return texts
    except Exception:
        return []


def convert_pdf_to_image_coords(pdf_bbox, pdf_page_size, image_size,
                                page_num=0, page_height=1000):
    """Map a PDF-space bbox to image pixels, offsetting stacked pages vertically.

    *page_height* is the per-page vertical offset in image pixels (default is a
    placeholder used when the true raster height is unknown).
    """
    pdf_width, pdf_height = pdf_page_size
    img_width, img_height = image_size
    scale_x = img_width / pdf_width
    scale_y = img_height / pdf_height
    x1 = int(pdf_bbox[0] * scale_x)
    y1 = int(pdf_bbox[1] * scale_y) + (page_num * page_height)
    x2 = int(pdf_bbox[2] * scale_x)
    y2 = int(pdf_bbox[3] * scale_y) + (page_num * page_height)
    return x1, y1, x2, y2


def find_misspell_boxes_from_text(
    pdf_path: str,
    *,
    extra_allow: Optional[Iterable[str]] = None,
    max_pages: int = 5,
    image_size: Optional[Tuple[int, int]] = None,
) -> List[Box]:
    """Locate likely misspellings using the PDF's text layer (no OCR).

    Spans whose tokens fail the spell check get a Box.  When *image_size*
    (width, height of the vertically-combined raster) is given, PDF-point
    coordinates are scaled uniformly by the width ratio and each page is
    offset by the scaled height of the pages above it.
    (Fix: the previous version scaled each page by the COMBINED image height
    and offset by page_num * combined_height, landing boxes off-image for
    every page after the first.)
    """
    if not (HAS_SPELLCHECK and HAS_PYMUPDF):
        return []
    if extra_allow and _SPELL_EN:
        _SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
    if extra_allow and _SPELL_FR:
        _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)

    boxes: List[Box] = []
    try:
        doc = fitz.open(pdf_path)
        y_offset = 0.0
        # Matches combine_pages_vertically(spacing=20) — TODO keep in sync.
        page_gap = 20
        for page_num in range(min(len(doc), max_pages)):
            page = doc[page_num]
            page_rect = page.rect
            if image_size:
                # Uniform scale; NOTE(review): assumes pages are not centered
                # with a horizontal offset in the combined image — confirm for
                # mixed-width documents.
                scale = image_size[0] / page_rect.width
            else:
                scale = 1.0
            for block in page.get_text("dict").get("blocks", []):
                if "lines" not in block:
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span.get("text", "").strip()
                        if not text:
                            continue
                        tokens = _extract_tokens(text)
                        if not any(len(t) >= 2 and not _is_known_word(t)
                                   for t in tokens):
                            continue
                        bbox = span["bbox"]  # [x0, y0, x1, y1] in PDF points
                        x1 = int(bbox[0] * scale)
                        y1 = int(bbox[1] * scale + y_offset)
                        x2 = int(bbox[2] * scale)
                        y2 = int(bbox[3] * scale + y_offset)
                        boxes.append(Box(y1=y1, x1=x1, y2=y2, x2=x2,
                                         area=(x2 - x1) * (y2 - y1)))
            # Advance past this page (plus inter-page gap) for the next one.
            y_offset += page_rect.height * scale + (page_gap if image_size else 1000)
        doc.close()
    except Exception:
        # Fallback: flag whole pages using plain extracted text.
        for page_num, text in enumerate(extract_pdf_text(pdf_path, max_pages)):
            if not text.strip():
                continue
            tokens = _extract_tokens(text)
            if any(len(t) >= 2 and not _is_known_word(t) for t in tokens):
                boxes.append(Box(y1=page_num * 1000, x1=0,
                                 y2=(page_num + 1) * 1000, x2=800,
                                 area=800 * 1000))
    return boxes


def find_misspell_boxes(
    img: Image.Image,
    *,
    min_conf: int = 60,
    lang: Optional[str] = None,
    extra_allow: Optional[Iterable[str]] = None,
    dpi: int = 300,
    psm: int = 6,
    oem: int = 3
) -> List[Box]:
    """Legacy OCR-based spell checking (kept as a fallback path).

    Runs Tesseract word-by-word and returns a Box for every confident word
    containing at least one unknown token.
    """
    if not (HAS_OCR and HAS_SPELLCHECK):
        return []

    # Auto-detect language if not provided.
    if lang is None:
        try:
            avail = set(pytesseract.get_languages(config="") or [])
        except Exception:
            avail = {"eng"}
        lang = "eng+fra" if {"eng", "fra"}.issubset(avail) else "eng"

    # Light upscale for small pages (target ~2500-3000 px width for letter size).
    if img.width < 1600:
        scale = 2
        img = img.resize((img.width * scale, img.height * scale), Image.LANCZOS)
    img = prepare_for_ocr(img)

    try:
        if extra_allow and _SPELL_EN:
            _SPELL_EN.word_frequency.load_words(w.lower() for w in extra_allow)
        if extra_allow and _SPELL_FR:
            _SPELL_FR.word_frequency.load_words(w.lower() for w in extra_allow)
        # Explicit DPI + preserved spaces help Tesseract's layout analysis.
        config = (f"--psm {psm} --oem {oem} "
                  f"-c preserve_interword_spaces=1 -c user_defined_dpi={dpi}")
        data = pytesseract.image_to_data(
            img, lang=lang, config=config,
            output_type=pytesseract.Output.DICT,
        )
    except Exception:
        return []

    n = len(data.get("text", [])) or 0
    boxes: List[Box] = []
    for i in range(n):
        raw = data["text"][i]
        if not raw:
            continue
        # Confidence filter (Tesseract reports -1 for non-word elements).
        try:
            conf = int(float(data.get("conf", ["-1"])[i]))
        except Exception:
            conf = -1
        if conf < min_conf:
            continue
        tokens = _extract_tokens(raw)
        if not tokens:
            continue
        # Flag the box if ANY token in it looks misspelled.
        if all(_is_known_word(tok) or len(tok) < 2 for tok in tokens):
            continue
        left = data.get("left", [0])[i]
        top = data.get("top", [0])[i]
        width = data.get("width", [0])[i]
        height = data.get("height", [0])[i]
        if width <= 0 or height <= 0:
            continue
        boxes.append(Box(top, left, top + height, left + width, width * height))
    return boxes


# --- Robust PDF barcode scan (page render + embedded images) ---
from PIL import ImageOps

# Re-probe pyzbar here to also bind ZBarSymbol (the top-of-file probe only
# bound the decode function).
try:
    from pyzbar.pyzbar import decode as zbar_decode, ZBarSymbol
    HAS_BARCODE = True
except Exception:
    HAS_BARCODE = False
    ZBarSymbol = None

try:
    from pylibdmtx.pylibdmtx import decode as dmtx_decode  # DataMatrix
    HAS_DMTX = True
except Exception:
    HAS_DMTX = False


def _binarize(pil_img: Image.Image) -> Image.Image:
    """Black/white version of the image; helps zbar with anti-aliased renders."""
    g = ImageOps.grayscale(pil_img)
    g = ImageOps.autocontrast(g)
    # Simple global threshold around midtone; adjust if needed.
    return g.point(lambda x: 255 if x > 140 else 0, mode='1').convert('L')


def _decode_pyzbar(img: Image.Image) -> list:
    """Try pyzbar on the image plus grayscale/binarized/rotated/upscaled variants."""
    if not HAS_BARCODE:
        return []
    symbols = ([ZBarSymbol.QRCODE, ZBarSymbol.EAN13, ZBarSymbol.EAN8,
                ZBarSymbol.UPCA, ZBarSymbol.CODE128] if ZBarSymbol else None)

    def _try(im):
        return zbar_decode(im, symbols=symbols) if symbols else zbar_decode(im)

    res = _try(img)
    if res:
        return res
    for v in (ImageOps.grayscale(img), _binarize(img)):
        res = _try(v)
        if res:
            return res
        for angle in (90, 180, 270):
            res = _try(v.rotate(angle, expand=True))
            if res:
                return res
    # Last resort: 2x upscale for thin-bar codes on small renders.
    w, h = img.size
    if max(w, h) < 1600:
        try:
            u = img.resize((w * 2, h * 2), resample=Image.Resampling.BICUBIC)
        except Exception:
            u = img.resize((w * 2, h * 2), resample=Image.BICUBIC)
        res = _try(u)
        if res:
            return res
    return []


def _decode_datamatrix(img: Image.Image) -> list:
    """Decode DataMatrix codes via pylibdmtx, shaped like pyzbar results."""
    if not HAS_DMTX:
        return []
    try:
        res = dmtx_decode(ImageOps.grayscale(img))
        outs = []
        for r in res:
            rect = r.rect  # (left, top, width, height)
            outs.append(type("DM", (), {
                "type": "DATAMATRIX",
                "data": r.data,
                "rect": type("R", (), {"left": rect.left, "top": rect.top,
                                       "width": rect.width, "height": rect.height})
            }))
        return outs
    except Exception:
        return []


def _decode_all(img: Image.Image) -> list:
    """Run every available decoder; pyzbar first, DataMatrix as fallback."""
    out = _decode_pyzbar(img)
    if not out:
        out = _decode_datamatrix(img) or out
    return out


def _pix_to_pil(pix) -> Image.Image:
    """Convert a fitz.Pixmap to PIL, dropping alpha and preferring grayscale."""
    if pix.alpha:
        pix = fitz.Pixmap(pix, 0)  # copy without alpha; reduces zbar confusion
    try:
        pix = fitz.Pixmap(fitz.csGRAY, pix)  # avoid color AA artifacts
    except Exception:
        pass
    return Image.open(io.BytesIO(pix.tobytes("ppm")))


def find_barcode_boxes_and_info_from_pdf(
    pdf_path: str,
    *,
    max_pages: int = 5,
    dpi: int = 600,
    image_size: Optional[Tuple[int, int]] = None,
) -> Tuple[List[Box], List[dict]]:
    """Render each page at high DPI and scan it plus embedded images for codes.

    Returns (boxes, infos).  *image_size* (width, height of the display
    raster) is optional; when given, page-raster hit coordinates are scaled
    from the high-DPI render down to the display image.
    (Fix: the previous signature rejected the image_size keyword that
    compare_pdfs passes, raising TypeError on every barcode-enabled run.)
    """
    if not HAS_PYMUPDF:
        return [], []
    boxes: List[Box] = []
    infos: List[dict] = []
    try:
        doc = fitz.open(pdf_path)
        n_pages = min(len(doc), max_pages)
        mat = fitz.Matrix(dpi / 72.0, dpi / 72.0)
        for page_idx in range(n_pages):
            page = doc[page_idx]

            # A) Render the page raster (grayscale, high DPI) and decode it.
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = _pix_to_pil(pix)
            if image_size and img.width:
                # NOTE(review): vertical multi-page stacking offset is not
                # applied here — confirm against combine_pages_vertically.
                factor = image_size[0] / img.width
            else:
                factor = 1.0
            hits = [(d, factor) for d in _decode_all(img)]

            # B) Also try each embedded image/XObject as-is (barcodes are
            #    often placed as images).  Their coords are in the embedded
            #    image's own space, so no render scaling applies.
            for xref, *_rest in page.get_images(full=True):
                try:
                    pil = _pix_to_pil(fitz.Pixmap(doc, xref))
                    hits += [(d, 1.0) for d in _decode_all(pil)]
                except Exception:
                    pass

            for d, f in hits:
                rect = d.rect
                left = int(rect.left * f)
                top = int(rect.top * f)
                width = int(rect.width * f)
                height = int(rect.height * f)
                boxes.append(Box(top, left, top + height, left + width,
                                 width * height))
                try:
                    payload = (d.data.decode("utf-8", errors="ignore")
                               if isinstance(d.data, (bytes, bytearray))
                               else str(d.data))
                except Exception:
                    payload = ""
                infos.append({
                    "type": getattr(d, "type", "UNKNOWN"),
                    "data": payload,
                    "left": left, "top": top,
                    "width": width, "height": height,
                    "page": page_idx + 1,
                })
        doc.close()
    except Exception:
        return [], []
    return boxes, infos


# -------------------- CMYK Panel -------------------
def rgb_to_cmyk_array(img: Image.Image) -> np.ndarray:
    """Image as a float32 HxWx4 CMYK array with channel values 0..255."""
    return np.asarray(img.convert('CMYK')).astype(np.float32)


def avg_cmyk_in_box(cmyk_arr: np.ndarray, box: Box) -> Tuple[float, float, float, float]:
    """Mean C/M/Y/K percentages (0..100, 1 decimal) inside a clamped box."""
    y1, y2 = max(0, box.y1), min(cmyk_arr.shape[0], box.y2)
    x1, x2 = max(0, box.x1), min(cmyk_arr.shape[1], box.x2)
    if y2 <= y1 or x2 <= x1:
        return (0.0, 0.0, 0.0, 0.0)
    region = cmyk_arr[y1:y2, x1:x2, :]
    mean_vals = region.reshape(-1, 4).mean(axis=0)
    return tuple(float(round(v * 100.0 / 255.0, 1)) for v in mean_vals)


def compute_cmyk_diffs(a_img: Image.Image, b_img: Image.Image, red_boxes: List[Box]):
    """Per-diff-region CMYK averages for A and B and their delta (B - A)."""
    a_cmyk = rgb_to_cmyk_array(a_img)
    b_cmyk = rgb_to_cmyk_array(b_img)
    entries = []
    for i, bx in enumerate(red_boxes):
        a_vals = avg_cmyk_in_box(a_cmyk, bx)
        b_vals = avg_cmyk_in_box(b_cmyk, bx)
        delta = tuple(round(b_vals[j] - a_vals[j], 1) for j in range(4))
        entries.append({'idx': i + 1, 'A': a_vals, 'B': b_vals, 'Delta': delta})
    return entries


def draw_cmyk_panel(base: Image.Image, entries, title: str = 'CMYK breakdowns',
                    panel_width: int = 260) -> Image.Image:
    """Append a side panel listing per-region CMYK values next to *base*."""
    w, h = base.size
    panel = Image.new('RGB', (panel_width, h), (245, 245, 245))
    out = Image.new('RGB', (w + panel_width, h), (255, 255, 255))
    out.paste(base, (0, 0))
    out.paste(panel, (w, 0))
    d = ImageDraw.Draw(out)
    x0 = w + 8
    y = 8
    d.text((x0, y), title, fill=(0, 0, 0))
    y += 18
    if not entries:
        d.text((x0, y), 'No differing regions', fill=(80, 80, 80))
        return out
    for e in entries:
        idx = e['idx']
        aC, aM, aY, aK = e['A']
        bC, bM, bY, bK = e['B']
        dC, dM, dY, dK = e['Delta']
        d.text((x0, y), f"#{idx}", fill=(0, 0, 0)); y += 14
        d.text((x0, y), f"A: C {aC}% M {aM}% Y {aY}% K {aK}%", fill=(0, 0, 0)); y += 14
        d.text((x0, y), f"B: C {bC}% M {bM}% Y {bY}% K {bK}%", fill=(0, 0, 0)); y += 14
        d.text((x0, y), f"Delta: C {dC}% M {dM}% Y {dY}% K {dK}%", fill=(120, 0, 0)); y += 18
        if y > h - 40:  # stop before running off the panel
            break
    return out


# -------------------- Gradio Interface -----------------
def compare_pdfs(file_a, file_b):
    """Gradio callback: compare two uploaded PDFs and build all result views.

    Returns (overlay, annotated A, annotated B, status markdown,
    barcode rows A, barcode rows B).
    """
    try:
        if file_a is None or file_b is None:
            return None, None, None, "❌ Please upload both PDF files to compare", [], []

        # Load and vertically combine up to 5 pages per document.
        pages_a = load_pdf_pages(file_a.name, dpi=400, max_pages=5)
        pages_b = load_pdf_pages(file_b.name, dpi=400, max_pages=5)
        a = combine_pages_vertically(pages_a)
        b = combine_pages_vertically(pages_b)
        a, b = match_sizes(a, b)

        diff = difference_map(a, b)
        red_boxes = find_diff_boxes(diff, threshold=12, min_area=25)

        # Text-layer spell checking (more accurate than OCR); pass image
        # dimensions for coordinate mapping onto the combined raster.
        image_size = (a.width, a.height)
        misspell_a = (find_misspell_boxes_from_text(file_a.name, image_size=image_size)
                      if HAS_SPELLCHECK and HAS_PYMUPDF else [])
        misspell_b = (find_misspell_boxes_from_text(file_b.name, image_size=image_size)
                      if HAS_SPELLCHECK and HAS_PYMUPDF else [])
        print(f"Spell check results - A: {len(misspell_a)} boxes, B: {len(misspell_b)} boxes")

        # PDF-based barcode detection.  (Fix: the previous fallback called an
        # undefined find_barcode_boxes_and_info(), raising NameError whenever
        # PyMuPDF was missing.)
        if HAS_BARCODE and HAS_PYMUPDF:
            bar_a, info_a = find_barcode_boxes_and_info_from_pdf(file_a.name, image_size=image_size)
            bar_b, info_b = find_barcode_boxes_and_info_from_pdf(file_b.name, image_size=image_size)
            print(f"Barcode detection results - A: {len(bar_a)} codes, B: {len(bar_b)} codes")
        else:
            bar_a, info_a = [], []
            bar_b, info_b = [], []

        cmyk_entries = compute_cmyk_diffs(a, b, red_boxes)

        a_boxed_core = draw_boxes_multi(a, red_boxes, misspell_a, bar_a, width=3)
        b_boxed_core = draw_boxes_multi(b, red_boxes, misspell_b, bar_b, width=3)
        a_disp = draw_cmyk_panel(a_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
        b_disp = draw_cmyk_panel(b_boxed_core, cmyk_entries, title='CMYK Analysis (A vs B)')
        overlay = make_red_overlay(a, b)

        status = f"""
📊 **Analysis Complete!**

- **Pages processed:** A: {len(pages_a)}, B: {len(pages_b)}
- **Difference regions found:** {len(red_boxes)}
- **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}
- **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}
- **Combined image dimensions:** {a.width} × {a.height} pixels

**Legend:**
- 🔴 Red boxes: Visual differences
- 🔵 Cyan boxes: Spelling errors
- 🟢 Green boxes: Barcodes/QR codes
"""

        # Rows for the barcode Dataframes ('valid' defaults to False until a
        # checksum validator populates it).
        def _rows(infos):
            return [[c.get('type', ''), c.get('data', ''), c.get('left', 0),
                     c.get('top', 0), c.get('width', 0), c.get('height', 0),
                     c.get('valid', False)] for c in infos]

        return overlay, a_disp, b_disp, status, _rows(info_a), _rows(info_b)
    except Exception as e:
        return None, None, None, f"❌ **Error:** {str(e)}", [], []


# -------------------- Gradio App -------------------
def create_demo():
    """Build and return the Gradio Blocks UI for the comparison tool."""
    with gr.Blocks(title="PDF Comparison Tool", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
# 🔍 Advanced PDF Comparison Tool

Upload two PDF files to get comprehensive analysis including:
- **Multi-page PDF support** (up to 5 pages per document)
- **Visual differences** with bounding boxes
- **OCR and spell checking**
- **Barcode/QR code detection**
- **CMYK color analysis**
""")
        with gr.Row():
            with gr.Column():
                file_a = gr.File(label="📄 PDF A (Reference)", file_types=[".pdf"])
                file_b = gr.File(label="📄 PDF B (Comparison)", file_types=[".pdf"])
        compare_btn = gr.Button("🔍 Compare PDF Files", variant="primary", size="lg")
        status_md = gr.Markdown("")
        with gr.Row():
            overlay_img = gr.Image(label="🔴 Pixel Differences (Red = Different)", type="pil")
        with gr.Row():
            img_a = gr.Image(label="📄 File A with Analysis", type="pil")
            img_b = gr.Image(label="📄 File B with Analysis", type="pil")
        gr.Markdown("### 📊 Barcode Detection Results")
        with gr.Row():
            codes_a_df = gr.Dataframe(
                headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
                label="Barcodes in File A",
                interactive=False
            )
            codes_b_df = gr.Dataframe(
                headers=["Type", "Data", "Left", "Top", "Width", "Height", "Valid"],
                label="Barcodes in File B",
                interactive=False
            )
        compare_btn.click(
            fn=compare_pdfs,
            inputs=[file_a, file_b],
            outputs=[overlay_img, img_a, img_b, status_md, codes_a_df, codes_b_df]
        )
        gr.Markdown("""
### 📝 Instructions:
1. Upload two PDF files
2. Click "Compare PDF Files"
3. View results with comprehensive analysis

### 🎨 Color Legend:
- **🔴 Red boxes:** Visual differences between files
- **🔵 Cyan boxes:** Potential spelling errors (OCR)
- **🟢 Green boxes:** Detected barcodes/QR codes
- **📊 Side panel:** CMYK color analysis for print workflows
""")
    return demo


# (A byte-identical duplicate of _binarize previously lived here; removed.)


def _decode_once(img: Image.Image):
    """Single pyzbar decode attempt with the common barcode symbologies."""
    if not HAS_BARCODE:
        return []
    syms = [ZBarSymbol.QRCODE, ZBarSymbol.EAN13, ZBarSymbol.EAN8,
            ZBarSymbol.UPCA, ZBarSymbol.CODE128]
    return zbar_decode(img, symbols=syms)


def debug_scan_pdf(pdf_path: str, outdir: str = "barcode_debug", max_pages=2):
    """Diagnose barcode detection by scanning a PDF at multiple DPIs/variants.

    - Renders pages at 600/900/1200 DPI
    - Tries grayscale, binarized, and rotated versions
    - Scans embedded images (XObjects)
    - Prints what it finds and writes debug PNGs to *outdir*

    Usage: debug_scan_pdf("your.pdf", outdir="barcode_debug", max_pages=2)
    """
    if not (HAS_BARCODE and HAS_PYMUPDF):
        print("ERROR: Missing dependencies (pyzbar or PyMuPDF)")
        return
    os.makedirs(outdir, exist_ok=True)
    doc = fitz.open(pdf_path)
    for dpi in (600, 900, 1200):
        mat = fitz.Matrix(dpi / 72.0, dpi / 72.0)
        print(f"\n=== DPI {dpi} ===")
        for p in range(min(len(doc), max_pages)):
            page = doc[p]
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.open(io.BytesIO(pix.tobytes("ppm")))
            img.save(f"{outdir}/page{p+1}_{dpi}.png")

            variants = [
                ("orig", img),
                ("gray", ImageOps.grayscale(img)),
                ("bin", _binarize(img)),
            ]
            found = []
            for tag, v in variants:
                r = _decode_once(v)
                if r:
                    found.extend((tag, rr.type, rr.data) for rr in r)
                else:
                    # Try rotations; stop at the first angle that decodes.
                    for angle in (90, 180, 270):
                        rr = _decode_once(v.rotate(angle, expand=True))
                        if rr:
                            found.extend((f"{tag}_rot{angle}", rri.type, rri.data)
                                         for rri in rr)
                            break
            print(f"Page {p+1}: {len(found)} hits at DPI {dpi} -> {found}")

            # Scan embedded images too.
            for ix, (xref, *_) in enumerate(page.get_images(full=True)):
                try:
                    ipix = fitz.Pixmap(doc, xref)
                    if ipix.alpha:
                        ipix = fitz.Pixmap(ipix, 0)
                    pil = Image.open(io.BytesIO(ipix.tobytes("ppm")))
                    pil.save(f"{outdir}/page{p+1}_embed{ix+1}.png")
                    rr = _decode_once(pil) or _decode_once(_binarize(pil))
                    if rr:
                        print(f"  Embedded image {ix+1}: {[(r.type, r.data) for r in rr]}")
                except Exception as e:
                    print("  Embedded image error:", e)
    doc.close()
    print(f"\nDebug images saved to: {outdir}/")
    print("Open the PNGs and zoom in to check bar width. "
          "If narrow bars are <2px at 600 DPI, you need 900-1200 DPI.")


if __name__ == "__main__":
    demo = create_demo()
    demo.launch(
        server_name="0.0.0.0",  # allow external access
        share=True,             # create a public link
        show_error=True
    )