Yaz Hobooti
committed on
Commit
·
def48ce
1
Parent(s):
07087d8
Add multi-page PDF support: process up to 5 pages and combine vertically
Browse files- pdf_comparator.py +49 -15
pdf_comparator.py
CHANGED
|
@@ -114,7 +114,7 @@ def normalize_token(token: str) -> str:
|
|
| 114 |
def _is_pdf(path: str) -> bool:
|
| 115 |
return os.path.splitext(path.lower())[1] == ".pdf"
|
| 116 |
|
| 117 |
-
def
|
| 118 |
if _is_pdf(path):
|
| 119 |
# Try pdf2image with multiple poppler paths first
|
| 120 |
poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
|
|
@@ -122,14 +122,14 @@ def load_first_page(path: str, dpi: int = 300) -> Image.Image:
|
|
| 122 |
for poppler_path in poppler_paths:
|
| 123 |
try:
|
| 124 |
if poppler_path:
|
| 125 |
-
imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=
|
| 126 |
else:
|
| 127 |
-
imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=
|
| 128 |
|
| 129 |
if not imgs:
|
| 130 |
continue
|
| 131 |
|
| 132 |
-
return
|
| 133 |
except Exception as e:
|
| 134 |
if poppler_path is None: # All pdf2image attempts failed
|
| 135 |
break
|
|
@@ -139,20 +139,48 @@ def load_first_page(path: str, dpi: int = 300) -> Image.Image:
|
|
| 139 |
if HAS_PYMUPDF:
|
| 140 |
try:
|
| 141 |
doc = fitz.open(path)
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
|
|
|
|
|
|
|
|
|
| 147 |
doc.close()
|
| 148 |
-
return
|
| 149 |
except Exception as e:
|
| 150 |
raise ValueError(f"Failed to convert PDF with both pdf2image and PyMuPDF. pdf2image error: poppler not found. PyMuPDF error: {str(e)}")
|
| 151 |
else:
|
| 152 |
raise ValueError(f"Failed to convert PDF to image with all poppler paths. Last error: poppler not found. PyMuPDF not available as fallback.")
|
| 153 |
|
| 154 |
raise ValueError(f"No pages in PDF: {path}")
|
| 155 |
-
return Image.open(path).convert("RGB")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
|
| 158 |
if a.size == b.size:
|
|
@@ -455,9 +483,13 @@ def compare_pdfs(file_a, file_b):
|
|
| 455 |
if file_a is None or file_b is None:
|
| 456 |
return None, None, None, "β Please upload both PDF files to compare", [], []
|
| 457 |
|
| 458 |
-
# Load images with
|
| 459 |
-
|
| 460 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 461 |
|
| 462 |
# Match sizes
|
| 463 |
a, b = match_sizes(a, b)
|
|
@@ -495,10 +527,11 @@ def compare_pdfs(file_a, file_b):
|
|
| 495 |
# Create status message
|
| 496 |
status = f"""
|
| 497 |
π **Analysis Complete!**
|
|
|
|
| 498 |
- **Difference regions found:** {len(red_boxes)}
|
| 499 |
- **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}
|
| 500 |
- **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}
|
| 501 |
-
- **
|
| 502 |
|
| 503 |
**Legend:**
|
| 504 |
- 🔴 Red boxes: Visual differences
|
|
@@ -525,6 +558,7 @@ def create_demo():
|
|
| 525 |
# π Advanced PDF Comparison Tool
|
| 526 |
|
| 527 |
Upload two PDF files to get comprehensive analysis including:
|
|
|
|
| 528 |
- **Visual differences** with bounding boxes
|
| 529 |
- **OCR and spell checking**
|
| 530 |
- **Barcode/QR code detection**
|
|
|
|
| 114 |
def _is_pdf(path: str) -> bool:
|
| 115 |
return os.path.splitext(path.lower())[1] == ".pdf"
|
| 116 |
|
| 117 |
+
def load_pdf_pages(path: str, dpi: int = 300, max_pages: int = 5) -> List[Image.Image]:
|
| 118 |
if _is_pdf(path):
|
| 119 |
# Try pdf2image with multiple poppler paths first
|
| 120 |
poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
|
|
|
|
| 122 |
for poppler_path in poppler_paths:
|
| 123 |
try:
|
| 124 |
if poppler_path:
|
| 125 |
+
imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages, poppler_path=poppler_path)
|
| 126 |
else:
|
| 127 |
+
imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=max_pages)
|
| 128 |
|
| 129 |
if not imgs:
|
| 130 |
continue
|
| 131 |
|
| 132 |
+
return [img.convert("RGB") for img in imgs]
|
| 133 |
except Exception as e:
|
| 134 |
if poppler_path is None: # All pdf2image attempts failed
|
| 135 |
break
|
|
|
|
| 139 |
if HAS_PYMUPDF:
|
| 140 |
try:
|
| 141 |
doc = fitz.open(path)
|
| 142 |
+
pages = []
|
| 143 |
+
for page_num in range(min(len(doc), max_pages)):
|
| 144 |
+
page = doc[page_num]
|
| 145 |
+
mat = fitz.Matrix(dpi/72, dpi/72) # Scale factor for DPI
|
| 146 |
+
pix = page.get_pixmap(matrix=mat)
|
| 147 |
+
img_data = pix.tobytes("ppm")
|
| 148 |
+
img = Image.open(io.BytesIO(img_data))
|
| 149 |
+
pages.append(img.convert("RGB"))
|
| 150 |
doc.close()
|
| 151 |
+
return pages
|
| 152 |
except Exception as e:
|
| 153 |
raise ValueError(f"Failed to convert PDF with both pdf2image and PyMuPDF. pdf2image error: poppler not found. PyMuPDF error: {str(e)}")
|
| 154 |
else:
|
| 155 |
raise ValueError(f"Failed to convert PDF to image with all poppler paths. Last error: poppler not found. PyMuPDF not available as fallback.")
|
| 156 |
|
| 157 |
raise ValueError(f"No pages in PDF: {path}")
|
| 158 |
+
return [Image.open(path).convert("RGB")]
|
| 159 |
+
|
| 160 |
+
def combine_pages_vertically(pages: List[Image.Image], spacing: int = 20) -> Image.Image:
|
| 161 |
+
"""Combine multiple pages into a single vertical image"""
|
| 162 |
+
if not pages:
|
| 163 |
+
raise ValueError("No pages to combine")
|
| 164 |
+
if len(pages) == 1:
|
| 165 |
+
return pages[0]
|
| 166 |
+
|
| 167 |
+
# Find the maximum width
|
| 168 |
+
max_width = max(page.width for page in pages)
|
| 169 |
+
|
| 170 |
+
# Calculate total height
|
| 171 |
+
total_height = sum(page.height for page in pages) + spacing * (len(pages) - 1)
|
| 172 |
+
|
| 173 |
+
# Create combined image
|
| 174 |
+
combined = Image.new('RGB', (max_width, total_height), (255, 255, 255))
|
| 175 |
+
|
| 176 |
+
y_offset = 0
|
| 177 |
+
for page in pages:
|
| 178 |
+
# Center the page horizontally if it's narrower than max_width
|
| 179 |
+
x_offset = (max_width - page.width) // 2
|
| 180 |
+
combined.paste(page, (x_offset, y_offset))
|
| 181 |
+
y_offset += page.height + spacing
|
| 182 |
+
|
| 183 |
+
return combined
|
| 184 |
|
| 185 |
def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
|
| 186 |
if a.size == b.size:
|
|
|
|
| 483 |
if file_a is None or file_b is None:
|
| 484 |
return None, None, None, "β Please upload both PDF files to compare", [], []
|
| 485 |
|
| 486 |
+
# Load images with multiple pages support
|
| 487 |
+
pages_a = load_pdf_pages(file_a.name, dpi=300, max_pages=5)
|
| 488 |
+
pages_b = load_pdf_pages(file_b.name, dpi=300, max_pages=5)
|
| 489 |
+
|
| 490 |
+
# Combine pages into single images for comparison
|
| 491 |
+
a = combine_pages_vertically(pages_a)
|
| 492 |
+
b = combine_pages_vertically(pages_b)
|
| 493 |
|
| 494 |
# Match sizes
|
| 495 |
a, b = match_sizes(a, b)
|
|
|
|
| 527 |
# Create status message
|
| 528 |
status = f"""
|
| 529 |
π **Analysis Complete!**
|
| 530 |
+
- **Pages processed:** A: {len(pages_a)}, B: {len(pages_b)}
|
| 531 |
- **Difference regions found:** {len(red_boxes)}
|
| 532 |
- **Misspellings detected:** A: {len(misspell_a)}, B: {len(misspell_b)}
|
| 533 |
- **Barcodes found:** A: {len(bar_a)}, B: {len(bar_b)}
|
| 534 |
+
- **Combined image dimensions:** {a.width} × {a.height} pixels
|
| 535 |
|
| 536 |
**Legend:**
|
| 537 |
- 🔴 Red boxes: Visual differences
|
|
|
|
| 558 |
# π Advanced PDF Comparison Tool
|
| 559 |
|
| 560 |
Upload two PDF files to get comprehensive analysis including:
|
| 561 |
+
- **Multi-page PDF support** (up to 5 pages per document)
|
| 562 |
- **Visual differences** with bounding boxes
|
| 563 |
- **OCR and spell checking**
|
| 564 |
- **Barcode/QR code detection**
|