SathvikGanta commited on
Commit
b2bb51d
·
verified ·
1 Parent(s): 31e9e89

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -159
app.py CHANGED
@@ -3,114 +3,94 @@ import fitz # PyMuPDF
3
  import cv2
4
  from pdf2image import convert_from_path
5
  import pytesseract
6
- from pytesseract import Output
7
  import numpy as np
8
  import os
9
  from fpdf import FPDF
10
- import difflib # For text comparison
11
 
12
- # Convert PDFs to images
 
13
  def convert_pdf_to_images(pdf_path, dpi=300):
14
  images = convert_from_path(pdf_path, dpi=dpi, poppler_path="/usr/bin")
15
  return [cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) for image in images]
16
 
17
- # Align images
18
- def align_images(img1, img2):
19
- gray1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
20
- gray2 = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)
21
- orb = cv2.ORB_create()
22
- kp1, des1 = orb.detectAndCompute(gray1, None)
23
- kp2, des2 = orb.detectAndCompute(gray2, None)
24
- bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
25
- matches = bf.match(des1, des2)
26
- matches = sorted(matches, key=lambda x: x.distance)
27
- src_pts = np.float32([kp1[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
28
- dst_pts = np.float32([kp2[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)
29
- matrix, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
30
-
31
- # Validate if alignment is good enough
32
- if matrix is None or len(matches) < 10: # Check if sufficient matches exist
33
- raise ValueError("Alignment failed. Insufficient matches between images.")
34
-
35
- aligned_img = cv2.warpPerspective(img2, matrix, (img1.shape[1], img1.shape[0]))
36
- return aligned_img
37
-
38
- # Compare visual changes
39
- def compare_visual_changes(orig_img, edit_img, start_position):
40
- diff = cv2.absdiff(orig_img, edit_img)
41
- gray_diff = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)
42
-
43
- # Apply Gaussian blur to reduce noise
44
- blurred_diff = cv2.GaussianBlur(gray_diff, (5, 5), 0)
45
-
46
- # Apply thresholding
47
- _, thresh = cv2.threshold(blurred_diff, 40, 255, cv2.THRESH_BINARY)
48
-
49
- # Morphological operations to clean noise
50
- kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
51
- cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
52
-
53
- contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
54
- overlay = edit_img.copy()
55
- visual_changes = []
56
- position_counter = start_position
57
- font = cv2.FONT_HERSHEY_SIMPLEX
58
- font_scale = 0.8
59
- thickness = 2
60
-
61
- for cnt in contours:
62
- if cv2.contourArea(cnt) > 100: # Filter out small regions
63
- x, y, w, h = cv2.boundingRect(cnt)
64
- cv2.rectangle(overlay, (x, y), (x + w, y + h), (0, 0, 255), 2) # Red bounding box
65
- cv2.putText(overlay, str(position_counter), (x, y - 10), font, font_scale, (0, 255, 0), thickness)
66
- visual_changes.append((position_counter, f'Visual change detected at position {position_counter}'))
67
- position_counter += 1
68
-
69
- return overlay, visual_changes, position_counter
70
-
71
- # Compare text changes with bounding boxes
72
- def compare_text_changes_with_boxes(orig_img, edit_img, start_position):
73
- orig_data = pytesseract.image_to_data(orig_img, output_type=Output.DICT)
74
- edit_data = pytesseract.image_to_data(edit_img, output_type=Output.DICT)
75
- orig_text = "\n".join(orig_data['text']).splitlines()
76
- edit_text = "\n".join(edit_data['text']).splitlines()
77
-
78
- diff = difflib.ndiff(orig_text, edit_text)
79
- overlay = edit_img.copy()
80
- text_changes = []
81
- position_counter = start_position
82
- font = cv2.FONT_HERSHEY_SIMPLEX
83
- font_scale = 0.8
84
- thickness = 2
85
-
86
- for line in diff:
87
- if line.startswith("+ "): # Added text
88
- text = line[2:]
89
- if text in edit_data['text']:
90
- index = edit_data['text'].index(text)
91
- x, y, w, h = edit_data['left'][index], edit_data['top'][index], edit_data['width'][index], edit_data['height'][index]
92
- cv2.rectangle(overlay, (x, y), (x + w, y + h), (0, 0, 255), 2)
93
- cv2.putText(overlay, str(position_counter), (x, y - 10), font, font_scale, (0, 255, 0), thickness)
94
- text_changes.append((position_counter, f'"{text}" added at position {position_counter}'))
95
- position_counter += 1
96
- elif line.startswith("- "): # Removed text
97
- text = line[2:]
98
- if text in orig_data['text']:
99
- index = orig_data['text'].index(text)
100
- x, y, w, h = orig_data['left'][index], orig_data['top'][index], orig_data['width'][index], orig_data['height'][index]
101
- cv2.rectangle(overlay, (x, y), (x + w, y + h), (0, 0, 255), 2)
102
- cv2.putText(overlay, str(position_counter), (x, y - 10), font, font_scale, (0, 255, 0), thickness)
103
- text_changes.append((position_counter, f'"{text}" removed at position {position_counter}'))
104
- position_counter += 1
105
-
106
- return overlay, text_changes, position_counter
107
-
108
- # Sanitize text for PDF compatibility
109
- def sanitize_text(text):
110
- return text.encode('latin-1', errors='replace').decode('latin-1')
111
 
112
  # Generate PDF report
113
- def generate_report(images, changes, title, output_path):
114
  pdf = FPDF()
115
  for img in images:
116
  temp_path = "temp_image.png"
@@ -118,76 +98,26 @@ def generate_report(images, changes, title, output_path):
118
  pdf.add_page()
119
  pdf.image(temp_path, x=10, y=10, w=190)
120
  os.remove(temp_path)
121
-
122
  pdf.add_page()
123
  pdf.set_font("Arial", size=12)
124
- pdf.cell(0, 10, sanitize_text(title), ln=True, align="C")
125
- pdf.ln(10)
126
- for _, change in changes:
127
- pdf.cell(0, 10, sanitize_text(change), ln=True)
128
-
129
  pdf.output(output_path)
130
- return output_path
131
-
132
- # Perform visual and text comparisons separately
133
- def generate_separate_comparisons(original_pdf, edited_pdf):
134
- original_images = convert_pdf_to_images(original_pdf)
135
- edited_images = convert_pdf_to_images(edited_pdf)
136
-
137
- # Visual comparison
138
- visual_combined_images = []
139
- visual_changes = []
140
- position_counter = 1
141
- for orig_img, edit_img in zip(original_images, edited_images):
142
- aligned_img = align_images(orig_img, edit_img)
143
- highlighted_img, page_visual_changes, position_counter = compare_visual_changes(
144
- orig_img, aligned_img, position_counter
145
- )
146
- visual_changes.extend(page_visual_changes)
147
- visual_combined_images.append(np.hstack((orig_img, highlighted_img)))
148
-
149
- # Generate visual changes report
150
- visual_report_path = generate_report(
151
- visual_combined_images, visual_changes, "Visual Changes", "outputs/visual_changes.pdf"
152
- )
153
-
154
- # Text comparison
155
- text_combined_images = []
156
- text_changes = []
157
- position_counter = 1
158
- for orig_img, edit_img in zip(original_images, edited_images):
159
- aligned_img = align_images(orig_img, edit_img)
160
- highlighted_img, page_text_changes, position_counter = compare_text_changes_with_boxes(
161
- orig_img, aligned_img, position_counter
162
- )
163
- text_changes.extend(page_text_changes)
164
- text_combined_images.append(np.hstack((orig_img, highlighted_img)))
165
 
166
- # Generate text changes report
167
- text_report_path = generate_report(
168
- text_combined_images, text_changes, "Text Changes", "outputs/text_changes.pdf"
169
- )
170
 
171
- return visual_report_path, text_report_path
172
-
173
- # Gradio interface function
174
  def pdf_comparison(original_pdf, edited_pdf):
175
- visual_path, text_path = generate_separate_comparisons(original_pdf.name, edited_pdf.name)
176
- return visual_path, text_path
177
 
178
- # Gradio interface
 
179
  interface = gr.Interface(
180
  fn=pdf_comparison,
181
- inputs=[
182
- gr.File(label="Upload Original PDF", file_types=[".pdf"]),
183
- gr.File(label="Upload Edited PDF", file_types=[".pdf"])
184
- ],
185
- outputs=[
186
- gr.File(label="Download Visual Changes Report"),
187
- gr.File(label="Download Text Changes Report")
188
- ],
189
- title="PDF Comparison Tool with Separate Comparisons",
190
- description="Upload two PDFs: the original and the edited version. The tool generates separate reports for visual and text changes."
191
  )
192
 
193
  if __name__ == "__main__":
 
3
  import cv2
4
  from pdf2image import convert_from_path
5
  import pytesseract
 
6
  import numpy as np
7
  import os
8
  from fpdf import FPDF
 
9
 
10
+
11
# Helper: Convert PDFs to images
def convert_pdf_to_images(pdf_path, dpi=300, poppler_path="/usr/bin"):
    """Render every page of *pdf_path* to an OpenCV BGR image.

    Args:
        pdf_path: path to the PDF file to render.
        dpi: render resolution (default 300).
        poppler_path: location of the poppler binaries. Parameterized so the
            tool is no longer hard-wired to /usr/bin; the default preserves
            the original behavior.

    Returns:
        A list of numpy BGR arrays, one per page.
    """
    pages = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)
    # pdf2image yields PIL images in RGB order; OpenCV expects BGR.
    return [cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR) for page in pages]
15
 
16
+
17
# Helper: Extract text and bounding boxes
def extract_text_with_boxes(pdf_path):
    """Extract every text span from *pdf_path* with its bounding box.

    Returns:
        A list of dicts: {"text": span text, "bbox": (x0, y0, x1, y1) in the
        page's coordinate space, "page": 1-based page number}.
    """
    text_data = []
    # Context manager closes the document deterministically; the original
    # leaked the fitz handle.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                # Image blocks carry no "lines" key; the original raised
                # KeyError on any PDF containing an image.
                for line in block.get("lines", []):
                    for span in line["spans"]:
                        text_data.append({
                            "text": span["text"],
                            "bbox": span["bbox"],
                            "page": page_num,
                        })
    return text_data
31
+
32
+
33
# Helper: Highlight changes with bounding boxes
def highlight_changes(img, changes, scale=1.0):
    """Draw a numbered red box on a copy of *img* for each change record.

    Args:
        img: BGR image (numpy array) to annotate; not modified in place.
        changes: iterable of dicts with "bbox" (x0, y0, x1, y1) and "position".
        scale: multiplier applied to bbox coordinates before drawing.
            NOTE(review): the bboxes come from PyMuPDF while the images are
            rendered at 300 dpi — callers likely need scale=dpi/72 for the
            boxes to land on the right pixels; default 1.0 preserves the
            original behavior. TODO confirm against the caller.

    Returns:
        The annotated copy of *img*.
    """
    overlay = img.copy()
    for change in changes:
        x0, y0, x1, y1 = (int(c * scale) for c in change["bbox"])
        cv2.rectangle(overlay, (x0, y0), (x1, y1), (0, 0, 255), 2)  # red = change
        cv2.putText(
            overlay,
            str(change["position"]),  # green change number above the box
            (x0, y0 - 10),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 255, 0),
            1,
            cv2.LINE_AA,
        )
    return overlay
50
+
51
+
52
# Text comparison logic
def compare_texts(original_text, edited_text):
    """Compare two span lists positionally and report differing spans.

    Args:
        original_text: list of dicts with "text" and "bbox" keys.
        edited_text: list of dicts with "text" and "bbox" keys.

    Returns:
        A list of change records: {"text": human-readable description,
        "bbox": box to highlight, "position": 1-based change number}.

    The original used zip(), which silently dropped any trailing spans when
    the two lists differ in length — additions/removals at the end of the
    document were never reported. zip_longest fixes that.
    NOTE(review): comparison is strictly positional, so a single inserted
    span makes every later pair mismatch — a sequence diff would be better.
    """
    from itertools import zip_longest  # local: only used here

    changes = []
    position = 1
    for o, e in zip_longest(original_text, edited_text):
        o_text = o["text"] if o is not None else ""
        e_text = e["text"] if e is not None else ""
        if o_text != e_text:
            changes.append({
                # Empty original text at this slot means the edited text is new.
                "text": f'"{e_text}" added' if not o_text else f'"{o_text}" removed',
                # Highlight on the edited span when it exists; fall back to the
                # original span's box for removals past the edited tail.
                "bbox": e["bbox"] if e is not None else o["bbox"],
                "position": position,
            })
            position += 1
    return changes
65
+
66
+
67
# Generate reports for text and visual changes
def generate_reports(original_pdf, edited_pdf):
    """Build the text-changes PDF report for two PDFs.

    Args:
        original_pdf: path to the original PDF.
        edited_pdf: path to the edited PDF.

    Returns:
        Path to the generated report, "outputs/text_changes.pdf".
    """
    # NOTE(review): the original also rendered the *original* PDF to images
    # and never used the result; that redundant work is dropped here.
    edited_images = convert_pdf_to_images(edited_pdf)

    # Extract positioned text spans from both documents.
    original_text = extract_text_with_boxes(original_pdf)
    edited_text = extract_text_with_boxes(edited_pdf)

    # Pairwise text comparison.
    text_changes = compare_texts(original_text, edited_text)

    # Annotate every edited page with the detected changes.
    text_highlighted_images = [
        highlight_changes(edited, text_changes) for edited in edited_images
    ]

    # The original crashed when ./outputs did not exist yet.
    os.makedirs("outputs", exist_ok=True)
    text_pdf_path = "outputs/text_changes.pdf"
    generate_pdf_report(text_highlighted_images, text_changes, text_pdf_path, "Text Changes")

    return text_pdf_path
90
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
# Generate PDF report
def generate_pdf_report(images, changes, output_path, report_type):
    """Write *images* plus a textual change summary to *output_path*.

    Args:
        images: BGR numpy arrays, one report page each.
        changes: change records with "position" and "text" keys.
        output_path: destination PDF path.
        report_type: heading used on the summary page.
    """
    pdf = FPDF()
    for img in images:
        temp_path = "temp_image.png"
        # FPDF.image embeds from a file on disk, so the page image must be
        # written out first (restores the cv2.imwrite step elided in the
        # visible diff — without it pdf.image has nothing to read).
        cv2.imwrite(temp_path, img)
        pdf.add_page()
        pdf.image(temp_path, x=10, y=10, w=190)
        os.remove(temp_path)

    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.cell(0, 10, f"{report_type} Summary", ln=True, align="C")
    for change in changes:
        # FPDF's core fonts are latin-1 only; replace unencodable characters
        # instead of crashing on non-latin text (the old version had a
        # sanitize_text helper for exactly this).
        line = f'Position {change["position"]}: {change["text"]}'
        safe_line = line.encode("latin-1", errors="replace").decode("latin-1")
        pdf.cell(0, 10, safe_line, ln=True)

    pdf.output(output_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
 
 
 
 
108
 
109
# Gradio interface callback
def pdf_comparison(original_pdf, edited_pdf):
    """Run the comparison and return the report path for download.

    Accepts either gradio file objects (which expose .name) or plain path
    strings — newer gradio versions pass filepaths directly, on which the
    original's unconditional .name access raised AttributeError.
    """
    original_path = getattr(original_pdf, "name", original_pdf)
    edited_path = getattr(edited_pdf, "name", edited_pdf)
    return generate_reports(original_path, edited_path)
113
 
114
# Interface
# NOTE(review): live=True re-runs pdf_comparison on every input change,
# which is expensive for full-PDF processing — confirm this is intended
# rather than a submit-button flow.
interface = gr.Interface(
    fn=pdf_comparison,
    inputs=[gr.File(label="Upload Original PDF"), gr.File(label="Upload Edited PDF")],
    outputs=[gr.File(label="Download Text Changes Report")],
    live=True
)
122
 
123
  if __name__ == "__main__":