SathvikGanta commited on
Commit
8ae85b7
·
verified ·
1 Parent(s): 70c61ed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +158 -95
app.py CHANGED
@@ -7,95 +7,110 @@ from pytesseract import Output
7
  import numpy as np
8
  import os
9
  from fpdf import FPDF
 
10
 
11
-
12
- # Helper: Convert PDFs to images
13
  def convert_pdf_to_images(pdf_path, dpi=300):
14
  images = convert_from_path(pdf_path, dpi=dpi, poppler_path="/usr/bin")
15
  return [cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) for image in images]
16
 
17
-
18
- # Helper: Extract text and bounding boxes
19
- def extract_text_with_boxes(pdf_path):
20
- doc = fitz.open(pdf_path)
21
- text_data = []
22
- for page_num, page in enumerate(doc):
23
- for block in page.get_text("dict")["blocks"]:
24
- # Skip blocks without "lines"
25
- if "lines" not in block:
26
- continue
27
- for line in block["lines"]:
28
- for span in line["spans"]:
29
- if span["text"].strip(): # Skip empty text spans
30
- text_data.append({
31
- "text": span["text"],
32
- "bbox": span["bbox"],
33
- "page": page_num + 1
34
- })
35
- return text_data
36
-
37
-
38
- # Helper: Highlight changes with bounding boxes
39
- def highlight_changes(img, changes):
40
- overlay = img.copy()
41
- for change in changes:
42
- x0, y0, x1, y1 = map(int, change["bbox"])
43
- cv2.rectangle(overlay, (x0, y0), (x1, y1), (0, 0, 255), 2) # Red for changes
44
- cv2.putText(
45
- overlay,
46
- str(change["position"]),
47
- (x0, y0 - 10),
48
- cv2.FONT_HERSHEY_SIMPLEX,
49
- 0.5,
50
- (0, 255, 0),
51
- 1,
52
- cv2.LINE_AA,
53
- )
54
- return overlay
55
-
56
-
57
- # Text comparison logic
58
- def compare_texts(original_text, edited_text):
59
- changes = []
60
- position = 1
61
- for o, e in zip(original_text, edited_text):
62
- if o["text"] != e["text"]:
63
- changes.append({
64
- "text": f'"{e["text"]}" added' if not o["text"] else f'"{o["text"]}" removed',
65
- "bbox": e["bbox"],
66
- "position": position
67
- })
68
- position += 1
69
- return changes
70
-
71
-
72
- # Generate reports for text and visual changes
73
- def generate_reports(original_pdf, edited_pdf):
74
- # Process original and edited PDFs
75
- original_images = convert_pdf_to_images(original_pdf)
76
- edited_images = convert_pdf_to_images(edited_pdf)
77
-
78
- # Extract text
79
- original_text = extract_text_with_boxes(original_pdf)
80
- edited_text = extract_text_with_boxes(edited_pdf)
81
-
82
- # Compare text and visual changes
83
- text_changes = compare_texts(original_text, edited_text)
84
-
85
- # Highlight changes in images
86
- text_highlighted_images = [
87
- highlight_changes(edited, text_changes) for edited in edited_images
88
- ]
89
-
90
- # Generate separate PDF reports
91
- text_pdf_path = "outputs/text_changes.pdf"
92
- generate_pdf_report(text_highlighted_images, text_changes, text_pdf_path, "Text Changes")
93
-
94
- return text_pdf_path
95
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  # Generate PDF report
98
- def generate_pdf_report(images, changes, output_path, report_type):
99
  pdf = FPDF()
100
  for img in images:
101
  temp_path = "temp_image.png"
@@ -103,28 +118,76 @@ def generate_pdf_report(images, changes, output_path, report_type):
103
  pdf.add_page()
104
  pdf.image(temp_path, x=10, y=10, w=190)
105
  os.remove(temp_path)
 
106
  pdf.add_page()
107
  pdf.set_font("Arial", size=12)
108
- pdf.cell(0, 10, f"{report_type} Summary", ln=True, align="C")
109
- for change in changes:
110
- pdf.cell(0, 10, f'Position {change["position"]}: {change["text"]}', ln=True)
 
 
111
  pdf.output(output_path)
 
 
 
 
 
 
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
- # Gradio interface
115
- def pdf_comparison(original_pdf, edited_pdf):
116
- if original_pdf is None or edited_pdf is None:
117
- return "Error: Please upload both PDFs."
118
- text_report = generate_reports(original_pdf.name, edited_pdf.name)
119
- return text_report
120
 
 
 
 
 
121
 
122
- # Interface
123
  interface = gr.Interface(
124
  fn=pdf_comparison,
125
- inputs=[gr.File(label="Upload Original PDF"), gr.File(label="Upload Edited PDF")],
126
- outputs=[gr.File(label="Download Text Changes Report")],
127
- live=True
 
 
 
 
 
 
 
128
  )
129
 
130
  if __name__ == "__main__":
 
7
  import numpy as np
8
  import os
9
  from fpdf import FPDF
10
+ import difflib # For text comparison
11
 
12
# Convert PDFs to images
def convert_pdf_to_images(pdf_path, dpi=300, poppler_path="/usr/bin"):
    """Rasterize every page of *pdf_path* into a BGR OpenCV image.

    Args:
        pdf_path: Path to the PDF file to render.
        dpi: Render resolution passed to pdf2image (default 300).
        poppler_path: Location of the poppler binaries. Defaults to the
            deployment container's /usr/bin; override when running elsewhere.

    Returns:
        list of numpy arrays in BGR channel order (OpenCV convention).
    """
    pages = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)
    # pdf2image yields PIL images in RGB; convert to BGR for OpenCV calls.
    return [cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR) for page in pages]
16
 
17
# Align images
def align_images(img1, img2):
    """Warp *img2* into *img1*'s frame using an ORB/RANSAC homography.

    Args:
        img1: Reference BGR image (original page).
        img2: BGR image to align (edited page).

    Returns:
        img2 warped to img1's size via the estimated homography.

    Raises:
        ValueError: when too few features/matches exist for a reliable
            alignment (previously this crashed inside OpenCV instead).
    """
    gray1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
    gray2 = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)

    orb = cv2.ORB_create()
    kp1, des1 = orb.detectAndCompute(gray1, None)
    kp2, des2 = orb.detectAndCompute(gray2, None)

    # Fail fast: BFMatcher.match raises an opaque OpenCV error when a
    # descriptor set is None (blank page, no detectable features).
    if des1 is None or des2 is None:
        raise ValueError("Alignment failed. Insufficient matches between images.")

    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = sorted(bf.match(des1, des2), key=lambda m: m.distance)

    # Validate BEFORE calling findHomography: it needs at least 4 point
    # pairs and degrades badly with very few. The original checked only
    # after the call, so sparse inputs crashed before the check ran.
    if len(matches) < 10:
        raise ValueError("Alignment failed. Insufficient matches between images.")

    src_pts = np.float32([kp1[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
    dst_pts = np.float32([kp2[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)
    matrix, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
    if matrix is None:
        raise ValueError("Alignment failed. Insufficient matches between images.")

    return cv2.warpPerspective(img2, matrix, (img1.shape[1], img1.shape[0]))
37
+
38
# Compare visual changes
def compare_visual_changes(orig_img, edit_img, start_position):
    """Box and number the pixel regions where two aligned pages differ.

    Args:
        orig_img: BGR image of the original page.
        edit_img: BGR image of the edited page, already aligned to orig_img.
        start_position: First number to assign to a detected region.

    Returns:
        (annotated copy of edit_img, list of (position, message) tuples,
        next unused position number).
    """
    # Absolute difference -> grayscale -> Gaussian blur to suppress speckle.
    gray_diff = cv2.cvtColor(cv2.absdiff(orig_img, edit_img), cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray_diff, (5, 5), 0)

    # Binarize, then morphologically close so nearby changed pixels merge.
    _, mask = cv2.threshold(blurred, 40, 255, cv2.THRESH_BINARY)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    annotated = edit_img.copy()
    detected = []
    position_counter = start_position

    for contour in contours:
        # Tiny regions are almost always rendering/scan noise — skip them.
        if cv2.contourArea(contour) <= 100:
            continue
        x, y, w, h = cv2.boundingRect(contour)
        cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 0, 255), 2)
        cv2.putText(annotated, str(position_counter), (x, y - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
        detected.append((position_counter, f'Visual change detected at position {position_counter}'))
        position_counter += 1

    return annotated, detected, position_counter
70
+
71
# Compare text changes with bounding boxes
def compare_text_changes_with_boxes(orig_img, edit_img, start_position):
    """OCR both pages, diff the token streams, and box each changed token.

    Args:
        orig_img: BGR image of the original page.
        edit_img: BGR image of the edited page, already aligned to orig_img.
        start_position: First number to assign to a detected change.

    Returns:
        (annotated copy of edit_img, list of (position, message) tuples,
        next unused position number).
    """
    orig_data = pytesseract.image_to_data(orig_img, output_type=Output.DICT)
    edit_data = pytesseract.image_to_data(edit_img, output_type=Output.DICT)

    # Round-trip through join/splitlines so ndiff sees one token per line.
    orig_tokens = "\n".join(orig_data['text']).splitlines()
    edit_tokens = "\n".join(edit_data['text']).splitlines()

    annotated = edit_img.copy()
    detected = []
    position_counter = start_position

    def _mark(data, token, message):
        # Box *token* using its OCR geometry and record *message*.
        # NOTE(review): .index() finds only the FIRST occurrence of a
        # repeated token, so duplicates share one box — confirm acceptable.
        nonlocal position_counter
        if token not in data['text']:
            return
        i = data['text'].index(token)
        x, y = data['left'][i], data['top'][i]
        w, h = data['width'][i], data['height'][i]
        cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 0, 255), 2)
        cv2.putText(annotated, str(position_counter), (x, y - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
        detected.append((position_counter, message))
        position_counter += 1

    for line in difflib.ndiff(orig_tokens, edit_tokens):
        if line.startswith("+ "):  # Added text
            token = line[2:]
            _mark(edit_data, token, f'"{token}" added at position {position_counter}')
        elif line.startswith("- "):  # Removed text
            token = line[2:]
            _mark(orig_data, token, f'"{token}" removed at position {position_counter}')

    return annotated, detected, position_counter
107
+
108
# Sanitize text for PDF compatibility
def sanitize_text(text):
    """Replace every character outside Latin-1 with '?' so FPDF can render it."""
    # Equivalent to encode('latin-1', errors='replace').decode('latin-1'):
    # code points 0-255 survive, everything else becomes '?'.
    return "".join(ch if ord(ch) < 256 else "?" for ch in text)
111
 
112
  # Generate PDF report
113
+ def generate_report(images, changes, title, output_path):
114
  pdf = FPDF()
115
  for img in images:
116
  temp_path = "temp_image.png"
 
118
  pdf.add_page()
119
  pdf.image(temp_path, x=10, y=10, w=190)
120
  os.remove(temp_path)
121
+
122
  pdf.add_page()
123
  pdf.set_font("Arial", size=12)
124
+ pdf.cell(0, 10, sanitize_text(title), ln=True, align="C")
125
+ pdf.ln(10)
126
+ for _, change in changes:
127
+ pdf.cell(0, 10, sanitize_text(change), ln=True)
128
+
129
  pdf.output(output_path)
130
+ return output_path
131
+
132
+ # Perform visual and text comparisons separately
133
+ def generate_separate_comparisons(original_pdf, edited_pdf):
134
+ original_images = convert_pdf_to_images(original_pdf)
135
+ edited_images = convert_pdf_to_images(edited_pdf)
136
 
137
+ # Visual comparison
138
+ visual_combined_images = []
139
+ visual_changes = []
140
+ position_counter = 1
141
+ for orig_img, edit_img in zip(original_images, edited_images):
142
+ aligned_img = align_images(orig_img, edit_img)
143
+ highlighted_img, page_visual_changes, position_counter = compare_visual_changes(
144
+ orig_img, aligned_img, position_counter
145
+ )
146
+ visual_changes.extend(page_visual_changes)
147
+ visual_combined_images.append(np.hstack((orig_img, highlighted_img)))
148
+
149
+ # Generate visual changes report
150
+ visual_report_path = generate_report(
151
+ visual_combined_images, visual_changes, "Visual Changes", "outputs/visual_changes.pdf"
152
+ )
153
+
154
+ # Text comparison
155
+ text_combined_images = []
156
+ text_changes = []
157
+ position_counter = 1
158
+ for orig_img, edit_img in zip(original_images, edited_images):
159
+ aligned_img = align_images(orig_img, edit_img)
160
+ highlighted_img, page_text_changes, position_counter = compare_text_changes_with_boxes(
161
+ orig_img, aligned_img, position_counter
162
+ )
163
+ text_changes.extend(page_text_changes)
164
+ text_combined_images.append(np.hstack((orig_img, highlighted_img)))
165
 
166
+ # Generate text changes report
167
+ text_report_path = generate_report(
168
+ text_combined_images, text_changes, "Text Changes", "outputs/text_changes.pdf"
169
+ )
170
+
171
+ return visual_report_path, text_report_path
172
 
173
# Gradio interface function
def pdf_comparison(original_pdf, edited_pdf):
    """Gradio handler: validate both uploads, then return the report paths.

    Args:
        original_pdf: Gradio File for the original PDF (has a .name path).
        edited_pdf: Gradio File for the edited PDF.

    Returns:
        (visual_report_path, text_report_path).

    Raises:
        ValueError: when either upload is missing. Without this guard a
            missing file crashed with AttributeError on `.name`.
    """
    if original_pdf is None or edited_pdf is None:
        raise ValueError("Please upload both the original and the edited PDF.")
    visual_path, text_path = generate_separate_comparisons(original_pdf.name, edited_pdf.name)
    return visual_path, text_path
177
 
178
# Gradio interface: two PDF uploads in, two downloadable reports out.
interface = gr.Interface(
    fn=pdf_comparison,
    inputs=[
        gr.File(label="Upload Original PDF", file_types=[".pdf"]),
        gr.File(label="Upload Edited PDF", file_types=[".pdf"]),
    ],
    outputs=[
        gr.File(label="Download Visual Changes Report"),
        gr.File(label="Download Text Changes Report"),
    ],
    title="PDF Comparison Tool with Separate Comparisons",
    description=(
        "Upload two PDFs: the original and the edited version. "
        "The tool generates separate reports for visual and text changes."
    ),
)
192
 
193
  if __name__ == "__main__":