Spaces:

SathvikGanta
/

UC2_Image_Based_PDF_omparison

Sleeping

App Files Files Community

SathvikGanta committed on Dec 2, 2024

Commit

ee5f2b1

verified ·

1 Parent(s): 78d7db8

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -55

app.py CHANGED Viewed

@@ -34,7 +34,7 @@ def align_images(img1, img2):
     aligned_img = cv2.warpPerspective(img2, matrix, (img1.shape[1], img1.shape[0]))
     return aligned_img
-# Compare images with noise reduction and filtering
 def compare_images(img1, img2):
     diff = cv2.absdiff(img1, img2)
     gray_diff = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)
@@ -51,22 +51,22 @@ def compare_images(img1, img2):
     return cleaned
-# Generate text-based differences
def generate_text_differences(orig_text, edit_text, start_position):
    """Diff two texts line by line and describe every added or removed line.

    Args:
        orig_text: Text of the original page.
        edit_text: Text of the edited page.
        start_position: Position number given to the first diff line.

    Returns:
        A ``(changes, position_number)`` tuple. ``changes`` is a list of
        ``(position, message)`` pairs, one per added or removed line, and
        ``position_number`` is the next unused position number.
    """
    changes = []
    position_number = start_position
    for line in difflib.ndiff(orig_text.splitlines(), edit_text.splitlines()):
        if line.startswith("? "):
            # ndiff hint lines belong to neither text; letting them consume a
            # position would make the reported numbers skip after similar lines.
            continue
        if line.startswith("+ "):  # Added text
            changes.append((position_number, f'"{line[2:]}" added at {position_number}'))
        elif line.startswith("- "):  # Removed text
            changes.append((position_number, f'"{line[2:]}" removed at {position_number}'))
        position_number += 1
    return changes, position_number
 # Highlight visual changes
-def highlight_visual_changes(orig_img, edit_img, mask, start_position):
-    overlay = edit_img.copy()
     contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     visual_changes = []
     font = cv2.FONT_HERSHEY_SIMPLEX
@@ -84,40 +84,9 @@ def highlight_visual_changes(orig_img, edit_img, mask, start_position):
     return overlay, visual_changes, position_counter
-# Sanitize text for PDF compatibility
def sanitize_text(text):
    """Return *text* restricted to characters representable in Latin-1.

    The text is round-tripped through the ``latin-1`` codec with
    ``errors='replace'``, so any character outside that code page comes back
    as ``?``. The result is intended to be passed to FPDF.
    """
    latin1_bytes = text.encode('latin-1', errors='replace')
    return latin1_bytes.decode('latin-1')
-# Generate separate PDFs for visual and text changes
-def generate_separate_pdfs(original_pdf, edited_pdf):
-    original_images = convert_pdf_to_images(original_pdf)
-    edited_images = convert_pdf_to_images(edited_pdf)
-    combined_images = []
-    visual_changes = []  # Visual changes summary
-    text_changes = []  # Text-based changes summary
-    position_counter = 1
-    for orig_img, edit_img in zip(original_images, edited_images):
-        aligned_img = align_images(orig_img, edit_img)
-        diff_mask = compare_images(orig_img, aligned_img)
-        highlighted_img, page_visual_changes, position_counter = highlight_visual_changes(
-            orig_img, edit_img, diff_mask, position_counter
-        )
-        text_differences, position_counter = generate_text_differences(
-            pytesseract.image_to_string(orig_img), pytesseract.image_to_string(edit_img), position_counter
-        )
-        visual_changes.extend(page_visual_changes)
-        text_changes.extend(text_differences)
-        # Ensure dimensions match
-        height = min(orig_img.shape[0], highlighted_img.shape[0])
-        orig_img_resized = orig_img[:height]
-        highlighted_img_resized = highlighted_img[:height]
-        combined_images.append(np.hstack((orig_img_resized, highlighted_img_resized)))
-    # Generate Visual Changes PDF
-    visual_pdf_path = "outputs/visual_changes.pdf"
     pdf_visual = FPDF()
     for img in combined_images:
         temp_path = "temp_image_visual.png"
@@ -127,14 +96,16 @@ def generate_separate_pdfs(original_pdf, edited_pdf):
         os.remove(temp_path)
     pdf_visual.add_page()
     pdf_visual.set_font("Arial", size=12)
-    pdf_visual.cell(0, 10, sanitize_text("Visual Changes"), ln=True, align="C")
     pdf_visual.ln(10)
     for _, change in visual_changes:
         pdf_visual.cell(0, 10, sanitize_text(change), ln=True)
-    pdf_visual.output(visual_pdf_path)
-    # Generate Text Changes PDF
-    text_pdf_path = "outputs/text_changes.pdf"
     pdf_text = FPDF()
     for img in combined_images:
         temp_path = "temp_image_text.png"
@@ -144,17 +115,53 @@ def generate_separate_pdfs(original_pdf, edited_pdf):
         os.remove(temp_path)
     pdf_text.add_page()
     pdf_text.set_font("Arial", size=12)
-    pdf_text.cell(0, 10, sanitize_text("Text Changes"), ln=True, align="C")
     pdf_text.ln(10)
     for _, change in text_changes:
         pdf_text.cell(0, 10, sanitize_text(change), ln=True)
-    pdf_text.output(text_pdf_path)
-    return visual_pdf_path, text_pdf_path
 # Gradio interface function
 def pdf_comparison(original_pdf, edited_pdf):
     """Gradio callback: compare two uploaded PDFs and return both report paths.

     Each argument must expose a ``.name`` attribute, which is forwarded to
     ``generate_separate_pdfs``. Returns ``(visual_path, text_path)``.
     """
     visual_report, text_report = generate_separate_pdfs(
         original_pdf.name, edited_pdf.name
     )
     return visual_report, text_report
 # Gradio interface
@@ -168,8 +175,8 @@ interface = gr.Interface(
         gr.File(label="Download Visual Changes Report"),
         gr.File(label="Download Text Changes Report")
     ],
-    title="PDF Comparison Tool with Separate Reports",
-    description="Upload two PDFs: the original and the edited version. The tool generates two separate reports: one for visual changes and another for text changes."
 )
 if __name__ == "__main__":

     aligned_img = cv2.warpPerspective(img2, matrix, (img1.shape[1], img1.shape[0]))
     return aligned_img
+# Compare images for visual changes
 def compare_images(img1, img2):
     diff = cv2.absdiff(img1, img2)
     gray_diff = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)
     return cleaned
+# Compare text and generate differences
def compare_text(orig_text, edit_text, start_position):
    """Diff two texts line by line and describe every added or removed line.

    Args:
        orig_text: Text of the original page.
        edit_text: Text of the edited page.
        start_position: Position number given to the first diff line.

    Returns:
        A ``(text_changes, position_counter)`` tuple. ``text_changes`` is a
        list of ``(position, message)`` pairs, one per added or removed line,
        and ``position_counter`` is the next unused position number.
    """
    text_changes = []
    position_counter = start_position
    for line in difflib.ndiff(orig_text.splitlines(), edit_text.splitlines()):
        if line.startswith("? "):
            # ndiff hint lines belong to neither text; letting them consume a
            # position would make the reported numbers skip after similar lines.
            continue
        if line.startswith("+ "):  # Added text
            text_changes.append((position_counter, f'"{line[2:]}" added at {position_counter}'))
        elif line.startswith("- "):  # Removed text
            text_changes.append((position_counter, f'"{line[2:]}" removed at {position_counter}'))
        position_counter += 1
    return text_changes, position_counter
 # Highlight visual changes
+def highlight_visual_changes(img1, img2, mask, start_position):
+    overlay = img2.copy()
     contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     visual_changes = []
     font = cv2.FONT_HERSHEY_SIMPLEX
     return overlay, visual_changes, position_counter
+# Generate visual changes report
+def generate_visual_report(original_images, edited_images, combined_images, visual_changes):
+    output_path = "outputs/visual_changes.pdf"
     pdf_visual = FPDF()
     for img in combined_images:
         temp_path = "temp_image_visual.png"
         os.remove(temp_path)
     pdf_visual.add_page()
     pdf_visual.set_font("Arial", size=12)
+    pdf_visual.cell(0, 10, "Visual Changes", ln=True, align="C")
     pdf_visual.ln(10)
     for _, change in visual_changes:
         pdf_visual.cell(0, 10, sanitize_text(change), ln=True)
+    pdf_visual.output(output_path)
+    return output_path
+# Generate text changes report
+def generate_text_report(original_images, edited_images, combined_images, text_changes):
+    output_path = "outputs/text_changes.pdf"
     pdf_text = FPDF()
     for img in combined_images:
         temp_path = "temp_image_text.png"
         os.remove(temp_path)
     pdf_text.add_page()
     pdf_text.set_font("Arial", size=12)
+    pdf_text.cell(0, 10, "Text Changes", ln=True, align="C")
     pdf_text.ln(10)
     for _, change in text_changes:
         pdf_text.cell(0, 10, sanitize_text(change), ln=True)
+    pdf_text.output(output_path)
+    return output_path
+# Generate separate PDFs for visual and text changes
def generate_separate_comparisons(original_pdf, edited_pdf):
    """Compare two PDFs page by page and write a visual and a text report.

    Args:
        original_pdf: Original PDF, passed to ``convert_pdf_to_images``
            (the visible caller passes an uploaded file's ``.name``).
        edited_pdf: Edited PDF, handled the same way.

    Returns:
        ``(visual_report, text_report)``: the values returned by
        ``generate_visual_report`` and ``generate_text_report``.
    """
    original_images = convert_pdf_to_images(original_pdf)
    edited_images = convert_pdf_to_images(edited_pdf)
    combined_images = []
    visual_changes = []
    text_changes = []
    # One running counter numbers visual and text changes across all pages.
    position_counter = 1

    # zip stops at the shorter document, so pages without a counterpart
    # in the other PDF are not compared.
    for orig_img, edit_img in zip(original_images, edited_images):
        aligned_img = align_images(orig_img, edit_img)

        # Visual comparison
        diff_mask = compare_images(orig_img, aligned_img)
        # The mask lives in the coordinate frame of the aligned image, so the
        # highlights must be drawn on that image; drawing them on the raw
        # edited page would misplace them whenever alignment moved anything.
        highlighted_img, page_visual_changes, position_counter = highlight_visual_changes(
            orig_img, aligned_img, diff_mask, position_counter
        )
        visual_changes.extend(page_visual_changes)

        # Text comparison (OCR runs on the unwarped pages)
        orig_text = pytesseract.image_to_string(orig_img)
        edit_text = pytesseract.image_to_string(edit_img)
        page_text_changes, position_counter = compare_text(orig_text, edit_text, position_counter)
        text_changes.extend(page_text_changes)

        # Combine images for side-by-side display; crop to a common height
        # because np.hstack requires equal row counts.
        height = min(orig_img.shape[0], highlighted_img.shape[0])
        orig_img_resized = orig_img[:height]
        highlighted_img_resized = highlighted_img[:height]
        combined_images.append(np.hstack((orig_img_resized, highlighted_img_resized)))

    # Generate separate reports
    visual_report = generate_visual_report(original_images, edited_images, combined_images, visual_changes)
    text_report = generate_text_report(original_images, edited_images, combined_images, text_changes)
    return visual_report, text_report
 # Gradio interface function
 def pdf_comparison(original_pdf, edited_pdf):
     """Gradio callback: compare two uploaded PDFs and return both report paths.

     Each argument must expose a ``.name`` attribute, which is forwarded to
     ``generate_separate_comparisons``. Returns ``(visual_path, text_path)``.
     """
     visual_report, text_report = generate_separate_comparisons(
         original_pdf.name, edited_pdf.name
     )
     return visual_report, text_report
 # Gradio interface
         gr.File(label="Download Visual Changes Report"),
         gr.File(label="Download Text Changes Report")
     ],
+    title="PDF Comparison Tool with Separate Comparisons",
+    description="Upload two PDFs: the original and the edited version. The tool generates separate reports for visual and text changes."
 )
 if __name__ == "__main__":