Spaces:

SuriRaja
/

usecase2

Sleeping

App Files Files Community

SuriRaja committed on Nov 12, 2024

Commit

0daa9c2

verified ·

1 Parent(s): 95fb28f

Update app.py

Browse files

Files changed (1) hide show

app.py +6 -10

app.py CHANGED Viewed

@@ -22,10 +22,10 @@ def load_and_compare_documents(file1, file2):
     # Extract and compare text with font properties, placement, and special characters
     text_differences, text_property_changes, special_char_changes, placement_changes = compare_text_with_properties(file1_content, file2_content)
-    # Perform OCR-based comparison and underline OCR differences on images
     ocr_differences, marked_images = perform_ocr_and_compare(file1_content, file2_content)
-    # Generate a PDF with marked OCR differences and positions
     pdf_buffer = create_pdf_with_differences(marked_images, ocr_differences)
     # Compile an overall summary of differences
@@ -54,7 +54,7 @@ def compare_text_with_properties(content1, content2):
     placement_changes = []
     for e1, e2 in zip(elements1, elements2):
-        if e1["text"] != e2["text"]:
             diff = list(difflib.ndiff(e1["text"], e2["text"]))
             for i, change in enumerate(diff):
                 if change.startswith("+ "):
@@ -130,7 +130,7 @@ def perform_ocr_and_compare(content1, content2):
         marked_img = img2.copy()
         draw = ImageDraw.Draw(marked_img)
-        if text1 != text2:
             diff = list(difflib.ndiff(text1, text2))
             page_diffs = []
             diff_index = 1  # Start index for marking
@@ -148,7 +148,7 @@ def perform_ocr_and_compare(content1, content2):
             # Mark OCR-detected differences and indices on image
             for result in ocr_reader.readtext(img2_np):
                 bbox, detected_text = result[0], result[1]
-                if detected_text in text2 and detected_text not in text1:
                     # Flatten bounding box
                     flattened_bbox = [coord for point in bbox for coord in point]
                     # Draw a rectangle around the OCR difference and label with index
@@ -161,12 +161,10 @@ def perform_ocr_and_compare(content1, content2):
     return ocr_differences, marked_images
 def create_pdf_with_differences(marked_images, ocr_differences):
-    # Use BytesIO to create an in-memory PDF file
     pdf_buffer = BytesIO()
     c = canvas.Canvas(pdf_buffer, pagesize=letter)
     for page_num, img in marked_images.items():
-        # Save the marked image to a temporary file
         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file:
             img.save(temp_img_file, format="PNG")
             temp_img_path = temp_img_file.name
@@ -174,7 +172,7 @@ def create_pdf_with_differences(marked_images, ocr_differences):
         # Draw the saved image on the PDF
         c.drawImage(temp_img_path, 0, 0, width=letter[0], height=letter[1])
-        # Add OCR differences to PDF
         c.setFont("Helvetica", 10)
         c.drawString(10, 750, f"Page {page_num} OCR Differences:")
         y_position = 730
@@ -187,7 +185,6 @@ def create_pdf_with_differences(marked_images, ocr_differences):
                     c.drawString(10, y_position, diff)
                     y_position -= 15
-        # Move to the next page and delete the temporary image file
         c.showPage()
         temp_img_file.close()
         try:
@@ -195,7 +192,6 @@ def create_pdf_with_differences(marked_images, ocr_differences):
         except OSError:
             pass
-    # Save the PDF to the in-memory buffer
     c.save()
     pdf_buffer.seek(0)
     return pdf_buffer

     # Extract and compare text with font properties, placement, and special characters
     text_differences, text_property_changes, special_char_changes, placement_changes = compare_text_with_properties(file1_content, file2_content)
+    # Perform OCR-based comparison and mark OCR differences with indices
     ocr_differences, marked_images = perform_ocr_and_compare(file1_content, file2_content)
+    # Generate a PDF with marked OCR differences and observations
     pdf_buffer = create_pdf_with_differences(marked_images, ocr_differences)
     # Compile an overall summary of differences
     placement_changes = []
     for e1, e2 in zip(elements1, elements2):
+        if e1["text"].strip().lower() != e2["text"].strip().lower():  # Enhanced: Case-insensitive, whitespace-trimmed
             diff = list(difflib.ndiff(e1["text"], e2["text"]))
             for i, change in enumerate(diff):
                 if change.startswith("+ "):
         marked_img = img2.copy()
         draw = ImageDraw.Draw(marked_img)
+        if text1.strip().lower() != text2.strip().lower():  # Enhanced: Case-insensitive, whitespace-trimmed
             diff = list(difflib.ndiff(text1, text2))
             page_diffs = []
             diff_index = 1  # Start index for marking
             # Mark OCR-detected differences and indices on image
             for result in ocr_reader.readtext(img2_np):
                 bbox, detected_text = result[0], result[1]
+                if detected_text.strip().lower() in text2.lower() and detected_text.strip().lower() not in text1.lower():
                     # Flatten bounding box
                     flattened_bbox = [coord for point in bbox for coord in point]
                     # Draw a rectangle around the OCR difference and label with index
     return ocr_differences, marked_images
 def create_pdf_with_differences(marked_images, ocr_differences):
     pdf_buffer = BytesIO()
     c = canvas.Canvas(pdf_buffer, pagesize=letter)
     for page_num, img in marked_images.items():
         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file:
             img.save(temp_img_file, format="PNG")
             temp_img_path = temp_img_file.name
         # Draw the saved image on the PDF
         c.drawImage(temp_img_path, 0, 0, width=letter[0], height=letter[1])
+        # Add OCR observations below the image
         c.setFont("Helvetica", 10)
         c.drawString(10, 750, f"Page {page_num} OCR Differences:")
         y_position = 730
                     c.drawString(10, y_position, diff)
                     y_position -= 15
         c.showPage()
         temp_img_file.close()
         try:
         except OSError:
             pass
     c.save()
     pdf_buffer.seek(0)
     return pdf_buffer