Spaces:

SuriRaja
/

usecase2

Sleeping

App Files Files Community

SuriRaja committed on Nov 12, 2024

Commit

c91c330

verified ·

1 Parent(s): f7a769c

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -64

app.py CHANGED Viewed

@@ -19,10 +19,10 @@ def load_and_compare_documents(file1, file2):
     file2_content = file2.read()
     # Perform OCR-based comparison across all pages
-    ocr_differences, marked_images = perform_ocr_and_compare(file1_content, file2_content)
-    # Generate a PDF with marked OCR differences for each page and observation tables
-    pdf_buffer = create_pdf_with_observations(marked_images, ocr_differences)
     # Compile an overall summary of differences
     overall_summary = generate_overall_summary(ocr_differences)
@@ -34,7 +34,7 @@ def pdf_to_images(file_content):
     pdf_document = fitz.open(stream=file_content, filetype="pdf")
     for page_num in range(pdf_document.page_count):
         page = pdf_document.load_page(page_num)
-        pix = page.get_pixmap(dpi=150)  # Higher DPI for clearer images
         img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
         # Preprocess image: adjust brightness, contrast, and apply filter
@@ -50,7 +50,8 @@ def pdf_to_images(file_content):
 def perform_ocr_and_compare(content1, content2):
     ocr_differences = []
-    marked_images = {}
     images1 = pdf_to_images(content1)
     images2 = pdf_to_images(content2)
@@ -62,9 +63,11 @@ def perform_ocr_and_compare(content1, content2):
         text1 = ' '.join([result[1] for result in ocr_reader.readtext(img1_np)])
         text2 = ' '.join([result[1] for result in ocr_reader.readtext(img2_np)])
-        # Duplicate image for marking OCR differences
-        marked_img = img2.copy()
-        draw = ImageDraw.Draw(marked_img)
         if text1.strip().lower() != text2.strip().lower():  # Case-insensitive, whitespace-trimmed
             diff = list(difflib.ndiff(text1, text2))
@@ -81,71 +84,80 @@ def perform_ocr_and_compare(content1, content2):
             ocr_differences.append({"page": page_num, "differences": page_diffs})
-            # Mark OCR-detected differences and indices on image
             for result in ocr_reader.readtext(img2_np):
                 bbox, detected_text = result[0], result[1]
                 if detected_text.strip().lower() in text2.lower() and detected_text.strip().lower() not in text1.lower():
                     flattened_bbox = [coord for point in bbox for coord in point]
-                    draw.rectangle([flattened_bbox[0], flattened_bbox[1], flattened_bbox[2], flattened_bbox[3]], outline="red", width=2)
-                    draw.text((flattened_bbox[0], flattened_bbox[1] - 10), str(diff_index), fill="blue")
                     diff_index += 1
-            marked_images[page_num] = marked_img
-    return ocr_differences, marked_images
-def create_pdf_with_observations(marked_images, ocr_differences):
     pdf_buffer = BytesIO()
-    c = canvas.Canvas(pdf_buffer, pagesize=letter)
-    # Loop through each page to add image with differences and observations
-    for page_num, img in marked_images.items():
-        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file:
-            img.save(temp_img_file, format="PNG")
-            temp_img_path = temp_img_file.name
-        # Draw the saved image on the PDF
-        c.drawImage(temp_img_path, 0, 0, width=letter[0], height=letter[1])
-        c.showPage()
-        temp_img_file.close()
-        try:
-            os.remove(temp_img_path)
-        except OSError:
-            pass
-        # Generate the observation table for each page
-        c.setFont("Helvetica", 10)
-        y_position = 750
-        c.drawString(10, y_position, f"Observation Summary for Page {page_num}:")
-        y_position -= 20
-        # Table data for each page
-        data = {"Additions": [], "Deletions": [], "Modifications": []}
-        for ocr_diff in ocr_differences:
-            if ocr_diff["page"] == page_num:
-                for diff in ocr_diff["differences"]:
-                    if "Added" in diff:
-                        data["Additions"].append(diff)
-                    elif "Deleted" in diff:
-                        data["Deletions"].append(diff)
-                    elif "Modified" in diff:
-                        data["Modifications"].append(diff)
-        # Convert data to DataFrame for formatting
-        df = pd.DataFrame.from_dict(data, orient="index").transpose()
-        column_widths = [150, 150, 150]
-        # Render the DataFrame as a table in the PDF
-        for row in df.itertuples(index=False):
-            for col_index, value in enumerate(row):
-                c.drawString(10 + col_index * column_widths[col_index], y_position, str(value))
-            y_position -= 15
-            if y_position < 50:  # Start a new page if space is running out
-                c.showPage()
-                y_position = 750
-        c.showPage()
     c.save()
     pdf_buffer.seek(0)
@@ -183,8 +195,8 @@ def main():
             st.write(f"{key.replace('_', ' ').capitalize()}: {value}")
         # Provide download link for generated PDF with marked differences
-        st.subheader("Download PDF with Marked OCR Differences and Observations")
-        st.download_button("Download Marked PDF", data=pdf_buffer, file_name="marked_differences_and_observations.pdf", mime="application/pdf")
 if __name__ == "__main__":
     main()

     file2_content = file2.read()
     # Perform OCR-based comparison across all pages
+    ocr_differences, marked_images_1, marked_images_2 = perform_ocr_and_compare(file1_content, file2_content)
+    # Generate a PDF with side-by-side comparisons and observation tables
+    pdf_buffer = create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differences)
     # Compile an overall summary of differences
     overall_summary = generate_overall_summary(ocr_differences)
     pdf_document = fitz.open(stream=file_content, filetype="pdf")
     for page_num in range(pdf_document.page_count):
         page = pdf_document.load_page(page_num)
+        pix = page.get_pixmap(dpi=300)  # High DPI for better zoom capability
         img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
         # Preprocess image: adjust brightness, contrast, and apply filter
 def perform_ocr_and_compare(content1, content2):
     ocr_differences = []
+    marked_images_1 = {}
+    marked_images_2 = {}
     images1 = pdf_to_images(content1)
     images2 = pdf_to_images(content2)
         text1 = ' '.join([result[1] for result in ocr_reader.readtext(img1_np)])
         text2 = ' '.join([result[1] for result in ocr_reader.readtext(img2_np)])
+        # Duplicate images for marking OCR differences
+        marked_img1 = img1.copy()
+        marked_img2 = img2.copy()
+        draw1 = ImageDraw.Draw(marked_img1)
+        draw2 = ImageDraw.Draw(marked_img2)
         if text1.strip().lower() != text2.strip().lower():  # Case-insensitive, whitespace-trimmed
             diff = list(difflib.ndiff(text1, text2))
             ocr_differences.append({"page": page_num, "differences": page_diffs})
+            # Mark OCR-detected differences as boxed highlights on both images
             for result in ocr_reader.readtext(img2_np):
                 bbox, detected_text = result[0], result[1]
                 if detected_text.strip().lower() in text2.lower() and detected_text.strip().lower() not in text1.lower():
                     flattened_bbox = [coord for point in bbox for coord in point]
+                    draw1.rectangle([flattened_bbox[0], flattened_bbox[1], flattened_bbox[2], flattened_bbox[3]], outline="blue", width=2)
+                    draw2.rectangle([flattened_bbox[0], flattened_bbox[1], flattened_bbox[2], flattened_bbox[3]], outline="blue", width=2)
                     diff_index += 1
+            marked_images_1[page_num] = marked_img1
+            marked_images_2[page_num] = marked_img2
+    return ocr_differences, marked_images_1, marked_images_2
+def create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differences):
     pdf_buffer = BytesIO()
+    c = canvas.Canvas(pdf_buffer, pagesize=(letter[0] * 2, letter[1]))  # Adjusted for side-by-side layout
+    # Loop through each page to add side-by-side images and observations
+    for page_num, img1 in marked_images_1.items():
+        img2 = marked_images_2.get(page_num)
+        if img2:
+            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file1:
+                img1.save(temp_img_file1, format="PNG")
+                temp_img_path1 = temp_img_file1.name
+            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file2:
+                img2.save(temp_img_file2, format="PNG")
+                temp_img_path2 = temp_img_file2.name
+            # Draw the saved images side-by-side on the PDF
+            c.drawImage(temp_img_path1, 0, 0, width=letter[0], height=letter[1])
+            c.drawImage(temp_img_path2, letter[0], 0, width=letter[0], height=letter[1])
+            c.showPage()
+            try:
+                os.remove(temp_img_path1)
+                os.remove(temp_img_path2)
+            except OSError:
+                pass
+            # Generate the observation table for each page
+            c.setFont("Helvetica", 10)
+            y_position = 750
+            c.drawString(10, y_position, f"Observation Summary for Page {page_num}:")
+            y_position -= 20
+            # Table data for each page
+            data = {"Additions": [], "Deletions": [], "Modifications": []}
+            for ocr_diff in ocr_differences:
+                if ocr_diff["page"] == page_num:
+                    for diff in ocr_diff["differences"]:
+                        if "Added" in diff:
+                            data["Additions"].append(diff)
+                        elif "Deleted" in diff:
+                            data["Deletions"].append(diff)
+                        elif "Modified" in diff:
+                            data["Modifications"].append(diff)
+            # Convert data to DataFrame for formatting
+            df = pd.DataFrame.from_dict(data, orient="index").transpose()
+            column_widths = [150, 150, 150]
+            # Render the DataFrame as a table in the PDF
+            for row in df.itertuples(index=False):
+                for col_index, value in enumerate(row):
+                    c.drawString(10 + col_index * column_widths[col_index], y_position, str(value))
+                y_position -= 15
+                if y_position < 50:  # Start a new page if space is running out
+                    c.showPage()
+                    y_position = 750
+            c.showPage()
     c.save()
     pdf_buffer.seek(0)
             st.write(f"{key.replace('_', ' ').capitalize()}: {value}")
         # Provide download link for generated PDF with marked differences
+        st.subheader("Download PDF with Side-by-Side Comparisons and Observations")
+        st.download_button("Download Marked PDF", data=pdf_buffer, file_name="side_by_side_comparison.pdf", mime="application/pdf")
 if __name__ == "__main__":
     main()