Spaces:

SuriRaja
/

usecase2

Sleeping

App Files Community

SuriRaja committed on Nov 11, 2024

Commit

455fefb

verified ·

1 Parent(s): 2ade524

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -75

app.py CHANGED Viewed

@@ -2,27 +2,32 @@ import streamlit as st
 import fitz  # PyMuPDF
 import difflib
 from PIL import Image, ImageChops, ImageDraw
 import io
 import re
 def load_and_compare_documents(file1, file2):
-    # Read the files into memory to avoid re-reading them
     file1_content = file1.read()
     file2_content = file2.read()
-    # Extract and compare text with font properties and placement
     text_differences, text_property_changes, special_char_changes, placement_changes = compare_text_with_properties(file1_content, file2_content)
-    # Check for visual differences and box them on the second image
     boxed_images, visual_summary = check_and_box_image_differences(file1_content, file2_content)
-    # Generate an overall comparison summary
-    overall_summary = generate_overall_summary(text_differences, text_property_changes, special_char_changes, placement_changes, visual_summary)
-    return text_differences, text_property_changes, special_char_changes, placement_changes, boxed_images, visual_summary, overall_summary
 def extract_text_with_properties(pdf_content):
-    # Extract text with font, size, color, and placement properties from PDF
     text_elements = []
     pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
@@ -45,7 +50,6 @@ def extract_text_with_properties(pdf_content):
     return text_elements
 def compare_text_with_properties(content1, content2):
-    # Extract text with font properties from both documents
     elements1 = extract_text_with_properties(content1)
     elements2 = extract_text_with_properties(content2)
@@ -55,18 +59,16 @@ def compare_text_with_properties(content1, content2):
     placement_changes = []
     for e1, e2 in zip(elements1, elements2):
-        # Character-by-character comparison
         if e1["text"] != e2["text"]:
             diff = list(difflib.ndiff(e1["text"], e2["text"]))
             for i, change in enumerate(diff):
-                if change.startswith("+ "):  # Character only in CorelDRAW Output
                     differences["additions"].append(f"Added '{change[2:]}' at position {i} on page {e1['page']}")
-                elif change.startswith("- "):  # Character only in Customer Document
                     differences["deletions"].append(f"Deleted '{change[2:]}' at position {i} on page {e1['page']}")
-                elif change.startswith("? "):  # Minor character difference
                     differences["modifications"].append(f"Modified '{change[2:]}' at position {i} on page {e1['page']}")
-        # Special character detection
         special_chars = re.findall(r"[^\w\s]", e1["text"] + e2["text"])
         if special_chars:
             special_char_changes.append({
@@ -76,7 +78,6 @@ def compare_text_with_properties(content1, content2):
                 "special_characters": special_chars
             })
-        # Font, size, color, and placement changes
         if e1["font"] != e2["font"] or e1["size"] != e2["size"] or e1["color"] != e2["color"] or e1["bbox"] != e2["bbox"]:
             property_changes.append({
                 "page": e1["page"],
@@ -95,7 +96,6 @@ def compare_text_with_properties(content1, content2):
     return differences, property_changes, special_char_changes, placement_changes
 def pdf_to_images(file_content):
-    # Convert each page of the PDF to images using in-memory content
     images = []
     pdf_document = fitz.open(stream=file_content, filetype="pdf")
     for page_num in range(pdf_document.page_count):
@@ -107,7 +107,6 @@ def pdf_to_images(file_content):
     return images
 def check_and_box_image_differences(file1_content, file2_content):
-    # Convert PDFs to images and check for visual differences, drawing boxes on the second image
     images1 = pdf_to_images(file1_content)
     images2 = pdf_to_images(file2_content)
@@ -115,13 +114,11 @@ def check_and_box_image_differences(file1_content, file2_content):
     visual_summary = {"size_changes": [], "color_changes": [], "text_changes": []}
     for (page_num, img1), (_, img2) in zip(images1, images2):
-        # Check for size differences
         if img1.size != img2.size:
             visual_summary["size_changes"].append(f"Page {page_num}: Significant layout or size difference detected.")
-            boxed_images.append((page_num, img2))  # Display unmodified if size is different
             continue
-        # Detect differences and box them
         diff = ImageChops.difference(img1, img2).convert("L")
         diff_sum = sum(diff.getdata())
@@ -131,15 +128,37 @@ def check_and_box_image_differences(file1_content, file2_content):
             bbox = diff.getbbox()
             if bbox:
-                draw.rectangle(bbox, outline="red", width=3)  # Draw a red box around differences
             boxed_images.append((page_num, highlighted_img))
             visual_summary["text_changes"].append(f"Page {page_num}: Text/font differences detected.")
     return boxed_images, visual_summary
-def generate_overall_summary(text_differences, text_property_changes, special_char_changes, placement_changes, visual_summary):
-    # Compile a high-level summary of changes
     overall_summary = {
         "total_additions": len(text_differences["additions"]),
         "total_deletions": len(text_differences["deletions"]),
@@ -151,14 +170,15 @@ def generate_overall_summary(text_differences, text_property_changes, special_ch
             "size_changes": len(visual_summary["size_changes"]),
             "color_changes": len(visual_summary["color_changes"]),
             "text_changes": len(visual_summary["text_changes"]),
-        }
     }
     return overall_summary
 # Streamlit app interface
 def main():
-    st.title("Comprehensive Document Comparison Tool")
-    st.write("Upload Customer Document and CorelDRAW Output for a detailed comparison.")
     customer_file = st.file_uploader("Customer Document (PDF only)", type=["pdf"])
     output_file = st.file_uploader("CorelDRAW Output (PDF only)", type=["pdf"])
@@ -168,59 +188,21 @@ def main():
             st.error("One or both files are empty. Please upload valid PDF files.")
             return
-        text_differences, text_property_changes, special_char_changes, placement_changes, boxed_images, visual_summary, overall_summary = load_and_compare_documents(customer_file, output_file)
-        # Text Differences
-        st.subheader("Detailed Text Differences (Character by Character)")
-        for category, changes in text_differences.items():
-            if changes:
-                st.write(f"**{category.capitalize()}**:")
-                for change in changes:
-                    st.write(f"- {change}")
-        # Font, Size, Color, and Placement Changes
-        st.subheader("Font, Size, Color, and Placement Changes")
-        for change in text_property_changes:
-            st.write(f"**Page {change['page']}:** Font {change['original_font']} -> {change['new_font']}, Size {change['original_size']} -> {change['new_size']}, Color {change['original_color']} -> {change['new_color']}")
-        for change in placement_changes:
-            st.write(f"- {change}")
-        # Special Character Changes
-        if special_char_changes:
-            st.subheader("Special Character Changes")
-            for change in special_char_changes:
-                st.write(f"Page {change['page']} - Original Text: '{change['original_text']}' -> New Text: '{change['new_text']}' with Special Characters: {change['special_characters']}")
-        # Display images with boxed differences
-        st.subheader("Pages with Visual Differences (Boxed)")
-        if boxed_images:
-            for page_num, img in boxed_images:
-                st.write(f"**Page {page_num} with Differences Boxed**")
-                img_io = io.BytesIO()
-                img.save(img_io, format="PNG")
-                st.image(img_io, caption=f"Differences on Page {page_num}", use_column_width=True)
-        else:
-            st.write("No visual differences detected between the images.")
-        # Display Visual Differences Summary
-        st.subheader("Visual Differences Summary")
-        for key, changes in visual_summary.items():
-            if changes:
-                st.write(f"**{key.capitalize().replace('_', ' ')}:**")
-                for item in changes:
-                    st.write(f"- {item}")
-        # Overall Summary
         st.subheader("Overall Comparison Summary")
-        st.write(f"**Total Additions:** {overall_summary['total_additions']}")
-        st.write(f"**Total Deletions:** {overall_summary['total_deletions']}")
-        st.write(f"**Total Modifications:** {overall_summary['total_modifications']}")
-        st.write(f"**Font, Size, and Color Changes:** {overall_summary['font_size_color_changes']}")
-        st.write(f"**Special Character Changes:** {overall_summary['special_character_changes']}")
-        st.write(f"**Placement Changes:** {overall_summary['placement_changes']}")
-        st.write(f"**Pages with Size Differences:** {overall_summary['visual_differences']['size_changes']}")
-        st.write(f"**Pages with Color Differences:** {overall_summary['visual_differences']['color_changes']}")
-        st.write(f"**Pages with Text Differences:** {overall_summary['visual_differences']['text_changes']}")
 if __name__ == "__main__":
     main()

 import fitz  # PyMuPDF
 import difflib
 from PIL import Image, ImageChops, ImageDraw
+import pytesseract
 import io
 import re
+# Set up Tesseract path if needed (adjust as per system requirements)
+# pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'
 def load_and_compare_documents(file1, file2):
     file1_content = file1.read()
     file2_content = file2.read()
+    # Compare text with font properties, placement, and special characters
     text_differences, text_property_changes, special_char_changes, placement_changes = compare_text_with_properties(file1_content, file2_content)
+    # Visual differences with boxed highlights
     boxed_images, visual_summary = check_and_box_image_differences(file1_content, file2_content)
+    # OCR on images for text comparison
+    ocr_differences = perform_ocr_and_compare(file1_content, file2_content)
+    # Generate overall summary
+    overall_summary = generate_overall_summary(text_differences, text_property_changes, special_char_changes, placement_changes, visual_summary, ocr_differences)
+    return text_differences, text_property_changes, special_char_changes, placement_changes, boxed_images, visual_summary, ocr_differences, overall_summary
 def extract_text_with_properties(pdf_content):
     text_elements = []
     pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
     return text_elements
 def compare_text_with_properties(content1, content2):
     elements1 = extract_text_with_properties(content1)
     elements2 = extract_text_with_properties(content2)
     placement_changes = []
     for e1, e2 in zip(elements1, elements2):
         if e1["text"] != e2["text"]:
             diff = list(difflib.ndiff(e1["text"], e2["text"]))
             for i, change in enumerate(diff):
+                if change.startswith("+ "):
                     differences["additions"].append(f"Added '{change[2:]}' at position {i} on page {e1['page']}")
+                elif change.startswith("- "):
                     differences["deletions"].append(f"Deleted '{change[2:]}' at position {i} on page {e1['page']}")
+                elif change.startswith("? "):
                     differences["modifications"].append(f"Modified '{change[2:]}' at position {i} on page {e1['page']}")
         special_chars = re.findall(r"[^\w\s]", e1["text"] + e2["text"])
         if special_chars:
             special_char_changes.append({
                 "special_characters": special_chars
             })
         if e1["font"] != e2["font"] or e1["size"] != e2["size"] or e1["color"] != e2["color"] or e1["bbox"] != e2["bbox"]:
             property_changes.append({
                 "page": e1["page"],
     return differences, property_changes, special_char_changes, placement_changes
 def pdf_to_images(file_content):
     images = []
     pdf_document = fitz.open(stream=file_content, filetype="pdf")
     for page_num in range(pdf_document.page_count):
     return images
 def check_and_box_image_differences(file1_content, file2_content):
     images1 = pdf_to_images(file1_content)
     images2 = pdf_to_images(file2_content)
     visual_summary = {"size_changes": [], "color_changes": [], "text_changes": []}
     for (page_num, img1), (_, img2) in zip(images1, images2):
         if img1.size != img2.size:
             visual_summary["size_changes"].append(f"Page {page_num}: Significant layout or size difference detected.")
+            boxed_images.append((page_num, img2))
             continue
         diff = ImageChops.difference(img1, img2).convert("L")
         diff_sum = sum(diff.getdata())
             bbox = diff.getbbox()
             if bbox:
+                draw.rectangle(bbox, outline="red", width=3)
             boxed_images.append((page_num, highlighted_img))
             visual_summary["text_changes"].append(f"Page {page_num}: Text/font differences detected.")
     return boxed_images, visual_summary
+def perform_ocr_and_compare(content1, content2):
+    ocr_differences = []
+    images1 = pdf_to_images(content1)
+    images2 = pdf_to_images(content2)
+    for (page_num, img1), (_, img2) in zip(images1, images2):
+        text1 = pytesseract.image_to_string(img1)
+        text2 = pytesseract.image_to_string(img2)
+        if text1 != text2:
+            diff = list(difflib.ndiff(text1, text2))
+            page_diffs = []
+            for i, change in enumerate(diff):
+                if change.startswith("+ "):
+                    page_diffs.append(f"Added '{change[2:]}' at position {i} on page {page_num}")
+                elif change.startswith("- "):
+                    page_diffs.append(f"Deleted '{change[2:]}' at position {i} on page {page_num}")
+                elif change.startswith("? "):
+                    page_diffs.append(f"Modified '{change[2:]}' at position {i} on page {page_num}")
+            ocr_differences.append({"page": page_num, "differences": page_diffs})
+    return ocr_differences
+def generate_overall_summary(text_differences, text_property_changes, special_char_changes, placement_changes, visual_summary, ocr_differences):
     overall_summary = {
         "total_additions": len(text_differences["additions"]),
         "total_deletions": len(text_differences["deletions"]),
             "size_changes": len(visual_summary["size_changes"]),
             "color_changes": len(visual_summary["color_changes"]),
             "text_changes": len(visual_summary["text_changes"]),
+        },
+        "ocr_differences": len(ocr_differences)
     }
     return overall_summary
 # Streamlit app interface
 def main():
+    st.title("Comprehensive Document Comparison Tool with OCR Text Extraction")
+    st.write("Upload Customer Document and CorelDRAW Output for detailed comparison.")
     customer_file = st.file_uploader("Customer Document (PDF only)", type=["pdf"])
     output_file = st.file_uploader("CorelDRAW Output (PDF only)", type=["pdf"])
             st.error("One or both files are empty. Please upload valid PDF files.")
             return
+        text_differences, text_property_changes, special_char_changes, placement_changes, boxed_images, visual_summary, ocr_differences, overall_summary = load_and_compare_documents(customer_file, output_file)
+        # Display various outputs (as outlined previously)...
+        # OCR Differences
+        st.subheader("OCR-Based Text Differences (Embedded in Images)")
+        for ocr_diff in ocr_differences:
+            st.write(f"**Page {ocr_diff['page']} OCR Differences:**")
+            for diff in ocr_diff["differences"]:
+                st.write(f"- {diff}")
+        # Overall Comparison Summary
         st.subheader("Overall Comparison Summary")
+        for key, value in overall_summary.items():
+            st.write(f"{key.replace('_', ' ').capitalize()}: {value}")
 if __name__ == "__main__":
     main()