File size: 8,853 Bytes
df7cb44
85b4e6b
 
acadf13
2e4777d
cace578
acadf13
1a5f5a8
 
9231f24
 
95fb28f
8e65970
2e4777d
 
455fefb
85b4e6b
6003106
 
 
f7a769c
c91c330
a548971
8784b68
c91c330
455fefb
022ec7c
f7a769c
2ade524
f7a769c
85b4e6b
6003106
778fed4
6003106
778fed4
 
c91c330
fe36958
6f819f5
acadf13
6f819f5
 
 
 
acadf13
6f819f5
fe36958
778fed4
 
 
455fefb
 
c91c330
 
455fefb
 
 
 
cace578
 
 
 
 
 
455fefb
c91c330
 
 
 
 
a548971
6f819f5
455fefb
 
1a5f5a8
 
455fefb
 
8784b68
455fefb
8784b68
455fefb
8784b68
a548971
455fefb
a548971
c91c330
a548971
 
0daa9c2
022ec7c
8784b68
 
a548971
c91c330
 
a548971
c91c330
a548971
c91c330
95fb28f
c91c330
455fefb
c91c330
 
 
1a5f5a8
c91c330
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8784b68
c91c330
 
 
8784b68
 
c91c330
 
8784b68
 
c91c330
 
 
 
 
 
 
 
 
 
 
acadf13
1a5f5a8
95fb28f
 
455fefb
f7a769c
2d38d1d
 
 
f7a769c
2ade524
f7a769c
 
 
2ade524
 
 
9aff2fb
85b4e6b
455fefb
 
85b4e6b
9aff2fb
 
85b4e6b
 
6003106
 
 
 
f7a769c
fe36958
2ade524
455fefb
 
1a5f5a8
 
c91c330
 
2ade524
85b4e6b
6f819f5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import streamlit as st
import fitz  # PyMuPDF
import difflib
from PIL import Image, ImageDraw, ImageEnhance, ImageFilter
import easyocr
import numpy as np
import pandas as pd
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import tempfile
import os
from io import BytesIO

# Shared EasyOCR reader configured for English ('en'). It is created once at
# import time and reused by perform_ocr_and_compare for every page.
ocr_reader = easyocr.Reader(['en'])

def load_and_compare_documents(file1, file2):
    """Compare two uploaded PDFs and build the comparison report.

    Args:
        file1: File-like object for the first PDF; its ``read()`` result is
            passed on as the document content.
        file2: File-like object for the second PDF, handled the same way.

    Returns:
        A ``(pdf_buffer, overall_summary)`` tuple: the buffer produced by
        ``create_pdf_with_side_by_side`` and the dictionary produced by
        ``generate_overall_summary``.
    """
    first_bytes, second_bytes = file1.read(), file2.read()

    # OCR every page pair and collect the differences plus the marked-up pages.
    differences, left_pages, right_pages = perform_ocr_and_compare(first_bytes, second_bytes)

    # Render the side-by-side report first, then tally the differences.
    report_buffer = create_pdf_with_side_by_side(left_pages, right_pages, differences)
    return report_buffer, generate_overall_summary(differences)

def pdf_to_images(file_content, dpi=300):
    """Render every page of a PDF to an enhanced PIL image.

    Args:
        file_content: Raw bytes of the PDF document.
        dpi: Rendering resolution passed to ``page.get_pixmap``. Defaults to
            300, the value previously hard-coded here.

    Returns:
        A list of ``(page_number, image)`` tuples in page order, where
        ``page_number`` starts at 1 and ``image`` is an RGB ``PIL.Image``
        with contrast, brightness and sharpening applied.
    """
    images = []
    pdf_document = fitz.open(stream=file_content, filetype="pdf")
    try:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            # alpha=False keeps the samples buffer at 3 bytes per pixel, which
            # is what the "RGB" mode passed to Image.frombytes expects.
            pix = page.get_pixmap(dpi=dpi, alpha=False)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # Preprocess for OCR: boost contrast and brightness, then sharpen edges.
            img = ImageEnhance.Contrast(img).enhance(1.5)
            img = ImageEnhance.Brightness(img).enhance(1.2)
            img = img.filter(ImageFilter.SHARPEN)

            images.append((page_num + 1, img))
    finally:
        # Close the document even if rendering or enhancement fails part-way.
        pdf_document.close()
    return images

def _bbox_to_rect(bbox):
    """Return [x0, y0, x1, y1] enclosing all corner points of an OCR box."""
    xs = [float(point[0]) for point in bbox]
    ys = [float(point[1]) for point in bbox]
    return [min(xs), min(ys), max(xs), max(ys)]


def perform_ocr_and_compare(content1, content2):
    """OCR two PDFs page by page and record where their text differs.

    Args:
        content1: Raw bytes of the first PDF.
        content2: Raw bytes of the second PDF.

    Returns:
        A tuple ``(ocr_differences, marked_images_1, marked_images_2)``.
        ``ocr_differences`` is a list of ``{"page": n, "differences": [...]}``
        dictionaries, one per page whose OCR text differs; each difference
        has ``type``, ``value``, ``index`` and ``description`` keys.
        ``marked_images_1`` and ``marked_images_2`` map the page number of
        each differing page to a copy of that page's image with red boxes
        drawn around text found only in the second PDF.
    """
    ocr_differences = []
    marked_images_1 = {}
    marked_images_2 = {}
    images1 = pdf_to_images(content1)
    images2 = pdf_to_images(content2)

    # NOTE: zip() stops at the shorter document, so trailing pages of the
    # longer PDF are not compared.
    for (page_num, img1), (_, img2) in zip(images1, images2):
        # Run OCR once per image; the second document's results are reused
        # below for their bounding boxes.
        results1 = ocr_reader.readtext(np.array(img1))
        results2 = ocr_reader.readtext(np.array(img2))
        text1 = ' '.join(result[1] for result in results1)
        text2 = ' '.join(result[1] for result in results2)

        # Pages count as equal when they match ignoring case and outer whitespace.
        if text1.strip().lower() == text2.strip().lower():
            continue

        # ndiff receives two strings, so the comparison is character by character.
        page_diffs = []
        for i, change in enumerate(difflib.ndiff(text1, text2)):
            value = change[2:]
            if change.startswith("+ "):
                page_diffs.append({"type": "Added", "value": value, "index": i, "description": f"'{value}' added in second PDF but not in first PDF at position {i}"})
            elif change.startswith("- "):
                page_diffs.append({"type": "Deleted", "value": value, "index": i, "description": f"'{value}' present in first PDF but missing in second PDF at position {i}"})
            elif change.startswith("? "):
                page_diffs.append({"type": "Modified", "value": value, "index": i, "description": f"'{value}' modified at position {i}"})

        ocr_differences.append({"page": page_num, "differences": page_diffs})

        # Draw on copies so the rendered page images themselves stay unmarked.
        marked_img1 = img1.copy()
        marked_img2 = img2.copy()
        draw1 = ImageDraw.Draw(marked_img1)
        draw2 = ImageDraw.Draw(marked_img2)

        # Box every text fragment of the second PDF that does not occur in the
        # first PDF's text. The same coordinates are marked on both images.
        text1_lower = text1.lower()
        for result in results2:
            bbox, detected_text = result[0], result[1]
            if detected_text.strip().lower() not in text1_lower:
                # The box is four corner points; span all of them so the
                # rectangle has real height instead of collapsing to the top edge.
                rect = _bbox_to_rect(bbox)
                draw1.rectangle(rect, outline="red", width=2)
                draw2.rectangle(rect, outline="red", width=2)

        marked_images_1[page_num] = marked_img1
        marked_images_2[page_num] = marked_img2

    return ocr_differences, marked_images_1, marked_images_2

def create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differences):
    """Build a PDF with side-by-side page images and per-page observations.

    For every page present in both image mappings, one double-width page
    shows the two marked images next to each other, followed by one or more
    pages listing that page's difference descriptions and positions.

    Args:
        marked_images_1: Mapping of page number to the first PDF's marked image.
        marked_images_2: Mapping of page number to the second PDF's marked image.
        ocr_differences: List of ``{"page": n, "differences": [...]}``
            dictionaries; each difference supplies ``description`` and ``index``.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the generated PDF.
    """
    page_width, page_height = letter
    left_margin = 10
    top_y = 750
    bottom_y = 50
    column_widths = [350, 100]

    # Each column starts after the combined width of the columns before it.
    column_x = []
    offset = left_margin
    for col_width in column_widths:
        column_x.append(offset)
        offset += col_width

    # Collect the table rows for each page once instead of rescanning per page.
    rows_by_page = {}
    for ocr_diff in ocr_differences:
        rows = rows_by_page.setdefault(ocr_diff["page"], [])
        for diff in ocr_diff["differences"]:
            rows.append((diff["description"], diff["index"]))

    pdf_buffer = BytesIO()
    c = canvas.Canvas(pdf_buffer, pagesize=(page_width * 2, page_height))  # Double width for side-by-side layout

    for page_num, img1 in marked_images_1.items():
        img2 = marked_images_2.get(page_num)
        if img2 is None:
            continue

        # drawImage is given file paths, so the images go through temp files.
        temp_paths = []
        try:
            for img in (img1, img2):
                with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file:
                    # Record the path before saving so a failed save is still cleaned up.
                    temp_paths.append(temp_img_file.name)
                    img.save(temp_img_file, format="PNG")

            c.drawImage(temp_paths[0], 0, 0, width=page_width, height=page_height)
            c.drawImage(temp_paths[1], page_width, 0, width=page_width, height=page_height)
            c.showPage()
        finally:
            for temp_path in temp_paths:
                try:
                    os.remove(temp_path)
                except OSError:
                    # Best-effort cleanup: a leftover temp file must not abort the report.
                    pass

        # Observation table for this page.
        c.setFont("Helvetica", 10)
        y_position = top_y
        c.drawString(left_margin, y_position, f"Observation Summary for Page {page_num}:")
        y_position -= 20

        for row in rows_by_page.get(page_num, []):
            if y_position < bottom_y:
                # Out of space: continue on a new page. showPage() resets the
                # canvas state, so the font has to be selected again.
                c.showPage()
                c.setFont("Helvetica", 10)
                y_position = top_y
            for x_position, value in zip(column_x, row):
                c.drawString(x_position, y_position, str(value))
            y_position -= 15

        c.showPage()

    c.save()
    pdf_buffer.seek(0)
    return pdf_buffer

def generate_overall_summary(ocr_differences):
    """Count added, deleted and modified entries across all pages.

    Args:
        ocr_differences: List of dictionaries, each with a ``differences``
            list whose items carry a ``type`` of "Added", "Deleted" or
            "Modified". Any other type is not counted.

    Returns:
        A dictionary with the integer totals under ``total_additions``,
        ``total_deletions`` and ``total_modifications``.
    """
    additions = 0
    deletions = 0
    modifications = 0

    for page_entry in ocr_differences:
        for change in page_entry["differences"]:
            kind = change["type"]
            if kind == "Added":
                additions += 1
            elif kind == "Deleted":
                deletions += 1
            elif kind == "Modified":
                modifications += 1

    return {
        "total_additions": additions,
        "total_deletions": deletions,
        "total_modifications": modifications,
    }

# Streamlit app interface
def main():
    """Render the Streamlit page and run the comparison on request.

    Shows two PDF uploaders and a "Compare Documents" button. When the
    button is pressed with both files present and non-empty, the documents
    are compared, the summary totals are written to the page and the
    generated PDF is offered for download.
    """
    st.title("Comprehensive Document Comparison Tool with OCR Text Extraction")
    st.write("Upload Customer Document and CorelDRAW Output for detailed comparison.")

    customer_file = st.file_uploader("Customer Document (PDF only)", type=["pdf"])
    output_file = st.file_uploader("CorelDRAW Output (PDF only)", type=["pdf"])

    if not st.button("Compare Documents"):
        return

    # Tell the user why nothing happens instead of silently ignoring the click.
    if customer_file is None or output_file is None:
        st.warning("Please upload both PDF files before comparing.")
        return

    if customer_file.size == 0 or output_file.size == 0:
        st.error("One or both files are empty. Please upload valid PDF files.")
        return

    pdf_buffer, overall_summary = load_and_compare_documents(customer_file, output_file)

    st.subheader("Overall Comparison Summary")
    for key, value in overall_summary.items():
        st.write(f"{key.replace('_', ' ').capitalize()}: {value}")

    # Provide download link for generated PDF with marked differences
    st.subheader("Download PDF with Side-by-Side Comparisons and Observations")
    st.download_button("Download Marked PDF", data=pdf_buffer, file_name="side_by_side_comparison.pdf", mime="application/pdf")

# Start the app only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()