import streamlit as st
import fitz # PyMuPDF
import difflib
from PIL import Image, ImageDraw, ImageEnhance, ImageFilter
import easyocr
import numpy as np
import pandas as pd
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import tempfile
import os
from io import BytesIO
# Module-level EasyOCR reader configured for English text; built once when
# the module is loaded.
ocr_reader = easyocr.Reader(["en"])
def load_and_compare_documents(file1, file2):
    """Compare two uploaded PDFs and build the report artefacts.

    Args:
        file1: File-like object for the first PDF; its ``read()`` is called once.
        file2: File-like object for the second PDF; its ``read()`` is called once.

    Returns:
        A ``(pdf_buffer, overall_summary)`` tuple: the value returned by
        ``create_pdf_with_side_by_side`` and the value returned by
        ``generate_overall_summary``.
    """
    first_bytes = file1.read()
    second_bytes = file2.read()

    # OCR every page pair and collect the per-page differences plus the
    # annotated page images for both documents.
    differences, first_marked, second_marked = perform_ocr_and_compare(
        first_bytes, second_bytes
    )

    # Build the side-by-side report first, then the aggregate counts.
    report_buffer = create_pdf_with_side_by_side(
        first_marked, second_marked, differences
    )
    summary = generate_overall_summary(differences)
    return report_buffer, summary
def pdf_to_images(file_content):
    """Render every page of an in-memory PDF to a preprocessed PIL image.

    Args:
        file_content: Raw bytes of a PDF document.

    Returns:
        A list of ``(page_number, image)`` tuples in page order.
        ``page_number`` is 1-based; ``image`` is an RGB ``PIL.Image`` rendered
        at 300 DPI with contrast (x1.5), brightness (x1.2) and a sharpen
        filter applied.
    """
    images = []
    pdf_document = fitz.open(stream=file_content, filetype="pdf")
    try:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            # High DPI so the rendered page stays legible when zoomed.
            pix = page.get_pixmap(dpi=300)
            # NOTE(review): "RGB" assumes 3-channel samples (no alpha), which
            # is what get_pixmap is expected to return by default - confirm.
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # Preprocess: boost contrast, then brightness, then sharpen edges.
            img = ImageEnhance.Contrast(img).enhance(1.5)
            img = ImageEnhance.Brightness(img).enhance(1.2)
            img = img.filter(ImageFilter.SHARPEN)

            images.append((page_num + 1, img))
    finally:
        # Always release the document, even if rendering a page fails.
        pdf_document.close()
    return images
def _bbox_to_rectangle(bbox):
    """Return ``[x0, y0, x1, y1]`` enclosing all corner points of *bbox*."""
    xs = [point[0] for point in bbox]
    ys = [point[1] for point in bbox]
    return [min(xs), min(ys), max(xs), max(ys)]


def perform_ocr_and_compare(content1, content2):
    """OCR two PDFs page by page, diff the text and box the differing regions.

    Pages are paired in order with ``zip``, so when the documents have a
    different number of pages the extra pages of the longer one are not
    compared.

    Args:
        content1: Raw bytes of the first PDF.
        content2: Raw bytes of the second PDF.

    Returns:
        A tuple ``(ocr_differences, marked_images_1, marked_images_2)``:

        * ``ocr_differences`` - list of ``{"page": int, "differences": list}``
          dicts, one per page whose OCR text differs (compared
          case-insensitively after trimming). Each difference is a dict with
          ``type`` ("Added" / "Deleted" / "Modified"), ``value``, ``index``
          (position in the character-level ``difflib.ndiff`` output) and
          ``description``.
        * ``marked_images_1`` / ``marked_images_2`` - dicts mapping the
          1-based page number to a copy of the page image. On differing
          pages, red boxes are drawn around text regions detected in the
          second document whose text does not occur in the first; the same
          coordinates are boxed on both copies.
    """
    ocr_differences = []
    marked_images_1 = {}
    marked_images_2 = {}

    images1 = pdf_to_images(content1)
    images2 = pdf_to_images(content2)

    for (page_num, img1), (_, img2) in zip(images1, images2):
        # Run OCR once per image and reuse the results for both the text
        # comparison and the highlighting pass below.
        results1 = ocr_reader.readtext(np.array(img1))
        results2 = ocr_reader.readtext(np.array(img2))
        text1 = ' '.join(result[1] for result in results1)
        text2 = ' '.join(result[1] for result in results2)

        # Work on copies so the rendered originals stay untouched.
        marked_img1 = img1.copy()
        marked_img2 = img2.copy()

        # Case-insensitive, whitespace-trimmed comparison of the page text.
        if text1.strip().lower() != text2.strip().lower():
            page_diffs = []
            # ndiff over two strings compares them character by character.
            for i, change in enumerate(difflib.ndiff(text1, text2)):
                value = change[2:]
                if change.startswith("+ "):
                    page_diffs.append({"type": "Added", "value": value, "index": i, "description": f"'{value}' added in second PDF but not in first PDF at position {i}"})
                elif change.startswith("- "):
                    page_diffs.append({"type": "Deleted", "value": value, "index": i, "description": f"'{value}' present in first PDF but missing in second PDF at position {i}"})
                elif change.startswith("? "):
                    page_diffs.append({"type": "Modified", "value": value, "index": i, "description": f"'{value}' modified at position {i}"})
            ocr_differences.append({"page": page_num, "differences": page_diffs})

            # Box every region of the second document whose text is absent
            # from the first document's page text.
            draw1 = ImageDraw.Draw(marked_img1)
            draw2 = ImageDraw.Draw(marked_img2)
            text1_lower = text1.lower()
            for result in results2:
                bbox, detected_text = result[0], result[1]
                if detected_text.strip().lower() not in text1_lower:
                    # bbox is a sequence of corner points; use the min/max of
                    # all corners so the box spans the full height of the text
                    # and the coordinates are correctly ordered for Pillow,
                    # even when the detected region is skewed.
                    rectangle = _bbox_to_rectangle(bbox)
                    draw1.rectangle(rectangle, outline="red", width=2)
                    draw2.rectangle(rectangle, outline="red", width=2)

        marked_images_1[page_num] = marked_img1
        marked_images_2[page_num] = marked_img2

    return ocr_differences, marked_images_1, marked_images_2
def create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differences):
    """Build a PDF report with side-by-side page images and difference tables.

    For each page number in ``marked_images_1`` the report contains one
    double-letter-width page showing both page images next to each other
    (when the second document has that page), followed by one or more pages
    listing the differences recorded for that page.

    Args:
        marked_images_1: Dict mapping page number to the first document's
            PIL image for that page.
        marked_images_2: Dict mapping page number to the second document's
            PIL image for that page.
        ocr_differences: List of ``{"page": ..., "differences": [...]}``
            dicts; each difference provides ``description`` and ``index``.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the generated PDF.
    """
    page_width, page_height = letter
    font_name, font_size = "Helvetica", 10
    top_y, bottom_y, row_height = 750, 50, 15

    # Left edge of each table column: every column starts where the previous
    # one ends, so the "Position" column no longer overprints the description.
    left_margin = 10
    column_widths = [350, 100]
    column_x = [left_margin]
    for column_width in column_widths[:-1]:
        column_x.append(column_x[-1] + column_width)

    pdf_buffer = BytesIO()
    # Double-width page so two letter-sized images fit next to each other.
    c = canvas.Canvas(pdf_buffer, pagesize=(page_width * 2, page_height))

    for page_num, img1 in marked_images_1.items():
        img2 = marked_images_2.get(page_num)
        if img2 is not None:
            temp_paths = []
            try:
                # The images are handed to ReportLab via temporary PNG files.
                for image in (img1, img2):
                    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file:
                        temp_paths.append(temp_img_file.name)
                        image.save(temp_img_file, format="PNG")

                c.drawImage(temp_paths[0], 0, 0, width=page_width, height=page_height)
                c.drawImage(temp_paths[1], page_width, 0, width=page_width, height=page_height)
                c.showPage()
            finally:
                # Remove each temp file independently, also on failure, so an
                # error on one path cannot leave the other behind.
                for temp_path in temp_paths:
                    try:
                        os.remove(temp_path)
                    except OSError:
                        pass  # Best-effort cleanup; the report is unaffected.

        # Observation table for this page.
        c.setFont(font_name, font_size)
        y_position = top_y
        c.drawString(left_margin, y_position, f"Observation Summary for Page {page_num}:")
        y_position -= 20

        rows = [
            (diff["description"], diff["index"])
            for ocr_diff in ocr_differences
            if ocr_diff["page"] == page_num
            for diff in ocr_diff["differences"]
        ]

        for row in rows:
            if y_position < bottom_y:
                # Out of room: continue on a fresh page. showPage() resets the
                # canvas state, so the table font has to be selected again.
                c.showPage()
                c.setFont(font_name, font_size)
                y_position = top_y
            for x_position, value in zip(column_x, row):
                c.drawString(x_position, y_position, str(value))
            y_position -= row_height

        c.showPage()

    c.save()
    pdf_buffer.seek(0)
    return pdf_buffer
def generate_overall_summary(ocr_differences):
    """Count the recorded differences of each type across all pages.

    Args:
        ocr_differences: Iterable of dicts, each with a ``"differences"`` list
            whose items carry a ``"type"`` key.

    Returns:
        A dict with the keys ``total_additions``, ``total_deletions`` and
        ``total_modifications`` holding the number of differences whose type
        is "Added", "Deleted" and "Modified" respectively.
    """

    def count_of(kind):
        # Number of difference entries, over every page, with the given type.
        return sum(
            1
            for page_entry in ocr_differences
            for entry in page_entry["differences"]
            if entry["type"] == kind
        )

    return {
        "total_additions": count_of("Added"),
        "total_deletions": count_of("Deleted"),
        "total_modifications": count_of("Modified"),
    }
# Streamlit app interface
def main():
    """Render the Streamlit page: two PDF uploaders, a compare button, results.

    Nothing beyond the upload widgets and the button is shown until the button
    is pressed with both files present. Empty uploads are rejected with an
    error message; otherwise the comparison summary and a download button for
    the generated report are displayed.
    """
    st.title("Comprehensive Document Comparison Tool with OCR Text Extraction")
    st.write("Upload Customer Document and CorelDRAW Output for detailed comparison.")

    customer_file = st.file_uploader("Customer Document (PDF only)", type=["pdf"])
    output_file = st.file_uploader("CorelDRAW Output (PDF only)", type=["pdf"])

    compare_clicked = st.button("Compare Documents")
    if not (compare_clicked and customer_file and output_file):
        return

    # Refuse zero-byte uploads before doing any work.
    if any(upload.size == 0 for upload in (customer_file, output_file)):
        st.error("One or both files are empty. Please upload valid PDF files.")
        return

    pdf_buffer, overall_summary = load_and_compare_documents(customer_file, output_file)

    st.subheader("Overall Comparison Summary")
    for key, value in overall_summary.items():
        label = key.replace('_', ' ').capitalize()
        st.write(f"{label}: {value}")

    # Offer the generated report (marked differences) for download.
    st.subheader("Download PDF with Side-by-Side Comparisons and Observations")
    st.download_button(
        "Download Marked PDF",
        data=pdf_buffer,
        file_name="side_by_side_comparison.pdf",
        mime="application/pdf",
    )
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    main()