"""Streamlit tool that OCR-compares two PDFs page by page.

Each page of both PDFs is rendered to an image, read with EasyOCR, and the
recognised text is diffed.  The result is a downloadable PDF showing the
pages side by side (with text found only in the second document boxed in
red) followed by a per-page table of the detected differences.
"""

import difflib
import os
import tempfile
from collections import Counter
from io import BytesIO
from itertools import zip_longest

import easyocr
import fitz  # PyMuPDF
import numpy as np
import pandas as pd
import streamlit as st
from PIL import Image, ImageDraw, ImageEnhance, ImageFilter
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

# Initialize the easyocr Reader once at import time.
# NOTE(review): Streamlit presumably re-executes this module on every
# interaction, which would rebuild the reader each time - consider caching it.
ocr_reader = easyocr.Reader(['en'])


def load_and_compare_documents(file1, file2):
    """Compare two uploaded PDFs and build the comparison report.

    Args:
        file1: Binary file-like object holding the first (customer) PDF.
        file2: Binary file-like object holding the second (output) PDF.

    Returns:
        A ``(pdf_buffer, overall_summary)`` tuple: the ``BytesIO`` produced by
        ``create_pdf_with_side_by_side`` and the dict produced by
        ``generate_overall_summary``.
    """
    file1_content = file1.read()
    file2_content = file2.read()

    # Perform OCR-based comparison across all pages.
    ocr_differences, marked_images_1, marked_images_2 = perform_ocr_and_compare(
        file1_content, file2_content
    )

    # Generate a PDF with side-by-side comparisons and observation tables.
    pdf_buffer = create_pdf_with_side_by_side(
        marked_images_1, marked_images_2, ocr_differences
    )

    # Compile an overall summary of differences.
    overall_summary = generate_overall_summary(ocr_differences)

    return pdf_buffer, overall_summary


def pdf_to_images(file_content, dpi=300):
    """Render every page of a PDF to a pre-processed PIL image.

    Args:
        file_content: Raw bytes of the PDF.
        dpi: Rendering resolution; defaults to the 300 DPI used originally.

    Returns:
        A list of ``(page_number, image)`` tuples with 1-based page numbers.
    """
    images = []
    pdf_document = fitz.open(stream=file_content, filetype="pdf")
    try:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(dpi=dpi)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # Pre-process for OCR: boost contrast and brightness, then sharpen
            # glyph edges.
            img = ImageEnhance.Contrast(img).enhance(1.5)
            img = ImageEnhance.Brightness(img).enhance(1.2)
            img = img.filter(ImageFilter.SHARPEN)

            images.append((page_num + 1, img))
    finally:
        # Close the document even if rendering a page fails.
        pdf_document.close()
    return images


def _diff_ocr_text(text1, text2):
    """Return word-level differences between two OCR text strings.

    Words are compared case-insensitively; the reported values keep their
    original casing.  Each contiguous run of changed words yields one entry
    with the keys ``type`` ("Added", "Deleted" or "Modified"), ``value``,
    ``index`` (0-based word position of the run) and ``description``.
    """
    words1 = text1.split()
    words2 = text2.split()
    matcher = difflib.SequenceMatcher(
        None,
        [word.lower() for word in words1],
        [word.lower() for word in words2],
        autojunk=False,  # keep frequent words eligible for matching
    )

    page_diffs = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            continue
        removed = " ".join(words1[i1:i2])
        added = " ".join(words2[j1:j2])
        if tag == "insert":
            page_diffs.append({
                "type": "Added",
                "value": added,
                "index": j1,
                "description": (
                    f"'{added}' added in second PDF but not in first PDF "
                    f"at position {j1}"
                ),
            })
        elif tag == "delete":
            page_diffs.append({
                "type": "Deleted",
                "value": removed,
                "index": i1,
                "description": (
                    f"'{removed}' present in first PDF but missing in second "
                    f"PDF at position {i1}"
                ),
            })
        else:  # "replace"
            page_diffs.append({
                "type": "Modified",
                "value": f"{removed} -> {added}",
                "index": i1,
                "description": (
                    f"'{removed}' modified to '{added}' at position {i1}"
                ),
            })
    return page_diffs


def _bbox_to_rectangle(bbox):
    """Return ``[left, top, right, bottom]`` enclosing every point of *bbox*.

    *bbox* is a sequence of ``(x, y)`` points.  Using the extremes of all
    points (rather than only the first two) gives a box with real height and
    keeps the coordinates ordered even when the detected text is tilted.
    """
    xs = [float(point[0]) for point in bbox]
    ys = [float(point[1]) for point in bbox]
    return [min(xs), min(ys), max(xs), max(ys)]


def perform_ocr_and_compare(content1, content2):
    """OCR both PDFs page by page and collect their textual differences.

    Pages are paired by position.  When one document has more pages than the
    other, the missing side is represented by a blank page so the extra
    content is reported instead of being silently ignored.

    Args:
        content1: Raw bytes of the first PDF.
        content2: Raw bytes of the second PDF.

    Returns:
        A ``(ocr_differences, marked_images_1, marked_images_2)`` tuple.
        ``ocr_differences`` is a list of ``{"page": n, "differences": [...]}``
        dicts (only for pages that differ); the two dicts map 1-based page
        numbers to copies of the page images with red boxes drawn around OCR
        fragments of the second document that do not occur in the first.
    """
    ocr_differences = []
    marked_images_1 = {}
    marked_images_2 = {}

    images1 = pdf_to_images(content1)
    images2 = pdf_to_images(content2)

    paired_pages = zip_longest(images1, images2)
    for page_num, (entry1, entry2) in enumerate(paired_pages, start=1):
        img1 = entry1[1] if entry1 is not None else None
        img2 = entry2[1] if entry2 is not None else None

        # Run OCR once per existing page; a missing page has no text.
        results1 = [] if img1 is None else ocr_reader.readtext(np.array(img1))
        results2 = [] if img2 is None else ocr_reader.readtext(np.array(img2))

        # Substitute a blank page of matching size for the missing side.
        if img1 is None:
            img1 = Image.new("RGB", img2.size, "white")
        if img2 is None:
            img2 = Image.new("RGB", img1.size, "white")

        text1 = ' '.join(result[1] for result in results1)
        text2 = ' '.join(result[1] for result in results2)

        # Duplicate images for marking OCR differences.
        marked_img1 = img1.copy()
        marked_img2 = img2.copy()
        draw1 = ImageDraw.Draw(marked_img1)
        draw2 = ImageDraw.Draw(marked_img2)

        page_diffs = _diff_ocr_text(text1, text2)
        if page_diffs:
            ocr_differences.append({"page": page_num, "differences": page_diffs})

            # Box every fragment of the second page whose text does not occur
            # anywhere in the first page; the same box is drawn on both images
            # so the location can be compared visually.
            text1_lower = text1.lower()
            for result in results2:
                bbox, detected_text = result[0], result[1]
                fragment = detected_text.strip().lower()
                if fragment and fragment not in text1_lower:
                    rectangle = _bbox_to_rectangle(bbox)
                    draw1.rectangle(rectangle, outline="red", width=2)
                    draw2.rectangle(rectangle, outline="red", width=2)

        marked_images_1[page_num] = marked_img1
        marked_images_2[page_num] = marked_img2

    return ocr_differences, marked_images_1, marked_images_2


def _save_temp_png(image):
    """Write *image* to a new temporary PNG file and return its path."""
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as handle:
        image.save(handle, format="PNG")
        return handle.name


def _remove_quietly(path):
    """Delete *path*, ignoring OS errors (cleanup is best-effort)."""
    try:
        os.remove(path)
    except OSError:
        pass


def _draw_observation_table(c, page_num, ocr_differences):
    """Draw the observation summary for *page_num* on the current PDF page."""
    font_name, font_size = "Helvetica", 10
    top_y, bottom_y, row_height = 750, 50, 15

    c.setFont(font_name, font_size)
    y_position = top_y
    c.drawString(10, y_position, f"Observation Summary for Page {page_num}:")
    y_position -= 20

    # Table data for this page.
    rows = [
        (diff["description"], diff["index"])
        for ocr_diff in ocr_differences
        if ocr_diff["page"] == page_num
        for diff in ocr_diff["differences"]
    ]
    df = pd.DataFrame(rows, columns=["Description", "Position"])

    if df.empty:
        c.drawString(10, y_position, "No differences detected.")
        return

    # Each column starts where the previous one ends (cumulative offsets), so
    # the position never overprints the description.
    column_widths = [1000, 100]
    column_x = [10]
    for width in column_widths[:-1]:
        column_x.append(column_x[-1] + width)

    for row in df.itertuples(index=False):
        for col_index, value in enumerate(row):
            c.drawString(column_x[col_index], y_position, str(value))
        y_position -= row_height
        if y_position < bottom_y:  # Start a new page if space is running out
            c.showPage()
            # Set the font again explicitly for the continuation page.
            c.setFont(font_name, font_size)
            y_position = top_y


def create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differences):
    """Build the comparison PDF.

    For every page number in *marked_images_1* that also exists in
    *marked_images_2*, one double-width page shows both images side by side;
    it is followed by a page (or pages) listing that page's differences.

    Args:
        marked_images_1: Mapping of page number to marked image (first PDF).
        marked_images_2: Mapping of page number to marked image (second PDF).
        ocr_differences: List of ``{"page": n, "differences": [...]}`` dicts.

    Returns:
        A ``BytesIO`` positioned at the start of the generated PDF.
    """
    page_width, page_height = letter
    pdf_buffer = BytesIO()
    # Double-width page for the side-by-side layout.
    c = canvas.Canvas(pdf_buffer, pagesize=(page_width * 2, page_height))

    for page_num, img1 in marked_images_1.items():
        img2 = marked_images_2.get(page_num)
        if img2 is not None:
            temp_paths = []
            try:
                for image in (img1, img2):
                    temp_paths.append(_save_temp_png(image))
                # Draw the saved images side-by-side on the PDF.
                c.drawImage(temp_paths[0], 0, 0,
                            width=page_width, height=page_height)
                c.drawImage(temp_paths[1], page_width, 0,
                            width=page_width, height=page_height)
                c.showPage()
            finally:
                # Remove the temporary files even if drawing failed.
                for path in temp_paths:
                    _remove_quietly(path)

        # Generate the observation table for each page.
        _draw_observation_table(c, page_num, ocr_differences)
        c.showPage()

    c.save()
    pdf_buffer.seek(0)
    return pdf_buffer


def generate_overall_summary(ocr_differences):
    """Count the differences of each type across all pages.

    Args:
        ocr_differences: List of ``{"page": n, "differences": [...]}`` dicts
            whose difference entries carry a ``"type"`` key.

    Returns:
        A dict with the keys ``total_additions``, ``total_deletions`` and
        ``total_modifications``.
    """
    counts = Counter(
        entry["type"]
        for page in ocr_differences
        for entry in page["differences"]
    )
    overall_summary = {
        "total_additions": counts["Added"],
        "total_deletions": counts["Deleted"],
        "total_modifications": counts["Modified"],
    }
    return overall_summary


# Streamlit app interface
def main():
    """Render the upload form and, on request, run and present the comparison."""
    st.title("Comprehensive Document Comparison Tool with OCR Text Extraction")
    st.write("Upload Customer Document and CorelDRAW Output for detailed comparison.")

    customer_file = st.file_uploader("Customer Document (PDF only)", type=["pdf"])
    output_file = st.file_uploader("CorelDRAW Output (PDF only)", type=["pdf"])

    if not st.button("Compare Documents"):
        return

    if customer_file is None or output_file is None:
        # Tell the user why nothing happened instead of ignoring the click.
        st.warning("Please upload both PDF files before comparing.")
        return

    if customer_file.size == 0 or output_file.size == 0:
        st.error("One or both files are empty. Please upload valid PDF files.")
        return

    pdf_buffer, overall_summary = load_and_compare_documents(customer_file, output_file)

    st.subheader("Overall Comparison Summary")
    for key, value in overall_summary.items():
        st.write(f"{key.replace('_', ' ').capitalize()}: {value}")

    # Provide download link for generated PDF with marked differences.
    st.subheader("Download PDF with Side-by-Side Comparisons and Observations")
    st.download_button(
        "Download Marked PDF",
        data=pdf_buffer,
        file_name="side_by_side_comparison.pdf",
        mime="application/pdf",
    )


if __name__ == "__main__":
    main()