| import streamlit as st |
| import fitz |
| import difflib |
| from PIL import Image, ImageDraw, ImageEnhance, ImageFilter |
| import easyocr |
| import numpy as np |
| import pandas as pd |
| from reportlab.pdfgen import canvas |
| from reportlab.lib.pagesizes import letter |
| import tempfile |
| import os |
| from io import BytesIO |
|
|
| |
# Module-level EasyOCR reader configured for English ('en'). It is created
# once when this module is executed and shared by every `readtext` call in
# perform_ocr_and_compare.
# NOTE(review): Streamlit presumably re-executes this script on each user
# interaction, which would rebuild the reader every time — consider caching
# it (e.g. st.cache_resource); confirm against the installed Streamlit version.
ocr_reader = easyocr.Reader(['en'])
|
|
def load_and_compare_documents(file1, file2):
    """Compare two uploaded PDFs and build the comparison artefacts.

    Both inputs are read in full, their bytes are handed to the OCR
    comparison, and the results are turned into a side-by-side PDF plus an
    aggregate summary.

    Args:
        file1: File-like object for the first PDF; only ``read()`` is called.
        file2: File-like object for the second PDF; only ``read()`` is called.

    Returns:
        A ``(pdf_buffer, overall_summary)`` tuple, where ``pdf_buffer`` is the
        result of ``create_pdf_with_side_by_side`` and ``overall_summary`` is
        the result of ``generate_overall_summary``.
    """
    first_bytes, second_bytes = file1.read(), file2.read()

    # OCR both documents and collect per-page differences and annotated pages.
    differences, left_pages, right_pages = perform_ocr_and_compare(
        first_bytes, second_bytes
    )

    # Render the report first, then tally the differences for the UI.
    report = create_pdf_with_side_by_side(left_pages, right_pages, differences)
    summary = generate_overall_summary(differences)
    return report, summary
|
|
def pdf_to_images(file_content):
    """Render every page of a PDF into an enhanced PIL image for OCR.

    Each page is rasterised at 300 DPI, then contrast (x1.5), brightness
    (x1.2) and a sharpen filter are applied.

    Args:
        file_content: Raw bytes of the PDF document.

    Returns:
        A list of ``(page_number, image)`` tuples in page order, where
        ``page_number`` is 1-based and ``image`` is an RGB ``PIL.Image``.
    """
    images = []
    pdf_document = fitz.open(stream=file_content, filetype="pdf")
    try:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(dpi=300)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # Boost legibility before the image is handed to OCR.
            img = ImageEnhance.Contrast(img).enhance(1.5)
            img = ImageEnhance.Brightness(img).enhance(1.2)
            img = img.filter(ImageFilter.SHARPEN)

            images.append((page_num + 1, img))
    finally:
        # Release the document even if rendering or enhancement fails.
        pdf_document.close()
    return images
|
|
def _word_differences(words1, words2):
    """Return word-level difference records between two OCR word lists.

    Words are matched case-insensitively, but the reported values keep their
    original casing. Each contiguous run of changed words yields one record.
    """
    matcher = difflib.SequenceMatcher(
        None,
        [word.lower() for word in words1],
        [word.lower() for word in words2],
        autojunk=False,
    )
    page_diffs = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            continue
        old_text = " ".join(words1[i1:i2])
        new_text = " ".join(words2[j1:j2])
        if tag == "insert":
            page_diffs.append({
                "type": "Added",
                "value": new_text,
                "index": j1,
                "description": f"'{new_text}' added in second PDF but not in first PDF at position {j1}",
            })
        elif tag == "delete":
            page_diffs.append({
                "type": "Deleted",
                "value": old_text,
                "index": i1,
                "description": f"'{old_text}' present in first PDF but missing in second PDF at position {i1}",
            })
        else:  # "replace"
            page_diffs.append({
                "type": "Modified",
                "value": old_text,
                "new_value": new_text,
                "index": i1,
                "description": f"'{old_text}' modified to '{new_text}' at position {i1}",
            })
    return page_diffs


def _bounding_rectangle(bbox):
    """Return ``[x0, y0, x1, y1]`` enclosing all corner points of *bbox*."""
    xs = [float(point[0]) for point in bbox]
    ys = [float(point[1]) for point in bbox]
    return [min(xs), min(ys), max(xs), max(ys)]


def perform_ocr_and_compare(content1, content2):
    """OCR two PDFs page by page and report their textual differences.

    Pages are paired in order. For each pair the OCR text is compared
    case-insensitively; when it differs, a word-level diff is recorded and
    every text region detected in the second PDF whose text does not occur in
    the first PDF is outlined in red on both page images.

    Args:
        content1: Raw bytes of the first PDF.
        content2: Raw bytes of the second PDF.

    Returns:
        A ``(ocr_differences, marked_images_1, marked_images_2)`` tuple.
        ``ocr_differences`` is a list of ``{"page": int, "differences": list}``
        dicts, one per differing page. Each difference has ``"type"``
        (``"Added"``, ``"Deleted"`` or ``"Modified"``), ``"value"``,
        ``"index"`` (0-based word position: in the second PDF for additions,
        in the first PDF otherwise) and ``"description"``; ``"Modified"``
        entries also carry ``"new_value"``. The two ``marked_images`` dicts
        map 1-based page numbers to annotated copies of the page images.
    """
    ocr_differences = []
    marked_images_1 = {}
    marked_images_2 = {}
    images1 = pdf_to_images(content1)
    images2 = pdf_to_images(content2)

    # NOTE: zip() stops at the shorter document, so trailing pages that exist
    # in only one PDF are not compared or reported.
    for (page_num, img1), (_, img2) in zip(images1, images2):
        # Run OCR once per image and reuse the results for text and boxes.
        results1 = ocr_reader.readtext(np.array(img1))
        results2 = ocr_reader.readtext(np.array(img2))

        text1 = ' '.join(result[1] for result in results1)
        text2 = ' '.join(result[1] for result in results2)

        marked_img1 = img1.copy()
        marked_img2 = img2.copy()

        if text1.strip().lower() != text2.strip().lower():
            # Diff whole words; diffing the raw strings would compare single
            # characters and could never produce a "Modified" entry.
            page_diffs = _word_differences(text1.split(), text2.split())
            ocr_differences.append({"page": page_num, "differences": page_diffs})

            draw1 = ImageDraw.Draw(marked_img1)
            draw2 = ImageDraw.Draw(marked_img2)
            text1_lower = text1.lower()
            for result in results2:
                bbox, detected_text = result[0], result[1]
                if detected_text.strip().lower() not in text1_lower:
                    # EasyOCR reports a box as corner points; use their
                    # extent so the rectangle is never degenerate.
                    rectangle = _bounding_rectangle(bbox)
                    # The same location is outlined on both pages so the
                    # reader can see where the new text sits.
                    draw1.rectangle(rectangle, outline="red", width=2)
                    draw2.rectangle(rectangle, outline="red", width=2)

        marked_images_1[page_num] = marked_img1
        marked_images_2[page_num] = marked_img2

    return ocr_differences, marked_images_1, marked_images_2
|
|
def create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differences):
    """Build a PDF showing each page pair side by side with its observations.

    For every page present in both image dicts, one double-width page shows
    the two annotated images next to each other, followed by one or more
    pages listing that page's difference descriptions and positions.

    Args:
        marked_images_1: Dict mapping page number to the first PDF's image.
        marked_images_2: Dict mapping page number to the second PDF's image.
        ocr_differences: List of ``{"page": ..., "differences": [...]}``
            dicts; each difference must provide ``"description"`` and
            ``"index"``.

    Returns:
        A ``BytesIO`` holding the finished PDF, positioned at offset 0.
    """
    page_width, page_height = letter
    pdf_buffer = BytesIO()
    c = canvas.Canvas(pdf_buffer, pagesize=(page_width * 2, page_height))

    font_name, font_size = "Helvetica", 10
    top_y, bottom_y = 750, 50
    left_margin = 10
    column_widths = [350, 100]
    # Each column starts after the combined width of the columns to its left.
    column_x = [
        left_margin + sum(column_widths[:col_index])
        for col_index in range(len(column_widths))
    ]

    for page_num, img1 in marked_images_1.items():
        img2 = marked_images_2.get(page_num)
        if img2 is None:
            continue

        # drawImage is given file paths, so both images go through temp files.
        temp_paths = []
        try:
            for image in (img1, img2):
                with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file:
                    temp_paths.append(temp_img_file.name)
                    image.save(temp_img_file, format="PNG")

            c.drawImage(temp_paths[0], 0, 0, width=page_width, height=page_height)
            c.drawImage(temp_paths[1], page_width, 0, width=page_width, height=page_height)
            c.showPage()
        finally:
            # Best-effort cleanup of each file independently, even on failure.
            for temp_path in temp_paths:
                try:
                    os.remove(temp_path)
                except OSError:
                    pass

        rows = []
        for ocr_diff in ocr_differences:
            if ocr_diff["page"] != page_num:
                continue
            for diff in ocr_diff["differences"]:
                rows.append((diff["description"], diff["index"]))

        c.setFont(font_name, font_size)
        y_position = top_y
        c.drawString(left_margin, y_position, f"Observation Summary for Page {page_num}:")
        y_position -= 20

        for row in rows:
            if y_position < bottom_y:
                # Start a continuation page; showPage() resets the canvas
                # state, so the font has to be selected again.
                c.showPage()
                c.setFont(font_name, font_size)
                y_position = top_y
            for x_position, value in zip(column_x, row):
                c.drawString(x_position, y_position, str(value))
            y_position -= 15

        c.showPage()

    c.save()
    pdf_buffer.seek(0)
    return pdf_buffer
|
|
def generate_overall_summary(ocr_differences):
    """Tally difference records by type across all pages.

    Args:
        ocr_differences: Iterable of dicts, each with a ``"differences"``
            list whose items carry a ``"type"`` key.

    Returns:
        A dict with the keys ``"total_additions"``, ``"total_deletions"`` and
        ``"total_modifications"`` counting the ``"Added"``, ``"Deleted"`` and
        ``"Modified"`` records respectively; other types are ignored.
    """
    added = 0
    deleted = 0
    modified = 0

    for page_entry in ocr_differences:
        for change in page_entry["differences"]:
            kind = change["type"]
            if kind == "Added":
                added += 1
            elif kind == "Deleted":
                deleted += 1
            elif kind == "Modified":
                modified += 1

    return {
        "total_additions": added,
        "total_deletions": deleted,
        "total_modifications": modified,
    }
|
|
| |
def main():
    """Run the Streamlit UI: upload two PDFs, compare them, offer the report.

    Renders two PDF uploaders and a "Compare Documents" button. When the
    button is pressed with two non-empty files, the comparison is run, the
    summary counts are displayed and the side-by-side PDF is offered for
    download. Missing or empty uploads are reported to the user instead of
    being ignored.
    """
    st.title("Comprehensive Document Comparison Tool with OCR Text Extraction")
    st.write("Upload Customer Document and CorelDRAW Output for detailed comparison.")

    customer_file = st.file_uploader("Customer Document (PDF only)", type=["pdf"])
    output_file = st.file_uploader("CorelDRAW Output (PDF only)", type=["pdf"])

    if not st.button("Compare Documents"):
        return

    # Tell the user why nothing happened rather than silently doing nothing.
    if customer_file is None or output_file is None:
        st.warning("Please upload both PDF files before comparing.")
        return

    if customer_file.size == 0 or output_file.size == 0:
        st.error("One or both files are empty. Please upload valid PDF files.")
        return

    pdf_buffer, overall_summary = load_and_compare_documents(customer_file, output_file)

    st.subheader("Overall Comparison Summary")
    for key, value in overall_summary.items():
        st.write(f"{key.replace('_', ' ').capitalize()}: {value}")

    st.subheader("Download PDF with Side-by-Side Comparisons and Observations")
    st.download_button(
        "Download Marked PDF",
        data=pdf_buffer,
        file_name="side_by_side_comparison.pdf",
        mime="application/pdf",
    )
|
|
# Start the UI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
|
|