# usecase2 / app.py
# SuriRaja's picture
# Update app.py
# 8784b68 verified
import streamlit as st
import fitz # PyMuPDF
import difflib
from PIL import Image, ImageDraw, ImageEnhance, ImageFilter
import easyocr
import numpy as np
import pandas as pd
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import tempfile
import os
from io import BytesIO
# Initialize the easyocr Reader once at import time so that every OCR call in
# this module shares the same instance. Only English ('en') is requested.
ocr_reader = easyocr.Reader(['en'])
def load_and_compare_documents(file1, file2):
    """Compare two uploaded PDFs and build the downloadable report.

    Args:
        file1: File-like object for the first PDF; only ``read()`` is used.
        file2: File-like object for the second PDF; only ``read()`` is used.

    Returns:
        A ``(pdf_buffer, overall_summary)`` tuple: the side-by-side report
        produced by ``create_pdf_with_side_by_side`` and the totals produced
        by ``generate_overall_summary``.
    """
    first_bytes, second_bytes = file1.read(), file2.read()

    # OCR every page pair, collecting text differences and annotated images.
    comparison = perform_ocr_and_compare(first_bytes, second_bytes)
    page_differences, left_pages, right_pages = comparison

    # Lay the annotated pages out side by side, followed by observation tables.
    report = create_pdf_with_side_by_side(left_pages, right_pages, page_differences)

    # Roll the per-page differences up into overall totals.
    return report, generate_overall_summary(page_differences)
def pdf_to_images(file_content):
    """Render every page of a PDF to a contrast/brightness-enhanced image.

    Args:
        file_content: Raw bytes of the PDF document.

    Returns:
        A list of ``(page_number, image)`` tuples in page order, where
        ``page_number`` is 1-based and ``image`` is an RGB ``PIL.Image``.
    """
    images = []
    pdf_document = fitz.open(stream=file_content, filetype="pdf")
    try:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(dpi=300)  # High DPI for better zoom capability
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            # Preprocess for OCR: boost contrast, then brightness, then sharpen
            # the glyph edges.
            img = ImageEnhance.Contrast(img).enhance(1.5)
            img = ImageEnhance.Brightness(img).enhance(1.2)
            img = img.filter(ImageFilter.SHARPEN)
            images.append((page_num + 1, img))
    finally:
        # Release the document even when rendering one of the pages fails.
        pdf_document.close()
    return images
def _collect_text_differences(text1, text2):
    """Return the list of difference records between two OCR text strings."""
    page_diffs = []
    # ndiff is given two strings, so it compares them element by element,
    # i.e. character by character; ``i`` is the position in the diff output.
    for i, change in enumerate(difflib.ndiff(text1, text2)):
        if change.startswith("+ "):
            page_diffs.append({"type": "Added", "value": change[2:], "index": i, "description": f"'{change[2:]}' added in second PDF but not in first PDF at position {i}"})
        elif change.startswith("- "):
            page_diffs.append({"type": "Deleted", "value": change[2:], "index": i, "description": f"'{change[2:]}' present in first PDF but missing in second PDF at position {i}"})
        elif change.startswith("? "):
            # NOTE(review): '? ' lines are difflib's intraline hint lines rather
            # than content from either input; kept so the record types callers
            # count stay unchanged.
            page_diffs.append({"type": "Modified", "value": change[2:], "index": i, "description": f"'{change[2:]}' modified at position {i}"})
    return page_diffs


def perform_ocr_and_compare(content1, content2):
    """OCR two PDFs page by page, diff the text and box the changed regions.

    Pages are paired in order with ``zip``, so pages beyond the length of the
    shorter document are not compared.

    Args:
        content1: Raw bytes of the first PDF.
        content2: Raw bytes of the second PDF.

    Returns:
        A tuple ``(ocr_differences, marked_images_1, marked_images_2)``.
        ``ocr_differences`` is a list of ``{"page": n, "differences": [...]}``
        dicts, one for each page whose OCR text differs (ignoring case and
        surrounding whitespace). Each ``marked_images`` dict maps the 1-based
        page number to a copy of that page's image, with red boxes drawn
        around text regions detected in the second PDF whose text does not
        occur in the first PDF's page text.
    """
    ocr_differences = []
    marked_images_1 = {}
    marked_images_2 = {}
    images1 = pdf_to_images(content1)
    images2 = pdf_to_images(content2)
    for (page_num, img1), (_, img2) in zip(images1, images2):
        # Run OCR once per page image; the second page's results are reused
        # below for drawing boxes instead of running OCR on it a second time.
        results1 = ocr_reader.readtext(np.array(img1))
        results2 = ocr_reader.readtext(np.array(img2))
        text1 = ' '.join(result[1] for result in results1)
        text2 = ' '.join(result[1] for result in results2)

        # Work on copies so the rendered page images stay unmarked.
        marked_img1 = img1.copy()
        marked_img2 = img2.copy()

        if text1.strip().lower() != text2.strip().lower():  # Case-insensitive, whitespace-trimmed
            ocr_differences.append({"page": page_num, "differences": _collect_text_differences(text1, text2)})

            draw1 = ImageDraw.Draw(marked_img1)
            draw2 = ImageDraw.Draw(marked_img2)
            text1_lower = text1.lower()
            text2_lower = text2.lower()
            for result in results2:
                bbox, detected_text = result[0], result[1]
                needle = detected_text.strip().lower()
                if needle in text2_lower and needle not in text1_lower:
                    # bbox is a sequence of corner points; take the extremes of
                    # all corners so the rectangle encloses the whole region
                    # (using only the first two corners yields a flat box).
                    xs = [float(point[0]) for point in bbox]
                    ys = [float(point[1]) for point in bbox]
                    box = [min(xs), min(ys), max(xs), max(ys)]
                    draw1.rectangle(box, outline="red", width=2)
                    draw2.rectangle(box, outline="red", width=2)

        marked_images_1[page_num] = marked_img1
        marked_images_2[page_num] = marked_img2
    return ocr_differences, marked_images_1, marked_images_2
def create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differences):
    """Build a PDF with side-by-side page images and per-page observations.

    For every page in ``marked_images_1`` that also exists in
    ``marked_images_2``, one double-width page shows both images next to each
    other. It is followed by an "Observation Summary" listing the description
    and position of each difference recorded for that page.

    Args:
        marked_images_1: Mapping of page number to the first PDF's page image.
        marked_images_2: Mapping of page number to the second PDF's page image.
        ocr_differences: List of ``{"page": n, "differences": [...]}`` dicts;
            each difference provides ``"description"`` and ``"index"``.

    Returns:
        A ``BytesIO`` holding the finished PDF, positioned at offset 0.
    """
    page_width, page_height = letter
    font_name, font_size = "Helvetica", 10
    top_y, bottom_y, row_height = 750, 50, 15
    left_margin = 10
    column_widths = [350, 100]
    # Each column starts after the combined width of the columns before it,
    # so the position column no longer prints on top of the description.
    column_x = [left_margin + sum(column_widths[:i]) for i in range(len(column_widths))]

    pdf_buffer = BytesIO()
    c = canvas.Canvas(pdf_buffer, pagesize=(page_width * 2, page_height))  # Adjusted for side-by-side layout

    for page_num, img1 in marked_images_1.items():
        img2 = marked_images_2.get(page_num)
        if img2 is not None:
            temp_paths = []
            try:
                for img in (img1, img2):
                    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file:
                        temp_paths.append(temp_img_file.name)
                        img.save(temp_img_file, format="PNG")
                # Draw the saved images side-by-side on the PDF
                c.drawImage(temp_paths[0], 0, 0, width=page_width, height=page_height)
                c.drawImage(temp_paths[1], page_width, 0, width=page_width, height=page_height)
                c.showPage()
            finally:
                # Best-effort cleanup that also runs on failure; each file is
                # removed independently so one error cannot strand the other.
                for temp_path in temp_paths:
                    try:
                        os.remove(temp_path)
                    except OSError:
                        pass

        # Observation table for this page
        c.setFont(font_name, font_size)
        y_position = top_y
        c.drawString(left_margin, y_position, f"Observation Summary for Page {page_num}:")
        y_position -= 20

        rows = [
            (diff["description"], diff["index"])
            for ocr_diff in ocr_differences
            if ocr_diff["page"] == page_num
            for diff in ocr_diff["differences"]
        ]
        for row in rows:
            if y_position < bottom_y:
                # Out of space: continue on a new page. The font is set again
                # because it does not carry over after showPage(), and checking
                # before drawing avoids emitting a blank page after the last row.
                c.showPage()
                c.setFont(font_name, font_size)
                y_position = top_y
            for x, value in zip(column_x, row):
                c.drawString(x, y_position, str(value))
            y_position -= row_height
        c.showPage()

    c.save()
    pdf_buffer.seek(0)
    return pdf_buffer
def generate_overall_summary(ocr_differences):
    """Total the recorded differences by type across all pages.

    Args:
        ocr_differences: Sequence of dicts, each with a ``"differences"`` list
            whose items carry a ``"type"`` of ``"Added"``, ``"Deleted"`` or
            ``"Modified"``. Items of any other type are not counted.

    Returns:
        A dict with the integer totals under ``"total_additions"``,
        ``"total_deletions"`` and ``"total_modifications"``.
    """
    def tally(kind):
        # Count every difference of the given type over all page entries.
        return sum(
            1
            for page_entry in ocr_differences
            for change in page_entry["differences"]
            if change["type"] == kind
        )

    labels = (
        ("total_additions", "Added"),
        ("total_deletions", "Deleted"),
        ("total_modifications", "Modified"),
    )
    return {summary_key: tally(kind) for summary_key, kind in labels}
# Streamlit app interface
def main():
    """Render the UI: upload two PDFs, compare them and offer the report.

    Nothing is compared until the "Compare Documents" button is pressed. A
    warning is shown if the button is pressed before both files are uploaded,
    and an error is shown if either uploaded file is empty.
    """
    st.title("Comprehensive Document Comparison Tool with OCR Text Extraction")
    st.write("Upload Customer Document and CorelDRAW Output for detailed comparison.")
    customer_file = st.file_uploader("Customer Document (PDF only)", type=["pdf"])
    output_file = st.file_uploader("CorelDRAW Output (PDF only)", type=["pdf"])

    if not st.button("Compare Documents"):
        return

    if not (customer_file and output_file):
        # Tell the user why the click did nothing instead of ignoring it.
        st.warning("Please upload both PDF files before comparing.")
        return

    if customer_file.size == 0 or output_file.size == 0:
        st.error("One or both files are empty. Please upload valid PDF files.")
        return

    pdf_buffer, overall_summary = load_and_compare_documents(customer_file, output_file)

    st.subheader("Overall Comparison Summary")
    for key, value in overall_summary.items():
        st.write(f"{key.replace('_', ' ').capitalize()}: {value}")

    # Provide download link for generated PDF with marked differences
    st.subheader("Download PDF with Side-by-Side Comparisons and Observations")
    st.download_button("Download Marked PDF", data=pdf_buffer, file_name="side_by_side_comparison.pdf", mime="application/pdf")


# Launch the interface when this file is executed as a script.
if __name__ == "__main__":
    main()