Update app.py
Browse files
app.py
CHANGED
|
@@ -19,10 +19,10 @@ def load_and_compare_documents(file1, file2):
|
|
| 19 |
file2_content = file2.read()
|
| 20 |
|
| 21 |
# Perform OCR-based comparison across all pages
|
| 22 |
-
ocr_differences,
|
| 23 |
|
| 24 |
-
# Generate a PDF with
|
| 25 |
-
pdf_buffer =
|
| 26 |
|
| 27 |
# Compile an overall summary of differences
|
| 28 |
overall_summary = generate_overall_summary(ocr_differences)
|
|
@@ -34,7 +34,7 @@ def pdf_to_images(file_content):
|
|
| 34 |
pdf_document = fitz.open(stream=file_content, filetype="pdf")
|
| 35 |
for page_num in range(pdf_document.page_count):
|
| 36 |
page = pdf_document.load_page(page_num)
|
| 37 |
-
pix = page.get_pixmap(dpi=
|
| 38 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
| 39 |
|
| 40 |
# Preprocess image: adjust brightness, contrast, and apply filter
|
|
@@ -50,7 +50,8 @@ def pdf_to_images(file_content):
|
|
| 50 |
|
| 51 |
def perform_ocr_and_compare(content1, content2):
|
| 52 |
ocr_differences = []
|
| 53 |
-
|
|
|
|
| 54 |
images1 = pdf_to_images(content1)
|
| 55 |
images2 = pdf_to_images(content2)
|
| 56 |
|
|
@@ -62,9 +63,11 @@ def perform_ocr_and_compare(content1, content2):
|
|
| 62 |
text1 = ' '.join([result[1] for result in ocr_reader.readtext(img1_np)])
|
| 63 |
text2 = ' '.join([result[1] for result in ocr_reader.readtext(img2_np)])
|
| 64 |
|
| 65 |
-
# Duplicate
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
| 68 |
|
| 69 |
if text1.strip().lower() != text2.strip().lower(): # Case-insensitive, whitespace-trimmed
|
| 70 |
diff = list(difflib.ndiff(text1, text2))
|
|
@@ -81,71 +84,80 @@ def perform_ocr_and_compare(content1, content2):
|
|
| 81 |
|
| 82 |
ocr_differences.append({"page": page_num, "differences": page_diffs})
|
| 83 |
|
| 84 |
-
# Mark OCR-detected differences
|
| 85 |
for result in ocr_reader.readtext(img2_np):
|
| 86 |
bbox, detected_text = result[0], result[1]
|
| 87 |
if detected_text.strip().lower() in text2.lower() and detected_text.strip().lower() not in text1.lower():
|
| 88 |
flattened_bbox = [coord for point in bbox for coord in point]
|
| 89 |
-
|
| 90 |
-
|
| 91 |
diff_index += 1
|
| 92 |
|
| 93 |
-
|
|
|
|
| 94 |
|
| 95 |
-
return ocr_differences,
|
| 96 |
|
| 97 |
-
def
|
| 98 |
pdf_buffer = BytesIO()
|
| 99 |
-
c = canvas.Canvas(pdf_buffer, pagesize=letter)
|
| 100 |
|
| 101 |
-
# Loop through each page to add
|
| 102 |
-
for page_num,
|
| 103 |
-
|
| 104 |
-
img.save(temp_img_file, format="PNG")
|
| 105 |
-
temp_img_path = temp_img_file.name
|
| 106 |
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
c.save()
|
| 151 |
pdf_buffer.seek(0)
|
|
@@ -183,8 +195,8 @@ def main():
|
|
| 183 |
st.write(f"{key.replace('_', ' ').capitalize()}: {value}")
|
| 184 |
|
| 185 |
# Provide download link for generated PDF with marked differences
|
| 186 |
-
st.subheader("Download PDF with
|
| 187 |
-
st.download_button("Download Marked PDF", data=pdf_buffer, file_name="
|
| 188 |
|
| 189 |
if __name__ == "__main__":
|
| 190 |
main()
|
|
|
|
| 19 |
file2_content = file2.read()
|
| 20 |
|
| 21 |
# Perform OCR-based comparison across all pages
|
| 22 |
+
ocr_differences, marked_images_1, marked_images_2 = perform_ocr_and_compare(file1_content, file2_content)
|
| 23 |
|
| 24 |
+
# Generate a PDF with side-by-side comparisons and observation tables
|
| 25 |
+
pdf_buffer = create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differences)
|
| 26 |
|
| 27 |
# Compile an overall summary of differences
|
| 28 |
overall_summary = generate_overall_summary(ocr_differences)
|
|
|
|
| 34 |
pdf_document = fitz.open(stream=file_content, filetype="pdf")
|
| 35 |
for page_num in range(pdf_document.page_count):
|
| 36 |
page = pdf_document.load_page(page_num)
|
| 37 |
+
pix = page.get_pixmap(dpi=300) # High DPI for better zoom capability
|
| 38 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
| 39 |
|
| 40 |
# Preprocess image: adjust brightness, contrast, and apply filter
|
|
|
|
| 50 |
|
| 51 |
def perform_ocr_and_compare(content1, content2):
|
| 52 |
ocr_differences = []
|
| 53 |
+
marked_images_1 = {}
|
| 54 |
+
marked_images_2 = {}
|
| 55 |
images1 = pdf_to_images(content1)
|
| 56 |
images2 = pdf_to_images(content2)
|
| 57 |
|
|
|
|
| 63 |
text1 = ' '.join([result[1] for result in ocr_reader.readtext(img1_np)])
|
| 64 |
text2 = ' '.join([result[1] for result in ocr_reader.readtext(img2_np)])
|
| 65 |
|
| 66 |
+
# Duplicate images for marking OCR differences
|
| 67 |
+
marked_img1 = img1.copy()
|
| 68 |
+
marked_img2 = img2.copy()
|
| 69 |
+
draw1 = ImageDraw.Draw(marked_img1)
|
| 70 |
+
draw2 = ImageDraw.Draw(marked_img2)
|
| 71 |
|
| 72 |
if text1.strip().lower() != text2.strip().lower(): # Case-insensitive, whitespace-trimmed
|
| 73 |
diff = list(difflib.ndiff(text1, text2))
|
|
|
|
| 84 |
|
| 85 |
ocr_differences.append({"page": page_num, "differences": page_diffs})
|
| 86 |
|
| 87 |
+
# Mark OCR-detected differences as boxed highlights on both images
|
| 88 |
for result in ocr_reader.readtext(img2_np):
|
| 89 |
bbox, detected_text = result[0], result[1]
|
| 90 |
if detected_text.strip().lower() in text2.lower() and detected_text.strip().lower() not in text1.lower():
|
| 91 |
flattened_bbox = [coord for point in bbox for coord in point]
|
| 92 |
+
draw1.rectangle([flattened_bbox[0], flattened_bbox[1], flattened_bbox[2], flattened_bbox[3]], outline="blue", width=2)
|
| 93 |
+
draw2.rectangle([flattened_bbox[0], flattened_bbox[1], flattened_bbox[2], flattened_bbox[3]], outline="blue", width=2)
|
| 94 |
diff_index += 1
|
| 95 |
|
| 96 |
+
marked_images_1[page_num] = marked_img1
|
| 97 |
+
marked_images_2[page_num] = marked_img2
|
| 98 |
|
| 99 |
+
return ocr_differences, marked_images_1, marked_images_2
|
| 100 |
|
| 101 |
+
def create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differences):
|
| 102 |
pdf_buffer = BytesIO()
|
| 103 |
+
c = canvas.Canvas(pdf_buffer, pagesize=(letter[0] * 2, letter[1])) # Adjusted for side-by-side layout
|
| 104 |
|
| 105 |
+
# Loop through each page to add side-by-side images and observations
|
| 106 |
+
for page_num, img1 in marked_images_1.items():
|
| 107 |
+
img2 = marked_images_2.get(page_num)
|
|
|
|
|
|
|
| 108 |
|
| 109 |
+
if img2:
|
| 110 |
+
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file1:
|
| 111 |
+
img1.save(temp_img_file1, format="PNG")
|
| 112 |
+
temp_img_path1 = temp_img_file1.name
|
| 113 |
+
|
| 114 |
+
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file2:
|
| 115 |
+
img2.save(temp_img_file2, format="PNG")
|
| 116 |
+
temp_img_path2 = temp_img_file2.name
|
| 117 |
+
|
| 118 |
+
# Draw the saved images side-by-side on the PDF
|
| 119 |
+
c.drawImage(temp_img_path1, 0, 0, width=letter[0], height=letter[1])
|
| 120 |
+
c.drawImage(temp_img_path2, letter[0], 0, width=letter[0], height=letter[1])
|
| 121 |
+
c.showPage()
|
| 122 |
+
|
| 123 |
+
try:
|
| 124 |
+
os.remove(temp_img_path1)
|
| 125 |
+
os.remove(temp_img_path2)
|
| 126 |
+
except OSError:
|
| 127 |
+
pass
|
| 128 |
+
|
| 129 |
+
# Generate the observation table for each page
|
| 130 |
+
c.setFont("Helvetica", 10)
|
| 131 |
+
y_position = 750
|
| 132 |
+
c.drawString(10, y_position, f"Observation Summary for Page {page_num}:")
|
| 133 |
+
y_position -= 20
|
| 134 |
+
|
| 135 |
+
# Table data for each page
|
| 136 |
+
data = {"Additions": [], "Deletions": [], "Modifications": []}
|
| 137 |
+
for ocr_diff in ocr_differences:
|
| 138 |
+
if ocr_diff["page"] == page_num:
|
| 139 |
+
for diff in ocr_diff["differences"]:
|
| 140 |
+
if "Added" in diff:
|
| 141 |
+
data["Additions"].append(diff)
|
| 142 |
+
elif "Deleted" in diff:
|
| 143 |
+
data["Deletions"].append(diff)
|
| 144 |
+
elif "Modified" in diff:
|
| 145 |
+
data["Modifications"].append(diff)
|
| 146 |
+
|
| 147 |
+
# Convert data to DataFrame for formatting
|
| 148 |
+
df = pd.DataFrame.from_dict(data, orient="index").transpose()
|
| 149 |
+
column_widths = [150, 150, 150]
|
| 150 |
+
|
| 151 |
+
# Render the DataFrame as a table in the PDF
|
| 152 |
+
for row in df.itertuples(index=False):
|
| 153 |
+
for col_index, value in enumerate(row):
|
| 154 |
+
c.drawString(10 + col_index * column_widths[col_index], y_position, str(value))
|
| 155 |
+
y_position -= 15
|
| 156 |
+
if y_position < 50: # Start a new page if space is running out
|
| 157 |
+
c.showPage()
|
| 158 |
+
y_position = 750
|
| 159 |
+
|
| 160 |
+
c.showPage()
|
| 161 |
|
| 162 |
c.save()
|
| 163 |
pdf_buffer.seek(0)
|
|
|
|
| 195 |
st.write(f"{key.replace('_', ' ').capitalize()}: {value}")
|
| 196 |
|
| 197 |
# Provide download link for generated PDF with marked differences
|
| 198 |
+
st.subheader("Download PDF with Side-by-Side Comparisons and Observations")
|
| 199 |
+
st.download_button("Download Marked PDF", data=pdf_buffer, file_name="side_by_side_comparison.pdf", mime="application/pdf")
|
| 200 |
|
| 201 |
if __name__ == "__main__":
|
| 202 |
main()
|