Update app.py
Browse files
app.py
CHANGED
|
@@ -22,10 +22,10 @@ def load_and_compare_documents(file1, file2):
|
|
| 22 |
# Extract and compare text with font properties, placement, and special characters
|
| 23 |
text_differences, text_property_changes, special_char_changes, placement_changes = compare_text_with_properties(file1_content, file2_content)
|
| 24 |
|
| 25 |
-
# Perform OCR-based comparison and
|
| 26 |
ocr_differences, marked_images = perform_ocr_and_compare(file1_content, file2_content)
|
| 27 |
|
| 28 |
-
# Generate a PDF with marked OCR differences and
|
| 29 |
pdf_buffer = create_pdf_with_differences(marked_images, ocr_differences)
|
| 30 |
|
| 31 |
# Compile an overall summary of differences
|
|
@@ -54,7 +54,7 @@ def compare_text_with_properties(content1, content2):
|
|
| 54 |
placement_changes = []
|
| 55 |
|
| 56 |
for e1, e2 in zip(elements1, elements2):
|
| 57 |
-
if e1["text"] != e2["text"]:
|
| 58 |
diff = list(difflib.ndiff(e1["text"], e2["text"]))
|
| 59 |
for i, change in enumerate(diff):
|
| 60 |
if change.startswith("+ "):
|
|
@@ -130,7 +130,7 @@ def perform_ocr_and_compare(content1, content2):
|
|
| 130 |
marked_img = img2.copy()
|
| 131 |
draw = ImageDraw.Draw(marked_img)
|
| 132 |
|
| 133 |
-
if text1 != text2:
|
| 134 |
diff = list(difflib.ndiff(text1, text2))
|
| 135 |
page_diffs = []
|
| 136 |
diff_index = 1 # Start index for marking
|
|
@@ -148,7 +148,7 @@ def perform_ocr_and_compare(content1, content2):
|
|
| 148 |
# Mark OCR-detected differences and indices on image
|
| 149 |
for result in ocr_reader.readtext(img2_np):
|
| 150 |
bbox, detected_text = result[0], result[1]
|
| 151 |
-
if detected_text in text2 and detected_text not in text1:
|
| 152 |
# Flatten bounding box
|
| 153 |
flattened_bbox = [coord for point in bbox for coord in point]
|
| 154 |
# Draw a rectangle around the OCR difference and label with index
|
|
@@ -161,12 +161,10 @@ def perform_ocr_and_compare(content1, content2):
|
|
| 161 |
return ocr_differences, marked_images
|
| 162 |
|
| 163 |
def create_pdf_with_differences(marked_images, ocr_differences):
|
| 164 |
-
# Use BytesIO to create an in-memory PDF file
|
| 165 |
pdf_buffer = BytesIO()
|
| 166 |
c = canvas.Canvas(pdf_buffer, pagesize=letter)
|
| 167 |
|
| 168 |
for page_num, img in marked_images.items():
|
| 169 |
-
# Save the marked image to a temporary file
|
| 170 |
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file:
|
| 171 |
img.save(temp_img_file, format="PNG")
|
| 172 |
temp_img_path = temp_img_file.name
|
|
@@ -174,7 +172,7 @@ def create_pdf_with_differences(marked_images, ocr_differences):
|
|
| 174 |
# Draw the saved image on the PDF
|
| 175 |
c.drawImage(temp_img_path, 0, 0, width=letter[0], height=letter[1])
|
| 176 |
|
| 177 |
-
# Add OCR
|
| 178 |
c.setFont("Helvetica", 10)
|
| 179 |
c.drawString(10, 750, f"Page {page_num} OCR Differences:")
|
| 180 |
y_position = 730
|
|
@@ -187,7 +185,6 @@ def create_pdf_with_differences(marked_images, ocr_differences):
|
|
| 187 |
c.drawString(10, y_position, diff)
|
| 188 |
y_position -= 15
|
| 189 |
|
| 190 |
-
# Move to the next page and delete the temporary image file
|
| 191 |
c.showPage()
|
| 192 |
temp_img_file.close()
|
| 193 |
try:
|
|
@@ -195,7 +192,6 @@ def create_pdf_with_differences(marked_images, ocr_differences):
|
|
| 195 |
except OSError:
|
| 196 |
pass
|
| 197 |
|
| 198 |
-
# Save the PDF to the in-memory buffer
|
| 199 |
c.save()
|
| 200 |
pdf_buffer.seek(0)
|
| 201 |
return pdf_buffer
|
|
|
|
| 22 |
# Extract and compare text with font properties, placement, and special characters
|
| 23 |
text_differences, text_property_changes, special_char_changes, placement_changes = compare_text_with_properties(file1_content, file2_content)
|
| 24 |
|
| 25 |
+
# Perform OCR-based comparison and mark OCR differences with indices
|
| 26 |
ocr_differences, marked_images = perform_ocr_and_compare(file1_content, file2_content)
|
| 27 |
|
| 28 |
+
# Generate a PDF with marked OCR differences and observations
|
| 29 |
pdf_buffer = create_pdf_with_differences(marked_images, ocr_differences)
|
| 30 |
|
| 31 |
# Compile an overall summary of differences
|
|
|
|
| 54 |
placement_changes = []
|
| 55 |
|
| 56 |
for e1, e2 in zip(elements1, elements2):
|
| 57 |
+
if e1["text"].strip().lower() != e2["text"].strip().lower(): # Enhanced: Case-insensitive, whitespace-trimmed
|
| 58 |
diff = list(difflib.ndiff(e1["text"], e2["text"]))
|
| 59 |
for i, change in enumerate(diff):
|
| 60 |
if change.startswith("+ "):
|
|
|
|
| 130 |
marked_img = img2.copy()
|
| 131 |
draw = ImageDraw.Draw(marked_img)
|
| 132 |
|
| 133 |
+
if text1.strip().lower() != text2.strip().lower(): # Enhanced: Case-insensitive, whitespace-trimmed
|
| 134 |
diff = list(difflib.ndiff(text1, text2))
|
| 135 |
page_diffs = []
|
| 136 |
diff_index = 1 # Start index for marking
|
|
|
|
| 148 |
# Mark OCR-detected differences and indices on image
|
| 149 |
for result in ocr_reader.readtext(img2_np):
|
| 150 |
bbox, detected_text = result[0], result[1]
|
| 151 |
+
if detected_text.strip().lower() in text2.lower() and detected_text.strip().lower() not in text1.lower():
|
| 152 |
# Flatten bounding box
|
| 153 |
flattened_bbox = [coord for point in bbox for coord in point]
|
| 154 |
# Draw a rectangle around the OCR difference and label with index
|
|
|
|
| 161 |
return ocr_differences, marked_images
|
| 162 |
|
| 163 |
def create_pdf_with_differences(marked_images, ocr_differences):
|
|
|
|
| 164 |
pdf_buffer = BytesIO()
|
| 165 |
c = canvas.Canvas(pdf_buffer, pagesize=letter)
|
| 166 |
|
| 167 |
for page_num, img in marked_images.items():
|
|
|
|
| 168 |
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file:
|
| 169 |
img.save(temp_img_file, format="PNG")
|
| 170 |
temp_img_path = temp_img_file.name
|
|
|
|
| 172 |
# Draw the saved image on the PDF
|
| 173 |
c.drawImage(temp_img_path, 0, 0, width=letter[0], height=letter[1])
|
| 174 |
|
| 175 |
+
# Add OCR observations as text drawn over the full-page image, starting near the top of the page
|
| 176 |
c.setFont("Helvetica", 10)
|
| 177 |
c.drawString(10, 750, f"Page {page_num} OCR Differences:")
|
| 178 |
y_position = 730
|
|
|
|
| 185 |
c.drawString(10, y_position, diff)
|
| 186 |
y_position -= 15
|
| 187 |
|
|
|
|
| 188 |
c.showPage()
|
| 189 |
temp_img_file.close()
|
| 190 |
try:
|
|
|
|
| 192 |
except OSError:
|
| 193 |
pass
|
| 194 |
|
|
|
|
| 195 |
c.save()
|
| 196 |
pdf_buffer.seek(0)
|
| 197 |
return pdf_buffer
|