Update app.py
Browse files
app.py
CHANGED
|
@@ -10,6 +10,7 @@ from reportlab.pdfgen import canvas
|
|
| 10 |
from reportlab.lib.pagesizes import letter
|
| 11 |
import tempfile
|
| 12 |
import os
|
|
|
|
| 13 |
|
| 14 |
# Initialize the easyocr Reader
|
| 15 |
ocr_reader = easyocr.Reader(['en'])
|
|
@@ -25,12 +26,12 @@ def load_and_compare_documents(file1, file2):
|
|
| 25 |
ocr_differences, marked_images = perform_ocr_and_compare(file1_content, file2_content)
|
| 26 |
|
| 27 |
# Generate a PDF with marked OCR differences and positions
|
| 28 |
-
|
| 29 |
|
| 30 |
# Compile an overall summary of differences
|
| 31 |
overall_summary = generate_overall_summary(text_differences, text_property_changes, special_char_changes, placement_changes, ocr_differences)
|
| 32 |
|
| 33 |
-
return text_differences, text_property_changes, special_char_changes, placement_changes,
|
| 34 |
|
| 35 |
def pdf_to_images(file_content):
|
| 36 |
images = []
|
|
@@ -160,8 +161,9 @@ def perform_ocr_and_compare(content1, content2):
|
|
| 160 |
return ocr_differences, marked_images
|
| 161 |
|
| 162 |
def create_pdf_with_differences(marked_images, ocr_differences):
|
| 163 |
-
|
| 164 |
-
|
|
|
|
| 165 |
|
| 166 |
for page_num, img in marked_images.items():
|
| 167 |
# Save the marked image to a temporary file
|
|
@@ -188,14 +190,15 @@ def create_pdf_with_differences(marked_images, ocr_differences):
|
|
| 188 |
# Move to the next page and delete the temporary image file
|
| 189 |
c.showPage()
|
| 190 |
temp_img_file.close()
|
| 191 |
-
# Remove the temporary file to avoid clutter
|
| 192 |
try:
|
| 193 |
os.remove(temp_img_path)
|
| 194 |
except OSError:
|
| 195 |
pass
|
| 196 |
|
|
|
|
| 197 |
c.save()
|
| 198 |
-
|
|
|
|
| 199 |
|
| 200 |
def generate_overall_summary(text_differences, text_property_changes, special_char_changes, placement_changes, ocr_differences):
|
| 201 |
overall_summary = {
|
|
@@ -222,7 +225,7 @@ def main():
|
|
| 222 |
st.error("One or both files are empty. Please upload valid PDF files.")
|
| 223 |
return
|
| 224 |
|
| 225 |
-
text_differences, text_property_changes, special_char_changes, placement_changes,
|
| 226 |
|
| 227 |
st.subheader("Overall Comparison Summary")
|
| 228 |
for key, value in overall_summary.items():
|
|
@@ -230,9 +233,7 @@ def main():
|
|
| 230 |
|
| 231 |
# Provide download link for generated PDF with marked differences
|
| 232 |
st.subheader("Download PDF with Marked OCR Differences")
|
| 233 |
-
|
| 234 |
-
pdf_bytes = pdf_file.read()
|
| 235 |
-
st.download_button("Download Marked PDF", data=pdf_bytes, file_name="marked_differences.pdf", mime="application/pdf")
|
| 236 |
|
| 237 |
if __name__ == "__main__":
|
| 238 |
main()
|
|
|
|
| 10 |
from reportlab.lib.pagesizes import letter
|
| 11 |
import tempfile
|
| 12 |
import os
|
| 13 |
+
from io import BytesIO
|
| 14 |
|
| 15 |
# Initialize the easyocr Reader
|
| 16 |
ocr_reader = easyocr.Reader(['en'])
|
|
|
|
| 26 |
ocr_differences, marked_images = perform_ocr_and_compare(file1_content, file2_content)
|
| 27 |
|
| 28 |
# Generate a PDF with marked OCR differences and positions
|
| 29 |
+
pdf_buffer = create_pdf_with_differences(marked_images, ocr_differences)
|
| 30 |
|
| 31 |
# Compile an overall summary of differences
|
| 32 |
overall_summary = generate_overall_summary(text_differences, text_property_changes, special_char_changes, placement_changes, ocr_differences)
|
| 33 |
|
| 34 |
+
return text_differences, text_property_changes, special_char_changes, placement_changes, pdf_buffer, overall_summary
|
| 35 |
|
| 36 |
def pdf_to_images(file_content):
|
| 37 |
images = []
|
|
|
|
| 161 |
return ocr_differences, marked_images
|
| 162 |
|
| 163 |
def create_pdf_with_differences(marked_images, ocr_differences):
|
| 164 |
+
# Use BytesIO to create an in-memory PDF file
|
| 165 |
+
pdf_buffer = BytesIO()
|
| 166 |
+
c = canvas.Canvas(pdf_buffer, pagesize=letter)
|
| 167 |
|
| 168 |
for page_num, img in marked_images.items():
|
| 169 |
# Save the marked image to a temporary file
|
|
|
|
| 190 |
# Move to the next page and delete the temporary image file
|
| 191 |
c.showPage()
|
| 192 |
temp_img_file.close()
|
|
|
|
| 193 |
try:
|
| 194 |
os.remove(temp_img_path)
|
| 195 |
except OSError:
|
| 196 |
pass
|
| 197 |
|
| 198 |
+
# Save the PDF to the in-memory buffer
|
| 199 |
c.save()
|
| 200 |
+
pdf_buffer.seek(0)
|
| 201 |
+
return pdf_buffer
|
| 202 |
|
| 203 |
def generate_overall_summary(text_differences, text_property_changes, special_char_changes, placement_changes, ocr_differences):
|
| 204 |
overall_summary = {
|
|
|
|
| 225 |
st.error("One or both files are empty. Please upload valid PDF files.")
|
| 226 |
return
|
| 227 |
|
| 228 |
+
text_differences, text_property_changes, special_char_changes, placement_changes, pdf_buffer, overall_summary = load_and_compare_documents(customer_file, output_file)
|
| 229 |
|
| 230 |
st.subheader("Overall Comparison Summary")
|
| 231 |
for key, value in overall_summary.items():
|
|
|
|
| 233 |
|
| 234 |
# Provide download link for generated PDF with marked differences
|
| 235 |
st.subheader("Download PDF with Marked OCR Differences")
|
| 236 |
+
st.download_button("Download Marked PDF", data=pdf_buffer, file_name="marked_differences.pdf", mime="application/pdf")
|
|
|
|
|
|
|
| 237 |
|
| 238 |
if __name__ == "__main__":
|
| 239 |
main()
|