SuriRaja committed on
Commit
0daa9c2
·
verified ·
1 Parent(s): 95fb28f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -10
app.py CHANGED
@@ -22,10 +22,10 @@ def load_and_compare_documents(file1, file2):
22
  # Extract and compare text with font properties, placement, and special characters
23
  text_differences, text_property_changes, special_char_changes, placement_changes = compare_text_with_properties(file1_content, file2_content)
24
 
25
- # Perform OCR-based comparison and underline OCR differences on images
26
  ocr_differences, marked_images = perform_ocr_and_compare(file1_content, file2_content)
27
 
28
- # Generate a PDF with marked OCR differences and positions
29
  pdf_buffer = create_pdf_with_differences(marked_images, ocr_differences)
30
 
31
  # Compile an overall summary of differences
@@ -54,7 +54,7 @@ def compare_text_with_properties(content1, content2):
54
  placement_changes = []
55
 
56
  for e1, e2 in zip(elements1, elements2):
57
- if e1["text"] != e2["text"]:
58
  diff = list(difflib.ndiff(e1["text"], e2["text"]))
59
  for i, change in enumerate(diff):
60
  if change.startswith("+ "):
@@ -130,7 +130,7 @@ def perform_ocr_and_compare(content1, content2):
130
  marked_img = img2.copy()
131
  draw = ImageDraw.Draw(marked_img)
132
 
133
- if text1 != text2:
134
  diff = list(difflib.ndiff(text1, text2))
135
  page_diffs = []
136
  diff_index = 1 # Start index for marking
@@ -148,7 +148,7 @@ def perform_ocr_and_compare(content1, content2):
148
  # Mark OCR-detected differences and indices on image
149
  for result in ocr_reader.readtext(img2_np):
150
  bbox, detected_text = result[0], result[1]
151
- if detected_text in text2 and detected_text not in text1:
152
  # Flatten bounding box
153
  flattened_bbox = [coord for point in bbox for coord in point]
154
  # Draw a rectangle around the OCR difference and label with index
@@ -161,12 +161,10 @@ def perform_ocr_and_compare(content1, content2):
161
  return ocr_differences, marked_images
162
 
163
  def create_pdf_with_differences(marked_images, ocr_differences):
164
- # Use BytesIO to create an in-memory PDF file
165
  pdf_buffer = BytesIO()
166
  c = canvas.Canvas(pdf_buffer, pagesize=letter)
167
 
168
  for page_num, img in marked_images.items():
169
- # Save the marked image to a temporary file
170
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file:
171
  img.save(temp_img_file, format="PNG")
172
  temp_img_path = temp_img_file.name
@@ -174,7 +172,7 @@ def create_pdf_with_differences(marked_images, ocr_differences):
174
  # Draw the saved image on the PDF
175
  c.drawImage(temp_img_path, 0, 0, width=letter[0], height=letter[1])
176
 
177
- # Add OCR differences to PDF
178
  c.setFont("Helvetica", 10)
179
  c.drawString(10, 750, f"Page {page_num} OCR Differences:")
180
  y_position = 730
@@ -187,7 +185,6 @@ def create_pdf_with_differences(marked_images, ocr_differences):
187
  c.drawString(10, y_position, diff)
188
  y_position -= 15
189
 
190
- # Move to the next page and delete the temporary image file
191
  c.showPage()
192
  temp_img_file.close()
193
  try:
@@ -195,7 +192,6 @@ def create_pdf_with_differences(marked_images, ocr_differences):
195
  except OSError:
196
  pass
197
 
198
- # Save the PDF to the in-memory buffer
199
  c.save()
200
  pdf_buffer.seek(0)
201
  return pdf_buffer
 
22
  # Extract and compare text with font properties, placement, and special characters
23
  text_differences, text_property_changes, special_char_changes, placement_changes = compare_text_with_properties(file1_content, file2_content)
24
 
25
+ # Perform OCR-based comparison and mark OCR differences with indices
26
  ocr_differences, marked_images = perform_ocr_and_compare(file1_content, file2_content)
27
 
28
+ # Generate a PDF with marked OCR differences and observations
29
  pdf_buffer = create_pdf_with_differences(marked_images, ocr_differences)
30
 
31
  # Compile an overall summary of differences
 
54
  placement_changes = []
55
 
56
  for e1, e2 in zip(elements1, elements2):
57
+ if e1["text"].strip().lower() != e2["text"].strip().lower(): # Enhanced: Case-insensitive, whitespace-trimmed
58
  diff = list(difflib.ndiff(e1["text"], e2["text"]))
59
  for i, change in enumerate(diff):
60
  if change.startswith("+ "):
 
130
  marked_img = img2.copy()
131
  draw = ImageDraw.Draw(marked_img)
132
 
133
+ if text1.strip().lower() != text2.strip().lower(): # Enhanced: Case-insensitive, whitespace-trimmed
134
  diff = list(difflib.ndiff(text1, text2))
135
  page_diffs = []
136
  diff_index = 1 # Start index for marking
 
148
  # Mark OCR-detected differences and indices on image
149
  for result in ocr_reader.readtext(img2_np):
150
  bbox, detected_text = result[0], result[1]
151
+ if detected_text.strip().lower() in text2.lower() and detected_text.strip().lower() not in text1.lower():
152
  # Flatten bounding box
153
  flattened_bbox = [coord for point in bbox for coord in point]
154
  # Draw a rectangle around the OCR difference and label with index
 
161
  return ocr_differences, marked_images
162
 
163
  def create_pdf_with_differences(marked_images, ocr_differences):
 
164
  pdf_buffer = BytesIO()
165
  c = canvas.Canvas(pdf_buffer, pagesize=letter)
166
 
167
  for page_num, img in marked_images.items():
 
168
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file:
169
  img.save(temp_img_file, format="PNG")
170
  temp_img_path = temp_img_file.name
 
172
  # Draw the saved image on the PDF
173
  c.drawImage(temp_img_path, 0, 0, width=letter[0], height=letter[1])
174
 
175
+ # Add OCR observations below the image
176
  c.setFont("Helvetica", 10)
177
  c.drawString(10, 750, f"Page {page_num} OCR Differences:")
178
  y_position = 730
 
185
  c.drawString(10, y_position, diff)
186
  y_position -= 15
187
 
 
188
  c.showPage()
189
  temp_img_file.close()
190
  try:
 
192
  except OSError:
193
  pass
194
 
 
195
  c.save()
196
  pdf_buffer.seek(0)
197
  return pdf_buffer