SuriRaja committed on
Commit
c91c330
·
verified ·
1 Parent(s): f7a769c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -64
app.py CHANGED
@@ -19,10 +19,10 @@ def load_and_compare_documents(file1, file2):
19
  file2_content = file2.read()
20
 
21
  # Perform OCR-based comparison across all pages
22
- ocr_differences, marked_images = perform_ocr_and_compare(file1_content, file2_content)
23
 
24
- # Generate a PDF with marked OCR differences for each page and observation tables
25
- pdf_buffer = create_pdf_with_observations(marked_images, ocr_differences)
26
 
27
  # Compile an overall summary of differences
28
  overall_summary = generate_overall_summary(ocr_differences)
@@ -34,7 +34,7 @@ def pdf_to_images(file_content):
34
  pdf_document = fitz.open(stream=file_content, filetype="pdf")
35
  for page_num in range(pdf_document.page_count):
36
  page = pdf_document.load_page(page_num)
37
- pix = page.get_pixmap(dpi=150) # Higher DPI for clearer images
38
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
39
 
40
  # Preprocess image: adjust brightness, contrast, and apply filter
@@ -50,7 +50,8 @@ def pdf_to_images(file_content):
50
 
51
  def perform_ocr_and_compare(content1, content2):
52
  ocr_differences = []
53
- marked_images = {}
 
54
  images1 = pdf_to_images(content1)
55
  images2 = pdf_to_images(content2)
56
 
@@ -62,9 +63,11 @@ def perform_ocr_and_compare(content1, content2):
62
  text1 = ' '.join([result[1] for result in ocr_reader.readtext(img1_np)])
63
  text2 = ' '.join([result[1] for result in ocr_reader.readtext(img2_np)])
64
 
65
- # Duplicate image for marking OCR differences
66
- marked_img = img2.copy()
67
- draw = ImageDraw.Draw(marked_img)
 
 
68
 
69
  if text1.strip().lower() != text2.strip().lower(): # Case-insensitive, whitespace-trimmed
70
  diff = list(difflib.ndiff(text1, text2))
@@ -81,71 +84,80 @@ def perform_ocr_and_compare(content1, content2):
81
 
82
  ocr_differences.append({"page": page_num, "differences": page_diffs})
83
 
84
- # Mark OCR-detected differences and indices on image
85
  for result in ocr_reader.readtext(img2_np):
86
  bbox, detected_text = result[0], result[1]
87
  if detected_text.strip().lower() in text2.lower() and detected_text.strip().lower() not in text1.lower():
88
  flattened_bbox = [coord for point in bbox for coord in point]
89
- draw.rectangle([flattened_bbox[0], flattened_bbox[1], flattened_bbox[2], flattened_bbox[3]], outline="red", width=2)
90
- draw.text((flattened_bbox[0], flattened_bbox[1] - 10), str(diff_index), fill="blue")
91
  diff_index += 1
92
 
93
- marked_images[page_num] = marked_img
 
94
 
95
- return ocr_differences, marked_images
96
 
97
- def create_pdf_with_observations(marked_images, ocr_differences):
98
  pdf_buffer = BytesIO()
99
- c = canvas.Canvas(pdf_buffer, pagesize=letter)
100
 
101
- # Loop through each page to add image with differences and observations
102
- for page_num, img in marked_images.items():
103
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file:
104
- img.save(temp_img_file, format="PNG")
105
- temp_img_path = temp_img_file.name
106
 
107
- # Draw the saved image on the PDF
108
- c.drawImage(temp_img_path, 0, 0, width=letter[0], height=letter[1])
109
- c.showPage()
110
-
111
- temp_img_file.close()
112
- try:
113
- os.remove(temp_img_path)
114
- except OSError:
115
- pass
116
-
117
- # Generate the observation table for each page
118
- c.setFont("Helvetica", 10)
119
- y_position = 750
120
- c.drawString(10, y_position, f"Observation Summary for Page {page_num}:")
121
- y_position -= 20
122
-
123
- # Table data for each page
124
- data = {"Additions": [], "Deletions": [], "Modifications": []}
125
- for ocr_diff in ocr_differences:
126
- if ocr_diff["page"] == page_num:
127
- for diff in ocr_diff["differences"]:
128
- if "Added" in diff:
129
- data["Additions"].append(diff)
130
- elif "Deleted" in diff:
131
- data["Deletions"].append(diff)
132
- elif "Modified" in diff:
133
- data["Modifications"].append(diff)
134
-
135
- # Convert data to DataFrame for formatting
136
- df = pd.DataFrame.from_dict(data, orient="index").transpose()
137
- column_widths = [150, 150, 150]
138
-
139
- # Render the DataFrame as a table in the PDF
140
- for row in df.itertuples(index=False):
141
- for col_index, value in enumerate(row):
142
- c.drawString(10 + col_index * column_widths[col_index], y_position, str(value))
143
- y_position -= 15
144
- if y_position < 50: # Start a new page if space is running out
145
- c.showPage()
146
- y_position = 750
147
-
148
- c.showPage()
 
 
 
 
 
 
 
 
 
 
149
 
150
  c.save()
151
  pdf_buffer.seek(0)
@@ -183,8 +195,8 @@ def main():
183
  st.write(f"{key.replace('_', ' ').capitalize()}: {value}")
184
 
185
  # Provide download link for generated PDF with marked differences
186
- st.subheader("Download PDF with Marked OCR Differences and Observations")
187
- st.download_button("Download Marked PDF", data=pdf_buffer, file_name="marked_differences_and_observations.pdf", mime="application/pdf")
188
 
189
  if __name__ == "__main__":
190
  main()
 
19
  file2_content = file2.read()
20
 
21
  # Perform OCR-based comparison across all pages
22
+ ocr_differences, marked_images_1, marked_images_2 = perform_ocr_and_compare(file1_content, file2_content)
23
 
24
+ # Generate a PDF with side-by-side comparisons and observation tables
25
+ pdf_buffer = create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differences)
26
 
27
  # Compile an overall summary of differences
28
  overall_summary = generate_overall_summary(ocr_differences)
 
34
  pdf_document = fitz.open(stream=file_content, filetype="pdf")
35
  for page_num in range(pdf_document.page_count):
36
  page = pdf_document.load_page(page_num)
37
+ pix = page.get_pixmap(dpi=300) # High DPI for better zoom capability
38
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
39
 
40
  # Preprocess image: adjust brightness, contrast, and apply filter
 
50
 
51
  def perform_ocr_and_compare(content1, content2):
52
  ocr_differences = []
53
+ marked_images_1 = {}
54
+ marked_images_2 = {}
55
  images1 = pdf_to_images(content1)
56
  images2 = pdf_to_images(content2)
57
 
 
63
  text1 = ' '.join([result[1] for result in ocr_reader.readtext(img1_np)])
64
  text2 = ' '.join([result[1] for result in ocr_reader.readtext(img2_np)])
65
 
66
+ # Duplicate images for marking OCR differences
67
+ marked_img1 = img1.copy()
68
+ marked_img2 = img2.copy()
69
+ draw1 = ImageDraw.Draw(marked_img1)
70
+ draw2 = ImageDraw.Draw(marked_img2)
71
 
72
  if text1.strip().lower() != text2.strip().lower(): # Case-insensitive, whitespace-trimmed
73
  diff = list(difflib.ndiff(text1, text2))
 
84
 
85
  ocr_differences.append({"page": page_num, "differences": page_diffs})
86
 
87
+ # Mark OCR-detected differences as boxed highlights on both images
88
  for result in ocr_reader.readtext(img2_np):
89
  bbox, detected_text = result[0], result[1]
90
  if detected_text.strip().lower() in text2.lower() and detected_text.strip().lower() not in text1.lower():
91
  flattened_bbox = [coord for point in bbox for coord in point]
92
+ draw1.rectangle([flattened_bbox[0], flattened_bbox[1], flattened_bbox[2], flattened_bbox[3]], outline="blue", width=2)
93
+ draw2.rectangle([flattened_bbox[0], flattened_bbox[1], flattened_bbox[2], flattened_bbox[3]], outline="blue", width=2)
94
  diff_index += 1
95
 
96
+ marked_images_1[page_num] = marked_img1
97
+ marked_images_2[page_num] = marked_img2
98
 
99
+ return ocr_differences, marked_images_1, marked_images_2
100
 
101
+ def create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differences):
102
  pdf_buffer = BytesIO()
103
+ c = canvas.Canvas(pdf_buffer, pagesize=(letter[0] * 2, letter[1])) # Adjusted for side-by-side layout
104
 
105
+ # Loop through each page to add side-by-side images and observations
106
+ for page_num, img1 in marked_images_1.items():
107
+ img2 = marked_images_2.get(page_num)
 
 
108
 
109
+ if img2:
110
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file1:
111
+ img1.save(temp_img_file1, format="PNG")
112
+ temp_img_path1 = temp_img_file1.name
113
+
114
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file2:
115
+ img2.save(temp_img_file2, format="PNG")
116
+ temp_img_path2 = temp_img_file2.name
117
+
118
+ # Draw the saved images side-by-side on the PDF
119
+ c.drawImage(temp_img_path1, 0, 0, width=letter[0], height=letter[1])
120
+ c.drawImage(temp_img_path2, letter[0], 0, width=letter[0], height=letter[1])
121
+ c.showPage()
122
+
123
+ try:
124
+ os.remove(temp_img_path1)
125
+ os.remove(temp_img_path2)
126
+ except OSError:
127
+ pass
128
+
129
+ # Generate the observation table for each page
130
+ c.setFont("Helvetica", 10)
131
+ y_position = 750
132
+ c.drawString(10, y_position, f"Observation Summary for Page {page_num}:")
133
+ y_position -= 20
134
+
135
+ # Table data for each page
136
+ data = {"Additions": [], "Deletions": [], "Modifications": []}
137
+ for ocr_diff in ocr_differences:
138
+ if ocr_diff["page"] == page_num:
139
+ for diff in ocr_diff["differences"]:
140
+ if "Added" in diff:
141
+ data["Additions"].append(diff)
142
+ elif "Deleted" in diff:
143
+ data["Deletions"].append(diff)
144
+ elif "Modified" in diff:
145
+ data["Modifications"].append(diff)
146
+
147
+ # Convert data to DataFrame for formatting
148
+ df = pd.DataFrame.from_dict(data, orient="index").transpose()
149
+ column_widths = [150, 150, 150]
150
+
151
+ # Render the DataFrame as a table in the PDF
152
+ for row in df.itertuples(index=False):
153
+ for col_index, value in enumerate(row):
154
+ c.drawString(10 + col_index * column_widths[col_index], y_position, str(value))
155
+ y_position -= 15
156
+ if y_position < 50: # Start a new page if space is running out
157
+ c.showPage()
158
+ y_position = 750
159
+
160
+ c.showPage()
161
 
162
  c.save()
163
  pdf_buffer.seek(0)
 
195
  st.write(f"{key.replace('_', ' ').capitalize()}: {value}")
196
 
197
  # Provide download link for generated PDF with marked differences
198
+ st.subheader("Download PDF with Side-by-Side Comparisons and Observations")
199
+ st.download_button("Download Marked PDF", data=pdf_buffer, file_name="side_by_side_comparison.pdf", mime="application/pdf")
200
 
201
  if __name__ == "__main__":
202
  main()