SathvikGanta committed on
Commit
ee5f2b1
·
verified ·
1 Parent(s): 78d7db8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -55
app.py CHANGED
@@ -34,7 +34,7 @@ def align_images(img1, img2):
34
  aligned_img = cv2.warpPerspective(img2, matrix, (img1.shape[1], img1.shape[0]))
35
  return aligned_img
36
 
37
- # Compare images with noise reduction and filtering
38
  def compare_images(img1, img2):
39
  diff = cv2.absdiff(img1, img2)
40
  gray_diff = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)
@@ -51,22 +51,22 @@ def compare_images(img1, img2):
51
 
52
  return cleaned
53
 
54
- # Generate text-based differences
55
def generate_text_differences(orig_text, edit_text, start_position):
    """Describe the lines added to or removed from a block of text.

    Both texts are split into lines and compared with ``difflib.ndiff``.
    Every added or removed line is recorded with its own position number.

    Args:
        orig_text: The original text.
        edit_text: The edited text.
        start_position: Position number given to the first recorded change.

    Returns:
        A ``(changes, next_position)`` tuple. ``changes`` is a list of
        ``(position, description)`` tuples; ``next_position`` is the first
        position number that was not used.
    """
    changes = []
    position_number = start_position
    for line in difflib.ndiff(orig_text.splitlines(), edit_text.splitlines()):
        if line.startswith("+ "):  # Added text
            action = "added"
        elif line.startswith("- "):  # Removed text
            action = "removed"
        else:
            # Unchanged lines and "? " hint lines are not changes, so they
            # must not consume a position number.
            continue
        changes.append((position_number, f'"{line[2:]}" {action} at {position_number}'))
        # Advance once per recorded change so the numbering stays consecutive.
        position_number += 1
    return changes, position_number
66
 
67
  # Highlight visual changes
68
- def highlight_visual_changes(orig_img, edit_img, mask, start_position):
69
- overlay = edit_img.copy()
70
  contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
71
  visual_changes = []
72
  font = cv2.FONT_HERSHEY_SIMPLEX
@@ -84,40 +84,9 @@ def highlight_visual_changes(orig_img, edit_img, mask, start_position):
84
 
85
  return overlay, visual_changes, position_counter
86
 
87
- # Sanitize text for PDF compatibility
88
def sanitize_text(text):
    """Return ``text`` restricted to characters representable in Latin-1.

    The text is round-tripped through the ``latin-1`` codec with
    ``errors='replace'``, so any character that cannot be encoded comes
    back as ``?``. Used on strings before they are written with FPDF.
    """
    latin1_bytes = text.encode("latin-1", errors="replace")
    return latin1_bytes.decode("latin-1")
91
-
92
- # Generate separate PDFs for visual and text changes
93
- def generate_separate_pdfs(original_pdf, edited_pdf):
94
- original_images = convert_pdf_to_images(original_pdf)
95
- edited_images = convert_pdf_to_images(edited_pdf)
96
- combined_images = []
97
- visual_changes = [] # Visual changes summary
98
- text_changes = [] # Text-based changes summary
99
- position_counter = 1
100
-
101
- for orig_img, edit_img in zip(original_images, edited_images):
102
- aligned_img = align_images(orig_img, edit_img)
103
- diff_mask = compare_images(orig_img, aligned_img)
104
- highlighted_img, page_visual_changes, position_counter = highlight_visual_changes(
105
- orig_img, edit_img, diff_mask, position_counter
106
- )
107
- text_differences, position_counter = generate_text_differences(
108
- pytesseract.image_to_string(orig_img), pytesseract.image_to_string(edit_img), position_counter
109
- )
110
- visual_changes.extend(page_visual_changes)
111
- text_changes.extend(text_differences)
112
-
113
- # Ensure dimensions match
114
- height = min(orig_img.shape[0], highlighted_img.shape[0])
115
- orig_img_resized = orig_img[:height]
116
- highlighted_img_resized = highlighted_img[:height]
117
- combined_images.append(np.hstack((orig_img_resized, highlighted_img_resized)))
118
-
119
- # Generate Visual Changes PDF
120
- visual_pdf_path = "outputs/visual_changes.pdf"
121
  pdf_visual = FPDF()
122
  for img in combined_images:
123
  temp_path = "temp_image_visual.png"
@@ -127,14 +96,16 @@ def generate_separate_pdfs(original_pdf, edited_pdf):
127
  os.remove(temp_path)
128
  pdf_visual.add_page()
129
  pdf_visual.set_font("Arial", size=12)
130
- pdf_visual.cell(0, 10, sanitize_text("Visual Changes"), ln=True, align="C")
131
  pdf_visual.ln(10)
132
  for _, change in visual_changes:
133
  pdf_visual.cell(0, 10, sanitize_text(change), ln=True)
134
- pdf_visual.output(visual_pdf_path)
 
135
 
136
- # Generate Text Changes PDF
137
- text_pdf_path = "outputs/text_changes.pdf"
 
138
  pdf_text = FPDF()
139
  for img in combined_images:
140
  temp_path = "temp_image_text.png"
@@ -144,17 +115,53 @@ def generate_separate_pdfs(original_pdf, edited_pdf):
144
  os.remove(temp_path)
145
  pdf_text.add_page()
146
  pdf_text.set_font("Arial", size=12)
147
- pdf_text.cell(0, 10, sanitize_text("Text Changes"), ln=True, align="C")
148
  pdf_text.ln(10)
149
  for _, change in text_changes:
150
  pdf_text.cell(0, 10, sanitize_text(change), ln=True)
151
- pdf_text.output(text_pdf_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
- return visual_pdf_path, text_pdf_path
154
 
155
  # Gradio interface function
156
  def pdf_comparison(original_pdf, edited_pdf):
157
- visual_path, text_path = generate_separate_pdfs(original_pdf.name, edited_pdf.name)
158
  return visual_path, text_path
159
 
160
  # Gradio interface
@@ -168,8 +175,8 @@ interface = gr.Interface(
168
  gr.File(label="Download Visual Changes Report"),
169
  gr.File(label="Download Text Changes Report")
170
  ],
171
- title="PDF Comparison Tool with Separate Reports",
172
- description="Upload two PDFs: the original and the edited version. The tool generates two separate reports: one for visual changes and another for text changes."
173
  )
174
 
175
  if __name__ == "__main__":
 
34
  aligned_img = cv2.warpPerspective(img2, matrix, (img1.shape[1], img1.shape[0]))
35
  return aligned_img
36
 
37
+ # Compare images for visual changes
38
  def compare_images(img1, img2):
39
  diff = cv2.absdiff(img1, img2)
40
  gray_diff = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)
 
51
 
52
  return cleaned
53
 
54
+ # Compare text and generate differences
55
def compare_text(orig_text, edit_text, start_position):
    """Compare two texts line by line and describe each difference.

    Both texts are split into lines and compared with ``difflib.ndiff``.
    Every added or removed line is recorded with its own position number.

    Args:
        orig_text: The original text.
        edit_text: The edited text.
        start_position: Position number given to the first recorded change.

    Returns:
        A ``(text_changes, position_counter)`` tuple. ``text_changes`` is a
        list of ``(position, description)`` tuples; ``position_counter`` is
        the first position number that was not used.
    """
    text_changes = []
    position_counter = start_position
    for line in difflib.ndiff(orig_text.splitlines(), edit_text.splitlines()):
        if line.startswith("+ "):  # Added text
            action = "added"
        elif line.startswith("- "):  # Removed text
            action = "removed"
        else:
            # Unchanged lines and "? " hint lines are not changes, so they
            # must not consume a position number.
            continue
        text_changes.append(
            (position_counter, f'"{line[2:]}" {action} at {position_counter}')
        )
        # Advance once per recorded change so the numbering stays consecutive.
        position_counter += 1
    return text_changes, position_counter
66
 
67
  # Highlight visual changes
68
+ def highlight_visual_changes(img1, img2, mask, start_position):
69
+ overlay = img2.copy()
70
  contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
71
  visual_changes = []
72
  font = cv2.FONT_HERSHEY_SIMPLEX
 
84
 
85
  return overlay, visual_changes, position_counter
86
 
87
+ # Generate visual changes report
88
+ def generate_visual_report(original_images, edited_images, combined_images, visual_changes):
89
+ output_path = "outputs/visual_changes.pdf"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  pdf_visual = FPDF()
91
  for img in combined_images:
92
  temp_path = "temp_image_visual.png"
 
96
  os.remove(temp_path)
97
  pdf_visual.add_page()
98
  pdf_visual.set_font("Arial", size=12)
99
+ pdf_visual.cell(0, 10, "Visual Changes", ln=True, align="C")
100
  pdf_visual.ln(10)
101
  for _, change in visual_changes:
102
  pdf_visual.cell(0, 10, sanitize_text(change), ln=True)
103
+ pdf_visual.output(output_path)
104
+ return output_path
105
 
106
+ # Generate text changes report
107
+ def generate_text_report(original_images, edited_images, combined_images, text_changes):
108
+ output_path = "outputs/text_changes.pdf"
109
  pdf_text = FPDF()
110
  for img in combined_images:
111
  temp_path = "temp_image_text.png"
 
115
  os.remove(temp_path)
116
  pdf_text.add_page()
117
  pdf_text.set_font("Arial", size=12)
118
+ pdf_text.cell(0, 10, "Text Changes", ln=True, align="C")
119
  pdf_text.ln(10)
120
  for _, change in text_changes:
121
  pdf_text.cell(0, 10, sanitize_text(change), ln=True)
122
+ pdf_text.output(output_path)
123
+ return output_path
124
+
125
+ # Generate separate PDFs for visual and text changes
126
def generate_separate_comparisons(original_pdf, edited_pdf):
    """Compare two PDFs page by page and write a visual and a text report.

    Each pair of pages is aligned, diffed visually, and diffed textually
    via OCR. A single position counter runs across all pages and across
    both kinds of change.

    Args:
        original_pdf: Original PDF, as accepted by ``convert_pdf_to_images``.
        edited_pdf: Edited PDF, as accepted by ``convert_pdf_to_images``.

    Returns:
        A ``(visual_report, text_report)`` tuple holding the values returned
        by ``generate_visual_report`` and ``generate_text_report``.
    """
    original_images = convert_pdf_to_images(original_pdf)
    edited_images = convert_pdf_to_images(edited_pdf)
    combined_images = []
    visual_changes = []
    text_changes = []
    position_counter = 1

    # NOTE: zip() stops at the shorter document, so surplus pages in the
    # longer one are not compared.
    for orig_img, edit_img in zip(original_images, edited_images):
        aligned_img = align_images(orig_img, edit_img)

        # Visual comparison. The mask is computed against the aligned page,
        # so the highlights must be drawn on that same aligned page;
        # drawing them on the unaligned edit_img would misplace the boxes.
        diff_mask = compare_images(orig_img, aligned_img)
        highlighted_img, page_visual_changes, position_counter = highlight_visual_changes(
            orig_img, aligned_img, diff_mask, position_counter
        )
        visual_changes.extend(page_visual_changes)

        # Text comparison (OCR on the unwarped pages)
        orig_text = pytesseract.image_to_string(orig_img)
        edit_text = pytesseract.image_to_string(edit_img)
        page_text_changes, position_counter = compare_text(
            orig_text, edit_text, position_counter
        )
        text_changes.extend(page_text_changes)

        # Combine images for side-by-side display; crop both to the shorter
        # height so np.hstack receives matching row counts.
        height = min(orig_img.shape[0], highlighted_img.shape[0])
        orig_img_resized = orig_img[:height]
        highlighted_img_resized = highlighted_img[:height]
        combined_images.append(np.hstack((orig_img_resized, highlighted_img_resized)))

    # Generate separate reports
    visual_report = generate_visual_report(
        original_images, edited_images, combined_images, visual_changes
    )
    text_report = generate_text_report(
        original_images, edited_images, combined_images, text_changes
    )

    return visual_report, text_report
161
 
162
  # Gradio interface function
163
  def pdf_comparison(original_pdf, edited_pdf):
164
+ visual_path, text_path = generate_separate_comparisons(original_pdf.name, edited_pdf.name)
165
  return visual_path, text_path
166
 
167
  # Gradio interface
 
175
  gr.File(label="Download Visual Changes Report"),
176
  gr.File(label="Download Text Changes Report")
177
  ],
178
+ title="PDF Comparison Tool with Separate Comparisons",
179
+ description="Upload two PDFs: the original and the edited version. The tool generates separate reports for visual and text changes."
180
  )
181
 
182
  if __name__ == "__main__":