SathvikGanta commited on
Commit
b2bb51d
·
verified ·
1 Parent(s): 31e9e89

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -159
app.py CHANGED
@@ -3,114 +3,94 @@ import fitz # PyMuPDF
3
  import cv2
4
  from pdf2image import convert_from_path
5
  import pytesseract
6
- from pytesseract import Output
7
  import numpy as np
8
  import os
9
  from fpdf import FPDF
10
- import difflib # For text comparison
11
 
12
- # Convert PDFs to images
 
13
  def convert_pdf_to_images(pdf_path, dpi=300):
14
  images = convert_from_path(pdf_path, dpi=dpi, poppler_path="/usr/bin")
15
  return [cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) for image in images]
16
 
17
- # Align images
18
- def align_images(img1, img2):
19
- gray1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
20
- gray2 = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)
21
- orb = cv2.ORB_create()
22
- kp1, des1 = orb.detectAndCompute(gray1, None)
23
- kp2, des2 = orb.detectAndCompute(gray2, None)
24
- bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
25
- matches = bf.match(des1, des2)
26
- matches = sorted(matches, key=lambda x: x.distance)
27
- src_pts = np.float32([kp1[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
28
- dst_pts = np.float32([kp2[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)
29
- matrix, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
30
-
31
- # Validate if alignment is good enough
32
- if matrix is None or len(matches) < 10: # Check if sufficient matches exist
33
- raise ValueError("Alignment failed. Insufficient matches between images.")
34
-
35
- aligned_img = cv2.warpPerspective(img2, matrix, (img1.shape[1], img1.shape[0]))
36
- return aligned_img
37
-
38
- # Compare visual changes
39
- def compare_visual_changes(orig_img, edit_img, start_position):
40
- diff = cv2.absdiff(orig_img, edit_img)
41
- gray_diff = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)
42
-
43
- # Apply Gaussian blur to reduce noise
44
- blurred_diff = cv2.GaussianBlur(gray_diff, (5, 5), 0)
45
-
46
- # Apply thresholding
47
- _, thresh = cv2.threshold(blurred_diff, 40, 255, cv2.THRESH_BINARY)
48
-
49
- # Morphological operations to clean noise
50
- kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
51
- cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
52
-
53
- contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
54
- overlay = edit_img.copy()
55
- visual_changes = []
56
- position_counter = start_position
57
- font = cv2.FONT_HERSHEY_SIMPLEX
58
- font_scale = 0.8
59
- thickness = 2
60
-
61
- for cnt in contours:
62
- if cv2.contourArea(cnt) > 100: # Filter out small regions
63
- x, y, w, h = cv2.boundingRect(cnt)
64
- cv2.rectangle(overlay, (x, y), (x + w, y + h), (0, 0, 255), 2) # Red bounding box
65
- cv2.putText(overlay, str(position_counter), (x, y - 10), font, font_scale, (0, 255, 0), thickness)
66
- visual_changes.append((position_counter, f'Visual change detected at position {position_counter}'))
67
- position_counter += 1
68
-
69
- return overlay, visual_changes, position_counter
70
-
71
- # Compare text changes with bounding boxes
72
- def compare_text_changes_with_boxes(orig_img, edit_img, start_position):
73
- orig_data = pytesseract.image_to_data(orig_img, output_type=Output.DICT)
74
- edit_data = pytesseract.image_to_data(edit_img, output_type=Output.DICT)
75
- orig_text = "\n".join(orig_data['text']).splitlines()
76
- edit_text = "\n".join(edit_data['text']).splitlines()
77
-
78
- diff = difflib.ndiff(orig_text, edit_text)
79
- overlay = edit_img.copy()
80
- text_changes = []
81
- position_counter = start_position
82
- font = cv2.FONT_HERSHEY_SIMPLEX
83
- font_scale = 0.8
84
- thickness = 2
85
-
86
- for line in diff:
87
- if line.startswith("+ "): # Added text
88
- text = line[2:]
89
- if text in edit_data['text']:
90
- index = edit_data['text'].index(text)
91
- x, y, w, h = edit_data['left'][index], edit_data['top'][index], edit_data['width'][index], edit_data['height'][index]
92
- cv2.rectangle(overlay, (x, y), (x + w, y + h), (0, 0, 255), 2)
93
- cv2.putText(overlay, str(position_counter), (x, y - 10), font, font_scale, (0, 255, 0), thickness)
94
- text_changes.append((position_counter, f'"{text}" added at position {position_counter}'))
95
- position_counter += 1
96
- elif line.startswith("- "): # Removed text
97
- text = line[2:]
98
- if text in orig_data['text']:
99
- index = orig_data['text'].index(text)
100
- x, y, w, h = orig_data['left'][index], orig_data['top'][index], orig_data['width'][index], orig_data['height'][index]
101
- cv2.rectangle(overlay, (x, y), (x + w, y + h), (0, 0, 255), 2)
102
- cv2.putText(overlay, str(position_counter), (x, y - 10), font, font_scale, (0, 255, 0), thickness)
103
- text_changes.append((position_counter, f'"{text}" removed at position {position_counter}'))
104
- position_counter += 1
105
-
106
- return overlay, text_changes, position_counter
107
-
108
- # Sanitize text for PDF compatibility
109
- def sanitize_text(text):
110
- return text.encode('latin-1', errors='replace').decode('latin-1')
111
 
112
  # Generate PDF report
113
- def generate_report(images, changes, title, output_path):
114
  pdf = FPDF()
115
  for img in images:
116
  temp_path = "temp_image.png"
@@ -118,76 +98,26 @@ def generate_report(images, changes, title, output_path):
118
  pdf.add_page()
119
  pdf.image(temp_path, x=10, y=10, w=190)
120
  os.remove(temp_path)
121
-
122
  pdf.add_page()
123
  pdf.set_font("Arial", size=12)
124
- pdf.cell(0, 10, sanitize_text(title), ln=True, align="C")
125
- pdf.ln(10)
126
- for _, change in changes:
127
- pdf.cell(0, 10, sanitize_text(change), ln=True)
128
-
129
  pdf.output(output_path)
130
- return output_path
131
-
132
- # Perform visual and text comparisons separately
133
- def generate_separate_comparisons(original_pdf, edited_pdf):
134
- original_images = convert_pdf_to_images(original_pdf)
135
- edited_images = convert_pdf_to_images(edited_pdf)
136
-
137
- # Visual comparison
138
- visual_combined_images = []
139
- visual_changes = []
140
- position_counter = 1
141
- for orig_img, edit_img in zip(original_images, edited_images):
142
- aligned_img = align_images(orig_img, edit_img)
143
- highlighted_img, page_visual_changes, position_counter = compare_visual_changes(
144
- orig_img, aligned_img, position_counter
145
- )
146
- visual_changes.extend(page_visual_changes)
147
- visual_combined_images.append(np.hstack((orig_img, highlighted_img)))
148
-
149
- # Generate visual changes report
150
- visual_report_path = generate_report(
151
- visual_combined_images, visual_changes, "Visual Changes", "outputs/visual_changes.pdf"
152
- )
153
-
154
- # Text comparison
155
- text_combined_images = []
156
- text_changes = []
157
- position_counter = 1
158
- for orig_img, edit_img in zip(original_images, edited_images):
159
- aligned_img = align_images(orig_img, edit_img)
160
- highlighted_img, page_text_changes, position_counter = compare_text_changes_with_boxes(
161
- orig_img, aligned_img, position_counter
162
- )
163
- text_changes.extend(page_text_changes)
164
- text_combined_images.append(np.hstack((orig_img, highlighted_img)))
165
 
166
- # Generate text changes report
167
- text_report_path = generate_report(
168
- text_combined_images, text_changes, "Text Changes", "outputs/text_changes.pdf"
169
- )
170
 
171
- return visual_report_path, text_report_path
172
-
173
- # Gradio interface function
174
  def pdf_comparison(original_pdf, edited_pdf):
175
- visual_path, text_path = generate_separate_comparisons(original_pdf.name, edited_pdf.name)
176
- return visual_path, text_path
177
 
178
- # Gradio interface
 
179
  interface = gr.Interface(
180
  fn=pdf_comparison,
181
- inputs=[
182
- gr.File(label="Upload Original PDF", file_types=[".pdf"]),
183
- gr.File(label="Upload Edited PDF", file_types=[".pdf"])
184
- ],
185
- outputs=[
186
- gr.File(label="Download Visual Changes Report"),
187
- gr.File(label="Download Text Changes Report")
188
- ],
189
- title="PDF Comparison Tool with Separate Comparisons",
190
- description="Upload two PDFs: the original and the edited version. The tool generates separate reports for visual and text changes."
191
  )
192
 
193
  if __name__ == "__main__":
 
3
  import cv2
4
  from pdf2image import convert_from_path
5
  import pytesseract
 
6
  import numpy as np
7
  import os
8
  from fpdf import FPDF
 
9
 
10
+
11
# Helper: Convert PDFs to images
def convert_pdf_to_images(pdf_path, dpi=300, poppler_path="/usr/bin"):
    """Render every page of *pdf_path* to an OpenCV BGR image.

    Args:
        pdf_path: path to the PDF file to render.
        dpi: render resolution (default 300).
        poppler_path: location of the poppler binaries. Parameterized so the
            tool is no longer hard-wired to /usr/bin; the default preserves
            the original behavior.

    Returns:
        A list of numpy BGR arrays, one per page.
    """
    pages = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)
    # pdf2image yields PIL images in RGB order; OpenCV expects BGR.
    return [cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR) for page in pages]
15
 
16
+
17
# Helper: Extract text and bounding boxes
def extract_text_with_boxes(pdf_path):
    """Extract every text span from *pdf_path* with its bounding box.

    Returns:
        A list of dicts: {"text": span text, "bbox": (x0, y0, x1, y1) in the
        page's coordinate space, "page": 1-based page number}.
    """
    text_data = []
    # Context manager closes the document deterministically; the original
    # leaked the fitz handle.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                # Image blocks carry no "lines" key; the original raised
                # KeyError on any PDF containing an image.
                for line in block.get("lines", []):
                    for span in line["spans"]:
                        text_data.append({
                            "text": span["text"],
                            "bbox": span["bbox"],
                            "page": page_num,
                        })
    return text_data
31
+
32
+
33
# Helper: Highlight changes with bounding boxes
def highlight_changes(img, changes, scale=1.0):
    """Draw a numbered red box on a copy of *img* for each change record.

    Args:
        img: BGR image (numpy array) to annotate; not modified in place.
        changes: iterable of dicts with "bbox" (x0, y0, x1, y1) and "position".
        scale: multiplier applied to bbox coordinates before drawing.
            NOTE(review): the bboxes come from PyMuPDF while the images are
            rendered at 300 dpi — callers likely need scale=dpi/72 for the
            boxes to land on the right pixels; default 1.0 preserves the
            original behavior. TODO confirm against the caller.

    Returns:
        The annotated copy of *img*.
    """
    overlay = img.copy()
    for change in changes:
        x0, y0, x1, y1 = (int(c * scale) for c in change["bbox"])
        cv2.rectangle(overlay, (x0, y0), (x1, y1), (0, 0, 255), 2)  # red = change
        cv2.putText(
            overlay,
            str(change["position"]),  # green change number above the box
            (x0, y0 - 10),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 255, 0),
            1,
            cv2.LINE_AA,
        )
    return overlay
50
+
51
+
52
# Text comparison logic
def compare_texts(original_text, edited_text):
    """Compare two span lists positionally and report differing spans.

    Args:
        original_text: list of dicts with "text" and "bbox" keys.
        edited_text: list of dicts with "text" and "bbox" keys.

    Returns:
        A list of change records: {"text": human-readable description,
        "bbox": box to highlight, "position": 1-based change number}.

    The original used zip(), which silently dropped any trailing spans when
    the two lists differ in length — additions/removals at the end of the
    document were never reported. zip_longest fixes that.
    NOTE(review): comparison is strictly positional, so a single inserted
    span makes every later pair mismatch — a sequence diff would be better.
    """
    from itertools import zip_longest  # local: only used here

    changes = []
    position = 1
    for o, e in zip_longest(original_text, edited_text):
        o_text = o["text"] if o is not None else ""
        e_text = e["text"] if e is not None else ""
        if o_text != e_text:
            changes.append({
                # Empty original text at this slot means the edited text is new.
                "text": f'"{e_text}" added' if not o_text else f'"{o_text}" removed',
                # Highlight on the edited span when it exists; fall back to the
                # original span's box for removals past the edited tail.
                "bbox": e["bbox"] if e is not None else o["bbox"],
                "position": position,
            })
            position += 1
    return changes
65
+
66
+
67
# Generate reports for text and visual changes
def generate_reports(original_pdf, edited_pdf):
    """Build the text-changes PDF report for two PDFs.

    Args:
        original_pdf: path to the original PDF.
        edited_pdf: path to the edited PDF.

    Returns:
        Path to the generated report, "outputs/text_changes.pdf".
    """
    # NOTE(review): the original also rendered the *original* PDF to images
    # and never used the result; that redundant work is dropped here.
    edited_images = convert_pdf_to_images(edited_pdf)

    # Extract positioned text spans from both documents.
    original_text = extract_text_with_boxes(original_pdf)
    edited_text = extract_text_with_boxes(edited_pdf)

    # Pairwise text comparison.
    text_changes = compare_texts(original_text, edited_text)

    # Annotate every edited page with the detected changes.
    text_highlighted_images = [
        highlight_changes(edited, text_changes) for edited in edited_images
    ]

    # The original crashed when ./outputs did not exist yet.
    os.makedirs("outputs", exist_ok=True)
    text_pdf_path = "outputs/text_changes.pdf"
    generate_pdf_report(text_highlighted_images, text_changes, text_pdf_path, "Text Changes")

    return text_pdf_path
90
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
# Generate PDF report
def generate_pdf_report(images, changes, output_path, report_type):
    """Write *images* plus a textual change summary to *output_path*.

    Args:
        images: BGR numpy arrays, one report page each.
        changes: change records with "position" and "text" keys.
        output_path: destination PDF path.
        report_type: heading used on the summary page.
    """
    pdf = FPDF()
    for img in images:
        temp_path = "temp_image.png"
        # FPDF.image embeds from a file on disk, so the page image must be
        # written out first (restores the cv2.imwrite step elided in the
        # visible diff — without it pdf.image has nothing to read).
        cv2.imwrite(temp_path, img)
        pdf.add_page()
        pdf.image(temp_path, x=10, y=10, w=190)
        os.remove(temp_path)

    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.cell(0, 10, f"{report_type} Summary", ln=True, align="C")
    for change in changes:
        # FPDF's core fonts are latin-1 only; replace unencodable characters
        # instead of crashing on non-latin text (the old version had a
        # sanitize_text helper for exactly this).
        line = f'Position {change["position"]}: {change["text"]}'
        safe_line = line.encode("latin-1", errors="replace").decode("latin-1")
        pdf.cell(0, 10, safe_line, ln=True)

    pdf.output(output_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
 
 
 
 
108
 
109
# Gradio interface callback
def pdf_comparison(original_pdf, edited_pdf):
    """Run the comparison and return the report path for download.

    Accepts either gradio file objects (which expose .name) or plain path
    strings — newer gradio versions pass filepaths directly, on which the
    original's unconditional .name access raised AttributeError.
    """
    original_path = getattr(original_pdf, "name", original_pdf)
    edited_path = getattr(edited_pdf, "name", edited_pdf)
    return generate_reports(original_path, edited_path)
113
 
114
# Interface
# NOTE(review): live=True re-runs pdf_comparison on every input change,
# which is expensive for full-PDF processing — confirm this is intended
# rather than a submit-button flow.
interface = gr.Interface(
    fn=pdf_comparison,
    inputs=[gr.File(label="Upload Original PDF"), gr.File(label="Upload Edited PDF")],
    outputs=[gr.File(label="Download Text Changes Report")],
    live=True
)
122
 
123
  if __name__ == "__main__":