SathvikGanta commited on
Commit
8ae85b7
·
verified ·
1 Parent(s): 70c61ed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +158 -95
app.py CHANGED
@@ -7,95 +7,110 @@ from pytesseract import Output
7
  import numpy as np
8
  import os
9
  from fpdf import FPDF
 
10
 
11
-
12
- # Helper: Convert PDFs to images
13
  def convert_pdf_to_images(pdf_path, dpi=300):
14
  images = convert_from_path(pdf_path, dpi=dpi, poppler_path="/usr/bin")
15
  return [cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) for image in images]
16
 
17
-
18
- # Helper: Extract text and bounding boxes
19
- def extract_text_with_boxes(pdf_path):
20
- doc = fitz.open(pdf_path)
21
- text_data = []
22
- for page_num, page in enumerate(doc):
23
- for block in page.get_text("dict")["blocks"]:
24
- # Skip blocks without "lines"
25
- if "lines" not in block:
26
- continue
27
- for line in block["lines"]:
28
- for span in line["spans"]:
29
- if span["text"].strip(): # Skip empty text spans
30
- text_data.append({
31
- "text": span["text"],
32
- "bbox": span["bbox"],
33
- "page": page_num + 1
34
- })
35
- return text_data
36
-
37
-
38
- # Helper: Highlight changes with bounding boxes
39
- def highlight_changes(img, changes):
40
- overlay = img.copy()
41
- for change in changes:
42
- x0, y0, x1, y1 = map(int, change["bbox"])
43
- cv2.rectangle(overlay, (x0, y0), (x1, y1), (0, 0, 255), 2) # Red for changes
44
- cv2.putText(
45
- overlay,
46
- str(change["position"]),
47
- (x0, y0 - 10),
48
- cv2.FONT_HERSHEY_SIMPLEX,
49
- 0.5,
50
- (0, 255, 0),
51
- 1,
52
- cv2.LINE_AA,
53
- )
54
- return overlay
55
-
56
-
57
- # Text comparison logic
58
- def compare_texts(original_text, edited_text):
59
- changes = []
60
- position = 1
61
- for o, e in zip(original_text, edited_text):
62
- if o["text"] != e["text"]:
63
- changes.append({
64
- "text": f'"{e["text"]}" added' if not o["text"] else f'"{o["text"]}" removed',
65
- "bbox": e["bbox"],
66
- "position": position
67
- })
68
- position += 1
69
- return changes
70
-
71
-
72
- # Generate reports for text and visual changes
73
- def generate_reports(original_pdf, edited_pdf):
74
- # Process original and edited PDFs
75
- original_images = convert_pdf_to_images(original_pdf)
76
- edited_images = convert_pdf_to_images(edited_pdf)
77
-
78
- # Extract text
79
- original_text = extract_text_with_boxes(original_pdf)
80
- edited_text = extract_text_with_boxes(edited_pdf)
81
-
82
- # Compare text and visual changes
83
- text_changes = compare_texts(original_text, edited_text)
84
-
85
- # Highlight changes in images
86
- text_highlighted_images = [
87
- highlight_changes(edited, text_changes) for edited in edited_images
88
- ]
89
-
90
- # Generate separate PDF reports
91
- text_pdf_path = "outputs/text_changes.pdf"
92
- generate_pdf_report(text_highlighted_images, text_changes, text_pdf_path, "Text Changes")
93
-
94
- return text_pdf_path
95
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  # Generate PDF report
98
- def generate_pdf_report(images, changes, output_path, report_type):
99
  pdf = FPDF()
100
  for img in images:
101
  temp_path = "temp_image.png"
@@ -103,28 +118,76 @@ def generate_pdf_report(images, changes, output_path, report_type):
103
  pdf.add_page()
104
  pdf.image(temp_path, x=10, y=10, w=190)
105
  os.remove(temp_path)
 
106
  pdf.add_page()
107
  pdf.set_font("Arial", size=12)
108
- pdf.cell(0, 10, f"{report_type} Summary", ln=True, align="C")
109
- for change in changes:
110
- pdf.cell(0, 10, f'Position {change["position"]}: {change["text"]}', ln=True)
 
 
111
  pdf.output(output_path)
 
 
 
 
 
 
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
- # Gradio interface
115
- def pdf_comparison(original_pdf, edited_pdf):
116
- if original_pdf is None or edited_pdf is None:
117
- return "Error: Please upload both PDFs."
118
- text_report = generate_reports(original_pdf.name, edited_pdf.name)
119
- return text_report
120
 
 
 
 
 
121
 
122
- # Interface
123
  interface = gr.Interface(
124
  fn=pdf_comparison,
125
- inputs=[gr.File(label="Upload Original PDF"), gr.File(label="Upload Edited PDF")],
126
- outputs=[gr.File(label="Download Text Changes Report")],
127
- live=True
 
 
 
 
 
 
 
128
  )
129
 
130
  if __name__ == "__main__":
 
7
  import numpy as np
8
  import os
9
  from fpdf import FPDF
10
+ import difflib # For text comparison
11
 
12
# Convert PDFs to images
def convert_pdf_to_images(pdf_path, dpi=300, poppler_path="/usr/bin"):
    """Rasterize every page of *pdf_path* into a BGR OpenCV image.

    Args:
        pdf_path: Path to the PDF file to render.
        dpi: Render resolution passed to pdf2image (default 300).
        poppler_path: Location of the poppler binaries. Defaults to the
            deployment container's /usr/bin; override when running elsewhere.

    Returns:
        list of numpy arrays in BGR channel order (OpenCV convention).
    """
    pages = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)
    # pdf2image yields PIL images in RGB; convert to BGR for OpenCV calls.
    return [cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR) for page in pages]
16
 
17
# Align images
def align_images(img1, img2):
    """Warp *img2* into *img1*'s frame using an ORB/RANSAC homography.

    Args:
        img1: Reference BGR image (original page).
        img2: BGR image to align (edited page).

    Returns:
        img2 warped to img1's size via the estimated homography.

    Raises:
        ValueError: when too few features/matches exist for a reliable
            alignment (previously this crashed inside OpenCV instead).
    """
    gray1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
    gray2 = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)

    orb = cv2.ORB_create()
    kp1, des1 = orb.detectAndCompute(gray1, None)
    kp2, des2 = orb.detectAndCompute(gray2, None)

    # Fail fast: BFMatcher.match raises an opaque OpenCV error when a
    # descriptor set is None (blank page, no detectable features).
    if des1 is None or des2 is None:
        raise ValueError("Alignment failed. Insufficient matches between images.")

    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = sorted(bf.match(des1, des2), key=lambda m: m.distance)

    # Validate BEFORE calling findHomography: it needs at least 4 point
    # pairs and degrades badly with very few. The original checked only
    # after the call, so sparse inputs crashed before the check ran.
    if len(matches) < 10:
        raise ValueError("Alignment failed. Insufficient matches between images.")

    src_pts = np.float32([kp1[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
    dst_pts = np.float32([kp2[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)
    matrix, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
    if matrix is None:
        raise ValueError("Alignment failed. Insufficient matches between images.")

    return cv2.warpPerspective(img2, matrix, (img1.shape[1], img1.shape[0]))
37
+
38
# Compare visual changes
def compare_visual_changes(orig_img, edit_img, start_position):
    """Box and number the pixel regions where two aligned pages differ.

    Args:
        orig_img: BGR image of the original page.
        edit_img: BGR image of the edited page, already aligned to orig_img.
        start_position: First number to assign to a detected region.

    Returns:
        (annotated copy of edit_img, list of (position, message) tuples,
        next unused position number).
    """
    # Absolute difference -> grayscale -> Gaussian blur to suppress speckle.
    gray_diff = cv2.cvtColor(cv2.absdiff(orig_img, edit_img), cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray_diff, (5, 5), 0)

    # Binarize, then morphologically close so nearby changed pixels merge.
    _, mask = cv2.threshold(blurred, 40, 255, cv2.THRESH_BINARY)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    annotated = edit_img.copy()
    detected = []
    position_counter = start_position

    for contour in contours:
        # Tiny regions are almost always rendering/scan noise — skip them.
        if cv2.contourArea(contour) <= 100:
            continue
        x, y, w, h = cv2.boundingRect(contour)
        cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 0, 255), 2)
        cv2.putText(annotated, str(position_counter), (x, y - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
        detected.append((position_counter, f'Visual change detected at position {position_counter}'))
        position_counter += 1

    return annotated, detected, position_counter
70
+
71
# Compare text changes with bounding boxes
def compare_text_changes_with_boxes(orig_img, edit_img, start_position):
    """OCR both pages, diff the token streams, and box each changed token.

    Args:
        orig_img: BGR image of the original page.
        edit_img: BGR image of the edited page, already aligned to orig_img.
        start_position: First number to assign to a detected change.

    Returns:
        (annotated copy of edit_img, list of (position, message) tuples,
        next unused position number).
    """
    orig_data = pytesseract.image_to_data(orig_img, output_type=Output.DICT)
    edit_data = pytesseract.image_to_data(edit_img, output_type=Output.DICT)

    # Round-trip through join/splitlines so ndiff sees one token per line.
    orig_tokens = "\n".join(orig_data['text']).splitlines()
    edit_tokens = "\n".join(edit_data['text']).splitlines()

    annotated = edit_img.copy()
    detected = []
    position_counter = start_position

    def _mark(data, token, message):
        # Box *token* using its OCR geometry and record *message*.
        # NOTE(review): .index() finds only the FIRST occurrence of a
        # repeated token, so duplicates share one box — confirm acceptable.
        nonlocal position_counter
        if token not in data['text']:
            return
        i = data['text'].index(token)
        x, y = data['left'][i], data['top'][i]
        w, h = data['width'][i], data['height'][i]
        cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 0, 255), 2)
        cv2.putText(annotated, str(position_counter), (x, y - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
        detected.append((position_counter, message))
        position_counter += 1

    for line in difflib.ndiff(orig_tokens, edit_tokens):
        if line.startswith("+ "):  # Added text
            token = line[2:]
            _mark(edit_data, token, f'"{token}" added at position {position_counter}')
        elif line.startswith("- "):  # Removed text
            token = line[2:]
            _mark(orig_data, token, f'"{token}" removed at position {position_counter}')

    return annotated, detected, position_counter
107
+
108
# Sanitize text for PDF compatibility
def sanitize_text(text):
    """Replace every character outside Latin-1 with '?' so FPDF can render it."""
    # Equivalent to encode('latin-1', errors='replace').decode('latin-1'):
    # code points 0-255 survive, everything else becomes '?'.
    return "".join(ch if ord(ch) < 256 else "?" for ch in text)
111
 
112
  # Generate PDF report
113
+ def generate_report(images, changes, title, output_path):
114
  pdf = FPDF()
115
  for img in images:
116
  temp_path = "temp_image.png"
 
118
  pdf.add_page()
119
  pdf.image(temp_path, x=10, y=10, w=190)
120
  os.remove(temp_path)
121
+
122
  pdf.add_page()
123
  pdf.set_font("Arial", size=12)
124
+ pdf.cell(0, 10, sanitize_text(title), ln=True, align="C")
125
+ pdf.ln(10)
126
+ for _, change in changes:
127
+ pdf.cell(0, 10, sanitize_text(change), ln=True)
128
+
129
  pdf.output(output_path)
130
+ return output_path
131
+
132
+ # Perform visual and text comparisons separately
133
+ def generate_separate_comparisons(original_pdf, edited_pdf):
134
+ original_images = convert_pdf_to_images(original_pdf)
135
+ edited_images = convert_pdf_to_images(edited_pdf)
136
 
137
+ # Visual comparison
138
+ visual_combined_images = []
139
+ visual_changes = []
140
+ position_counter = 1
141
+ for orig_img, edit_img in zip(original_images, edited_images):
142
+ aligned_img = align_images(orig_img, edit_img)
143
+ highlighted_img, page_visual_changes, position_counter = compare_visual_changes(
144
+ orig_img, aligned_img, position_counter
145
+ )
146
+ visual_changes.extend(page_visual_changes)
147
+ visual_combined_images.append(np.hstack((orig_img, highlighted_img)))
148
+
149
+ # Generate visual changes report
150
+ visual_report_path = generate_report(
151
+ visual_combined_images, visual_changes, "Visual Changes", "outputs/visual_changes.pdf"
152
+ )
153
+
154
+ # Text comparison
155
+ text_combined_images = []
156
+ text_changes = []
157
+ position_counter = 1
158
+ for orig_img, edit_img in zip(original_images, edited_images):
159
+ aligned_img = align_images(orig_img, edit_img)
160
+ highlighted_img, page_text_changes, position_counter = compare_text_changes_with_boxes(
161
+ orig_img, aligned_img, position_counter
162
+ )
163
+ text_changes.extend(page_text_changes)
164
+ text_combined_images.append(np.hstack((orig_img, highlighted_img)))
165
 
166
+ # Generate text changes report
167
+ text_report_path = generate_report(
168
+ text_combined_images, text_changes, "Text Changes", "outputs/text_changes.pdf"
169
+ )
170
+
171
+ return visual_report_path, text_report_path
172
 
173
# Gradio interface function
def pdf_comparison(original_pdf, edited_pdf):
    """Gradio handler: validate both uploads, then return the report paths.

    Args:
        original_pdf: Gradio File for the original PDF (has a .name path).
        edited_pdf: Gradio File for the edited PDF.

    Returns:
        (visual_report_path, text_report_path).

    Raises:
        ValueError: when either upload is missing. Without this guard a
            missing file crashed with AttributeError on `.name`.
    """
    if original_pdf is None or edited_pdf is None:
        raise ValueError("Please upload both the original and the edited PDF.")
    visual_path, text_path = generate_separate_comparisons(original_pdf.name, edited_pdf.name)
    return visual_path, text_path
177
 
178
# Gradio interface: two PDF uploads in, two downloadable reports out.
interface = gr.Interface(
    fn=pdf_comparison,
    inputs=[
        gr.File(label="Upload Original PDF", file_types=[".pdf"]),
        gr.File(label="Upload Edited PDF", file_types=[".pdf"]),
    ],
    outputs=[
        gr.File(label="Download Visual Changes Report"),
        gr.File(label="Download Text Changes Report"),
    ],
    title="PDF Comparison Tool with Separate Comparisons",
    description=(
        "Upload two PDFs: the original and the edited version. "
        "The tool generates separate reports for visual and text changes."
    ),
)
192
 
193
  if __name__ == "__main__":