SuriRaja committed on
Commit
455fefb
·
verified ·
1 Parent(s): 2ade524

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -75
app.py CHANGED
@@ -2,27 +2,32 @@ import streamlit as st
2
  import fitz # PyMuPDF
3
  import difflib
4
  from PIL import Image, ImageChops, ImageDraw
 
5
  import io
6
  import re
7
 
 
 
 
8
  def load_and_compare_documents(file1, file2):
9
- # Read the files into memory to avoid re-reading them
10
  file1_content = file1.read()
11
  file2_content = file2.read()
12
 
13
- # Extract and compare text with font properties and placement
14
  text_differences, text_property_changes, special_char_changes, placement_changes = compare_text_with_properties(file1_content, file2_content)
15
 
16
- # Check for visual differences and box them on the second image
17
  boxed_images, visual_summary = check_and_box_image_differences(file1_content, file2_content)
18
 
19
- # Generate an overall comparison summary
20
- overall_summary = generate_overall_summary(text_differences, text_property_changes, special_char_changes, placement_changes, visual_summary)
 
 
 
21
 
22
- return text_differences, text_property_changes, special_char_changes, placement_changes, boxed_images, visual_summary, overall_summary
23
 
24
  def extract_text_with_properties(pdf_content):
25
- # Extract text with font, size, color, and placement properties from PDF
26
  text_elements = []
27
  pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
28
 
@@ -45,7 +50,6 @@ def extract_text_with_properties(pdf_content):
45
  return text_elements
46
 
47
  def compare_text_with_properties(content1, content2):
48
- # Extract text with font properties from both documents
49
  elements1 = extract_text_with_properties(content1)
50
  elements2 = extract_text_with_properties(content2)
51
 
@@ -55,18 +59,16 @@ def compare_text_with_properties(content1, content2):
55
  placement_changes = []
56
 
57
  for e1, e2 in zip(elements1, elements2):
58
- # Character-by-character comparison
59
  if e1["text"] != e2["text"]:
60
  diff = list(difflib.ndiff(e1["text"], e2["text"]))
61
  for i, change in enumerate(diff):
62
- if change.startswith("+ "): # Character only in CorelDRAW Output
63
  differences["additions"].append(f"Added '{change[2:]}' at position {i} on page {e1['page']}")
64
- elif change.startswith("- "): # Character only in Customer Document
65
  differences["deletions"].append(f"Deleted '{change[2:]}' at position {i} on page {e1['page']}")
66
- elif change.startswith("? "): # Minor character difference
67
  differences["modifications"].append(f"Modified '{change[2:]}' at position {i} on page {e1['page']}")
68
 
69
- # Special character detection
70
  special_chars = re.findall(r"[^\w\s]", e1["text"] + e2["text"])
71
  if special_chars:
72
  special_char_changes.append({
@@ -76,7 +78,6 @@ def compare_text_with_properties(content1, content2):
76
  "special_characters": special_chars
77
  })
78
 
79
- # Font, size, color, and placement changes
80
  if e1["font"] != e2["font"] or e1["size"] != e2["size"] or e1["color"] != e2["color"] or e1["bbox"] != e2["bbox"]:
81
  property_changes.append({
82
  "page": e1["page"],
@@ -95,7 +96,6 @@ def compare_text_with_properties(content1, content2):
95
  return differences, property_changes, special_char_changes, placement_changes
96
 
97
  def pdf_to_images(file_content):
98
- # Convert each page of the PDF to images using in-memory content
99
  images = []
100
  pdf_document = fitz.open(stream=file_content, filetype="pdf")
101
  for page_num in range(pdf_document.page_count):
@@ -107,7 +107,6 @@ def pdf_to_images(file_content):
107
  return images
108
 
109
  def check_and_box_image_differences(file1_content, file2_content):
110
- # Convert PDFs to images and check for visual differences, drawing boxes on the second image
111
  images1 = pdf_to_images(file1_content)
112
  images2 = pdf_to_images(file2_content)
113
 
@@ -115,13 +114,11 @@ def check_and_box_image_differences(file1_content, file2_content):
115
  visual_summary = {"size_changes": [], "color_changes": [], "text_changes": []}
116
 
117
  for (page_num, img1), (_, img2) in zip(images1, images2):
118
- # Check for size differences
119
  if img1.size != img2.size:
120
  visual_summary["size_changes"].append(f"Page {page_num}: Significant layout or size difference detected.")
121
- boxed_images.append((page_num, img2)) # Display unmodified if size is different
122
  continue
123
 
124
- # Detect differences and box them
125
  diff = ImageChops.difference(img1, img2).convert("L")
126
  diff_sum = sum(diff.getdata())
127
 
@@ -131,15 +128,37 @@ def check_and_box_image_differences(file1_content, file2_content):
131
  bbox = diff.getbbox()
132
 
133
  if bbox:
134
- draw.rectangle(bbox, outline="red", width=3) # Draw a red box around differences
135
 
136
  boxed_images.append((page_num, highlighted_img))
137
  visual_summary["text_changes"].append(f"Page {page_num}: Text/font differences detected.")
138
 
139
  return boxed_images, visual_summary
140
 
141
- def generate_overall_summary(text_differences, text_property_changes, special_char_changes, placement_changes, visual_summary):
142
- # Compile a high-level summary of changes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  overall_summary = {
144
  "total_additions": len(text_differences["additions"]),
145
  "total_deletions": len(text_differences["deletions"]),
@@ -151,14 +170,15 @@ def generate_overall_summary(text_differences, text_property_changes, special_ch
151
  "size_changes": len(visual_summary["size_changes"]),
152
  "color_changes": len(visual_summary["color_changes"]),
153
  "text_changes": len(visual_summary["text_changes"]),
154
- }
 
155
  }
156
  return overall_summary
157
 
158
  # Streamlit app interface
159
  def main():
160
- st.title("Comprehensive Document Comparison Tool")
161
- st.write("Upload Customer Document and CorelDRAW Output for a detailed comparison.")
162
 
163
  customer_file = st.file_uploader("Customer Document (PDF only)", type=["pdf"])
164
  output_file = st.file_uploader("CorelDRAW Output (PDF only)", type=["pdf"])
@@ -168,59 +188,21 @@ def main():
168
  st.error("One or both files are empty. Please upload valid PDF files.")
169
  return
170
 
171
- text_differences, text_property_changes, special_char_changes, placement_changes, boxed_images, visual_summary, overall_summary = load_and_compare_documents(customer_file, output_file)
172
 
173
- # Text Differences
174
- st.subheader("Detailed Text Differences (Character by Character)")
175
- for category, changes in text_differences.items():
176
- if changes:
177
- st.write(f"**{category.capitalize()}**:")
178
- for change in changes:
179
- st.write(f"- {change}")
180
 
181
- # Font, Size, Color, and Placement Changes
182
- st.subheader("Font, Size, Color, and Placement Changes")
183
- for change in text_property_changes:
184
- st.write(f"**Page {change['page']}:** Font {change['original_font']} -> {change['new_font']}, Size {change['original_size']} -> {change['new_size']}, Color {change['original_color']} -> {change['new_color']}")
185
- for change in placement_changes:
186
- st.write(f"- {change}")
187
-
188
- # Special Character Changes
189
- if special_char_changes:
190
- st.subheader("Special Character Changes")
191
- for change in special_char_changes:
192
- st.write(f"Page {change['page']} - Original Text: '{change['original_text']}' -> New Text: '{change['new_text']}' with Special Characters: {change['special_characters']}")
193
-
194
- # Display images with boxed differences
195
- st.subheader("Pages with Visual Differences (Boxed)")
196
- if boxed_images:
197
- for page_num, img in boxed_images:
198
- st.write(f"**Page {page_num} with Differences Boxed**")
199
- img_io = io.BytesIO()
200
- img.save(img_io, format="PNG")
201
- st.image(img_io, caption=f"Differences on Page {page_num}", use_column_width=True)
202
- else:
203
- st.write("No visual differences detected between the images.")
204
-
205
- # Display Visual Differences Summary
206
- st.subheader("Visual Differences Summary")
207
- for key, changes in visual_summary.items():
208
- if changes:
209
- st.write(f"**{key.capitalize().replace('_', ' ')}:**")
210
- for item in changes:
211
- st.write(f"- {item}")
212
-
213
- # Overall Summary
214
  st.subheader("Overall Comparison Summary")
215
- st.write(f"**Total Additions:** {overall_summary['total_additions']}")
216
- st.write(f"**Total Deletions:** {overall_summary['total_deletions']}")
217
- st.write(f"**Total Modifications:** {overall_summary['total_modifications']}")
218
- st.write(f"**Font, Size, and Color Changes:** {overall_summary['font_size_color_changes']}")
219
- st.write(f"**Special Character Changes:** {overall_summary['special_character_changes']}")
220
- st.write(f"**Placement Changes:** {overall_summary['placement_changes']}")
221
- st.write(f"**Pages with Size Differences:** {overall_summary['visual_differences']['size_changes']}")
222
- st.write(f"**Pages with Color Differences:** {overall_summary['visual_differences']['color_changes']}")
223
- st.write(f"**Pages with Text Differences:** {overall_summary['visual_differences']['text_changes']}")
224
 
225
  if __name__ == "__main__":
226
  main()
 
2
  import fitz # PyMuPDF
3
  import difflib
4
  from PIL import Image, ImageChops, ImageDraw
5
+ import pytesseract
6
  import io
7
  import re
8
 
9
+ # Set up Tesseract path if needed (adjust as per system requirements)
10
+ # pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'
11
+
12
  def load_and_compare_documents(file1, file2):
 
13
  file1_content = file1.read()
14
  file2_content = file2.read()
15
 
16
+ # Compare text with font properties, placement, and special characters
17
  text_differences, text_property_changes, special_char_changes, placement_changes = compare_text_with_properties(file1_content, file2_content)
18
 
19
+ # Visual differences with boxed highlights
20
  boxed_images, visual_summary = check_and_box_image_differences(file1_content, file2_content)
21
 
22
+ # OCR on images for text comparison
23
+ ocr_differences = perform_ocr_and_compare(file1_content, file2_content)
24
+
25
+ # Generate overall summary
26
+ overall_summary = generate_overall_summary(text_differences, text_property_changes, special_char_changes, placement_changes, visual_summary, ocr_differences)
27
 
28
+ return text_differences, text_property_changes, special_char_changes, placement_changes, boxed_images, visual_summary, ocr_differences, overall_summary
29
 
30
  def extract_text_with_properties(pdf_content):
 
31
  text_elements = []
32
  pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
33
 
 
50
  return text_elements
51
 
52
  def compare_text_with_properties(content1, content2):
 
53
  elements1 = extract_text_with_properties(content1)
54
  elements2 = extract_text_with_properties(content2)
55
 
 
59
  placement_changes = []
60
 
61
  for e1, e2 in zip(elements1, elements2):
 
62
  if e1["text"] != e2["text"]:
63
  diff = list(difflib.ndiff(e1["text"], e2["text"]))
64
  for i, change in enumerate(diff):
65
+ if change.startswith("+ "):
66
  differences["additions"].append(f"Added '{change[2:]}' at position {i} on page {e1['page']}")
67
+ elif change.startswith("- "):
68
  differences["deletions"].append(f"Deleted '{change[2:]}' at position {i} on page {e1['page']}")
69
+ elif change.startswith("? "):
70
  differences["modifications"].append(f"Modified '{change[2:]}' at position {i} on page {e1['page']}")
71
 
 
72
  special_chars = re.findall(r"[^\w\s]", e1["text"] + e2["text"])
73
  if special_chars:
74
  special_char_changes.append({
 
78
  "special_characters": special_chars
79
  })
80
 
 
81
  if e1["font"] != e2["font"] or e1["size"] != e2["size"] or e1["color"] != e2["color"] or e1["bbox"] != e2["bbox"]:
82
  property_changes.append({
83
  "page": e1["page"],
 
96
  return differences, property_changes, special_char_changes, placement_changes
97
 
98
  def pdf_to_images(file_content):
 
99
  images = []
100
  pdf_document = fitz.open(stream=file_content, filetype="pdf")
101
  for page_num in range(pdf_document.page_count):
 
107
  return images
108
 
109
  def check_and_box_image_differences(file1_content, file2_content):
 
110
  images1 = pdf_to_images(file1_content)
111
  images2 = pdf_to_images(file2_content)
112
 
 
114
  visual_summary = {"size_changes": [], "color_changes": [], "text_changes": []}
115
 
116
  for (page_num, img1), (_, img2) in zip(images1, images2):
 
117
  if img1.size != img2.size:
118
  visual_summary["size_changes"].append(f"Page {page_num}: Significant layout or size difference detected.")
119
+ boxed_images.append((page_num, img2))
120
  continue
121
 
 
122
  diff = ImageChops.difference(img1, img2).convert("L")
123
  diff_sum = sum(diff.getdata())
124
 
 
128
  bbox = diff.getbbox()
129
 
130
  if bbox:
131
+ draw.rectangle(bbox, outline="red", width=3)
132
 
133
  boxed_images.append((page_num, highlighted_img))
134
  visual_summary["text_changes"].append(f"Page {page_num}: Text/font differences detected.")
135
 
136
  return boxed_images, visual_summary
137
 
138
def perform_ocr_and_compare(content1, content2):
    """OCR the rendered pages of two PDFs and report character-level differences.

    Pages are paired in order. Unlike a plain ``zip``, trailing pages that
    exist in only one document are not dropped: they are compared against an
    empty string and flagged explicitly, so a page-count mismatch is always
    reported.

    Args:
        content1: Raw bytes of the first PDF, passed to ``pdf_to_images``.
        content2: Raw bytes of the second PDF, passed to ``pdf_to_images``.

    Returns:
        A list with one ``{"page": page_num, "differences": [str, ...]}`` dict
        for every page that has at least one reported difference.
    """
    # Function-scope import keeps this block self-contained (stdlib only).
    from itertools import zip_longest

    # The two-character ndiff prefixes that are reported, and their wording.
    labels = {"+ ": "Added", "- ": "Deleted", "? ": "Modified"}

    # Assumes pdf_to_images returns a sequence of (page_num, image) pairs, as
    # the tuple-unpacking in the callers of that helper implies.
    pages1 = pdf_to_images(content1)
    pages2 = pdf_to_images(content2)

    ocr_differences = []
    for entry1, entry2 in zip_longest(pages1, pages2):
        page_num = (entry1 if entry1 is not None else entry2)[0]
        page_diffs = []

        # A missing counterpart is a difference in itself, even when the
        # surviving page happens to OCR to an empty string.
        if entry1 is None:
            page_diffs.append(f"Page {page_num} exists only in the second document")
        elif entry2 is None:
            page_diffs.append(f"Page {page_num} exists only in the first document")

        text1 = pytesseract.image_to_string(entry1[1]) if entry1 is not None else ""
        text2 = pytesseract.image_to_string(entry2[1]) if entry2 is not None else ""

        if text1 != text2:
            # NOTE: ``i`` is the index of the entry within the ndiff output,
            # not a character offset into either OCR text.
            for i, change in enumerate(difflib.ndiff(text1, text2)):
                label = labels.get(change[:2])
                if label is not None:
                    page_diffs.append(
                        f"{label} '{change[2:]}' at position {i} on page {page_num}"
                    )

        if page_diffs:
            ocr_differences.append({"page": page_num, "differences": page_diffs})

    return ocr_differences
160
+
161
+ def generate_overall_summary(text_differences, text_property_changes, special_char_changes, placement_changes, visual_summary, ocr_differences):
162
  overall_summary = {
163
  "total_additions": len(text_differences["additions"]),
164
  "total_deletions": len(text_differences["deletions"]),
 
170
  "size_changes": len(visual_summary["size_changes"]),
171
  "color_changes": len(visual_summary["color_changes"]),
172
  "text_changes": len(visual_summary["text_changes"]),
173
+ },
174
+ "ocr_differences": len(ocr_differences)
175
  }
176
  return overall_summary
177
 
178
  # Streamlit app interface
179
  def main():
180
+ st.title("Comprehensive Document Comparison Tool with OCR Text Extraction")
181
+ st.write("Upload Customer Document and CorelDRAW Output for detailed comparison.")
182
 
183
  customer_file = st.file_uploader("Customer Document (PDF only)", type=["pdf"])
184
  output_file = st.file_uploader("CorelDRAW Output (PDF only)", type=["pdf"])
 
188
  st.error("One or both files are empty. Please upload valid PDF files.")
189
  return
190
 
191
+ text_differences, text_property_changes, special_char_changes, placement_changes, boxed_images, visual_summary, ocr_differences, overall_summary = load_and_compare_documents(customer_file, output_file)
192
 
193
+ # Display various outputs (as outlined previously)...
 
 
 
 
 
 
194
 
195
+ # OCR Differences
196
+ st.subheader("OCR-Based Text Differences (Embedded in Images)")
197
+ for ocr_diff in ocr_differences:
198
+ st.write(f"**Page {ocr_diff['page']} OCR Differences:**")
199
+ for diff in ocr_diff["differences"]:
200
+ st.write(f"- {diff}")
201
+
202
+ # Overall Comparison Summary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  st.subheader("Overall Comparison Summary")
204
+ for key, value in overall_summary.items():
205
+ st.write(f"{key.replace('_', ' ').capitalize()}: {value}")
 
 
 
 
 
 
 
206
 
207
  if __name__ == "__main__":
208
  main()