SuriRaja committed on
Commit
778fed4
·
verified ·
1 Parent(s): 9aff2fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -20
app.py CHANGED
@@ -1,14 +1,18 @@
1
  import streamlit as st
2
  import fitz # PyMuPDF
 
3
  import difflib
 
4
 
5
  def load_and_compare_documents(file1, file2):
6
- # Extract text from both documents
7
  doc1_text, doc2_text = extract_text_from_pdf(file1), extract_text_from_pdf(file2)
 
8
 
9
- # Compare text line-by-line and categorize differences
10
- differences = compare_text(doc1_text, doc2_text)
11
- return differences
 
12
 
13
  def extract_text_from_pdf(file):
14
  # Extract text from PDF
@@ -31,37 +35,68 @@ def compare_text(doc1_text, doc2_text):
31
  differences["minor"].append(line[2:]) # Minor differences or edits
32
  return differences
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  # Streamlit app interface
35
  def main():
36
- st.title("Document Text Comparison Tool")
37
  st.write("Upload Customer Document and CorelDRAW Output for Comparison.")
38
 
39
  customer_file = st.file_uploader("Customer Document (PDF only)", type=["pdf"])
40
  output_file = st.file_uploader("CorelDRAW Output (PDF only)", type=["pdf"])
41
 
42
  if st.button("Compare Documents") and customer_file and output_file:
43
- differences = load_and_compare_documents(customer_file, output_file)
44
-
45
- # Display differences in the Streamlit UI
46
- st.subheader("Differences Summary")
47
 
48
- # New content in the second document
49
- if differences["new"]:
 
50
  st.write("**New Content (Lines present only in CorelDRAW Output):**")
51
- for line in differences["new"]:
52
  st.markdown(f"<span style='color:blue'>+ {line}</span>", unsafe_allow_html=True)
53
-
54
- # Missing content from the first document
55
- if differences["missing"]:
56
  st.write("**Missing Content (Lines present only in Customer Document):**")
57
- for line in differences["missing"]:
58
  st.markdown(f"<span style='color:red'>- {line}</span>", unsafe_allow_html=True)
59
-
60
- # Minor modifications between documents
61
- if differences["minor"]:
62
  st.write("**Minor Differences:**")
63
- for line in differences["minor"]:
64
  st.markdown(f"<span style='color:orange'>? {line}</span>", unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  if __name__ == "__main__":
67
  main()
 
1
  import streamlit as st
2
  import fitz # PyMuPDF
3
+ from PIL import Image, ImageChops
4
  import difflib
5
+ import io
6
 
def _rewind(stream):
    """Move *stream* back to its start when it supports seeking."""
    seek = getattr(stream, "seek", None)
    if callable(seek):
        seek(0)


def load_and_compare_documents(file1, file2):
    """Compare two PDF uploads by extracted text and by rendered page images.

    Args:
        file1: File-like object for the first PDF (the customer document).
        file2: File-like object for the second PDF (the CorelDRAW output).

    Returns:
        A ``(text_differences, image_differences)`` tuple: the result of
        ``compare_text`` on the extracted text, and the result of
        ``compare_images`` on the two files.
    """
    # Start each pass from the beginning of the stream in case the caller
    # (or an earlier run) already read from these file objects.
    _rewind(file1)
    _rewind(file2)

    # Extract and compare text
    doc1_text = extract_text_from_pdf(file1)
    doc2_text = extract_text_from_pdf(file2)
    text_differences = compare_text(doc1_text, doc2_text)

    # The image pass reads the very same file objects again. The text pass
    # presumably consumed them (TODO confirm against extract_text_from_pdf),
    # so rewind; otherwise the second read() would return empty bytes.
    _rewind(file1)
    _rewind(file2)

    # Compare images from both PDFs
    image_differences = compare_images(file1, file2)

    return text_differences, image_differences
16
 
17
  def extract_text_from_pdf(file):
18
  # Extract text from PDF
 
35
  differences["minor"].append(line[2:]) # Minor differences or edits
36
  return differences
37
 
def pdf_to_images(pdf_file):
    """Render every page of a PDF into a PIL image.

    Args:
        pdf_file: Binary file-like object containing the PDF. It is rewound
            before reading when it supports ``seek``, so a stream that was
            already consumed elsewhere is still read in full.

    Returns:
        A list of RGB ``PIL.Image.Image`` objects, one per page, in page order.
    """
    # A previously-read stream would hand back b"" and make fitz.open fail,
    # so always start from the beginning when the object allows it.
    seek = getattr(pdf_file, "seek", None)
    if callable(seek):
        seek(0)

    pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    images = []
    try:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            # alpha=False keeps the sample buffer at 3 bytes per pixel, which
            # is what the "RGB" decode below requires.
            pix = page.get_pixmap(alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            images.append(img)
    finally:
        # Release the document even when rendering a page raises.
        pdf_document.close()
    return images
49
+
def _page_difference(img1, img2):
    """Return a diff image for one page pair, or None when the pages match.

    Either argument may be None when that document has no such page.
    """
    same_shape = (
        img1 is not None and img2 is not None and img1.size == img2.size
    )
    if same_shape:
        diff = ImageChops.difference(img1, img2)
        return diff if diff.getbbox() else None

    # Missing page or differing page size: that is a difference by itself.
    # Place both pages on a shared white canvas so the diff image covers the
    # full extent of the larger page instead of only the overlapping area.
    present = [img for img in (img1, img2) if img is not None]
    canvas_size = (
        max(img.width for img in present),
        max(img.height for img in present),
    )

    def on_canvas(img):
        canvas = Image.new("RGB", canvas_size, "white")
        if img is not None:
            canvas.paste(img, (0, 0))
        return canvas

    return ImageChops.difference(on_canvas(img1), on_canvas(img2))


def compare_images(file1, file2):
    """Compare two PDFs page by page as rendered images.

    Args:
        file1: File-like object for the first PDF.
        file2: File-like object for the second PDF.

    Returns:
        A list of ``(page_number, diff_image)`` tuples (page numbers start
        at 1) for every page that differs. Pages that exist in only one
        document, and pages whose sizes differ, are always reported.
    """
    # Local import: the file's import block does not bring in itertools.
    from itertools import zip_longest

    # Convert PDFs to images
    images1 = pdf_to_images(file1)
    images2 = pdf_to_images(file2)

    differences = []
    # zip_longest (rather than zip) so trailing pages of the longer document
    # are not silently dropped from the comparison.
    page_pairs = zip_longest(images1, images2)
    for page_number, (img1, img2) in enumerate(page_pairs, start=1):
        diff = _page_difference(img1, img2)
        if diff is not None:
            differences.append((page_number, diff))  # Page number and diff image

    return differences
63
+
64
  # Streamlit app interface
def main():
    """Run the Streamlit page: upload two PDFs and show their differences.

    Renders the title, two PDF uploaders and a "Compare Documents" button.
    When the button is pressed with both files present, the text differences
    (grouped under the "new", "missing" and "minor" keys) and the per-page
    image differences returned by ``load_and_compare_documents`` are shown.
    """
    # Local import: the file's import block does not bring in html.
    from html import escape

    st.title("Document Text and Image Comparison Tool")
    st.write("Upload Customer Document and CorelDRAW Output for Comparison.")

    customer_file = st.file_uploader("Customer Document (PDF only)", type=["pdf"])
    output_file = st.file_uploader("CorelDRAW Output (PDF only)", type=["pdf"])

    # st.button stays first in the condition so the button is always drawn.
    if not (st.button("Compare Documents") and customer_file and output_file):
        return

    text_differences, image_differences = load_and_compare_documents(
        customer_file, output_file
    )

    # Display text differences in the UI
    st.subheader("Text Differences")
    # (result key, heading, colour, line marker) for each category.
    sections = (
        ("new", "**New Content (Lines present only in CorelDRAW Output):**", "blue", "+"),
        ("missing", "**Missing Content (Lines present only in Customer Document):**", "red", "-"),
        ("minor", "**Minor Differences:**", "orange", "?"),
    )
    for key, heading, color, marker in sections:
        lines = text_differences[key]
        if not lines:
            continue
        st.write(heading)
        for line in lines:
            # The text comes from uploaded PDFs and is rendered as raw HTML,
            # so escape it to keep document content from injecting markup.
            st.markdown(
                f"<span style='color:{color}'>{marker} {escape(line)}</span>",
                unsafe_allow_html=True,
            )

    # Display image differences in the UI
    st.subheader("Image Differences")
    if not image_differences:
        st.write("No image differences detected.")
        return

    for page_num, diff_image in image_differences:
        st.write(f"**Differences on Page {page_num}**")
        diff_image_io = io.BytesIO()
        diff_image.save(diff_image_io, format="PNG")
        st.image(
            diff_image_io,
            caption=f"Differences on Page {page_num}",
            use_column_width=True,
        )
100
 
101
  if __name__ == "__main__":
102
  main()