Spaces:

SuriRaja
/

usecase2

Sleeping

App Files Files Community

SuriRaja committed on Nov 11, 2024

Commit

778fed4

verified ·

1 Parent(s): 9aff2fb

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -20

app.py CHANGED Viewed

@@ -1,14 +1,18 @@
 import streamlit as st
 import fitz  # PyMuPDF
 import difflib
 def load_and_compare_documents(file1, file2):
-    # Pull the text out of both PDFs (file1 first, then file2)
     doc1_text, doc2_text = extract_text_from_pdf(file1), extract_text_from_pdf(file2)
-    # Diff the two texts; compare_text returns the categorized differences
-    differences = compare_text(doc1_text, doc2_text)
-    return differences
 def extract_text_from_pdf(file):
     # Extract text from PDF
@@ -31,37 +35,68 @@ def compare_text(doc1_text, doc2_text):
             differences["minor"].append(line[2:])  # Minor differences or edits
     return differences
 # Streamlit app interface
 def main():
-    st.title("Document Text Comparison Tool")
     st.write("Upload Customer Document and CorelDRAW Output for Comparison.")
     customer_file = st.file_uploader("Customer Document (PDF only)", type=["pdf"])
     output_file = st.file_uploader("CorelDRAW Output (PDF only)", type=["pdf"])
     if st.button("Compare Documents") and customer_file and output_file:  # runs only once both uploads exist
-        differences = load_and_compare_documents(customer_file, output_file)
-        # Render each non-empty difference category; NOTE(review): `line` below is PDF text placed into raw HTML without escaping
-        st.subheader("Differences Summary")
-        # "new": shown in blue with a leading "+"
-        if differences["new"]:
             st.write("**New Content (Lines present only in CorelDRAW Output):**")
-            for line in differences["new"]:
                 st.markdown(f"<span style='color:blue'>+ {line}</span>", unsafe_allow_html=True)
-        # "missing": shown in red with a leading "-"
-        if differences["missing"]:
             st.write("**Missing Content (Lines present only in Customer Document):**")
-            for line in differences["missing"]:
                 st.markdown(f"<span style='color:red'>- {line}</span>", unsafe_allow_html=True)
-        # "minor": shown in orange with a leading "?"
-        if differences["minor"]:
             st.write("**Minor Differences:**")
-            for line in differences["minor"]:
                 st.markdown(f"<span style='color:orange'>? {line}</span>", unsafe_allow_html=True)
 if __name__ == "__main__":
     main()

 import streamlit as st
 import fitz  # PyMuPDF
+from PIL import Image, ImageChops
 import difflib
+import io
 def load_and_compare_documents(file1, file2):
+    # Extract and compare text
     doc1_text, doc2_text = extract_text_from_pdf(file1), extract_text_from_pdf(file2)
+    text_differences = compare_text(doc1_text, doc2_text)
+    # Compare images from both PDFs
+    image_differences = compare_images(file1, file2)
+    return text_differences, image_differences
 def extract_text_from_pdf(file):
     # Extract text from PDF
             differences["minor"].append(line[2:])  # Minor differences or edits
     return differences
+def pdf_to_images(pdf_file):
+    # Convert each page of the PDF to images
+    images = []
+    pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
+    for page_num in range(pdf_document.page_count):
+        page = pdf_document.load_page(page_num)
+        pix = page.get_pixmap()
+        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+        images.append(img)
+    pdf_document.close()
+    return images
+def compare_images(file1, file2):
+    # Convert PDFs to images
+    images1 = pdf_to_images(file1)
+    images2 = pdf_to_images(file2)
+    differences = []
+    for i, (img1, img2) in enumerate(zip(images1, images2)):
+        # Compare images and find differences
+        diff = ImageChops.difference(img1, img2)
+        if diff.getbbox():
+            differences.append((i + 1, diff))  # Page number and diff image
+    return differences
 # Streamlit app interface
 def main():
+    st.title("Document Text and Image Comparison Tool")
     st.write("Upload Customer Document and CorelDRAW Output for Comparison.")
     customer_file = st.file_uploader("Customer Document (PDF only)", type=["pdf"])
     output_file = st.file_uploader("CorelDRAW Output (PDF only)", type=["pdf"])
     if st.button("Compare Documents") and customer_file and output_file:
+        text_differences, image_differences = load_and_compare_documents(customer_file, output_file)
+        # Display text differences in the UI
+        st.subheader("Text Differences")
+        if text_differences["new"]:
             st.write("**New Content (Lines present only in CorelDRAW Output):**")
+            for line in text_differences["new"]:
                 st.markdown(f"<span style='color:blue'>+ {line}</span>", unsafe_allow_html=True)
+        if text_differences["missing"]:
             st.write("**Missing Content (Lines present only in Customer Document):**")
+            for line in text_differences["missing"]:
                 st.markdown(f"<span style='color:red'>- {line}</span>", unsafe_allow_html=True)
+        if text_differences["minor"]:
             st.write("**Minor Differences:**")
+            for line in text_differences["minor"]:
                 st.markdown(f"<span style='color:orange'>? {line}</span>", unsafe_allow_html=True)
+        # Display image differences in the UI
+        st.subheader("Image Differences")
+        if image_differences:
+            for page_num, diff_image in image_differences:
+                st.write(f"**Differences on Page {page_num}**")
+                diff_image_io = io.BytesIO()
+                diff_image.save(diff_image_io, format="PNG")
+                st.image(diff_image_io, caption=f"Differences on Page {page_num}", use_column_width=True)
+        else:
+            st.write("No image differences detected.")
 if __name__ == "__main__":
     main()