import streamlit as st
import fitz  # PyMuPDF
import difflib
from PIL import Image, ImageDraw, ImageEnhance, ImageFilter
import easyocr
import numpy as np
import pandas as pd
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import tempfile
import os
from io import BytesIO

# Initialize the easyocr Reader once at module import, configured with the
# ['en'] language list. perform_ocr_and_compare() reads this shared instance.
# NOTE(review): Streamlit presumably re-executes this script on every
# interaction, which would rebuild the Reader each time — confirm and consider
# caching the instance.
ocr_reader = easyocr.Reader(['en'])

def load_and_compare_documents(file1, file2):
    """Compare two uploaded PDFs and build the report artifacts.

    Args:
        file1: Object exposing ``read()`` that returns the first PDF's bytes.
        file2: Object exposing ``read()`` that returns the second PDF's bytes.

    Returns:
        A ``(pdf_buffer, overall_summary)`` tuple: the side-by-side comparison
        PDF produced by ``create_pdf_with_side_by_side`` and the totals dict
        produced by ``generate_overall_summary``.
    """
    first_bytes = file1.read()
    second_bytes = file2.read()

    # OCR every page pair and collect the per-page differences plus the
    # annotated page images for both documents.
    differences, annotated_first, annotated_second = perform_ocr_and_compare(
        first_bytes, second_bytes
    )

    # Lay the annotated pages out side by side, followed by observation tables.
    report_buffer = create_pdf_with_side_by_side(
        annotated_first, annotated_second, differences
    )

    # Aggregate the per-page differences into document-wide totals.
    summary = generate_overall_summary(differences)

    return report_buffer, summary

def pdf_to_images(file_content):
    """Render every page of a PDF to a preprocessed PIL image.

    Args:
        file_content: Raw bytes of the PDF document.

    Returns:
        A list of ``(page_number, image)`` tuples in page order, where
        ``page_number`` is 1-based and ``image`` is an RGB ``PIL.Image``
        rendered at 300 DPI with contrast and brightness boosted and a
        sharpening filter applied.

    Raises:
        Whatever ``fitz.open`` or page rendering raises for unreadable input;
        the document handle is closed before the exception propagates.
    """
    images = []
    pdf_document = fitz.open(stream=file_content, filetype="pdf")
    try:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(dpi=300)  # High DPI for better zoom capability
            # NOTE(review): "RGB" assumes the pixmap has three channels and no
            # alpha, which relies on get_pixmap's defaults — confirm.
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # Preprocess image for OCR: raise contrast, then brightness, then
            # sharpen edges.
            img = ImageEnhance.Contrast(img).enhance(1.5)
            img = ImageEnhance.Brightness(img).enhance(1.2)
            img = img.filter(ImageFilter.SHARPEN)

            images.append((page_num + 1, img))
    finally:
        # Always release the document, even if rendering a page fails.
        pdf_document.close()
    return images

def _bbox_to_rect(bbox):
    """Return the axis-aligned ``[x0, y0, x1, y1]`` box enclosing the points of *bbox*."""
    xs = [float(point[0]) for point in bbox]
    ys = [float(point[1]) for point in bbox]
    return [min(xs), min(ys), max(xs), max(ys)]


def perform_ocr_and_compare(content1, content2):
    """OCR two PDFs page by page and record where their text differs.

    Pages are paired in order with ``zip``, so pages beyond the length of the
    shorter document are not compared.

    Args:
        content1: Raw bytes of the first PDF.
        content2: Raw bytes of the second PDF.

    Returns:
        A tuple ``(ocr_differences, marked_images_1, marked_images_2)``:

        * ``ocr_differences`` is a list of ``{"page": int, "differences": [...]}``
          dicts, one per page whose OCR text differs. Each difference has the
          keys ``type`` ("Added", "Deleted" or "Modified"), ``value``,
          ``index`` and ``description``. Because ``difflib.ndiff`` is given two
          strings, the diff is character-level and ``index`` is the position
          of the entry in the ndiff output sequence.
        * ``marked_images_1`` / ``marked_images_2`` map the page number to a
          copy of that page image with red boxes drawn around text regions
          detected on the second document's page whose text does not occur in
          the first document's page text. Only differing pages are present.
    """
    ocr_differences = []
    marked_images_1 = {}
    marked_images_2 = {}
    images1 = pdf_to_images(content1)
    images2 = pdf_to_images(content2)

    for (page_num, img1), (_, img2) in zip(images1, images2):
        # OCR each page exactly once; the second page's results are reused
        # below for drawing the highlight boxes.
        results1 = ocr_reader.readtext(np.array(img1))
        results2 = ocr_reader.readtext(np.array(img2))
        text1 = ' '.join(result[1] for result in results1)
        text2 = ' '.join(result[1] for result in results2)

        # Case-insensitive, whitespace-trimmed equality means "no difference".
        if text1.strip().lower() == text2.strip().lower():
            continue

        page_diffs = []
        for i, change in enumerate(difflib.ndiff(text1, text2)):
            value = change[2:]
            if change.startswith("+ "):
                page_diffs.append({"type": "Added", "value": value, "index": i, "description": f"'{value}' added in second PDF but not in first PDF at position {i}"})
            elif change.startswith("- "):
                page_diffs.append({"type": "Deleted", "value": value, "index": i, "description": f"'{value}' present in first PDF but missing in second PDF at position {i}"})
            elif change.startswith("? "):
                page_diffs.append({"type": "Modified", "value": value, "index": i, "description": f"'{value}' modified at position {i}"})

        ocr_differences.append({"page": page_num, "differences": page_diffs})

        # Duplicate images so the originals stay unmarked.
        marked_img1 = img1.copy()
        marked_img2 = img2.copy()
        draw1 = ImageDraw.Draw(marked_img1)
        draw2 = ImageDraw.Draw(marked_img2)

        # Mark OCR-detected differences as boxed highlights on both images.
        text1_lower = text1.lower()
        text2_lower = text2.lower()
        for result in results2:
            bbox, detected_text = result[0], result[1]
            needle = detected_text.strip().lower()
            if needle in text2_lower and needle not in text1_lower:
                # The OCR box is a list of corner points; use the extent of
                # all of them so the rectangle has real height and valid
                # ordering even for skewed text.
                rect = _bbox_to_rect(bbox)
                draw1.rectangle(rect, outline="red", width=2)
                draw2.rectangle(rect, outline="red", width=2)

        marked_images_1[page_num] = marked_img1
        marked_images_2[page_num] = marked_img2

    return ocr_differences, marked_images_1, marked_images_2

def create_pdf_with_side_by_side(marked_images_1, marked_images_2, ocr_differences):
    """Build a PDF showing each differing page pair next to its observations.

    For every page number present in both image mappings, the report contains
    one double-letter-width page with the two images side by side, followed by
    one or more pages listing that page's differences as a two-column table
    (description, position).

    Args:
        marked_images_1: Mapping of page number to the first document's
            annotated ``PIL.Image``.
        marked_images_2: Mapping of page number to the second document's
            annotated ``PIL.Image``.
        ocr_differences: List of ``{"page": int, "differences": [...]}`` dicts
            whose differences carry ``description`` and ``index`` keys.

    Returns:
        A ``BytesIO`` holding the finished PDF, positioned at offset 0.
    """
    page_width, page_height = letter
    left_margin = 10
    top_y = 750
    bottom_y = 50
    row_height = 15
    column_widths = [350, 100]
    # Each column starts where the previous one ends (cumulative widths).
    column_x = [
        left_margin + sum(column_widths[:col_index])
        for col_index in range(len(column_widths))
    ]

    pdf_buffer = BytesIO()
    c = canvas.Canvas(pdf_buffer, pagesize=(page_width * 2, page_height))  # Adjusted for side-by-side layout

    # Loop through each page to add side-by-side images and observations
    for page_num, img1 in marked_images_1.items():
        img2 = marked_images_2.get(page_num)
        if img2 is None:
            continue

        temp_paths = []
        try:
            for img in (img1, img2):
                with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_img_file:
                    # Record the path before saving so it is cleaned up even
                    # if the save itself fails.
                    temp_paths.append(temp_img_file.name)
                    img.save(temp_img_file, format="PNG")

            # Draw the saved images side-by-side on the PDF
            c.drawImage(temp_paths[0], 0, 0, width=page_width, height=page_height)
            c.drawImage(temp_paths[1], page_width, 0, width=page_width, height=page_height)
            c.showPage()
        finally:
            # Best-effort cleanup: a file that cannot be removed must not
            # abort report generation.
            for temp_path in temp_paths:
                try:
                    os.remove(temp_path)
                except OSError:
                    pass

        # Generate the observation table for each page
        c.setFont("Helvetica", 10)
        y_position = top_y
        c.drawString(left_margin, y_position, f"Observation Summary for Page {page_num}:")
        y_position -= 20

        # Table rows for this page: (description, position)
        rows = [
            (diff["description"], diff["index"])
            for ocr_diff in ocr_differences
            if ocr_diff["page"] == page_num
            for diff in ocr_diff["differences"]
        ]

        for row in rows:
            if y_position < bottom_y:  # Start a new page if space is running out
                c.showPage()
                # showPage() resets the canvas state, so restore the font.
                c.setFont("Helvetica", 10)
                y_position = top_y
            for x_position, value in zip(column_x, row):
                c.drawString(x_position, y_position, str(value))
            y_position -= row_height

        c.showPage()

    c.save()
    pdf_buffer.seek(0)
    return pdf_buffer

def generate_overall_summary(ocr_differences):
    """Total the differences of each type across all pages.

    Args:
        ocr_differences: Sequence of ``{"differences": [...]}`` dicts whose
            entries each carry a ``"type"`` key.

    Returns:
        A dict with the keys ``total_additions``, ``total_deletions`` and
        ``total_modifications`` counting the entries typed "Added",
        "Deleted" and "Modified" respectively. Other types are ignored.
    """

    def count_of(kind):
        # Count entries of one type over every page's difference list.
        return sum(
            1
            for page_entry in ocr_differences
            for item in page_entry["differences"]
            if item["type"] == kind
        )

    return {
        "total_additions": count_of("Added"),
        "total_deletions": count_of("Deleted"),
        "total_modifications": count_of("Modified"),
    }

# Streamlit app interface
def main():
    """Render the comparison UI and run the comparison when requested.

    Shows two PDF uploaders and a "Compare Documents" button. When the button
    is pressed, the uploads are validated (both present, neither empty),
    compared via ``load_and_compare_documents``, and the totals plus a
    download button for the generated PDF are displayed.
    """
    st.title("Comprehensive Document Comparison Tool with OCR Text Extraction")
    st.write("Upload Customer Document and CorelDRAW Output for detailed comparison.")

    customer_file = st.file_uploader("Customer Document (PDF only)", type=["pdf"])
    output_file = st.file_uploader("CorelDRAW Output (PDF only)", type=["pdf"])

    if not st.button("Compare Documents"):
        return

    # Tell the user why nothing happened instead of silently ignoring the click.
    if customer_file is None or output_file is None:
        st.warning("Please upload both PDF files before comparing.")
        return

    if customer_file.size == 0 or output_file.size == 0:
        st.error("One or both files are empty. Please upload valid PDF files.")
        return

    pdf_buffer, overall_summary = load_and_compare_documents(customer_file, output_file)

    st.subheader("Overall Comparison Summary")
    for key, value in overall_summary.items():
        st.write(f"{key.replace('_', ' ').capitalize()}: {value}")

    # Provide download link for generated PDF with marked differences
    st.subheader("Download PDF with Side-by-Side Comparisons and Observations")
    st.download_button("Download Marked PDF", data=pdf_buffer, file_name="side_by_side_comparison.pdf", mime="application/pdf")

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()