Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from openai import OpenAI | |
| from openai.types.beta.threads.message_create_params import Attachment, AttachmentToolFileSearch | |
| import fitz # PyMuPDF | |
| from pdf2image import convert_from_bytes | |
| from PIL import Image, ImageDraw | |
| from rapidfuzz import fuzz | |
| import cv2 | |
| import hashlib | |
| import numpy as np | |
| import io | |
| import math | |
| import tempfile | |
| import os | |
| from streamlit_drawable_canvas import st_canvas | |
| ######################################## | |
| # Utility Functions & OpenAI Setup | |
| ######################################## | |
# The API key is read from the environment rather than embedded in source:
# a key committed to a file must be treated as leaked and revoked.
_OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Leave the client unset when no key is configured so the module can still be
# imported; generate_llm_summary() raises ValueError for a None client.
client = OpenAI(api_key=_OPENAI_API_KEY) if _OPENAI_API_KEY else None
def generate_llm_summary(
    text_mismatches,
    image_changes,
    pixel_diffs,
    model="gpt-3.5-turbo",
    client=client
):
    """
    Ask a chat model for a plain-language summary of PDF differences.

    Args:
        text_mismatches (dict): May hold "missing" and "extra" lists whose
            entries start with (zero-based page index, text); any further
            tuple fields are ignored.
        image_changes (dict): May hold "added" and "deleted" mappings of
            zero-based page index -> collection of changed images.
        pixel_diffs (list): (page_num, num_differences) pairs; page_num is
            printed as given, without the +1 applied to the other inputs.
        model (str): Chat model name forwarded to the API.
        client: Object exposing ``chat.completions.create``; defaults to the
            module-level client.

    Returns:
        str: Content of the first choice in the API response.

    Raises:
        ValueError: If ``client`` is None.
    """
    if client is None:
        raise ValueError("A valid OpenAI client instance is required.")

    def _text_lines(entries):
        # Entries may carry bbox/font fields after the text; only page and text are shown.
        if not entries:
            return "None"
        return "\n".join(f"- Page {p+1}: {t}" for p, t, *_ in entries)

    def _image_lines(per_page, noun):
        if not per_page:
            return "None"
        return "\n".join(
            f"- Page {page_idx+1}: {len(hashes)} {noun} image(s)"
            for page_idx, hashes in per_page.items()
        )

    missing_str = _text_lines(text_mismatches.get("missing", []))
    extra_str = _text_lines(text_mismatches.get("extra", []))
    added_str = _image_lines(image_changes.get("added", {}), "new")
    deleted_str = _image_lines(image_changes.get("deleted", {}), "removed")

    if pixel_diffs:
        region_lines = [
            f"- Page {page_num}: {num_diffs} difference region(s)"
            for page_num, num_diffs in pixel_diffs
        ]
        pixel_diff_str = "Visual differences detected on:\n" + "\n".join(region_lines)
    else:
        pixel_diff_str = "No visual differences detected"

    # Instructions that frame the model's role and the shape of the summary.
    system_prompt = """You are a PDF comparison expert performing a quality control check of package artwork.
Analyze the differences between two PDFs and provide a clear, concise summary that a non-technical user can understand.
Focus on:
1. Most significant changes first
2. Group similar changes together
3. Provide specific page numbers and locations
4. Explain the nature of changes (additions, deletions, modifications)
5. Specify specifics of the reported changes (e.g., color differences)
6. Indicate if text changes align with pixel differences so as not to double-count the same issue.
"""

    # The formatted findings themselves.
    user_prompt = f"""Please analyze these PDF differences and provide a clear summary:
TEXT CHANGES
Missing/Deleted Text:
{missing_str}
Added/Extra Text:
{extra_str}
IMAGE CHANGES
Added Images:
{added_str}
Removed Images:
{deleted_str}
VISUAL DIFFERENCES
{pixel_diff_str}
Provide a clear, organized summary of these changes for a non-technical user."""

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=0.7,
        max_tokens=1000
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
def normalize_text(text):
    """Collapse each run of whitespace in *text* to one space and trim both ends."""
    tokens = text.split()
    return " ".join(tokens)
| ######################################## | |
| # 1) Enhanced Text Extraction with Bounding Boxes & Font Info | |
| ######################################## | |
def extract_text_with_details(pdf_bytes):
    """
    Extract every non-blank text span from a PDF using PyMuPDF.

    Args:
        pdf_bytes (bytes): The PDF file content.

    Returns:
        List of tuples: (page_index, extracted_text, bounding_box, font_name, font_size)
        where page_index is zero-based, extracted_text is whitespace-normalized,
        bounding_box is the span's "bbox" value, and font_name / font_size fall
        back to "Unknown" / 0 when the span does not report them.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    detailed_text = []
    try:
        for page_index, page in enumerate(doc):
            # The 'dict' layout exposes blocks -> lines -> spans with bbox and font data.
            page_dict = page.get_text("dict")
            for block in page_dict["blocks"]:
                # Blocks without a "lines" entry carry no text spans to inspect.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        text_content = normalize_text(span["text"])
                        if not text_content:
                            # normalize_text already trims, so empty means whitespace-only.
                            continue
                        detailed_text.append(
                            (
                                page_index,
                                text_content,
                                span["bbox"],
                                span.get("font", "Unknown"),
                                span.get("size", 0),
                            )
                        )
    finally:
        # Release the document even when extraction fails part-way through.
        doc.close()
    return detailed_text
| ######################################## | |
| # 2) Text Comparison Using Bounding Boxes & Font Properties | |
| ######################################## | |
def extract_region_as_pdf(pdf_bytes, page_number, bbox):
    """
    Crop a rectangular region of one PDF page into a new single-page PDF.

    Args:
        pdf_bytes (bytes): The full PDF file in bytes
        page_number (int): Zero-based index of the page to crop
        bbox (tuple): (x0, y0, x1, y1) passed unchanged to fitz.Rect and used
            as the clip rectangle on the source page.
            NOTE(review): PyMuPDF documents its page coordinates with the
            origin at the top-left and y growing downwards, so callers that
            assume a bottom-left origin should be checked.

    Returns:
        bytes: Cropped PDF as in-memory bytes

    Raises:
        ValueError: If page_number is outside the document.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        if not 0 <= page_number < len(doc):
            raise ValueError("Invalid page number.")

        rect = fitz.Rect(bbox)  # (x0, y0, x1, y1)

        new_pdf = fitz.open()
        try:
            # The output page has exactly the size of the clip rectangle ...
            new_page = new_pdf.new_page(width=rect.width, height=rect.height)
            # ... and the clipped source region is drawn to fill it completely.
            new_page.show_pdf_page(new_page.rect, doc, page_number, clip=rect)

            output_buffer = io.BytesIO()
            new_pdf.save(output_buffer)
            return output_buffer.getvalue()
        finally:
            new_pdf.close()
    finally:
        # Runs on the invalid-page path and on any failure above as well.
        doc.close()
def compare_text_details(ref_text_details, test_text_details, similarity_threshold=90, box_shift_threshold=10.0):
    """
    Pair reference text spans with test text spans and report the differences.

    Each reference span is matched against the not-yet-matched test spans on
    the same page, using the highest fuzzy-ratio score; the first candidate
    wins ties. A matched test span is consumed so it cannot match again.

    Args:
        ref_text_details: List of (page_idx, text, bbox, font_name, font_size)
        test_text_details: Same structure for test PDF
        similarity_threshold: Minimum fuzzy score for two texts to count as a match
        box_shift_threshold: Largest distance between bounding-box centers
            that is not reported as 'warped'.

    Returns:
        dict with keys:
            "missing": reference items with no adequate match
            "extra": test items left unmatched
            "warped": [(page, text, ref_bbox, test_bbox, ref_font, test_font), ...]
                for matches whose box center moved too far, whose font name
                differs, or whose font size differs by more than 0.5
    """
    def _center(box):
        return ((box[0] + box[2]) / 2.0, (box[1] + box[3]) / 2.0)

    missing = []
    warped = []
    # Working copy: matched candidates are popped, the remainder is "extra".
    unmatched = list(test_text_details)

    for ref_item in ref_text_details:
        ref_page, ref_text, ref_bbox, ref_font, ref_size = ref_item

        # Score only candidates that sit on the same page as the reference span.
        scored = [
            (fuzz.ratio(ref_text, cand_text), pool_idx)
            for pool_idx, (cand_page, cand_text, _bbox, _font, _size) in enumerate(unmatched)
            if cand_page == ref_page
        ]
        # max() returns the first of several equal maxima, i.e. the earliest candidate.
        top_score, top_idx = max(scored, key=lambda pair: pair[0], default=(0, -1))

        # A zero score never counts as a match, whatever the threshold is.
        if top_score <= 0 or top_score < similarity_threshold:
            missing.append(ref_item)
            continue

        _, _, test_bbox, test_font, test_size = unmatched.pop(top_idx)

        moved_by = math.dist(_center(ref_bbox), _center(test_bbox))
        same_font = (ref_font == test_font) and not (abs(ref_size - test_size) > 0.5)
        if moved_by > box_shift_threshold or not same_font:
            warped.append(
                (
                    ref_page,
                    ref_text,
                    ref_bbox,
                    test_bbox,
                    f"{ref_font}({ref_size:.1f})",
                    f"{test_font}({test_size:.1f})",
                )
            )

    return {
        "missing": missing,
        "extra": unmatched,
        "warped": warped,
    }
def generate_text_diff_report(mismatches):
    """
    Format text mismatch data as Markdown with inline HTML color spans.

    Text and font names come from the compared PDFs and are HTML-escaped
    before being embedded, so document content cannot inject markup when the
    report is rendered with raw HTML enabled.

    Args:
        mismatches (dict): Must hold the keys "missing", "extra" and "warped".
            "missing" / "extra" entries are (page_idx, text, bbox, font, size);
            "warped" entries are
            (page_idx, text, ref_bbox, test_bbox, ref_font_info, test_font_info).
            page_idx is zero-based and printed one-based.

    Returns:
        str: The report, one entry per line.
    """
    from html import escape

    def _colored(text, color):
        return f"<span style='color:{color};'>{escape(str(text))}</span>"

    def _span_entries(entries, color):
        return [
            f" - Page {page_idx + 1}, BBox {bbox}, "
            f"Font {escape(str(font))}({size:.1f}): {_colored(text, color)}"
            for (page_idx, text, bbox, font, size) in entries
        ]

    missing = mismatches["missing"]
    extra = mismatches["extra"]
    warped = mismatches["warped"]

    report_lines = ["### TEXT DIFFERENCES"]

    if missing:
        report_lines.append("\n**Missing/Deleted Text:**")
        report_lines.extend(_span_entries(missing, "red"))
    else:
        report_lines.append("\nNo deleted text.")

    if extra:
        report_lines.append("\n**Added/Extra Text:**")
        report_lines.extend(_span_entries(extra, "green"))
    else:
        report_lines.append("\nNo added text.")

    if warped:
        report_lines.append("\n**Warped or Misaligned Text:**")
        for (page_idx, text, ref_bbox, test_bbox, ref_font_info, test_font_info) in warped:
            report_lines.append(
                f" - Page {page_idx + 1}: {_colored(text, 'orange')}<br>"
                f" Ref BBox {ref_bbox}, Test BBox {test_bbox}, "
                f" Ref Font: {escape(str(ref_font_info))}, Test Font: {escape(str(test_font_info))}"
            )
    else:
        report_lines.append("\nNo warped or misaligned text.")

    return "\n".join(report_lines)
| ######################################## | |
| # 3) Image & Color Analysis | |
| ######################################## | |
def get_image_info(pdf_bytes):
    """
    Collect per-page metadata for the images referenced by a PDF.

    Args:
        pdf_bytes (bytes): The PDF file content.

    Returns:
        dict: page_index -> list of dictionaries with:
            {
                "hash": MD5 hex digest of the extracted image bytes,
                "width": width,
                "height": height,
                "bpc": bits-per-component entry of the image tuple,
                "colorspace": color space name (e.g. 'DeviceRGB', 'DeviceCMYK'),
                "xref": xref (for reference)
            }
        Pages without images have no entry. The MD5 digest is used purely as
        a content fingerprint for detecting added/removed images.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    image_info = {}
    try:
        for page_index in range(len(doc)):
            page = doc[page_index]
            for img in page.get_images(full=True):
                # Tuple layout used here: (xref, smask, width, height, bpc, colorspace, ...)
                xref = img[0]
                image_data = doc.extract_image(xref)["image"]
                image_info.setdefault(page_index, []).append({
                    "hash": hashlib.md5(image_data).hexdigest(),
                    "width": img[2],
                    "height": img[3],
                    "bpc": img[4],
                    "colorspace": img[5],
                    "xref": xref
                })
    finally:
        # Release the document even when an image cannot be extracted.
        doc.close()
    return image_info
def compare_image_info(ref_info, test_info):
    """
    Compare per-page image metadata of a reference and a test PDF.

    Images are identified by their "hash" value within a page. Pages are
    processed in ascending order and images in the order they were listed,
    so the result is identical from run to run (plain set iteration over
    hash strings is not).

    Args:
        ref_info (dict): page_index -> list of image dicts with at least
            "hash", "width", "height" and "colorspace".
        test_info (dict): Same structure for the test PDF.
            Page indexes must be mutually orderable (they are ints when
            produced by get_image_info).

    Returns:
        dict with keys 'added', 'deleted', 'color_mismatch', 'distorted',
        each a dict of page_index -> list of details:
            'added' / 'deleted': image dicts present on only one side
            'color_mismatch' / 'distorted': (ref_img, test_img) pairs
    """
    def _aspect_ratio(img):
        # A zero height yields 0, which the distortion check treats as "unknown".
        return img["width"] / float(img["height"]) if img["height"] != 0 else 0

    results = {
        "added": {},
        "deleted": {},
        "color_mismatch": {},
        "distorted": {}
    }

    for page_idx in sorted(set(ref_info) | set(test_info)):
        # Dicts keep first-insertion order, giving a stable per-page image order.
        ref_hashes = {img["hash"]: img for img in ref_info.get(page_idx, [])}
        test_hashes = {img["hash"]: img for img in test_info.get(page_idx, [])}

        deleted = [img for h, img in ref_hashes.items() if h not in test_hashes]
        added = [img for h, img in test_hashes.items() if h not in ref_hashes]
        if deleted:
            results["deleted"][page_idx] = deleted
        if added:
            results["added"][page_idx] = added

        # Images present on both sides: look for color space or shape changes.
        for h, ref_img in ref_hashes.items():
            if h not in test_hashes:
                continue
            test_img = test_hashes[h]

            if ref_img["colorspace"] != test_img["colorspace"]:
                results["color_mismatch"].setdefault(page_idx, []).append((ref_img, test_img))

            ref_ar = _aspect_ratio(ref_img)
            test_ar = _aspect_ratio(test_img)
            # Flag aspect-ratio changes above 0.01; skipped when the reference ratio is 0.
            if ref_ar != 0 and abs(ref_ar - test_ar) > 0.01:
                results["distorted"].setdefault(page_idx, []).append((ref_img, test_img))

    return results
| ######################################## | |
| # 4) Visual Layout / Pixel-Based Differences | |
| ######################################## | |
def pdf_to_images(pdf_bytes, dpi=100):
    """
    Render an in-memory PDF through pdf2image's convert_from_bytes.

    Args:
        pdf_bytes (bytes): The PDF file content.
        dpi (int): Rendering resolution forwarded to the converter.

    Returns:
        The converter's result: the rendered pages as PIL Images.
    """
    rendered_pages = convert_from_bytes(pdf_bytes, dpi=dpi)
    return rendered_pages
def detect_image_differences(img_ref, img_test, diff_threshold=30):
    """
    Locate regions where two page images differ at the pixel level.

    Both images are converted to grayscale; the test image is resized to the
    reference image's dimensions when the two differ. Pixels whose absolute
    difference exceeds diff_threshold form a binary mask, which is passed
    through a morphological close with a 3x3 kernel before the external
    contours are extracted.

    Args:
        img_ref: Reference image (converted with COLOR_RGB2GRAY).
        img_test: Test image (converted the same way).
        diff_threshold (int): Grayscale difference above which a pixel counts.

    Returns:
        The contours found by cv2.findContours, in reference-image coordinates.
    """
    gray_ref, gray_test = (
        cv2.cvtColor(np.array(picture), cv2.COLOR_RGB2GRAY)
        for picture in (img_ref, img_test)
    )

    if gray_test.shape != gray_ref.shape:
        # cv2.resize takes (width, height), the reverse of numpy's (rows, cols).
        target_size = (gray_ref.shape[1], gray_ref.shape[0])
        gray_test = cv2.resize(gray_test, target_size, interpolation=cv2.INTER_AREA)

    abs_delta = cv2.absdiff(gray_ref, gray_test)
    _, binary_mask = cv2.threshold(abs_delta, diff_threshold, 255, cv2.THRESH_BINARY)

    structuring = np.ones((3, 3), np.uint8)
    closed_mask = cv2.morphologyEx(binary_mask, cv2.MORPH_CLOSE, structuring)

    contours, _hierarchy = cv2.findContours(closed_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    return contours
def highlight_image(image, contours, color="red", width=2):
    """
    Outline the bounding rectangle of every contour on a PIL Image.

    The image is drawn on in place and also returned, so pass a copy when the
    original must stay untouched.

    Args:
        image: PIL Image to annotate.
        contours: Contours accepted by cv2.boundingRect.
        color: Outline color.
        width (int): Outline thickness in pixels.

    Returns:
        The same image object, annotated.
    """
    pen = ImageDraw.Draw(image)
    for left, top, box_w, box_h in map(cv2.boundingRect, contours):
        corners = [left, top, left + box_w, top + box_h]
        pen.rectangle(corners, outline=color, width=width)
    return image
| ######################################## | |
| # 5) Generating Interactive Reports & Downloads | |
| ######################################## | |
def generate_image_diff_summary(image_comparison_results):
    """
    Render image comparison results as a Markdown report.

    Args:
        image_comparison_results (dict): Must hold the keys "added",
            "deleted", "color_mismatch" and "distorted", each mapping a
            zero-based page index to a list. "added" / "deleted" lists hold
            image dicts; the other two hold (ref_img, test_img) pairs.

    Returns:
        str: One section per category, each either listing its entries
        (pages printed one-based) or stating that there are none.
    """
    def _single(page_idx, img):
        return f"- Page {page_idx+1}: hash={img['hash']} colorspace={img['colorspace']}"

    report = ["### IMAGE & COLOR DIFFERENCES"]

    # Added images (the only section without a leading blank line).
    added = image_comparison_results["added"]
    if not added:
        report.append("No added images.")
    else:
        report.append("**Added Images:**")
        report.extend(_single(page_idx, img) for page_idx, imgs in added.items() for img in imgs)

    # Removed images.
    removed = image_comparison_results["deleted"]
    if not removed:
        report.append("\nNo removed images.")
    else:
        report.append("\n**Removed Images:**")
        report.extend(_single(page_idx, img) for page_idx, imgs in removed.items() for img in imgs)

    # Same image, different color space.
    recolored = image_comparison_results["color_mismatch"]
    if not recolored:
        report.append("\nNo color space mismatches.")
    else:
        report.append("\n**Color Space Mismatches:**")
        for page_idx, pairs in recolored.items():
            for ref_img, test_img in pairs:
                report.append(
                    f"- Page {page_idx+1}: Hash={ref_img['hash']} "
                    f"Ref CS={ref_img['colorspace']} -> Test CS={test_img['colorspace']}"
                )

    # Same image, different aspect ratio.
    reshaped = image_comparison_results["distorted"]
    if not reshaped:
        report.append("\nNo distorted images.")
    else:
        report.append("\n**Distorted Images (Aspect Ratio Changes):**")
        for page_idx, pairs in reshaped.items():
            for ref_img, test_img in pairs:
                report.append(
                    f"- Page {page_idx+1}: Hash={ref_img['hash']} had size "
                    f"{ref_img['width']}x{ref_img['height']} -> {test_img['width']}x{test_img['height']}"
                )

    return "\n".join(report)
def create_annotated_pdf(pdf_bytes_ref, pdf_bytes_test, difference_data, diff_threshold=30, dpi=100):
    """
    Build a PDF showing each reference/test page pair side by side with the
    pixel-difference regions outlined (red on the reference, blue on the test).

    Only the pages both documents have are included. All page images are held
    in memory at once, so large documents can be memory-intensive.

    Args:
        pdf_bytes_ref (bytes): Reference PDF content.
        pdf_bytes_test (bytes): Test PDF content.
        difference_data: Not used by this function; kept so existing callers
            keep working.
        diff_threshold (int): Pixel difference threshold for detection.
        dpi (int): Resolution used to rasterize both PDFs.

    Returns:
        bytes: The annotated PDF, or None when there is no page pair to show.
    """
    ref_pages = pdf_to_images(pdf_bytes_ref, dpi=dpi)
    test_pages = pdf_to_images(pdf_bytes_test, dpi=dpi)

    combined_pages = []
    # zip() stops at the shorter document, i.e. min(len(ref), len(test)) pages.
    for ref_page, test_page in zip(ref_pages, test_pages):
        contours = detect_image_differences(ref_page, test_page, diff_threshold=diff_threshold)
        # Annotate copies so the rendered pages themselves stay unmodified.
        ref_annot = highlight_image(ref_page.copy(), contours, color="red", width=3)
        test_annot = highlight_image(test_page.copy(), contours, color="blue", width=3)

        # Place both pages on one white canvas, reference on the left.
        w_ref, h_ref = ref_annot.size
        w_test, h_test = test_annot.size
        combined_img = Image.new("RGB", (w_ref + w_test, max(h_ref, h_test)), (255, 255, 255))
        combined_img.paste(ref_annot, (0, 0))
        combined_img.paste(test_annot, (w_ref, 0))
        combined_pages.append(combined_img)

    if not combined_pages:
        return None

    # Write the multi-page PDF straight into memory: no temporary PNG/PDF
    # files and no image file handles left open afterwards.
    pdf_buffer = io.BytesIO()
    combined_pages[0].save(
        pdf_buffer,
        format="PDF",
        save_all=True,
        append_images=combined_pages[1:]
    )
    return pdf_buffer.getvalue()
| ####################################### | |
| #MAIN helper | |
| ######################################## | |
def run_qc_comparison(
    ref_pdf_bytes,
    test_pdf_bytes,
    similarity_threshold=90,
    box_shift_threshold=10,
    diff_threshold=30,
    rendering_dpi=100,
    llm_model="gpt-3.5-turbo"
):
    """
    Compares two PDFs (reference vs. test) at multiple levels:
        1. Text comparison (including bounding box & font differences)
        2. Image & color analysis
        3. Pixel-based visual differences
        4. LLM-generated summary (best effort)

    Args:
        ref_pdf_bytes (bytes): In-memory bytes of the reference PDF
        test_pdf_bytes (bytes): In-memory bytes of the test PDF
        similarity_threshold (int): Fuzzy match threshold for text
        box_shift_threshold (float): Max allowed bounding box shift for 'warping'
        diff_threshold (int): Pixel difference threshold for image diffs
        rendering_dpi (int): DPI used to rasterize PDF pages for pixel-based comparison
        llm_model (str): Chat model name used for the summary

    Returns:
        dict: The comparison results:
            {
                "text_mismatches": {...},
                "text_diff_report_html": "...",
                "image_comparison_results": {...},
                "image_diff_report": "...",
                "pixel_diffs": [(page_number, number_of_diff_regions), ...],
                "summary": "LLM summary, or an error note if it could not be generated"
            }
    """
    # 1) Text: extract spans with bounding boxes and fonts, then pair them up.
    ref_text_details = extract_text_with_details(ref_pdf_bytes)
    test_text_details = extract_text_with_details(test_pdf_bytes)
    text_mismatches = compare_text_details(
        ref_text_details,
        test_text_details,
        similarity_threshold=similarity_threshold,
        box_shift_threshold=box_shift_threshold
    )
    text_diff_report_html = generate_text_diff_report(text_mismatches)

    # 2) Embedded images and their color spaces.
    ref_image_data = get_image_info(ref_pdf_bytes)
    test_image_data = get_image_info(test_pdf_bytes)
    image_comparison_results = compare_image_info(ref_image_data, test_image_data)
    image_diff_report = generate_image_diff_summary(image_comparison_results)

    # 3) Pixel-level differences on the pages both documents have.
    ref_images = pdf_to_images(ref_pdf_bytes, dpi=rendering_dpi)
    test_images = pdf_to_images(test_pdf_bytes, dpi=rendering_dpi)
    pixel_diffs = []
    for page_number, (ref_image, test_image) in enumerate(zip(ref_images, test_images), start=1):
        contours = detect_image_differences(
            ref_image,
            test_image,
            diff_threshold=diff_threshold
        )
        if contours:
            # page_number is one-based here, unlike the zero-based text/image page indexes.
            pixel_diffs.append((page_number, len(contours)))

    # 4) LLM summary: deliberately best effort, so a failing API call
    #    degrades to an error note instead of discarding the other results.
    try:
        llm_summary = generate_llm_summary(
            text_mismatches,
            {
                "added": image_comparison_results["added"],
                "deleted": image_comparison_results["deleted"]
            },
            pixel_diffs,
            model=llm_model
        )
    except Exception as e:
        llm_summary = f"Could not generate AI summary: {e}"

    # 5) Everything the UI needs, in one dictionary.
    return {
        "text_mismatches": text_mismatches,
        "text_diff_report_html": text_diff_report_html,
        "image_comparison_results": image_comparison_results,
        "image_diff_report": image_diff_report,
        "pixel_diffs": pixel_diffs,
        "summary": llm_summary
    }
| ######################################## | |
| # Streamlit App | |
| ######################################## | |
# Lay the app out across the full browser width rather than a centered column.
st.set_page_config(layout="wide")
| import streamlit as st | |
| from openai import OpenAI | |
| from openai.types.beta.threads.message_create_params import Attachment, AttachmentToolFileSearch | |
| import fitz # PyMuPDF | |
| from pdf2image import convert_from_bytes | |
| from PIL import Image, ImageDraw | |
| from rapidfuzz import fuzz | |
| import cv2 | |
| import hashlib | |
| import numpy as np | |
| import io | |
| import math | |
| import tempfile | |
| import os | |
| from streamlit_drawable_canvas import st_canvas | |
| def single_pdf_warp_unwarp_tool_dragdrop(): | |
| st.title("Single PDF Crop - Drag & Drop Demo") | |
| # 1) Upload single PDF | |
| uploaded_pdf = st.file_uploader("Upload Single PDF Containing Both Versions", type=["pdf"]) | |
| # Let user pick which page of the PDF to display in the canvas | |
| page_number_input = st.number_input("Page Index to Crop (0-based)", min_value=0, value=0) | |
| # NEW: Let the user pick which cropping mode they want | |
| crop_method = st.selectbox( | |
| "Select Crop Method", | |
| ["Manual bounding boxes", "Crop half page (top/bottom)"] | |
| ) | |
| if uploaded_pdf: | |
| pdf_bytes = uploaded_pdf.read() | |
| # 2) Convert the specified page into a PIL image (for display) | |
| pdf_images = pdf_to_images(pdf_bytes, dpi=72) | |
| total_pages = len(pdf_images) | |
| if page_number_input >= total_pages: | |
| st.warning(f"PDF has only {total_pages} pages. Please choose a valid page number.") | |
| return | |
| # This is the PIL image for the chosen page | |
| page_image = pdf_images[page_number_input].convert("RGB") | |
| img_width, img_height = page_image.size | |
| if crop_method == "Manual bounding boxes": | |
| # 3) Use st_canvas to let the user draw bounding boxes | |
| st.write("Draw **2 rectangles**: one for 'Reference' (Unwarped) and one for 'Test' (Warped).") | |
| canvas_result = st_canvas( | |
| fill_color="rgba(255, 165, 0, 0.3)", # semi-transparent orange | |
| stroke_width=2, | |
| background_image=page_image, | |
| update_streamlit=True, | |
| width=img_width, | |
| height=img_height, | |
| drawing_mode="rect", # We only allow rectangle drawing | |
| key="canvas_dragdrop" | |
| ) | |
| else: | |
| # If we are cropping half-page, just show the page image so user knows what page they are on | |
| st.image(page_image, caption="PDF Page Preview (No bounding box needed for half-page crop)") | |
| # 4) Trigger "Crop & Compare" | |
| if st.button("Crop & Compare"): | |
| if crop_method == "Manual bounding boxes": | |
| # --- MANUAL BOUNDING BOXES LOGIC --- | |
| if not canvas_result.json_data: | |
| st.error("No bounding box data found. Please draw rectangles first.") | |
| return | |
| objects = canvas_result.json_data.get("objects", []) | |
| if len(objects) < 2: | |
| st.error("Please draw at least 2 rectangles: one for reference, one for test.") | |
| return | |
| ref_rect = objects[0] | |
| test_rect = objects[1] | |
| # We'll convert rectangle coords from st_canvas to PDF coords | |
| def image_to_pdf_bbox(obj, img_w, img_h, pdf_page): | |
| # PDF page size in points | |
| pdf_w = pdf_page.rect.width | |
| pdf_h = pdf_page.rect.height | |
| left = obj["left"] | |
| top = obj["top"] | |
| width = obj["width"] | |
| height = obj["height"] | |
| # st_canvas uses (0,0) at top-left. PDF uses (0,0) at bottom-left. | |
| x0_img = left | |
| y0_img = top + height # bottom edge in image coords | |
| x1_img = left + width | |
| y1_img = top # top edge in image coords | |
| pdf_x0 = (x0_img / img_w) * pdf_w | |
| pdf_x1 = (x1_img / img_w) * pdf_w | |
| pdf_y0 = pdf_h - (y0_img / img_h) * pdf_h | |
| pdf_y1 = pdf_h - (y1_img / img_h) * pdf_h | |
| x_min, x_max = sorted([pdf_x0, pdf_x1]) | |
| y_min, y_max = sorted([pdf_y0, pdf_y1]) | |
| return (x_min, y_min, x_max, y_max) | |
| # Create a fitz doc to get the real PDF page size | |
| doc = fitz.open(stream=pdf_bytes, filetype="pdf") | |
| if page_number_input >= len(doc): | |
| st.error("Page index out of range in the PDF.") | |
| return | |
| pdf_page = doc[page_number_input] # PyMuPDF page object | |
| ref_bbox_pdf = image_to_pdf_bbox(ref_rect, img_width, img_height, pdf_page) | |
| test_bbox_pdf = image_to_pdf_bbox(test_rect, img_width, img_height, pdf_page) | |
| doc.close() | |
| else: | |
| # --- HALF-PAGE CROP LOGIC --- | |
| # For half-page, we skip st_canvas. We'll automatically define bounding boxes: | |
| # - ref_bbox: top half of the page | |
| # - test_bbox: bottom half of the page | |
| # Get PDF page dimensions | |
| doc = fitz.open(stream=pdf_bytes, filetype="pdf") | |
| if page_number_input >= len(doc): | |
| st.error("Page index out of range in the PDF.") | |
| return | |
| pdf_page = doc[page_number_input] | |
| pdf_w = pdf_page.rect.width | |
| pdf_h = pdf_page.rect.height | |
| # Example: top half is Reference, bottom half is Test | |
| ref_bbox_pdf = (0, pdf_h/2, pdf_w, pdf_h) # (x0, y0, x1, y1) bottom-left origin | |
| test_bbox_pdf = (0, 0, pdf_w, pdf_h/2) | |
| doc.close() | |
| # 5) Extract those regions as cropped PDFs | |
| try: | |
| with st.spinner("Cropping PDF regions..."): | |
| ref_cropped_pdf = extract_region_as_pdf(pdf_bytes, page_number_input, ref_bbox_pdf) | |
| test_cropped_pdf = extract_region_as_pdf(pdf_bytes, page_number_input, test_bbox_pdf) | |
| except Exception as e: | |
| st.error(f"Error cropping PDF: {e}") | |
| return | |
| # 6) Compare the two cropped PDFs with your existing QC pipeline | |
| comparison_results = run_qc_comparison(ref_cropped_pdf, test_cropped_pdf) | |
| # 7) Display results | |
| st.success("Comparison Complete!") | |
| st.subheader("AI Analysis Summary") | |
| st.write(comparison_results["summary"]) | |
| st.subheader("Text Differences") | |
| st.markdown(comparison_results["text_diff_report_html"], unsafe_allow_html=True) | |
| st.subheader("Image & Color Differences") | |
| st.markdown(comparison_results["image_diff_report"], unsafe_allow_html=True) | |
| st.subheader("Pixel Differences") | |
| pixel_diffs = comparison_results["pixel_diffs"] | |
| if pixel_diffs: | |
| st.write(f"Pixel differences found on pages: {pixel_diffs}") | |
| else: | |
| st.write("No pixel differences found.") | |
| ######################################################################## | |
| # Display the reference & test PDFs with bounding boxes for each change | |
| ######################################################################## | |
| st.subheader("Annotated Reference & Test Pages") | |
| # We'll convert each cropped PDF to images (usually 1 page each) | |
| ref_pages = pdf_to_images(ref_cropped_pdf, dpi=100) | |
| test_pages = pdf_to_images(test_cropped_pdf, dpi=100) | |
| pages_to_show = min(len(ref_pages), len(test_pages)) | |
| # Helper to transform from PDF -> image coords: | |
| def pdf_to_image_coords(bbox, pdf_w, pdf_h, img_w, img_h): | |
| (x0_pdf, y0_pdf, x1_pdf, y1_pdf) = bbox | |
| left = (x0_pdf / pdf_w) * img_w | |
| right = (x1_pdf / pdf_w) * img_w | |
| top = img_h - ((y1_pdf / pdf_h) * img_h) | |
| bottom = img_h - ((y0_pdf / pdf_h) * img_h) | |
| return (left, top, right, bottom) | |
| # We'll highlight text "missing" on the reference side, | |
| # text "extra" on the test side, and "warped" on both. | |
| mismatches = comparison_results["text_mismatches"] # "missing", "extra", "warped" | |
| for i in range(pages_to_show): | |
| ref_img = ref_pages[i].copy() | |
| test_img = test_pages[i].copy() | |
| ref_doc = fitz.open(stream=ref_cropped_pdf, filetype="pdf") | |
| test_doc = fitz.open(stream=test_cropped_pdf, filetype="pdf") | |
| if i >= len(ref_doc) or i >= len(test_doc): | |
| break | |
| ref_page_obj = ref_doc[i] | |
| test_page_obj = test_doc[i] | |
| ref_pdf_w = ref_page_obj.rect.width | |
| ref_pdf_h = ref_page_obj.rect.height | |
| test_pdf_w = test_page_obj.rect.width | |
| test_pdf_h = test_page_obj.rect.height | |
| draw_ref = ImageDraw.Draw(ref_img) | |
| draw_test = ImageDraw.Draw(test_img) | |
| # Draw bounding boxes for "missing" text on reference | |
| for (page_idx, text, bbox, font, size) in mismatches["missing"]: | |
| if page_idx == i: | |
| (x0, y0, x1, y1) = pdf_to_image_coords(bbox, ref_pdf_w, ref_pdf_h, ref_img.width, ref_img.height) | |
| draw_ref.rectangle([(x0, y0), (x1, y1)], outline="red", width=3) | |
| # Draw bounding boxes for "extra" text on test | |
| for (page_idx, text, bbox, font, size) in mismatches["extra"]: | |
| if page_idx == i: | |
| (x0, y0, x1, y1) = pdf_to_image_coords(bbox, test_pdf_w, test_pdf_h, test_img.width, test_img.height) | |
| draw_test.rectangle([(x0, y0), (x1, y1)], outline="green", width=3) | |
| # Draw bounding boxes for "warped" text (on both reference & test) | |
| for (page_idx, text, ref_bbox, test_bbox, ref_font, test_font) in mismatches["warped"]: | |
| if page_idx == i: | |
| (x0r, y0r, x1r, y1r) = pdf_to_image_coords(ref_bbox, ref_pdf_w, ref_pdf_h, ref_img.width, ref_img.height) | |
| draw_ref.rectangle([(x0r, y0r), (x1r, y1r)], outline="orange", width=3) | |
| (x0t, y0t, x1t, y1t) = pdf_to_image_coords(test_bbox, test_pdf_w, test_pdf_h, test_img.width, test_img.height) | |
| draw_test.rectangle([(x0t, y0t), (x1t, y1t)], outline="purple", width=3) | |
| # Optionally detect pixel-level differences between these half-page images | |
| page_contours = detect_image_differences(ref_pages[i], test_pages[i], diff_threshold=30) | |
| test_img_annotated = highlight_image(test_img, page_contours, color="blue", width=3) | |
| ref_doc.close() | |
| test_doc.close() | |
| # Display side by side | |
| st.write(f"**Annotated Page {i+1}** of the cropped PDFs") | |
| colA, colB = st.columns(2) | |
| with colA: | |
| st.write("Reference PDF") | |
| st.image(ref_img, use_column_width=True) | |
| with colB: | |
| st.write("Test PDF") | |
| st.image(test_img_annotated, use_column_width=True) | |
def _qc_bbox_to_pixels(bbox, pdf_w, pdf_h, img_w, img_h):
    """
    Map a PDF-space bounding box onto a rasterized page image.

    Args:
        bbox: (x0, y0, x1, y1) in PDF points, treated as bottom-left origin.
        pdf_w, pdf_h: page size in PDF points.
        img_w, img_h: rasterized image size in pixels.

    Returns:
        (left, top, right, bottom) in image pixels (top-left origin).
    """
    x0_pdf, y0_pdf, x1_pdf, y1_pdf = bbox
    # Horizontal scaling is direct
    left = (x0_pdf / pdf_w) * img_w
    right = (x1_pdf / pdf_w) * img_w
    # Vertical axis is flipped between the two coordinate systems.
    # NOTE(review): this assumes the boxes are bottom-left origin; confirm
    # against how run_qc_comparison produces them.
    top = img_h - ((y1_pdf / pdf_h) * img_h)
    bottom = img_h - ((y0_pdf / pdf_h) * img_h)
    return (left, top, right, bottom)


def _qc_group_by_page(entries):
    """Bucket mismatch tuples by their first element (the page index)."""
    grouped = {}
    for entry in entries:
        grouped.setdefault(entry[0], []).append(entry)
    return grouped


def _qc_outline_bbox(draw, bbox, page_rect, image, color):
    """Draw a 3px outline for one PDF-space bbox on the given page image."""
    left, top, right, bottom = _qc_bbox_to_pixels(
        bbox, page_rect.width, page_rect.height, image.width, image.height
    )
    draw.rectangle([(left, top), (right, bottom)], outline=color, width=3)


def pdf_quality_control_tool():
    """
    Streamlit page that compares an uploaded reference PDF with a test PDF.

    Reads thresholds and DPI from the sidebar, runs run_qc_comparison on the
    two uploads, shows the summary / text / image / pixel reports, offers an
    annotated PDF for download, and finally renders each page pair side by
    side with outlines around the text changes:

        red    - "missing" text, drawn on the reference page
        green  - "extra" text, drawn on the test page
        orange - "warped" text, reference box
        purple - "warped" text, test box
        blue   - pixel-difference contours, drawn on the test page

    Returns:
        None. All output goes to the Streamlit page.
    """
    st.title("Beta 2-PDF compare QC Tool")

    # Sidebar Inputs
    st.sidebar.header("Settings")
    uploaded_ref = st.sidebar.file_uploader("Upload Reference PDF", type=["pdf"], key="ref_pdf")
    uploaded_test = st.sidebar.file_uploader("Upload Test PDF", type=["pdf"], key="test_pdf")
    # Text matching thresholds
    similarity_threshold = st.sidebar.slider("Text Similarity Threshold (fuzzy)", 50, 100, 90)
    box_shift_threshold = st.sidebar.slider("Box Shift Threshold (points)", 0, 100, 10)
    # Pixel diff thresholds
    diff_threshold = st.sidebar.slider("Pixel Difference Threshold", 1, 100, 30)
    # DPI for rendering
    rendering_dpi = st.sidebar.slider("Rendering DPI for Comparison", 72, 300, 100)

    if not (uploaded_ref and uploaded_test):
        st.info("Please upload both reference and test PDFs to begin comparison.")
        return

    st.header("PDF Comparison Results")
    if not st.button("Compare PDFs"):
        return

    with st.spinner("Analyzing PDFs..."):
        # 1) Read PDF bytes. getvalue() returns the whole buffer regardless of
        #    the current stream position, unlike read().
        ref_bytes = uploaded_ref.getvalue()
        test_bytes = uploaded_test.getvalue()
        if not ref_bytes or not test_bytes:
            st.error("One of the uploaded PDFs is empty.")
            return

        # 2) Use run_qc_comparison for all text/image/pixel diffs
        comparison_results = run_qc_comparison(
            ref_bytes,
            test_bytes,
            similarity_threshold=similarity_threshold,
            box_shift_threshold=box_shift_threshold,
            diff_threshold=diff_threshold,
            rendering_dpi=rendering_dpi,
        )

        # 3) Display top-level results
        st.subheader("AI Analysis Summary")
        st.write(comparison_results["summary"])
        st.subheader("Text Differences")
        st.markdown(comparison_results["text_diff_report_html"], unsafe_allow_html=True)
        st.subheader("Image & Color Differences")
        st.markdown(comparison_results["image_diff_report"], unsafe_allow_html=True)
        st.subheader("Pixel-Based Visual Differences")
        pixel_diffs = comparison_results["pixel_diffs"]
        if pixel_diffs:
            # Each entry is a (page, count) pair; only the page is reported.
            diff_pages = [p for (p, cnt) in pixel_diffs]
            st.write(f"Visual differences detected on pages: {diff_pages}")
        else:
            st.write("No visual differences found.")

        # 4) Optionally, create and offer a downloadable annotated PDF
        annotated_pdf = create_annotated_pdf(
            ref_bytes,
            test_bytes,
            pixel_diffs,
            diff_threshold=diff_threshold,
            dpi=rendering_dpi,
        )
        if annotated_pdf:
            st.download_button(
                label="Download Annotated Comparison PDF",
                data=annotated_pdf,
                file_name="annotated_comparison.pdf",
                mime="application/pdf",
            )

        # 5) Annotate each page with bounding boxes for text changes
        st.subheader("Detailed Page-by-Page Annotations")
        mismatches = comparison_results["text_mismatches"]
        # Group once so each page only touches its own entries. Missing/extra
        # entries carry the bbox at index 2; warped entries carry the
        # reference bbox at index 2 and the test bbox at index 3.
        missing_by_page = _qc_group_by_page(mismatches.get("missing", []))
        extra_by_page = _qc_group_by_page(mismatches.get("extra", []))
        warped_by_page = _qc_group_by_page(mismatches.get("warped", []))

        # Convert full PDFs to images at the chosen DPI
        ref_pages = pdf_to_images(ref_bytes, dpi=rendering_dpi)
        test_pages = pdf_to_images(test_bytes, dpi=rendering_dpi)

        # Open the PDFs with PyMuPDF for page dimensions; the finally blocks
        # guarantee both documents are closed even if rendering fails.
        ref_doc = fitz.open(stream=ref_bytes, filetype="pdf")
        try:
            test_doc = fitz.open(stream=test_bytes, filetype="pdf")
            try:
                # Only pages present in both rasterizations and both documents
                num_pages = min(
                    len(ref_pages), len(test_pages), len(ref_doc), len(test_doc)
                )
                for i in range(num_pages):
                    ref_img = ref_pages[i].copy()
                    test_img = test_pages[i].copy()
                    ref_rect = ref_doc[i].rect
                    test_rect = test_doc[i].rect
                    draw_ref = ImageDraw.Draw(ref_img)
                    draw_test = ImageDraw.Draw(test_img)

                    # Missing text in RED on reference
                    for entry in missing_by_page.get(i, ()):
                        _qc_outline_bbox(draw_ref, entry[2], ref_rect, ref_img, "red")
                    # Extra text in GREEN on test
                    for entry in extra_by_page.get(i, ()):
                        _qc_outline_bbox(draw_test, entry[2], test_rect, test_img, "green")
                    # Warped text: ORANGE on reference, PURPLE on test
                    for entry in warped_by_page.get(i, ()):
                        _qc_outline_bbox(draw_ref, entry[2], ref_rect, ref_img, "orange")
                        _qc_outline_bbox(draw_test, entry[3], test_rect, test_img, "purple")

                    # Pixel differences are computed on the un-annotated
                    # renders and highlighted on the test side.
                    contours = detect_image_differences(
                        ref_pages[i], test_pages[i], diff_threshold=diff_threshold
                    )
                    test_img = highlight_image(test_img, contours, color="blue", width=3)

                    # Show side-by-side
                    st.write(f"**Page {i+1}** Annotations")
                    col1, col2 = st.columns(2)
                    with col1:
                        st.write("Reference Page")
                        st.image(ref_img, use_column_width=True)
                    with col2:
                        st.write("Test Page")
                        st.image(test_img, use_column_width=True)
            finally:
                test_doc.close()
        finally:
            ref_doc.close()
def _assistant_message_text(message):
    """Join the text of every content part of *message* that carries text."""
    pieces = []
    for part in message.content or []:
        # Non-text parts have no usable .text.value; skip them.
        value = getattr(getattr(part, "text", None), "value", None)
        if value:
            pieces.append(value)
    return "\n".join(pieces)


def beta_gpt4_pdf_extractor():
    """
    Streamlit page that sends uploaded PDFs to an OpenAI assistant.

    The user uploads one or more PDFs and edits a prompt. On button press
    the files are uploaded with purpose "assistants", attached to a new
    thread with the file_search tool, and a "gpt-4o" assistant run is polled
    to completion. The newest assistant reply is then shown as plain text.

    The API key is not kept in this function: OPENAI_API_KEY from the
    environment is used when set, otherwise the module-level `client` is
    reused.

    Returns:
        None. Output goes to the Streamlit page; the script is halted with
        st.stop() when there are no uploads, the run does not complete, or
        no messages come back.
    """
    st.title("PDF QC Agent")
    uploaded_files = st.file_uploader(
        "Upload PDF file(s)",
        type=["pdf"],
        accept_multiple_files=True,
    )
    if not uploaded_files:
        st.info("Please upload one or more PDF files to get started.")
        st.stop()

    user_prompt = st.text_area(
        "Prompt",
        value="Extract the content from the provided file(s) without altering it. Just output the exact content and nothing else.",
        height=100,
    )
    if not st.button("Extract PDF Contents"):
        return

    with st.spinner("Sending request to GPT-4o..."):
        # Credentials must not live in source. Prefer the environment and
        # fall back to the shared module-level client.
        env_key = os.environ.get("OPENAI_API_KEY")
        api_client = OpenAI(api_key=env_key) if env_key else client

        # NOTE(review): every click creates a new assistant, thread and
        # uploaded files, and nothing here deletes them afterwards.
        pdf_assistant = api_client.beta.assistants.create(
            model="gpt-4o",
            description="An assistant to extract the contents of PDF files.",
            tools=[{"type": "file_search"}],
            name="PDF assistant",
        )
        thread = api_client.beta.threads.create()

        attachments = []
        for file_obj in uploaded_files:
            created_file = api_client.files.create(
                file=file_obj,
                purpose="assistants",
            )
            attachments.append(
                Attachment(
                    file_id=created_file.id,
                    tools=[AttachmentToolFileSearch(type="file_search")],
                )
            )

        api_client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            attachments=attachments,
            content=user_prompt,
        )
        run = api_client.beta.threads.runs.create_and_poll(
            thread_id=thread.id, assistant_id=pdf_assistant.id, timeout=1000
        )
        if run.status != "completed":
            st.error(f"Run failed: {run.status}")
            st.stop()

        # Ask for newest-first explicitly so the first assistant message
        # found is the latest reply.
        messages = list(
            api_client.beta.threads.messages.list(thread_id=thread.id, order="desc")
        )
        if not messages:
            st.error("No messages returned.")
            st.stop()

        latest_assistant_msg = next(
            (m for m in messages if m.role == "assistant"),
            None,
        )
        if latest_assistant_msg is None:
            st.error("No assistant message found.")
            return

        extracted_text = _assistant_message_text(latest_assistant_msg)
        if extracted_text:
            st.subheader("Extracted PDF Text")
            st.text(extracted_text)
        else:
            st.error("Assistant reply contained no text content.")
| ######################################## | |
| # Main Navigation | |
| ######################################## | |
# Sidebar selector: each radio label maps to the page function that renders it.
st.sidebar.title("Navigation")
tool_pages = {
    "Single PDF Warp/Unwarp": single_pdf_warp_unwarp_tool_dragdrop,
    "Two-PDF QC Comparison": pdf_quality_control_tool,
    "Beta GPT-4 PDF Extractor": beta_gpt4_pdf_extractor,
}
app_mode = st.sidebar.radio("Choose a tool", list(tool_pages))

# Run the page for the chosen label; an unknown label renders nothing.
selected_tool = tool_pages.get(app_mode)
if selected_tool is not None:
    selected_tool()