Spaces:

Propelis
/

QC_Rules

Sleeping

App Files Files Community

QC_Rules / compare_app.py

Jakecole1

Upload 11 files

6c16992 verified 8 months ago

raw

history blame contribute delete

27.6 kB

	import streamlit as st
	import tempfile
	import os
	import pandas as pd
	from src.extract_text.google_document_api import GoogleDocumentAPI
	from pdf2image import convert_from_path
	from PIL import Image, ImageDraw, ImageFont
	from src.utils.image_utils import ImageUtils
	import base64
	from io import BytesIO
	from src.utils.barcode import Barcode
	import anthropic
	import json

	def load_client_artwork_files(base_path="requirements_library/client-requirements"):
	    """Collect every PDF found under the client requirements directory.

	    Args:
	        base_path: Root directory that is scanned recursively. Defaults to
	            the client requirements library location used by the app.

	    Returns:
	        A list of dicts, one per file whose name ends in ``.pdf``
	        (case-insensitive), with keys ``name`` (path relative to
	        ``base_path``), ``path`` (the file path built from the walk root)
	        and ``type`` (always ``'artwork'``). Entries are produced in a
	        deterministic top-down order: the files of a directory
	        alphabetically, then its subdirectories alphabetically. An empty
	        list is returned when ``base_path`` does not exist.
	    """
	    artwork_files = []

	    if not os.path.exists(base_path):
	        return artwork_files

	    for root, dirs, files in os.walk(base_path):
	        # os.walk yields entries in arbitrary filesystem order; sorting the
	        # directory list in place fixes the order in which they are visited,
	        # so the result is stable from one run to the next.
	        dirs.sort()
	        for file in sorted(files):
	            if not file.lower().endswith('.pdf'):
	                continue
	            file_path = os.path.join(root, file)
	            artwork_files.append({
	                'name': os.path.relpath(file_path, base_path),
	                'path': file_path,
	                'type': 'artwork',
	            })

	    return artwork_files

	def load_artwork_content(file_info):
	    """Return the raw bytes of the artwork file described by ``file_info``.

	    The file at ``file_info['path']`` is read in binary mode. If anything
	    goes wrong, an error naming ``file_info['name']`` is shown through
	    Streamlit and ``None`` is returned instead.
	    """
	    try:
	        with open(file_info['path'], 'rb') as artwork_file:
	            payload = artwork_file.read()
	    except Exception as exc:
	        st.error(f"Error loading artwork file {file_info['name']}: {str(exc)}")
	        return None
	    return payload

	def extract_pdf_data(pdf_file, file_name):
	    """Extract text, bounding boxes, a page image, and barcodes from a PDF.

	    Args:
	        pdf_file: Seekable binary file-like object holding the PDF. It is
	            rewound to the start and read in full.
	        file_name: Display name used in error messages and echoed back in
	            the result.

	    Returns:
	        A dict with keys ``text_content``, ``bounding_boxes``, ``image``
	        (the processed page image), ``original_image``, ``image_base64``,
	        ``barcode_results``, ``file_name``, ``image_quality`` and
	        ``image_size_bytes``. If processing fails, the error is reported
	        through Streamlit and ``None`` is returned.
	    """
	    tmp_pdf_path = None
	    try:
	        # The extraction helpers below are given a filesystem path, so the
	        # PDF bytes are written to a temporary file first.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
	            # Record the path before writing so that a failed write is
	            # still cleaned up in the finally clause.
	            tmp_pdf_path = tmp_file.name
	            pdf_file.seek(0)
	            tmp_file.write(pdf_file.read())

	        # Extract text and bounding boxes using Google Document API
	        google_document_api = GoogleDocumentAPI(credentials_path="src/extract_text/photon-services-f0d3ec1417d0.json")
	        document = google_document_api.process_document(tmp_pdf_path)
	        text_content = google_document_api.extract_text_with_markdown_table(document)
	        bounding_boxes = google_document_api.extract_text_with_bounding_boxes(document)

	        # Convert PDF to image
	        try:
	            images = convert_from_path(tmp_pdf_path)
	            if not images:
	                raise ValueError("No pages found in PDF")
	            page_image = images[0]  # Only the first page is used
	        except Exception as e:
	            st.error(f"Error converting PDF to image: {str(e)}")
	            # Fall back to a placeholder so the rest of the pipeline can run
	            page_image = Image.new('RGB', (800, 600), color='white')
	            draw = ImageDraw.Draw(page_image)
	            draw.text((400, 300), "PDF conversion failed", fill='black', anchor='mm')

	        # Process image for comparison: standardize size and optimize quality
	        processed_image, quality, file_size = ImageUtils.process_image_for_comparison(
	            page_image,
	            target_size=(1200, 1600),  # Standard size for comparison
	            max_size_bytes=1024 * 1024  # 1MB limit
	        )

	        # Base64 payload for the API, built from the page image with the
	        # same size limits as above
	        image_base64 = ImageUtils.image_to_base64_optimized(
	            page_image,
	            target_size=(1200, 1600),
	            max_size_bytes=1024 * 1024
	        )

	        # Scan for barcodes
	        barcode = Barcode()
	        barcode_results = barcode.scan_and_validate(page_image)

	        return {
	            'text_content': text_content,
	            'bounding_boxes': bounding_boxes,
	            'image': processed_image,  # Use the processed image
	            'original_image': page_image,  # Keep original for reference
	            'image_base64': image_base64,
	            'barcode_results': barcode_results,
	            'file_name': file_name,
	            'image_quality': quality,
	            'image_size_bytes': file_size
	        }

	    except Exception as e:
	        st.error(f"Error processing PDF {file_name}: {str(e)}")
	        return None

	    finally:
	        # delete=False means nothing removes the file automatically; doing it
	        # here covers the failure paths as well as the success path.
	        if tmp_pdf_path is not None:
	            try:
	                os.unlink(tmp_pdf_path)
	            except OSError:
	                # Best-effort cleanup: a file that is already gone or cannot
	                # be removed must not mask the extraction result.
	                pass

	def _fallback_comparison(summary, raw_response):
	    """Build the empty result structure used when the reply has no usable JSON."""
	    return {
	        "overall_similarity": 0.5,
	        "comparison_summary": summary,
	        "raw_response": raw_response,
	        "text_differences": [],
	        "layout_differences": [],
	        "barcode_differences": [],
	        "visual_differences": [],
	        "compliance_impact": [],
	        "recommendations": ["Review the raw analysis output for detailed insights"]
	    }


	def _parse_comparison_response(response_text):
	    """Decode the outermost ``{...}`` span of the reply, or return a fallback dict."""
	    start_idx = response_text.find('{')
	    end_idx = response_text.rfind('}')

	    # Both searches return -1 on a miss. Comparing end_idx with start_idx
	    # rejects a missing closing brace as well as one that comes before the
	    # first opening brace.
	    if start_idx == -1 or end_idx < start_idx:
	        return _fallback_comparison("Analysis completed but JSON parsing failed", response_text)

	    try:
	        return json.loads(response_text[start_idx:end_idx + 1])
	    except json.JSONDecodeError:
	        return _fallback_comparison("Analysis completed but structured parsing failed", response_text)


	def compare_artworks_with_claude(artwork1_data, artwork2_data, model="claude-sonnet-4-20250514"):
	    """Compare two artworks using the Claude API.

	    Args:
	        artwork1_data: Dict for the first artwork; the keys ``file_name``,
	            ``text_content``, ``bounding_boxes``, ``barcode_results`` and
	            ``image_base64`` are read.
	        artwork2_data: Same structure for the second artwork.
	        model: Model identifier passed to the Anthropic client.

	    Returns:
	        The dict decoded from the JSON object in the model's reply. If the
	        reply contains no decodable JSON object, a fallback dict with empty
	        difference lists and the reply under ``raw_response`` is returned.
	        If the API call itself fails, the error is reported through
	        Streamlit and ``None`` is returned.
	    """

	    # Prepare the comparison prompt (only the first 10 bounding boxes of each
	    # artwork are included)
	    prompt = f"""
	You are an expert packaging compliance analyzer. Compare these two artwork PDFs and provide a detailed analysis of their differences and similarities.

	## Artwork 1: {artwork1_data['file_name']}
	Text Content:
	{artwork1_data['text_content']}

	Bounding Box Data:
	{json.dumps(artwork1_data['bounding_boxes'][:10], indent=2) if artwork1_data['bounding_boxes'] else "No text elements detected"}

	Barcode Data:
	{json.dumps(artwork1_data['barcode_results'], indent=2) if artwork1_data['barcode_results'] else "No barcodes detected"}

	## Artwork 2: {artwork2_data['file_name']}
	Text Content:
	{artwork2_data['text_content']}

	Bounding Box Data:
	{json.dumps(artwork2_data['bounding_boxes'][:10], indent=2) if artwork2_data['bounding_boxes'] else "No text elements detected"}

	Barcode Data:
	{json.dumps(artwork2_data['barcode_results'], indent=2) if artwork2_data['barcode_results'] else "No barcodes detected"}

	Please provide a comprehensive comparison analysis in the following JSON format:

	{{
	"overall_similarity": 0.85,
	"comparison_summary": "Brief overview of the comparison results",
	"text_differences": [
	{{
	"category": "Missing Text",
	"artwork1_content": "Text found only in artwork 1",
	"artwork2_content": "Text found only in artwork 2",
	"significance": "HIGH/MEDIUM/LOW",
	"description": "Detailed explanation of the difference"
	}}
	],
	"layout_differences": [
	{{
	"category": "Position Changes",
	"element": "Element that moved",
	"artwork1_position": "Description of position in artwork 1",
	"artwork2_position": "Description of position in artwork 2",
	"significance": "HIGH/MEDIUM/LOW",
	"description": "Impact of this change"
	}}
	],
	"barcode_differences": [
	{{
	"category": "Barcode Changes",
	"artwork1_barcodes": "Description of barcodes in artwork 1",
	"artwork2_barcodes": "Description of barcodes in artwork 2",
	"significance": "HIGH/MEDIUM/LOW",
	"description": "Analysis of barcode differences"
	}}
	],
	"visual_differences": [
	{{
	"category": "Visual Elements",
	"description": "Description of visual differences observed in the images",
	"significance": "HIGH/MEDIUM/LOW",
	"recommendation": "Suggested action or consideration"
	}}
	],
	"compliance_impact": [
	{{
	"area": "Regulatory compliance area affected",
	"impact": "Description of potential compliance impact",
	"risk_level": "HIGH/MEDIUM/LOW",
	"recommendation": "Recommended action"
	}}
	],
	"recommendations": [
	"List of actionable recommendations based on the comparison"
	]
	}}

	Analyze both the textual content and visual elements. Pay special attention to:
	1. Missing or changed text elements
	2. Repositioned elements that might affect readability
	3. Barcode differences that could impact functionality
	4. Visual changes that might affect brand consistency or compliance
	5. Any changes that could impact regulatory compliance

	Provide specific, actionable insights that would be valuable for quality control and compliance verification.
	"""

	    try:
	        # Initialize Anthropic client
	        client = anthropic.Anthropic(api_key=os.getenv('CLAUDE_API_KEY'))

	        # NOTE(review): both images are declared as image/png; confirm that
	        # ImageUtils.image_to_base64_optimized really produces PNG data.
	        image_blocks = [
	            {
	                "type": "image",
	                "source": {
	                    "type": "base64",
	                    "media_type": "image/png",
	                    "data": artwork['image_base64']
	                }
	            }
	            for artwork in (artwork1_data, artwork2_data)
	        ]

	        # One user message: the prompt text followed by both images
	        message = client.messages.create(
	            model=model,
	            max_tokens=4000,
	            messages=[
	                {
	                    "role": "user",
	                    "content": [{"type": "text", "text": prompt}] + image_blocks
	                }
	            ]
	        )

	        # Concatenate the text blocks of the reply; other block types are skipped
	        response_text = "".join(
	            content_block.text
	            for content_block in message.content
	            if getattr(content_block, 'type', None) == 'text'
	        )

	        return _parse_comparison_response(response_text)

	    except Exception as e:
	        st.error(f"Error calling Claude API: {str(e)}")
	        return None

	def display_comparison_results(results, artwork1_data, artwork2_data):
	    """Render a comparison result: headline metrics, then one tab per finding type.

	    ``results`` is the dict produced by the comparison step. The artwork
	    dicts are only consulted for their ``file_name`` when labelling the
	    per-artwork columns. A falsy ``results`` shows an error and renders
	    nothing else.
	    """
	    if not results:
	        st.error("No comparison results to display")
	        return

	    level_icons = {"HIGH": "🔴", "MEDIUM": "🟡", "LOW": "🟢"}

	    def icon_for(entry, level_key='significance'):
	        # Missing or unrecognised levels get the MEDIUM marker.
	        return level_icons.get(entry.get(level_key, 'MEDIUM'), "🟡")

	    # --- Headline metrics ---
	    st.markdown("## 📊 Comparison Summary")

	    similarity_col, count_col, compliance_col = st.columns(3)
	    with similarity_col:
	        st.metric("Overall Similarity", f"{results.get('overall_similarity', 0.5):.1%}")

	    with count_col:
	        counted_keys = (
	            'text_differences',
	            'layout_differences',
	            'barcode_differences',
	            'visual_differences',
	        )
	        st.metric("Total Differences", sum(len(results.get(key, [])) for key in counted_keys))

	    with compliance_col:
	        st.metric("Compliance Impacts", len(results.get('compliance_impact', [])))

	    if 'comparison_summary' in results:
	        st.markdown(f"Summary: {results['comparison_summary']}")

	    # --- One tab per category of finding ---
	    text_tab, layout_tab, barcode_tab, visual_tab, compliance_tab, advice_tab = st.tabs(
	        ["📝 Text Differences", "📐 Layout Changes", "📱 Barcode Changes", "🎨 Visual Differences", "⚖️ Compliance Impact", "💡 Recommendations"]
	    )

	    with text_tab:
	        st.markdown("### Text Content Differences")
	        text_diffs = results.get('text_differences', [])
	        if not text_diffs:
	            st.info("No significant text differences found")
	        else:
	            for diff in text_diffs:
	                heading = f"{icon_for(diff)} {diff.get('category', 'Text Difference')} - {diff.get('significance', 'MEDIUM')} Impact"
	                with st.expander(heading):
	                    left, right = st.columns(2)
	                    with left:
	                        st.markdown(f"{artwork1_data['file_name']}:")
	                        st.text(diff.get('artwork1_content', 'N/A'))
	                    with right:
	                        st.markdown(f"{artwork2_data['file_name']}:")
	                        st.text(diff.get('artwork2_content', 'N/A'))

	                    st.markdown(f"Description: {diff.get('description', 'No description available')}")

	    with layout_tab:
	        st.markdown("### Layout and Positioning Changes")
	        layout_diffs = results.get('layout_differences', [])
	        if not layout_diffs:
	            st.info("No significant layout differences found")
	        else:
	            for diff in layout_diffs:
	                heading = f"{icon_for(diff)} {diff.get('category', 'Layout Change')} - {diff.get('significance', 'MEDIUM')} Impact"
	                with st.expander(heading):
	                    st.markdown(f"Element: {diff.get('element', 'Unknown element')}")

	                    left, right = st.columns(2)
	                    with left:
	                        st.markdown(f"Position in {artwork1_data['file_name']}:")
	                        st.text(diff.get('artwork1_position', 'N/A'))
	                    with right:
	                        st.markdown(f"Position in {artwork2_data['file_name']}:")
	                        st.text(diff.get('artwork2_position', 'N/A'))

	                    st.markdown(f"Impact: {diff.get('description', 'No description available')}")

	    with barcode_tab:
	        st.markdown("### Barcode Differences")
	        barcode_diffs = results.get('barcode_differences', [])
	        if not barcode_diffs:
	            st.info("No significant barcode differences found")
	        else:
	            for diff in barcode_diffs:
	                heading = f"{icon_for(diff)} {diff.get('category', 'Barcode Change')} - {diff.get('significance', 'MEDIUM')} Impact"
	                with st.expander(heading):
	                    left, right = st.columns(2)
	                    with left:
	                        st.markdown(f"{artwork1_data['file_name']} Barcodes:")
	                        st.text(diff.get('artwork1_barcodes', 'N/A'))
	                    with right:
	                        st.markdown(f"{artwork2_data['file_name']} Barcodes:")
	                        st.text(diff.get('artwork2_barcodes', 'N/A'))

	                    st.markdown(f"Analysis: {diff.get('description', 'No description available')}")

	    with visual_tab:
	        st.markdown("### Visual and Design Differences")
	        visual_diffs = results.get('visual_differences', [])
	        if not visual_diffs:
	            st.info("No significant visual differences found")
	        else:
	            for diff in visual_diffs:
	                heading = f"{icon_for(diff)} {diff.get('category', 'Visual Change')} - {diff.get('significance', 'MEDIUM')} Impact"
	                with st.expander(heading):
	                    st.markdown(f"Description: {diff.get('description', 'No description available')}")
	                    if 'recommendation' in diff:
	                        st.markdown(f"Recommendation: {diff['recommendation']}")

	    with compliance_tab:
	        st.markdown("### Compliance and Regulatory Impact")
	        impacts = results.get('compliance_impact', [])
	        if not impacts:
	            st.success("No compliance impacts identified")
	        else:
	            for impact in impacts:
	                heading = f"{icon_for(impact, 'risk_level')} {impact.get('area', 'Compliance Area')} - {impact.get('risk_level', 'MEDIUM')} Risk"
	                with st.expander(heading):
	                    st.markdown(f"Impact: {impact.get('impact', 'No description available')}")
	                    st.markdown(f"Recommendation: {impact.get('recommendation', 'No recommendation provided')}")

	    with advice_tab:
	        st.markdown("### Action Items and Recommendations")
	        recommendations = results.get('recommendations', [])
	        if not recommendations:
	            st.info("No specific recommendations provided")
	        else:
	            for position, recommendation in enumerate(recommendations, start=1):
	                st.markdown(f"{position}. {recommendation}")

	    # The raw reply is only present when structured parsing fell back
	    if 'raw_response' in results:
	        with st.expander("🔍 Raw Analysis Output"):
	            st.text(results['raw_response'])

	def display_side_by_side_images(artwork1_data, artwork2_data):
	    """Display the two artwork images side by side, one per column.

	    For each artwork the column shows its file name, the cropped ``image``,
	    an info line with image quality and size (only when both
	    ``image_quality`` and ``image_size_bytes`` are present), and an
	    expander with the number of bounding boxes and barcode results.

	    Args:
	        artwork1_data: Dict for the left column; the keys ``file_name``,
	            ``image``, ``bounding_boxes`` and ``barcode_results`` are read.
	        artwork2_data: Same structure for the right column.
	    """
	    st.markdown("## 🖼️ Side-by-Side Comparison")

	    # Both columns render exactly the same widgets, so they share one loop body.
	    for column, artwork in zip(st.columns(2), (artwork1_data, artwork2_data)):
	        with column:
	            st.markdown(f"### {artwork['file_name']}")
	            st.image(ImageUtils.crop_image(artwork['image']), caption=artwork['file_name'], use_container_width=True)

	            # Display image processing info
	            if 'image_quality' in artwork and 'image_size_bytes' in artwork:
	                quality = artwork['image_quality']
	                size_mb = artwork['image_size_bytes'] / (1024 * 1024)
	                # "\\|" emits a literal backslash followed by a pipe; a bare
	                # "\|" is an invalid escape sequence in a normal string.
	                st.info(f"📊 Image Quality: {quality}% \\| Size: {size_mb:.2f}MB")

	            # Display extracted data summary
	            with st.expander("📊 Extracted Data Summary"):
	                text_elements = len(artwork['bounding_boxes']) if artwork['bounding_boxes'] else 0
	                barcodes = len(artwork['barcode_results']) if artwork['barcode_results'] else 0
	                st.metric("Text Elements", text_elements)
	                st.metric("Barcodes", barcodes)

	def _select_artwork(slot, client_artwork_files):
	    """Render the picker for artwork ``slot`` (1 or 2) and store the extracted data.

	    Offers a "Client Files" tab (dropdown over ``client_artwork_files``) and
	    an "Upload New" tab. Whatever ``extract_pdf_data`` returns is written to
	    ``st.session_state["artwork<slot>_data"]``; when both tabs have a
	    selection, the upload is processed last and therefore wins.
	    """
	    state_key = f"artwork{slot}_data"
	    st.markdown(f"### 🎨 Artwork {slot}")

	    # Create tabs for client files vs upload
	    client_tab, upload_tab = st.tabs(["📁 Client Files", "📤 Upload New"])

	    # NOTE(review): extraction runs every time this function executes with a
	    # selection active. If the script is re-executed on each interaction,
	    # that repeats the document-extraction call; consider caching per file.
	    with client_tab:
	        if client_artwork_files:
	            placeholder = f"Select artwork {slot}..."
	            options = [placeholder] + [f["name"] for f in client_artwork_files]
	            selected = st.selectbox(f"Choose artwork {slot}:", options, key=f"art{slot}_select")

	            if selected != placeholder:
	                # First entry whose name matches the dropdown choice
	                file_info = next((f for f in client_artwork_files if f["name"] == selected), None)
	                file_content = load_artwork_content(file_info) if file_info else None
	                if file_content:
	                    # Wrap the bytes in a file-like object, which is what
	                    # extract_pdf_data seeks and reads.
	                    temp_file = BytesIO(file_content)
	                    temp_file.name = file_info["name"]

	                    with st.spinner(f"Processing artwork {slot}..."):
	                        st.session_state[state_key] = extract_pdf_data(temp_file, file_info["name"])

	                    if st.session_state[state_key]:
	                        st.success(f"✅ Loaded artwork {slot}: {selected}")
	        else:
	            st.info("No client artwork files found")

	    with upload_tab:
	        uploaded = st.file_uploader(f"Upload Artwork {slot} (PDF)", type=["pdf"], key=f"art{slot}_upload")

	        if uploaded:
	            with st.spinner(f"Processing artwork {slot}..."):
	                st.session_state[state_key] = extract_pdf_data(uploaded, uploaded.name)

	            if st.session_state[state_key]:
	                st.success(f"✅ Uploaded artwork {slot}: {uploaded.name}")


	def main():
	    """Streamlit entry point: pick two artworks, compare them, and show the results."""
	    st.set_page_config(layout="wide", page_title="Artwork Comparison Tool")

	    # Load client artwork files
	    client_artwork_files = load_client_artwork_files()

	    # Initialize session state
	    for state_key in ("artwork1_data", "artwork2_data", "comparison_results"):
	        if state_key not in st.session_state:
	            st.session_state[state_key] = None

	    st.title("🎨 Artwork Comparison Tool")
	    st.write("Compare two packaging artwork PDFs to identify differences in text, layout, barcodes, and visual elements.")

	    # File selection section
	    st.markdown("## 📁 Select Artworks to Compare")

	    for slot, column in enumerate(st.columns(2), start=1):
	        with column:
	            _select_artwork(slot, client_artwork_files)

	    # Display images side by side if both are loaded
	    if st.session_state.artwork1_data and st.session_state.artwork2_data:
	        display_side_by_side_images(st.session_state.artwork1_data, st.session_state.artwork2_data)

	    # Model selection
	    model_option = "claude-sonnet-4-20250514"

	    # Comparison button
	    if st.button("🔍 Compare Artworks", type="primary"):
	        if st.session_state.artwork1_data and st.session_state.artwork2_data:
	            with st.spinner("Analyzing artworks with Claude..."):
	                st.session_state.comparison_results = compare_artworks_with_claude(
	                    st.session_state.artwork1_data,
	                    st.session_state.artwork2_data,
	                    model=model_option
	                )

	            if st.session_state.comparison_results:
	                st.success("✅ Comparison analysis complete!")
	            else:
	                st.error("❌ Comparison analysis failed")
	        else:
	            st.warning("⚠️ Please select or upload both artworks before comparing")

	    # Display comparison results
	    if st.session_state.comparison_results:
	        display_comparison_results(
	            st.session_state.comparison_results,
	            st.session_state.artwork1_data,
	            st.session_state.artwork2_data
	        )

	    # Add helpful information
	    st.markdown("---")
	    st.markdown("""
	### 🛠️ How It Works
	1. Extract Content: The tool extracts text, bounding boxes, images, and barcodes from both PDFs
	2. AI Analysis: Claude analyzes the extracted data and visual elements to identify differences
	3. Structured Results: Differences are categorized by type (text, layout, barcode, visual) and significance
	4. Compliance Assessment: Potential compliance impacts are identified with risk levels and recommendations

	### 🎯 Use Cases
	- Quality Control: Verify artwork changes between versions
	- Brand Consistency: Ensure visual elements remain consistent
	- Compliance Review: Identify changes that might affect regulatory compliance
	- Change Documentation: Track and document artwork modifications
	""")

	# Start the app only when this file is run as a script, not when imported.
	if __name__ == "__main__":
	    main()