# QC_Rules / compare_app.py
# Jakecole1's picture
# Upload 11 files
# 6c16992 verified
import streamlit as st
import tempfile
import os
import pandas as pd
from src.extract_text.google_document_api import GoogleDocumentAPI
from pdf2image import convert_from_path
from PIL import Image, ImageDraw, ImageFont
from src.utils.image_utils import ImageUtils
import base64
from io import BytesIO
from src.utils.barcode import Barcode
import anthropic
import json
def load_client_artwork_files():
    """Collect every PDF found under the client requirements directory.

    The directory ``requirements_library/client-requirements`` (relative to
    the current working directory) is walked recursively. Files whose name
    ends in ``.pdf`` (case-insensitive) are reported.

    Returns:
        list[dict]: One entry per PDF with the keys ``'name'`` (path relative
        to the base directory), ``'path'`` (path usable with ``open``) and
        ``'type'`` (always ``'artwork'``). The list is sorted by ``'name'``
        and is empty when the base directory does not exist.
    """
    base_path = "requirements_library/client-requirements"
    if not os.path.isdir(base_path):
        return []

    artwork_files = []
    for root, _dirs, files in os.walk(base_path):
        for file_name in files:
            if not file_name.lower().endswith('.pdf'):
                continue
            file_path = os.path.join(root, file_name)
            artwork_files.append({
                'name': os.path.relpath(file_path, base_path),
                'path': file_path,
                'type': 'artwork',
            })

    # os.walk yields entries in filesystem order, which differs between
    # machines; sort so the selection widgets list files in a stable order.
    artwork_files.sort(key=lambda entry: entry['name'])
    return artwork_files
def load_artwork_content(file_info):
    """Read the artwork file described by ``file_info`` and return its bytes.

    Args:
        file_info: Mapping with at least ``'path'`` (file to open) and
            ``'name'`` (used in the error message).

    Returns:
        The raw file contents, or ``None`` if anything went wrong; in that
        case the error is shown through ``st.error``.
    """
    try:
        with open(file_info['path'], 'rb') as stream:
            payload = stream.read()
    except Exception as exc:
        st.error(f"Error loading artwork file {file_info['name']}: {str(exc)}")
        return None
    else:
        return payload
def extract_pdf_data(pdf_file, file_name):
    """Extract text, bounding boxes, a page image, and barcodes from a PDF.

    The PDF bytes are copied to a temporary ``.pdf`` file, which is handed to
    the Google Document API wrapper and to ``pdf2image``. Only the first page
    is rendered; if rendering fails a white placeholder image is used instead
    so the remaining steps can still run.

    Args:
        pdf_file: Seekable binary file-like object containing the PDF.
        file_name: Display name; stored in the result and used in error
            messages.

    Returns:
        dict | None: On success a dict with the keys ``text_content``,
        ``bounding_boxes``, ``image`` (size-normalised image),
        ``original_image`` (the rendered page), ``image_base64``,
        ``barcode_results``, ``file_name``, ``image_quality`` and
        ``image_size_bytes``. ``None`` if any step raised; the error is
        reported through ``st.error``.
    """
    tmp_pdf_path = None
    try:
        # delete=False because the file is reopened by path after this
        # handle is closed; it is removed in the ``finally`` clause below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_pdf_path = tmp_file.name
            pdf_file.seek(0)
            tmp_file.write(pdf_file.read())

        # Extract text and bounding boxes using Google Document API.
        # NOTE(review): the credentials path is hard-coded and relative to
        # the working directory -- confirm this is intended for deployment.
        google_document_api = GoogleDocumentAPI(
            credentials_path="src/extract_text/photon-services-f0d3ec1417d0.json"
        )
        document = google_document_api.process_document(tmp_pdf_path)
        text_content = google_document_api.extract_text_with_markdown_table(document)
        bounding_boxes = google_document_api.extract_text_with_bounding_boxes(document)

        # Convert PDF to image. Only page 1 is used, so only page 1 is
        # rendered instead of rasterising the whole document.
        try:
            images = convert_from_path(tmp_pdf_path, first_page=1, last_page=1)
            if not images:
                raise ValueError("No pages found in PDF")
            page_image = images[0]  # Assuming single page for now
        except Exception as e:
            st.error(f"Error converting PDF to image: {str(e)}")
            # Create a placeholder image so downstream steps still have input
            page_image = Image.new('RGB', (800, 600), color='white')
            draw = ImageDraw.Draw(page_image)
            draw.text((400, 300), "PDF conversion failed", fill='black', anchor='mm')

        # Process image for comparison: standardize size and optimize quality
        processed_image, quality, file_size = ImageUtils.process_image_for_comparison(
            page_image,
            target_size=(1200, 1600),  # Standard size for comparison
            max_size_bytes=1024 * 1024  # 1MB limit
        )

        # Base64 payload for the API, built from the rendered page with the
        # same size limits as the processed image above.
        image_base64 = ImageUtils.image_to_base64_optimized(
            page_image,
            target_size=(1200, 1600),
            max_size_bytes=1024 * 1024
        )

        # Scan for barcodes on the full-resolution page render
        barcode = Barcode()
        barcode_results = barcode.scan_and_validate(page_image)

        return {
            'text_content': text_content,
            'bounding_boxes': bounding_boxes,
            'image': processed_image,  # Use the processed image
            'original_image': page_image,  # Keep original for reference
            'image_base64': image_base64,
            'barcode_results': barcode_results,
            'file_name': file_name,
            'image_quality': quality,
            'image_size_bytes': file_size
        }
    except Exception as e:
        st.error(f"Error processing PDF {file_name}: {str(e)}")
        return None
    finally:
        # Remove the temporary file on every exit path; previously it was
        # only removed when all processing steps succeeded.
        if tmp_pdf_path and os.path.exists(tmp_pdf_path):
            try:
                os.unlink(tmp_pdf_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask
                # the real result or error.
                pass
def _fallback_comparison_results(summary, response_text):
    """Build the placeholder result used when the reply is not usable JSON."""
    return {
        "overall_similarity": 0.5,
        "comparison_summary": summary,
        "raw_response": response_text,
        "text_differences": [],
        "layout_differences": [],
        "barcode_differences": [],
        "visual_differences": [],
        "compliance_impact": [],
        "recommendations": ["Review the raw analysis output for detailed insights"]
    }


def _parse_comparison_response(response_text):
    """Decode the outermost ``{...}`` span of ``response_text`` as JSON.

    Returns the decoded object, or a fallback structure carrying the raw
    text when no brace-delimited span exists or it is not valid JSON.
    """
    start_idx = response_text.find('{')
    end_idx = response_text.rfind('}')
    # Compare the raw find/rfind results: adding 1 before testing against -1
    # (as the code used to) makes the "not found" check always pass.
    if start_idx == -1 or end_idx < start_idx:
        return _fallback_comparison_results(
            "Analysis completed but JSON parsing failed", response_text
        )
    try:
        return json.loads(response_text[start_idx:end_idx + 1])
    except json.JSONDecodeError:
        return _fallback_comparison_results(
            "Analysis completed but structured parsing failed", response_text
        )


def compare_artworks_with_claude(artwork1_data, artwork2_data, model="claude-sonnet-4-20250514"):
    """Compare two artworks using Claude API.

    A text prompt containing each artwork's extracted text, its first ten
    bounding-box entries and its barcode results is sent together with both
    page images; the reply is expected to contain a JSON object.

    Args:
        artwork1_data: Dict with at least ``file_name``, ``text_content``,
            ``bounding_boxes``, ``barcode_results`` and ``image_base64``.
        artwork2_data: Same structure for the second artwork.
        model: Model identifier passed to the Messages API.

    Returns:
        dict | None: The decoded JSON reply; a fallback dict (with
        ``raw_response``) when the reply cannot be parsed; ``None`` when the
        API call raised, after the error is reported through ``st.error``.

    Raises:
        KeyError / TypeError: Building the prompt happens outside the
        ``try`` block, so missing keys or non-JSON-serialisable extraction
        data propagate to the caller.
    """
    # Prepare the comparison prompt
    prompt = f"""
You are an expert packaging compliance analyzer. Compare these two artwork PDFs and provide a detailed analysis of their differences and similarities.
## Artwork 1: {artwork1_data['file_name']}
**Text Content:**
{artwork1_data['text_content']}
**Bounding Box Data:**
{json.dumps(artwork1_data['bounding_boxes'][:10], indent=2) if artwork1_data['bounding_boxes'] else "No text elements detected"}
**Barcode Data:**
{json.dumps(artwork1_data['barcode_results'], indent=2) if artwork1_data['barcode_results'] else "No barcodes detected"}
## Artwork 2: {artwork2_data['file_name']}
**Text Content:**
{artwork2_data['text_content']}
**Bounding Box Data:**
{json.dumps(artwork2_data['bounding_boxes'][:10], indent=2) if artwork2_data['bounding_boxes'] else "No text elements detected"}
**Barcode Data:**
{json.dumps(artwork2_data['barcode_results'], indent=2) if artwork2_data['barcode_results'] else "No barcodes detected"}
Please provide a comprehensive comparison analysis in the following JSON format:
{{
"overall_similarity": 0.85,
"comparison_summary": "Brief overview of the comparison results",
"text_differences": [
{{
"category": "Missing Text",
"artwork1_content": "Text found only in artwork 1",
"artwork2_content": "Text found only in artwork 2",
"significance": "HIGH/MEDIUM/LOW",
"description": "Detailed explanation of the difference"
}}
],
"layout_differences": [
{{
"category": "Position Changes",
"element": "Element that moved",
"artwork1_position": "Description of position in artwork 1",
"artwork2_position": "Description of position in artwork 2",
"significance": "HIGH/MEDIUM/LOW",
"description": "Impact of this change"
}}
],
"barcode_differences": [
{{
"category": "Barcode Changes",
"artwork1_barcodes": "Description of barcodes in artwork 1",
"artwork2_barcodes": "Description of barcodes in artwork 2",
"significance": "HIGH/MEDIUM/LOW",
"description": "Analysis of barcode differences"
}}
],
"visual_differences": [
{{
"category": "Visual Elements",
"description": "Description of visual differences observed in the images",
"significance": "HIGH/MEDIUM/LOW",
"recommendation": "Suggested action or consideration"
}}
],
"compliance_impact": [
{{
"area": "Regulatory compliance area affected",
"impact": "Description of potential compliance impact",
"risk_level": "HIGH/MEDIUM/LOW",
"recommendation": "Recommended action"
}}
],
"recommendations": [
"List of actionable recommendations based on the comparison"
]
}}
Analyze both the textual content and visual elements. Pay special attention to:
1. Missing or changed text elements
2. Repositioned elements that might affect readability
3. Barcode differences that could impact functionality
4. Visual changes that might affect brand consistency or compliance
5. Any changes that could impact regulatory compliance
Provide specific, actionable insights that would be valuable for quality control and compliance verification.
"""
    try:
        # Initialize Anthropic client
        client = anthropic.Anthropic(api_key=os.getenv('CLAUDE_API_KEY'))

        # NOTE(review): both images are declared as "image/png"; confirm that
        # ImageUtils.image_to_base64_optimized actually emits PNG data, since
        # the declared media type should match the encoded bytes.
        image_blocks = [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": artwork['image_base64']
                }
            }
            for artwork in (artwork1_data, artwork2_data)
        ]

        # Create message with the prompt followed by both images
        message = client.messages.create(
            model=model,
            max_tokens=4000,
            messages=[
                {
                    "role": "user",
                    "content": [{"type": "text", "text": prompt}] + image_blocks
                }
            ]
        )

        # Concatenate all text blocks of the reply, ignoring other block types
        response_text = "".join(
            content_block.text
            for content_block in message.content
            if getattr(content_block, 'type', None) == 'text'
        )
        return _parse_comparison_response(response_text)
    except Exception as e:
        st.error(f"Error calling Claude API: {str(e)}")
        return None
# Marker shown in front of each expander title, keyed by significance/risk.
_SIGNIFICANCE_ICONS = {"HIGH": "🔴", "MEDIUM": "🟡", "LOW": "🟢"}


def _result_entries(results, key):
    """Return the dict entries stored under ``key``, or [] if malformed.

    The results come from model-generated JSON, so the value may be missing,
    ``null`` or contain non-object items; those are dropped instead of
    crashing the page.
    """
    value = results.get(key)
    if not isinstance(value, list):
        return []
    return [entry for entry in value if isinstance(entry, dict)]


def _format_similarity(value):
    """Format a similarity score as a percentage, or 'N/A' if not numeric."""
    try:
        return f"{float(value):.1%}"
    except (TypeError, ValueError):
        return "N/A"


def _expander_label(entry, default_title, title_key='category',
                    level_key='significance', suffix='Impact'):
    """Build '<icon> <title> - <level> <suffix>' for one result entry."""
    level = entry.get(level_key, 'MEDIUM')
    # Look the icon up case-insensitively; unknown levels get the MEDIUM icon.
    icon = _SIGNIFICANCE_ICONS.get(str(level).upper(), "🟡")
    return f"{icon} {entry.get(title_key, default_title)} - {level} {suffix}"


def _render_pair(left_heading, left_text, right_heading, right_text):
    """Render two heading/text pairs next to each other."""
    left_col, right_col = st.columns(2)
    with left_col:
        st.markdown(left_heading)
        st.text(left_text)
    with right_col:
        st.markdown(right_heading)
        st.text(right_text)


def display_comparison_results(results, artwork1_data, artwork2_data):
    """Display the comparison results in a structured format.

    Renders summary metrics, one tab per difference category, the
    recommendations, and (when present) the raw model output.

    Args:
        results: Dict produced by the comparison step. Missing or malformed
            sections are treated as empty.
        artwork1_data: Dict for the first artwork; only ``file_name`` is read.
        artwork2_data: Dict for the second artwork; only ``file_name`` is read.
    """
    if not results:
        st.error("No comparison results to display")
        return

    name1 = artwork1_data['file_name']
    name2 = artwork2_data['file_name']

    text_diffs = _result_entries(results, 'text_differences')
    layout_diffs = _result_entries(results, 'layout_differences')
    barcode_diffs = _result_entries(results, 'barcode_differences')
    visual_diffs = _result_entries(results, 'visual_differences')
    compliance_impacts = _result_entries(results, 'compliance_impact')

    # Overall Summary
    st.markdown("## 📊 Comparison Summary")
    col1, col2, col3 = st.columns(3)
    with col1:
        # The score is model-generated; guard against strings or null.
        st.metric("Overall Similarity", _format_similarity(results.get('overall_similarity', 0.5)))
    with col2:
        total_differences = (
            len(text_diffs) + len(layout_diffs) + len(barcode_diffs) + len(visual_diffs)
        )
        st.metric("Total Differences", total_differences)
    with col3:
        st.metric("Compliance Impacts", len(compliance_impacts))

    # Summary description
    if 'comparison_summary' in results:
        st.markdown(f"**Summary:** {results['comparison_summary']}")

    # Create tabs for different types of differences
    tabs = st.tabs([
        "📝 Text Differences",
        "📐 Layout Changes",
        "📱 Barcode Changes",
        "🎨 Visual Differences",
        "⚖️ Compliance Impact",
        "💡 Recommendations",
    ])

    with tabs[0]:  # Text Differences
        st.markdown("### Text Content Differences")
        if text_diffs:
            for diff in text_diffs:
                with st.expander(_expander_label(diff, 'Text Difference')):
                    _render_pair(
                        f"**{name1}:**", diff.get('artwork1_content', 'N/A'),
                        f"**{name2}:**", diff.get('artwork2_content', 'N/A'),
                    )
                    st.markdown(f"**Description:** {diff.get('description', 'No description available')}")
        else:
            st.info("No significant text differences found")

    with tabs[1]:  # Layout Changes
        st.markdown("### Layout and Positioning Changes")
        if layout_diffs:
            for diff in layout_diffs:
                with st.expander(_expander_label(diff, 'Layout Change')):
                    st.markdown(f"**Element:** {diff.get('element', 'Unknown element')}")
                    _render_pair(
                        f"**Position in {name1}:**", diff.get('artwork1_position', 'N/A'),
                        f"**Position in {name2}:**", diff.get('artwork2_position', 'N/A'),
                    )
                    st.markdown(f"**Impact:** {diff.get('description', 'No description available')}")
        else:
            st.info("No significant layout differences found")

    with tabs[2]:  # Barcode Changes
        st.markdown("### Barcode Differences")
        if barcode_diffs:
            for diff in barcode_diffs:
                with st.expander(_expander_label(diff, 'Barcode Change')):
                    _render_pair(
                        f"**{name1} Barcodes:**", diff.get('artwork1_barcodes', 'N/A'),
                        f"**{name2} Barcodes:**", diff.get('artwork2_barcodes', 'N/A'),
                    )
                    st.markdown(f"**Analysis:** {diff.get('description', 'No description available')}")
        else:
            st.info("No significant barcode differences found")

    with tabs[3]:  # Visual Differences
        st.markdown("### Visual and Design Differences")
        if visual_diffs:
            for diff in visual_diffs:
                with st.expander(_expander_label(diff, 'Visual Change')):
                    st.markdown(f"**Description:** {diff.get('description', 'No description available')}")
                    if 'recommendation' in diff:
                        st.markdown(f"**Recommendation:** {diff['recommendation']}")
        else:
            st.info("No significant visual differences found")

    with tabs[4]:  # Compliance Impact
        st.markdown("### Compliance and Regulatory Impact")
        if compliance_impacts:
            for impact in compliance_impacts:
                label = _expander_label(
                    impact, 'Compliance Area',
                    title_key='area', level_key='risk_level', suffix='Risk',
                )
                with st.expander(label):
                    st.markdown(f"**Impact:** {impact.get('impact', 'No description available')}")
                    st.markdown(f"**Recommendation:** {impact.get('recommendation', 'No recommendation provided')}")
        else:
            st.success("No compliance impacts identified")

    with tabs[5]:  # Recommendations
        st.markdown("### Action Items and Recommendations")
        recommendations = results.get('recommendations')
        if isinstance(recommendations, list) and recommendations:
            for i, rec in enumerate(recommendations, 1):
                st.markdown(f"{i}. {rec}")
        else:
            st.info("No specific recommendations provided")

    # Raw response section (collapsible); only present on parse fallbacks
    if 'raw_response' in results:
        with st.expander("🔍 Raw Analysis Output"):
            st.text(results['raw_response'])
def _render_artwork_panel(artwork_data):
    """Render one artwork: title, cropped image, image info and data summary."""
    st.markdown(f"### {artwork_data['file_name']}")
    st.image(
        ImageUtils.crop_image(artwork_data['image']),
        caption=artwork_data['file_name'],
        use_container_width=True,
    )

    # Display image processing info
    if 'image_quality' in artwork_data and 'image_size_bytes' in artwork_data:
        quality = artwork_data['image_quality']
        size_mb = artwork_data['image_size_bytes'] / (1024 * 1024)
        st.info(f"📊 Image Quality: {quality}% | Size: {size_mb:.2f}MB")

    # Display extracted data summary
    with st.expander("📊 Extracted Data Summary"):
        # Either value may be None/empty when nothing was detected.
        st.metric("Text Elements", len(artwork_data['bounding_boxes'] or []))
        st.metric("Barcodes", len(artwork_data['barcode_results'] or []))


def display_side_by_side_images(artwork1_data, artwork2_data):
    """Display the two artwork images side by side.

    Args:
        artwork1_data: Extraction result for the left column. Reads
            ``file_name``, ``image``, ``bounding_boxes``, ``barcode_results``
            and, if present, ``image_quality`` / ``image_size_bytes``.
        artwork2_data: Same structure, shown in the right column.
    """
    st.markdown("## 🖼️ Side-by-Side Comparison")
    # Both columns show the same layout, so one renderer serves both.
    for column, artwork_data in zip(st.columns(2), (artwork1_data, artwork2_data)):
        with column:
            _render_artwork_panel(artwork_data)
def _open_client_artwork(file_info):
    """Load a client artwork file into an in-memory, named binary stream.

    Returns ``None`` when the file could not be read (or is empty); the
    loader has already reported the error in that case.
    """
    content = load_artwork_content(file_info)
    if not content:
        return None
    buffer = BytesIO(content)
    buffer.name = file_info["name"]
    return buffer


def _load_artwork_into_session(slot, source_id, open_pdf, file_name, action):
    """Extract artwork ``slot`` into session state unless it is already there.

    Streamlit re-runs the whole script on every interaction, so without this
    guard every click would repeat the full extraction for both artworks.

    Args:
        slot: 1 or 2; selects the ``artwork{slot}_data`` session entry.
        source_id: Hashable identity of the chosen source; extraction is
            skipped when it matches the one already loaded.
        open_pdf: Zero-argument callable returning the PDF stream, or
            ``None`` if it cannot be opened. Only called when needed.
        file_name: Display name passed to the extractor and shown to the user.
        action: Verb for the success message ("Loaded" / "Uploaded").
    """
    data_key = f"artwork{slot}_data"
    source_key = f"artwork{slot}_source"

    already_loaded = (
        st.session_state.get(source_key) == source_id
        and st.session_state.get(data_key) is not None
    )
    if not already_loaded:
        pdf_file = open_pdf()
        if pdf_file is None:
            return
        with st.spinner(f"Processing artwork {slot}..."):
            data = extract_pdf_data(pdf_file, file_name)
        st.session_state[data_key] = data
        # Remember the source only on success so a failed extraction is
        # retried on the next run.
        st.session_state[source_key] = source_id if data else None
        # Any earlier comparison described a different pair of artworks.
        st.session_state["comparison_results"] = None

    if st.session_state.get(data_key):
        st.success(f"✅ {action} artwork {slot}: {file_name}")


def _artwork_picker(slot, client_artwork_files):
    """Render the client-file / upload widgets for artwork ``slot``.

    An uploaded file takes precedence over a client-file selection, which
    matches the previous behaviour where the upload was processed last and
    overwrote the selection.
    """
    st.markdown(f"### 🎨 Artwork {slot}")
    # Create tabs for client files vs upload
    client_tab, upload_tab = st.tabs(["📁 Client Files", "📤 Upload New"])

    selected_info = None
    with client_tab:
        if client_artwork_files:
            placeholder = f"Select artwork {slot}..."
            options = [placeholder] + [f["name"] for f in client_artwork_files]
            selected_name = st.selectbox(
                f"Choose artwork {slot}:", options, key=f"art{slot}_select"
            )
            if selected_name != placeholder:
                selected_info = next(
                    (f for f in client_artwork_files if f["name"] == selected_name),
                    None,
                )
        else:
            st.info("No client artwork files found")

    with upload_tab:
        uploaded = st.file_uploader(
            f"Upload Artwork {slot} (PDF)", type=["pdf"], key=f"art{slot}_upload"
        )

    if uploaded is not None:
        source_id = (
            "upload",
            getattr(uploaded, "file_id", None),
            uploaded.name,
            getattr(uploaded, "size", None),
        )
        with upload_tab:
            _load_artwork_into_session(
                slot, source_id, lambda: uploaded, uploaded.name, "Uploaded"
            )
    elif selected_info is not None:
        with client_tab:
            _load_artwork_into_session(
                slot,
                ("client", selected_info["path"]),
                lambda: _open_client_artwork(selected_info),
                selected_info["name"],
                "Loaded",
            )


def main():
    """Run the Streamlit artwork comparison page.

    Lets the user pick or upload two PDFs, shows them side by side, runs the
    Claude comparison on demand and renders the stored results.
    """
    st.set_page_config(layout="wide", page_title="Artwork Comparison Tool")

    # Load client artwork files
    client_artwork_files = load_client_artwork_files()

    # Initialize session state
    for key in (
        "artwork1_data",
        "artwork1_source",
        "artwork2_data",
        "artwork2_source",
        "comparison_results",
    ):
        if key not in st.session_state:
            st.session_state[key] = None

    st.title("🎨 Artwork Comparison Tool")
    st.write("Compare two packaging artwork PDFs to identify differences in text, layout, barcodes, and visual elements.")

    # File selection section
    st.markdown("## 📁 Select Artworks to Compare")
    col1, col2 = st.columns(2)
    with col1:
        _artwork_picker(1, client_artwork_files)
    with col2:
        _artwork_picker(2, client_artwork_files)

    # Display images side by side if both are loaded
    if st.session_state.artwork1_data and st.session_state.artwork2_data:
        display_side_by_side_images(st.session_state.artwork1_data, st.session_state.artwork2_data)

    # Model selection
    model_option = "claude-sonnet-4-20250514"

    # Comparison button
    if st.button("🔍 Compare Artworks", type="primary"):
        if st.session_state.artwork1_data and st.session_state.artwork2_data:
            with st.spinner("Analyzing artworks with Claude..."):
                st.session_state.comparison_results = compare_artworks_with_claude(
                    st.session_state.artwork1_data,
                    st.session_state.artwork2_data,
                    model=model_option
                )
            if st.session_state.comparison_results:
                st.success("✅ Comparison analysis complete!")
            else:
                st.error("❌ Comparison analysis failed")
        else:
            st.warning("⚠️ Please select or upload both artworks before comparing")

    # Display comparison results
    if st.session_state.comparison_results:
        display_comparison_results(
            st.session_state.comparison_results,
            st.session_state.artwork1_data,
            st.session_state.artwork2_data
        )

    # Add helpful information
    st.markdown("---")
    st.markdown("""
### 🛠️ How It Works
1. **Extract Content**: The tool extracts text, bounding boxes, images, and barcodes from both PDFs
2. **AI Analysis**: Claude analyzes the extracted data and visual elements to identify differences
3. **Structured Results**: Differences are categorized by type (text, layout, barcode, visual) and significance
4. **Compliance Assessment**: Potential compliance impacts are identified with risk levels and recommendations
### 🎯 Use Cases
- **Quality Control**: Verify artwork changes between versions
- **Brand Consistency**: Ensure visual elements remain consistent
- **Compliance Review**: Identify changes that might affect regulatory compliance
- **Change Documentation**: Track and document artwork modifications
""")


# Only start the app when executed as a script, not when imported.
if __name__ == "__main__":
    main()