Upload 11 files
Browse files- .env +1 -0
- .gitignore +2 -0
- README.md +185 -10
- compare_app.py +620 -0
- main.py +938 -0
- quick_test.py +63 -0
- quick_test_results.md +17 -0
- requirements.txt +14 -0
- test_google_doc_ai.py +183 -0
- test_metadata.py +89 -0
- test_pdf_requirements.py +74 -0
.env
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# SECURITY: a live API key was committed on this line. Revoke/rotate that key and keep real keys out of version control.
CLAUDE_API_KEY=your_api_key_here
|
.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
requirements_library/client-requirements/
|
README.md
CHANGED
|
@@ -1,10 +1,185 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Artwork Comparison Tool
|
| 2 |
+
|
| 3 |
+
A Streamlit-based application for comparing packaging artwork PDFs using AI-powered analysis. This tool extracts text, images, and barcodes from PDF files and provides detailed comparison analysis using Claude AI.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- **PDF Processing**: Extract text, bounding boxes, and convert PDFs to images
|
| 8 |
+
- **Barcode Detection**: Scan and validate barcodes in artwork
|
| 9 |
+
- **AI-Powered Comparison**: Use Claude AI to analyze differences between artworks
|
| 10 |
+
- **Compliance Analysis**: Identify potential compliance impacts of changes
|
| 11 |
+
- **Visual Comparison**: Side-by-side image comparison with extracted data
|
| 12 |
+
- **Client File Management**: Load and compare files from a structured client directory
|
| 13 |
+
|
| 14 |
+
## Installation
|
| 15 |
+
|
| 16 |
+
### Prerequisites
|
| 17 |
+
|
| 18 |
+
- Python 3.8 or higher
|
| 19 |
+
- Windows, macOS, or Linux
|
| 20 |
+
|
| 21 |
+
### Setup
|
| 22 |
+
|
| 23 |
+
1. **Clone or download the project**
|
| 24 |
+
```bash
|
| 25 |
+
git clone <repository-url>
|
| 26 |
+
cd SGK-AI-LAB
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
2. **Install dependencies**
|
| 30 |
+
```bash
|
| 31 |
+
pip install -r requirements.txt
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
3. **Set up Google Cloud credentials** (for text extraction)
|
| 35 |
+
- Place your Google Cloud service account JSON file at `src/extract_text/photon-services-f0d3ec1417d0.json`
|
| 36 |
+
- Or update the path in `compare_app.py` to point to your credentials file
|
| 37 |
+
|
| 38 |
+
4. **Set up Anthropic API key** (for AI comparison)
|
| 39 |
+
```bash
|
| 40 |
+
# Windows
|
| 41 |
+
set ANTHROPIC_API_KEY=your_api_key_here
|
| 42 |
+
|
| 43 |
+
# macOS/Linux
|
| 44 |
+
export ANTHROPIC_API_KEY=your_api_key_here
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
## Usage
|
| 48 |
+
|
| 49 |
+
### Running the Application
|
| 50 |
+
|
| 51 |
+
```bash
|
| 52 |
+
streamlit run compare_app.py
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
The application will open in your default web browser at `http://localhost:8501`.
|
| 56 |
+
|
| 57 |
+
### How to Use
|
| 58 |
+
|
| 59 |
+
1. **Select Artworks to Compare**
|
| 60 |
+
- Choose from existing client files in the dropdown
|
| 61 |
+
- Or upload new PDF files using the file uploader
|
| 62 |
+
|
| 63 |
+
2. **View Side-by-Side Comparison**
|
| 64 |
+
- See both artworks displayed side by side
|
| 65 |
+
- View extracted data summaries (text elements, barcodes)
|
| 66 |
+
|
| 67 |
+
3. **Run AI Analysis**
|
| 68 |
+
- Click "Compare Artworks" to start the AI-powered analysis
|
| 69 |
+
- Wait for Claude to process and analyze the differences
|
| 70 |
+
|
| 71 |
+
4. **Review Results**
|
| 72 |
+
- **Text Differences**: Missing, added, or changed text elements
|
| 73 |
+
- **Layout Changes**: Repositioned elements and their impact
|
| 74 |
+
- **Barcode Changes**: Differences in barcode data or positioning
|
| 75 |
+
- **Visual Differences**: Design and visual element changes
|
| 76 |
+
- **Compliance Impact**: Potential regulatory compliance issues
|
| 77 |
+
- **Recommendations**: Actionable insights and next steps
|
| 78 |
+
|
| 79 |
+
### File Structure
|
| 80 |
+
|
| 81 |
+
```
|
| 82 |
+
SGK-AI-LAB/
|
| 83 |
+
├── compare_app.py # Main Streamlit application
|
| 84 |
+
├── requirements.txt # Python dependencies
|
| 85 |
+
├── test_compare_app.py # Test script
|
| 86 |
+
├── README.md # This file
|
| 87 |
+
├── requirements_library/ # Client artwork files
|
| 88 |
+
│ └── client-requirements/
|
| 89 |
+
│ ├── M&S/
|
| 90 |
+
│ │ ├── Curry puff/
|
| 91 |
+
│ │ └── Lemon package/
|
| 92 |
+
│ └── package/
|
| 93 |
+
└── src/
|
| 94 |
+
├── core/
|
| 95 |
+
│ └── analysis.py
|
| 96 |
+
├── extract_text/
|
| 97 |
+
│ ├── google_document_api.py
|
| 98 |
+
│ ├── ingest.py
|
| 99 |
+
│ └── photon-services-f0d3ec1417d0.json
|
| 100 |
+
└── utils/
|
| 101 |
+
├── barcode.py
|
| 102 |
+
└── image_utils.py
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
## Configuration
|
| 106 |
+
|
| 107 |
+
### Google Cloud Document AI
|
| 108 |
+
|
| 109 |
+
The application uses Google Cloud Document AI for text extraction. To set up:
|
| 110 |
+
|
| 111 |
+
1. Create a Google Cloud project
|
| 112 |
+
2. Enable Document AI API
|
| 113 |
+
3. Create a service account and download the JSON credentials
|
| 114 |
+
4. Place the credentials file in `src/extract_text/` or update the path in the code
|
| 115 |
+
|
| 116 |
+
### Anthropic Claude API
|
| 117 |
+
|
| 118 |
+
For AI-powered comparison, you need an Anthropic API key:
|
| 119 |
+
|
| 120 |
+
1. Sign up at [Anthropic Console](https://console.anthropic.com/)
|
| 121 |
+
2. Create an API key
|
| 122 |
+
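3. Set the `CLAUDE_API_KEY` environment variable (this is the name `compare_app.py` reads). If it is unset, the Anthropic SDK falls back to `ANTHROPIC_API_KEY`.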
|
| 123 |
+
|
| 124 |
+
## Dependencies
|
| 125 |
+
|
| 126 |
+
### Core Dependencies
|
| 127 |
+
- `streamlit` - Web application framework
|
| 128 |
+
- `anthropic` - Claude AI API client
|
| 129 |
+
- `google-cloud-documentai` - Google Document AI
|
| 130 |
+
- `pdf2image` - PDF to image conversion
|
| 131 |
+
- `Pillow` - Image processing
|
| 132 |
+
- `opencv-python` - Computer vision
|
| 133 |
+
- `numpy` - Numerical computing
|
| 134 |
+
- `pandas` - Data manipulation
|
| 135 |
+
|
| 136 |
+
### Barcode Processing
|
| 137 |
+
- `zxing-cpp` - Barcode detection
|
| 138 |
+
- `barcodenumber` - Barcode validation
|
| 139 |
+
|
| 140 |
+
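## Troubleshooting

- **"Error converting PDF to image"**: `pdf2image` relies on the Poppler utilities; make sure Poppler is installed and available on your `PATH`.
- **"Error calling Claude API"**: check that the API key environment variable is set in the same shell that launches Streamlit.
- **"Error processing PDF"**: check that the Google Cloud credentials file exists at the path configured in `compare_app.py`.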
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
## API Response Format
|
| 144 |
+
|
| 145 |
+
The AI comparison returns structured JSON with the following format:
|
| 146 |
+
|
| 147 |
+
```json
|
| 148 |
+
{
|
| 149 |
+
"overall_similarity": 0.85,
|
| 150 |
+
"comparison_summary": "Brief overview",
|
| 151 |
+
"text_differences": [
|
| 152 |
+
{
|
| 153 |
+
"category": "Missing Text",
|
| 154 |
+
"artwork1_content": "Text in artwork 1",
|
| 155 |
+
"artwork2_content": "Text in artwork 2",
|
| 156 |
+
"significance": "HIGH/MEDIUM/LOW",
|
| 157 |
+
"description": "Detailed explanation"
|
| 158 |
+
}
|
| 159 |
+
],
|
| 160 |
+
"layout_differences": [...],
|
| 161 |
+
"barcode_differences": [...],
|
| 162 |
+
"visual_differences": [...],
|
| 163 |
+
"compliance_impact": [...],
|
| 164 |
+
"recommendations": ["Action item 1", "Action item 2"]
|
| 165 |
+
}
|
| 166 |
+
```
|
| 167 |
+
|
| 168 |
+
## Contributing
|
| 169 |
+
|
| 170 |
+
1. Fork the repository
|
| 171 |
+
2. Create a feature branch
|
| 172 |
+
3. Make your changes
|
| 173 |
+
4. Add tests if applicable
|
| 174 |
+
5. Submit a pull request
|
| 175 |
+
|
| 176 |
+
## License
|
| 177 |
+
|
| 178 |
+
[Add your license information here]
|
| 179 |
+
|
| 180 |
+
## Support
|
| 181 |
+
|
| 182 |
+
For issues and questions:
|
| 183 |
+
1. Check the troubleshooting section
|
| 184 |
+
2. Review the test script output
|
| 185 |
+
3. Create an issue with detailed error information
|
compare_app.py
ADDED
|
@@ -0,0 +1,620 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import tempfile
|
| 3 |
+
import os
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from src.extract_text.google_document_api import GoogleDocumentAPI
|
| 6 |
+
from pdf2image import convert_from_path
|
| 7 |
+
from PIL import Image, ImageDraw, ImageFont
|
| 8 |
+
from src.utils.image_utils import ImageUtils
|
| 9 |
+
import base64
|
| 10 |
+
from io import BytesIO
|
| 11 |
+
from src.utils.barcode import Barcode
|
| 12 |
+
import anthropic
|
| 13 |
+
import json
|
| 14 |
+
|
| 15 |
+
def load_client_artwork_files():
    """Collect every PDF found under the client requirements directory.

    Recursively walks ``requirements_library/client-requirements`` (relative
    to the current working directory) and records each file whose name ends
    in ``.pdf`` (case-insensitive).

    Returns:
        A list of dicts with the keys ``name`` (path relative to the base
        directory), ``path`` (path including the base directory) and ``type``
        (always ``'artwork'``). Entries follow a sorted traversal so the list
        is stable across runs and platforms. An empty list is returned when
        the base directory does not exist.
    """
    base_path = "requirements_library/client-requirements"
    artwork_files = []

    if not os.path.exists(base_path):
        return artwork_files

    for root, dirs, files in os.walk(base_path):
        # os.walk reports entries in arbitrary filesystem order; sorting the
        # directory list in place makes the descent order deterministic too.
        dirs.sort()
        for file in sorted(files):
            if not file.lower().endswith('.pdf'):
                continue
            file_path = os.path.join(root, file)
            artwork_files.append({
                'name': os.path.relpath(file_path, base_path),
                'path': file_path,
                'type': 'artwork',
            })

    return artwork_files
|
| 37 |
+
|
| 38 |
+
def load_artwork_content(file_info):
    """Read an artwork file from disk and return its raw bytes.

    Args:
        file_info: Dict providing ``'path'`` (the file to read) and ``'name'``
            (used only in the error message).

    Returns:
        The file contents as ``bytes``, or ``None`` after reporting the
        problem through ``st.error`` when anything goes wrong.
    """
    try:
        with open(file_info['path'], 'rb') as artwork_fh:
            raw_bytes = artwork_fh.read()
    except Exception as exc:
        st.error(f"Error loading artwork file {file_info['name']}: {str(exc)}")
        return None
    return raw_bytes
|
| 46 |
+
|
| 47 |
+
def extract_pdf_data(pdf_file, file_name):
    """Extract text, bounding boxes, a page image and barcodes from a PDF.

    The PDF bytes are spooled to a temporary ``.pdf`` file because the
    helpers used here (``GoogleDocumentAPI.process_document`` and
    ``convert_from_path``) are called with a filesystem path. The temporary
    file is always removed, including when extraction fails part-way.

    Args:
        pdf_file: File-like object supporting ``seek()`` and ``read()``.
        file_name: Display name used in error messages and in the result.

    Returns:
        A dict with the keys ``text_content``, ``bounding_boxes``, ``image``
        (processed image), ``original_image`` (first page as rendered),
        ``image_base64``, ``barcode_results``, ``file_name``,
        ``image_quality`` and ``image_size_bytes``; or ``None`` after
        reporting the failure through ``st.error``.
    """
    tmp_pdf_path = None
    try:
        # delete=False: the file must outlive the ``with`` block so the
        # path-based helpers below can open it; it is removed in ``finally``.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_pdf_path = tmp_file.name
            pdf_file.seek(0)
            tmp_file.write(pdf_file.read())

        # Extract text and bounding boxes using Google Document API
        google_document_api = GoogleDocumentAPI(credentials_path="src/extract_text/photon-services-f0d3ec1417d0.json")
        document = google_document_api.process_document(tmp_pdf_path)
        text_content = google_document_api.extract_text_with_markdown_table(document)
        bounding_boxes = google_document_api.extract_text_with_bounding_boxes(document)

        # Convert PDF to image; a rendering failure is reported but does not
        # abort the extraction -- a placeholder image is used instead.
        try:
            images = convert_from_path(tmp_pdf_path)
            if not images:
                raise ValueError("No pages found in PDF")
            page_image = images[0]  # Only the first page is used for now
        except Exception as e:
            st.error(f"Error converting PDF to image: {str(e)}")
            page_image = Image.new('RGB', (800, 600), color='white')
            draw = ImageDraw.Draw(page_image)
            draw.text((400, 300), "PDF conversion failed", fill='black', anchor='mm')

        # Process image for comparison: standardize size and optimize quality
        processed_image, quality, file_size = ImageUtils.process_image_for_comparison(
            page_image,
            target_size=(1200, 1600),  # Standard size for comparison
            max_size_bytes=1024 * 1024  # 1MB limit
        )

        # Base64 payload for the API. NOTE(review): this is derived from the
        # unprocessed page image with the same size limits -- presumably the
        # helper applies the same processing itself; confirm in ImageUtils.
        image_base64 = ImageUtils.image_to_base64_optimized(
            page_image,
            target_size=(1200, 1600),
            max_size_bytes=1024 * 1024
        )

        # Scan for barcodes on the unprocessed page image
        barcode = Barcode()
        barcode_results = barcode.scan_and_validate(page_image)

        return {
            'text_content': text_content,
            'bounding_boxes': bounding_boxes,
            'image': processed_image,  # Use the processed image
            'original_image': page_image,  # Keep original for reference
            'image_base64': image_base64,
            'barcode_results': barcode_results,
            'file_name': file_name,
            'image_quality': quality,
            'image_size_bytes': file_size
        }

    except Exception as e:
        st.error(f"Error processing PDF {file_name}: {str(e)}")
        return None

    finally:
        # Runs on success and on failure so the temp file is never leaked.
        if tmp_pdf_path is not None:
            try:
                os.unlink(tmp_pdf_path)
            except OSError:
                # Best-effort cleanup: a file that is already gone or still
                # locked must not turn a finished extraction into an error.
                pass
|
| 112 |
+
|
| 113 |
+
def _fallback_comparison_results(summary, response_text):
    """Build the placeholder result used when the reply cannot be parsed as JSON."""
    return {
        "overall_similarity": 0.5,
        "comparison_summary": summary,
        "raw_response": response_text,
        "text_differences": [],
        "layout_differences": [],
        "barcode_differences": [],
        "visual_differences": [],
        "compliance_impact": [],
        "recommendations": ["Review the raw analysis output for detailed insights"]
    }


def _parse_comparison_response(response_text):
    """Extract the outermost ``{...}`` span from the reply and decode it as JSON."""
    start_idx = response_text.find('{')
    end_idx = response_text.rfind('}')

    # Both braces must exist and be in order; compare the raw rfind() result
    # so a missing '}' (-1) is detected instead of being masked by a "+ 1".
    if start_idx == -1 or end_idx <= start_idx:
        return _fallback_comparison_results(
            "Analysis completed but JSON parsing failed", response_text
        )

    try:
        return json.loads(response_text[start_idx:end_idx + 1])
    except json.JSONDecodeError:
        return _fallback_comparison_results(
            "Analysis completed but structured parsing failed", response_text
        )


def compare_artworks_with_claude(artwork1_data, artwork2_data, model="claude-sonnet-4-20250514"):
    """Compare two artworks using the Claude API.

    Sends one user message containing a text prompt (file names, extracted
    text, the first ten bounding boxes and the barcode results of each
    artwork) followed by both artwork images, then decodes the JSON object
    found in the reply.

    Args:
        artwork1_data: Dict with at least ``file_name``, ``text_content``,
            ``bounding_boxes``, ``barcode_results`` and ``image_base64``.
        artwork2_data: Same structure for the second artwork.
        model: Model identifier passed to ``client.messages.create``.

    Returns:
        The decoded comparison dict; a fallback dict carrying the raw reply
        under ``raw_response`` when no valid JSON object can be extracted; or
        ``None`` after reporting an API failure through ``st.error``.
    """
    # Prepare the comparison prompt
    prompt = f"""
You are an expert packaging compliance analyzer. Compare these two artwork PDFs and provide a detailed analysis of their differences and similarities.

## Artwork 1: {artwork1_data['file_name']}
**Text Content:**
{artwork1_data['text_content']}

**Bounding Box Data:**
{json.dumps(artwork1_data['bounding_boxes'][:10], indent=2) if artwork1_data['bounding_boxes'] else "No text elements detected"}

**Barcode Data:**
{json.dumps(artwork1_data['barcode_results'], indent=2) if artwork1_data['barcode_results'] else "No barcodes detected"}

## Artwork 2: {artwork2_data['file_name']}
**Text Content:**
{artwork2_data['text_content']}

**Bounding Box Data:**
{json.dumps(artwork2_data['bounding_boxes'][:10], indent=2) if artwork2_data['bounding_boxes'] else "No text elements detected"}

**Barcode Data:**
{json.dumps(artwork2_data['barcode_results'], indent=2) if artwork2_data['barcode_results'] else "No barcodes detected"}

Please provide a comprehensive comparison analysis in the following JSON format:

{{
"overall_similarity": 0.85,
"comparison_summary": "Brief overview of the comparison results",
"text_differences": [
{{
"category": "Missing Text",
"artwork1_content": "Text found only in artwork 1",
"artwork2_content": "Text found only in artwork 2",
"significance": "HIGH/MEDIUM/LOW",
"description": "Detailed explanation of the difference"
}}
],
"layout_differences": [
{{
"category": "Position Changes",
"element": "Element that moved",
"artwork1_position": "Description of position in artwork 1",
"artwork2_position": "Description of position in artwork 2",
"significance": "HIGH/MEDIUM/LOW",
"description": "Impact of this change"
}}
],
"barcode_differences": [
{{
"category": "Barcode Changes",
"artwork1_barcodes": "Description of barcodes in artwork 1",
"artwork2_barcodes": "Description of barcodes in artwork 2",
"significance": "HIGH/MEDIUM/LOW",
"description": "Analysis of barcode differences"
}}
],
"visual_differences": [
{{
"category": "Visual Elements",
"description": "Description of visual differences observed in the images",
"significance": "HIGH/MEDIUM/LOW",
"recommendation": "Suggested action or consideration"
}}
],
"compliance_impact": [
{{
"area": "Regulatory compliance area affected",
"impact": "Description of potential compliance impact",
"risk_level": "HIGH/MEDIUM/LOW",
"recommendation": "Recommended action"
}}
],
"recommendations": [
"List of actionable recommendations based on the comparison"
]
}}

Analyze both the textual content and visual elements. Pay special attention to:
1. Missing or changed text elements
2. Repositioned elements that might affect readability
3. Barcode differences that could impact functionality
4. Visual changes that might affect brand consistency or compliance
5. Any changes that could impact regulatory compliance

Provide specific, actionable insights that would be valuable for quality control and compliance verification.
"""

    try:
        # Initialize Anthropic client.
        # NOTE(review): os.getenv returns None when CLAUDE_API_KEY is unset;
        # the SDK is documented to then fall back to ANTHROPIC_API_KEY -- confirm.
        client = anthropic.Anthropic(api_key=os.getenv('CLAUDE_API_KEY'))

        # Create message with the prompt followed by both images.
        # NOTE(review): media_type is hard-coded to image/png; confirm that the
        # base64 payload produced upstream is actually PNG-encoded.
        message = client.messages.create(
            model=model,
            max_tokens=4000,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": artwork1_data['image_base64']
                            }
                        },
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": artwork2_data['image_base64']
                            }
                        }
                    ]
                }
            ]
        )

        # Concatenate the text blocks of the reply; other block types are skipped.
        response_text = "".join(
            content_block.text
            for content_block in message.content
            if getattr(content_block, 'type', None) == 'text'
        )

        return _parse_comparison_response(response_text)

    except Exception as e:
        st.error(f"Error calling Claude API: {str(e)}")
        return None
|
| 288 |
+
|
| 289 |
+
_SEVERITY_ICONS = {"HIGH": "🔴", "MEDIUM": "🟡", "LOW": "🟢"}


def _severity_icon(level):
    """Return the traffic-light emoji for a HIGH/MEDIUM/LOW level (yellow otherwise)."""
    if not isinstance(level, str):
        return "🟡"
    return _SEVERITY_ICONS.get(level, "🟡")


def _as_list(value):
    """Return ``value`` when it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _format_similarity(value):
    """Format a similarity ratio as a percentage, or 'N/A' when it is not numeric."""
    try:
        return f"{float(value):.1%}"
    except (TypeError, ValueError):
        return "N/A"


def display_comparison_results(results, artwork1_data, artwork2_data):
    """Render the comparison results in the Streamlit page.

    Shows three summary metrics, the optional summary text, one tab per
    result category and, when present, the raw model reply.

    ``results`` is parsed from a model reply, so its fields are treated
    defensively: a section that is missing or is not a list renders as
    empty, and a non-numeric ``overall_similarity`` renders as ``N/A``
    instead of raising.

    Args:
        results: Comparison dict (see the keys read below); a falsy value
            shows an error and returns.
        artwork1_data: Dict whose ``file_name`` labels the first artwork.
        artwork2_data: Dict whose ``file_name`` labels the second artwork.
    """
    if not results:
        st.error("No comparison results to display")
        return

    text_diffs = _as_list(results.get('text_differences'))
    layout_diffs = _as_list(results.get('layout_differences'))
    barcode_diffs = _as_list(results.get('barcode_differences'))
    visual_diffs = _as_list(results.get('visual_differences'))
    compliance_impacts = _as_list(results.get('compliance_impact'))
    recommendations = _as_list(results.get('recommendations'))

    # Overall Summary
    st.markdown("## 📊 Comparison Summary")

    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Overall Similarity", _format_similarity(results.get('overall_similarity', 0.5)))

    with col2:
        total_differences = (
            len(text_diffs) + len(layout_diffs) + len(barcode_diffs) + len(visual_diffs)
        )
        st.metric("Total Differences", total_differences)

    with col3:
        st.metric("Compliance Impacts", len(compliance_impacts))

    # Summary description
    if 'comparison_summary' in results:
        st.markdown(f"**Summary:** {results['comparison_summary']}")

    # Create tabs for different types of differences
    tabs = st.tabs(["📝 Text Differences", "📐 Layout Changes", "📱 Barcode Changes", "🎨 Visual Differences", "⚖️ Compliance Impact", "💡 Recommendations"])

    with tabs[0]:  # Text Differences
        st.markdown("### Text Content Differences")
        if text_diffs:
            for diff in text_diffs:
                significance = diff.get('significance', 'MEDIUM')
                significance_color = _severity_icon(significance)

                with st.expander(f"{significance_color} {diff.get('category', 'Text Difference')} - {significance} Impact"):
                    col1, col2 = st.columns(2)
                    with col1:
                        st.markdown(f"**{artwork1_data['file_name']}:**")
                        st.text(diff.get('artwork1_content', 'N/A'))
                    with col2:
                        st.markdown(f"**{artwork2_data['file_name']}:**")
                        st.text(diff.get('artwork2_content', 'N/A'))

                    st.markdown(f"**Description:** {diff.get('description', 'No description available')}")
        else:
            st.info("No significant text differences found")

    with tabs[1]:  # Layout Changes
        st.markdown("### Layout and Positioning Changes")
        if layout_diffs:
            for diff in layout_diffs:
                significance = diff.get('significance', 'MEDIUM')
                significance_color = _severity_icon(significance)

                with st.expander(f"{significance_color} {diff.get('category', 'Layout Change')} - {significance} Impact"):
                    st.markdown(f"**Element:** {diff.get('element', 'Unknown element')}")

                    col1, col2 = st.columns(2)
                    with col1:
                        st.markdown(f"**Position in {artwork1_data['file_name']}:**")
                        st.text(diff.get('artwork1_position', 'N/A'))
                    with col2:
                        st.markdown(f"**Position in {artwork2_data['file_name']}:**")
                        st.text(diff.get('artwork2_position', 'N/A'))

                    st.markdown(f"**Impact:** {diff.get('description', 'No description available')}")
        else:
            st.info("No significant layout differences found")

    with tabs[2]:  # Barcode Changes
        st.markdown("### Barcode Differences")
        if barcode_diffs:
            for diff in barcode_diffs:
                significance = diff.get('significance', 'MEDIUM')
                significance_color = _severity_icon(significance)

                with st.expander(f"{significance_color} {diff.get('category', 'Barcode Change')} - {significance} Impact"):
                    col1, col2 = st.columns(2)
                    with col1:
                        st.markdown(f"**{artwork1_data['file_name']} Barcodes:**")
                        st.text(diff.get('artwork1_barcodes', 'N/A'))
                    with col2:
                        st.markdown(f"**{artwork2_data['file_name']} Barcodes:**")
                        st.text(diff.get('artwork2_barcodes', 'N/A'))

                    st.markdown(f"**Analysis:** {diff.get('description', 'No description available')}")
        else:
            st.info("No significant barcode differences found")

    with tabs[3]:  # Visual Differences
        st.markdown("### Visual and Design Differences")
        if visual_diffs:
            for diff in visual_diffs:
                significance = diff.get('significance', 'MEDIUM')
                significance_color = _severity_icon(significance)

                with st.expander(f"{significance_color} {diff.get('category', 'Visual Change')} - {significance} Impact"):
                    st.markdown(f"**Description:** {diff.get('description', 'No description available')}")
                    if 'recommendation' in diff:
                        st.markdown(f"**Recommendation:** {diff['recommendation']}")
        else:
            st.info("No significant visual differences found")

    with tabs[4]:  # Compliance Impact
        st.markdown("### Compliance and Regulatory Impact")
        if compliance_impacts:
            for impact in compliance_impacts:
                risk_level = impact.get('risk_level', 'MEDIUM')
                risk_color = _severity_icon(risk_level)

                with st.expander(f"{risk_color} {impact.get('area', 'Compliance Area')} - {risk_level} Risk"):
                    st.markdown(f"**Impact:** {impact.get('impact', 'No description available')}")
                    st.markdown(f"**Recommendation:** {impact.get('recommendation', 'No recommendation provided')}")
        else:
            st.success("No compliance impacts identified")

    with tabs[5]:  # Recommendations
        st.markdown("### Action Items and Recommendations")
        if recommendations:
            for i, rec in enumerate(recommendations, 1):
                st.markdown(f"{i}. {rec}")
        else:
            st.info("No specific recommendations provided")

    # Raw response section (collapsible); only present on fallback results
    if 'raw_response' in results:
        with st.expander("🔍 Raw Analysis Output"):
            st.text(results['raw_response'])
|
| 426 |
+
|
| 427 |
+
def _render_artwork_column(artwork_data):
    """Render one artwork: heading, cropped preview, processing info, summary.

    Reads the keys 'file_name', 'image', 'bounding_boxes' and
    'barcode_results' from ``artwork_data``; 'image_quality' and
    'image_size_bytes' are optional and only shown when both are present.
    Must be called inside the Streamlit container (column) to draw into.
    """
    file_name = artwork_data['file_name']
    st.markdown(f"### {file_name}")
    st.image(
        ImageUtils.crop_image(artwork_data['image']),
        caption=file_name,
        use_container_width=True,
    )

    # Display image processing info (only when both values were recorded)
    if 'image_quality' in artwork_data and 'image_size_bytes' in artwork_data:
        quality = artwork_data['image_quality']
        size_mb = artwork_data['image_size_bytes'] / (1024 * 1024)
        st.info(f"📊 Image Quality: {quality}% | Size: {size_mb:.2f}MB")

    # Display extracted data summary; empty/None collections count as 0
    with st.expander("📊 Extracted Data Summary"):
        bounding_boxes = artwork_data['bounding_boxes']
        barcode_results = artwork_data['barcode_results']
        text_elements = len(bounding_boxes) if bounding_boxes else 0
        barcodes = len(barcode_results) if barcode_results else 0
        st.metric("Text Elements", text_elements)
        st.metric("Barcodes", barcodes)


def display_side_by_side_images(artwork1_data, artwork2_data):
    """Display the two artwork images side by side.

    Args:
        artwork1_data: Extracted data for the first artwork (left column).
        artwork2_data: Extracted data for the second artwork (right column).

    Both arguments are subscripted by string key; see
    ``_render_artwork_column`` for the keys that are read.
    """
    st.markdown("## 🖼️ Side-by-Side Comparison")

    # Both columns show exactly the same widgets, so render them with one
    # helper instead of two copies of the same code.
    for column, artwork_data in zip(st.columns(2), (artwork1_data, artwork2_data)):
        with column:
            _render_artwork_column(artwork_data)
|
| 466 |
+
|
| 467 |
+
def _select_artwork(slot, client_artwork_files):
    """Render the picker for one artwork slot and store its extracted data.

    Draws a "Client Files" tab (select box over ``client_artwork_files``) and
    an "Upload New" tab (PDF uploader). Whichever source yields a file is run
    through ``extract_pdf_data`` and the result is written to
    ``st.session_state["artwork<slot>_data"]``. The upload tab runs last, so
    an uploaded file overrides a client-file selection in the same run.

    Args:
        slot: 1 or 2; used in labels, widget keys and the session-state key.
        client_artwork_files: Sequence of dicts with at least a "name" key,
            as passed to ``load_artwork_content``.
    """
    import io  # local import: this module's top-level imports are not guaranteed to include io

    state_key = f"artwork{slot}_data"
    placeholder = f"Select artwork {slot}..."

    st.markdown(f"### 🎨 Artwork {slot}")

    # Create tabs for client files vs upload
    client_tab, upload_tab = st.tabs(["📁 Client Files", "📤 Upload New"])

    with client_tab:
        if client_artwork_files:
            options = [placeholder] + [f["name"] for f in client_artwork_files]
            selected_name = st.selectbox(
                f"Choose artwork {slot}:", options, key=f"art{slot}_select"
            )

            if selected_name != placeholder:
                # Find and load the selected file (first match by name)
                file_info = next(
                    (f for f in client_artwork_files if f["name"] == selected_name),
                    None,
                )
                file_content = load_artwork_content(file_info) if file_info else None
                if file_content:
                    # Wrap the bytes in a named file-like object for the extractor
                    temp_file = io.BytesIO(file_content)
                    temp_file.name = file_info["name"]

                    # Extract data from the artwork
                    with st.spinner(f"Processing artwork {slot}..."):
                        st.session_state[state_key] = extract_pdf_data(
                            temp_file, file_info["name"]
                        )

                    if st.session_state[state_key]:
                        st.success(f"✅ Loaded artwork {slot}: {selected_name}")
        else:
            st.info("No client artwork files found")

    with upload_tab:
        uploaded_file = st.file_uploader(
            f"Upload Artwork {slot} (PDF)", type=["pdf"], key=f"art{slot}_upload"
        )

        if uploaded_file:
            with st.spinner(f"Processing artwork {slot}..."):
                st.session_state[state_key] = extract_pdf_data(
                    uploaded_file, uploaded_file.name
                )

            if st.session_state[state_key]:
                st.success(f"✅ Uploaded artwork {slot}: {uploaded_file.name}")


def main():
    """Streamlit entry point for the Artwork Comparison Tool.

    Lets the user pick or upload two artwork PDFs, previews them side by side
    once both are loaded, runs ``compare_artworks_with_claude`` when the
    compare button is pressed, and renders the stored comparison results.
    """
    st.set_page_config(layout="wide", page_title="Artwork Comparison Tool")

    # Load client artwork files
    client_artwork_files = load_client_artwork_files()

    # Initialize session state
    for state_key in ("artwork1_data", "artwork2_data", "comparison_results"):
        if state_key not in st.session_state:
            st.session_state[state_key] = None

    st.title("🎨 Artwork Comparison Tool")
    st.write("Compare two packaging artwork PDFs to identify differences in text, layout, barcodes, and visual elements.")

    # File selection section
    st.markdown("## 📁 Select Artworks to Compare")

    col1, col2 = st.columns(2)

    with col1:
        _select_artwork(1, client_artwork_files)

    with col2:
        _select_artwork(2, client_artwork_files)

    both_loaded = bool(
        st.session_state.artwork1_data and st.session_state.artwork2_data
    )

    # Display images side by side if both are loaded
    if both_loaded:
        display_side_by_side_images(
            st.session_state.artwork1_data, st.session_state.artwork2_data
        )

    # Model selection (currently fixed; no UI control is exposed)
    model_option = "claude-sonnet-4-20250514"

    # Comparison button
    if st.button("🔍 Compare Artworks", type="primary"):
        if both_loaded:
            with st.spinner("Analyzing artworks with Claude..."):
                st.session_state.comparison_results = compare_artworks_with_claude(
                    st.session_state.artwork1_data,
                    st.session_state.artwork2_data,
                    model=model_option
                )

            if st.session_state.comparison_results:
                st.success("✅ Comparison analysis complete!")
            else:
                st.error("❌ Comparison analysis failed")
        else:
            st.warning("⚠️ Please select or upload both artworks before comparing")

    # Display comparison results (kept in session state across reruns)
    if st.session_state.comparison_results:
        display_comparison_results(
            st.session_state.comparison_results,
            st.session_state.artwork1_data,
            st.session_state.artwork2_data
        )

    # Add helpful information
    st.markdown("---")
    st.markdown("""
    ### 🛠️ How It Works
    1. **Extract Content**: The tool extracts text, bounding boxes, images, and barcodes from both PDFs
    2. **AI Analysis**: Claude analyzes the extracted data and visual elements to identify differences
    3. **Structured Results**: Differences are categorized by type (text, layout, barcode, visual) and significance
    4. **Compliance Assessment**: Potential compliance impacts are identified with risk levels and recommendations

    ### 🎯 Use Cases
    - **Quality Control**: Verify artwork changes between versions
    - **Brand Consistency**: Ensure visual elements remain consistent
    - **Compliance Review**: Identify changes that might affect regulatory compliance
    - **Change Documentation**: Track and document artwork modifications
    """)
|
| 618 |
+
|
| 619 |
+
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
main.py
ADDED
|
@@ -0,0 +1,938 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import tempfile
|
| 3 |
+
import os
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from src.extract_text.ingest import RequirementsIngest
|
| 6 |
+
from src.extract_text.google_document_api import GoogleDocumentAPI
|
| 7 |
+
from src.extract_text.extract_meta_data import PDFArtworkMetadataExtractor
|
| 8 |
+
from src.core.analysis import ComplianceAnalysis
|
| 9 |
+
from pdf2image import convert_from_path
|
| 10 |
+
from PIL import Image, ImageDraw, ImageFont
|
| 11 |
+
from src.utils.image_utils import ImageUtils
|
| 12 |
+
import base64
|
| 13 |
+
from io import BytesIO
|
| 14 |
+
from src.utils.barcode import Barcode
|
| 15 |
+
import glob
|
| 16 |
+
|
| 17 |
+
def load_client_requirements_files(base_path="requirements_library/client-requirements"):
    """Load all requirements and packaging files from the client-requirements directory.

    Walks ``base_path`` recursively and classifies files by name:

    * ``.txt`` or ``.pdf`` whose name contains "requirement" -> requirements
    * any other ``.pdf`` -> packaging
    * everything else is ignored

    Matching is case-insensitive. Directories and files are visited in sorted
    order so the returned lists are stable between runs and platforms.

    Args:
        base_path: Directory to scan. Defaults to the client requirements
            library used by the app.
            NOTE(review): the repository .gitignore lists
            ``requirements_library/client_requirements/`` (underscore) while
            this default uses a hyphen -- confirm which name is intended.

    Returns:
        Tuple ``(requirements_files, packaging_files)``; each is a list of
        dicts with keys 'name' (path relative to ``base_path``), 'path'
        and 'type' ('requirements' or 'packaging'). Both lists are empty
        when ``base_path`` does not exist.
    """
    requirements_files = []
    packaging_files = []

    if not os.path.exists(base_path):
        return requirements_files, packaging_files

    # Walk through all subdirectories
    for root, dirs, files in os.walk(base_path):
        dirs.sort()  # in-place sort makes os.walk descend in a fixed order
        for file in sorted(files):
            lowered = file.lower()
            is_pdf = lowered.endswith('.pdf')

            if 'requirement' in lowered and (is_pdf or lowered.endswith('.txt')):
                file_type, bucket = 'requirements', requirements_files
            elif is_pdf:
                file_type, bucket = 'packaging', packaging_files
            else:
                continue

            file_path = os.path.join(root, file)
            bucket.append({
                'name': os.path.relpath(file_path, base_path),
                'path': file_path,
                'type': file_type,
            })

    return requirements_files, packaging_files
|
| 52 |
+
|
| 53 |
+
def load_file_content(file_info):
    """Load content from a file based on its type.

    Requirements files are read as UTF-8 text, except PDFs: the directory
    scan registers PDF requirement documents with type 'requirements' too,
    and decoding those as text fails, so any ``.pdf`` path is read as bytes.
    All other files are read as bytes.

    Args:
        file_info: Dict with keys 'type', 'path' and 'name'.

    Returns:
        ``str`` for text requirements, ``bytes`` for PDFs/packaging files, or
        ``None`` if loading failed (the error is shown via ``st.error``).
    """
    try:
        path = file_info['path']
        read_as_text = (
            file_info['type'] == 'requirements'
            and not path.lower().endswith('.pdf')
        )

        if read_as_text:
            # For plain-text requirements files, read as text
            with open(path, 'r', encoding='utf-8') as f:
                return f.read()

        # For packaging files and PDF requirements, return bytes
        with open(path, 'rb') as f:
            return f.read()
    except Exception as e:
        # Best-effort loader: report in the UI and let the caller handle None
        st.error(f"Error loading file {file_info['name']}: {str(e)}")
        return None
|
| 67 |
+
|
| 68 |
+
def load_requirements_content(file_info):
    """Read a requirements file as UTF-8 text.

    Args:
        file_info: Dict with keys 'path' (file to read) and 'name' (used in
            the error message).

    Returns:
        The file's text, or ``None`` when reading fails; failures are
        reported through ``st.error``.
    """
    try:
        with open(file_info['path'], mode='r', encoding='utf-8') as reader:
            text = reader.read()
    except Exception as err:
        st.error(f"Error loading requirements file {file_info['name']}: {str(err)}")
        return None
    return text
|
| 76 |
+
|
| 77 |
+
def load_packaging_content(file_info):
    """Read a packaging file as raw bytes.

    Args:
        file_info: Dict with keys 'path' (file to read) and 'name' (used in
            the error message).

    Returns:
        The file's bytes, or ``None`` when reading fails; failures are
        reported through ``st.error``.
    """
    try:
        with open(file_info['path'], mode='rb') as reader:
            payload = reader.read()
    except Exception as err:
        st.error(f"Error loading packaging file {file_info['name']}: {str(err)}")
        return None
    return payload
|
| 85 |
+
|
| 86 |
+
def main():
|
| 87 |
+
st.set_page_config(layout="wide", page_title="Packaging Compliance Checker")
|
| 88 |
+
|
| 89 |
+
# Load client requirements files
|
| 90 |
+
client_requirements_files, client_packaging_files = load_client_requirements_files()
|
| 91 |
+
|
| 92 |
+
# Initialize session state variables
|
| 93 |
+
if "requirements_text" not in st.session_state:
|
| 94 |
+
st.session_state.requirements_text = None
|
| 95 |
+
if "analysis_results" not in st.session_state:
|
| 96 |
+
st.session_state.analysis_results = None
|
| 97 |
+
if "current_requirements_file" not in st.session_state:
|
| 98 |
+
st.session_state.current_requirements_file = None
|
| 99 |
+
if "uploaded_packaging_files" not in st.session_state:
|
| 100 |
+
st.session_state.uploaded_packaging_files = []
|
| 101 |
+
if "selected_packaging_file" not in st.session_state:
|
| 102 |
+
st.session_state.selected_packaging_file = None
|
| 103 |
+
if "client_requirements_files" not in st.session_state:
|
| 104 |
+
st.session_state.client_requirements_files = client_requirements_files
|
| 105 |
+
if "client_packaging_files" not in st.session_state:
|
| 106 |
+
st.session_state.client_packaging_files = client_packaging_files
|
| 107 |
+
|
| 108 |
+
st.title("Packaging Compliance Checker")
|
| 109 |
+
st.write(
|
| 110 |
+
"Upload a requirements document (plain text) that specifies requirements, "
|
| 111 |
+
"and then upload one or more packaging PDFs to check for compliance."
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
# Create two columns for the layout
|
| 115 |
+
col1, col2 = st.columns([1, 1])
|
| 116 |
+
|
| 117 |
+
with col1:
|
| 118 |
+
# Stylish upload section with custom CSS
|
| 119 |
+
st.markdown("""
|
| 120 |
+
<style>
|
| 121 |
+
.upload-section {
|
| 122 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 123 |
+
padding: 20px;
|
| 124 |
+
border-radius: 15px;
|
| 125 |
+
color: white;
|
| 126 |
+
margin-bottom: 20px;
|
| 127 |
+
}
|
| 128 |
+
.upload-title {
|
| 129 |
+
font-size: 24px;
|
| 130 |
+
font-weight: bold;
|
| 131 |
+
margin-bottom: 15px;
|
| 132 |
+
text-align: center;
|
| 133 |
+
}
|
| 134 |
+
.upload-description {
|
| 135 |
+
font-size: 14px;
|
| 136 |
+
opacity: 0.9;
|
| 137 |
+
margin-bottom: 20px;
|
| 138 |
+
text-align: center;
|
| 139 |
+
}
|
| 140 |
+
.file-uploader {
|
| 141 |
+
background: rgba(255, 255, 255, 0.1);
|
| 142 |
+
border: 2px dashed rgba(255, 255, 255, 0.3);
|
| 143 |
+
border-radius: 10px;
|
| 144 |
+
padding: 15px;
|
| 145 |
+
margin-bottom: 15px;
|
| 146 |
+
}
|
| 147 |
+
.requirements-display {
|
| 148 |
+
background: rgba(255, 255, 255, 0.05);
|
| 149 |
+
border-radius: 10px;
|
| 150 |
+
padding: 15px;
|
| 151 |
+
margin-top: 15px;
|
| 152 |
+
}
|
| 153 |
+
.artwork-display {
|
| 154 |
+
background: rgba(255, 255, 255, 0.05);
|
| 155 |
+
border-radius: 10px;
|
| 156 |
+
padding: 15px;
|
| 157 |
+
margin-top: 15px;
|
| 158 |
+
}
|
| 159 |
+
.image-container {
|
| 160 |
+
max-width: 100%;
|
| 161 |
+
border-radius: 8px;
|
| 162 |
+
overflow: hidden;
|
| 163 |
+
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
|
| 164 |
+
}
|
| 165 |
+
</style>
|
| 166 |
+
""", unsafe_allow_html=True)
|
| 167 |
+
|
| 168 |
+
# Upload section container
|
| 169 |
+
st.markdown('<div class="upload-section">', unsafe_allow_html=True)
|
| 170 |
+
st.markdown('<div class="upload-title">📄 Document Upload</div>', unsafe_allow_html=True)
|
| 171 |
+
st.markdown('<div class="upload-description">Upload your requirements and packaging documents for compliance analysis</div>', unsafe_allow_html=True)
|
| 172 |
+
|
| 173 |
+
# Requirements file selection
|
| 174 |
+
st.markdown('<div class="file-uploader">', unsafe_allow_html=True)
|
| 175 |
+
st.markdown("**📋 Requirements Document**")
|
| 176 |
+
|
| 177 |
+
# Create tabs for client files vs upload
|
| 178 |
+
req_tab1, req_tab2 = st.tabs(["📁 Client Files", "📤 Upload New"])
|
| 179 |
+
|
| 180 |
+
with req_tab1:
|
| 181 |
+
if st.session_state.client_requirements_files:
|
| 182 |
+
req_options = ["Select a requirements file..."] + [f["name"] for f in st.session_state.client_requirements_files]
|
| 183 |
+
selected_req_file = st.selectbox("Choose from client files:", req_options)
|
| 184 |
+
|
| 185 |
+
if selected_req_file != "Select a requirements file...":
|
| 186 |
+
# Find the selected file
|
| 187 |
+
selected_file_info = None
|
| 188 |
+
for file_info in st.session_state.client_requirements_files:
|
| 189 |
+
if file_info["name"] == selected_req_file:
|
| 190 |
+
selected_file_info = file_info
|
| 191 |
+
break
|
| 192 |
+
|
| 193 |
+
if selected_file_info:
|
| 194 |
+
# Load and process the requirements file
|
| 195 |
+
if selected_file_info["name"].lower().endswith('.pdf'):
|
| 196 |
+
# Handle PDF file - load as bytes
|
| 197 |
+
requirements_content = load_packaging_content(selected_file_info)
|
| 198 |
+
if requirements_content:
|
| 199 |
+
# Create a temporary file-like object for the RequirementsIngest
|
| 200 |
+
import io
|
| 201 |
+
temp_file = io.BytesIO(requirements_content)
|
| 202 |
+
temp_file.name = selected_file_info["name"]
|
| 203 |
+
else:
|
| 204 |
+
# Handle text file - load as text
|
| 205 |
+
requirements_content = load_requirements_content(selected_file_info)
|
| 206 |
+
if requirements_content:
|
| 207 |
+
# Create a temporary file-like object for the RequirementsIngest
|
| 208 |
+
import io
|
| 209 |
+
temp_file = io.StringIO(requirements_content)
|
| 210 |
+
temp_file.name = selected_file_info["name"]
|
| 211 |
+
|
| 212 |
+
st.session_state.requirements_text = RequirementsIngest().ingest_requirements_document(temp_file)
|
| 213 |
+
st.session_state.current_requirements_file = temp_file
|
| 214 |
+
st.session_state.analysis_results = None # Clear previous results
|
| 215 |
+
|
| 216 |
+
# Display file type information
|
| 217 |
+
if isinstance(st.session_state.requirements_text, dict):
|
| 218 |
+
file_type = st.session_state.requirements_text.get('type', 'unknown')
|
| 219 |
+
if file_type == 'pdf':
|
| 220 |
+
st.success(f"✅ Loaded PDF requirements from: {selected_req_file}")
|
| 221 |
+
st.info("📄 PDF will be processed natively by Claude for full visual analysis")
|
| 222 |
+
else:
|
| 223 |
+
st.success(f"✅ Loaded requirements from: {selected_req_file}")
|
| 224 |
+
else:
|
| 225 |
+
st.success(f"✅ Loaded requirements from: {selected_req_file}")
|
| 226 |
+
else:
|
| 227 |
+
st.info("No client requirements files found")
|
| 228 |
+
|
| 229 |
+
with req_tab2:
|
| 230 |
+
requirements_file = st.file_uploader("Upload Requirements Document (TXT or PDF)", type=["txt", "pdf"])
|
| 231 |
+
|
| 232 |
+
# Only process requirements if a new file is uploaded
|
| 233 |
+
if requirements_file and requirements_file != st.session_state.current_requirements_file:
|
| 234 |
+
st.session_state.requirements_text = RequirementsIngest().ingest_requirements_document(requirements_file)
|
| 235 |
+
st.session_state.current_requirements_file = requirements_file
|
| 236 |
+
st.session_state.analysis_results = None # Clear previous results
|
| 237 |
+
|
| 238 |
+
# Display file type information
|
| 239 |
+
if isinstance(st.session_state.requirements_text, dict):
|
| 240 |
+
file_type = st.session_state.requirements_text.get('type', 'unknown')
|
| 241 |
+
file_size = st.session_state.requirements_text.get('file_size', 0)
|
| 242 |
+
if file_type == 'pdf':
|
| 243 |
+
st.success(f"✅ Uploaded PDF requirements: {requirements_file.name} ({file_size:,} bytes)")
|
| 244 |
+
st.info("📄 PDF will be processed natively by Claude for full visual analysis")
|
| 245 |
+
else:
|
| 246 |
+
st.success(f"✅ Uploaded requirements: {requirements_file.name}")
|
| 247 |
+
else:
|
| 248 |
+
st.success(f"✅ Uploaded requirements: {requirements_file.name}")
|
| 249 |
+
|
| 250 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 251 |
+
|
| 252 |
+
# Packaging files selection
|
| 253 |
+
st.markdown('<div class="file-uploader">', unsafe_allow_html=True)
|
| 254 |
+
st.markdown("**📦 Packaging PDFs**")
|
| 255 |
+
|
| 256 |
+
# Create tabs for client files vs upload
|
| 257 |
+
pkg_tab1, pkg_tab2 = st.tabs(["📁 Client Files", "📤 Upload New"])
|
| 258 |
+
|
| 259 |
+
with pkg_tab1:
|
| 260 |
+
if st.session_state.client_packaging_files:
|
| 261 |
+
pkg_options = ["Select packaging files..."] + [f["name"] for f in st.session_state.client_packaging_files]
|
| 262 |
+
selected_pkg_files = st.multiselect("Choose from client files:", pkg_options[1:]) # Skip the placeholder
|
| 263 |
+
|
| 264 |
+
if selected_pkg_files:
|
| 265 |
+
# Convert selected client files to file-like objects
|
| 266 |
+
client_file_objects = []
|
| 267 |
+
for selected_file_name in selected_pkg_files:
|
| 268 |
+
# Find the selected file
|
| 269 |
+
for file_info in st.session_state.client_packaging_files:
|
| 270 |
+
if file_info["name"] == selected_file_name:
|
| 271 |
+
# Create a file-like object
|
| 272 |
+
import io
|
| 273 |
+
file_content = load_packaging_content(file_info)
|
| 274 |
+
if file_content:
|
| 275 |
+
temp_file = io.BytesIO(file_content)
|
| 276 |
+
temp_file.name = file_info["name"]
|
| 277 |
+
client_file_objects.append(temp_file)
|
| 278 |
+
break
|
| 279 |
+
|
| 280 |
+
st.session_state.uploaded_packaging_files = client_file_objects
|
| 281 |
+
# Set the first file as selected if none is selected
|
| 282 |
+
if not st.session_state.selected_packaging_file and client_file_objects:
|
| 283 |
+
st.session_state.selected_packaging_file = client_file_objects[0]
|
| 284 |
+
st.success(f"✅ Loaded {len(client_file_objects)} packaging files from client directory")
|
| 285 |
+
else:
|
| 286 |
+
st.info("No client packaging files found")
|
| 287 |
+
|
| 288 |
+
with pkg_tab2:
|
| 289 |
+
packaging_files = st.file_uploader("Upload Packaging PDFs", type=["pdf"], accept_multiple_files=True)
|
| 290 |
+
|
| 291 |
+
# Update uploaded files list when new files are uploaded
|
| 292 |
+
if packaging_files:
|
| 293 |
+
st.session_state.uploaded_packaging_files = packaging_files
|
| 294 |
+
# Set the first file as selected if none is selected
|
| 295 |
+
if not st.session_state.selected_packaging_file and packaging_files:
|
| 296 |
+
st.session_state.selected_packaging_file = packaging_files[0]
|
| 297 |
+
st.success(f"✅ Uploaded {len(packaging_files)} packaging files")
|
| 298 |
+
else:
|
| 299 |
+
# Only clear if no files are selected from client directory either
|
| 300 |
+
if not st.session_state.uploaded_packaging_files:
|
| 301 |
+
st.session_state.selected_packaging_file = None
|
| 302 |
+
|
| 303 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 304 |
+
|
| 305 |
+
# File selector for multiple packaging files
|
| 306 |
+
if st.session_state.uploaded_packaging_files:
|
| 307 |
+
st.markdown('<div class="file-uploader">', unsafe_allow_html=True)
|
| 308 |
+
file_names = [f.name for f in st.session_state.uploaded_packaging_files]
|
| 309 |
+
selected_file_name = st.selectbox(
|
| 310 |
+
"Select packaging file to display:",
|
| 311 |
+
file_names,
|
| 312 |
+
index=file_names.index(st.session_state.selected_packaging_file.name) if st.session_state.selected_packaging_file else 0
|
| 313 |
+
)
|
| 314 |
+
|
| 315 |
+
# Update selected file
|
| 316 |
+
for file in st.session_state.uploaded_packaging_files:
|
| 317 |
+
if file.name == selected_file_name:
|
| 318 |
+
st.session_state.selected_packaging_file = file
|
| 319 |
+
break
|
| 320 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 326 |
+
|
| 327 |
+
# Requirements display section
|
| 328 |
+
if st.session_state.requirements_text:
|
| 329 |
+
st.markdown('<div class="requirements-display">', unsafe_allow_html=True)
|
| 330 |
+
with st.expander("📋 Requirements Document", expanded=True):
|
| 331 |
+
if isinstance(st.session_state.requirements_text, dict):
|
| 332 |
+
# PDF requirements
|
| 333 |
+
file_type = st.session_state.requirements_text.get('type', 'unknown')
|
| 334 |
+
filename = st.session_state.requirements_text.get('filename', 'Unknown')
|
| 335 |
+
file_size = st.session_state.requirements_text.get('file_size', 0)
|
| 336 |
+
|
| 337 |
+
st.markdown(f"**File Type:** {file_type.upper()}")
|
| 338 |
+
st.markdown(f"**Filename:** {filename}")
|
| 339 |
+
st.markdown(f"**File Size:** {file_size:,} bytes")
|
| 340 |
+
|
| 341 |
+
if file_type == 'pdf':
|
| 342 |
+
st.info("📄 This PDF will be processed natively by Claude for full visual analysis including charts, graphs, and visual layouts.")
|
| 343 |
+
st.markdown("**Preview Text:**")
|
| 344 |
+
st.text_area("Requirements Text", st.session_state.requirements_text.get('text_content', ''), height=200)
|
| 345 |
+
else:
|
| 346 |
+
st.text_area("Requirements Text", st.session_state.requirements_text.get('text_content', ''), height=200)
|
| 347 |
+
else:
|
| 348 |
+
# Text requirements (backward compatibility)
|
| 349 |
+
st.text_area("Requirements Text", st.session_state.requirements_text, height=200)
|
| 350 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 351 |
+
|
| 352 |
+
# Artwork display section
|
| 353 |
+
if st.session_state.selected_packaging_file:
|
| 354 |
+
st.markdown('<div class="artwork-display">', unsafe_allow_html=True)
|
| 355 |
+
with st.expander("🎨 Package Artwork", expanded=True):
|
| 356 |
+
try:
|
| 357 |
+
# Reset file pointer to beginning
|
| 358 |
+
st.session_state.selected_packaging_file.seek(0)
|
| 359 |
+
|
| 360 |
+
# Create a temporary file to process the PDF
|
| 361 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
|
| 362 |
+
tmp_file.write(st.session_state.selected_packaging_file.read())
|
| 363 |
+
tmp_pdf_path = tmp_file.name
|
| 364 |
+
|
| 365 |
+
# Convert PDF to image
|
| 366 |
+
try:
|
| 367 |
+
images = convert_from_path(tmp_pdf_path)
|
| 368 |
+
if not images:
|
| 369 |
+
raise ValueError("No pages found in PDF")
|
| 370 |
+
page_image = images[0] # Assuming single page for now
|
| 371 |
+
except Exception as e:
|
| 372 |
+
st.error(f"Error converting PDF to image: {str(e)}")
|
| 373 |
+
# Create a placeholder image
|
| 374 |
+
page_image = Image.new('RGB', (800, 600), color='white')
|
| 375 |
+
draw = ImageDraw.Draw(page_image)
|
| 376 |
+
draw.text((400, 300), "PDF conversion failed", fill='black', anchor='mm')
|
| 377 |
+
|
| 378 |
+
# Display the image with proportional sizing
|
| 379 |
+
st.markdown('<div class="image-container">', unsafe_allow_html=True)
|
| 380 |
+
st.image(page_image, caption=f"Package: {st.session_state.selected_packaging_file.name}", use_container_width=True)
|
| 381 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 382 |
+
|
| 383 |
+
# Clean up temporary file
|
| 384 |
+
if os.path.exists(tmp_pdf_path):
|
| 385 |
+
os.unlink(tmp_pdf_path)
|
| 386 |
+
|
| 387 |
+
except Exception as e:
|
| 388 |
+
st.error(f"Error displaying package artwork: {str(e)}")
|
| 389 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 390 |
+
|
| 391 |
+
with col2:
|
| 392 |
+
# Compliance guidelines section
|
| 393 |
+
st.markdown("""
|
| 394 |
+
<style>
|
| 395 |
+
.compliance-section {
|
| 396 |
+
background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
|
| 397 |
+
padding: 20px;
|
| 398 |
+
border-radius: 15px;
|
| 399 |
+
color: white;
|
| 400 |
+
height: 100%;
|
| 401 |
+
}
|
| 402 |
+
.compliance-title {
|
| 403 |
+
font-size: 24px;
|
| 404 |
+
font-weight: bold;
|
| 405 |
+
margin-bottom: 15px;
|
| 406 |
+
text-align: center;
|
| 407 |
+
}
|
| 408 |
+
.compliance-content {
|
| 409 |
+
background: rgba(255, 255, 255, 0.1);
|
| 410 |
+
border-radius: 10px;
|
| 411 |
+
padding: 15px;
|
| 412 |
+
margin-top: 15px;
|
| 413 |
+
}
|
| 414 |
+
.status-compliant {
|
| 415 |
+
background: rgba(76, 175, 80, 0.2);
|
| 416 |
+
border-left: 4px solid #4CAF50;
|
| 417 |
+
padding: 10px;
|
| 418 |
+
margin: 10px 0;
|
| 419 |
+
border-radius: 5px;
|
| 420 |
+
}
|
| 421 |
+
.status-partial {
|
| 422 |
+
background: rgba(255, 193, 7, 0.2);
|
| 423 |
+
border-left: 4px solid #FFC107;
|
| 424 |
+
padding: 10px;
|
| 425 |
+
margin: 10px 0;
|
| 426 |
+
border-radius: 5px;
|
| 427 |
+
}
|
| 428 |
+
.status-non-compliant {
|
| 429 |
+
background: rgba(244, 67, 54, 0.2);
|
| 430 |
+
border-left: 4px solid #F44336;
|
| 431 |
+
padding: 10px;
|
| 432 |
+
margin: 10px 0;
|
| 433 |
+
border-radius: 5px;
|
| 434 |
+
}
|
| 435 |
+
</style>
|
| 436 |
+
""", unsafe_allow_html=True)
|
| 437 |
+
|
| 438 |
+
st.markdown('<div class="compliance-section">', unsafe_allow_html=True)
|
| 439 |
+
st.markdown('<div class="compliance-title">📋 Compliance Guidelines</div>', unsafe_allow_html=True)
|
| 440 |
+
|
| 441 |
+
# Read and display the compliance outline
|
| 442 |
+
try:
|
| 443 |
+
with open("requirements_library/compliance_outline.txt", "r") as f:
|
| 444 |
+
outline_content = f.read()
|
| 445 |
+
|
| 446 |
+
st.markdown('<div class="compliance-content">', unsafe_allow_html=True)
|
| 447 |
+
|
| 448 |
+
# Parse and format the content for better display
|
| 449 |
+
lines = outline_content.strip().split('\n')
|
| 450 |
+
current_section = ""
|
| 451 |
+
|
| 452 |
+
for line in lines:
|
| 453 |
+
line = line.strip()
|
| 454 |
+
if not line:
|
| 455 |
+
continue
|
| 456 |
+
|
| 457 |
+
if line == "Compliance Outline":
|
| 458 |
+
st.markdown("**📋 Compliance Outline**")
|
| 459 |
+
elif line == "Compliant":
|
| 460 |
+
st.markdown('<div class="status-compliant">', unsafe_allow_html=True)
|
| 461 |
+
st.markdown("🟢 **Compliant**")
|
| 462 |
+
current_section = "compliant"
|
| 463 |
+
elif line == "Partially Compliant":
|
| 464 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 465 |
+
st.markdown('<div class="status-partial">', unsafe_allow_html=True)
|
| 466 |
+
st.markdown("🟡 **Partially Compliant**")
|
| 467 |
+
current_section = "partial"
|
| 468 |
+
elif line == "Non-Compliant":
|
| 469 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 470 |
+
st.markdown('<div class="status-non-compliant">', unsafe_allow_html=True)
|
| 471 |
+
st.markdown("🔴 **Non-Compliant**")
|
| 472 |
+
current_section = "non_compliant"
|
| 473 |
+
elif line.startswith("> "):
|
| 474 |
+
# Description line
|
| 475 |
+
description = line[2:] # Remove "> "
|
| 476 |
+
st.markdown(f"*{description}*")
|
| 477 |
+
elif line == "Example Criteria:":
|
| 478 |
+
st.markdown("**Example Criteria:**")
|
| 479 |
+
elif line.startswith("- "):
|
| 480 |
+
# Criteria item
|
| 481 |
+
criteria = line[2:] # Remove "- "
|
| 482 |
+
st.markdown(f"• {criteria}")
|
| 483 |
+
elif line and not line.startswith("Example Criteria:"):
|
| 484 |
+
# Any other content
|
| 485 |
+
st.markdown(line)
|
| 486 |
+
|
| 487 |
+
# Close the last status div
|
| 488 |
+
if current_section:
|
| 489 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 490 |
+
|
| 491 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 492 |
+
|
| 493 |
+
except FileNotFoundError:
|
| 494 |
+
st.error("Compliance outline file not found")
|
| 495 |
+
except Exception as e:
|
| 496 |
+
st.error(f"Error reading compliance outline: {e}")
|
| 497 |
+
|
| 498 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 499 |
+
|
| 500 |
+
# Model selection
|
| 501 |
+
# model_option = st.selectbox(
|
| 502 |
+
# "Select Claude Model",
|
| 503 |
+
# ["claude-sonnet-4-20250514", "claude-3-5-haiku-20241022"]
|
| 504 |
+
# )
|
| 505 |
+
model_option = "claude-sonnet-4-20250514"
|
| 506 |
+
|
| 507 |
+
# Analysis button
|
| 508 |
+
if st.button("Analyze Compliance"):
|
| 509 |
+
if st.session_state.requirements_text and st.session_state.uploaded_packaging_files:
|
| 510 |
+
for packaging_file in st.session_state.uploaded_packaging_files:
|
| 511 |
+
st.markdown(f"## Analyzing: {packaging_file.name}")
|
| 512 |
+
|
| 513 |
+
# Create a progress bar
|
| 514 |
+
progress_bar = st.progress(0)
|
| 515 |
+
status_text = st.empty()
|
| 516 |
+
|
| 517 |
+
# Save the uploaded PDF temporarily.
|
| 518 |
+
# Reset file pointer to beginning
|
| 519 |
+
packaging_file.seek(0)
|
| 520 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
|
| 521 |
+
tmp_file.write(packaging_file.read())
|
| 522 |
+
tmp_pdf_path = tmp_file.name
|
| 523 |
+
|
| 524 |
+
try:
|
| 525 |
+
# Ingest the packaging document.
|
| 526 |
+
status_text.text("Extracting text from packaging PDF...")
|
| 527 |
+
google_document_api = GoogleDocumentAPI(credentials_path="src/extract_text/photon-services-f0d3ec1417d0.json")
|
| 528 |
+
document = google_document_api.process_document(tmp_pdf_path)
|
| 529 |
+
packaging_text = google_document_api.extract_text_with_markdown_table(document)
|
| 530 |
+
packaging_data = google_document_api.extract_text_with_bounding_boxes(document)
|
| 531 |
+
progress_bar.progress(25)
|
| 532 |
+
|
| 533 |
+
# Process image once and store it efficiently
|
| 534 |
+
status_text.text("Processing packaging image...")
|
| 535 |
+
try:
|
| 536 |
+
images = convert_from_path(tmp_pdf_path)
|
| 537 |
+
if not images:
|
| 538 |
+
raise ValueError("No pages found in PDF")
|
| 539 |
+
page_image = images[0] # Assuming single page for now
|
| 540 |
+
except Exception as e:
|
| 541 |
+
st.error(f"Error converting PDF to image: {str(e)}")
|
| 542 |
+
# Create a placeholder image
|
| 543 |
+
page_image = Image.new('RGB', (800, 600), color='white')
|
| 544 |
+
draw = ImageDraw.Draw(page_image)
|
| 545 |
+
draw.text((400, 300), "PDF conversion failed", fill='black', anchor='mm')
|
| 546 |
+
# Convert to base64 once for analysis
|
| 547 |
+
buffer = BytesIO()
|
| 548 |
+
page_image.save(buffer, format='PNG')
|
| 549 |
+
image_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
|
| 550 |
+
|
| 551 |
+
# Scan for barcodes
|
| 552 |
+
status_text.text("Scanning for barcodes...")
|
| 553 |
+
barcode = Barcode()
|
| 554 |
+
barcode_results = barcode.scan_and_validate(page_image)
|
| 555 |
+
|
| 556 |
+
progress_bar.progress(40)
|
| 557 |
+
|
| 558 |
+
#Extract metadata from the PDF
|
| 559 |
+
status_text.text("Extracting metadata from packaging...")
|
| 560 |
+
metadata_extractor = PDFArtworkMetadataExtractor()
|
| 561 |
+
metadata_results = metadata_extractor.extract_metadata(tmp_pdf_path)
|
| 562 |
+
|
| 563 |
+
# Convert tuple keys to strings for JSON serialization
|
| 564 |
+
if metadata_results and not metadata_results.get('error'):
|
| 565 |
+
if 'text_colors' in metadata_results:
|
| 566 |
+
# Convert color tuples to string representation
|
| 567 |
+
text_colors_str = {}
|
| 568 |
+
for color_tuple, count in metadata_results['text_colors'].items():
|
| 569 |
+
if isinstance(color_tuple, tuple):
|
| 570 |
+
color_str = f"RGB{color_tuple}"
|
| 571 |
+
else:
|
| 572 |
+
color_str = str(color_tuple)
|
| 573 |
+
text_colors_str[color_str] = count
|
| 574 |
+
metadata_results['text_colors'] = text_colors_str
|
| 575 |
+
|
| 576 |
+
progress_bar.progress(50)
|
| 577 |
+
|
| 578 |
+
# Call the enhanced analyze_compliance method with the raw text documents and metadata
|
| 579 |
+
status_text.text("Analyzing requirements and compliance...")
|
| 580 |
+
st.session_state.analysis_results = ComplianceAnalysis().analyze_compliance(
|
| 581 |
+
st.session_state.requirements_text,
|
| 582 |
+
packaging_text,
|
| 583 |
+
packaging_data,
|
| 584 |
+
image_base64,
|
| 585 |
+
barcode_results,
|
| 586 |
+
metadata_results,
|
| 587 |
+
model=model_option
|
| 588 |
+
)
|
| 589 |
+
progress_bar.progress(100)
|
| 590 |
+
status_text.text("Analysis complete!")
|
| 591 |
+
|
| 592 |
+
# Display the structured results
|
| 593 |
+
st.markdown("### Extracted Requirements")
|
| 594 |
+
if "requirements" in st.session_state.analysis_results:
|
| 595 |
+
req_df = pd.DataFrame(st.session_state.analysis_results["requirements"])
|
| 596 |
+
st.dataframe(req_df)
|
| 597 |
+
|
| 598 |
+
st.markdown("### Verification Results")
|
| 599 |
+
if "verifications" in st.session_state.analysis_results:
|
| 600 |
+
# Create tabs for different views of the results
|
| 601 |
+
tabs = st.tabs(["Summary", "Detailed Results"])
|
| 602 |
+
|
| 603 |
+
with tabs[0]:
|
| 604 |
+
# Count compliance statuses
|
| 605 |
+
if "verifications" in st.session_state.analysis_results:
|
| 606 |
+
statuses = [v.get("compliance_status", "UNKNOWN") for v in st.session_state.analysis_results["verifications"]]
|
| 607 |
+
compliant = statuses.count("COMPLIANT")
|
| 608 |
+
non_compliant = statuses.count("NON-COMPLIANT")
|
| 609 |
+
partial = statuses.count("PARTIALLY COMPLIANT")
|
| 610 |
+
error = len(statuses) - compliant - non_compliant - partial
|
| 611 |
+
|
| 612 |
+
# Create columns for status counts
|
| 613 |
+
col1, col2, col3, col4 = st.columns(4)
|
| 614 |
+
col1.metric("Compliant", compliant)
|
| 615 |
+
col2.metric("Non-Compliant", non_compliant)
|
| 616 |
+
col3.metric("Partially Compliant", partial)
|
| 617 |
+
col4.metric("Errors", error)
|
| 618 |
+
|
| 619 |
+
# Display the overall compliance report
|
| 620 |
+
if "compliance_report" in st.session_state.analysis_results:
|
| 621 |
+
st.markdown(st.session_state.analysis_results["compliance_report"])
|
| 622 |
+
|
| 623 |
+
with tabs[1]:
|
| 624 |
+
st.markdown("### Barcode Scanning Results")
|
| 625 |
+
if "barcode_data" in st.session_state.analysis_results and st.session_state.analysis_results["barcode_data"]:
|
| 626 |
+
barcode_df = pd.DataFrame(st.session_state.analysis_results["barcode_data"])
|
| 627 |
+
st.dataframe(barcode_df)
|
| 628 |
+
|
| 629 |
+
# Display barcode summary
|
| 630 |
+
valid_barcodes = sum(1 for barcode in st.session_state.analysis_results["barcode_data"] if barcode["valid"])
|
| 631 |
+
total_barcodes = len(st.session_state.analysis_results["barcode_data"])
|
| 632 |
+
st.markdown(f"**Barcode Summary:** {valid_barcodes}/{total_barcodes} valid barcodes found")
|
| 633 |
+
else:
|
| 634 |
+
st.info("No barcodes found in the packaging")
|
| 635 |
+
|
| 636 |
+
# Display metadata results
|
| 637 |
+
st.markdown("### Typography and Design Metadata")
|
| 638 |
+
if "metadata" in st.session_state.analysis_results and st.session_state.analysis_results["metadata"]:
|
| 639 |
+
metadata = st.session_state.analysis_results["metadata"]
|
| 640 |
+
|
| 641 |
+
if metadata.get('error'):
|
| 642 |
+
st.error(f"Metadata extraction error: {metadata['error']}")
|
| 643 |
+
else:
|
| 644 |
+
# Display metadata summary
|
| 645 |
+
col1, col2 = st.columns(2)
|
| 646 |
+
|
| 647 |
+
with col1:
|
| 648 |
+
st.markdown("**Extraction Info:**")
|
| 649 |
+
st.write(f"**Method:** {metadata.get('extraction_method', 'Unknown')}")
|
| 650 |
+
st.write(f"**Selectable Text:** {'Yes' if metadata.get('has_selectable_text') else 'No'}")
|
| 651 |
+
st.write(f"**Pages Processed:** {metadata.get('pages_processed', 0)}")
|
| 652 |
+
|
| 653 |
+
with col2:
|
| 654 |
+
st.markdown("**Dominant Elements:**")
|
| 655 |
+
if metadata.get('fonts'):
|
| 656 |
+
dominant_font = max(metadata['fonts'].items(), key=lambda x: x[1])[0]
|
| 657 |
+
st.write(f"**Font:** {dominant_font}")
|
| 658 |
+
if metadata.get('font_sizes'):
|
| 659 |
+
dominant_size = max(metadata['font_sizes'].items(), key=lambda x: x[1])[0]
|
| 660 |
+
st.write(f"**Font Size:** {dominant_size:.1f}pt")
|
| 661 |
+
if metadata.get('text_colors'):
|
| 662 |
+
dominant_color = max(metadata['text_colors'].items(), key=lambda x: x[1])[0]
|
| 663 |
+
st.write(f"**Text Color:** {dominant_color}")
|
| 664 |
+
|
| 665 |
+
# Display detailed metadata in expandable sections
|
| 666 |
+
with st.expander("📊 Detailed Font Analysis"):
|
| 667 |
+
if metadata.get('fonts'):
|
| 668 |
+
font_df = pd.DataFrame([
|
| 669 |
+
{'Font': font, 'Character Count': count}
|
| 670 |
+
for font, count in list(metadata['fonts'].items())[:10] # Top 10
|
| 671 |
+
])
|
| 672 |
+
st.dataframe(font_df)
|
| 673 |
+
else:
|
| 674 |
+
st.info("No font data available")
|
| 675 |
+
|
| 676 |
+
with st.expander("📏 Font Size Distribution"):
|
| 677 |
+
if metadata.get('font_sizes'):
|
| 678 |
+
size_df = pd.DataFrame([
|
| 679 |
+
{'Font Size (pt)': f"{size:.1f}", 'Character Count': count}
|
| 680 |
+
for size, count in list(metadata['font_sizes'].items())[:10] # Top 10
|
| 681 |
+
])
|
| 682 |
+
st.dataframe(size_df)
|
| 683 |
+
else:
|
| 684 |
+
st.info("No font size data available")
|
| 685 |
+
|
| 686 |
+
with st.expander("🎨 Text Color Analysis"):
|
| 687 |
+
if metadata.get('text_colors'):
|
| 688 |
+
color_df = pd.DataFrame([
|
| 689 |
+
{'Color (RGB)': str(color), 'Character Count': count}
|
| 690 |
+
for color, count in list(metadata['text_colors'].items())[:10] # Top 10
|
| 691 |
+
])
|
| 692 |
+
st.dataframe(color_df)
|
| 693 |
+
else:
|
| 694 |
+
st.info("No color data available")
|
| 695 |
+
else:
|
| 696 |
+
st.info("No metadata available")
|
| 697 |
+
|
| 698 |
+
# Show detailed verification results
|
| 699 |
+
for i, verification in enumerate(st.session_state.analysis_results["verifications"]):
|
| 700 |
+
req_id = verification.get("requirement_id", f"REQ{i+1}")
|
| 701 |
+
text_id = verification.get("Text ID", "Unknown")
|
| 702 |
+
status = verification.get("compliance_status", "UNKNOWN")
|
| 703 |
+
|
| 704 |
+
# Color-code status
|
| 705 |
+
if status == "COMPLIANT":
|
| 706 |
+
status_color = "green"
|
| 707 |
+
elif status == "NON-COMPLIANT":
|
| 708 |
+
status_color = "red"
|
| 709 |
+
elif status == "PARTIALLY COMPLIANT":
|
| 710 |
+
status_color = "orange"
|
| 711 |
+
else:
|
| 712 |
+
status_color = "gray"
|
| 713 |
+
|
| 714 |
+
with st.expander(f"{req_id}: {status}", expanded=status != "COMPLIANT"):
|
| 715 |
+
# Show confidence score if available
|
| 716 |
+
if "confidence" in verification:
|
| 717 |
+
st.progress(verification["confidence"])
|
| 718 |
+
|
| 719 |
+
# Show reasoning
|
| 720 |
+
if "reasoning" in verification:
|
| 721 |
+
st.markdown(f"**Reasoning:** {verification['reasoning']}")
|
| 722 |
+
|
| 723 |
+
# Show criteria if available
|
| 724 |
+
if "criteria" in verification and verification["criteria"]:
|
| 725 |
+
st.markdown("**Criteria:**")
|
| 726 |
+
for criterion in verification["criteria"]:
|
| 727 |
+
st.markdown(f"- {criterion}")
|
| 728 |
+
|
| 729 |
+
# Show evidence if available
|
| 730 |
+
if "evidence_found" in verification and verification["evidence_found"]:
|
| 731 |
+
st.markdown("**Evidence Found:**")
|
| 732 |
+
|
| 733 |
+
# Separate text, visual, and barcode evidence
|
| 734 |
+
text_evidence = []
|
| 735 |
+
visual_evidence = []
|
| 736 |
+
barcode_evidence = []
|
| 737 |
+
|
| 738 |
+
for evidence in verification["evidence_found"]:
|
| 739 |
+
if "text_id" in evidence and evidence["text_id"] is not None:
|
| 740 |
+
text_evidence.append(evidence)
|
| 741 |
+
elif "barcode_id" in evidence and evidence["barcode_id"] is not None:
|
| 742 |
+
barcode_evidence.append(evidence)
|
| 743 |
+
else:
|
| 744 |
+
visual_evidence.append(evidence)
|
| 745 |
+
|
| 746 |
+
# Display text evidence
|
| 747 |
+
if text_evidence:
|
| 748 |
+
st.markdown("**Text Evidence:**")
|
| 749 |
+
for evidence in text_evidence:
|
| 750 |
+
text_id = evidence.get("text_id", "Unknown")
|
| 751 |
+
evidence_text = evidence.get("evidence_text", "No description")
|
| 752 |
+
st.markdown(f"- **Text ID {text_id}:** {evidence_text}")
|
| 753 |
+
|
| 754 |
+
# Display barcode evidence
|
| 755 |
+
if barcode_evidence:
|
| 756 |
+
st.markdown("**Barcode Evidence:**")
|
| 757 |
+
for evidence in barcode_evidence:
|
| 758 |
+
barcode_id = evidence.get("barcode_id", "Unknown")
|
| 759 |
+
evidence_text = evidence.get("evidence_text", "No description")
|
| 760 |
+
st.markdown(f"- **Barcode ID {barcode_id}:** {evidence_text}")
|
| 761 |
+
|
| 762 |
+
# Display visual evidence
|
| 763 |
+
if visual_evidence:
|
| 764 |
+
st.markdown("**Visual Evidence (from image analysis):**")
|
| 765 |
+
for i, evidence in enumerate(visual_evidence, 1):
|
| 766 |
+
evidence_text = evidence.get("evidence_text", "Visual element referenced by Claude")
|
| 767 |
+
st.markdown(f"- **Visual {i}:** {evidence_text}")
|
| 768 |
+
|
| 769 |
+
# Show summary
|
| 770 |
+
total_evidence = len(verification["evidence_found"])
|
| 771 |
+
st.markdown(f"*Total evidence: {total_evidence} ({len(text_evidence)} text, {len(barcode_evidence)} barcode, {len(visual_evidence)} visual)*")
|
| 772 |
+
|
| 773 |
+
# Individual visualization for this requirement
|
| 774 |
+
if "evidence_found" in verification and verification["evidence_found"]:
|
| 775 |
+
st.markdown(f"### Evidence Visualization for {req_id}")
|
| 776 |
+
|
| 777 |
+
# Create a copy of the image for drawing
|
| 778 |
+
try:
|
| 779 |
+
draw_image = page_image.copy()
|
| 780 |
+
draw = ImageDraw.Draw(draw_image)
|
| 781 |
+
img_width, img_height = draw_image.size
|
| 782 |
+
|
| 783 |
+
# Define colors for different compliance statuses
|
| 784 |
+
status_colors = {
|
| 785 |
+
"COMPLIANT": "green",
|
| 786 |
+
"NON-COMPLIANT": "red",
|
| 787 |
+
"PARTIALLY COMPLIANT": "orange",
|
| 788 |
+
"ERROR": "purple",
|
| 789 |
+
"UNKNOWN": "gray"
|
| 790 |
+
}
|
| 791 |
+
|
| 792 |
+
# Get color for this requirement's status
|
| 793 |
+
color = status_colors.get(status, "gray")
|
| 794 |
+
|
| 795 |
+
# Add a legend for this requirement
|
| 796 |
+
st.markdown(f"**Status:** <span style='color:{color}'>■</span> {status}", unsafe_allow_html=True)
|
| 797 |
+
|
| 798 |
+
# Track evidence types
|
| 799 |
+
text_evidence_count = 0
|
| 800 |
+
visual_evidence_count = 0
|
| 801 |
+
barcode_evidence_count = 0
|
| 802 |
+
|
| 803 |
+
# Draw evidence boxes for this specific requirement
|
| 804 |
+
if "packaging_data" in st.session_state.analysis_results:
|
| 805 |
+
for evidence in verification["evidence_found"]:
|
| 806 |
+
if "text_id" in evidence and evidence["text_id"] is not None:
|
| 807 |
+
# Handle text-based evidence with bounding boxes
|
| 808 |
+
text_id = evidence["text_id"]
|
| 809 |
+
try:
|
| 810 |
+
# Check if text_id is numeric for bounding box lookup
|
| 811 |
+
if isinstance(text_id, (int, float)) or (isinstance(text_id, str) and text_id.isdigit()):
|
| 812 |
+
# Text ID is 1-based, list is 0-based
|
| 813 |
+
numeric_id = int(text_id)
|
| 814 |
+
item = st.session_state.analysis_results["packaging_data"][numeric_id - 1]
|
| 815 |
+
box = item["bounding_box"]
|
| 816 |
+
|
| 817 |
+
# Denormalize vertices
|
| 818 |
+
points = [(v['x'] * img_width, v['y'] * img_height) for v in box]
|
| 819 |
+
|
| 820 |
+
# Draw polygon
|
| 821 |
+
draw.polygon(points, outline=color, width=3)
|
| 822 |
+
|
| 823 |
+
# Add a label with evidence number
|
| 824 |
+
text_evidence_count += 1
|
| 825 |
+
label = f"Text Evidence {text_evidence_count}"
|
| 826 |
+
draw.text(points[0], label, fill="white", stroke_width=2, stroke_fill="black")
|
| 827 |
+
else:
|
| 828 |
+
# Handle non-numeric text IDs (like barcode references)
|
| 829 |
+
text_evidence_count += 1
|
| 830 |
+
st.info(f"Text Evidence {text_evidence_count}: {evidence.get('evidence_text', 'Text element referenced by Claude')} (ID: {text_id})")
|
| 831 |
+
|
| 832 |
+
except (IndexError, KeyError) as e:
|
| 833 |
+
st.warning(f"Could not find bounding box for Text ID {text_id}: {e}")
|
| 834 |
+
elif "barcode_id" in evidence and evidence["barcode_id"] is not None:
|
| 835 |
+
# Handle barcode-based evidence with bounding boxes
|
| 836 |
+
barcode_id = evidence["barcode_id"]
|
| 837 |
+
try:
|
| 838 |
+
# Find the barcode in barcode_data
|
| 839 |
+
barcode_found = None
|
| 840 |
+
for barcode in st.session_state.analysis_results.get("barcode_data", []):
|
| 841 |
+
if barcode["id"] == barcode_id:
|
| 842 |
+
barcode_found = barcode
|
| 843 |
+
break
|
| 844 |
+
|
| 845 |
+
if barcode_found:
|
| 846 |
+
pos = barcode_found["position"]
|
| 847 |
+
x, y = pos["x"], pos["y"]
|
| 848 |
+
w, h = pos["width"], pos["height"]
|
| 849 |
+
|
| 850 |
+
# Draw rectangle for barcode
|
| 851 |
+
draw.rectangle([x, y, x + w, y + h], outline=color, width=3)
|
| 852 |
+
|
| 853 |
+
# Add a label with evidence number
|
| 854 |
+
barcode_evidence_count += 1
|
| 855 |
+
label = f"Barcode Evidence {barcode_evidence_count}"
|
| 856 |
+
draw.text((x, y - 20), label, fill="white", stroke_width=2, stroke_fill="black")
|
| 857 |
+
|
| 858 |
+
# Add barcode info
|
| 859 |
+
barcode_info = f"{barcode_found['type']}: {barcode_found['data']}"
|
| 860 |
+
draw.text((x, y - 40), barcode_info, fill="white", stroke_width=2, stroke_fill="black")
|
| 861 |
+
else:
|
| 862 |
+
st.warning(f"Could not find barcode data for Barcode ID {barcode_id}")
|
| 863 |
+
|
| 864 |
+
except Exception as e:
|
| 865 |
+
st.warning(f"Could not draw barcode bounding box for Barcode ID {barcode_id}: {e}")
|
| 866 |
+
else:
|
| 867 |
+
# Handle visual-only evidence (no text_id or barcode_id)
|
| 868 |
+
visual_evidence_count += 1
|
| 869 |
+
st.info(f"Visual Evidence {visual_evidence_count}: {evidence.get('evidence_text', 'Visual element referenced by Claude')}")
|
| 870 |
+
|
| 871 |
+
# Show the image if we have any evidence
|
| 872 |
+
if text_evidence_count > 0 or visual_evidence_count > 0 or barcode_evidence_count > 0:
|
| 873 |
+
# Add evidence count summary
|
| 874 |
+
evidence_summary = []
|
| 875 |
+
if text_evidence_count > 0:
|
| 876 |
+
evidence_summary.append(f"{text_evidence_count} text")
|
| 877 |
+
if barcode_evidence_count > 0:
|
| 878 |
+
evidence_summary.append(f"{barcode_evidence_count} barcode")
|
| 879 |
+
if visual_evidence_count > 0:
|
| 880 |
+
evidence_summary.append(f"{visual_evidence_count} visual")
|
| 881 |
+
|
| 882 |
+
st.markdown(f"**Evidence Count:** {', '.join(evidence_summary)}")
|
| 883 |
+
|
| 884 |
+
st.image(ImageUtils.crop_image(draw_image), caption=f"Evidence for {req_id} - {status}", use_container_width=True)
|
| 885 |
+
else:
|
| 886 |
+
st.info(f"No visual evidence found for {req_id}")
|
| 887 |
+
else:
|
| 888 |
+
# Handle case where no packaging data is available but we have evidence
|
| 889 |
+
evidence_counts = {
|
| 890 |
+
'text': len([e for e in verification["evidence_found"] if "text_id" in e and e["text_id"] is not None]),
|
| 891 |
+
'barcode': len([e for e in verification["evidence_found"] if "barcode_id" in e and e["barcode_id"] is not None]),
|
| 892 |
+
'visual': len([e for e in verification["evidence_found"] if ("text_id" not in e or e["text_id"] is None) and ("barcode_id" not in e or e["barcode_id"] is None)])
|
| 893 |
+
}
|
| 894 |
+
|
| 895 |
+
total_evidence = sum(evidence_counts.values())
|
| 896 |
+
if total_evidence > 0:
|
| 897 |
+
evidence_summary = []
|
| 898 |
+
if evidence_counts['text'] > 0:
|
| 899 |
+
evidence_summary.append(f"{evidence_counts['text']} text")
|
| 900 |
+
if evidence_counts['barcode'] > 0:
|
| 901 |
+
evidence_summary.append(f"{evidence_counts['barcode']} barcode")
|
| 902 |
+
if evidence_counts['visual'] > 0:
|
| 903 |
+
evidence_summary.append(f"{evidence_counts['visual']} visual")
|
| 904 |
+
|
| 905 |
+
st.info(f"Evidence Count: {', '.join(evidence_summary)} (no bounding box data available)")
|
| 906 |
+
# Show the original image without annotations
|
| 907 |
+
st.image(ImageUtils.crop_image(page_image), caption=f"Original image for {req_id} - {status}", use_container_width=True)
|
| 908 |
+
else:
|
| 909 |
+
st.info("No packaging data available for visualization")
|
| 910 |
+
|
| 911 |
+
except Exception as e:
|
| 912 |
+
st.error(f"Failed to generate visualization for {req_id}: {e}")
|
| 913 |
+
else:
|
| 914 |
+
st.info(f"No evidence found for {req_id}")
|
| 915 |
+
|
| 916 |
+
except Exception as e:
|
| 917 |
+
st.error(f"Error analyzing {packaging_file.name}: {str(e)}")
|
| 918 |
+
|
| 919 |
+
finally:
|
| 920 |
+
# Clean up the temporary file
|
| 921 |
+
if os.path.exists(tmp_pdf_path):
|
| 922 |
+
os.unlink(tmp_pdf_path)
|
| 923 |
+
else:
|
| 924 |
+
st.warning("Please upload a requirements document and at least one packaging PDF.")
|
| 925 |
+
|
| 926 |
+
# Add some helpful information at the bottom
|
| 927 |
+
st.markdown("---")
|
| 928 |
+
st.markdown("""
|
| 929 |
+
### How It Works
|
| 930 |
+
1. **Upload Requirements**: The system extracts structured requirements from your document
|
| 931 |
+
2. **Upload Packaging**: We extract text from PDFs and analyze them against requirements
|
| 932 |
+
3. **Analysis**: Each requirement is verified using structured reasoning and semantic matching
|
| 933 |
+
""")
|
| 934 |
+
|
| 935 |
+
if __name__ == "__main__":
|
| 936 |
+
# Script entry point: import pandas, then run main(). The import is placed here rather than at the top of the file (per the original note, to avoid issues with st.set_page_config). NOTE(review): main() calls pd.DataFrame, so it appears to rely on this import binding `pd` before main() runs; calling main() from an importing module would presumably raise NameError unless pandas is also imported elsewhere in the file - confirm.
|
| 937 |
+
import pandas as pd
|
| 938 |
+
main()
|
quick_test.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Quick test script for Google Document AI markdown table output.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# Add the src directory to the path
|
| 11 |
+
sys.path.append(str(Path(__file__).parent / "src"))
|
| 12 |
+
|
| 13 |
+
from extract_text.google_document_api import GoogleDocumentAPI
|
| 14 |
+
|
| 15 |
+
def quick_test(
    credentials_path="src/extract_text/photon-services-f0d3ec1417d0.json",
    test_pdf_path="requirements_library/client-requirements/Kir-Kat/kitkat-f1.pdf",
    output_path="quick_test_results.md",
):
    """Run a quick smoke test of the Document AI markdown table generation.

    The PDF is processed through ``GoogleDocumentAPI``, the first five text
    blocks are printed with their height, and the generated markdown table is
    printed and saved to *output_path*.

    Args:
        credentials_path: Path to the Google Cloud credentials JSON file.
        test_pdf_path: Path to the PDF that should be processed.
        output_path: File that receives the generated markdown table.

    Returns:
        None. Progress and failures are reported on stdout; a missing input
        file ends the run early and any processing error is caught and printed.
    """
    # Guard clauses: both inputs must exist before the API is touched.
    required_inputs = (
        ("Credentials file", credentials_path),
        ("Test PDF file", test_pdf_path),
    )
    for label, path in required_inputs:
        if not os.path.exists(path):
            print(f"❌ {label} not found: {path}")
            return

    try:
        print("🔍 Quick test of Google Document AI...")

        # Initialize and process
        doc_api = GoogleDocumentAPI(credentials_path)
        document = doc_api.process_document(test_pdf_path)

        # Get text blocks with height
        text_blocks = doc_api.extract_text_with_bounding_boxes(document)
        print(f"📊 Found {len(text_blocks)} text blocks")

        # Show first few blocks with height
        print("\n📏 First 5 text blocks with height:")
        print("-" * 60)
        for position, block in enumerate(text_blocks[:5], start=1):
            print(
                f"Block {position}: Height={block['height']:.2f}mm"
                f" | Text: {block['text'][:50]}..."
            )

        # Generate and display markdown table
        print("\n📋 Markdown Table Output:")
        print("=" * 80)
        markdown_table = doc_api.extract_text_with_markdown_table(document)
        print(markdown_table)

        # Save to file
        with open(output_path, "w", encoding="utf-8") as f:
            f.write("# Quick Test Results\n\n")
            f.write(markdown_table)

        # The message is derived from output_path so it cannot drift from
        # the file that was actually written.
        print(f"\n✅ Results saved to: {output_path}")

    except Exception as e:
        # Top-level boundary of a manual script: report and finish quietly.
        print(f"❌ Error: {str(e)}")
|
| 61 |
+
|
| 62 |
+
if __name__ == "__main__":
    # Run the smoke test only when this file is executed as a script, so that
    # importing the module has no side effects.
    quick_test()
|
quick_test_results.md
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Quick Test Results
|
| 2 |
+
|
| 3 |
+
| Text ID | X | Y | Height (mm) | Style | Text |\n|----|-----|-----|--------|-------|-------------------------------------------------------------------------|\n| 1 | 35 | 935 | 48.69 | N/A | Version 1 |\n| 2 | 113 | 920 | 23.28 | N/A | TRIEZ-MOI |\n| 3 | 101 | 935 | 35.99 | N/A | PAPIER, |\n| 4 | 95 | 936 | 19.05 | N/A | JE SUIS EN |\n| 5 | 96 | 859 | 15.52 | N/A | EXTERIEUR |\n| 6 | 126 | 889 | 38.81 | N/A | ™MPAPER |\n| 7 | 312 | 280 | 2.82 | N/A | Nestlé S.A. |\n| 8 | 331 | 249 | 3.18 | N/A | La Bonne Portion |\n| 9 | 344 | 262 | 5.64 | N/A | 1 |\n| 10 | 333 | 278 | 2.47 | N/A | 1 BARRE = 1 PORTION |\n| 11 | 285 | 330 | 5.29 | N/A | 7613035 365896> |\n| 12 | 366 | 249 | 3.53 | N/A | Le Bon Geste de Tri / Verantwoord |\n| 13 | 367 | 254 | 3.53 | N/A | weggooien / Verantwortungsvoll entsorgen |\n| 14 | 394 | 275 | 3.53 | N/A | ÉLÉMENTS |\n| 15 | 390 | 285 | 2.82 | N/A | D'EMBALLAGE |\n| 16 | 351 | 299 | 4.59 | N/A | Predut par:/Geproduceerd door: Nestlé Deutschland AG, 60523 Frankfurt am |\n| 17 | 380 | 313 | 4.59 | N/A | EMBALLAGE
|
| 4 |
+
EXTERIEUR |\n| 18 | 401 | 309 | 6.7 | N/A | MIKKEL
|
| 5 |
+
INDIVIDUELS |\n| 19 | 445 | 247 | 3.18 | N/A | Plus d'Informations/Let's talk Informations nutritionnelles Voedingswaarde informatie Nährwertinformationen |\n| 20 | 445 | 252 | 3.53 | N/A | FR 806 800 363 |\n| 21 | 449 | 257 | 3.53 | N/A | (service |\n| 22 | 458 | 257 | 3.53 | N/A | gratuit +pakappel) |\n| 23 | 445 | 262 | 6.35 | N/A | NL 0205699699
|
| 6 |
+
BE(+32) 02 529 5525 |\n| 24 | 445 | 272 | 2.82 | N/A | www.kitkat.fr/www.kitkat.nl |\n| 25 | 446 | 278 | 10.94 | N/A | consommer de...
|
| 7 |
+
préférence avant fin:/
|
| 8 |
+
en minste houdbaar tot
|
| 9 |
+
einde: / Mindestens |\n| 26 | 445 | 296 | 3.88 | N/A | haltbar bis Ende |\n| 27 | 549 | 253 | 3.88 | N/A | \\|Parportion\\| Pro Portion \\|\\|%.AR" par partion%RM pro |\n| 28 | 518 | 258 | 3.88 | N/A | Pour 100g\\|Pro 100g\\|Per 100g |\n| 29 | 557 | 258 | 3.88 | N/A | Per portie 41,5 |\n| 30 | 529 | 265 | 3.18 | N/A | 2135 kJ/510 |\n| 31 | 554 | 265 | 3.18 | N/A | 886 kJ/212 kcal |\n| 32 | 598 | 265 | 2.82 | N/A | 115 |\n| 33 | 542 | 271 | 3.53 | N/A | 25,1 |\n| 34 | 565 | 271 | 3.18 | N/A | 10,8g |\n| 35 | 598 | 271 | 3.18 | N/A | 15% |\n| 36 | 567 | 282 | 3.18 | N/A | 6,1g |\n| 37 | 541 | 289 | 3.18 | N/A | 60,6g |\n| 38 | 565 | 289 | 2.82 | N/A | 25,2 g |\n| 39 | 492 | 293 | 3.88 | N/A | - dont sizes/-aren salas(-devon Zackar 43,49\\|\\| |\n| 40 | 565 | 294 | 2.82 | N/A | 18,0 g |\n| 41 | 598 | 295 | 2.47 | N/A | 20% |\n| 42 | 491 | 299 | 3.53 | N/A | Flores alimentabes/Venis/Bolestatale |\n| 43 | 542 | 299 | 3.18 | N/A | 25g |\n| 44 | 566 | 300 | 3.18 | N/A | 1,0g |\n| 45 | 543 | 306 | 3.18 | N/A | 6,69 |\n| 46 | 566 | 306 | 2.82 | N/A | 2,7g |\n| 47 | 542 | 312 | 2.82 | N/A | 005 |\n| 48 | 565 | 312 | 3.18 | N/A | 0,06 g |\n| 49 | 445 | 305 | 3.88 | N/A | de alfafilichting / Siebe unter der Lanche Preis/Bitten/Ed |\n| 50 | 422 | 311 | 3.53 | N/A | Conservation/Bewaaradvies, |\n| 51 | 423 | 315 | 4.59 | N/A | Lagerungshinweise |\n| 52 | 502 | 312 | 2.82 | N/A | Salz |\n| 53 | 422 | 319 | 5.29 | N/A | A corrà Paberi de la la, de draft Apport de diffrence pour adulte-type (8400 kJ/2000 kcal). Catge confitations |\n| 54 | 598 | 312 | 2.47 | N/A | 15 |\n| 55 | 489 | 329 | 3.53 | N/A | E400 2000. |\n| 56 | 526 | 329 | 3.53 | N/A | Diese Packing enthält 6 Portionen. Portaben sollen für Kinder entreched |\n| 57 | 490 | 339 | 4.23 | N/A | Verpakking bevet 6 peries. Parties diesen te worden aangepast in de leeftijd van Modern. 
|\n| 58 | 329 | 507 | 4.59 | N/A | W.I |\n| 59 | 334 | 524 | 5.29 | N/A | ON-ZEINL |\n| 60 | 357 | 548 | 3.53 | N/A | PING G |\n| 61 | 337 | 361 | 12.0 | N/A | x9 |\n| 62 | 390 | 361 | 4.94 | N/A | Ни и чела ил |\n| 63 | 473 | 365 | 3.88 | N/A | D |\n| 64 | 472 | 364 | 8.47 | N/A | в элоура элон |\n| 65 | 520 | 408 | 2.82 | N/A | = ਤ ਖ ਖਰਾਬ |\n| 66 | 524 | 419 | 5.29 | N/A | T |\n| 67 | 558 | 428 | 4.59 | N/A | FREE-PORN |\n| 68 | 337 | 603 | 10.58 | N/A | x9 |\n| 69 | 386 | 604 | 5.64 | N/A | Уни и ела ил |\n| 70 | 473 | 607 | 3.88 | N/A | D |\n| 71 | 562 | 544 | 18.35 | N/A | x9 |\n| 72 | 473 | 606 | 8.11 | N/A | в элоура в элон |\n| 73 | 286 | 652 | 14.46 | N/A | breaks |\n| 74 | 295 | 663 | 12.0 | N/A | FOR GOOD |\n| 75 | 293 | 679 | 3.88 | N/A | "Unbreak plus engage |\n| 76 | 285 | 694 | 3.18 | N/A | Ingrédients/Ingrediënten / Zutaten |\n| 77 | 347 | 652 | 5.64 | N/A | JE SUIS EN |\n| 78 | 348 | 656 | 7.76 | N/A | PAPIER |\n| 79 | 351 | 664 | 3.18 | N/A | I'M PAPER |\n| 80 | 337 | 673 | 3.53 | N/A | "De sachet est désormais comes maart |\n| 81 | 337 | 679 | 3.18 | N/A | de papier, vous pouvez le mettra dans votre bac |\n| 82 | 337 | 684 | 3.18 | N/A | de til sera recyclé dans la filière du papier. |\n| 83 | 386 | 643 | 3.53 | N/A | ET POUR LES EMBALLAGES |\n| 84 | 386 | 648 | 9.17 | N/A | INDIVIDUELS, ILS RESTENT EN
|
| 10 |
+
PLASTIQUE POUR GARANTIR LA
|
| 11 |
+
QUALITÉ ET LE GOÛT DE NOS |\n| 85 | 386 | 664 | 3.88 | N/A | BARRES KITKAT NOUS UTILISONS |\n| 86 | 386 | 669 | 2.47 | N/A | DU PLASTIQUE RECYCLE" |\n| 87 | 386 | 674 | 2.82 | N/A | AND FOR |\n| 88 | 398 | 674 | 2.82 | N/A | THE |\n| 89 | 403 | 675 | 2.82 | N/A | WRAPPERS |\n| 90 | 417 | 675 | 3.18 | N/A | INSIDE. HE USE |\n| 91 | 386 | 679 | 2.47 | N/A | RECYCLED PLASTIC TO ENSURE THE QUALITY |\n| 92 | 387 | 683 | 2.82 | N/A | AND TASTINESS OF OUR KITKAT BARS. |\n| 93 | 478 | 676 | 4.94 | N/A | Cocod |\n| 94 | 497 | 648 | 2.47 | N/A | Nestlé acheteune |\n| 95 | 497 | 654 | 2.82 | N/A | quantité de caca |\n| 96 | 497 | 658 | 2.82 | N/A | certifiée Rainforest |\n| 97 | 506 | 664 | 2.47 | N/A | quiv |\n| 98 | 497 | 668 | 3.18 | N/A | à celle cessaire |\n| 99 | 497 | 673 | 3.53 | N/A | pour ce produi |\n| 100 | 522 | 648 | 2.47 | N/A | DÉCOUVREZ-EN PLUS |\n| 101 | 285 | 700 | 21.87 | N/A | Gaufrette croustillante enrobée de chocolat au lait (67%). Le chocolat au lait contient des matières grasses végétales en plus du beurre de cacao. Ingrédients: fra de BLE LAIT écrémén poudre,
|
| 12 |
+
pile de cacao, moes grass vedtales (entre, fritt), berre de cacae', PETIT-LAIT tiré en poudre, shop de glucose, male om de LAIT anhydre, cacao malgra, émulant (cithins), poudre lever
|
| 13 |
+
(carbonates de sodium). Bilan massique certiléninforest Allance. www.Krokante wafel omhuld met melkchocolade (67%). De melkchocolade bevat naast cacaoboter ook andere plantaardige vetten.
|
| 14 |
+
wel, ungern Walkpoeder, cacaoman die wetten (palm, kartié), cacaoboter",\ beeder, glucosestroop, walenbret, spere cacao, emigr
|
| 15 |
+
Rainforest Allance Lesseer op aar Knusperwaffel in Milchschokolade (67% Milchschokolade enthält neben Kakaobutter auch andere
|
| 16 |
+
flanzliche Fette. Zutaten: Zucker, WEREMMEN, MAGENT CHPULVER, Kakaomasse planche Fette (Palm, Shee), Kakaobutter, MOLIENERZEUGNIS, Glukosesirup, BUTTERBEINFETT, fettarmer Kaka',
|
| 17 |
+
Emulgator Lecithine, Backtriebmittel Natriumcarbonnie Rainforest Allance-zertiert. Wehr erfahren untersa.org |\n| 102 | 583 | 645 | 2.47 | N/A | MIXTE |\n| 103 | 557 | 660 | 4.59 | N/A | FSC FSC C149053 |\n| 104 | 570 | 670 | 2.12 | N/A | Вотермия |\n| 105 | 551 | 679 | 3.53 | N/A | *** Kika❤achète du plastique recyclé pour |\n| 106 | 553 | 685 | 3.18 | N/A | couvrir la quantité nécessaire à la production |\n| 107 | 570 | 690 | 3.53 | N/A | KitKat qui partent cette |\n| 108 | 552 | 687 | 8.11 | N/A | desembe |\n| 109 | 552 | 695 | 4.23 | N/A | mention |\n| 110 | 562 | 697 | 3.18 | N/A | en |\n| 111 | 552 | 702 | 2.82 | N/A | Ce volume est susceptible d'être utilisé dans |\n| 112 | 551 | 708 | 2.47 | N/A | dautres |\n| 113 | 561 | 708 | 2.47 | N/A | emballages. Pour en savoir plus |\n| 114 | 552 | 713 | 2.47 | N/A | www.kitkat.fr |\n| 115 | 541 | 722 | 6.35 | N/A | 6x41,5 g = 249 ge |\n
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
google-cloud-documentai
|
| 2 |
+
streamlit
|
| 3 |
+
pandas
|
| 4 |
+
anthropic
|
| 5 |
+
requests>=2.31.0
|
| 6 |
+
urllib3>=2.0.7
|
| 7 |
+
Pillow>=8.3.0
|
| 8 |
+
pdf2image
|
| 9 |
+
numpy>=1.21.0
|
| 10 |
+
opencv-python
|
| 11 |
+
barcodenumber
|
| 12 |
+
zxing-cpp
|
| 13 |
+
PyMuPDF>=1.23.0
|
| 14 |
+
PyPDF2>=3.0.0
|
test_google_doc_ai.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script for Google Document AI functionality.
|
| 4 |
+
This script demonstrates the text extraction with bounding boxes and height calculation.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
# Add the src directory to the path so we can import our modules
|
| 12 |
+
sys.path.append(str(Path(__file__).parent / "src"))
|
| 13 |
+
|
| 14 |
+
from extract_text.google_document_api import GoogleDocumentAPI
|
| 15 |
+
|
| 16 |
+
def _write_text_blocks_report(path, text_blocks):
    """Write one entry per text block (page, height, style, text, box) to *path*."""
    with open(path, "w", encoding="utf-8") as f:
        f.write("Text Blocks with Height Information:\n")
        f.write("=" * 50 + "\n\n")
        for position, block in enumerate(text_blocks, start=1):
            f.writelines((
                f"Block {position}:\n",
                f" Page: {block['page_number']}\n",
                f" Height: {block['height']:.2f} mm\n",
                f" Style: {block['style']}\n",
                f" Text: {block['text']}\n",
                f" Bounding Box: {block['bounding_box']}\n",
                "-" * 40 + "\n",
            ))


def _write_heights_report(path, heights_mm):
    """Write one entry per (page, text, height) triple in *heights_mm* to *path*."""
    with open(path, "w", encoding="utf-8") as f:
        f.write("Line Heights in Millimeters:\n")
        f.write("=" * 40 + "\n\n")
        for position, (page_num, line_text, height_mm) in enumerate(heights_mm, start=1):
            f.writelines((
                f"Line {position}: Page {page_num}, Height={height_mm}mm\n",
                f"Text: {line_text}\n",
                "-" * 40 + "\n",
            ))


def _print_block_statistics(text_blocks):
    """Print min/max/average block height and the number of blocks per style."""
    # Imported locally so this helper does not depend on the file's import block.
    from collections import Counter

    heights = [block['height'] for block in text_blocks]
    if heights:
        print("📏 Height statistics:")
        print(f" Min height: {min(heights):.2f} mm")
        print(f" Max height: {max(heights):.2f} mm")
        print(f" Average height: {sum(heights)/len(heights):.2f} mm")

    # most_common() orders by descending count and keeps first-seen order for
    # ties, matching a stable sort of the tallies in reverse.
    style_counts = Counter(block['style'] for block in text_blocks)
    print("\n🎨 Style distribution:")
    for style, count in style_counts.most_common():
        print(f" {style}: {count} blocks")


def test_google_doc_ai(
    credentials_path="src/extract_text/photon-services-f0d3ec1417d0.json",
    test_pdf_path="requirements_library/client-requirements/Kir-Kat/kitkat-f1.pdf",
):
    """Exercise the Google Document AI wrapper end to end on a sample PDF.

    The run processes the PDF, prints the basic text, sample text blocks and
    sample line heights, writes four ``test_results_*`` report files into the
    current working directory and finally prints height/style statistics.

    Args:
        credentials_path: Path to the Google Cloud credentials JSON file.
        test_pdf_path: Path to the PDF to process.

    Returns:
        None. A missing input file ends the run early with a message; any
        error during processing is caught, printed and followed by a traceback.
    """
    # Check if files exist
    if not os.path.exists(credentials_path):
        print(f"❌ Credentials file not found: {credentials_path}")
        print("Please ensure the Google Cloud credentials file is in the correct location.")
        return

    if not os.path.exists(test_pdf_path):
        print(f"❌ Test PDF file not found: {test_pdf_path}")
        print("Please ensure the test PDF file exists.")
        return

    print("🔍 Testing Google Document AI functionality...")
    print(f"📄 Using PDF: {test_pdf_path}")
    print(f"🔑 Using credentials: {credentials_path}")
    print("-" * 80)

    try:
        # Initialize the Google Document API
        print("1. Initializing Google Document API...")
        doc_api = GoogleDocumentAPI(credentials_path)
        print("✅ Google Document API initialized successfully")

        # Process the document
        print("\n2. Processing document...")
        document = doc_api.process_document(test_pdf_path)
        print("✅ Document processed successfully")

        # Get basic text
        print("\n3. Extracting basic text...")
        basic_text = doc_api.get_document_text(document, page_number=0)
        print(f"📝 Basic text length: {len(basic_text)} characters")
        print(f"📝 First 200 characters: {basic_text[:200]}...")

        # Extract text with bounding boxes and height
        print("\n4. Extracting text with bounding boxes and height...")
        text_blocks = doc_api.extract_text_with_bounding_boxes(document)
        print(f"📊 Found {len(text_blocks)} text blocks")

        # Display sample text blocks
        print("\n5. Sample text blocks with height information:")
        print("-" * 80)
        for position, block in enumerate(text_blocks[:10], start=1):  # Show first 10 blocks
            text = block['text']
            print(f"Block {position}:")
            print(f" Page: {block['page_number']}")
            print(f" Height: {block['height']:.2f} mm")
            print(f" Style: {block['style']}")
            print(f" Text: {text[:100]}{'...' if len(text) > 100 else ''}")
            print(f" Bounding Box: {block['bounding_box']}")
            print()

        # Generate markdown table
        print("\n6. Generating markdown table...")
        markdown_table = doc_api.extract_text_with_markdown_table(document)
        print("📋 Markdown table generated successfully")

        # Test the extract_text_heights_mm function
        print("\n7. Testing extract_text_heights_mm function...")
        heights_mm = doc_api.extract_text_heights_mm(document)
        print(f"📏 Found {len(heights_mm)} lines with height in mm")

        # Display sample heights
        print("\n📏 Sample line heights (mm):")
        print("-" * 60)
        for position, (page_num, line_text, height_mm) in enumerate(heights_mm[:10], start=1):
            print(
                f"Line {position}: Page {page_num}, Height={height_mm}mm"
                f" | Text: {line_text[:50]}..."
            )

        # Save results to files
        print("\n8. Saving results to files...")

        _write_text_blocks_report("test_results_text_blocks.txt", text_blocks)

        with open("test_results_markdown_table.md", "w", encoding="utf-8") as f:
            f.write("# Google Document AI Results\n\n")
            f.write("## Text Blocks with Height Information\n\n")
            f.write(markdown_table)

        with open("test_results_basic_text.txt", "w", encoding="utf-8") as f:
            f.write("Basic Extracted Text:\n")
            f.write("=" * 30 + "\n\n")
            f.write(basic_text)

        print("✅ Results saved to:")
        print(" - test_results_text_blocks.txt")
        print(" - test_results_markdown_table.md")
        print(" - test_results_basic_text.txt")

        _write_heights_report("test_results_heights_mm.txt", heights_mm)
        print(" - test_results_heights_mm.txt")

        # Display statistics
        print("\n9. Statistics:")
        print("-" * 30)
        _print_block_statistics(text_blocks)

        print("\n🎉 Test completed successfully!")

    except Exception as e:
        # Top-level boundary of a manual script: show the message and the
        # traceback instead of propagating.
        print(f"❌ Error during testing: {str(e)}")
        import traceback
        traceback.print_exc()
|
| 161 |
+
|
| 162 |
+
def display_markdown_preview(path="test_results_markdown_table.md"):
    """Print the markdown table report stored at *path*.

    Args:
        path: Location of the markdown file to preview. Defaults to the file
            name this script uses when saving the markdown table.

    Returns:
        None. If the file does not exist, a hint to run the test first is
        printed instead of the preview.
    """
    # Only the read can raise FileNotFoundError, so only it sits in the try.
    try:
        with open(path, "r", encoding="utf-8") as f:
            content = f.read()
    except FileNotFoundError:
        print("❌ Markdown table file not found. Run the test first.")
        return

    print("\n📋 Markdown Table Preview:")
    print("=" * 80)
    print(content)
|
| 174 |
+
|
| 175 |
+
if __name__ == "__main__":
    print("🚀 Google Document AI Test Script")
    print("=" * 50)

    # Run the main test first, then preview the markdown file it saves.
    for step in (test_google_doc_ai, display_markdown_preview):
        step()
|
test_metadata.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script for metadata extraction functionality
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
from src.extract_text.extract_meta_data import PDFArtworkMetadataExtractor
|
| 8 |
+
|
| 9 |
+
def test_metadata_extraction(base_path="requirements_library/client-requirements"):
    """Smoke-test PDF metadata extraction on the first PDF found under *base_path*.

    Args:
        base_path: Directory that is searched recursively for a ``.pdf`` file.

    Returns:
        True when metadata was extracted and the result carries no ``'error'``
        key; False when the directory or a PDF is missing, when the extractor
        reports an error, or when any exception is raised.
    """
    # NOTE(review): the ``test_`` prefix makes pytest collect this function,
    # and recent pytest versions warn about (or fail) tests that return a
    # non-None value. The bool is kept because the ``__main__`` guard of this
    # script uses it to choose the exit status.
    if not os.path.exists(base_path):
        print("❌ No requirements library found")
        return False

    # Find the first PDF file in os.walk order
    pdf_file = next(
        (
            os.path.join(root, name)
            for root, _dirs, files in os.walk(base_path)
            for name in files
            if name.lower().endswith('.pdf')
        ),
        None,
    )
    if not pdf_file:
        print("❌ No PDF files found in requirements library")
        return False

    print(f"📄 Testing metadata extraction on: {pdf_file}")

    try:
        # Initialize the extractor and extract metadata
        extractor = PDFArtworkMetadataExtractor()
        metadata = extractor.extract_metadata(pdf_file)

        if 'error' in metadata:
            print(f"❌ Error extracting metadata: {metadata['error']}")
            return False

        # Print results
        print("✅ Metadata extraction successful!")
        print(f"📊 Pages processed: {metadata.get('pages_processed', 0)}")
        print(f"📝 Has selectable text: {metadata.get('has_selectable_text', False)}")
        print(f"🔧 Extraction method: {metadata.get('extraction_method', 'unknown')}")

        # (metadata key, heading, label template) for each "top 3" listing.
        # NOTE(review): only the first three entries of each mapping are
        # shown, in the mapping's own order; this assumes the extractor
        # already orders them by frequency -- confirm.
        sections = (
            ('fonts', "\n🔤 Top 3 Fonts:", "{}"),
            ('font_sizes', "\n📏 Top 3 Font Sizes:", "{}pt"),
            ('text_colors', "\n🎨 Top 3 Text Colors:", "RGB{}"),
        )
        for key, heading, label_template in sections:
            counts = metadata.get(key, {})
            if not counts:
                continue
            print(heading)
            for rank, (item, count) in enumerate(list(counts.items())[:3], start=1):
                print(f" {rank}. {label_template.format(item)}: {count} characters")

        return True

    except Exception as e:
        # Top-level boundary of a manual script: report and signal failure.
        print(f"❌ Test failed with error: {str(e)}")
        return False
|
| 78 |
+
|
| 79 |
+
if __name__ == "__main__":
    print("🧪 Testing Metadata Extraction")
    print("=" * 40)

    # Failure path first: report and leave with a non-zero exit status.
    if not test_metadata_extraction():
        print("\n❌ Tests failed. Please check the error messages above.")
        sys.exit(1)

    print("\n✅ All tests passed! Metadata extraction is working correctly.")
|
test_pdf_requirements.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script for PDF requirements functionality
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import tempfile
|
| 8 |
+
from src.extract_text.ingest import RequirementsIngest
|
| 9 |
+
|
| 10 |
+
def test_pdf_requirements(search_root="requirements_library"):
    """Smoke-test requirements ingestion on a PDF, or on a temporary text file.

    The first ``.pdf`` found under *search_root* is fed to
    ``RequirementsIngest.ingest_requirements_document``. When no PDF exists, a
    small temporary ``.txt`` file is created, used instead, and removed again
    at the end.

    Args:
        search_root: Directory that is searched recursively for a PDF.

    Returns:
        None. Results and errors are printed; exceptions raised during
        ingestion are caught and shown with a traceback.
    """
    print("Testing PDF requirements functionality...")

    # Look for any PDF file under the search root (first hit in os.walk order)
    test_pdf_path = next(
        (
            os.path.join(root, name)
            for root, _dirs, files in os.walk(search_root)
            for name in files
            if name.lower().endswith('.pdf')
        ),
        None,
    )

    # An explicit flag records whether a file was created and needs cleanup.
    created_temp_file = test_pdf_path is None
    if created_temp_file:
        print("No PDF files found for testing. Creating a simple test...")
        # delete=False: the file must survive this with-block so it can be
        # reopened in binary mode below; it is removed in the finally clause.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
            f.write("Test requirement: All products must have allergen information.")
            test_file_path = f.name
        print(f"Created test text file: {test_file_path}")
    else:
        print(f"Using existing PDF for testing: {test_pdf_path}")
        test_file_path = test_pdf_path

    try:
        # Test the ingestion
        ingest = RequirementsIngest()

        # Open the file and test ingestion
        with open(test_file_path, 'rb') as f:
            result = ingest.ingest_requirements_document(f)

        print("✅ Ingestion successful!")
        print(f"Result type: {type(result)}")

        if isinstance(result, dict):
            print(f"File type: {result.get('type', 'unknown')}")
            print(f"Filename: {result.get('filename', 'unknown')}")
            print(f"File size: {result.get('file_size', 0)} bytes")
            print(f"Text content preview: {result.get('text_content', '')[:200]}...")
        else:
            print(f"Text content: {result[:200]}...")

        print("\n✅ PDF requirements functionality is working!")

    except Exception as e:
        # Top-level boundary of a manual script: show the message and the
        # traceback instead of propagating.
        print(f"❌ Error during testing: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Clean up the test file if we created one. Cleanup stays best-effort,
        # but only filesystem errors are tolerated and they are reported.
        if created_temp_file:
            try:
                os.unlink(test_file_path)
                print(f"Cleaned up test file: {test_file_path}")
            except OSError as cleanup_error:
                print(f"⚠️ Could not remove test file {test_file_path}: {cleanup_error}")
|
| 72 |
+
|
| 73 |
+
if __name__ == "__main__":
    # Run the ingestion smoke test only when this file is executed as a
    # script, so that importing the module has no side effects.
    test_pdf_requirements()
|