Upload 18 files
Browse files- src/core/__pycache__/analysis.cpython-313.pyc +0 -0
- src/core/__pycache__/analysis.cpython-313.pyc.1424781933232 +0 -0
- src/core/__pycache__/analysis.cpython-313.pyc.3054062041392 +0 -0
- src/core/__pycache__/analysis.cpython-313.pyc.3054062628656 +0 -0
- src/core/__pycache__/analysis.cpython-313.pyc.3054062929328 +0 -0
- src/core/analysis.py +704 -0
- src/extract_text/__pycache__/extract_meta_data.cpython-313.pyc +0 -0
- src/extract_text/__pycache__/google_document_api.cpython-313.pyc +0 -0
- src/extract_text/__pycache__/google_document_api.cpython-313.pyc.1480615374128 +0 -0
- src/extract_text/__pycache__/ingest.cpython-313.pyc +0 -0
- src/extract_text/extract_meta_data.py +355 -0
- src/extract_text/google_document_api.py +224 -0
- src/extract_text/ingest.py +92 -0
- src/extract_text/photon-services-f0d3ec1417d0.json +13 -0
- src/utils/__pycache__/barcode.cpython-313.pyc +0 -0
- src/utils/__pycache__/image_utils.cpython-313.pyc +0 -0
- src/utils/barcode.py +95 -0
- src/utils/image_utils.py +227 -0
src/core/__pycache__/analysis.cpython-313.pyc
ADDED
|
Binary file (28.5 kB). View file
|
|
|
src/core/__pycache__/analysis.cpython-313.pyc.1424781933232
ADDED
|
Binary file (25.4 kB). View file
|
|
|
src/core/__pycache__/analysis.cpython-313.pyc.3054062041392
ADDED
|
Binary file (24.3 kB). View file
|
|
|
src/core/__pycache__/analysis.cpython-313.pyc.3054062628656
ADDED
|
Binary file (22.2 kB). View file
|
|
|
src/core/__pycache__/analysis.cpython-313.pyc.3054062929328
ADDED
|
Binary file (21.1 kB). View file
|
|
|
src/core/analysis.py
ADDED
|
@@ -0,0 +1,704 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
import os
import re
import time

import anthropic
import numpy as np
import requests
import streamlit as st
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from src.extract_text.google_document_api import GoogleDocumentAPI
|
| 11 |
+
|
| 12 |
+
CLAUDE_API_URL = "https://api.anthropic.com/v1/messages"
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class LLM:
    """Thin client for the Anthropic Messages HTTP API.

    Reads the API key from the ``CLAUDE_API_KEY`` environment variable and
    keeps a ``requests.Session`` with a transport-level retry strategy.
    Note that retries happen at two layers: urllib3 retries the POST for the
    listed status codes, and ``_post_with_retries`` additionally retries
    529 ("overloaded") responses with its own exponential backoff.

    All public call methods return the model's text output, or ``""`` on any
    failure (errors are surfaced to the user via Streamlit, never raised).
    """

    def __init__(self):
        # Fail fast if the API key is missing -- every call needs it.
        self.claude_api_key = os.getenv('CLAUDE_API_KEY')
        if not self.claude_api_key:
            raise ValueError("Please set the CLAUDE_API_KEY environment variable.")

        # Transport-level retry strategy with comprehensive error handling.
        retry_strategy = Retry(
            total=5,  # total retries
            backoff_factor=2,  # exponential backoff factor
            status_forcelist=[429, 500, 502, 503, 504, 529],  # 529 = server overload
            allowed_methods=["POST"],  # only retry POST requests
            respect_retry_after_header=True,  # respect Retry-After headers
        )

        # Session shared by all calls so the retry adapter applies uniformly.
        self.session = requests.Session()
        self.session.mount("https://", HTTPAdapter(max_retries=retry_strategy))

    def _headers(self) -> dict:
        """Return the common request headers for the Anthropic Messages API."""
        return {
            "x-api-key": self.claude_api_key,
            "anthropic-version": "2023-06-01",
            "Content-Type": "application/json",
        }

    def _post_with_retries(self, payload: dict, api_label: str, timeout: int) -> str:
        """POST *payload* to the Messages API, retrying 529 overloads.

        This is the single shared implementation behind ``call_claude_api``
        and ``call_claude_vision_api`` (previously duplicated in both).

        Args:
            payload: Complete Messages API request body.
            api_label: Human-readable API name used in user-facing messages
                ("Claude API" / "Claude Vision API").
            timeout: Per-request timeout in seconds.

        Returns:
            Text of the first content block on success, ``""`` on any error.
        """
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = self.session.post(
                    CLAUDE_API_URL,
                    headers=self._headers(),
                    json=payload,
                    verify=True,  # explicitly enable SSL verification
                    timeout=timeout,
                )

                # 529 = Anthropic "overloaded"; retry with exponential backoff.
                if response.status_code == 529:
                    st.warning(f"Server overload (529) on attempt {attempt + 1}/{max_retries}. Retrying with exponential backoff...")
                    if attempt < max_retries - 1:
                        time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s
                        continue
                    st.error("Server overload after all retries. Please try again later.")
                    return ""

                response.raise_for_status()  # raise for other bad status codes

                # Parse response: expect at least one content block.
                response_data = response.json()
                if "content" in response_data and len(response_data["content"]) > 0:
                    return response_data["content"][0]["text"]
                st.error(f"Unexpected response format from {api_label}")
                return ""

            except requests.exceptions.SSLError as ssl_err:
                st.error(f"SSL Error when calling {api_label}. Please check your SSL certificates and network connection. Error: {ssl_err}")
                return ""
            except requests.exceptions.Timeout:
                # Non-final timeouts fall through and retry on the next loop pass.
                st.warning(f"Timeout on attempt {attempt + 1}/{max_retries}. Retrying...")
                if attempt == max_retries - 1:
                    st.error("Request timed out after all retries")
                    return ""
            except requests.exceptions.RequestException as e:
                st.error(f"Error calling {api_label}: {str(e)}")
                return ""
            except json.JSONDecodeError as json_err:
                st.error(f"Invalid JSON response from {api_label}: {json_err}")
                return ""

        return ""

    def call_claude_api(self, prompt, system_prompt, model="claude-sonnet-4-20250514", max_tokens=2000) -> str:
        """
        Helper function to call Claude API with consistent parameters and enhanced error handling.

        Args:
            prompt: User-turn text.
            system_prompt: System instruction string.
            model: Anthropic model identifier.
            max_tokens: Response token budget.

        Returns:
            The model's text output, or "" on any error.
        """
        payload = {
            "model": model,
            "max_tokens": max_tokens,
            "temperature": 0.1,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "system": system_prompt
        }
        return self._post_with_retries(payload, "Claude API", timeout=60)

    def call_claude_vision_api(self, prompt, system_prompt, image_base64, model="claude-sonnet-4-20250514", max_tokens=2000) -> str:
        """
        Helper function to call Claude Vision API with image support and enhanced error handling.

        Args:
            prompt: User-turn text accompanying the image.
            system_prompt: System instruction string.
            image_base64: Base64-encoded PNG image data.
            model: Anthropic model identifier.
            max_tokens: Response token budget.

        Returns:
            The model's text output, or "" on any error.
        """
        content = [
            {
                "type": "text",
                "text": prompt
            },
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": image_base64
                }
            }
        ]

        payload = {
            "model": model,
            "max_tokens": max_tokens,
            "temperature": 0,  # vision calls intentionally use 0 (text calls use 0.1)
            "messages": [
                {
                    "role": "user",
                    "content": content
                }
            ],
            "system": system_prompt
        }
        # Longer timeout: vision requests carry large payloads.
        return self._post_with_retries(payload, "Claude Vision API", timeout=90)

    def call_claude_pdf_api(self, prompt, system_prompt, pdf_base64, model="claude-sonnet-4-20250514", max_tokens=4000) -> str:
        """
        Helper function to call Claude API with PDF support for requirements documents.
        For now, we'll fall back to text-based processing since PDF API requires specific setup.

        Args:
            prompt: Prompt text (expected to already contain the
                caller-extracted requirements text).
            system_prompt: System instruction string.
            pdf_base64: Base64-encoded PDF bytes.
            model: Anthropic model identifier.
            max_tokens: Response token budget.

        Returns:
            The model's text output, or "" on any error.
        """
        # For now, we'll use the regular API with text extraction.
        # In the future, this can be enhanced to use the Converse API with citations.
        st.info("📄 PDF requirements detected. Using text-based processing for now.")
        st.info("💡 For full visual PDF analysis, consider using the Converse API with citations enabled.")

        try:
            import base64
            import io

            # PyPDF2 is optional; degrade gracefully when it is absent.
            try:
                from PyPDF2 import PdfReader
                pdf_reader_available = True
            except ImportError:
                pdf_reader_available = False
                st.warning("PyPDF2 not available. Using basic text processing for PDF.")

            if pdf_reader_available:
                # Decode and read the PDF pages.
                pdf_bytes = base64.b64decode(pdf_base64)
                pdf_stream = io.BytesIO(pdf_bytes)

                reader = PdfReader(pdf_stream)
                text_content = ""
                for page in reader.pages:
                    text_content += page.extract_text() + "\n"

                if not text_content.strip():
                    text_content = "PDF Requirements Document (text extraction limited)"

                # NOTE(review): text_content is currently unused -- the prompt
                # already carries the caller-extracted text, so we delegate to
                # the text API unchanged. Wire text_content into the prompt if
                # native extraction should take precedence.
                return self.call_claude_api(prompt, system_prompt, model=model, max_tokens=max_tokens)
            else:
                # Fallback when PyPDF2 is not available.
                return self.call_claude_api(prompt, system_prompt, model=model, max_tokens=max_tokens)

        except Exception as e:
            st.warning(f"PDF text extraction failed: {e}")
            st.warning("Falling back to basic text processing")

            # Fallback to basic text processing.
            return self.call_claude_api(prompt, system_prompt, model=model, max_tokens=max_tokens)
|
| 244 |
+
|
| 245 |
+
class ComplianceAnalysis:
|
| 246 |
+
    def __init__(self):
        # Single shared LLM client for all analysis steps.
        # Raises ValueError (from LLM.__init__) if CLAUDE_API_KEY is unset.
        self.llm = LLM()
|
| 248 |
+
|
| 249 |
+
def extract_structured_requirements(self, requirements_data) -> list[dict]:
|
| 250 |
+
"""
|
| 251 |
+
Use Claude to extract structured requirements from the requirements document.
|
| 252 |
+
|
| 253 |
+
Args:
|
| 254 |
+
requirements_data: Either a string (for text files) or a dict (for PDF files) containing requirements.
|
| 255 |
+
|
| 256 |
+
Returns:
|
| 257 |
+
A list of dictionaries, each containing a requirement ID, description, and category.
|
| 258 |
+
"""
|
| 259 |
+
# Handle both text and PDF requirements
|
| 260 |
+
if isinstance(requirements_data, str):
|
| 261 |
+
# Text-based requirements
|
| 262 |
+
requirements_text = requirements_data
|
| 263 |
+
requirements_type = "text"
|
| 264 |
+
elif isinstance(requirements_data, dict):
|
| 265 |
+
# PDF-based requirements
|
| 266 |
+
requirements_text = requirements_data.get('text_content', '')
|
| 267 |
+
requirements_type = requirements_data.get('type', 'text')
|
| 268 |
+
pdf_base64 = requirements_data.get('content', '') if requirements_type == 'pdf' else None
|
| 269 |
+
else:
|
| 270 |
+
st.error("Invalid requirements data format. Please upload a valid requirements document.")
|
| 271 |
+
return []
|
| 272 |
+
|
| 273 |
+
# Check if requirements text is empty or None
|
| 274 |
+
if not requirements_text or not requirements_text.strip():
|
| 275 |
+
st.error("Requirements text is empty. Please upload a valid requirements document.")
|
| 276 |
+
return []
|
| 277 |
+
|
| 278 |
+
system_prompt = """You are an expert requirements analyst. Extract clear, structured requirements from documents. You must always return valid JSON, even if no specific requirements are found."""
|
| 279 |
+
|
| 280 |
+
extraction_prompt = f"""
|
| 281 |
+
Extract all requirements from this document (not just allergen requirements):
|
| 282 |
+
|
| 283 |
+
{requirements_text}
|
| 284 |
+
|
| 285 |
+
For each requirement found, provide:
|
| 286 |
+
1. Unique ID (REQ001, REQ002, etc.)
|
| 287 |
+
2. Description (verbatim from the document)
|
| 288 |
+
3. Category (Font Size, Allergen List, Formatting, Placement, Barcode, Organic, Promotional, etc.)
|
| 289 |
+
4. Source reference (section/paragraph or line number)
|
| 290 |
+
|
| 291 |
+
If no requirements are found, return an empty array: []
|
| 292 |
+
|
| 293 |
+
Return as JSON array with fields: id, description, category, source_reference.
|
| 294 |
+
|
| 295 |
+
Example:
|
| 296 |
+
```json
|
| 297 |
+
[
|
| 298 |
+
{{
|
| 299 |
+
"id": "REQ001",
|
| 300 |
+
"description": "IF the product is labeled as organic, THEN a certified organic seal must be visible",
|
| 301 |
+
"category": "Organic",
|
| 302 |
+
"source_reference": "Line 1"
|
| 303 |
+
}},
|
| 304 |
+
{{
|
| 305 |
+
"id": "REQ002",
|
| 306 |
+
"description": "IF there is a promotional offer mentioned, THEN include the offer expiry date",
|
| 307 |
+
"category": "Promotional",
|
| 308 |
+
"source_reference": "Line 2"
|
| 309 |
+
}}
|
| 310 |
+
]
|
| 311 |
+
```
|
| 312 |
+
|
| 313 |
+
IMPORTANT: Always return valid JSON. If you cannot extract any requirements, return an empty array: []
|
| 314 |
+
"""
|
| 315 |
+
|
| 316 |
+
# Use appropriate API based on requirements type
|
| 317 |
+
if requirements_type == 'pdf' and pdf_base64:
|
| 318 |
+
# Use PDF API for native PDF processing
|
| 319 |
+
response = self.llm.call_claude_pdf_api(extraction_prompt, system_prompt, pdf_base64, model='claude-sonnet-4-20250514')
|
| 320 |
+
else:
|
| 321 |
+
# Use regular API for text processing
|
| 322 |
+
response = self.llm.call_claude_api(extraction_prompt, system_prompt, model='claude-3-5-haiku-20241022')
|
| 323 |
+
|
| 324 |
+
# Extract JSON from the response
|
| 325 |
+
try:
|
| 326 |
+
# Find JSON content between triple backticks if present
|
| 327 |
+
if "```json" in response and "```" in response.split("```json")[1]:
|
| 328 |
+
json_content = response.split("```json")[1].split("```")[0].strip()
|
| 329 |
+
elif "```" in response:
|
| 330 |
+
# Try to find any code block
|
| 331 |
+
json_content = response.split("```")[1].split("```")[0].strip()
|
| 332 |
+
else:
|
| 333 |
+
# Assume the entire response is JSON
|
| 334 |
+
json_content = response
|
| 335 |
+
|
| 336 |
+
# Clean the JSON content to handle control characters
|
| 337 |
+
# Remove or replace invalid control characters except newlines and tabs
|
| 338 |
+
json_content = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', json_content)
|
| 339 |
+
# Replace newlines within strings with escaped newlines
|
| 340 |
+
json_content = re.sub(r'(?<!\\)"(?:[^"\\]|\\.)*?(?<!\\)"', lambda m: m.group(0).replace('\n', '\\n'), json_content)
|
| 341 |
+
|
| 342 |
+
requirements = json.loads(json_content)
|
| 343 |
+
return requirements
|
| 344 |
+
except Exception as e:
|
| 345 |
+
st.error(f"Error parsing extracted requirements: {e}")
|
| 346 |
+
st.error(f"Raw response: {response}")
|
| 347 |
+
# Return empty array as fallback
|
| 348 |
+
return []
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
def verify_individual_requirement(self, requirement, markdown_table, image=None, barcode_data=None, metadata=None, requirements_data=None):
|
| 352 |
+
"""
|
| 353 |
+
Use structured reasoning to verify if a specific requirement is met in the packaging text.
|
| 354 |
+
|
| 355 |
+
Args:
|
| 356 |
+
requirement: A dictionary containing requirement details
|
| 357 |
+
markdown_table: The markdown table extracted from the packaging PDF
|
| 358 |
+
image: The image of the packaging document (optional)
|
| 359 |
+
barcode_data: List of barcode objects with position data (optional)
|
| 360 |
+
metadata: Dictionary containing font, font size, and color metadata (optional)
|
| 361 |
+
requirements_data: Original requirements data (text or PDF) for context (optional)
|
| 362 |
+
Returns:
|
| 363 |
+
A dictionary with verification results including reasoning and compliance status
|
| 364 |
+
"""
|
| 365 |
+
system_prompt = """You are a regulatory compliance expert. Provide detailed, objective compliance reports."""
|
| 366 |
+
|
| 367 |
+
# Build the prompt for verification
|
| 368 |
+
verification_prompt = f"""
|
| 369 |
+
You are a regulatory compliance expert. Provide detailed, objective compliance reports.
|
| 370 |
+
I need to verify if the following specific requirement is met in the packaging text:
|
| 371 |
+
|
| 372 |
+
Requirement ID: {requirement['id']}
|
| 373 |
+
Requirement Description: {requirement['description']}
|
| 374 |
+
Requirement Category: {requirement['category']}
|
| 375 |
+
|
| 376 |
+
Here is the packaging text to analyze:
|
| 377 |
+
|
| 378 |
+
{markdown_table}
|
| 379 |
+
"""
|
| 380 |
+
|
| 381 |
+
# Add barcode information if available
|
| 382 |
+
if barcode_data:
|
| 383 |
+
# Create minimal barcode summary for LLM (save tokens)
|
| 384 |
+
barcode_summary = []
|
| 385 |
+
for barcode in barcode_data:
|
| 386 |
+
barcode_summary.append({
|
| 387 |
+
'id': barcode['id'],
|
| 388 |
+
'type': barcode['type'],
|
| 389 |
+
'data': barcode['data'],
|
| 390 |
+
'valid': barcode['valid']
|
| 391 |
+
})
|
| 392 |
+
|
| 393 |
+
verification_prompt += f"""
|
| 394 |
+
|
| 395 |
+
Barcode Information Found:
|
| 396 |
+
{json.dumps(barcode_summary, indent=2)}
|
| 397 |
+
|
| 398 |
+
When analyzing barcode-related requirements, consider:
|
| 399 |
+
- Barcode ID for evidence reference
|
| 400 |
+
- Barcode type and validation status
|
| 401 |
+
"""
|
| 402 |
+
|
| 403 |
+
# Add metadata information if available
|
| 404 |
+
if metadata and not metadata.get('error'):
|
| 405 |
+
# Create metadata summary for LLM (save tokens)
|
| 406 |
+
metadata_summary = {
|
| 407 |
+
'extraction_method': metadata.get('extraction_method', 'unknown'),
|
| 408 |
+
'has_selectable_text': metadata.get('has_selectable_text', False),
|
| 409 |
+
'pages_processed': metadata.get('pages_processed', 0),
|
| 410 |
+
'dominant_font': metadata.get('fonts', {}),
|
| 411 |
+
'dominant_font_size': metadata.get('font_sizes', {}),
|
| 412 |
+
'dominant_text_color': metadata.get('text_colors', {})
|
| 413 |
+
}
|
| 414 |
+
|
| 415 |
+
verification_prompt += f"""
|
| 416 |
+
|
| 417 |
+
Typography and Design Metadata:
|
| 418 |
+
{json.dumps(metadata_summary, indent=2)}
|
| 419 |
+
|
| 420 |
+
When analyzing typography and design requirements, consider:
|
| 421 |
+
- Font types and their usage frequency
|
| 422 |
+
- Font sizes and their distribution
|
| 423 |
+
- Text colors and their application
|
| 424 |
+
- Whether text is selectable or requires OCR
|
| 425 |
+
"""
|
| 426 |
+
|
| 427 |
+
verification_prompt += f"""
|
| 428 |
+
|
| 429 |
+
Verify this requirement using these steps:
|
| 430 |
+
1. Break down into checkable criteria
|
| 431 |
+
2. Search for evidence in packaging text (provide Text ID)
|
| 432 |
+
3. For visual elements not in text, describe clearly (text_id = null)
|
| 433 |
+
4. For barcode evidence, use Barcode ID (text_id = null)
|
| 434 |
+
5. Provide specific examples/quotes
|
| 435 |
+
6. Determine: COMPLIANT/NON-COMPLIANT/PARTIALLY COMPLIANT
|
| 436 |
+
- Compliant: All applicable rules are fully met without any deviation.
|
| 437 |
+
- Partially Compliant: Some rules are met, but minor issues/omissions that don't constitute a full failure but need attention.
|
| 438 |
+
- Non-Compliant: One or more critical rules are violated or omitted, posing a regulatory, safety, or logistical risk.
|
| 439 |
+
7. Explain reasoning
|
| 440 |
+
|
| 441 |
+
For visual evidence, describe:
|
| 442 |
+
- Location (e.g., "top right corner", "bottom section")
|
| 443 |
+
- Visual characteristics (e.g., "large bold text", "red warning box")
|
| 444 |
+
- Content description (e.g., "allergen warning in red box")
|
| 445 |
+
|
| 446 |
+
If there is barcode evidence, include:
|
| 447 |
+
- Barcode ID
|
| 448 |
+
- Barcode type and validation status
|
| 449 |
+
|
| 450 |
+
Return JSON with structure:
|
| 451 |
+
```json
|
| 452 |
+
{{
|
| 453 |
+
"requirement_id": "{requirement['id']}",
|
| 454 |
+
"criteria": ["criterion 1", "criterion 2"],
|
| 455 |
+
"evidence_found": [
|
| 456 |
+
{{"text_id": <Text ID or null>, "evidence_text": "<description>", "barcode_id": "<Barcode ID ONLY if applicable>"}}
|
| 457 |
+
],
|
| 458 |
+
"compliance_status": "COMPLIANT/NON-COMPLIANT/PARTIALLY COMPLIANT",
|
| 459 |
+
"reasoning": "Detailed explanation",
|
| 460 |
+
"confidence": 0.95
|
| 461 |
+
}}
|
| 462 |
+
```
|
| 463 |
+
"""
|
| 464 |
+
|
| 465 |
+
# Use vision API if image is provided, otherwise use regular API
|
| 466 |
+
if image:
|
| 467 |
+
response = self.llm.call_claude_vision_api(verification_prompt, system_prompt, image)
|
| 468 |
+
else:
|
| 469 |
+
response = self.llm.call_claude_api(verification_prompt, system_prompt)
|
| 470 |
+
|
| 471 |
+
# Extract JSON from the response with enhanced error handling
|
| 472 |
+
try:
|
| 473 |
+
# Check if response is empty or None
|
| 474 |
+
if not response or not response.strip():
|
| 475 |
+
st.error("Empty response received from Claude API")
|
| 476 |
+
return {
|
| 477 |
+
"requirement_id": requirement['id'],
|
| 478 |
+
"evidence_found": [],
|
| 479 |
+
"compliance_status": "ERROR",
|
| 480 |
+
"reasoning": "Empty response received from Claude API",
|
| 481 |
+
"confidence": 0
|
| 482 |
+
}
|
| 483 |
+
|
| 484 |
+
# Find JSON content between triple backticks if present
|
| 485 |
+
if "```json" in response and "```" in response.split("```json")[1]:
|
| 486 |
+
json_content = response.split("```json")[1].split("```")[0].strip()
|
| 487 |
+
elif "```" in response:
|
| 488 |
+
# Try to find any code block
|
| 489 |
+
json_content = response.split("```")[1].split("```")[0].strip()
|
| 490 |
+
else:
|
| 491 |
+
# Assume the entire response is JSON
|
| 492 |
+
json_content = response
|
| 493 |
+
|
| 494 |
+
# Clean the JSON content to handle control characters
|
| 495 |
+
# Remove or replace invalid control characters except newlines and tabs
|
| 496 |
+
json_content = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', json_content)
|
| 497 |
+
# Replace newlines within strings with escaped newlines
|
| 498 |
+
json_content = re.sub(r'(?<!\\)"(?:[^"\\]|\\.)*?(?<!\\)"', lambda m: m.group(0).replace('\n', '\\n'), json_content)
|
| 499 |
+
|
| 500 |
+
# Try to parse JSON with multiple fallback strategies
|
| 501 |
+
verification_result = None
|
| 502 |
+
|
| 503 |
+
# Strategy 1: Direct parsing
|
| 504 |
+
try:
|
| 505 |
+
verification_result = json.loads(json_content)
|
| 506 |
+
except json.JSONDecodeError as e1:
|
| 507 |
+
st.warning(f"Initial JSON parsing failed: {e1}")
|
| 508 |
+
|
| 509 |
+
# Strategy 2: Try to extract JSON from malformed response
|
| 510 |
+
try:
|
| 511 |
+
# Look for JSON-like structure
|
| 512 |
+
json_match = re.search(r'\{.*\}', json_content, re.DOTALL)
|
| 513 |
+
if json_match:
|
| 514 |
+
potential_json = json_match.group(0)
|
| 515 |
+
verification_result = json.loads(potential_json)
|
| 516 |
+
st.info("Successfully extracted JSON from malformed response")
|
| 517 |
+
except json.JSONDecodeError as e2:
|
| 518 |
+
st.warning(f"JSON extraction failed: {e2}")
|
| 519 |
+
|
| 520 |
+
# Strategy 3: Create a minimal valid JSON structure
|
| 521 |
+
try:
|
| 522 |
+
# Try to extract key information from the response
|
| 523 |
+
compliance_status = "UNKNOWN"
|
| 524 |
+
if "COMPLIANT" in response.upper():
|
| 525 |
+
compliance_status = "COMPLIANT"
|
| 526 |
+
elif "NON-COMPLIANT" in response.upper():
|
| 527 |
+
compliance_status = "NON-COMPLIANT"
|
| 528 |
+
elif "PARTIALLY" in response.upper():
|
| 529 |
+
compliance_status = "PARTIALLY COMPLIANT"
|
| 530 |
+
|
| 531 |
+
verification_result = {
|
| 532 |
+
"requirement_id": requirement['id'],
|
| 533 |
+
"criteria": ["Unable to parse criteria"],
|
| 534 |
+
"evidence_found": [],
|
| 535 |
+
"compliance_status": compliance_status,
|
| 536 |
+
"reasoning": f"Response parsing failed. Raw response: {response[:200]}...",
|
| 537 |
+
"confidence": 0.1
|
| 538 |
+
}
|
| 539 |
+
st.warning("Created fallback JSON structure due to parsing errors")
|
| 540 |
+
except Exception as e3:
|
| 541 |
+
st.error(f"Fallback JSON creation failed: {e3}")
|
| 542 |
+
raise e3
|
| 543 |
+
|
| 544 |
+
if verification_result:
|
| 545 |
+
return verification_result
|
| 546 |
+
else:
|
| 547 |
+
raise Exception("All JSON parsing strategies failed")
|
| 548 |
+
|
| 549 |
+
except Exception as e:
|
| 550 |
+
st.error(f"Error parsing verification result: {e}")
|
| 551 |
+
st.error(f"Raw response: {response}")
|
| 552 |
+
# Return a failure result
|
| 553 |
+
return {
|
| 554 |
+
"requirement_id": requirement['id'],
|
| 555 |
+
"evidence_found": [],
|
| 556 |
+
"compliance_status": "ERROR",
|
| 557 |
+
"reasoning": f"Failed to verify requirement due to parsing error: {str(e)}",
|
| 558 |
+
"confidence": 0
|
| 559 |
+
}
|
| 560 |
+
|
| 561 |
+
|
| 562 |
+
|
| 563 |
+
    def analyze_compliance(self, requirements_data, packaging_text, packaging_data, image=None, barcode_data=None, metadata=None, model="claude-sonnet-4-20250514"):
        """
        Analyze packaging compliance through multi-step process:
        1. Extract structured requirements
        2. Verify each requirement with structured reasoning

        Args:
            requirements_data: The requirements data (text string or PDF dict)
            packaging_text: Markdown table extracted from the packaging PDF
            packaging_data: Structured text with bounding boxes
            image: The image of the packaging document
            barcode_data: List of barcode objects with position data
            metadata: Dictionary containing font, font size, and color metadata
            model: The Claude model to use
                NOTE(review): this parameter is currently never read — the final
                summary call below hard-codes a Haiku model (presumably to save
                cost on the cheap summarization step). Confirm whether `model`
                was meant to be honored somewhere in this method.

        Returns:
            A dictionary containing compliance analysis results with keys:
            "requirements", "verifications", "compliance_report",
            "packaging_data", "barcode_data", "metadata" — or, on failure,
            {"error": ..., "requirements": [], "verifications": []}.
        """
        # Step 1: Extract structured requirements
        st.info("Extracting structured requirements...")
        requirements = self.extract_structured_requirements(requirements_data)

        # Early exit with an explicit error payload when extraction yields nothing,
        # so callers always receive the same dictionary shape.
        if not requirements:
            st.warning("No requirements found in the document. Please check that your requirements file contains valid requirement statements.")
            return {"error": "No requirements found", "requirements": [], "verifications": []}

        st.success(f"Extracted {len(requirements)} requirements")

        # Step 2: Verify each requirement with structured reasoning
        # (one self.verify_individual_requirement call — i.e. one LLM round-trip —
        # per extracted requirement, with progress echoed to the Streamlit UI)
        st.info("Verifying requirements...")
        verifications = []

        for i, req in enumerate(requirements):
            st.text(f"Verifying requirement {i+1}/{len(requirements)}: {req['id']}")

            # Get verification result
            verification = self.verify_individual_requirement(req, packaging_text, image, barcode_data, metadata, requirements_data)
            verifications.append(verification)

        # Step 4: Generate final compliance report
        # NOTE(review): there is no "Step 3" in this method — numbering gap kept as-is.
        system_prompt = """You are a regulatory compliance expert. Provide detailed, objective compliance reports."""

        # Create minimal summary for LLM (save tokens)
        # Only id/status/confidence/evidence-count per verification are forwarded,
        # not the full evidence or reasoning text.
        compliance_summary = []
        for verification in verifications:
            compliance_summary.append({
                'requirement_id': verification.get('requirement_id', 'Unknown'),
                'compliance_status': verification.get('compliance_status', 'UNKNOWN'),
                'confidence': verification.get('confidence', 0),
                'evidence_count': len(verification.get('evidence_found', []))
            })

        # Large prompt template: the markdown below is sent verbatim to the model
        # as formatting instructions for the final report.
        summary_prompt = f"""
        Based on the verification of {len(requirements)} requirements,
        please provide a final compliance summary report.

        Requirements Summary:
        {json.dumps([{'id': req['id'], 'description': req['description'], 'category': req['category']} for req in requirements], indent=2)}

        Compliance Results Summary:
        {json.dumps(compliance_summary, indent=2)}

        Format your response in the following template:

        ## 🎯 **Analysis Requirements**

        Summarize the overall compliance status with focus on:

        1. **Quantitative Metrics**: Count of fully compliant, partially compliant, and non-compliant requirements
        2. **Critical Issues**: Most urgent compliance gaps requiring immediate attention
        3. **Strategic Recommendations**: Actionable steps for the artwork designer to fix the compliance issues

        ---

        ## 📋 **Response Template**

        ### 🔍 **Executive Summary**
        Provide a single, clear statement of overall compliance status
        *Example: "Organization achieved 70% compliance (14/20 requirements); moderate risk profile with 3 critical gaps identified."*

        ---

        ### 📈 **Compliance Statistics**

        | **Metric** | **Count** | **Percentage** |
        |------------|-----------|----------------|
        | **Total Requirements** | `[total]` | `100%` |
        | ✅ **Fully Compliant** | `[count]` | `[%]` |
        | ⚠️ **Partially Compliant** | `[count]` | `[%]` |
        | ❌ **Non-Compliant** | `[count]` | `[%]` |

        ---

        ### 🚨 **Priority Findings**

        List 3-5 highest-severity issues in order of criticality:

        1. **[REQ-ID]** - [Brief description of critical issue]
        2. **[REQ-ID]** - [Brief description of high-priority gap]
        3. **[REQ-ID]** - [Brief description of moderate-priority concern]

        ---

        ### 💡 **Targeted Recommendations**

        For each Priority Finding, provide specific corrective actions:

        | **Finding** | **Recommended Action** | **Priority** |
        |-------------|------------------------|--------------|
        | **[REQ-ID]** | [Specific artwork designer action] | 🔴 **Critical** |
        | **[REQ-ID]** | [Specific artwork designer action] | 🟡 **High** |
        | **[REQ-ID]** | [Specific artwork designer action] | 🟢 **Medium** |

        ---

        ### 📝 **Detailed Assessment Results**

        *[Provide comprehensive breakdown of each requirement with status and supporting details]*

        ---

        ### 📊 **Supporting Evidence**

        *[Include relevant data, metrics, or documentation that supports the compliance assessment]*


        """

        # Get the final compliance report
        # NOTE(review): hard-coded Haiku model here; the method's `model`
        # argument is ignored — confirm this is intentional (cost saving).
        compliance_report = self.llm.call_claude_api(summary_prompt, system_prompt, model='claude-3-5-haiku-20241022')

        # Compile all results
        result = {
            "requirements": requirements,
            "verifications": verifications,
            "compliance_report": compliance_report,
            "packaging_data": packaging_data,
            "barcode_data": barcode_data,
            "metadata": metadata
        }

        return result
|
src/extract_text/__pycache__/extract_meta_data.cpython-313.pyc
ADDED
|
Binary file (16.2 kB). View file
|
|
|
src/extract_text/__pycache__/google_document_api.cpython-313.pyc
ADDED
|
Binary file (13.1 kB). View file
|
|
|
src/extract_text/__pycache__/google_document_api.cpython-313.pyc.1480615374128
ADDED
|
Binary file (7.92 kB). View file
|
|
|
src/extract_text/__pycache__/ingest.cpython-313.pyc
ADDED
|
Binary file (3.51 kB). View file
|
|
|
src/extract_text/extract_meta_data.py
ADDED
|
@@ -0,0 +1,355 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz # PyMuPDF
|
| 2 |
+
import pytesseract
|
| 3 |
+
from PIL import Image
|
| 4 |
+
import numpy as np
|
| 5 |
+
import cv2
|
| 6 |
+
from collections import defaultdict, Counter
|
| 7 |
+
import io
|
| 8 |
+
import re
|
| 9 |
+
from typing import Dict, List, Tuple, Optional, Union
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class PDFArtworkMetadataExtractor:
    """
    A class for extracting metadata (font, font size, text color) from artwork PDFs.
    Handles both selectable text and non-selectable text using OCR.
    """

    def __init__(self, tesseract_path: Optional[str] = None):
        """
        Initialize the metadata extractor.

        Args:
            tesseract_path: Path to tesseract executable (if not in PATH)
        """
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

        # Currently open PyMuPDF document (set by load_pdf, closed by extract_metadata)
        self.pdf_doc = None
        # Accumulated results; the frequency dicts map value -> character count.
        self.metadata = {
            'fonts': {},
            'font_sizes': {},
            'text_colors': {},
            'has_selectable_text': False,
            'pages_processed': 0,
            'extraction_method': None
        }

    def load_pdf(self, pdf_path: str) -> bool:
        """
        Load PDF document.

        Args:
            pdf_path: Path to PDF file

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            self.pdf_doc = fitz.open(pdf_path)
            return True
        except Exception as e:
            print(f"Error loading PDF: {e}")
            return False

    def _extract_selectable_text_metadata(self) -> Dict:
        """
        Extract metadata from selectable text using PyMuPDF.

        Returns:
            Dict: Metadata dictionary with fonts, sizes, and colors.
            Each frequency is weighted by the number of characters in the span,
            so longer runs of text count more than short ones.
        """
        fonts = defaultdict(int)
        font_sizes = defaultdict(int)
        colors = defaultdict(int)

        for page_num in range(len(self.pdf_doc)):
            page = self.pdf_doc[page_num]

            # Get text with formatting information
            # ("dict" mode yields blocks -> lines -> spans with font/size/color)
            text_dict = page.get_text("dict")

            for block in text_dict["blocks"]:
                # Image blocks have no "lines" key; only text blocks are processed.
                if "lines" in block:
                    for line in block["lines"]:
                        for span in line["spans"]:
                            # Extract font information
                            font_name = span.get("font", "Unknown")
                            font_size = span.get("size", 0)

                            # Extract color (RGB)
                            color = span.get("color", 0)
                            if isinstance(color, int):
                                # Convert integer color to RGB
                                # (PyMuPDF packs sRGB as 0xRRGGBB)
                                r = (color >> 16) & 255
                                g = (color >> 8) & 255
                                b = color & 255
                                color_rgb = (r, g, b)
                            else:
                                color_rgb = (0, 0, 0)  # Default to black

                            # Count occurrences
                            text_content = span.get("text", "").strip()
                            if text_content:
                                fonts[font_name] += len(text_content)
                                # Round font size to one decimal place
                                rounded_size = round(font_size, 1)
                                font_sizes[rounded_size] += len(text_content)
                                colors[color_rgb] += len(text_content)

        return {
            'fonts': dict(fonts),
            'font_sizes': dict(font_sizes),
            'text_colors': dict(colors)
        }

    def _preprocess_image_for_ocr(self, image: np.ndarray) -> np.ndarray:
        """
        Preprocess image for better OCR results.

        Args:
            image: Input image as numpy array

        Returns:
            np.ndarray: Preprocessed (binarized grayscale) image
        """
        # Convert to grayscale
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        else:
            gray = image

        # Apply denoising
        denoised = cv2.fastNlMeansDenoising(gray)

        # Apply adaptive thresholding
        # (Gaussian-weighted local threshold, 11x11 neighborhood, C=2)
        thresh = cv2.adaptiveThreshold(
            denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 11, 2
        )

        return thresh

    def _estimate_font_size_from_ocr(self, image: np.ndarray, text_data: Dict) -> Dict[float, int]:
        """
        Estimate font sizes from OCR bounding boxes.

        Args:
            image: Input image
            text_data: OCR data from pytesseract

        Returns:
            Dict: Font sizes and their frequencies (weighted by character count)
        """
        font_sizes = defaultdict(int)

        for i, text in enumerate(text_data['text']):
            if text.strip():
                height = text_data['height'][i]
                # Estimate font size from bounding box height
                # NOTE(review): the caller OCRs a 2x-zoom render
                # (see _extract_ocr_metadata), and this heuristic does not
                # divide by the zoom factor — estimates may be ~2x the true
                # point size before clamping. Confirm whether that is intended.
                estimated_size = max(8, min(72, height * 0.75))  # Rough conversion
                # Round to one decimal place
                rounded_size = round(estimated_size, 1)
                font_sizes[rounded_size] += len(text.strip())

        return dict(font_sizes)

    def _extract_colors_from_image(self, image: np.ndarray, text_data: Dict) -> Dict[Tuple[int, int, int], int]:
        """
        Extract dominant colors from text regions.

        Args:
            image: Input image
            text_data: OCR data from pytesseract

        Returns:
            Dict: Colors and their frequencies (weighted by character count)
        """
        colors = defaultdict(int)

        for i, text in enumerate(text_data['text']):
            if text.strip():
                x, y, w, h = (text_data['left'][i], text_data['top'][i],
                             text_data['width'][i], text_data['height'][i])

                # Extract text region
                # (only the top-left corner is bounds-checked; numpy slicing
                # silently clips regions that run past the right/bottom edge)
                if 0 <= y < image.shape[0] and 0 <= x < image.shape[1]:
                    text_region = image[y:y+h, x:x+w]

                    if text_region.size > 0:
                        if len(text_region.shape) == 3:
                            # For color images, find dominant color
                            pixels = text_region.reshape(-1, 3)
                            # Find the most common color that's not white/background
                            unique_colors, counts = np.unique(pixels, axis=0, return_counts=True)

                            # Filter out likely background colors (very light colors)
                            # NOTE(review): every sufficiently dark unique color in
                            # the region is credited with the full character count
                            # of the word, so totals can exceed the actual number
                            # of characters — acceptable for ranking, not for sums.
                            for color, count in zip(unique_colors, counts):
                                if np.mean(color) < 200:  # Not too light
                                    colors[tuple(color)] += len(text.strip())
                        else:
                            # For grayscale, assume black text
                            avg_intensity = np.mean(text_region)
                            if avg_intensity < 128:  # Dark text
                                colors[(0, 0, 0)] += len(text.strip())

        return dict(colors)

    def _extract_ocr_metadata(self) -> Dict:
        """
        Extract metadata using OCR for non-selectable text.

        Returns:
            Dict: Metadata dictionary with estimated fonts, sizes, and colors
        """
        all_font_sizes = defaultdict(int)
        all_colors = defaultdict(int)

        for page_num in range(len(self.pdf_doc)):
            page = self.pdf_doc[page_num]

            # Convert page to image
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x zoom for better quality
            img_data = pix.tobytes("ppm")
            image = Image.open(io.BytesIO(img_data))
            image_np = np.array(image)

            # Preprocess image
            processed_img = self._preprocess_image_for_ocr(image_np)

            # Perform OCR with detailed data
            # (image_to_data returns parallel lists: text, left, top, width, height, ...)
            ocr_data = pytesseract.image_to_data(processed_img, output_type=pytesseract.Output.DICT)

            # Extract font sizes
            page_font_sizes = self._estimate_font_size_from_ocr(processed_img, ocr_data)
            for size, count in page_font_sizes.items():
                all_font_sizes[size] += count

            # Extract colors
            # (colors are sampled from the original render, not the binarized one)
            page_colors = self._extract_colors_from_image(image_np, ocr_data)
            for color, count in page_colors.items():
                all_colors[color] += count

        # For OCR, we can't determine exact fonts, so provide common estimates
        # NOTE: these are fixed proportional guesses (values are floats, not
        # integer character counts like the selectable-text path produces).
        estimated_fonts = {
            'Arial-like': sum(all_font_sizes.values()) * 0.4,
            'Times-like': sum(all_font_sizes.values()) * 0.3,
            'Helvetica-like': sum(all_font_sizes.values()) * 0.3
        }

        return {
            'fonts': estimated_fonts,
            'font_sizes': dict(all_font_sizes),
            'text_colors': dict(all_colors)
        }

    def _has_selectable_text(self) -> bool:
        """
        Check if PDF has selectable text.

        Returns:
            bool: True if PDF has selectable text
        """
        for page_num in range(min(3, len(self.pdf_doc))):  # Check first 3 pages
            page = self.pdf_doc[page_num]
            text = page.get_text().strip()
            if text:
                return True
        return False

    def extract_metadata(self, pdf_path: str) -> Dict:
        """
        Extract metadata from PDF artwork.

        Chooses the extraction strategy automatically: PyMuPDF span metadata
        when the PDF has a selectable text layer, OCR-based estimation otherwise.
        The document is always closed before returning.

        Args:
            pdf_path: Path to PDF file

        Returns:
            Dict: Complete metadata dictionary, or {'error': ...} on failure
        """
        if not self.load_pdf(pdf_path):
            return {'error': 'Failed to load PDF'}

        try:
            self.metadata['pages_processed'] = len(self.pdf_doc)
            has_selectable = self._has_selectable_text()
            self.metadata['has_selectable_text'] = has_selectable

            if has_selectable:
                self.metadata['extraction_method'] = 'selectable_text'
                extracted_data = self._extract_selectable_text_metadata()
            else:
                self.metadata['extraction_method'] = 'ocr'
                extracted_data = self._extract_ocr_metadata()

            # Update metadata
            self.metadata.update(extracted_data)

            # Sort by frequency (most common first)
            self.metadata['fonts'] = dict(sorted(
                self.metadata['fonts'].items(),
                key=lambda x: x[1],
                reverse=True
            ))

            self.metadata['font_sizes'] = dict(sorted(
                self.metadata['font_sizes'].items(),
                key=lambda x: x[1],
                reverse=True
            ))

            self.metadata['text_colors'] = dict(sorted(
                self.metadata['text_colors'].items(),
                key=lambda x: x[1],
                reverse=True
            ))

            return self.metadata

        except Exception as e:
            return {'error': f'Failed to extract metadata: {e}'}

        finally:
            # Always release the document handle, success or failure.
            if self.pdf_doc:
                self.pdf_doc.close()

    def get_dominant_font(self) -> Optional[str]:
        """Get the most frequently used font."""
        if self.metadata['fonts']:
            return max(self.metadata['fonts'], key=self.metadata['fonts'].get)
        return None

    def get_dominant_font_size(self) -> Optional[float]:
        """Get the most frequently used font size."""
        if self.metadata['font_sizes']:
            return max(self.metadata['font_sizes'], key=self.metadata['font_sizes'].get)
        return None

    def get_dominant_color(self) -> Optional[Tuple[int, int, int]]:
        """Get the most frequently used text color."""
        if self.metadata['text_colors']:
            return max(self.metadata['text_colors'], key=self.metadata['text_colors'].get)
        return None

    def print_summary(self):
        """Print a summary of extracted metadata to stdout."""
        print("PDF Artwork Metadata Summary")
        print("=" * 40)
        print(f"Pages processed: {self.metadata['pages_processed']}")
        print(f"Has selectable text: {self.metadata['has_selectable_text']}")
        print(f"Extraction method: {self.metadata['extraction_method']}")
        print()

        print("Top 5 Fonts:")
        for i, (font, count) in enumerate(list(self.metadata['fonts'].items())[:5]):
            print(f"  {i+1}. {font}: {count} characters")
        print()

        print("Top 5 Font Sizes:")
        for i, (size, count) in enumerate(list(self.metadata['font_sizes'].items())[:5]):
            print(f"  {i+1}. {size}pt: {count} characters")
        print()

        print("Top 5 Text Colors (RGB):")
        for i, (color, count) in enumerate(list(self.metadata['text_colors'].items())[:5]):
            print(f"  {i+1}. {color}: {count} characters")
|
src/extract_text/google_document_api.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Optional, List, Dict, Any
|
| 3 |
+
from google.api_core.client_options import ClientOptions
|
| 4 |
+
from google.cloud import documentai # type: ignore
|
| 5 |
+
from PIL import Image, ImageChops
|
| 6 |
+
from io import BytesIO
|
| 7 |
+
import fitz # PyMuPDF
|
| 8 |
+
import base64
|
| 9 |
+
|
| 10 |
+
class GoogleDocumentAPI:
|
| 11 |
+
def __init__(self, credentials_path: str):
|
| 12 |
+
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path
|
| 13 |
+
|
| 14 |
+
self.project_id = "649829115993"
|
| 15 |
+
self.location = "us" # Format is "us" or "eu"
|
| 16 |
+
self.processor_id = "7f9fd758484d83fe" # Only use this
|
| 17 |
+
self.mime_type = "application/pdf" # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
|
| 18 |
+
|
| 19 |
+
def process_document(self, file_path: str, field_mask: Optional[str] = None, processor_version_id: Optional[str] = None) -> documentai.Document:
|
| 20 |
+
opts = ClientOptions(api_endpoint=f"{self.location}-documentai.googleapis.com")
|
| 21 |
+
client = documentai.DocumentProcessorServiceClient(client_options=opts)
|
| 22 |
+
|
| 23 |
+
if processor_version_id:
|
| 24 |
+
name = client.processor_version_path(
|
| 25 |
+
self.project_id, self.location, self.processor_id, processor_version_id
|
| 26 |
+
)
|
| 27 |
+
else:
|
| 28 |
+
name = client.processor_path(self.project_id, self.location, self.processor_id)
|
| 29 |
+
|
| 30 |
+
with open(file_path, "rb") as image:
|
| 31 |
+
image_content = image.read()
|
| 32 |
+
|
| 33 |
+
raw_document = documentai.RawDocument(content=image_content, mime_type=self.mime_type)
|
| 34 |
+
|
| 35 |
+
process_options = documentai.ProcessOptions(
|
| 36 |
+
individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
|
| 37 |
+
pages=[1]
|
| 38 |
+
)
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
request = documentai.ProcessRequest(
|
| 42 |
+
name=name,
|
| 43 |
+
raw_document=raw_document,
|
| 44 |
+
field_mask=field_mask,
|
| 45 |
+
process_options=process_options,
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
result = client.process_document(request=request)
|
| 49 |
+
return result.document
|
| 50 |
+
|
| 51 |
+
def get_document_text(self, document: documentai.Document, page_number: int = 0) -> str:
|
| 52 |
+
# Note: document.pages is 0-indexed. If you request page 1, it will be in document.pages[0]
|
| 53 |
+
return document.pages[page_number].text
|
| 54 |
+
|
| 55 |
+
@staticmethod
|
| 56 |
+
def _get_style_info(text_anchor: documentai.Document.TextAnchor, document: documentai.Document) -> str:
|
| 57 |
+
"""Helper function to extract style information for a text anchor."""
|
| 58 |
+
if not hasattr(document, 'text_styles') or not document.text_styles:
|
| 59 |
+
return "N/A"
|
| 60 |
+
|
| 61 |
+
styles = []
|
| 62 |
+
# A text anchor can have multiple non-contiguous segments.
|
| 63 |
+
for para_segment in text_anchor.text_segments:
|
| 64 |
+
para_start = int(para_segment.start_index)
|
| 65 |
+
para_end = int(para_segment.end_index)
|
| 66 |
+
|
| 67 |
+
for style in document.text_styles:
|
| 68 |
+
for style_segment in style.text_anchor.text_segments:
|
| 69 |
+
style_start = int(style_segment.start_index)
|
| 70 |
+
style_end = int(style_segment.end_index)
|
| 71 |
+
|
| 72 |
+
# Check for overlap between the paragraph segment and the style segment
|
| 73 |
+
if max(para_start, style_start) < min(para_end, style_end):
|
| 74 |
+
style_str_parts = []
|
| 75 |
+
if style.font_size and style.font_size.size > 0:
|
| 76 |
+
unit = style.font_size.unit if style.font_size.unit else 'pt'
|
| 77 |
+
style_str_parts.append(f"font size: {round(style.font_size.size)}{unit}")
|
| 78 |
+
if style.font_weight and style.font_weight.lower() != 'normal':
|
| 79 |
+
style_str_parts.append(f"font weight: {style.font_weight}")
|
| 80 |
+
if style.text_style and style.text_style.lower() != 'normal':
|
| 81 |
+
style_str_parts.append(f"text style: {style.text_style}")
|
| 82 |
+
if style.font_family:
|
| 83 |
+
style_str_parts.append(f'font family: {style.font_family}')
|
| 84 |
+
|
| 85 |
+
if style_str_parts:
|
| 86 |
+
styles.append(" ".join(style_str_parts))
|
| 87 |
+
|
| 88 |
+
if styles:
|
| 89 |
+
# Using dict.fromkeys to preserve order and get unique styles
|
| 90 |
+
unique_styles = list(dict.fromkeys(styles))
|
| 91 |
+
return ", ".join(unique_styles)
|
| 92 |
+
|
| 93 |
+
return "default"
|
| 94 |
+
|
| 95 |
+
@staticmethod
|
| 96 |
+
def _get_text(text_anchor: documentai.Document.TextAnchor, text: str) -> str:
|
| 97 |
+
"""Helper function to extract text from text_anchor."""
|
| 98 |
+
if not text_anchor.text_segments:
|
| 99 |
+
return ""
|
| 100 |
+
return "".join(
|
| 101 |
+
text[int(segment.start_index) : int(segment.end_index)]
|
| 102 |
+
for segment in text_anchor.text_segments
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
def extract_text_with_bounding_boxes(self, document: documentai.Document) -> List[Dict[str, Any]]:
|
| 106 |
+
"""
|
| 107 |
+
Extracts text and bounding box for each paragraph in the document.
|
| 108 |
+
|
| 109 |
+
Args:
|
| 110 |
+
document: The processed documentai.Document object.
|
| 111 |
+
|
| 112 |
+
Returns:
|
| 113 |
+
A list of dictionaries, where each dictionary contains:
|
| 114 |
+
- 'page_number': The page number (1-based).
|
| 115 |
+
- 'text': The text of the paragraph.
|
| 116 |
+
- 'bounding_box': A list of normalized vertices for the bounding box.
|
| 117 |
+
- 'style': Style information for the text.
|
| 118 |
+
- 'height': The height of the text block in millimeters (mm).
|
| 119 |
+
"""
|
| 120 |
+
all_paragraphs = []
|
| 121 |
+
full_text = document.text
|
| 122 |
+
pt_to_mm = 0.3528 # Conversion factor from points to millimeters
|
| 123 |
+
|
| 124 |
+
for page in document.pages:
|
| 125 |
+
# Get page height in points for height calculation
|
| 126 |
+
page_pts = page.dimension.height
|
| 127 |
+
|
| 128 |
+
for paragraph in page.paragraphs:
|
| 129 |
+
p_text = self._get_text(paragraph.layout.text_anchor, full_text)
|
| 130 |
+
style_info = self._get_style_info(paragraph.layout.text_anchor, document)
|
| 131 |
+
|
| 132 |
+
# Get the normalized vertices for the bounding box
|
| 133 |
+
vertices = [
|
| 134 |
+
{"x": vertex.x, "y": vertex.y}
|
| 135 |
+
for vertex in paragraph.layout.bounding_poly.normalized_vertices
|
| 136 |
+
]
|
| 137 |
+
|
| 138 |
+
# Calculate height in millimeters
|
| 139 |
+
y_coords = [vertex.y for vertex in paragraph.layout.bounding_poly.normalized_vertices]
|
| 140 |
+
height_ratio = max(y_coords) - min(y_coords)
|
| 141 |
+
height_pt = height_ratio * page_pts
|
| 142 |
+
height_mm = height_pt * pt_to_mm
|
| 143 |
+
|
| 144 |
+
all_paragraphs.append({
|
| 145 |
+
"page_number": page.page_number,
|
| 146 |
+
"text": p_text.strip(), # Use .strip() to remove leading/trailing whitespace
|
| 147 |
+
"bounding_box": vertices,
|
| 148 |
+
"style": style_info,
|
| 149 |
+
"height": round(height_mm, 2)
|
| 150 |
+
})
|
| 151 |
+
return all_paragraphs
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def extract_text_with_markdown_table(self, document: documentai.Document) -> str:
|
| 157 |
+
data = self.extract_text_with_bounding_boxes(document)
|
| 158 |
+
return self._create_markdown_table(data)
|
| 159 |
+
|
| 160 |
+
def _quantize_coord(self, val, grid_size=1000) -> int:
|
| 161 |
+
"""Converts a float (0-1) to an integer on a grid."""
|
| 162 |
+
return int(val * grid_size)
|
| 163 |
+
|
| 164 |
+
def _create_markdown_table(self, data) -> str:
|
| 165 |
+
table = "| Text ID | X | Y | Text Height (mm) | Style | Text |\\n"
|
| 166 |
+
table += "|----|-----|-----|--------|-------|-------------------------------------------------------------------------|\\n"
|
| 167 |
+
for i, item in enumerate(data):
|
| 168 |
+
top_left = item['bounding_box'][0]
|
| 169 |
+
x = self._quantize_coord(top_left['x'])
|
| 170 |
+
y = self._quantize_coord(top_left['y'])
|
| 171 |
+
height = round(item.get('height', 0), 2)
|
| 172 |
+
style = item.get('style', 'N/A')
|
| 173 |
+
text = item['text'].replace('\\n', ' ').replace('|', '\\\\|').strip()
|
| 174 |
+
table += f"| {i+1} | {x} | {y} | {height} | {style} | {text} |\\n"
|
| 175 |
+
return table
|
| 176 |
+
|
| 177 |
+
def get_bounding_boxes(self, document: documentai.Document, page_number: int = 0) -> list[documentai.BoundingPoly]:
    """
    Return the bounding polygon of every token on the requested page
    (0-indexed; defaults to the first page).
    """
    target_page = document.pages[page_number]
    return list(token.layout.bounding_poly for token in target_page.tokens)
|
| 183 |
+
|
| 184 |
+
def extract_text_heights_mm(self, document: documentai.Document) -> List[tuple]:
    """
    Extract the height of each line of text, in millimeters, from a
    Google Document AI parsed document.

    Parameters:
        document (google.cloud.documentai.Document): Parsed Document AI
            response object.

    Returns:
        List of tuples: [(page_num, line_text, height_mm), ...]

    Robustness fix vs. original: the original indexed
    ``text_segments[0]``, which raised IndexError for lines whose anchor
    has no segments and silently dropped text for lines spanning several
    segments. All segments are now concatenated, and segment-less lines
    yield an empty string instead of crashing.
    """
    heights = []
    pt_to_mm = 0.3528  # 1 pt = 25.4 / 72 mm

    for page_num, page in enumerate(document.pages, start=1):
        page_height_pt = page.dimension.height  # e.g., 792 for US Letter

        for line in page.lines:
            layout = line.layout
            vertices = layout.bounding_poly.normalized_vertices

            y_coords = [v.y for v in vertices]
            if not y_coords:
                # No geometry for this line; nothing to measure.
                continue

            height_ratio = max(y_coords) - min(y_coords)
            height_mm = height_ratio * page_height_pt * pt_to_mm

            # Reconstruct the visible text by joining every anchor segment.
            segments = layout.text_anchor.text_segments
            line_text = "".join(
                document.text[int(seg.start_index):int(seg.end_index)]
                for seg in segments
            ).strip()

            heights.append((page_num, line_text, round(height_mm, 2)))

    return heights
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
|
src/extract_text/ingest.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
from io import BytesIO
|
| 3 |
+
|
| 4 |
+
class RequirementsIngest:
    """Loads a requirements document (TXT or PDF) into a uniform dict."""

    def __init__(self):
        pass

    def ingest_requirements_document(self, file_obj) -> dict:
        """
        Ingest a requirements document from a file-like object.
        Supports both TXT and PDF files.

        Returns:
            dict: {
                'type': 'text' or 'pdf',
                'content': str (for text) or base64 string (for PDF),
                'filename': str,
                'text_content': str (extracted text for PDFs, same as content for TXT),
                'file_size': int (bytes)
            }

        Raises:
            ValueError: if the file cannot be read.
        """
        try:
            filename = getattr(file_obj, 'name', 'unknown')
            file_extension = filename.lower().split('.')[-1] if '.' in filename else ''

            if file_extension == 'pdf':
                # Read the raw PDF bytes once (the original seek/read the
                # stream a second time inside the extraction block).
                file_obj.seek(0)
                pdf_content = file_obj.read()

                # Downstream consumers (e.g. Claude) accept the PDF as base64.
                pdf_base64 = base64.b64encode(pdf_content).decode('utf-8')

                # Best-effort text extraction for backward compatibility;
                # the primary payload remains the PDF itself.
                text_content = self._extract_pdf_text(pdf_content, filename)

                return {
                    'type': 'pdf',
                    'content': pdf_base64,
                    'filename': filename,
                    'text_content': text_content,
                    'file_size': len(pdf_content)
                }
            else:
                # Default behavior: treat the payload as text.
                file_obj.seek(0)
                text = file_obj.read()
                if isinstance(text, bytes):
                    text = text.decode("utf-8", errors="replace")

                return {
                    'type': 'text',
                    'content': text,
                    'filename': filename,
                    'text_content': text,
                    'file_size': len(text.encode('utf-8'))
                }

        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise ValueError(f"Error reading requirements document: {e}") from e

    @staticmethod
    def _extract_pdf_text(pdf_content: bytes, filename: str, preview_chars: int = 1000) -> str:
        """Best-effort text extraction from PDF bytes via PyPDF2.

        Returns a descriptive placeholder when extraction is impossible.
        Bug fix vs. original: the placeholder messages were f-strings with
        no placeholders that emitted the literal text "(unknown)"; they
        now interpolate the actual filename.
        """
        try:
            from PyPDF2 import PdfReader
            from io import BytesIO

            reader = PdfReader(BytesIO(pdf_content))
            text_content = "".join(page.extract_text() + "\n" for page in reader.pages)

            if not text_content.strip():
                return f"PDF Requirements Document: {filename} (no text content found)"
            # Limit text content for display.
            if len(text_content) > preview_chars:
                return text_content[:preview_chars] + "..."
            return text_content

        except ImportError:
            # PyPDF2 not available; fall back to a basic description.
            return f"PDF Requirements Document: {filename} (PyPDF2 not available for text extraction)"
        except Exception as e:
            return f"PDF Requirements Document: {filename} (text extraction failed: {str(e)})"
|
| 91 |
+
|
| 92 |
+
|
src/extract_text/photon-services-f0d3ec1417d0.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"type": "service_account",
|
| 3 |
+
"project_id": "photon-services",
|
| 4 |
+
"private_key_id": "f0d3ec1417d0afe1a21079a88350de615829fb38",
|
| 5 |
+
"private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDGUlwi7owC2jS0\n9miy5mDi9Q84/8arKMkG8n2Zok7lfFz9cFf76G/ai1eIAvQ9u6OV2ddt05lZMX8S\n+q5PSFlmeOCXSHcnufoTsWY5FKTXWzWd4dZ6lMsCOq7kWB+tHEhlftxMR1egI7sn\nA3z32cbydPewInvw6QMMLaFdtACS8p09QnRZSdYnGX5FNJr9Hq+NBa5qRHqA0y8g\n6x5lo/Ybku3bKCNAu4NWOErsKZ4Z0yEZzggad7nojx1oA9wmIVaTbrJ6OY2kPOMN\n0mBQJBOdRaw5fIHiDYH18tnR0UzVVEnv2s1LADcSpe144nDbIlLdD3DsZ0H9j91J\n7b+EnbaJAgMBAAECggEAHXRe/csrHUNWP6g3LZbcveiCnccTNRmGHdOHBvnduOSr\nFPMKBj5j2nQGiItTxhVnutpTThr2tBIPWvzDRcArkvYR+TYIiGxtMV6QHZsszlVc\nFbpUdflCW27mycAy2C2SrQxV4LhZ0c1svuMcPN1p2Fm57b15ZfLdgoIGbNnOmgRO\nmOjJxXnjbPq4pFnZYVB2GxV7t3O8kzTG8msWFeIuOfrs6UJpXAS91BQXfLmnaxv5\nP56EaNGyamQgHVnOrtoLoTTUFrfNUFCl2Ggrs80FfS0ZJaIWqrItDLI9ah9MgfeL\nTwrcgjWFodX0BRu7Er2RX5Bo/vhhIVVZeOIHxzKWFwKBgQDk0+QCqChmMAOvchlX\nWb6XADW8qyYYbEPSVO+/IJi0teqIDGW/d1F0QrDdZc8dYlmaUqCt5z1NT8PdXSXd\nTifDRXLbHaKlFS3DQF+ComgC+ey9cUjZ0nMiCqzYKUftkmM2xWWJcLfEXPuWSZiy\n//Yqctd1ilQjk5pMyJFaT5k0MwKBgQDd3x8DwqEyWHk/nT4RQSVGp4S9+ZLegu+K\nefLPpCQevc0klvQVDospob181jZqBnWPDBd7fPyBc3+HmD/zzmU2YHlyWg3n9scb\nq/5WOssxjGkjhb8OftwsUesYLPFm6HcVfb+kiHJm+FKk2Yb935L90S3oOd0ljIuk\ng6LJF40OUwKBgE53XmOO2DOaWVkrLgdnDdTnzIWCxtBvJ56TY5bNja/CBcdbQPSz\n7KmKSO3SgIAZ/pHNra2Ucs/0/zwEOfy2VSo/wU/jzKcBKS0gAOBh4nrKyuR3WTzg\nTnyo3nZNSY3subrJW7USguGB5P+3Ava2kOojcUCsC4gbkDiuOjGWw/lDAoGBAIiG\nTihbMCOxq1JIqLOnWY+jbxwTIZvICCw2pAG/J/a+pif4t1Lpsxo4C0hw6+TL+rS+\nJQj4vMvPTU8bkWatvzv5m2GRJnNxN83ARO28meHwW5XfK9R4nXSsJ7SlmxnOu9A+\no5lT2MmhzgDgVZ+MXn/Ooqf+SyVa2WavFZEV69c/AoGACpBkRiXMscE1FISCy+lr\nDTIvGtqsMMadN7N+2ceQB+Yr/slE7FaCHblPWo2VnPosazis2340XW5LUhRYcATn\nuhwwFLGvC2IXSAq4uAyHSSiHVtwDjKWcJakkMnKlFuK1a5AI/2vMLkb3wKqyxKxC\nvQ0KZDSe4YO4nJk983CUL4g=\n-----END PRIVATE KEY-----\n",
|
| 6 |
+
"client_email": "jake-document-ai-test@photon-services.iam.gserviceaccount.com",
|
| 7 |
+
"client_id": "105944418590442697805",
|
| 8 |
+
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
|
| 9 |
+
"token_uri": "https://oauth2.googleapis.com/token",
|
| 10 |
+
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
|
| 11 |
+
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/jake-document-ai-test%40photon-services.iam.gserviceaccount.com",
|
| 12 |
+
"universe_domain": "googleapis.com"
|
| 13 |
+
}
|
src/utils/__pycache__/barcode.cpython-313.pyc
ADDED
|
Binary file (4.48 kB). View file
|
|
|
src/utils/__pycache__/image_utils.cpython-313.pyc
ADDED
|
Binary file (9.62 kB). View file
|
|
|
src/utils/barcode.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import numpy as np
|
| 3 |
+
from PIL import Image
|
| 4 |
+
import zxingcpp
|
| 5 |
+
import barcodenumber
|
| 6 |
+
|
| 7 |
+
class Barcode:
    """Scan barcodes in images (via zxing-cpp) and validate their numbers."""

    def __init__(self):
        # Maps zxing-cpp format names (upper-cased) to barcodenumber codes.
        self._SYM_ALIAS = {
            'EAN13': 'ean13',
            'EAN8': 'ean8',
            'UPCA': 'upc',
            'UPC-A': 'upc',
        }

    def validate_barcode(self, data: str, sym: str) -> bool:
        """Return True if *data* is a valid number for symbology *sym*.

        Unknown symbologies are probed against every known numeric format;
        non-numeric data under an unknown symbology is rejected.
        """
        # Empty strings are always invalid.
        if not data:
            return False

        # For unknown symbology, try all known formats first.
        if sym.upper() not in self._SYM_ALIAS:
            if data.isdigit():
                for known_format in ['ean13', 'ean8', 'upc']:
                    try:
                        if barcodenumber.check_code(known_format, data):
                            return True
                    except (ValueError, KeyError):
                        continue
            # If no known format matches, validate basic structure
            return False

        # For known formats, validate normally.
        # BUG FIX: look up the alias with the same normalized key used in
        # the membership test above. The original passed raw `sym`, so a
        # mixed-case known name (e.g. "Upca") missed the alias table and
        # fell through to the invalid barcodenumber code "upca".
        code = self._SYM_ALIAS.get(sym.upper(), sym.lower())
        try:
            return barcodenumber.check_code(code, data)
        except (ValueError, KeyError):
            return False

    def scan_and_validate(self, image, show_image: bool = False):
        """Detect all barcodes in *image* and validate each one.

        Args:
            image: An OpenCV BGR ndarray or a PIL image.
            show_image: Currently unused; kept for API compatibility.
                NOTE(review): callers can use draw_box() themselves.

        Returns:
            list[dict]: one record per detected barcode with id, type,
            data, validity flag, and pixel-space position corners.
        """
        # 1) normalize to an OpenCV BGR numpy array
        if isinstance(image, np.ndarray):
            cv_img = image.copy()
        else:
            # assume PIL (RGB) input
            cv_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

        # 2) zxing-cpp scans a PIL image, so convert back from BGR
        pil_for_scan = Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB))
        barcodes = zxingcpp.read_barcodes(pil_for_scan)

        results = []
        for i, barcode in enumerate(barcodes):
            pos = barcode.position
            if pos:
                # Axis-aligned bounding box around the four corners.
                pts = [pos.top_left, pos.top_right, pos.bottom_right, pos.bottom_left]
                xs = [p.x for p in pts]
                ys = [p.y for p in pts]
                x, y = int(min(xs)), int(min(ys))
                w, h = int(max(xs) - x), int(max(ys) - y)
            else:
                # No position reported: fall back to a placeholder box.
                x, y, w, h = 0, 0, 100, 50

            raw = barcode.text
            sym = str(barcode.format)
            ok = self.validate_barcode(raw, sym)

            # Create barcode result with position data
            barcode_result = {
                'id': f'BARCODE_{i+1:03d}',
                'type': sym,
                'data': raw,
                'valid': ok,
                'position': {
                    'x': x,
                    'y': y,
                    'width': w,
                    'height': h,
                    'top_left': {'x': x, 'y': y},
                    'top_right': {'x': x + w, 'y': y},
                    'bottom_right': {'x': x + w, 'y': y + h},
                    'bottom_left': {'x': x, 'y': y + h}
                }
            }

            results.append(barcode_result)

        return results

    def draw_box(self, img, x, y, w, h, sym, raw, ok):
        """Draw one labeled barcode box on *img* (green=valid, red=invalid)."""
        color = (0, 255, 0) if ok else (0, 0, 255)
        cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
        cv2.putText(img, f"{sym}:{raw}", (x, y - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
        return img
|
src/utils/image_utils.py
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
from io import BytesIO
|
| 3 |
+
from PIL import Image, ImageChops
|
| 4 |
+
from PIL import ImageDraw
|
| 5 |
+
import math
|
| 6 |
+
|
| 7 |
+
class ImageUtils:
    """Static helpers for cropping, annotating, resizing, and encoding PIL images."""

    def __init__(self):
        pass

    @staticmethod
    def crop_base64(base64_string, output_format='PNG') -> str:
        """
        Takes a base64 encoded image, crops it by removing uniform background,
        and returns the cropped image as base64.

        Args:
            base64_string (str or bytes): Base64 encoded image string or raw bytes
            output_format (str): Output image format ('PNG', 'JPEG', etc.)

        Returns:
            str: Base64 encoded cropped image, or empty string if cropping fails
        """
        try:
            # Accept either raw image bytes or a base64 string.
            if isinstance(base64_string, bytes):
                image_data = base64_string
            else:
                image_data = base64.b64decode(base64_string)

            im = Image.open(BytesIO(image_data))

            # Trim: diff against a canvas filled with the top-left pixel,
            # amplify (scale 2.0, offset -100) so near-background noise is
            # ignored, then crop to the non-zero bounding box.
            bg = Image.new(im.mode, im.size, im.getpixel((0, 0)))
            diff = ImageChops.difference(im, bg)
            diff = ImageChops.add(diff, diff, 2.0, -100)
            bbox = diff.getbbox()

            cropped_im = im.crop(bbox) if bbox else im

            # Re-encode the (possibly unchanged) image back to base64.
            buffer = BytesIO()
            cropped_im.save(buffer, format=output_format)
            return base64.b64encode(buffer.getvalue()).decode('utf-8')

        except Exception as e:
            # Best-effort helper: report and return an empty string.
            print(f"Error processing image: {e}")
            return ""

    @staticmethod
    def crop_image(im: Image.Image) -> Image.Image:
        """
        Trim uniform background from a PIL Image (same algorithm as
        crop_base64, operating directly on the Image object).
        """
        try:
            bg = Image.new(im.mode, im.size, im.getpixel((0, 0)))
            diff = ImageChops.difference(im, bg)
            diff = ImageChops.add(diff, diff, 2.0, -100)
            bbox = diff.getbbox()
            if bbox:
                return im.crop(bbox)
            return im
        except Exception as e:
            print(f"Error cropping image: {e}")
            return im

    @staticmethod
    def draw_bounding_boxes(pil_image: Image.Image, boxes: list[tuple[int, int, int, int]], color: str = "red", width: int = 2) -> Image.Image:
        """
        Draw bounding boxes on a PIL image (mutates the image in place).

        Args:
            pil_image: A PIL.Image instance.
            boxes: A list of boxes, each specified as (x1, y1, x2, y2).
            color: The color for the bounding box outline.
            width: The width of the bounding box line.

        Returns:
            The PIL.Image with drawn bounding boxes.
        """
        try:
            draw = ImageDraw.Draw(pil_image)
            for box in boxes:
                draw.rectangle(box, outline=color, width=width)
            return pil_image
        except Exception as e:
            print(f"Error drawing bounding boxes: {e}")
            return pil_image

    @staticmethod
    def standardize_image_size(image: Image.Image, target_size: tuple = (1200, 1600), maintain_aspect_ratio: bool = True) -> Image.Image:
        """
        Resize image to target size while optionally maintaining aspect ratio.

        Args:
            image: PIL Image to resize
            target_size: Target (width, height) in pixels
            maintain_aspect_ratio: If True, fit within target size while
                maintaining aspect ratio; the result is centered on a white
                RGB canvas of exactly target_size.

        Returns:
            Resized PIL Image
        """
        if maintain_aspect_ratio:
            img_ratio = image.width / image.height
            target_ratio = target_size[0] / target_size[1]

            if img_ratio > target_ratio:
                # Image is wider than target: fit to width.
                new_width = target_size[0]
                new_height = int(target_size[0] / img_ratio)
            else:
                # Image is taller than target: fit to height.
                new_height = target_size[1]
                new_width = int(target_size[1] * img_ratio)

            resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

            # Center the resized image on a white background of target_size.
            final_image = Image.new('RGB', target_size, 'white')
            x_offset = (target_size[0] - new_width) // 2
            y_offset = (target_size[1] - new_height) // 2
            final_image.paste(resized_image, (x_offset, y_offset))

            return final_image
        else:
            # Direct (possibly distorting) resize to target size.
            return image.resize(target_size, Image.Resampling.LANCZOS)

    @staticmethod
    def optimize_image_quality(image: Image.Image, max_size_bytes: int = 1024 * 1024, initial_quality: int = 95) -> tuple[Image.Image, int]:
        """
        Optimize image encoding to fit within a specified file size limit.

        PNG is lossless, so compression level (0-9) is tried in increasing
        order instead of a JPEG-style quality value; `initial_quality` is
        unused but kept for interface compatibility.

        Args:
            image: PIL Image to optimize
            max_size_bytes: Maximum file size in bytes (default 1MB)
            initial_quality: Unused for PNG; kept for compatibility

        Returns:
            Tuple of (optimized_image, final_quality). The quality value is
            a nominal 95 when the limit was met, 50 otherwise.
        """
        compression_levels = [0, 1, 3, 5, 7, 9]

        buffer = BytesIO()
        for compression in compression_levels:
            buffer = BytesIO()
            image.save(buffer, format='PNG', optimize=True, compress_level=compression)
            if buffer.tell() <= max_size_bytes:
                buffer.seek(0)
                return Image.open(buffer), 95

        # Nothing fit under the limit. The loop's final iteration already
        # encoded at compress_level=9, so reuse that buffer instead of
        # re-encoding (the original performed a redundant third save here).
        buffer.seek(0)
        return Image.open(buffer), 50

    @staticmethod
    def process_image_for_comparison(image: Image.Image, target_size: tuple = (1200, 1600), max_size_bytes: int = 1024 * 1024) -> tuple[Image.Image, int, int]:
        """
        Process image for comparison: standardize size and optimize quality.

        Args:
            image: PIL Image to process
            target_size: Target size in pixels (width, height)
            max_size_bytes: Maximum file size in bytes (default 1MB)

        Returns:
            Tuple of (processed_image, final_quality, file_size_bytes)
        """
        # First, standardize the size.
        sized_image = ImageUtils.standardize_image_size(image, target_size, maintain_aspect_ratio=True)

        # Then optimize the encoding to fit within the size limit.
        optimized_image, quality = ImageUtils.optimize_image_quality(sized_image, max_size_bytes)

        # NOTE(review): this save uses PIL's default compress level, so the
        # reported size may differ slightly from the encode chosen above —
        # confirm whether an exact figure matters to callers.
        buffer = BytesIO()
        optimized_image.save(buffer, format='PNG', optimize=True)
        file_size = buffer.tell()

        return optimized_image, quality, file_size

    @staticmethod
    def image_to_base64_optimized(image: Image.Image, target_size: tuple = (1200, 1600), max_size_bytes: int = 1024 * 1024) -> str:
        """
        Convert image to base64 with size and quality optimization.

        Args:
            image: PIL Image to convert
            target_size: Target size in pixels (width, height)
            max_size_bytes: Maximum file size in bytes (default 1MB)

        Returns:
            Base64 encoded string of the optimized image (PNG format)
        """
        processed_image, quality, file_size = ImageUtils.process_image_for_comparison(
            image, target_size, max_size_bytes
        )

        # Encode the processed image as PNG and then base64.
        buffer = BytesIO()
        processed_image.save(buffer, format='PNG', optimize=True)
        return base64.b64encode(buffer.getvalue()).decode('utf-8')
|