Jakecole1 committed on
Commit
863cb78
·
verified ·
1 Parent(s): a5cebee

Upload 18 files

Browse files
src/core/__pycache__/analysis.cpython-313.pyc ADDED
Binary file (28.5 kB). View file
 
src/core/__pycache__/analysis.cpython-313.pyc.1424781933232 ADDED
Binary file (25.4 kB). View file
 
src/core/__pycache__/analysis.cpython-313.pyc.3054062041392 ADDED
Binary file (24.3 kB). View file
 
src/core/__pycache__/analysis.cpython-313.pyc.3054062628656 ADDED
Binary file (22.2 kB). View file
 
src/core/__pycache__/analysis.cpython-313.pyc.3054062929328 ADDED
Binary file (21.1 kB). View file
 
src/core/analysis.py ADDED
@@ -0,0 +1,704 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import anthropic
3
+ import requests
4
+ import streamlit as st
5
+ import numpy as np
6
+ import json
7
+ import re
8
+ from requests.adapters import HTTPAdapter
9
+ from urllib3.util.retry import Retry
10
+ from src.extract_text.google_document_api import GoogleDocumentAPI
11
+
12
# Anthropic Messages API endpoint; shared by every request helper in LLM below.
CLAUDE_API_URL = "https://api.anthropic.com/v1/messages"
13
+
14
+
15
+
16
class LLM:
    """Thin client for the Anthropic Messages HTTP API.

    Responsibilities:
      * authentication via the CLAUDE_API_KEY environment variable,
      * transport-level retries (urllib3 ``Retry`` mounted on the session),
      * an application-level retry loop for Anthropic's 529 "overloaded"
        responses,
      * normalising every failure to an empty string so callers never have
        to catch transport exceptions (errors are surfaced via Streamlit).
    """

    def __init__(self):
        """Read CLAUDE_API_KEY and build an HTTPS session with retries.

        Raises:
            ValueError: If the CLAUDE_API_KEY environment variable is unset.
        """
        self.claude_api_key = os.getenv('CLAUDE_API_KEY')
        if not self.claude_api_key:
            raise ValueError("Please set the CLAUDE_API_KEY environment variable.")

        # Configure retry strategy with comprehensive error handling.
        retry_strategy = Retry(
            total=5,  # total transport-level retries
            backoff_factor=2,  # exponential backoff between attempts
            status_forcelist=[429, 500, 502, 503, 504, 529],  # 529 = Anthropic server overload
            allowed_methods=["POST"],  # only retry POST requests
            respect_retry_after_header=True,  # honour server Retry-After hints
        )

        # Create session with the retry strategy mounted for all HTTPS calls.
        self.session = requests.Session()
        self.session.mount("https://", HTTPAdapter(max_retries=retry_strategy))

    def _headers(self) -> dict:
        """Return the standard Anthropic Messages API request headers."""
        return {
            "x-api-key": self.claude_api_key,
            "anthropic-version": "2023-06-01",
            "Content-Type": "application/json"
        }

    def _post_messages(self, payload: dict, timeout: int, api_label: str) -> str:
        """POST *payload* to the Messages endpoint and return the reply text.

        Shared transport for the text and vision entry points, which
        previously duplicated this retry/error-handling loop verbatim.

        Args:
            payload: Fully-built Messages API request body.
            timeout: Per-request timeout in seconds.
            api_label: Human-readable API name used in user-facing messages
                ("Claude API" or "Claude Vision API") so the original
                error strings are preserved exactly.

        Returns:
            The first content block's text on success, otherwise "" (all
            failures are reported through Streamlit rather than raised).
        """
        import time  # local import keeps the module's import surface unchanged

        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = self.session.post(
                    CLAUDE_API_URL,
                    headers=self._headers(),
                    json=payload,
                    verify=True,  # explicitly enable SSL verification
                    timeout=timeout,
                )

                # 529 is Anthropic's "overloaded" status: back off and retry.
                if response.status_code == 529:
                    st.warning(f"Server overload (529) on attempt {attempt + 1}/{max_retries}. Retrying with exponential backoff...")
                    if attempt < max_retries - 1:
                        time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, 4s
                        continue
                    else:
                        st.error("Server overload after all retries. Please try again later.")
                        return ""

                response.raise_for_status()  # raise for other bad status codes

                # Parse response: expected shape is {"content": [{"text": ...}]}.
                response_data = response.json()
                if "content" in response_data and len(response_data["content"]) > 0:
                    return response_data["content"][0]["text"]
                else:
                    st.error(f"Unexpected response format from {api_label}")
                    return ""

            except requests.exceptions.SSLError as ssl_err:
                st.error(f"SSL Error when calling {api_label}. Please check your SSL certificates and network connection. Error: {ssl_err}")
                return ""
            except requests.exceptions.Timeout:
                st.warning(f"Timeout on attempt {attempt + 1}/{max_retries}. Retrying...")
                if attempt == max_retries - 1:
                    st.error("Request timed out after all retries")
                    return ""
            except requests.exceptions.RequestException as e:
                st.error(f"Error calling {api_label}: {str(e)}")
                return ""
            except json.JSONDecodeError as json_err:
                st.error(f"Invalid JSON response from {api_label}: {json_err}")
                return ""

        return ""

    def call_claude_api(self, prompt, system_prompt, model="claude-sonnet-4-20250514", max_tokens=2000) -> str:
        """Call Claude with a plain-text prompt.

        Args:
            prompt: User message text.
            system_prompt: System instruction for the model.
            model: Claude model identifier.
            max_tokens: Response token cap.

        Returns:
            The assistant's reply text, or "" on any failure.
        """
        payload = {
            "model": model,
            "max_tokens": max_tokens,
            "temperature": 0.1,  # near-deterministic output for analysis tasks
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "system": system_prompt
        }
        return self._post_messages(payload, timeout=60, api_label="Claude API")

    def call_claude_vision_api(self, prompt, system_prompt, image_base64, model="claude-sonnet-4-20250514", max_tokens=2000) -> str:
        """Call Claude with a prompt plus one base64-encoded image.

        Args:
            prompt: User message text.
            system_prompt: System instruction for the model.
            image_base64: Base64 image payload; sent with media_type
                image/png — assumes the caller provides PNG data (TODO confirm).
            model: Claude model identifier.
            max_tokens: Response token cap.

        Returns:
            The assistant's reply text, or "" on any failure.
        """
        content = [
            {
                "type": "text",
                "text": prompt
            },
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": image_base64
                }
            }
        ]

        payload = {
            "model": model,
            "max_tokens": max_tokens,
            "temperature": 0,
            "messages": [
                {
                    "role": "user",
                    "content": content
                }
            ],
            "system": system_prompt
        }
        # Vision calls get a longer timeout (90s vs 60s) for image processing.
        return self._post_messages(payload, timeout=90, api_label="Claude Vision API")

    def call_claude_pdf_api(self, prompt, system_prompt, pdf_base64, model="claude-sonnet-4-20250514", max_tokens=4000) -> str:
        """Analyse a base64-encoded PDF by extracting its text and delegating
        to :meth:`call_claude_api`.

        Bug fix: the original version extracted ``text_content`` from the PDF
        but never passed it to the model (the original comment said "Use
        regular API with extracted text" while the code discarded it). The
        extracted text is now appended to the prompt.

        Args:
            prompt: Base prompt describing the task.
            system_prompt: System instruction for the model.
            pdf_base64: Base64-encoded PDF bytes.
            model: Claude model identifier.
            max_tokens: Response token cap.

        Returns:
            The assistant's reply text, or "" on any failure.
        """
        # For now we use text extraction; a future enhancement could use the
        # Converse API with citations for full visual PDF analysis.
        st.info("📄 PDF requirements detected. Using text-based processing for now.")
        st.info("💡 For full visual PDF analysis, consider using the Converse API with citations enabled.")

        try:
            import base64
            import io

            # PyPDF2 is optional; degrade gracefully when it is missing.
            try:
                from PyPDF2 import PdfReader
                pdf_reader_available = True
            except ImportError:
                pdf_reader_available = False
                st.warning("PyPDF2 not available. Using basic text processing for PDF.")

            if pdf_reader_available:
                # Decode base64 PDF and extract the text of every page.
                pdf_bytes = base64.b64decode(pdf_base64)
                pdf_stream = io.BytesIO(pdf_bytes)

                reader = PdfReader(pdf_stream)
                text_content = ""
                for page in reader.pages:
                    text_content += page.extract_text() + "\n"

                if not text_content.strip():
                    text_content = "PDF Requirements Document (text extraction limited)"

                # FIX: include the extracted text in the prompt — previously
                # it was computed and then silently dropped.
                augmented_prompt = f"{prompt}\n\nExtracted PDF text:\n{text_content}"
                return self.call_claude_api(augmented_prompt, system_prompt, model=model, max_tokens=max_tokens)
            else:
                # Fallback when PyPDF2 is not available: send the prompt as-is.
                return self.call_claude_api(prompt, system_prompt, model=model, max_tokens=max_tokens)

        except Exception as e:
            st.warning(f"PDF text extraction failed: {e}")
            st.warning("Falling back to basic text processing")

            # Fallback to basic text processing with the original prompt.
            return self.call_claude_api(prompt, system_prompt, model=model, max_tokens=max_tokens)
244
+
245
class ComplianceAnalysis:
    """Orchestrates the packaging-compliance pipeline.

    Pipeline stages (all LLM-backed via :class:`LLM`):
      1. extract structured requirements from a requirements document,
      2. verify each requirement against the packaging content,
      3. generate a final markdown compliance report.
    """

    def __init__(self):
        # Single LLM client shared by every analysis step.
        self.llm = LLM()

    def extract_structured_requirements(self, requirements_data) -> list[dict]:
        """
        Use Claude to extract structured requirements from the requirements document.

        Args:
            requirements_data: Either a string (for text files) or a dict (for PDF files) containing requirements.
                For dicts, keys used are 'text_content', 'type', and (when type == 'pdf') 'content' (base64 PDF).

        Returns:
            A list of dictionaries, each containing a requirement ID, description, and category
            (fields: id, description, category, source_reference). Returns [] on any error.
        """
        # Handle both text and PDF requirements
        if isinstance(requirements_data, str):
            # Text-based requirements
            requirements_text = requirements_data
            requirements_type = "text"
        elif isinstance(requirements_data, dict):
            # PDF-based requirements
            requirements_text = requirements_data.get('text_content', '')
            requirements_type = requirements_data.get('type', 'text')
            pdf_base64 = requirements_data.get('content', '') if requirements_type == 'pdf' else None
        else:
            st.error("Invalid requirements data format. Please upload a valid requirements document.")
            return []

        # Check if requirements text is empty or None
        if not requirements_text or not requirements_text.strip():
            st.error("Requirements text is empty. Please upload a valid requirements document.")
            return []

        system_prompt = """You are an expert requirements analyst. Extract clear, structured requirements from documents. You must always return valid JSON, even if no specific requirements are found."""

        extraction_prompt = f"""
        Extract all requirements from this document (not just allergen requirements):

        {requirements_text}

        For each requirement found, provide:
        1. Unique ID (REQ001, REQ002, etc.)
        2. Description (verbatim from the document)
        3. Category (Font Size, Allergen List, Formatting, Placement, Barcode, Organic, Promotional, etc.)
        4. Source reference (section/paragraph or line number)

        If no requirements are found, return an empty array: []

        Return as JSON array with fields: id, description, category, source_reference.

        Example:
        ```json
        [
            {{
                "id": "REQ001",
                "description": "IF the product is labeled as organic, THEN a certified organic seal must be visible",
                "category": "Organic",
                "source_reference": "Line 1"
            }},
            {{
                "id": "REQ002",
                "description": "IF there is a promotional offer mentioned, THEN include the offer expiry date",
                "category": "Promotional",
                "source_reference": "Line 2"
            }}
        ]
        ```

        IMPORTANT: Always return valid JSON. If you cannot extract any requirements, return an empty array: []
        """

        # Use appropriate API based on requirements type.
        # NOTE(review): pdf_base64 is only bound on the dict branch; this is
        # safe today because requirements_type == 'pdf' short-circuits first
        # on the string branch, but the pattern is fragile.
        if requirements_type == 'pdf' and pdf_base64:
            # Use PDF API for native PDF processing
            response = self.llm.call_claude_pdf_api(extraction_prompt, system_prompt, pdf_base64, model='claude-sonnet-4-20250514')
        else:
            # Use regular API for text processing (cheaper/faster haiku model)
            response = self.llm.call_claude_api(extraction_prompt, system_prompt, model='claude-3-5-haiku-20241022')

        # Extract JSON from the response
        try:
            # Find JSON content between triple backticks if present
            if "```json" in response and "```" in response.split("```json")[1]:
                json_content = response.split("```json")[1].split("```")[0].strip()
            elif "```" in response:
                # Try to find any code block
                json_content = response.split("```")[1].split("```")[0].strip()
            else:
                # Assume the entire response is JSON
                json_content = response

            # Clean the JSON content to handle control characters
            # Remove or replace invalid control characters except newlines and tabs
            json_content = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', json_content)
            # Replace newlines within strings with escaped newlines
            json_content = re.sub(r'(?<!\\)"(?:[^"\\]|\\.)*?(?<!\\)"', lambda m: m.group(0).replace('\n', '\\n'), json_content)

            requirements = json.loads(json_content)
            return requirements
        except Exception as e:
            st.error(f"Error parsing extracted requirements: {e}")
            st.error(f"Raw response: {response}")
            # Return empty array as fallback
            return []


    def verify_individual_requirement(self, requirement, markdown_table, image=None, barcode_data=None, metadata=None, requirements_data=None):
        """
        Use structured reasoning to verify if a specific requirement is met in the packaging text.

        Args:
            requirement: A dictionary containing requirement details (keys used: id, description, category)
            markdown_table: The markdown table extracted from the packaging PDF
            image: The image of the packaging document as base64 (optional; routes to the vision API)
            barcode_data: List of barcode objects with position data (optional; keys used: id, type, data, valid)
            metadata: Dictionary containing font, font size, and color metadata (optional)
            requirements_data: Original requirements data (text or PDF) for context (optional;
                NOTE(review): accepted but currently unused by this method)
        Returns:
            A dictionary with verification results including reasoning and compliance status.
            On failure, compliance_status is "ERROR" with confidence 0.
        """
        system_prompt = """You are a regulatory compliance expert. Provide detailed, objective compliance reports."""

        # Build the prompt for verification
        verification_prompt = f"""
        You are a regulatory compliance expert. Provide detailed, objective compliance reports.
        I need to verify if the following specific requirement is met in the packaging text:

        Requirement ID: {requirement['id']}
        Requirement Description: {requirement['description']}
        Requirement Category: {requirement['category']}

        Here is the packaging text to analyze:

        {markdown_table}
        """

        # Add barcode information if available
        if barcode_data:
            # Create minimal barcode summary for LLM (save tokens)
            barcode_summary = []
            for barcode in barcode_data:
                barcode_summary.append({
                    'id': barcode['id'],
                    'type': barcode['type'],
                    'data': barcode['data'],
                    'valid': barcode['valid']
                })

            verification_prompt += f"""

        Barcode Information Found:
        {json.dumps(barcode_summary, indent=2)}

        When analyzing barcode-related requirements, consider:
        - Barcode ID for evidence reference
        - Barcode type and validation status
        """

        # Add metadata information if available (skip if extraction errored)
        if metadata and not metadata.get('error'):
            # Create metadata summary for LLM (save tokens)
            metadata_summary = {
                'extraction_method': metadata.get('extraction_method', 'unknown'),
                'has_selectable_text': metadata.get('has_selectable_text', False),
                'pages_processed': metadata.get('pages_processed', 0),
                'dominant_font': metadata.get('fonts', {}),
                'dominant_font_size': metadata.get('font_sizes', {}),
                'dominant_text_color': metadata.get('text_colors', {})
            }

            verification_prompt += f"""

        Typography and Design Metadata:
        {json.dumps(metadata_summary, indent=2)}

        When analyzing typography and design requirements, consider:
        - Font types and their usage frequency
        - Font sizes and their distribution
        - Text colors and their application
        - Whether text is selectable or requires OCR
        """

        # Closing instructions: reasoning steps plus the required JSON schema.
        verification_prompt += f"""

        Verify this requirement using these steps:
        1. Break down into checkable criteria
        2. Search for evidence in packaging text (provide Text ID)
        3. For visual elements not in text, describe clearly (text_id = null)
        4. For barcode evidence, use Barcode ID (text_id = null)
        5. Provide specific examples/quotes
        6. Determine: COMPLIANT/NON-COMPLIANT/PARTIALLY COMPLIANT
        - Compliant: All applicable rules are fully met without any deviation.
        - Partially Compliant: Some rules are met, but minor issues/omissions that don't constitute a full failure but need attention.
        - Non-Compliant: One or more critical rules are violated or omitted, posing a regulatory, safety, or logistical risk.
        7. Explain reasoning

        For visual evidence, describe:
        - Location (e.g., "top right corner", "bottom section")
        - Visual characteristics (e.g., "large bold text", "red warning box")
        - Content description (e.g., "allergen warning in red box")

        If there is barcode evidence, include:
        - Barcode ID
        - Barcode type and validation status

        Return JSON with structure:
        ```json
        {{
            "requirement_id": "{requirement['id']}",
            "criteria": ["criterion 1", "criterion 2"],
            "evidence_found": [
                {{"text_id": <Text ID or null>, "evidence_text": "<description>", "barcode_id": "<Barcode ID ONLY if applicable>"}}
            ],
            "compliance_status": "COMPLIANT/NON-COMPLIANT/PARTIALLY COMPLIANT",
            "reasoning": "Detailed explanation",
            "confidence": 0.95
        }}
        ```
        """

        # Use vision API if image is provided, otherwise use regular API
        if image:
            response = self.llm.call_claude_vision_api(verification_prompt, system_prompt, image)
        else:
            response = self.llm.call_claude_api(verification_prompt, system_prompt)

        # Extract JSON from the response with enhanced error handling
        try:
            # Check if response is empty or None (LLM helpers return "" on failure)
            if not response or not response.strip():
                st.error("Empty response received from Claude API")
                return {
                    "requirement_id": requirement['id'],
                    "evidence_found": [],
                    "compliance_status": "ERROR",
                    "reasoning": "Empty response received from Claude API",
                    "confidence": 0
                }

            # Find JSON content between triple backticks if present
            if "```json" in response and "```" in response.split("```json")[1]:
                json_content = response.split("```json")[1].split("```")[0].strip()
            elif "```" in response:
                # Try to find any code block
                json_content = response.split("```")[1].split("```")[0].strip()
            else:
                # Assume the entire response is JSON
                json_content = response

            # Clean the JSON content to handle control characters
            # Remove or replace invalid control characters except newlines and tabs
            json_content = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', json_content)
            # Replace newlines within strings with escaped newlines
            json_content = re.sub(r'(?<!\\)"(?:[^"\\]|\\.)*?(?<!\\)"', lambda m: m.group(0).replace('\n', '\\n'), json_content)

            # Try to parse JSON with multiple fallback strategies
            verification_result = None

            # Strategy 1: Direct parsing
            try:
                verification_result = json.loads(json_content)
            except json.JSONDecodeError as e1:
                st.warning(f"Initial JSON parsing failed: {e1}")

                # Strategy 2: Try to extract JSON from malformed response
                try:
                    # Look for JSON-like structure (greedy outermost-brace match)
                    json_match = re.search(r'\{.*\}', json_content, re.DOTALL)
                    if json_match:
                        potential_json = json_match.group(0)
                        verification_result = json.loads(potential_json)
                        st.info("Successfully extracted JSON from malformed response")
                except json.JSONDecodeError as e2:
                    st.warning(f"JSON extraction failed: {e2}")

                    # Strategy 3: Create a minimal valid JSON structure
                    try:
                        # Try to extract key information from the response.
                        # NOTE(review): "COMPLIANT" is a substring of
                        # "NON-COMPLIANT", so the first branch wins whenever
                        # either word appears — ordering matters here.
                        compliance_status = "UNKNOWN"
                        if "COMPLIANT" in response.upper():
                            compliance_status = "COMPLIANT"
                        elif "NON-COMPLIANT" in response.upper():
                            compliance_status = "NON-COMPLIANT"
                        elif "PARTIALLY" in response.upper():
                            compliance_status = "PARTIALLY COMPLIANT"

                        verification_result = {
                            "requirement_id": requirement['id'],
                            "criteria": ["Unable to parse criteria"],
                            "evidence_found": [],
                            "compliance_status": compliance_status,
                            "reasoning": f"Response parsing failed. Raw response: {response[:200]}...",
                            "confidence": 0.1
                        }
                        st.warning("Created fallback JSON structure due to parsing errors")
                    except Exception as e3:
                        st.error(f"Fallback JSON creation failed: {e3}")
                        raise e3

            if verification_result:
                return verification_result
            else:
                raise Exception("All JSON parsing strategies failed")

        except Exception as e:
            st.error(f"Error parsing verification result: {e}")
            st.error(f"Raw response: {response}")
            # Return a failure result
            return {
                "requirement_id": requirement['id'],
                "evidence_found": [],
                "compliance_status": "ERROR",
                "reasoning": f"Failed to verify requirement due to parsing error: {str(e)}",
                "confidence": 0
            }



    def analyze_compliance(self, requirements_data, packaging_text, packaging_data, image=None, barcode_data=None, metadata=None, model="claude-sonnet-4-20250514"):
        """
        Analyze packaging compliance through multi-step process:
        1. Extract structured requirements
        2. Verify each requirement with structured reasoning
        3. Generate a final markdown compliance report

        Args:
            requirements_data: The requirements data (text string or PDF dict)
            packaging_text: Markdown table extracted from the packaging PDF
            packaging_data: Structured text with bounding boxes (passed through to the result)
            image: The image of the packaging document
            barcode_data: List of barcode objects with position data
            metadata: Dictionary containing font, font size, and color metadata
            model: The Claude model to use
                (NOTE(review): accepted but currently unused — the per-step
                calls below pin their own model names)

        Returns:
            A dictionary containing compliance analysis results, or an
            {"error": ...} dictionary when no requirements were extracted.
        """
        # Step 1: Extract structured requirements
        st.info("Extracting structured requirements...")
        requirements = self.extract_structured_requirements(requirements_data)

        if not requirements:
            st.warning("No requirements found in the document. Please check that your requirements file contains valid requirement statements.")
            return {"error": "No requirements found", "requirements": [], "verifications": []}

        st.success(f"Extracted {len(requirements)} requirements")

        # Step 2: Verify each requirement with structured reasoning
        st.info("Verifying requirements...")
        verifications = []

        for i, req in enumerate(requirements):
            st.text(f"Verifying requirement {i+1}/{len(requirements)}: {req['id']}")

            # Get verification result
            verification = self.verify_individual_requirement(req, packaging_text, image, barcode_data, metadata, requirements_data)
            verifications.append(verification)

        # Step 3: Generate final compliance report
        system_prompt = """You are a regulatory compliance expert. Provide detailed, objective compliance reports."""

        # Create minimal summary for LLM (save tokens)
        compliance_summary = []
        for verification in verifications:
            compliance_summary.append({
                'requirement_id': verification.get('requirement_id', 'Unknown'),
                'compliance_status': verification.get('compliance_status', 'UNKNOWN'),
                'confidence': verification.get('confidence', 0),
                'evidence_count': len(verification.get('evidence_found', []))
            })

        summary_prompt = f"""
        Based on the verification of {len(requirements)} requirements,
        please provide a final compliance summary report.

        Requirements Summary:
        {json.dumps([{'id': req['id'], 'description': req['description'], 'category': req['category']} for req in requirements], indent=2)}

        Compliance Results Summary:
        {json.dumps(compliance_summary, indent=2)}

        Format your response in the following template:

        ## 🎯 **Analysis Requirements**

        Summarize the overall compliance status with focus on:

        1. **Quantitative Metrics**: Count of fully compliant, partially compliant, and non-compliant requirements
        2. **Critical Issues**: Most urgent compliance gaps requiring immediate attention
        3. **Strategic Recommendations**: Actionable steps for the artwork designer to fix the compliance issues

        ---

        ## 📋 **Response Template**

        ### 🔍 **Executive Summary**
        Provide a single, clear statement of overall compliance status
        *Example: "Organization achieved 70% compliance (14/20 requirements); moderate risk profile with 3 critical gaps identified."*

        ---

        ### 📈 **Compliance Statistics**

        | **Metric** | **Count** | **Percentage** |
        |------------|-----------|----------------|
        | **Total Requirements** | `[total]` | `100%` |
        | ✅ **Fully Compliant** | `[count]` | `[%]` |
        | ⚠️ **Partially Compliant** | `[count]` | `[%]` |
        | ❌ **Non-Compliant** | `[count]` | `[%]` |

        ---

        ### 🚨 **Priority Findings**

        List 3-5 highest-severity issues in order of criticality:

        1. **[REQ-ID]** - [Brief description of critical issue]
        2. **[REQ-ID]** - [Brief description of high-priority gap]
        3. **[REQ-ID]** - [Brief description of moderate-priority concern]

        ---

        ### 💡 **Targeted Recommendations**

        For each Priority Finding, provide specific corrective actions:

        | **Finding** | **Recommended Action** | **Priority** |
        |-------------|------------------------|--------------|
        | **[REQ-ID]** | [Specific artwork designer action] | 🔴 **Critical** |
        | **[REQ-ID]** | [Specific artwork designer action] | 🟡 **High** |
        | **[REQ-ID]** | [Specific artwork designer action] | 🟢 **Medium** |

        ---

        ### 📝 **Detailed Assessment Results**

        *[Provide comprehensive breakdown of each requirement with status and supporting details]*

        ---

        ### 📊 **Supporting Evidence**

        *[Include relevant data, metrics, or documentation that supports the compliance assessment]*


        """

        # Get the final compliance report (cheaper haiku model suffices here)
        compliance_report = self.llm.call_claude_api(summary_prompt, system_prompt, model='claude-3-5-haiku-20241022')

        # Compile all results
        result = {
            "requirements": requirements,
            "verifications": verifications,
            "compliance_report": compliance_report,
            "packaging_data": packaging_data,
            "barcode_data": barcode_data,
            "metadata": metadata
        }

        return result
src/extract_text/__pycache__/extract_meta_data.cpython-313.pyc ADDED
Binary file (16.2 kB). View file
 
src/extract_text/__pycache__/google_document_api.cpython-313.pyc ADDED
Binary file (13.1 kB). View file
 
src/extract_text/__pycache__/google_document_api.cpython-313.pyc.1480615374128 ADDED
Binary file (7.92 kB). View file
 
src/extract_text/__pycache__/ingest.cpython-313.pyc ADDED
Binary file (3.51 kB). View file
 
src/extract_text/extract_meta_data.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import pytesseract
3
+ from PIL import Image
4
+ import numpy as np
5
+ import cv2
6
+ from collections import defaultdict, Counter
7
+ import io
8
+ import re
9
+ from typing import Dict, List, Tuple, Optional, Union
10
+
11
+
12
class PDFArtworkMetadataExtractor:
    """
    Extract metadata (font, font size, text color) from artwork PDFs.

    Uses PyMuPDF span data when the PDF contains selectable text, and falls
    back to Tesseract OCR (size/colour estimation from bounding boxes) for
    image-only PDFs.
    """

    # Zoom applied when rasterising pages for OCR (2x for better recognition).
    # Font-size estimation must divide by this factor so estimates are in
    # page space rather than rendered-pixel space.
    OCR_RENDER_ZOOM = 2.0

    def __init__(self, tesseract_path: Optional[str] = None):
        """
        Initialize the metadata extractor.

        Args:
            tesseract_path: Path to tesseract executable (if not in PATH).
        """
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

        self.pdf_doc = None  # fitz.Document while a PDF is open, else None
        self.metadata = {
            'fonts': {},             # font name -> character count
            'font_sizes': {},        # size in pt (1 decimal) -> character count
            'text_colors': {},       # (r, g, b) -> character count
            'has_selectable_text': False,
            'pages_processed': 0,
            'extraction_method': None  # 'selectable_text' or 'ocr'
        }

    def load_pdf(self, pdf_path: str) -> bool:
        """
        Load a PDF document.

        Args:
            pdf_path: Path to PDF file.

        Returns:
            bool: True if successful, False otherwise.
        """
        try:
            self.pdf_doc = fitz.open(pdf_path)
            return True
        except Exception as e:
            print(f"Error loading PDF: {e}")
            return False

    def _extract_selectable_text_metadata(self) -> Dict:
        """
        Extract metadata from selectable text using PyMuPDF span data.

        Returns:
            Dict with 'fonts', 'font_sizes' and 'text_colors' frequency maps,
            each weighted by the number of characters carrying that value.
        """
        fonts = defaultdict(int)
        font_sizes = defaultdict(int)
        colors = defaultdict(int)

        for page_num in range(len(self.pdf_doc)):
            page = self.pdf_doc[page_num]

            # "dict" output exposes per-span font/size/colour information.
            text_dict = page.get_text("dict")

            for block in text_dict["blocks"]:
                if "lines" not in block:
                    continue  # image blocks carry no text spans
                for line in block["lines"]:
                    for span in line["spans"]:
                        font_name = span.get("font", "Unknown")
                        font_size = span.get("size", 0)

                        # PyMuPDF encodes span colour as a packed 0xRRGGBB int.
                        color = span.get("color", 0)
                        if isinstance(color, int):
                            color_rgb = ((color >> 16) & 255,
                                         (color >> 8) & 255,
                                         color & 255)
                        else:
                            color_rgb = (0, 0, 0)  # default to black

                        text_content = span.get("text", "").strip()
                        if text_content:
                            weight = len(text_content)
                            fonts[font_name] += weight
                            font_sizes[round(font_size, 1)] += weight
                            colors[color_rgb] += weight

        return {
            'fonts': dict(fonts),
            'font_sizes': dict(font_sizes),
            'text_colors': dict(colors)
        }

    def _preprocess_image_for_ocr(self, image: np.ndarray) -> np.ndarray:
        """
        Preprocess a rendered page image for better OCR results.

        Args:
            image: Input image as numpy array (RGB or grayscale).

        Returns:
            np.ndarray: Denoised, adaptively thresholded grayscale image.
        """
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        else:
            gray = image

        denoised = cv2.fastNlMeansDenoising(gray)

        # Adaptive thresholding copes with uneven artwork backgrounds.
        return cv2.adaptiveThreshold(
            denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 11, 2
        )

    def _estimate_font_size_from_ocr(self, image: np.ndarray, text_data: Dict,
                                     zoom: float = OCR_RENDER_ZOOM) -> Dict[float, int]:
        """
        Estimate font sizes from OCR bounding boxes.

        Args:
            image: Rendered page image (unused; kept for interface stability).
            text_data: OCR data dict from pytesseract.image_to_data.
            zoom: Rasterisation zoom used to render the page. Box heights are
                divided by this so estimates are in page space — the original
                code ignored the 2x render zoom and over-estimated sizes.

        Returns:
            Dict mapping estimated size in pt (1 decimal) to character count.
        """
        font_sizes = defaultdict(int)

        for i, text in enumerate(text_data['text']):
            stripped = text.strip()
            if not stripped:
                continue
            # Undo the render zoom, then apply a rough px->pt factor (0.75),
            # clamped to a plausible 8-72 pt range. Still a heuristic.
            height = text_data['height'][i] / zoom
            estimated_size = max(8.0, min(72.0, height * 0.75))
            font_sizes[round(estimated_size, 1)] += len(stripped)

        return dict(font_sizes)

    def _extract_colors_from_image(self, image: np.ndarray, text_data: Dict) -> Dict[Tuple[int, int, int], int]:
        """
        Extract the dominant colour of each OCR text region.

        Args:
            image: Original (unthresholded) page image.
            text_data: OCR data dict from pytesseract.image_to_data.

        Returns:
            Dict mapping (r, g, b) to the number of characters drawn in it.
        """
        colors = defaultdict(int)

        for i, text in enumerate(text_data['text']):
            stripped = text.strip()
            if not stripped:
                continue

            x, y = text_data['left'][i], text_data['top'][i]
            w, h = text_data['width'][i], text_data['height'][i]

            if not (0 <= y < image.shape[0] and 0 <= x < image.shape[1]):
                continue
            text_region = image[y:y + h, x:x + w]
            if text_region.size == 0:
                continue

            if len(text_region.shape) == 3:
                pixels = text_region.reshape(-1, 3)
                unique_colors, counts = np.unique(pixels, axis=0, return_counts=True)

                # Keep only plausible foreground colours (mean channel < 200
                # filters light backgrounds), then credit the single most
                # frequent one as the region's text colour. (The original
                # credited every dark unique pixel colour, which massively
                # over-counted anti-aliased edge colours.)
                best_color, best_count = None, 0
                for color, count in zip(unique_colors, counts):
                    if np.mean(color) < 200 and count > best_count:
                        best_color = tuple(int(c) for c in color)
                        best_count = int(count)
                if best_color is not None:
                    colors[best_color] += len(stripped)
            else:
                # Grayscale region: treat dark regions as black text.
                if np.mean(text_region) < 128:
                    colors[(0, 0, 0)] += len(stripped)

        return dict(colors)

    def _extract_ocr_metadata(self) -> Dict:
        """
        Extract metadata using OCR for non-selectable text.

        Returns:
            Dict with estimated 'fonts', 'font_sizes' and 'text_colors'.
        """
        all_font_sizes = defaultdict(int)
        all_colors = defaultdict(int)

        for page_num in range(len(self.pdf_doc)):
            page = self.pdf_doc[page_num]

            # Render at OCR_RENDER_ZOOM for better recognition quality.
            pix = page.get_pixmap(matrix=fitz.Matrix(self.OCR_RENDER_ZOOM, self.OCR_RENDER_ZOOM))
            img_data = pix.tobytes("ppm")
            image = Image.open(io.BytesIO(img_data))
            image_np = np.array(image)

            processed_img = self._preprocess_image_for_ocr(image_np)

            ocr_data = pytesseract.image_to_data(processed_img, output_type=pytesseract.Output.DICT)

            page_font_sizes = self._estimate_font_size_from_ocr(
                processed_img, ocr_data, zoom=self.OCR_RENDER_ZOOM
            )
            for size, count in page_font_sizes.items():
                all_font_sizes[size] += count

            # Colours come from the original (unthresholded) render.
            page_colors = self._extract_colors_from_image(image_np, ocr_data)
            for color, count in page_colors.items():
                all_colors[color] += count

        # OCR cannot determine actual fonts; provide rough common estimates.
        total = sum(all_font_sizes.values())
        estimated_fonts = {
            'Arial-like': total * 0.4,
            'Times-like': total * 0.3,
            'Helvetica-like': total * 0.3
        }

        return {
            'fonts': estimated_fonts,
            'font_sizes': dict(all_font_sizes),
            'text_colors': dict(all_colors)
        }

    def _has_selectable_text(self) -> bool:
        """
        Check whether the PDF has selectable text (first 3 pages sampled).

        Returns:
            bool: True if any sampled page yields non-empty text.
        """
        for page_num in range(min(3, len(self.pdf_doc))):
            if self.pdf_doc[page_num].get_text().strip():
                return True
        return False

    def extract_metadata(self, pdf_path: str) -> Dict:
        """
        Extract metadata from a PDF artwork file.

        Args:
            pdf_path: Path to PDF file.

        Returns:
            Dict: Complete metadata dictionary (frequency maps sorted most
            common first), or {'error': ...} on failure.
        """
        if not self.load_pdf(pdf_path):
            return {'error': 'Failed to load PDF'}

        try:
            self.metadata['pages_processed'] = len(self.pdf_doc)
            has_selectable = self._has_selectable_text()
            self.metadata['has_selectable_text'] = has_selectable

            if has_selectable:
                self.metadata['extraction_method'] = 'selectable_text'
                extracted_data = self._extract_selectable_text_metadata()
            else:
                self.metadata['extraction_method'] = 'ocr'
                extracted_data = self._extract_ocr_metadata()

            self.metadata.update(extracted_data)

            # Sort each frequency map so the most common entry comes first.
            for key in ('fonts', 'font_sizes', 'text_colors'):
                self.metadata[key] = dict(sorted(
                    self.metadata[key].items(),
                    key=lambda item: item[1],
                    reverse=True
                ))

            return self.metadata

        except Exception as e:
            return {'error': f'Failed to extract metadata: {e}'}

        finally:
            if self.pdf_doc:
                self.pdf_doc.close()
                # Drop the closed handle so later calls cannot touch it.
                self.pdf_doc = None

    def get_dominant_font(self) -> Optional[str]:
        """Get the most frequently used font, or None if nothing extracted."""
        if self.metadata['fonts']:
            return max(self.metadata['fonts'], key=self.metadata['fonts'].get)
        return None

    def get_dominant_font_size(self) -> Optional[float]:
        """Get the most frequently used font size, or None if unavailable."""
        if self.metadata['font_sizes']:
            return max(self.metadata['font_sizes'], key=self.metadata['font_sizes'].get)
        return None

    def get_dominant_color(self) -> Optional[Tuple[int, int, int]]:
        """Get the most frequently used text color, or None if unavailable."""
        if self.metadata['text_colors']:
            return max(self.metadata['text_colors'], key=self.metadata['text_colors'].get)
        return None

    def print_summary(self):
        """Print a human-readable summary of the extracted metadata."""
        print("PDF Artwork Metadata Summary")
        print("=" * 40)
        print(f"Pages processed: {self.metadata['pages_processed']}")
        print(f"Has selectable text: {self.metadata['has_selectable_text']}")
        print(f"Extraction method: {self.metadata['extraction_method']}")
        print()

        print("Top 5 Fonts:")
        for i, (font, count) in enumerate(list(self.metadata['fonts'].items())[:5]):
            print(f"  {i+1}. {font}: {count} characters")
        print()

        print("Top 5 Font Sizes:")
        for i, (size, count) in enumerate(list(self.metadata['font_sizes'].items())[:5]):
            print(f"  {i+1}. {size}pt: {count} characters")
        print()

        print("Top 5 Text Colors (RGB):")
        for i, (color, count) in enumerate(list(self.metadata['text_colors'].items())[:5]):
            print(f"  {i+1}. {color}: {count} characters")
src/extract_text/google_document_api.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Optional, List, Dict, Any
3
+ from google.api_core.client_options import ClientOptions
4
+ from google.cloud import documentai # type: ignore
5
+ from PIL import Image, ImageChops
6
+ from io import BytesIO
7
+ import fitz # PyMuPDF
8
+ import base64
9
+
10
class GoogleDocumentAPI:
    """Thin wrapper around the Google Document AI synchronous process API."""

    def __init__(self, credentials_path: str):
        # The Document AI client picks its credentials up from this env var.
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

        # TODO(review): project/location/processor are hard-coded; move them
        # to configuration so other environments can use this class.
        self.project_id = "649829115993"
        self.location = "us"  # Format is "us" or "eu"
        self.processor_id = "7f9fd758484d83fe"  # Only use this
        self.mime_type = "application/pdf"  # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types

    def process_document(self, file_path: str, field_mask: Optional[str] = None, processor_version_id: Optional[str] = None) -> documentai.Document:
        """
        Run the configured processor on a local file and return the Document.

        Args:
            file_path: Path to the PDF to process.
            field_mask: Optional field mask restricting the response payload.
            processor_version_id: Optional explicit processor version.

        Returns:
            documentai.Document: The processed document.

        Note: only page 1 is processed (see IndividualPageSelector below).
        """
        opts = ClientOptions(api_endpoint=f"{self.location}-documentai.googleapis.com")
        client = documentai.DocumentProcessorServiceClient(client_options=opts)

        if processor_version_id:
            name = client.processor_version_path(
                self.project_id, self.location, self.processor_id, processor_version_id
            )
        else:
            name = client.processor_path(self.project_id, self.location, self.processor_id)

        with open(file_path, "rb") as image:
            image_content = image.read()

        raw_document = documentai.RawDocument(content=image_content, mime_type=self.mime_type)

        # Restrict processing to the first page only.
        process_options = documentai.ProcessOptions(
            individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
                pages=[1]
            )
        )

        request = documentai.ProcessRequest(
            name=name,
            raw_document=raw_document,
            field_mask=field_mask,
            process_options=process_options,
        )

        result = client.process_document(request=request)
        return result.document

    def get_document_text(self, document: documentai.Document, page_number: int = 0) -> str:
        """Return the text of one page.

        Note: document.pages is 0-indexed; a request for page 1 lands in
        document.pages[0].
        """
        return document.pages[page_number].text

    @staticmethod
    def _get_style_info(text_anchor: documentai.Document.TextAnchor, document: documentai.Document) -> str:
        """Summarise the text styles overlapping a text anchor.

        Returns "N/A" when the document carries no style data, "default"
        when no style overlaps, otherwise a comma-joined style summary.
        """
        if not hasattr(document, 'text_styles') or not document.text_styles:
            return "N/A"

        styles = []
        # A text anchor can have multiple non-contiguous segments.
        for para_segment in text_anchor.text_segments:
            para_start = int(para_segment.start_index)
            para_end = int(para_segment.end_index)

            for style in document.text_styles:
                for style_segment in style.text_anchor.text_segments:
                    style_start = int(style_segment.start_index)
                    style_end = int(style_segment.end_index)

                    # Half-open interval overlap test between paragraph and style.
                    if max(para_start, style_start) < min(para_end, style_end):
                        style_str_parts = []
                        if style.font_size and style.font_size.size > 0:
                            unit = style.font_size.unit if style.font_size.unit else 'pt'
                            style_str_parts.append(f"font size: {round(style.font_size.size)}{unit}")
                        if style.font_weight and style.font_weight.lower() != 'normal':
                            style_str_parts.append(f"font weight: {style.font_weight}")
                        if style.text_style and style.text_style.lower() != 'normal':
                            style_str_parts.append(f"text style: {style.text_style}")
                        if style.font_family:
                            style_str_parts.append(f'font family: {style.font_family}')

                        if style_str_parts:
                            styles.append(" ".join(style_str_parts))

        if styles:
            # dict.fromkeys preserves order while deduplicating.
            unique_styles = list(dict.fromkeys(styles))
            return ", ".join(unique_styles)

        return "default"

    @staticmethod
    def _get_text(text_anchor: documentai.Document.TextAnchor, text: str) -> str:
        """Concatenate the full-text slices referenced by a text anchor."""
        if not text_anchor.text_segments:
            return ""
        return "".join(
            text[int(segment.start_index) : int(segment.end_index)]
            for segment in text_anchor.text_segments
        )

    def extract_text_with_bounding_boxes(self, document: documentai.Document) -> List[Dict[str, Any]]:
        """
        Extracts text and bounding box for each paragraph in the document.

        Args:
            document: The processed documentai.Document object.

        Returns:
            A list of dictionaries, where each dictionary contains:
            - 'page_number': The page number (1-based).
            - 'text': The text of the paragraph.
            - 'bounding_box': A list of normalized vertices for the bounding box.
            - 'style': Style information for the text.
            - 'height': The height of the text block in millimeters (mm).
        """
        all_paragraphs = []
        full_text = document.text
        pt_to_mm = 0.3528  # Conversion factor from points to millimeters

        for page in document.pages:
            # Page height in points scales normalized y-ratios to real sizes.
            page_pts = page.dimension.height

            for paragraph in page.paragraphs:
                p_text = self._get_text(paragraph.layout.text_anchor, full_text)
                style_info = self._get_style_info(paragraph.layout.text_anchor, document)

                vertices = [
                    {"x": vertex.x, "y": vertex.y}
                    for vertex in paragraph.layout.bounding_poly.normalized_vertices
                ]

                # Height = normalized y-extent * page height (pt) -> mm.
                y_coords = [vertex.y for vertex in paragraph.layout.bounding_poly.normalized_vertices]
                height_ratio = max(y_coords) - min(y_coords)
                height_mm = height_ratio * page_pts * pt_to_mm

                all_paragraphs.append({
                    "page_number": page.page_number,
                    "text": p_text.strip(),
                    "bounding_box": vertices,
                    "style": style_info,
                    "height": round(height_mm, 2)
                })
        return all_paragraphs

    def extract_text_with_markdown_table(self, document: documentai.Document) -> str:
        """Process bounding-box data into a markdown table (see _create_markdown_table)."""
        data = self.extract_text_with_bounding_boxes(document)
        return self._create_markdown_table(data)

    def _quantize_coord(self, val, grid_size=1000) -> int:
        """Converts a float (0-1) to an integer on a grid."""
        return int(val * grid_size)

    def _create_markdown_table(self, data) -> str:
        """Render paragraph records as a markdown table.

        BUGFIX: the original used doubled backslashes ('\\n', '\\\\|'), so
        rows were joined by the literal characters backslash-n instead of
        real newlines and pipes were mis-escaped.
        """
        table = "| Text ID | X | Y | Text Height (mm) | Style | Text |\n"
        table += "|----|-----|-----|--------|-------|-------------------------------------------------------------------------|\n"
        for i, item in enumerate(data):
            top_left = item['bounding_box'][0]
            x = self._quantize_coord(top_left['x'])
            y = self._quantize_coord(top_left['y'])
            height = round(item.get('height', 0), 2)
            style = item.get('style', 'N/A')
            # Newlines would break a table row; '|' must be escaped as '\|'.
            text = item['text'].replace('\n', ' ').replace('|', '\\|').strip()
            table += f"| {i+1} | {x} | {y} | {height} | {style} | {text} |\n"
        return table

    def get_bounding_boxes(self, document: documentai.Document, page_number: int = 0) -> list[documentai.BoundingPoly]:
        """
        Extracts bounding boxes for tokens on a specific page.
        """
        page = document.pages[page_number]
        return [token.layout.bounding_poly for token in page.tokens]

    def extract_text_heights_mm(self, document: documentai.Document) -> List[tuple]:
        """
        Extracts the height of each line of text from a Google Document AI parsed document
        and returns a list of heights in millimeters (mm).

        Parameters:
            document (google.cloud.documentai.Document): Parsed Document AI response object

        Returns:
            List of tuples: [(page_num, line_text, height_mm), ...]
        """
        heights = []
        pt_to_mm = 0.3528

        for page_num, page in enumerate(document.pages, start=1):
            page_height_pt = page.dimension.height  # e.g., 792 for US Letter

            for line in page.lines:
                layout = line.layout
                vertices = layout.bounding_poly.normalized_vertices

                y_coords = [v.y for v in vertices]
                if not y_coords:
                    continue

                height_ratio = max(y_coords) - min(y_coords)
                height_mm = height_ratio * page_height_pt * pt_to_mm

                # Guard against anchors with no segments (the original
                # indexed [0] unconditionally and could raise IndexError).
                if not layout.text_anchor.text_segments:
                    continue
                text_segment = layout.text_anchor.text_segments[0]
                start = int(text_segment.start_index)
                end = int(text_segment.end_index)
                line_text = document.text[start:end].strip()

                heights.append((page_num, line_text, round(height_mm, 2)))

        return heights
222
+
223
+
224
+
src/extract_text/ingest.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ from io import BytesIO
3
+
4
class RequirementsIngest:
    """Loads a requirements document (TXT or PDF) from a file-like object."""

    def __init__(self):
        pass

    def _extract_pdf_text(self, pdf_content: bytes, filename: str) -> str:
        """Best-effort text extraction from PDF bytes.

        Uses PyPDF2 when available; otherwise (or on failure) returns a
        descriptive placeholder. The placeholder now includes the real
        filename — the original hard-coded the literal string "(unknown)".
        """
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            # PyPDF2 not available, use basic description.
            return f"PDF Requirements Document: {filename} (PyPDF2 not available for text extraction)"

        try:
            reader = PdfReader(BytesIO(pdf_content))
            text_content = ""
            for page in reader.pages:
                text_content += page.extract_text() + "\n"

            if not text_content.strip():
                return f"PDF Requirements Document: {filename} (no text content found)"
            # Limit text content for display.
            if len(text_content) > 1000:
                return text_content[:1000] + "..."
            return text_content
        except Exception as e:
            return f"PDF Requirements Document: {filename} (text extraction failed: {str(e)})"

    def ingest_requirements_document(self, file_obj) -> dict:
        """
        Ingest a requirements document from a file-like object.
        Supports both TXT and PDF files.

        Returns:
            dict: {
                'type': 'text' or 'pdf',
                'content': str (for text) or base64 string (for PDF),
                'filename': str,
                'text_content': str (extracted text for PDFs, same as content for TXT),
                'file_size': int (bytes)
            }

        Raises:
            ValueError: if the file object cannot be read.
        """
        try:
            filename = getattr(file_obj, 'name', 'unknown')
            file_extension = filename.lower().split('.')[-1] if '.' in filename else ''

            if file_extension == 'pdf':
                # Read the PDF once; reuse the bytes for both the base64
                # payload and text extraction (original read the file twice).
                file_obj.seek(0)
                pdf_content = file_obj.read()

                # Base64 so the PDF can be passed to Claude directly.
                pdf_base64 = base64.b64encode(pdf_content).decode('utf-8')

                # Extracted text is kept for backward compatibility; the
                # primary content is the PDF itself.
                text_content = self._extract_pdf_text(pdf_content, filename)

                return {
                    'type': 'pdf',
                    'content': pdf_base64,
                    'filename': filename,
                    'text_content': text_content,
                    'file_size': len(pdf_content)
                }

            # Handle text file (default behavior).
            file_obj.seek(0)
            text = file_obj.read()
            if isinstance(text, bytes):
                text = text.decode("utf-8", errors="replace")

            return {
                'type': 'text',
                'content': text,
                'filename': filename,
                'text_content': text,
                'file_size': len(text.encode('utf-8'))
            }

        except Exception as e:
            raise ValueError(f"Error reading requirements document: {e}")
91
+
92
+
src/extract_text/photon-services-f0d3ec1417d0.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "service_account",
3
+ "project_id": "photon-services",
4
+ "private_key_id": "f0d3ec1417d0afe1a21079a88350de615829fb38",
5
+ "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDGUlwi7owC2jS0\n9miy5mDi9Q84/8arKMkG8n2Zok7lfFz9cFf76G/ai1eIAvQ9u6OV2ddt05lZMX8S\n+q5PSFlmeOCXSHcnufoTsWY5FKTXWzWd4dZ6lMsCOq7kWB+tHEhlftxMR1egI7sn\nA3z32cbydPewInvw6QMMLaFdtACS8p09QnRZSdYnGX5FNJr9Hq+NBa5qRHqA0y8g\n6x5lo/Ybku3bKCNAu4NWOErsKZ4Z0yEZzggad7nojx1oA9wmIVaTbrJ6OY2kPOMN\n0mBQJBOdRaw5fIHiDYH18tnR0UzVVEnv2s1LADcSpe144nDbIlLdD3DsZ0H9j91J\n7b+EnbaJAgMBAAECggEAHXRe/csrHUNWP6g3LZbcveiCnccTNRmGHdOHBvnduOSr\nFPMKBj5j2nQGiItTxhVnutpTThr2tBIPWvzDRcArkvYR+TYIiGxtMV6QHZsszlVc\nFbpUdflCW27mycAy2C2SrQxV4LhZ0c1svuMcPN1p2Fm57b15ZfLdgoIGbNnOmgRO\nmOjJxXnjbPq4pFnZYVB2GxV7t3O8kzTG8msWFeIuOfrs6UJpXAS91BQXfLmnaxv5\nP56EaNGyamQgHVnOrtoLoTTUFrfNUFCl2Ggrs80FfS0ZJaIWqrItDLI9ah9MgfeL\nTwrcgjWFodX0BRu7Er2RX5Bo/vhhIVVZeOIHxzKWFwKBgQDk0+QCqChmMAOvchlX\nWb6XADW8qyYYbEPSVO+/IJi0teqIDGW/d1F0QrDdZc8dYlmaUqCt5z1NT8PdXSXd\nTifDRXLbHaKlFS3DQF+ComgC+ey9cUjZ0nMiCqzYKUftkmM2xWWJcLfEXPuWSZiy\n//Yqctd1ilQjk5pMyJFaT5k0MwKBgQDd3x8DwqEyWHk/nT4RQSVGp4S9+ZLegu+K\nefLPpCQevc0klvQVDospob181jZqBnWPDBd7fPyBc3+HmD/zzmU2YHlyWg3n9scb\nq/5WOssxjGkjhb8OftwsUesYLPFm6HcVfb+kiHJm+FKk2Yb935L90S3oOd0ljIuk\ng6LJF40OUwKBgE53XmOO2DOaWVkrLgdnDdTnzIWCxtBvJ56TY5bNja/CBcdbQPSz\n7KmKSO3SgIAZ/pHNra2Ucs/0/zwEOfy2VSo/wU/jzKcBKS0gAOBh4nrKyuR3WTzg\nTnyo3nZNSY3subrJW7USguGB5P+3Ava2kOojcUCsC4gbkDiuOjGWw/lDAoGBAIiG\nTihbMCOxq1JIqLOnWY+jbxwTIZvICCw2pAG/J/a+pif4t1Lpsxo4C0hw6+TL+rS+\nJQj4vMvPTU8bkWatvzv5m2GRJnNxN83ARO28meHwW5XfK9R4nXSsJ7SlmxnOu9A+\no5lT2MmhzgDgVZ+MXn/Ooqf+SyVa2WavFZEV69c/AoGACpBkRiXMscE1FISCy+lr\nDTIvGtqsMMadN7N+2ceQB+Yr/slE7FaCHblPWo2VnPosazis2340XW5LUhRYcATn\nuhwwFLGvC2IXSAq4uAyHSSiHVtwDjKWcJakkMnKlFuK1a5AI/2vMLkb3wKqyxKxC\nvQ0KZDSe4YO4nJk983CUL4g=\n-----END PRIVATE KEY-----\n",
6
+ "client_email": "jake-document-ai-test@photon-services.iam.gserviceaccount.com",
7
+ "client_id": "105944418590442697805",
8
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
9
+ "token_uri": "https://oauth2.googleapis.com/token",
10
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
11
+ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/jake-document-ai-test%40photon-services.iam.gserviceaccount.com",
12
+ "universe_domain": "googleapis.com"
13
+ }
src/utils/__pycache__/barcode.cpython-313.pyc ADDED
Binary file (4.48 kB). View file
 
src/utils/__pycache__/image_utils.cpython-313.pyc ADDED
Binary file (9.62 kB). View file
 
src/utils/barcode.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ from PIL import Image
4
+ import zxingcpp
5
+ import barcodenumber
6
+
7
class Barcode:
    """Barcode scanning (zxing-cpp) plus check-digit validation (barcodenumber)."""

    def __init__(self):
        # Maps zxing symbology names to barcodenumber code names.
        self._SYM_ALIAS = {
            'EAN13': 'ean13',
            'EAN8': 'ean8',
            'UPCA': 'upc',
            'UPC-A': 'upc',
        }

    def validate_barcode(self, data: str, sym: str) -> bool:
        """Validate the payload `data` for symbology `sym`.

        Unknown symbologies with an all-digit payload are probed against all
        known numeric formats. Returns False for empty data, unrecognised
        formats, or a failed check-digit test.
        """
        # Empty strings are always invalid.
        if not data:
            return False

        # For unknown symbology, try all known numeric formats first.
        if sym.upper() not in self._SYM_ALIAS:
            if data.isdigit():
                for known_format in ('ean13', 'ean8', 'upc'):
                    try:
                        if barcodenumber.check_code(known_format, data):
                            return True
                    except (ValueError, KeyError):
                        continue
            # No known format matched (or non-numeric payload).
            return False

        # BUGFIX: look the alias up under the normalised (upper-case) name.
        # The original passed `sym` verbatim, so e.g. 'upc-a' missed the
        # alias table, fell through as 'upc-a', and always failed.
        code = self._SYM_ALIAS.get(sym.upper(), sym.lower())
        try:
            return barcodenumber.check_code(code, data)
        except (ValueError, KeyError):
            return False

    def scan_and_validate(self, image, show_image: bool = False):
        """Scan an image for barcodes and validate each one found.

        Args:
            image: numpy BGR array or PIL image.
            show_image: Unused; kept for interface compatibility.

        Returns:
            list[dict]: one record per detected barcode with id, type, data,
            validity flag, and pixel-space position (corners + bbox).
        """
        # 1) normalize to OpenCV BGR numpy array.
        if isinstance(image, np.ndarray):
            cv_img = image.copy()
        else:
            # assume PIL
            cv_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

        # 2) zxing consumes PIL, so convert back from the BGR array.
        pil_for_scan = Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB))
        barcodes = zxingcpp.read_barcodes(pil_for_scan)

        results = []
        for i, barcode in enumerate(barcodes):
            pos = barcode.position
            if pos:
                pts = [pos.top_left, pos.top_right, pos.bottom_right, pos.bottom_left]
                xs = [p.x for p in pts]
                ys = [p.y for p in pts]
                x, y = int(min(xs)), int(min(ys))
                w, h = int(max(xs) - x), int(max(ys) - y)
            else:
                # No position reported: fall back to a nominal box at origin.
                x, y, w, h = 0, 0, 100, 50

            raw = barcode.text
            sym = str(barcode.format)
            ok = self.validate_barcode(raw, sym)

            results.append({
                'id': f'BARCODE_{i+1:03d}',
                'type': sym,
                'data': raw,
                'valid': ok,
                'position': {
                    'x': x,
                    'y': y,
                    'width': w,
                    'height': h,
                    'top_left': {'x': x, 'y': y},
                    'top_right': {'x': x + w, 'y': y},
                    'bottom_right': {'x': x + w, 'y': y + h},
                    'bottom_left': {'x': x, 'y': y + h}
                }
            })

        return results

    def draw_box(self, img, x, y, w, h, sym, raw, ok):
        """Draw a labelled rectangle on `img`: green if valid, red if not."""
        color = (0, 255, 0) if ok else (0, 0, 255)
        cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
        cv2.putText(img, f"{sym}:{raw}", (x, y - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
        return img
src/utils/image_utils.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ from io import BytesIO
3
+ from PIL import Image, ImageChops
4
+ from PIL import ImageDraw
5
+ import math
6
+
7
class ImageUtils:
    """Static helpers for cropping, resizing and base64-encoding images.

    All methods are stateless ``@staticmethod``s; the class exists only as a
    namespace. Every public method is best-effort: failures are printed and a
    safe fallback value is returned rather than raising.
    """

    def __init__(self):
        # Stateless class; kept so existing ``ImageUtils()`` call sites work.
        pass

    @staticmethod
    def _trim(im: Image.Image) -> Image.Image:
        """Crop *im* to its content by removing a uniform border.

        The border colour is sampled from the top-left pixel. The difference
        image is amplified (scale 2.0, offset -100) so that near-uniform
        compression noise does not defeat ``getbbox``. Returns the original
        image unchanged when no content box is found. Shared by
        :meth:`crop_base64` and :meth:`crop_image` (previously duplicated).
        """
        bg = Image.new(im.mode, im.size, im.getpixel((0, 0)))
        diff = ImageChops.difference(im, bg)
        diff = ImageChops.add(diff, diff, 2.0, -100)
        bbox = diff.getbbox()
        return im.crop(bbox) if bbox else im

    @staticmethod
    def crop_base64(base64_string, output_format='PNG') -> str:
        """
        Takes a base64 encoded image, crops it by removing uniform background,
        and returns the cropped image as base64.

        Args:
            base64_string (str or bytes): Base64 encoded image string, or raw
                image bytes (bytes input is treated as already-decoded data).
            output_format (str): Output image format ('PNG', 'JPEG', etc.)

        Returns:
            str: Base64 encoded cropped image, or empty string if anything
            fails (decode, open, crop or re-encode).
        """
        try:
            # Accept both base64 text and raw image bytes.
            if isinstance(base64_string, bytes):
                image_data = base64_string
            else:
                image_data = base64.b64decode(base64_string)

            im = Image.open(BytesIO(image_data))
            cropped_im = ImageUtils._trim(im)

            # Re-encode the (possibly cropped) image back to base64.
            buffer = BytesIO()
            cropped_im.save(buffer, format=output_format)
            return base64.b64encode(buffer.getvalue()).decode('utf-8')

        except Exception as e:
            # Best-effort contract: signal failure with an empty string.
            print(f"Error processing image: {e}")
            return ""

    @staticmethod
    def crop_image(im: Image.Image) -> Image.Image:
        """Trim a uniform border from *im*.

        Returns the cropped image, or the original image unchanged when no
        crop box is found or cropping fails.
        """
        try:
            return ImageUtils._trim(im)
        except Exception as e:
            print(f"Error cropping image: {e}")
            return im

    @staticmethod
    def draw_bounding_boxes(pil_image: Image.Image, boxes: list[tuple[int, int, int, int]], color: str = "red", width: int = 2) -> Image.Image:
        """
        Draw bounding boxes on a PIL image (the image is modified in place).

        Args:
            pil_image: A PIL.Image instance.
            boxes: A list of boxes, each specified as (x1, y1, x2, y2).
            color: The color for the bounding box outline.
            width: The width of the bounding box line.

        Returns:
            The same PIL.Image with drawn bounding boxes (also returned when
            drawing fails, possibly partially annotated).
        """
        try:
            draw = ImageDraw.Draw(pil_image)
            for box in boxes:
                draw.rectangle(box, outline=color, width=width)
            return pil_image
        except Exception as e:
            print(f"Error drawing bounding boxes: {e}")
            return pil_image

    @staticmethod
    def standardize_image_size(image: Image.Image, target_size: tuple = (1200, 1600), maintain_aspect_ratio: bool = True) -> Image.Image:
        """
        Resize image to target size while optionally maintaining aspect ratio.

        Args:
            image: PIL Image to resize
            target_size: Target (width, height) in pixels
            maintain_aspect_ratio: If True, fit within target size while
                maintaining aspect ratio, centred on a white RGB canvas.

        Returns:
            Resized PIL Image (always exactly ``target_size``).
        """
        if not maintain_aspect_ratio:
            # Direct (possibly distorting) resize to the target dimensions.
            return image.resize(target_size, Image.Resampling.LANCZOS)

        img_ratio = image.width / image.height
        target_ratio = target_size[0] / target_size[1]

        if img_ratio > target_ratio:
            # Image is wider than target: fit to width.
            new_width = target_size[0]
            new_height = int(target_size[0] / img_ratio)
        else:
            # Image is taller than (or same shape as) target: fit to height.
            new_height = target_size[1]
            new_width = int(target_size[1] * img_ratio)

        resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

        # Letterbox onto a white canvas so the output is always target_size.
        final_image = Image.new('RGB', target_size, 'white')
        x_offset = (target_size[0] - new_width) // 2
        y_offset = (target_size[1] - new_height) // 2
        final_image.paste(resized_image, (x_offset, y_offset))

        return final_image

    @staticmethod
    def optimize_image_quality(image: Image.Image, max_size_bytes: int = 1024 * 1024, initial_quality: int = 95) -> tuple[Image.Image, int]:
        """
        Optimize image file size to fit within the specified limit.

        PNG is lossless, so instead of a JPEG-style quality knob this walks
        increasing PNG compression levels (0 = none … 9 = maximum) and stops
        at the first one whose encoded size fits.

        Args:
            image: PIL Image to optimize
            max_size_bytes: Maximum file size in bytes (default 1MB)
            initial_quality: Unused for PNG; kept for interface compatibility.

        Returns:
            Tuple of (optimized_image, final_quality). The quality value is
            synthetic: 95 when the size limit was met, 50 when even maximum
            compression exceeded it.
        """
        compression_levels = [0, 1, 3, 5, 7, 9]

        buffer = BytesIO()
        for compression in compression_levels:
            buffer = BytesIO()
            image.save(buffer, format='PNG', optimize=True, compress_level=compression)
            if buffer.tell() <= max_size_bytes:
                buffer.seek(0)
                return Image.open(buffer), 95

        # Even level 9 (the last attempt) exceeded the limit; reuse that
        # buffer instead of re-encoding a second time at the same level.
        buffer.seek(0)
        return Image.open(buffer), 50

    @staticmethod
    def process_image_for_comparison(image: Image.Image, target_size: tuple = (1200, 1600), max_size_bytes: int = 1024 * 1024) -> tuple[Image.Image, int, int]:
        """
        Process image for comparison: standardize size and optimize quality.

        Args:
            image: PIL Image to process
            target_size: Target size in pixels (width, height)
            max_size_bytes: Maximum file size in bytes (default 1MB)

        Returns:
            Tuple of (processed_image, final_quality, file_size_bytes).
            NOTE(review): file_size is measured by re-saving with the default
            compress level, so it may differ slightly from the size achieved
            during optimization — confirm whether callers rely on it.
        """
        # First, fit the image onto the standard canvas.
        sized_image = ImageUtils.standardize_image_size(image, target_size, maintain_aspect_ratio=True)

        # Then squeeze the encoded size under the limit.
        optimized_image, quality = ImageUtils.optimize_image_quality(sized_image, max_size_bytes)

        # Measure the final encoded size (PNG for consistency).
        buffer = BytesIO()
        optimized_image.save(buffer, format='PNG', optimize=True)
        file_size = buffer.tell()

        return optimized_image, quality, file_size

    @staticmethod
    def image_to_base64_optimized(image: Image.Image, target_size: tuple = (1200, 1600), max_size_bytes: int = 1024 * 1024) -> str:
        """
        Convert image to base64 with size and quality optimization.

        Args:
            image: PIL Image to convert
            target_size: Target size in pixels (width, height)
            max_size_bytes: Maximum file size in bytes (default 1MB)

        Returns:
            Base64 encoded string of the optimized image (PNG format).
        """
        processed_image, _quality, _file_size = ImageUtils.process_image_for_comparison(
            image, target_size, max_size_bytes
        )

        # Encode the processed image as PNG and base64-wrap it.
        buffer = BytesIO()
        processed_image.save(buffer, format='PNG', optimize=True)
        return base64.b64encode(buffer.getvalue()).decode('utf-8')