Jakecole1 committed on
Commit
863cb78
·
verified ·
1 Parent(s): a5cebee

Upload 18 files

Browse files
src/core/__pycache__/analysis.cpython-313.pyc ADDED
Binary file (28.5 kB). View file
 
src/core/__pycache__/analysis.cpython-313.pyc.1424781933232 ADDED
Binary file (25.4 kB). View file
 
src/core/__pycache__/analysis.cpython-313.pyc.3054062041392 ADDED
Binary file (24.3 kB). View file
 
src/core/__pycache__/analysis.cpython-313.pyc.3054062628656 ADDED
Binary file (22.2 kB). View file
 
src/core/__pycache__/analysis.cpython-313.pyc.3054062929328 ADDED
Binary file (21.1 kB). View file
 
src/core/analysis.py ADDED
@@ -0,0 +1,704 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import anthropic
3
+ import requests
4
+ import streamlit as st
5
+ import numpy as np
6
+ import json
7
+ import re
8
+ from requests.adapters import HTTPAdapter
9
+ from urllib3.util.retry import Retry
10
+ from src.extract_text.google_document_api import GoogleDocumentAPI
11
+
12
# Anthropic Messages API endpoint; shared by every request helper in LLM below.
CLAUDE_API_URL = "https://api.anthropic.com/v1/messages"
13
+
14
+
15
+
16
class LLM:
    """Thin client for the Anthropic Messages HTTP API.

    Responsibilities:
      * authentication via the CLAUDE_API_KEY environment variable,
      * transport-level retries (urllib3 ``Retry`` mounted on the session),
      * an application-level retry loop for Anthropic's 529 "overloaded"
        responses,
      * normalising every failure to an empty string so callers never have
        to catch transport exceptions (errors are surfaced via Streamlit).
    """

    def __init__(self):
        """Read CLAUDE_API_KEY and build an HTTPS session with retries.

        Raises:
            ValueError: If the CLAUDE_API_KEY environment variable is unset.
        """
        self.claude_api_key = os.getenv('CLAUDE_API_KEY')
        if not self.claude_api_key:
            raise ValueError("Please set the CLAUDE_API_KEY environment variable.")

        # Configure retry strategy with comprehensive error handling.
        retry_strategy = Retry(
            total=5,  # total transport-level retries
            backoff_factor=2,  # exponential backoff between attempts
            status_forcelist=[429, 500, 502, 503, 504, 529],  # 529 = Anthropic server overload
            allowed_methods=["POST"],  # only retry POST requests
            respect_retry_after_header=True,  # honour server Retry-After hints
        )

        # Create session with the retry strategy mounted for all HTTPS calls.
        self.session = requests.Session()
        self.session.mount("https://", HTTPAdapter(max_retries=retry_strategy))

    def _headers(self) -> dict:
        """Return the standard Anthropic Messages API request headers."""
        return {
            "x-api-key": self.claude_api_key,
            "anthropic-version": "2023-06-01",
            "Content-Type": "application/json"
        }

    def _post_messages(self, payload: dict, timeout: int, api_label: str) -> str:
        """POST *payload* to the Messages endpoint and return the reply text.

        Shared transport for the text and vision entry points, which
        previously duplicated this retry/error-handling loop verbatim.

        Args:
            payload: Fully-built Messages API request body.
            timeout: Per-request timeout in seconds.
            api_label: Human-readable API name used in user-facing messages
                ("Claude API" or "Claude Vision API") so the original
                error strings are preserved exactly.

        Returns:
            The first content block's text on success, otherwise "" (all
            failures are reported through Streamlit rather than raised).
        """
        import time  # local import keeps the module's import surface unchanged

        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = self.session.post(
                    CLAUDE_API_URL,
                    headers=self._headers(),
                    json=payload,
                    verify=True,  # explicitly enable SSL verification
                    timeout=timeout,
                )

                # 529 is Anthropic's "overloaded" status: back off and retry.
                if response.status_code == 529:
                    st.warning(f"Server overload (529) on attempt {attempt + 1}/{max_retries}. Retrying with exponential backoff...")
                    if attempt < max_retries - 1:
                        time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, 4s
                        continue
                    else:
                        st.error("Server overload after all retries. Please try again later.")
                        return ""

                response.raise_for_status()  # raise for other bad status codes

                # Parse response: expected shape is {"content": [{"text": ...}]}.
                response_data = response.json()
                if "content" in response_data and len(response_data["content"]) > 0:
                    return response_data["content"][0]["text"]
                else:
                    st.error(f"Unexpected response format from {api_label}")
                    return ""

            except requests.exceptions.SSLError as ssl_err:
                st.error(f"SSL Error when calling {api_label}. Please check your SSL certificates and network connection. Error: {ssl_err}")
                return ""
            except requests.exceptions.Timeout:
                st.warning(f"Timeout on attempt {attempt + 1}/{max_retries}. Retrying...")
                if attempt == max_retries - 1:
                    st.error("Request timed out after all retries")
                    return ""
            except requests.exceptions.RequestException as e:
                st.error(f"Error calling {api_label}: {str(e)}")
                return ""
            except json.JSONDecodeError as json_err:
                st.error(f"Invalid JSON response from {api_label}: {json_err}")
                return ""

        return ""

    def call_claude_api(self, prompt, system_prompt, model="claude-sonnet-4-20250514", max_tokens=2000) -> str:
        """Call Claude with a plain-text prompt.

        Args:
            prompt: User message text.
            system_prompt: System instruction for the model.
            model: Claude model identifier.
            max_tokens: Response token cap.

        Returns:
            The assistant's reply text, or "" on any failure.
        """
        payload = {
            "model": model,
            "max_tokens": max_tokens,
            "temperature": 0.1,  # near-deterministic output for analysis tasks
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "system": system_prompt
        }
        return self._post_messages(payload, timeout=60, api_label="Claude API")

    def call_claude_vision_api(self, prompt, system_prompt, image_base64, model="claude-sonnet-4-20250514", max_tokens=2000) -> str:
        """Call Claude with a prompt plus one base64-encoded image.

        Args:
            prompt: User message text.
            system_prompt: System instruction for the model.
            image_base64: Base64 image payload; sent with media_type
                image/png — assumes the caller provides PNG data (TODO confirm).
            model: Claude model identifier.
            max_tokens: Response token cap.

        Returns:
            The assistant's reply text, or "" on any failure.
        """
        content = [
            {
                "type": "text",
                "text": prompt
            },
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": image_base64
                }
            }
        ]

        payload = {
            "model": model,
            "max_tokens": max_tokens,
            "temperature": 0,
            "messages": [
                {
                    "role": "user",
                    "content": content
                }
            ],
            "system": system_prompt
        }
        # Vision calls get a longer timeout (90s vs 60s) for image processing.
        return self._post_messages(payload, timeout=90, api_label="Claude Vision API")

    def call_claude_pdf_api(self, prompt, system_prompt, pdf_base64, model="claude-sonnet-4-20250514", max_tokens=4000) -> str:
        """Analyse a base64-encoded PDF by extracting its text and delegating
        to :meth:`call_claude_api`.

        Bug fix: the original version extracted ``text_content`` from the PDF
        but never passed it to the model (the original comment said "Use
        regular API with extracted text" while the code discarded it). The
        extracted text is now appended to the prompt.

        Args:
            prompt: Base prompt describing the task.
            system_prompt: System instruction for the model.
            pdf_base64: Base64-encoded PDF bytes.
            model: Claude model identifier.
            max_tokens: Response token cap.

        Returns:
            The assistant's reply text, or "" on any failure.
        """
        # For now we use text extraction; a future enhancement could use the
        # Converse API with citations for full visual PDF analysis.
        st.info("📄 PDF requirements detected. Using text-based processing for now.")
        st.info("💡 For full visual PDF analysis, consider using the Converse API with citations enabled.")

        try:
            import base64
            import io

            # PyPDF2 is optional; degrade gracefully when it is missing.
            try:
                from PyPDF2 import PdfReader
                pdf_reader_available = True
            except ImportError:
                pdf_reader_available = False
                st.warning("PyPDF2 not available. Using basic text processing for PDF.")

            if pdf_reader_available:
                # Decode base64 PDF and extract the text of every page.
                pdf_bytes = base64.b64decode(pdf_base64)
                pdf_stream = io.BytesIO(pdf_bytes)

                reader = PdfReader(pdf_stream)
                text_content = ""
                for page in reader.pages:
                    text_content += page.extract_text() + "\n"

                if not text_content.strip():
                    text_content = "PDF Requirements Document (text extraction limited)"

                # FIX: include the extracted text in the prompt — previously
                # it was computed and then silently dropped.
                augmented_prompt = f"{prompt}\n\nExtracted PDF text:\n{text_content}"
                return self.call_claude_api(augmented_prompt, system_prompt, model=model, max_tokens=max_tokens)
            else:
                # Fallback when PyPDF2 is not available: send the prompt as-is.
                return self.call_claude_api(prompt, system_prompt, model=model, max_tokens=max_tokens)

        except Exception as e:
            st.warning(f"PDF text extraction failed: {e}")
            st.warning("Falling back to basic text processing")

            # Fallback to basic text processing with the original prompt.
            return self.call_claude_api(prompt, system_prompt, model=model, max_tokens=max_tokens)
244
+
245
class ComplianceAnalysis:
    """Orchestrates the packaging-compliance pipeline.

    Pipeline stages (all LLM-backed via :class:`LLM`):
      1. extract structured requirements from a requirements document,
      2. verify each requirement against the packaging content,
      3. generate a final markdown compliance report.
    """

    def __init__(self):
        # Single LLM client shared by every analysis step.
        self.llm = LLM()

    def extract_structured_requirements(self, requirements_data) -> list[dict]:
        """
        Use Claude to extract structured requirements from the requirements document.

        Args:
            requirements_data: Either a string (for text files) or a dict (for PDF files) containing requirements.
                For dicts, keys used are 'text_content', 'type', and (when type == 'pdf') 'content' (base64 PDF).

        Returns:
            A list of dictionaries, each containing a requirement ID, description, and category
            (fields: id, description, category, source_reference). Returns [] on any error.
        """
        # Handle both text and PDF requirements
        if isinstance(requirements_data, str):
            # Text-based requirements
            requirements_text = requirements_data
            requirements_type = "text"
        elif isinstance(requirements_data, dict):
            # PDF-based requirements
            requirements_text = requirements_data.get('text_content', '')
            requirements_type = requirements_data.get('type', 'text')
            pdf_base64 = requirements_data.get('content', '') if requirements_type == 'pdf' else None
        else:
            st.error("Invalid requirements data format. Please upload a valid requirements document.")
            return []

        # Check if requirements text is empty or None
        if not requirements_text or not requirements_text.strip():
            st.error("Requirements text is empty. Please upload a valid requirements document.")
            return []

        system_prompt = """You are an expert requirements analyst. Extract clear, structured requirements from documents. You must always return valid JSON, even if no specific requirements are found."""

        extraction_prompt = f"""
        Extract all requirements from this document (not just allergen requirements):

        {requirements_text}

        For each requirement found, provide:
        1. Unique ID (REQ001, REQ002, etc.)
        2. Description (verbatim from the document)
        3. Category (Font Size, Allergen List, Formatting, Placement, Barcode, Organic, Promotional, etc.)
        4. Source reference (section/paragraph or line number)

        If no requirements are found, return an empty array: []

        Return as JSON array with fields: id, description, category, source_reference.

        Example:
        ```json
        [
            {{
                "id": "REQ001",
                "description": "IF the product is labeled as organic, THEN a certified organic seal must be visible",
                "category": "Organic",
                "source_reference": "Line 1"
            }},
            {{
                "id": "REQ002",
                "description": "IF there is a promotional offer mentioned, THEN include the offer expiry date",
                "category": "Promotional",
                "source_reference": "Line 2"
            }}
        ]
        ```

        IMPORTANT: Always return valid JSON. If you cannot extract any requirements, return an empty array: []
        """

        # Use appropriate API based on requirements type.
        # NOTE(review): pdf_base64 is only bound on the dict branch; this is
        # safe today because requirements_type == 'pdf' short-circuits first
        # on the string branch, but the pattern is fragile.
        if requirements_type == 'pdf' and pdf_base64:
            # Use PDF API for native PDF processing
            response = self.llm.call_claude_pdf_api(extraction_prompt, system_prompt, pdf_base64, model='claude-sonnet-4-20250514')
        else:
            # Use regular API for text processing (cheaper/faster haiku model)
            response = self.llm.call_claude_api(extraction_prompt, system_prompt, model='claude-3-5-haiku-20241022')

        # Extract JSON from the response
        try:
            # Find JSON content between triple backticks if present
            if "```json" in response and "```" in response.split("```json")[1]:
                json_content = response.split("```json")[1].split("```")[0].strip()
            elif "```" in response:
                # Try to find any code block
                json_content = response.split("```")[1].split("```")[0].strip()
            else:
                # Assume the entire response is JSON
                json_content = response

            # Clean the JSON content to handle control characters
            # Remove or replace invalid control characters except newlines and tabs
            json_content = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', json_content)
            # Replace newlines within strings with escaped newlines
            json_content = re.sub(r'(?<!\\)"(?:[^"\\]|\\.)*?(?<!\\)"', lambda m: m.group(0).replace('\n', '\\n'), json_content)

            requirements = json.loads(json_content)
            return requirements
        except Exception as e:
            st.error(f"Error parsing extracted requirements: {e}")
            st.error(f"Raw response: {response}")
            # Return empty array as fallback
            return []


    def verify_individual_requirement(self, requirement, markdown_table, image=None, barcode_data=None, metadata=None, requirements_data=None):
        """
        Use structured reasoning to verify if a specific requirement is met in the packaging text.

        Args:
            requirement: A dictionary containing requirement details (keys used: id, description, category)
            markdown_table: The markdown table extracted from the packaging PDF
            image: The image of the packaging document as base64 (optional; routes to the vision API)
            barcode_data: List of barcode objects with position data (optional; keys used: id, type, data, valid)
            metadata: Dictionary containing font, font size, and color metadata (optional)
            requirements_data: Original requirements data (text or PDF) for context (optional;
                NOTE(review): accepted but currently unused by this method)
        Returns:
            A dictionary with verification results including reasoning and compliance status.
            On failure, compliance_status is "ERROR" with confidence 0.
        """
        system_prompt = """You are a regulatory compliance expert. Provide detailed, objective compliance reports."""

        # Build the prompt for verification
        verification_prompt = f"""
        You are a regulatory compliance expert. Provide detailed, objective compliance reports.
        I need to verify if the following specific requirement is met in the packaging text:

        Requirement ID: {requirement['id']}
        Requirement Description: {requirement['description']}
        Requirement Category: {requirement['category']}

        Here is the packaging text to analyze:

        {markdown_table}
        """

        # Add barcode information if available
        if barcode_data:
            # Create minimal barcode summary for LLM (save tokens)
            barcode_summary = []
            for barcode in barcode_data:
                barcode_summary.append({
                    'id': barcode['id'],
                    'type': barcode['type'],
                    'data': barcode['data'],
                    'valid': barcode['valid']
                })

            verification_prompt += f"""

        Barcode Information Found:
        {json.dumps(barcode_summary, indent=2)}

        When analyzing barcode-related requirements, consider:
        - Barcode ID for evidence reference
        - Barcode type and validation status
        """

        # Add metadata information if available (skip if extraction errored)
        if metadata and not metadata.get('error'):
            # Create metadata summary for LLM (save tokens)
            metadata_summary = {
                'extraction_method': metadata.get('extraction_method', 'unknown'),
                'has_selectable_text': metadata.get('has_selectable_text', False),
                'pages_processed': metadata.get('pages_processed', 0),
                'dominant_font': metadata.get('fonts', {}),
                'dominant_font_size': metadata.get('font_sizes', {}),
                'dominant_text_color': metadata.get('text_colors', {})
            }

            verification_prompt += f"""

        Typography and Design Metadata:
        {json.dumps(metadata_summary, indent=2)}

        When analyzing typography and design requirements, consider:
        - Font types and their usage frequency
        - Font sizes and their distribution
        - Text colors and their application
        - Whether text is selectable or requires OCR
        """

        # Closing instructions: reasoning steps plus the required JSON schema.
        verification_prompt += f"""

        Verify this requirement using these steps:
        1. Break down into checkable criteria
        2. Search for evidence in packaging text (provide Text ID)
        3. For visual elements not in text, describe clearly (text_id = null)
        4. For barcode evidence, use Barcode ID (text_id = null)
        5. Provide specific examples/quotes
        6. Determine: COMPLIANT/NON-COMPLIANT/PARTIALLY COMPLIANT
        - Compliant: All applicable rules are fully met without any deviation.
        - Partially Compliant: Some rules are met, but minor issues/omissions that don't constitute a full failure but need attention.
        - Non-Compliant: One or more critical rules are violated or omitted, posing a regulatory, safety, or logistical risk.
        7. Explain reasoning

        For visual evidence, describe:
        - Location (e.g., "top right corner", "bottom section")
        - Visual characteristics (e.g., "large bold text", "red warning box")
        - Content description (e.g., "allergen warning in red box")

        If there is barcode evidence, include:
        - Barcode ID
        - Barcode type and validation status

        Return JSON with structure:
        ```json
        {{
            "requirement_id": "{requirement['id']}",
            "criteria": ["criterion 1", "criterion 2"],
            "evidence_found": [
                {{"text_id": <Text ID or null>, "evidence_text": "<description>", "barcode_id": "<Barcode ID ONLY if applicable>"}}
            ],
            "compliance_status": "COMPLIANT/NON-COMPLIANT/PARTIALLY COMPLIANT",
            "reasoning": "Detailed explanation",
            "confidence": 0.95
        }}
        ```
        """

        # Use vision API if image is provided, otherwise use regular API
        if image:
            response = self.llm.call_claude_vision_api(verification_prompt, system_prompt, image)
        else:
            response = self.llm.call_claude_api(verification_prompt, system_prompt)

        # Extract JSON from the response with enhanced error handling
        try:
            # Check if response is empty or None (LLM helpers return "" on failure)
            if not response or not response.strip():
                st.error("Empty response received from Claude API")
                return {
                    "requirement_id": requirement['id'],
                    "evidence_found": [],
                    "compliance_status": "ERROR",
                    "reasoning": "Empty response received from Claude API",
                    "confidence": 0
                }

            # Find JSON content between triple backticks if present
            if "```json" in response and "```" in response.split("```json")[1]:
                json_content = response.split("```json")[1].split("```")[0].strip()
            elif "```" in response:
                # Try to find any code block
                json_content = response.split("```")[1].split("```")[0].strip()
            else:
                # Assume the entire response is JSON
                json_content = response

            # Clean the JSON content to handle control characters
            # Remove or replace invalid control characters except newlines and tabs
            json_content = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', json_content)
            # Replace newlines within strings with escaped newlines
            json_content = re.sub(r'(?<!\\)"(?:[^"\\]|\\.)*?(?<!\\)"', lambda m: m.group(0).replace('\n', '\\n'), json_content)

            # Try to parse JSON with multiple fallback strategies
            verification_result = None

            # Strategy 1: Direct parsing
            try:
                verification_result = json.loads(json_content)
            except json.JSONDecodeError as e1:
                st.warning(f"Initial JSON parsing failed: {e1}")

                # Strategy 2: Try to extract JSON from malformed response
                try:
                    # Look for JSON-like structure (greedy outermost-brace match)
                    json_match = re.search(r'\{.*\}', json_content, re.DOTALL)
                    if json_match:
                        potential_json = json_match.group(0)
                        verification_result = json.loads(potential_json)
                        st.info("Successfully extracted JSON from malformed response")
                except json.JSONDecodeError as e2:
                    st.warning(f"JSON extraction failed: {e2}")

                    # Strategy 3: Create a minimal valid JSON structure
                    try:
                        # Try to extract key information from the response.
                        # NOTE(review): "COMPLIANT" is a substring of
                        # "NON-COMPLIANT", so the first branch wins whenever
                        # either word appears — ordering matters here.
                        compliance_status = "UNKNOWN"
                        if "COMPLIANT" in response.upper():
                            compliance_status = "COMPLIANT"
                        elif "NON-COMPLIANT" in response.upper():
                            compliance_status = "NON-COMPLIANT"
                        elif "PARTIALLY" in response.upper():
                            compliance_status = "PARTIALLY COMPLIANT"

                        verification_result = {
                            "requirement_id": requirement['id'],
                            "criteria": ["Unable to parse criteria"],
                            "evidence_found": [],
                            "compliance_status": compliance_status,
                            "reasoning": f"Response parsing failed. Raw response: {response[:200]}...",
                            "confidence": 0.1
                        }
                        st.warning("Created fallback JSON structure due to parsing errors")
                    except Exception as e3:
                        st.error(f"Fallback JSON creation failed: {e3}")
                        raise e3

            if verification_result:
                return verification_result
            else:
                raise Exception("All JSON parsing strategies failed")

        except Exception as e:
            st.error(f"Error parsing verification result: {e}")
            st.error(f"Raw response: {response}")
            # Return a failure result
            return {
                "requirement_id": requirement['id'],
                "evidence_found": [],
                "compliance_status": "ERROR",
                "reasoning": f"Failed to verify requirement due to parsing error: {str(e)}",
                "confidence": 0
            }



    def analyze_compliance(self, requirements_data, packaging_text, packaging_data, image=None, barcode_data=None, metadata=None, model="claude-sonnet-4-20250514"):
        """
        Analyze packaging compliance through multi-step process:
        1. Extract structured requirements
        2. Verify each requirement with structured reasoning
        3. Generate a final markdown compliance report

        Args:
            requirements_data: The requirements data (text string or PDF dict)
            packaging_text: Markdown table extracted from the packaging PDF
            packaging_data: Structured text with bounding boxes (passed through to the result)
            image: The image of the packaging document
            barcode_data: List of barcode objects with position data
            metadata: Dictionary containing font, font size, and color metadata
            model: The Claude model to use
                (NOTE(review): accepted but currently unused — the per-step
                calls below pin their own model names)

        Returns:
            A dictionary containing compliance analysis results, or an
            {"error": ...} dictionary when no requirements were extracted.
        """
        # Step 1: Extract structured requirements
        st.info("Extracting structured requirements...")
        requirements = self.extract_structured_requirements(requirements_data)

        if not requirements:
            st.warning("No requirements found in the document. Please check that your requirements file contains valid requirement statements.")
            return {"error": "No requirements found", "requirements": [], "verifications": []}

        st.success(f"Extracted {len(requirements)} requirements")

        # Step 2: Verify each requirement with structured reasoning
        st.info("Verifying requirements...")
        verifications = []

        for i, req in enumerate(requirements):
            st.text(f"Verifying requirement {i+1}/{len(requirements)}: {req['id']}")

            # Get verification result
            verification = self.verify_individual_requirement(req, packaging_text, image, barcode_data, metadata, requirements_data)
            verifications.append(verification)

        # Step 3: Generate final compliance report
        system_prompt = """You are a regulatory compliance expert. Provide detailed, objective compliance reports."""

        # Create minimal summary for LLM (save tokens)
        compliance_summary = []
        for verification in verifications:
            compliance_summary.append({
                'requirement_id': verification.get('requirement_id', 'Unknown'),
                'compliance_status': verification.get('compliance_status', 'UNKNOWN'),
                'confidence': verification.get('confidence', 0),
                'evidence_count': len(verification.get('evidence_found', []))
            })

        summary_prompt = f"""
        Based on the verification of {len(requirements)} requirements,
        please provide a final compliance summary report.

        Requirements Summary:
        {json.dumps([{'id': req['id'], 'description': req['description'], 'category': req['category']} for req in requirements], indent=2)}

        Compliance Results Summary:
        {json.dumps(compliance_summary, indent=2)}

        Format your response in the following template:

        ## 🎯 **Analysis Requirements**

        Summarize the overall compliance status with focus on:

        1. **Quantitative Metrics**: Count of fully compliant, partially compliant, and non-compliant requirements
        2. **Critical Issues**: Most urgent compliance gaps requiring immediate attention
        3. **Strategic Recommendations**: Actionable steps for the artwork designer to fix the compliance issues

        ---

        ## 📋 **Response Template**

        ### 🔍 **Executive Summary**
        Provide a single, clear statement of overall compliance status
        *Example: "Organization achieved 70% compliance (14/20 requirements); moderate risk profile with 3 critical gaps identified."*

        ---

        ### 📈 **Compliance Statistics**

        | **Metric** | **Count** | **Percentage** |
        |------------|-----------|----------------|
        | **Total Requirements** | `[total]` | `100%` |
        | ✅ **Fully Compliant** | `[count]` | `[%]` |
        | ⚠️ **Partially Compliant** | `[count]` | `[%]` |
        | ❌ **Non-Compliant** | `[count]` | `[%]` |

        ---

        ### 🚨 **Priority Findings**

        List 3-5 highest-severity issues in order of criticality:

        1. **[REQ-ID]** - [Brief description of critical issue]
        2. **[REQ-ID]** - [Brief description of high-priority gap]
        3. **[REQ-ID]** - [Brief description of moderate-priority concern]

        ---

        ### 💡 **Targeted Recommendations**

        For each Priority Finding, provide specific corrective actions:

        | **Finding** | **Recommended Action** | **Priority** |
        |-------------|------------------------|--------------|
        | **[REQ-ID]** | [Specific artwork designer action] | 🔴 **Critical** |
        | **[REQ-ID]** | [Specific artwork designer action] | 🟡 **High** |
        | **[REQ-ID]** | [Specific artwork designer action] | 🟢 **Medium** |

        ---

        ### 📝 **Detailed Assessment Results**

        *[Provide comprehensive breakdown of each requirement with status and supporting details]*

        ---

        ### 📊 **Supporting Evidence**

        *[Include relevant data, metrics, or documentation that supports the compliance assessment]*


        """

        # Get the final compliance report (cheaper haiku model suffices here)
        compliance_report = self.llm.call_claude_api(summary_prompt, system_prompt, model='claude-3-5-haiku-20241022')

        # Compile all results
        result = {
            "requirements": requirements,
            "verifications": verifications,
            "compliance_report": compliance_report,
            "packaging_data": packaging_data,
            "barcode_data": barcode_data,
            "metadata": metadata
        }

        return result
src/extract_text/__pycache__/extract_meta_data.cpython-313.pyc ADDED
Binary file (16.2 kB). View file
 
src/extract_text/__pycache__/google_document_api.cpython-313.pyc ADDED
Binary file (13.1 kB). View file
 
src/extract_text/__pycache__/google_document_api.cpython-313.pyc.1480615374128 ADDED
Binary file (7.92 kB). View file
 
src/extract_text/__pycache__/ingest.cpython-313.pyc ADDED
Binary file (3.51 kB). View file
 
src/extract_text/extract_meta_data.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import pytesseract
3
+ from PIL import Image
4
+ import numpy as np
5
+ import cv2
6
+ from collections import defaultdict, Counter
7
+ import io
8
+ import re
9
+ from typing import Dict, List, Tuple, Optional, Union
10
+
11
+
12
class PDFArtworkMetadataExtractor:
    """
    Extract metadata (font, font size, text color) from artwork PDFs.

    Uses PyMuPDF span data when the PDF contains selectable text, and falls
    back to Tesseract OCR (size/colour estimation from bounding boxes) for
    image-only PDFs.
    """

    # Zoom applied when rasterising pages for OCR (2x for better recognition).
    # Font-size estimation must divide by this factor so estimates are in
    # page space rather than rendered-pixel space.
    OCR_RENDER_ZOOM = 2.0

    def __init__(self, tesseract_path: Optional[str] = None):
        """
        Initialize the metadata extractor.

        Args:
            tesseract_path: Path to tesseract executable (if not in PATH).
        """
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

        self.pdf_doc = None  # fitz.Document while a PDF is open, else None
        self.metadata = {
            'fonts': {},             # font name -> character count
            'font_sizes': {},        # size in pt (1 decimal) -> character count
            'text_colors': {},       # (r, g, b) -> character count
            'has_selectable_text': False,
            'pages_processed': 0,
            'extraction_method': None  # 'selectable_text' or 'ocr'
        }

    def load_pdf(self, pdf_path: str) -> bool:
        """
        Load a PDF document.

        Args:
            pdf_path: Path to PDF file.

        Returns:
            bool: True if successful, False otherwise.
        """
        try:
            self.pdf_doc = fitz.open(pdf_path)
            return True
        except Exception as e:
            print(f"Error loading PDF: {e}")
            return False

    def _extract_selectable_text_metadata(self) -> Dict:
        """
        Extract metadata from selectable text using PyMuPDF span data.

        Returns:
            Dict with 'fonts', 'font_sizes' and 'text_colors' frequency maps,
            each weighted by the number of characters carrying that value.
        """
        fonts = defaultdict(int)
        font_sizes = defaultdict(int)
        colors = defaultdict(int)

        for page_num in range(len(self.pdf_doc)):
            page = self.pdf_doc[page_num]

            # "dict" output exposes per-span font/size/colour information.
            text_dict = page.get_text("dict")

            for block in text_dict["blocks"]:
                if "lines" not in block:
                    continue  # image blocks carry no text spans
                for line in block["lines"]:
                    for span in line["spans"]:
                        font_name = span.get("font", "Unknown")
                        font_size = span.get("size", 0)

                        # PyMuPDF encodes span colour as a packed 0xRRGGBB int.
                        color = span.get("color", 0)
                        if isinstance(color, int):
                            color_rgb = ((color >> 16) & 255,
                                         (color >> 8) & 255,
                                         color & 255)
                        else:
                            color_rgb = (0, 0, 0)  # default to black

                        text_content = span.get("text", "").strip()
                        if text_content:
                            weight = len(text_content)
                            fonts[font_name] += weight
                            font_sizes[round(font_size, 1)] += weight
                            colors[color_rgb] += weight

        return {
            'fonts': dict(fonts),
            'font_sizes': dict(font_sizes),
            'text_colors': dict(colors)
        }

    def _preprocess_image_for_ocr(self, image: np.ndarray) -> np.ndarray:
        """
        Preprocess a rendered page image for better OCR results.

        Args:
            image: Input image as numpy array (RGB or grayscale).

        Returns:
            np.ndarray: Denoised, adaptively thresholded grayscale image.
        """
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        else:
            gray = image

        denoised = cv2.fastNlMeansDenoising(gray)

        # Adaptive thresholding copes with uneven artwork backgrounds.
        return cv2.adaptiveThreshold(
            denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 11, 2
        )

    def _estimate_font_size_from_ocr(self, image: np.ndarray, text_data: Dict,
                                     zoom: float = OCR_RENDER_ZOOM) -> Dict[float, int]:
        """
        Estimate font sizes from OCR bounding boxes.

        Args:
            image: Rendered page image (unused; kept for interface stability).
            text_data: OCR data dict from pytesseract.image_to_data.
            zoom: Rasterisation zoom used to render the page. Box heights are
                divided by this so estimates are in page space — the original
                code ignored the 2x render zoom and over-estimated sizes.

        Returns:
            Dict mapping estimated size in pt (1 decimal) to character count.
        """
        font_sizes = defaultdict(int)

        for i, text in enumerate(text_data['text']):
            stripped = text.strip()
            if not stripped:
                continue
            # Undo the render zoom, then apply a rough px->pt factor (0.75),
            # clamped to a plausible 8-72 pt range. Still a heuristic.
            height = text_data['height'][i] / zoom
            estimated_size = max(8.0, min(72.0, height * 0.75))
            font_sizes[round(estimated_size, 1)] += len(stripped)

        return dict(font_sizes)

    def _extract_colors_from_image(self, image: np.ndarray, text_data: Dict) -> Dict[Tuple[int, int, int], int]:
        """
        Extract the dominant colour of each OCR text region.

        Args:
            image: Original (unthresholded) page image.
            text_data: OCR data dict from pytesseract.image_to_data.

        Returns:
            Dict mapping (r, g, b) to the number of characters drawn in it.
        """
        colors = defaultdict(int)

        for i, text in enumerate(text_data['text']):
            stripped = text.strip()
            if not stripped:
                continue

            x, y = text_data['left'][i], text_data['top'][i]
            w, h = text_data['width'][i], text_data['height'][i]

            if not (0 <= y < image.shape[0] and 0 <= x < image.shape[1]):
                continue
            text_region = image[y:y + h, x:x + w]
            if text_region.size == 0:
                continue

            if len(text_region.shape) == 3:
                pixels = text_region.reshape(-1, 3)
                unique_colors, counts = np.unique(pixels, axis=0, return_counts=True)

                # Keep only plausible foreground colours (mean channel < 200
                # filters light backgrounds), then credit the single most
                # frequent one as the region's text colour. (The original
                # credited every dark unique pixel colour, which massively
                # over-counted anti-aliased edge colours.)
                best_color, best_count = None, 0
                for color, count in zip(unique_colors, counts):
                    if np.mean(color) < 200 and count > best_count:
                        best_color = tuple(int(c) for c in color)
                        best_count = int(count)
                if best_color is not None:
                    colors[best_color] += len(stripped)
            else:
                # Grayscale region: treat dark regions as black text.
                if np.mean(text_region) < 128:
                    colors[(0, 0, 0)] += len(stripped)

        return dict(colors)

    def _extract_ocr_metadata(self) -> Dict:
        """
        Extract metadata using OCR for non-selectable text.

        Returns:
            Dict with estimated 'fonts', 'font_sizes' and 'text_colors'.
        """
        all_font_sizes = defaultdict(int)
        all_colors = defaultdict(int)

        for page_num in range(len(self.pdf_doc)):
            page = self.pdf_doc[page_num]

            # Render at OCR_RENDER_ZOOM for better recognition quality.
            pix = page.get_pixmap(matrix=fitz.Matrix(self.OCR_RENDER_ZOOM, self.OCR_RENDER_ZOOM))
            img_data = pix.tobytes("ppm")
            image = Image.open(io.BytesIO(img_data))
            image_np = np.array(image)

            processed_img = self._preprocess_image_for_ocr(image_np)

            ocr_data = pytesseract.image_to_data(processed_img, output_type=pytesseract.Output.DICT)

            page_font_sizes = self._estimate_font_size_from_ocr(
                processed_img, ocr_data, zoom=self.OCR_RENDER_ZOOM
            )
            for size, count in page_font_sizes.items():
                all_font_sizes[size] += count

            # Colours come from the original (unthresholded) render.
            page_colors = self._extract_colors_from_image(image_np, ocr_data)
            for color, count in page_colors.items():
                all_colors[color] += count

        # OCR cannot determine actual fonts; provide rough common estimates.
        total = sum(all_font_sizes.values())
        estimated_fonts = {
            'Arial-like': total * 0.4,
            'Times-like': total * 0.3,
            'Helvetica-like': total * 0.3
        }

        return {
            'fonts': estimated_fonts,
            'font_sizes': dict(all_font_sizes),
            'text_colors': dict(all_colors)
        }

    def _has_selectable_text(self) -> bool:
        """
        Check whether the PDF has selectable text (first 3 pages sampled).

        Returns:
            bool: True if any sampled page yields non-empty text.
        """
        for page_num in range(min(3, len(self.pdf_doc))):
            if self.pdf_doc[page_num].get_text().strip():
                return True
        return False

    def extract_metadata(self, pdf_path: str) -> Dict:
        """
        Extract metadata from a PDF artwork file.

        Args:
            pdf_path: Path to PDF file.

        Returns:
            Dict: Complete metadata dictionary (frequency maps sorted most
            common first), or {'error': ...} on failure.
        """
        if not self.load_pdf(pdf_path):
            return {'error': 'Failed to load PDF'}

        try:
            self.metadata['pages_processed'] = len(self.pdf_doc)
            has_selectable = self._has_selectable_text()
            self.metadata['has_selectable_text'] = has_selectable

            if has_selectable:
                self.metadata['extraction_method'] = 'selectable_text'
                extracted_data = self._extract_selectable_text_metadata()
            else:
                self.metadata['extraction_method'] = 'ocr'
                extracted_data = self._extract_ocr_metadata()

            self.metadata.update(extracted_data)

            # Sort each frequency map so the most common entry comes first.
            for key in ('fonts', 'font_sizes', 'text_colors'):
                self.metadata[key] = dict(sorted(
                    self.metadata[key].items(),
                    key=lambda item: item[1],
                    reverse=True
                ))

            return self.metadata

        except Exception as e:
            return {'error': f'Failed to extract metadata: {e}'}

        finally:
            if self.pdf_doc:
                self.pdf_doc.close()
                # Drop the closed handle so later calls cannot touch it.
                self.pdf_doc = None

    def get_dominant_font(self) -> Optional[str]:
        """Get the most frequently used font, or None if nothing extracted."""
        if self.metadata['fonts']:
            return max(self.metadata['fonts'], key=self.metadata['fonts'].get)
        return None

    def get_dominant_font_size(self) -> Optional[float]:
        """Get the most frequently used font size, or None if unavailable."""
        if self.metadata['font_sizes']:
            return max(self.metadata['font_sizes'], key=self.metadata['font_sizes'].get)
        return None

    def get_dominant_color(self) -> Optional[Tuple[int, int, int]]:
        """Get the most frequently used text color, or None if unavailable."""
        if self.metadata['text_colors']:
            return max(self.metadata['text_colors'], key=self.metadata['text_colors'].get)
        return None

    def print_summary(self):
        """Print a human-readable summary of the extracted metadata."""
        print("PDF Artwork Metadata Summary")
        print("=" * 40)
        print(f"Pages processed: {self.metadata['pages_processed']}")
        print(f"Has selectable text: {self.metadata['has_selectable_text']}")
        print(f"Extraction method: {self.metadata['extraction_method']}")
        print()

        print("Top 5 Fonts:")
        for i, (font, count) in enumerate(list(self.metadata['fonts'].items())[:5]):
            print(f"  {i+1}. {font}: {count} characters")
        print()

        print("Top 5 Font Sizes:")
        for i, (size, count) in enumerate(list(self.metadata['font_sizes'].items())[:5]):
            print(f"  {i+1}. {size}pt: {count} characters")
        print()

        print("Top 5 Text Colors (RGB):")
        for i, (color, count) in enumerate(list(self.metadata['text_colors'].items())[:5]):
            print(f"  {i+1}. {color}: {count} characters")
src/extract_text/google_document_api.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Optional, List, Dict, Any
3
+ from google.api_core.client_options import ClientOptions
4
+ from google.cloud import documentai # type: ignore
5
+ from PIL import Image, ImageChops
6
+ from io import BytesIO
7
+ import fitz # PyMuPDF
8
+ import base64
9
+
10
class GoogleDocumentAPI:
    """Thin wrapper around the Google Document AI synchronous process API."""

    def __init__(self, credentials_path: str):
        # The Document AI client picks its credentials up from this env var.
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

        # TODO(review): project/location/processor are hard-coded; move them
        # to configuration so other environments can use this class.
        self.project_id = "649829115993"
        self.location = "us"  # Format is "us" or "eu"
        self.processor_id = "7f9fd758484d83fe"  # Only use this
        self.mime_type = "application/pdf"  # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types

    def process_document(self, file_path: str, field_mask: Optional[str] = None, processor_version_id: Optional[str] = None) -> documentai.Document:
        """
        Run the configured processor on a local file and return the Document.

        Args:
            file_path: Path to the PDF to process.
            field_mask: Optional field mask restricting the response payload.
            processor_version_id: Optional explicit processor version.

        Returns:
            documentai.Document: The processed document.

        Note: only page 1 is processed (see IndividualPageSelector below).
        """
        opts = ClientOptions(api_endpoint=f"{self.location}-documentai.googleapis.com")
        client = documentai.DocumentProcessorServiceClient(client_options=opts)

        if processor_version_id:
            name = client.processor_version_path(
                self.project_id, self.location, self.processor_id, processor_version_id
            )
        else:
            name = client.processor_path(self.project_id, self.location, self.processor_id)

        with open(file_path, "rb") as image:
            image_content = image.read()

        raw_document = documentai.RawDocument(content=image_content, mime_type=self.mime_type)

        # Restrict processing to the first page only.
        process_options = documentai.ProcessOptions(
            individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
                pages=[1]
            )
        )

        request = documentai.ProcessRequest(
            name=name,
            raw_document=raw_document,
            field_mask=field_mask,
            process_options=process_options,
        )

        result = client.process_document(request=request)
        return result.document

    def get_document_text(self, document: documentai.Document, page_number: int = 0) -> str:
        """Return the text of one page.

        Note: document.pages is 0-indexed; a request for page 1 lands in
        document.pages[0].
        """
        return document.pages[page_number].text

    @staticmethod
    def _get_style_info(text_anchor: documentai.Document.TextAnchor, document: documentai.Document) -> str:
        """Summarise the text styles overlapping a text anchor.

        Returns "N/A" when the document carries no style data, "default"
        when no style overlaps, otherwise a comma-joined style summary.
        """
        if not hasattr(document, 'text_styles') or not document.text_styles:
            return "N/A"

        styles = []
        # A text anchor can have multiple non-contiguous segments.
        for para_segment in text_anchor.text_segments:
            para_start = int(para_segment.start_index)
            para_end = int(para_segment.end_index)

            for style in document.text_styles:
                for style_segment in style.text_anchor.text_segments:
                    style_start = int(style_segment.start_index)
                    style_end = int(style_segment.end_index)

                    # Half-open interval overlap test between paragraph and style.
                    if max(para_start, style_start) < min(para_end, style_end):
                        style_str_parts = []
                        if style.font_size and style.font_size.size > 0:
                            unit = style.font_size.unit if style.font_size.unit else 'pt'
                            style_str_parts.append(f"font size: {round(style.font_size.size)}{unit}")
                        if style.font_weight and style.font_weight.lower() != 'normal':
                            style_str_parts.append(f"font weight: {style.font_weight}")
                        if style.text_style and style.text_style.lower() != 'normal':
                            style_str_parts.append(f"text style: {style.text_style}")
                        if style.font_family:
                            style_str_parts.append(f'font family: {style.font_family}')

                        if style_str_parts:
                            styles.append(" ".join(style_str_parts))

        if styles:
            # dict.fromkeys preserves order while deduplicating.
            unique_styles = list(dict.fromkeys(styles))
            return ", ".join(unique_styles)

        return "default"

    @staticmethod
    def _get_text(text_anchor: documentai.Document.TextAnchor, text: str) -> str:
        """Concatenate the full-text slices referenced by a text anchor."""
        if not text_anchor.text_segments:
            return ""
        return "".join(
            text[int(segment.start_index) : int(segment.end_index)]
            for segment in text_anchor.text_segments
        )

    def extract_text_with_bounding_boxes(self, document: documentai.Document) -> List[Dict[str, Any]]:
        """
        Extracts text and bounding box for each paragraph in the document.

        Args:
            document: The processed documentai.Document object.

        Returns:
            A list of dictionaries, where each dictionary contains:
            - 'page_number': The page number (1-based).
            - 'text': The text of the paragraph.
            - 'bounding_box': A list of normalized vertices for the bounding box.
            - 'style': Style information for the text.
            - 'height': The height of the text block in millimeters (mm).
        """
        all_paragraphs = []
        full_text = document.text
        pt_to_mm = 0.3528  # Conversion factor from points to millimeters

        for page in document.pages:
            # Page height in points scales normalized y-ratios to real sizes.
            page_pts = page.dimension.height

            for paragraph in page.paragraphs:
                p_text = self._get_text(paragraph.layout.text_anchor, full_text)
                style_info = self._get_style_info(paragraph.layout.text_anchor, document)

                vertices = [
                    {"x": vertex.x, "y": vertex.y}
                    for vertex in paragraph.layout.bounding_poly.normalized_vertices
                ]

                # Height = normalized y-extent * page height (pt) -> mm.
                y_coords = [vertex.y for vertex in paragraph.layout.bounding_poly.normalized_vertices]
                height_ratio = max(y_coords) - min(y_coords)
                height_mm = height_ratio * page_pts * pt_to_mm

                all_paragraphs.append({
                    "page_number": page.page_number,
                    "text": p_text.strip(),
                    "bounding_box": vertices,
                    "style": style_info,
                    "height": round(height_mm, 2)
                })
        return all_paragraphs

    def extract_text_with_markdown_table(self, document: documentai.Document) -> str:
        """Process bounding-box data into a markdown table (see _create_markdown_table)."""
        data = self.extract_text_with_bounding_boxes(document)
        return self._create_markdown_table(data)

    def _quantize_coord(self, val, grid_size=1000) -> int:
        """Converts a float (0-1) to an integer on a grid."""
        return int(val * grid_size)

    def _create_markdown_table(self, data) -> str:
        """Render paragraph records as a markdown table.

        BUGFIX: the original used doubled backslashes ('\\n', '\\\\|'), so
        rows were joined by the literal characters backslash-n instead of
        real newlines and pipes were mis-escaped.
        """
        table = "| Text ID | X | Y | Text Height (mm) | Style | Text |\n"
        table += "|----|-----|-----|--------|-------|-------------------------------------------------------------------------|\n"
        for i, item in enumerate(data):
            top_left = item['bounding_box'][0]
            x = self._quantize_coord(top_left['x'])
            y = self._quantize_coord(top_left['y'])
            height = round(item.get('height', 0), 2)
            style = item.get('style', 'N/A')
            # Newlines would break a table row; '|' must be escaped as '\|'.
            text = item['text'].replace('\n', ' ').replace('|', '\\|').strip()
            table += f"| {i+1} | {x} | {y} | {height} | {style} | {text} |\n"
        return table

    def get_bounding_boxes(self, document: documentai.Document, page_number: int = 0) -> list[documentai.BoundingPoly]:
        """
        Extracts bounding boxes for tokens on a specific page.
        """
        page = document.pages[page_number]
        return [token.layout.bounding_poly for token in page.tokens]

    def extract_text_heights_mm(self, document: documentai.Document) -> List[tuple]:
        """
        Extracts the height of each line of text from a Google Document AI parsed document
        and returns a list of heights in millimeters (mm).

        Parameters:
            document (google.cloud.documentai.Document): Parsed Document AI response object

        Returns:
            List of tuples: [(page_num, line_text, height_mm), ...]
        """
        heights = []
        pt_to_mm = 0.3528

        for page_num, page in enumerate(document.pages, start=1):
            page_height_pt = page.dimension.height  # e.g., 792 for US Letter

            for line in page.lines:
                layout = line.layout
                vertices = layout.bounding_poly.normalized_vertices

                y_coords = [v.y for v in vertices]
                if not y_coords:
                    continue

                height_ratio = max(y_coords) - min(y_coords)
                height_mm = height_ratio * page_height_pt * pt_to_mm

                # Guard against anchors with no segments (the original
                # indexed [0] unconditionally and could raise IndexError).
                if not layout.text_anchor.text_segments:
                    continue
                text_segment = layout.text_anchor.text_segments[0]
                start = int(text_segment.start_index)
                end = int(text_segment.end_index)
                line_text = document.text[start:end].strip()

                heights.append((page_num, line_text, round(height_mm, 2)))

        return heights
222
+
223
+
224
+
src/extract_text/ingest.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ from io import BytesIO
3
+
4
class RequirementsIngest:
    """Loads a requirements document (TXT or PDF) from a file-like object."""

    def __init__(self):
        pass

    def _extract_pdf_text(self, pdf_content: bytes, filename: str) -> str:
        """Best-effort text extraction from PDF bytes.

        Uses PyPDF2 when available; otherwise (or on failure) returns a
        descriptive placeholder. The placeholder now includes the real
        filename — the original hard-coded the literal string "(unknown)".
        """
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            # PyPDF2 not available, use basic description.
            return f"PDF Requirements Document: {filename} (PyPDF2 not available for text extraction)"

        try:
            reader = PdfReader(BytesIO(pdf_content))
            text_content = ""
            for page in reader.pages:
                text_content += page.extract_text() + "\n"

            if not text_content.strip():
                return f"PDF Requirements Document: {filename} (no text content found)"
            # Limit text content for display.
            if len(text_content) > 1000:
                return text_content[:1000] + "..."
            return text_content
        except Exception as e:
            return f"PDF Requirements Document: {filename} (text extraction failed: {str(e)})"

    def ingest_requirements_document(self, file_obj) -> dict:
        """
        Ingest a requirements document from a file-like object.
        Supports both TXT and PDF files.

        Returns:
            dict: {
                'type': 'text' or 'pdf',
                'content': str (for text) or base64 string (for PDF),
                'filename': str,
                'text_content': str (extracted text for PDFs, same as content for TXT),
                'file_size': int (bytes)
            }

        Raises:
            ValueError: if the file object cannot be read.
        """
        try:
            filename = getattr(file_obj, 'name', 'unknown')
            file_extension = filename.lower().split('.')[-1] if '.' in filename else ''

            if file_extension == 'pdf':
                # Read the PDF once; reuse the bytes for both the base64
                # payload and text extraction (original read the file twice).
                file_obj.seek(0)
                pdf_content = file_obj.read()

                # Base64 so the PDF can be passed to Claude directly.
                pdf_base64 = base64.b64encode(pdf_content).decode('utf-8')

                # Extracted text is kept for backward compatibility; the
                # primary content is the PDF itself.
                text_content = self._extract_pdf_text(pdf_content, filename)

                return {
                    'type': 'pdf',
                    'content': pdf_base64,
                    'filename': filename,
                    'text_content': text_content,
                    'file_size': len(pdf_content)
                }

            # Handle text file (default behavior).
            file_obj.seek(0)
            text = file_obj.read()
            if isinstance(text, bytes):
                text = text.decode("utf-8", errors="replace")

            return {
                'type': 'text',
                'content': text,
                'filename': filename,
                'text_content': text,
                'file_size': len(text.encode('utf-8'))
            }

        except Exception as e:
            raise ValueError(f"Error reading requirements document: {e}")
91
+
92
+
src/extract_text/photon-services-f0d3ec1417d0.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "service_account",
3
+ "project_id": "photon-services",
4
+ "private_key_id": "f0d3ec1417d0afe1a21079a88350de615829fb38",
5
+ "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDGUlwi7owC2jS0\n9miy5mDi9Q84/8arKMkG8n2Zok7lfFz9cFf76G/ai1eIAvQ9u6OV2ddt05lZMX8S\n+q5PSFlmeOCXSHcnufoTsWY5FKTXWzWd4dZ6lMsCOq7kWB+tHEhlftxMR1egI7sn\nA3z32cbydPewInvw6QMMLaFdtACS8p09QnRZSdYnGX5FNJr9Hq+NBa5qRHqA0y8g\n6x5lo/Ybku3bKCNAu4NWOErsKZ4Z0yEZzggad7nojx1oA9wmIVaTbrJ6OY2kPOMN\n0mBQJBOdRaw5fIHiDYH18tnR0UzVVEnv2s1LADcSpe144nDbIlLdD3DsZ0H9j91J\n7b+EnbaJAgMBAAECggEAHXRe/csrHUNWP6g3LZbcveiCnccTNRmGHdOHBvnduOSr\nFPMKBj5j2nQGiItTxhVnutpTThr2tBIPWvzDRcArkvYR+TYIiGxtMV6QHZsszlVc\nFbpUdflCW27mycAy2C2SrQxV4LhZ0c1svuMcPN1p2Fm57b15ZfLdgoIGbNnOmgRO\nmOjJxXnjbPq4pFnZYVB2GxV7t3O8kzTG8msWFeIuOfrs6UJpXAS91BQXfLmnaxv5\nP56EaNGyamQgHVnOrtoLoTTUFrfNUFCl2Ggrs80FfS0ZJaIWqrItDLI9ah9MgfeL\nTwrcgjWFodX0BRu7Er2RX5Bo/vhhIVVZeOIHxzKWFwKBgQDk0+QCqChmMAOvchlX\nWb6XADW8qyYYbEPSVO+/IJi0teqIDGW/d1F0QrDdZc8dYlmaUqCt5z1NT8PdXSXd\nTifDRXLbHaKlFS3DQF+ComgC+ey9cUjZ0nMiCqzYKUftkmM2xWWJcLfEXPuWSZiy\n//Yqctd1ilQjk5pMyJFaT5k0MwKBgQDd3x8DwqEyWHk/nT4RQSVGp4S9+ZLegu+K\nefLPpCQevc0klvQVDospob181jZqBnWPDBd7fPyBc3+HmD/zzmU2YHlyWg3n9scb\nq/5WOssxjGkjhb8OftwsUesYLPFm6HcVfb+kiHJm+FKk2Yb935L90S3oOd0ljIuk\ng6LJF40OUwKBgE53XmOO2DOaWVkrLgdnDdTnzIWCxtBvJ56TY5bNja/CBcdbQPSz\n7KmKSO3SgIAZ/pHNra2Ucs/0/zwEOfy2VSo/wU/jzKcBKS0gAOBh4nrKyuR3WTzg\nTnyo3nZNSY3subrJW7USguGB5P+3Ava2kOojcUCsC4gbkDiuOjGWw/lDAoGBAIiG\nTihbMCOxq1JIqLOnWY+jbxwTIZvICCw2pAG/J/a+pif4t1Lpsxo4C0hw6+TL+rS+\nJQj4vMvPTU8bkWatvzv5m2GRJnNxN83ARO28meHwW5XfK9R4nXSsJ7SlmxnOu9A+\no5lT2MmhzgDgVZ+MXn/Ooqf+SyVa2WavFZEV69c/AoGACpBkRiXMscE1FISCy+lr\nDTIvGtqsMMadN7N+2ceQB+Yr/slE7FaCHblPWo2VnPosazis2340XW5LUhRYcATn\nuhwwFLGvC2IXSAq4uAyHSSiHVtwDjKWcJakkMnKlFuK1a5AI/2vMLkb3wKqyxKxC\nvQ0KZDSe4YO4nJk983CUL4g=\n-----END PRIVATE KEY-----\n",
6
+ "client_email": "jake-document-ai-test@photon-services.iam.gserviceaccount.com",
7
+ "client_id": "105944418590442697805",
8
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
9
+ "token_uri": "https://oauth2.googleapis.com/token",
10
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
11
+ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/jake-document-ai-test%40photon-services.iam.gserviceaccount.com",
12
+ "universe_domain": "googleapis.com"
13
+ }
src/utils/__pycache__/barcode.cpython-313.pyc ADDED
Binary file (4.48 kB). View file
 
src/utils/__pycache__/image_utils.cpython-313.pyc ADDED
Binary file (9.62 kB). View file
 
src/utils/barcode.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ from PIL import Image
4
+ import zxingcpp
5
+ import barcodenumber
6
+
7
class Barcode:
    """Barcode scanning (zxing-cpp) plus check-digit validation (barcodenumber)."""

    def __init__(self):
        # Maps zxing symbology names to barcodenumber code names.
        self._SYM_ALIAS = {
            'EAN13': 'ean13',
            'EAN8': 'ean8',
            'UPCA': 'upc',
            'UPC-A': 'upc',
        }

    def validate_barcode(self, data: str, sym: str) -> bool:
        """Validate the payload `data` for symbology `sym`.

        Unknown symbologies with an all-digit payload are probed against all
        known numeric formats. Returns False for empty data, unrecognised
        formats, or a failed check-digit test.
        """
        # Empty strings are always invalid.
        if not data:
            return False

        # For unknown symbology, try all known numeric formats first.
        if sym.upper() not in self._SYM_ALIAS:
            if data.isdigit():
                for known_format in ('ean13', 'ean8', 'upc'):
                    try:
                        if barcodenumber.check_code(known_format, data):
                            return True
                    except (ValueError, KeyError):
                        continue
            # No known format matched (or non-numeric payload).
            return False

        # BUGFIX: look the alias up under the normalised (upper-case) name.
        # The original passed `sym` verbatim, so e.g. 'upc-a' missed the
        # alias table, fell through as 'upc-a', and always failed.
        code = self._SYM_ALIAS.get(sym.upper(), sym.lower())
        try:
            return barcodenumber.check_code(code, data)
        except (ValueError, KeyError):
            return False

    def scan_and_validate(self, image, show_image: bool = False):
        """Scan an image for barcodes and validate each one found.

        Args:
            image: numpy BGR array or PIL image.
            show_image: Unused; kept for interface compatibility.

        Returns:
            list[dict]: one record per detected barcode with id, type, data,
            validity flag, and pixel-space position (corners + bbox).
        """
        # 1) normalize to OpenCV BGR numpy array.
        if isinstance(image, np.ndarray):
            cv_img = image.copy()
        else:
            # assume PIL
            cv_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

        # 2) zxing consumes PIL, so convert back from the BGR array.
        pil_for_scan = Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB))
        barcodes = zxingcpp.read_barcodes(pil_for_scan)

        results = []
        for i, barcode in enumerate(barcodes):
            pos = barcode.position
            if pos:
                pts = [pos.top_left, pos.top_right, pos.bottom_right, pos.bottom_left]
                xs = [p.x for p in pts]
                ys = [p.y for p in pts]
                x, y = int(min(xs)), int(min(ys))
                w, h = int(max(xs) - x), int(max(ys) - y)
            else:
                # No position reported: fall back to a nominal box at origin.
                x, y, w, h = 0, 0, 100, 50

            raw = barcode.text
            sym = str(barcode.format)
            ok = self.validate_barcode(raw, sym)

            results.append({
                'id': f'BARCODE_{i+1:03d}',
                'type': sym,
                'data': raw,
                'valid': ok,
                'position': {
                    'x': x,
                    'y': y,
                    'width': w,
                    'height': h,
                    'top_left': {'x': x, 'y': y},
                    'top_right': {'x': x + w, 'y': y},
                    'bottom_right': {'x': x + w, 'y': y + h},
                    'bottom_left': {'x': x, 'y': y + h}
                }
            })

        return results

    def draw_box(self, img, x, y, w, h, sym, raw, ok):
        """Draw a labelled rectangle on `img`: green if valid, red if not."""
        color = (0, 255, 0) if ok else (0, 0, 255)
        cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
        cv2.putText(img, f"{sym}:{raw}", (x, y - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
        return img
src/utils/image_utils.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ from io import BytesIO
3
+ from PIL import Image, ImageChops
4
+ from PIL import ImageDraw
5
+ import math
6
+
7
class ImageUtils:
    """Static helpers for cropping, resizing and base64-encoding images.

    All methods are stateless ``@staticmethod``s; the class exists only as a
    namespace. Every public method is best-effort: failures are printed and a
    safe fallback value is returned rather than raising.
    """

    def __init__(self):
        # Stateless class; kept so existing ``ImageUtils()`` call sites work.
        pass

    @staticmethod
    def _trim(im: Image.Image) -> Image.Image:
        """Crop *im* to its content by removing a uniform border.

        The border colour is sampled from the top-left pixel. The difference
        image is amplified (scale 2.0, offset -100) so that near-uniform
        compression noise does not defeat ``getbbox``. Returns the original
        image unchanged when no content box is found. Shared by
        :meth:`crop_base64` and :meth:`crop_image` (previously duplicated).
        """
        bg = Image.new(im.mode, im.size, im.getpixel((0, 0)))
        diff = ImageChops.difference(im, bg)
        diff = ImageChops.add(diff, diff, 2.0, -100)
        bbox = diff.getbbox()
        return im.crop(bbox) if bbox else im

    @staticmethod
    def crop_base64(base64_string, output_format='PNG') -> str:
        """
        Takes a base64 encoded image, crops it by removing uniform background,
        and returns the cropped image as base64.

        Args:
            base64_string (str or bytes): Base64 encoded image string, or raw
                image bytes (bytes input is treated as already-decoded data).
            output_format (str): Output image format ('PNG', 'JPEG', etc.)

        Returns:
            str: Base64 encoded cropped image, or empty string if anything
            fails (decode, open, crop or re-encode).
        """
        try:
            # Accept both base64 text and raw image bytes.
            if isinstance(base64_string, bytes):
                image_data = base64_string
            else:
                image_data = base64.b64decode(base64_string)

            im = Image.open(BytesIO(image_data))
            cropped_im = ImageUtils._trim(im)

            # Re-encode the (possibly cropped) image back to base64.
            buffer = BytesIO()
            cropped_im.save(buffer, format=output_format)
            return base64.b64encode(buffer.getvalue()).decode('utf-8')

        except Exception as e:
            # Best-effort contract: signal failure with an empty string.
            print(f"Error processing image: {e}")
            return ""

    @staticmethod
    def crop_image(im: Image.Image) -> Image.Image:
        """Trim a uniform border from *im*.

        Returns the cropped image, or the original image unchanged when no
        crop box is found or cropping fails.
        """
        try:
            return ImageUtils._trim(im)
        except Exception as e:
            print(f"Error cropping image: {e}")
            return im

    @staticmethod
    def draw_bounding_boxes(pil_image: Image.Image, boxes: list[tuple[int, int, int, int]], color: str = "red", width: int = 2) -> Image.Image:
        """
        Draw bounding boxes on a PIL image (the image is modified in place).

        Args:
            pil_image: A PIL.Image instance.
            boxes: A list of boxes, each specified as (x1, y1, x2, y2).
            color: The color for the bounding box outline.
            width: The width of the bounding box line.

        Returns:
            The same PIL.Image with drawn bounding boxes (also returned when
            drawing fails, possibly partially annotated).
        """
        try:
            draw = ImageDraw.Draw(pil_image)
            for box in boxes:
                draw.rectangle(box, outline=color, width=width)
            return pil_image
        except Exception as e:
            print(f"Error drawing bounding boxes: {e}")
            return pil_image

    @staticmethod
    def standardize_image_size(image: Image.Image, target_size: tuple = (1200, 1600), maintain_aspect_ratio: bool = True) -> Image.Image:
        """
        Resize image to target size while optionally maintaining aspect ratio.

        Args:
            image: PIL Image to resize
            target_size: Target (width, height) in pixels
            maintain_aspect_ratio: If True, fit within target size while
                maintaining aspect ratio, centred on a white RGB canvas.

        Returns:
            Resized PIL Image (always exactly ``target_size``).
        """
        if not maintain_aspect_ratio:
            # Direct (possibly distorting) resize to the target dimensions.
            return image.resize(target_size, Image.Resampling.LANCZOS)

        img_ratio = image.width / image.height
        target_ratio = target_size[0] / target_size[1]

        if img_ratio > target_ratio:
            # Image is wider than target: fit to width.
            new_width = target_size[0]
            new_height = int(target_size[0] / img_ratio)
        else:
            # Image is taller than (or same shape as) target: fit to height.
            new_height = target_size[1]
            new_width = int(target_size[1] * img_ratio)

        resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

        # Letterbox onto a white canvas so the output is always target_size.
        final_image = Image.new('RGB', target_size, 'white')
        x_offset = (target_size[0] - new_width) // 2
        y_offset = (target_size[1] - new_height) // 2
        final_image.paste(resized_image, (x_offset, y_offset))

        return final_image

    @staticmethod
    def optimize_image_quality(image: Image.Image, max_size_bytes: int = 1024 * 1024, initial_quality: int = 95) -> tuple[Image.Image, int]:
        """
        Optimize image file size to fit within the specified limit.

        PNG is lossless, so instead of a JPEG-style quality knob this walks
        increasing PNG compression levels (0 = none … 9 = maximum) and stops
        at the first one whose encoded size fits.

        Args:
            image: PIL Image to optimize
            max_size_bytes: Maximum file size in bytes (default 1MB)
            initial_quality: Unused for PNG; kept for interface compatibility.

        Returns:
            Tuple of (optimized_image, final_quality). The quality value is
            synthetic: 95 when the size limit was met, 50 when even maximum
            compression exceeded it.
        """
        compression_levels = [0, 1, 3, 5, 7, 9]

        buffer = BytesIO()
        for compression in compression_levels:
            buffer = BytesIO()
            image.save(buffer, format='PNG', optimize=True, compress_level=compression)
            if buffer.tell() <= max_size_bytes:
                buffer.seek(0)
                return Image.open(buffer), 95

        # Even level 9 (the last attempt) exceeded the limit; reuse that
        # buffer instead of re-encoding a second time at the same level.
        buffer.seek(0)
        return Image.open(buffer), 50

    @staticmethod
    def process_image_for_comparison(image: Image.Image, target_size: tuple = (1200, 1600), max_size_bytes: int = 1024 * 1024) -> tuple[Image.Image, int, int]:
        """
        Process image for comparison: standardize size and optimize quality.

        Args:
            image: PIL Image to process
            target_size: Target size in pixels (width, height)
            max_size_bytes: Maximum file size in bytes (default 1MB)

        Returns:
            Tuple of (processed_image, final_quality, file_size_bytes).
            NOTE(review): file_size is measured by re-saving with the default
            compress level, so it may differ slightly from the size achieved
            during optimization — confirm whether callers rely on it.
        """
        # First, fit the image onto the standard canvas.
        sized_image = ImageUtils.standardize_image_size(image, target_size, maintain_aspect_ratio=True)

        # Then squeeze the encoded size under the limit.
        optimized_image, quality = ImageUtils.optimize_image_quality(sized_image, max_size_bytes)

        # Measure the final encoded size (PNG for consistency).
        buffer = BytesIO()
        optimized_image.save(buffer, format='PNG', optimize=True)
        file_size = buffer.tell()

        return optimized_image, quality, file_size

    @staticmethod
    def image_to_base64_optimized(image: Image.Image, target_size: tuple = (1200, 1600), max_size_bytes: int = 1024 * 1024) -> str:
        """
        Convert image to base64 with size and quality optimization.

        Args:
            image: PIL Image to convert
            target_size: Target size in pixels (width, height)
            max_size_bytes: Maximum file size in bytes (default 1MB)

        Returns:
            Base64 encoded string of the optimized image (PNG format).
        """
        processed_image, _quality, _file_size = ImageUtils.process_image_for_comparison(
            image, target_size, max_size_bytes
        )

        # Encode the processed image as PNG and base64-wrap it.
        buffer = BytesIO()
        processed_image.save(buffer, format='PNG', optimize=True)
        return base64.b64encode(buffer.getvalue()).decode('utf-8')