class LLM:
    """Minimal client for the Anthropic Messages API.

    The API key is read from the ``CLAUDE_API_KEY`` environment variable and
    every request goes through a single ``requests.Session`` that has a
    urllib3 ``Retry`` policy mounted for ``https://`` URLs.  The ``call_*``
    methods report problems through Streamlit (``st.warning`` / ``st.error``)
    and return an empty string on failure.
    """

    # Application-level attempts made per request by ``_post_messages``; these
    # are in addition to the transport-level retries configured in ``__init__``.
    _MAX_ATTEMPTS = 3

    def __init__(self):
        """Read the API key and build the retrying HTTP session.

        Raises:
            ValueError: If ``CLAUDE_API_KEY`` is unset or empty.
        """
        self.claude_api_key = os.getenv('CLAUDE_API_KEY')
        if not self.claude_api_key:
            raise ValueError("Please set the CLAUDE_API_KEY environment variable.")

        # NOTE(review): 529 is listed here *and* handled explicitly in
        # ``_post_messages``.  With urllib3's default ``raise_on_status=True``
        # an exhausted status retry presumably surfaces as
        # ``requests.exceptions.RetryError`` rather than as a 529 response, so
        # the explicit branch acts only as a fallback -- confirm which layer
        # is meant to own overload handling.
        retry_strategy = Retry(
            total=5,
            backoff_factor=2,
            status_forcelist=[429, 500, 502, 503, 504, 529],
            allowed_methods=["POST"],
            respect_retry_after_header=True,
        )

        self.session = requests.Session()
        self.session.mount("https://", HTTPAdapter(max_retries=retry_strategy))

    @staticmethod
    def _first_block_text(response_data):
        """Return the ``text`` of the first content block, or ``None``.

        ``None`` means the decoded payload does not have the expected
        ``{"content": [{"text": "..."}, ...]}`` shape, so callers can report
        a format error instead of hitting a ``KeyError``/``TypeError``.
        """
        if not isinstance(response_data, dict):
            return None
        blocks = response_data.get("content")
        if not isinstance(blocks, list) or not blocks:
            return None
        first_block = blocks[0]
        if not isinstance(first_block, dict):
            return None
        text = first_block.get("text")
        return text if isinstance(text, str) else None

    def _post_messages(self, payload, timeout, api_label) -> str:
        """POST ``payload`` to ``CLAUDE_API_URL`` and return the reply text.

        Args:
            payload: JSON-serialisable request body.
            timeout: Timeout in seconds passed to ``requests``.
            api_label: Name used in user-facing messages, e.g. ``"Claude API"``.

        Returns:
            The text of the first content block, or ``""`` after any failure
            (each failure is reported through Streamlit first).
        """
        import time  # function-scope import, as in the original code

        headers = {
            "x-api-key": self.claude_api_key,
            "anthropic-version": "2023-06-01",
            "Content-Type": "application/json",
        }

        for attempt in range(1, self._MAX_ATTEMPTS + 1):
            retries_left = attempt < self._MAX_ATTEMPTS

            try:
                response = self.session.post(
                    CLAUDE_API_URL,
                    headers=headers,
                    json=payload,
                    verify=True,  # explicit: never disable certificate checks
                    timeout=timeout,
                )
            except requests.exceptions.SSLError as ssl_err:
                st.error(
                    f"SSL Error when calling {api_label}. Please check your SSL "
                    f"certificates and network connection. Error: {ssl_err}"
                )
                return ""
            except requests.exceptions.Timeout:
                if retries_left:
                    st.warning(f"Timeout on attempt {attempt}/{self._MAX_ATTEMPTS}. Retrying...")
                    continue
                st.error("Request timed out after all retries")
                return ""
            except requests.exceptions.RequestException as e:
                st.error(f"Error calling {api_label}: {str(e)}")
                return ""

            if response.status_code == 529:
                if retries_left:
                    st.warning(
                        f"Server overload (529) on attempt {attempt}/{self._MAX_ATTEMPTS}. "
                        "Retrying with exponential backoff..."
                    )
                    time.sleep(2 ** (attempt - 1))  # 1s, then 2s
                    continue
                st.error("Server overload after all retries. Please try again later.")
                return ""

            try:
                response.raise_for_status()
            except requests.exceptions.HTTPError as e:
                st.error(f"Error calling {api_label}: {str(e)}")
                return ""

            # Decode in its own narrow ``try``: the decode error raised by
            # ``response.json()`` is a ``ValueError`` and, in recent requests
            # releases, also a ``RequestException``; a broad RequestException
            # handler placed first would hide the more specific message.
            try:
                response_data = response.json()
            except ValueError as json_err:
                st.error(f"Invalid JSON response from {api_label}: {json_err}")
                return ""

            text = self._first_block_text(response_data)
            if text is None:
                st.error(f"Unexpected response format from {api_label}")
                return ""
            return text

        return ""

    def call_claude_api(self, prompt, system_prompt, model="claude-sonnet-4-20250514", max_tokens=2000) -> str:
        """Send a text-only prompt to Claude.

        Args:
            prompt: User message content.
            system_prompt: System prompt for the request.
            model: Model identifier.
            max_tokens: Upper bound on generated tokens.

        Returns:
            The reply text, or ``""`` on failure.
        """
        payload = {
            "model": model,
            "max_tokens": max_tokens,
            "temperature": 0.1,
            "messages": [{"role": "user", "content": prompt}],
            "system": system_prompt,
        }
        return self._post_messages(payload, timeout=60, api_label="Claude API")

    def call_claude_vision_api(self, prompt, system_prompt, image_base64, model="claude-sonnet-4-20250514",
                               max_tokens=2000, media_type="image/png") -> str:
        """Send a prompt together with one base64-encoded image to Claude.

        Args:
            prompt: User message text.
            system_prompt: System prompt for the request.
            image_base64: Base64-encoded image data.
            model: Model identifier.
            max_tokens: Upper bound on generated tokens.
            media_type: MIME type declared for the image; defaults to the
                previously hard-coded ``"image/png"``.

        Returns:
            The reply text, or ``""`` on failure.
        """
        content = [
            {"type": "text", "text": prompt},
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": media_type,
                    "data": image_base64,
                },
            },
        ]
        payload = {
            "model": model,
            "max_tokens": max_tokens,
            "temperature": 0,
            "messages": [{"role": "user", "content": content}],
            "system": system_prompt,
        }
        # Vision requests get a longer timeout than text-only ones.
        return self._post_messages(payload, timeout=90, api_label="Claude Vision API")

    @staticmethod
    def _extract_pdf_text(pdf_base64) -> str:
        """Best-effort text extraction from a base64-encoded PDF.

        Returns the page texts joined by newlines, or ``""`` when PyPDF2 is
        not installed, decoding/parsing fails, or no text is found.
        """
        import base64
        import io

        try:
            from PyPDF2 import PdfReader
        except ImportError:
            st.warning("PyPDF2 not available. Using basic text processing for PDF.")
            return ""

        try:
            reader = PdfReader(io.BytesIO(base64.b64decode(pdf_base64)))
            # ``extract_text()`` may yield nothing for a page; treat as empty.
            page_texts = [(page.extract_text() or "") for page in reader.pages]
        except Exception as e:  # deliberate best-effort: any parser failure falls back
            st.warning(f"PDF text extraction failed: {e}")
            st.warning("Falling back to basic text processing")
            return ""

        return "\n".join(page_texts).strip()

    def call_claude_pdf_api(self, prompt, system_prompt, pdf_base64, model="claude-sonnet-4-20250514", max_tokens=4000) -> str:
        """Call Claude for a PDF requirements document using extracted text.

        The PDF is not sent natively; its text is extracted locally and
        appended to ``prompt`` (previously the extracted text was computed
        and then discarded, so the PDF never influenced the request).  If
        nothing can be extracted, the prompt is sent unchanged.

        Args:
            prompt: User message text.
            system_prompt: System prompt for the request.
            pdf_base64: Base64-encoded PDF bytes.
            model: Model identifier.
            max_tokens: Upper bound on generated tokens.

        Returns:
            The reply text, or ``""`` on failure.
        """
        st.info("📄 PDF requirements detected. Using text-based processing for now.")
        st.info("💡 For full visual PDF analysis, consider using the Converse API with citations enabled.")

        pdf_text = self._extract_pdf_text(pdf_base64)
        # Skip the append when the caller already embedded the same text.
        if pdf_text and pdf_text not in prompt:
            prompt = f"{prompt}\n\nText extracted from the PDF document:\n{pdf_text}"

        return self.call_claude_api(prompt, system_prompt, model=model, max_tokens=max_tokens)
Please upload a valid requirements document.") return [] # Check if requirements text is empty or None if not requirements_text or not requirements_text.strip(): st.error("Requirements text is empty. Please upload a valid requirements document.") return [] system_prompt = """You are an expert requirements analyst. Extract clear, structured requirements from documents. You must always return valid JSON, even if no specific requirements are found.""" extraction_prompt = f""" Extract all requirements from this document (not just allergen requirements): {requirements_text} For each requirement found, provide: 1. Unique ID (REQ001, REQ002, etc.) 2. Description (verbatim from the document) 3. Category (Font Size, Allergen List, Formatting, Placement, Barcode, Organic, Promotional, etc.) 4. Source reference (section/paragraph or line number) If no requirements are found, return an empty array: [] Return as JSON array with fields: id, description, category, source_reference. Example: ```json [ {{ "id": "REQ001", "description": "IF the product is labeled as organic, THEN a certified organic seal must be visible", "category": "Organic", "source_reference": "Line 1" }}, {{ "id": "REQ002", "description": "IF there is a promotional offer mentioned, THEN include the offer expiry date", "category": "Promotional", "source_reference": "Line 2" }} ] ``` IMPORTANT: Always return valid JSON. 
If you cannot extract any requirements, return an empty array: [] """ # Use appropriate API based on requirements type if requirements_type == 'pdf' and pdf_base64: # Use PDF API for native PDF processing response = self.llm.call_claude_pdf_api(extraction_prompt, system_prompt, pdf_base64, model='claude-sonnet-4-20250514') else: # Use regular API for text processing response = self.llm.call_claude_api(extraction_prompt, system_prompt, model='claude-3-5-haiku-20241022') # Extract JSON from the response try: # Find JSON content between triple backticks if present if "```json" in response and "```" in response.split("```json")[1]: json_content = response.split("```json")[1].split("```")[0].strip() elif "```" in response: # Try to find any code block json_content = response.split("```")[1].split("```")[0].strip() else: # Assume the entire response is JSON json_content = response # Clean the JSON content to handle control characters # Remove or replace invalid control characters except newlines and tabs json_content = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', json_content) # Replace newlines within strings with escaped newlines json_content = re.sub(r'(?, "evidence_text": "", "barcode_id": ""}} ], "compliance_status": "COMPLIANT/NON-COMPLIANT/PARTIALLY COMPLIANT", "reasoning": "Detailed explanation", "confidence": 0.95 }} ``` """ # Use vision API if image is provided, otherwise use regular API if image: response = self.llm.call_claude_vision_api(verification_prompt, system_prompt, image) else: response = self.llm.call_claude_api(verification_prompt, system_prompt) # Extract JSON from the response with enhanced error handling try: # Check if response is empty or None if not response or not response.strip(): st.error("Empty response received from Claude API") return { "requirement_id": requirement['id'], "evidence_found": [], "compliance_status": "ERROR", "reasoning": "Empty response received from Claude API", "confidence": 0 } # Find JSON content between triple 
backticks if present if "```json" in response and "```" in response.split("```json")[1]: json_content = response.split("```json")[1].split("```")[0].strip() elif "```" in response: # Try to find any code block json_content = response.split("```")[1].split("```")[0].strip() else: # Assume the entire response is JSON json_content = response # Clean the JSON content to handle control characters # Remove or replace invalid control characters except newlines and tabs json_content = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', json_content) # Replace newlines within strings with escaped newlines json_content = re.sub(r'(?