Spaces:

A-R-F
/

Agentic-Reliability-Framework-API

Running

File size: 7,314 Bytes

0b3ce9a

"""
Claude Opus 4.5 Adapter for ARF
Drop-in replacement for Hugging Face inference
"""

import os
import logging
from typing import Optional
from dataclasses import dataclass

logger = logging.getLogger(__name__)

# Try to import anthropic, with graceful fallback
try:
    import anthropic
    ANTHROPIC_AVAILABLE = True
except ImportError:
    ANTHROPIC_AVAILABLE = False
    logger.warning("anthropic package not installed - using mock mode only")


@dataclass
class ClaudeConfig:
    """Claude API configuration"""
    api_key: str
    model: str = "claude-opus-4"
    max_tokens: int = 512
    temperature: float = 0.3


class ClaudeAdapter:
    """
    Drop-in replacement for HF inference in ARF agents
    
    Features:
    - Automatic fallback to mock mode if no API key
    - Intelligent pre-written responses for demo
    - Same interface as HF inference
    - Built-in error handling
    """
    
    def __init__(self, config: Optional[ClaudeConfig] = None):
        self.config = config or ClaudeConfig(
            api_key=os.environ.get("ANTHROPIC_API_KEY", "")
        )
        
        if not ANTHROPIC_AVAILABLE:
            logger.warning("Anthropic package not available - mock mode only")
            self.mock_mode = True
        elif not self.config.api_key:
            logger.warning("No ANTHROPIC_API_KEY found - using mock mode")
            self.mock_mode = True
        else:
            try:
                self.client = anthropic.Anthropic(api_key=self.config.api_key)
                self.mock_mode = False
                logger.info(f"✅ Claude adapter initialized with model: {self.config.model}")
            except Exception as e:
                logger.error(f"Failed to initialize Claude client: {e}")
                self.mock_mode = True
    
    def generate_completion(
        self,
        prompt: str,
        system_prompt: Optional[str] = None
    ) -> str:
        """
        Generate completion using Claude or fallback to mock
        
        Args:
            prompt: User prompt
            system_prompt: Optional system context
            
        Returns:
            Generated text response
        """
        if self.mock_mode:
            logger.debug("Using mock mode (no API key or package not available)")
            return self._mock_response(prompt)
        
        try:
            messages = [{"role": "user", "content": prompt}]
            
            kwargs = {
                "model": self.config.model,
                "max_tokens": self.config.max_tokens,
                "temperature": self.config.temperature,
                "messages": messages
            }
            
            if system_prompt:
                kwargs["system"] = system_prompt
            
            response = self.client.messages.create(**kwargs)
            
            # Extract text from response
            if response.content and len(response.content) > 0:
                return response.content[0].text
            
            logger.warning("Empty response from Claude - using mock")
            return self._mock_response(prompt)
            
        except Exception as e:
            logger.error(f"Claude API error: {e} - falling back to mock")
            return self._mock_response(prompt)
    
    def _mock_response(self, prompt: str) -> str:
        """
        Intelligent fallback mock response for demo
        Pre-crafted to show system capabilities
        """
        prompt_lower = prompt.lower()
        
        # Detective Agent Response
        if "detective" in prompt_lower or "anomaly" in prompt_lower:
            return """🔍 ANOMALY DETECTED: Payment gateway timeout pattern identified.

PATTERN ANALYSIS:
• Current error rate: 87% (baseline: <5%)
• Latency spike: 8500ms P99 (baseline: ~100ms)  
• Pattern match: 94% similarity to incident 2024-11-15 (database connection pool exhaustion)

CONFIDENCE: HIGH (0.87)

CLASSIFICATION: Infrastructure failure - upstream dependency timeout

AFFECTED METRICS:
Primary: Error rate (+1740% vs baseline)
Secondary: Latency (+8400% vs baseline)
Tertiary: Throughput degradation

RECOMMENDATION: Immediate investigation of upstream payment provider status + connection pool health check required."""
        
        # Diagnostician Agent Response
        elif "diagnostician" in prompt_lower or "root cause" in prompt_lower:
            return """🔬 ROOT CAUSE ANALYSIS:

PRIMARY CAUSE:
Upstream payment provider latency spike (avg response: 8.5s, normal: <500ms)

SECONDARY FACTORS:
• Connection pool exhaustion (95% utilized)
• Retry storm amplifying load (exponential backoff not engaged)
• Circuit breaker threshold not reached (87% < 90% threshold)

EVIDENCE CHAIN:
1. Error rate spike correlates with provider status page incident (timestamp alignment)
2. Connection pool saturation occurred 45 seconds before error spike
3. Upstream API latency increased 17x baseline
4. Historical pattern match: 94% similarity to Nov 15 incident

RECOMMENDED ACTION: REROUTE
• Target: gateway-2 (backup payment processor)
• Expected recovery: 45±5 seconds
• Success probability: 92% (based on historical data)

RATIONALE: Rerouting bypasses degraded provider, allows time for upstream recovery."""
        
        # Predictive Agent Response
        elif "predictive" in prompt_lower or "forecast" in prompt_lower:
            return """📈 PREDICTIVE FORECAST ANALYSIS:

CURRENT TRAJECTORY:
• Error rate: Increasing at 12%/minute (exponential trend)
• Latency: Accelerating degradation (quadratic curve)
• Resource utilization: CPU 75%, Memory 82% (stable)

TIME-TO-FAILURE ESTIMATES:
• Critical threshold (>95% error rate): ~8 minutes
• Complete service failure: ~12 minutes  
• Current impact: 1,240 active users affected

RISK ASSESSMENT:
Risk Score: 0.85 (HIGH)
Confidence: 0.79
Trend: DETERIORATING

BUSINESS IMPACT FORECAST:
• Current revenue loss: \$12,000/minute
• Projected 15-min loss (no action): \$180,000
• Customer churn risk: MEDIUM (historical correlation: 0.67)
• SLA violation: IMMINENT (99.9% target, current: 13% availability)

RECOMMENDATIONS:
Primary: Execute REROUTE action immediately (Diagnostician recommendation)
Secondary: Scale connection pool +50% capacity
Tertiary: Enable aggressive circuit breaking (lower threshold to 75%)

PREVENTIVE MEASURES:
Monitor upstream provider health proactively, implement predictive circuit breaking."""
        
        # Generic/Synthesis Response
        else:
            return """✅ MULTI-AGENT ANALYSIS COMPLETE

SYSTEM STATUS: Incident detected and analyzed
CONFIDENCE: HIGH (0.85)

SYNTHESIS:
All agents have completed analysis. The system has identified a critical upstream dependency failure requiring immediate intervention. Recovery action has been selected based on historical success patterns and current system state.

Recommended action: REROUTE to backup systems
Expected outcome: Service restoration within 45 seconds

Continuing autonomous monitoring..."""


# Singleton instance
_claude_adapter: Optional[ClaudeAdapter] = None


def get_claude_adapter() -> ClaudeAdapter:
    """Get or create Claude adapter singleton"""
    global _claude_adapter
    if _claude_adapter is None:
        _claude_adapter = ClaudeAdapter()
    return _claude_adapter