Spaces:

A-R-F
/

Agentic-Reliability-Framework-API

Running

App Files Files Community

Agentic-Reliability-Framework-API / claude_adapter.py

petter2025

Update claude_adapter.py

0b3ce9a verified 4 months ago

raw

history blame

7.31 kB

	"""
	Claude Opus 4.5 Adapter for ARF
	Drop-in replacement for Hugging Face inference
	"""

	import os
	import logging
	from typing import Optional
	from dataclasses import dataclass

	logger = logging.getLogger(__name__)

	# Try to import anthropic, with graceful fallback
	try:
	import anthropic
	ANTHROPIC_AVAILABLE = True
	except ImportError:
	ANTHROPIC_AVAILABLE = False
	logger.warning("anthropic package not installed - using mock mode only")


	@dataclass
	class ClaudeConfig:
	"""Claude API configuration"""
	api_key: str
	model: str = "claude-opus-4"
	max_tokens: int = 512
	temperature: float = 0.3


	class ClaudeAdapter:
	"""
	Drop-in replacement for HF inference in ARF agents

	Features:
	- Automatic fallback to mock mode if no API key
	- Intelligent pre-written responses for demo
	- Same interface as HF inference
	- Built-in error handling
	"""

	def __init__(self, config: Optional[ClaudeConfig] = None):
	self.config = config or ClaudeConfig(
	api_key=os.environ.get("ANTHROPIC_API_KEY", "")
	)

	if not ANTHROPIC_AVAILABLE:
	logger.warning("Anthropic package not available - mock mode only")
	self.mock_mode = True
	elif not self.config.api_key:
	logger.warning("No ANTHROPIC_API_KEY found - using mock mode")
	self.mock_mode = True
	else:
	try:
	self.client = anthropic.Anthropic(api_key=self.config.api_key)
	self.mock_mode = False
	logger.info(f"✅ Claude adapter initialized with model: {self.config.model}")
	except Exception as e:
	logger.error(f"Failed to initialize Claude client: {e}")
	self.mock_mode = True

	def generate_completion(
	self,
	prompt: str,
	system_prompt: Optional[str] = None
	) -> str:
	"""
	Generate completion using Claude or fallback to mock

	Args:
	prompt: User prompt
	system_prompt: Optional system context

	Returns:
	Generated text response
	"""
	if self.mock_mode:
	logger.debug("Using mock mode (no API key or package not available)")
	return self._mock_response(prompt)

	try:
	messages = [{"role": "user", "content": prompt}]

	kwargs = {
	"model": self.config.model,
	"max_tokens": self.config.max_tokens,
	"temperature": self.config.temperature,
	"messages": messages
	}

	if system_prompt:
	kwargs["system"] = system_prompt

	response = self.client.messages.create(**kwargs)

	# Extract text from response
	if response.content and len(response.content) > 0:
	return response.content[0].text

	logger.warning("Empty response from Claude - using mock")
	return self._mock_response(prompt)

	except Exception as e:
	logger.error(f"Claude API error: {e} - falling back to mock")
	return self._mock_response(prompt)

	def _mock_response(self, prompt: str) -> str:
	"""
	Intelligent fallback mock response for demo
	Pre-crafted to show system capabilities
	"""
	prompt_lower = prompt.lower()

	# Detective Agent Response
	if "detective" in prompt_lower or "anomaly" in prompt_lower:
	return """🔍 ANOMALY DETECTED: Payment gateway timeout pattern identified.

	PATTERN ANALYSIS:
	• Current error rate: 87% (baseline: <5%)
	• Latency spike: 8500ms P99 (baseline: ~100ms)
	• Pattern match: 94% similarity to incident 2024-11-15 (database connection pool exhaustion)

	CONFIDENCE: HIGH (0.87)

	CLASSIFICATION: Infrastructure failure - upstream dependency timeout

	AFFECTED METRICS:
	Primary: Error rate (+1740% vs baseline)
	Secondary: Latency (+8400% vs baseline)
	Tertiary: Throughput degradation

	RECOMMENDATION: Immediate investigation of upstream payment provider status + connection pool health check required."""

	# Diagnostician Agent Response
	elif "diagnostician" in prompt_lower or "root cause" in prompt_lower:
	return """🔬 ROOT CAUSE ANALYSIS:

	PRIMARY CAUSE:
	Upstream payment provider latency spike (avg response: 8.5s, normal: <500ms)

	SECONDARY FACTORS:
	• Connection pool exhaustion (95% utilized)
	• Retry storm amplifying load (exponential backoff not engaged)
	• Circuit breaker threshold not reached (87% < 90% threshold)

	EVIDENCE CHAIN:
	1. Error rate spike correlates with provider status page incident (timestamp alignment)
	2. Connection pool saturation occurred 45 seconds before error spike
	3. Upstream API latency increased 17x baseline
	4. Historical pattern match: 94% similarity to Nov 15 incident

	RECOMMENDED ACTION: REROUTE
	• Target: gateway-2 (backup payment processor)
	• Expected recovery: 45±5 seconds
	• Success probability: 92% (based on historical data)

	RATIONALE: Rerouting bypasses degraded provider, allows time for upstream recovery."""

	# Predictive Agent Response
	elif "predictive" in prompt_lower or "forecast" in prompt_lower:
	return """📈 PREDICTIVE FORECAST ANALYSIS:

	CURRENT TRAJECTORY:
	• Error rate: Increasing at 12%/minute (exponential trend)
	• Latency: Accelerating degradation (quadratic curve)
	• Resource utilization: CPU 75%, Memory 82% (stable)

	TIME-TO-FAILURE ESTIMATES:
	• Critical threshold (>95% error rate): ~8 minutes
	• Complete service failure: ~12 minutes
	• Current impact: 1,240 active users affected

	RISK ASSESSMENT:
	Risk Score: 0.85 (HIGH)
	Confidence: 0.79
	Trend: DETERIORATING

	BUSINESS IMPACT FORECAST:
	• Current revenue loss: \$12,000/minute
	• Projected 15-min loss (no action): \$180,000
	• Customer churn risk: MEDIUM (historical correlation: 0.67)
	• SLA violation: IMMINENT (99.9% target, current: 13% availability)

	RECOMMENDATIONS:
	Primary: Execute REROUTE action immediately (Diagnostician recommendation)
	Secondary: Scale connection pool +50% capacity
	Tertiary: Enable aggressive circuit breaking (lower threshold to 75%)

	PREVENTIVE MEASURES:
	Monitor upstream provider health proactively, implement predictive circuit breaking."""

	# Generic/Synthesis Response
	else:
	return """✅ MULTI-AGENT ANALYSIS COMPLETE

	SYSTEM STATUS: Incident detected and analyzed
	CONFIDENCE: HIGH (0.85)

	SYNTHESIS:
	All agents have completed analysis. The system has identified a critical upstream dependency failure requiring immediate intervention. Recovery action has been selected based on historical success patterns and current system state.

	Recommended action: REROUTE to backup systems
	Expected outcome: Service restoration within 45 seconds

	Continuing autonomous monitoring..."""


	# Singleton instance
	_claude_adapter: Optional[ClaudeAdapter] = None


	def get_claude_adapter() -> ClaudeAdapter:
	"""Get or create Claude adapter singleton"""
	global _claude_adapter
	if _claude_adapter is None:
	_claude_adapter = ClaudeAdapter()
	return _claude_adapter