| """ |
| Pythonic data models for ARF Demo - COMPLETE VERSION |
| """ |
|
|
| from dataclasses import dataclass, asdict |
| from enum import Enum |
| from typing import Dict, List, Optional, Any |
| import datetime |
|
|
| |
# Prefer the real ARF OSS implementation; fall back to lightweight local
# stand-ins so the demo keeps working when the framework is not installed.
try:
    from agentic_reliability_framework.arf_core.models.healing_intent import (
        HealingIntent,
        create_scale_out_intent,
        create_rollback_intent,
        create_restart_intent
    )
    from agentic_reliability_framework.arf_core.engine.simple_mcp_client import OSSMCPClient
    ARF_OSS_AVAILABLE = True
except ImportError:
    ARF_OSS_AVAILABLE = False

    class HealingIntent:
        """Minimal stand-in for ARF's HealingIntent (demo fallback only)."""

        def __init__(self, **kwargs):
            self.intent_type = kwargs.get("intent_type", "scale_out")
            self.parameters = kwargs.get("parameters", {})

        def to_dict(self) -> Dict:
            """Serialize the intent, stamping it with the current wall-clock time."""
            return {
                "intent_type": self.intent_type,
                "parameters": self.parameters,
                "created_at": datetime.datetime.now().isoformat()
            }

    def create_scale_out_intent(resource_type: str, scale_factor: float = 2.0) -> HealingIntent:
        """Build a scale-out HealingIntent for the given resource type."""
        return HealingIntent(
            intent_type="scale_out",
            parameters={
                "resource_type": resource_type,
                "scale_factor": scale_factor,
                "action": "Increase capacity"
            }
        )

    def create_rollback_intent(**kwargs) -> HealingIntent:
        """Fallback rollback-intent factory.

        BUG FIX: the try-branch imports create_rollback_intent, but the
        original fallback never defined it, so callers would hit a NameError
        whenever ARF OSS is unavailable.  The signature is permissive
        (**kwargs) because the real ARF signature is not visible in this
        file — TODO confirm against the ARF package.
        """
        return HealingIntent(intent_type="rollback", parameters=dict(kwargs))

    def create_restart_intent(**kwargs) -> HealingIntent:
        """Fallback restart-intent factory (same rationale as create_rollback_intent)."""
        return HealingIntent(intent_type="restart", parameters=dict(kwargs))

    class OSSMCPClient:
        """Canned advisory-mode MCP client used when ARF OSS is absent."""

        def __init__(self):
            self.mode = "advisory"

        def analyze_incident(self, metrics: Dict, pattern: str = "") -> Dict:
            """Return a static analysis payload, echoing back the matched pattern."""
            return {
                "status": "analysis_complete",
                "recommendations": [
                    "Increase resource allocation",
                    "Implement monitoring",
                    "Add circuit breakers",
                    "Optimize configuration"
                ],
                "confidence": 0.92,
                "pattern_matched": pattern,
                "healing_intent": {
                    "type": "scale_out",
                    "requires_execution": True
                }
            }
|
|
class IncidentSeverity(Enum):
    """Enum for incident severity levels.

    Members are listed in ascending severity; values are the uppercase
    member names so ``.value`` can be serialized directly into JSON
    payloads (see IncidentScenario.to_dict).
    """
    LOW = "LOW"
    MEDIUM = "MEDIUM"
    HIGH = "HIGH"
    CRITICAL = "CRITICAL"
|
|
class DemoMode(Enum):
    """Enum for demo modes.

    Values are lowercase strings; presumably these select between a short,
    a full, and an investor-facing demo flow — confirm with the demo runner.
    """
    QUICK = "quick"
    COMPREHENSIVE = "comprehensive"
    INVESTOR = "investor"
|
|
@dataclass
class OSSAnalysis:
    """Structured OSS analysis results - using actual ARF.

    Holds the advisory output of an ARF OSS analysis pass:
    recommendations plus effort estimates for manual remediation.
    """
    status: str                          # human-readable completion banner
    recommendations: List[str]           # ordered remediation suggestions
    estimated_time: str                  # manual-fix time estimate, e.g. "45-90 minutes"
    engineers_needed: str                # staffing estimate, e.g. "2-3 engineers"
    manual_effort: str                   # qualitative effort level, e.g. "High"
    confidence_score: float = 0.95
    healing_intent: Optional[Dict] = None

    def to_dict(self) -> Dict:
        """Convert to dictionary, including healing intent if available.

        NOTE(review): when a healing_intent is set, its stored contents are
        deliberately replaced by a generic execution marker built from the
        recommendations — confirm downstream consumers expect this rather
        than the original intent dict.
        """
        data = asdict(self)
        if self.healing_intent:
            data["healing_intent"] = {
                "type": "HealingIntent",
                "recommendations": self.recommendations,
                "requires_execution": True
            }
        return data

    @classmethod
    def from_arf_analysis(cls, arf_result: Dict, scenario_name: str) -> 'OSSAnalysis':
        """Create from actual ARF analysis result.

        Falls back to generic recommendations when the ARF result carries
        none.  Time/staffing figures are fixed demo values.
        """
        recommendations = arf_result.get("recommendations", [
            "Increase resource allocation",
            "Implement monitoring",
            "Add circuit breakers",
            "Optimize configuration"
        ])

        return cls(
            # BUG FIX: restored "✅" — mojibake had split this string
            # literal across two lines (a syntax error as written).
            status="✅ ARF OSS Analysis Complete",
            recommendations=recommendations,
            estimated_time="45-90 minutes",
            engineers_needed="2-3 engineers",
            manual_effort="High",
            confidence_score=0.92,
            healing_intent={
                "scenario": scenario_name,
                "actions": recommendations,
                "execution_required": True,
                "auto_execution": False
            }
        )
|
|
@dataclass
class EnterpriseResults:
    """Structured enterprise execution results.

    Captures what the (simulated) enterprise tier executed and the
    before/after business figures for a scenario.
    """
    actions_completed: List[str]
    metrics_improvement: Dict[str, str]
    business_impact: Dict[str, Any]
    approval_required: bool = True
    execution_time: str = ""
    healing_intent_executed: bool = True

    def to_dict(self) -> Dict:
        """Serialize to a plain dict, tagging it with enterprise execution metadata."""
        payload = asdict(self)
        payload["arf_enterprise"] = dict(
            execution_complete=True,
            learning_applied=True,
            audit_trail_created=True,
        )
        return payload
|
|
@dataclass
class IncidentScenario:
    """Pythonic incident scenario model with ARF integration.

    A named incident with its observed metrics, business impact, and —
    optionally — attached OSS analysis and enterprise execution results.
    """
    name: str
    severity: IncidentSeverity
    metrics: Dict[str, str]
    impact: Dict[str, str]
    arf_pattern: str = ""
    oss_analysis: Optional[OSSAnalysis] = None
    enterprise_results: Optional[EnterpriseResults] = None

    def to_dict(self) -> Dict:
        """Serialize the scenario (and any attached sections) for JSON output."""
        payload = {
            "name": self.name,
            "severity": self.severity.value,
            "metrics": self.metrics,
            "impact": self.impact,
            "arf_oss_available": ARF_OSS_AVAILABLE,
        }
        # Optional sections serialize through their own to_dict helpers
        # and are only included when set.
        for key, section in (
            ("oss_analysis", self.oss_analysis),
            ("enterprise_results", self.enterprise_results),
        ):
            if section:
                payload[key] = section.to_dict()
        return payload
|
|
@dataclass
class DemoStep:
    """Demo step for presenter guidance."""
    title: str                      # headline shown to the presenter
    scenario: Optional[str]         # scenario name this step drives, if any
    action: str                     # what the presenter should do at this step
    message: str                    # talking-point text for this step
    # BUG FIX: default was the mojibake "π―" — restored to the dart emoji
    # it almost certainly encoded; confirm against the original asset.
    icon: str = "🎯"
    arf_integration: bool = False   # True when this step exercises real ARF
|
|
| |
| |
| |
|
|
class IncidentDatabase:
    """Database of incident scenarios for the demo.

    All figures are canned demo data.  BUG FIX throughout: restored "✅"
    and "→" characters that had been mojibake-mangled into "β" sequences;
    several of those sequences split single-quoted string literals across
    lines, which was a syntax error as originally written.
    """

    @staticmethod
    def get_scenarios() -> Dict[str, IncidentScenario]:
        """Get all incident scenarios, keyed by display name."""
        # CRITICAL scenario — the only one with both an OSS analysis and
        # enterprise execution results attached.
        cache_miss = IncidentScenario(
            name="Cache Miss Storm",
            severity=IncidentSeverity.CRITICAL,
            metrics={
                "Cache Hit Rate": "18.5% (Critical)",
                "Database Load": "92% (Overloaded)",
                "Response Time": "1850ms (Slow)",
                "Affected Users": "45,000",
                "Eviction Rate": "125/sec"
            },
            impact={
                "Revenue Loss": "$8,500/hour",
                "Page Load Time": "+300%",
                "Users Impacted": "45,000",
                "SLA Violation": "Yes",
                "Customer Satisfaction": "-40%"
            },
            arf_pattern="cache_miss_storm",
            oss_analysis=OSSAnalysis(
                status="✅ Analysis Complete",
                recommendations=[
                    "Increase Redis cache memory allocation by 2x",
                    "Implement cache warming strategy with predictive loading",
                    "Optimize key patterns and implement TTL adjustments",
                    "Add circuit breaker for graceful database fallback",
                    "Deploy monitoring for cache hit rate trends"
                ],
                estimated_time="60-90 minutes",
                engineers_needed="2-3 SREs + 1 DBA",
                manual_effort="High",
                confidence_score=0.92,
                healing_intent={
                    "type": "scale_out",
                    "resource": "cache",
                    "scale_factor": 2.0
                }
            ),
            enterprise_results=EnterpriseResults(
                actions_completed=[
                    "✅ Auto-scaled Redis cluster: 4GB → 8GB",
                    "✅ Deployed intelligent cache warming service",
                    "✅ Optimized 12 key patterns with ML recommendations",
                    "✅ Implemented circuit breaker with 95% success rate",
                    "✅ Validated recovery with automated testing"
                ],
                metrics_improvement={
                    "Cache Hit Rate": "18.5% → 72%",
                    "Response Time": "1850ms → 450ms",
                    "Database Load": "92% → 45%",
                    "Throughput": "1250 → 2450 req/sec"
                },
                business_impact={
                    "Recovery Time": "60 min → 12 min",
                    "Cost Saved": "$7,200",
                    "Users Impacted": "45,000 → 0",
                    "Revenue Protected": "$1,700",
                    "MTTR Improvement": "80% reduction"
                },
                approval_required=True,
                execution_time="8 minutes"
            )
        )

        # HIGH scenario — OSS analysis only, no enterprise execution.
        db_exhaustion = IncidentScenario(
            name="Database Connection Pool Exhaustion",
            severity=IncidentSeverity.HIGH,
            metrics={
                "Active Connections": "98/100 (Critical)",
                "API Latency": "2450ms",
                "Error Rate": "15.2%",
                "Queue Depth": "1250",
                "Connection Wait Time": "45s"
            },
            impact={
                "Revenue Loss": "$4,200/hour",
                "Affected Services": "API Gateway, User Service, Payment Service",
                "SLA Violation": "Yes",
                "Partner Impact": "3 external APIs"
            },
            arf_pattern="db_connection_exhaustion",
            oss_analysis=OSSAnalysis(
                status="✅ Analysis Complete",
                recommendations=[
                    "Increase connection pool size from 100 to 200",
                    "Add connection timeout (30s)",
                    "Implement leak detection",
                    "Add connection health checks",
                    "Optimize query patterns"
                ],
                estimated_time="45-60 minutes",
                engineers_needed="1-2 DBAs",
                manual_effort="Medium-High",
                confidence_score=0.88
            )
        )

        # HIGH scenario — OSS analysis only.
        memory_leak = IncidentScenario(
            name="Memory Leak in Production",
            severity=IncidentSeverity.HIGH,
            metrics={
                "Memory Usage": "96% (Critical)",
                "GC Pause Time": "4500ms",
                "Error Rate": "28.5%",
                "Restart Frequency": "12/hour",
                "Heap Fragmentation": "42%"
            },
            impact={
                "Revenue Loss": "$5,500/hour",
                "Session Loss": "8,500 users",
                "Customer Impact": "High",
                "Support Tickets": "+300%"
            },
            arf_pattern="memory_leak_java",
            oss_analysis=OSSAnalysis(
                status="✅ Analysis Complete",
                recommendations=[
                    "Increase JVM heap size from 4GB to 8GB",
                    "Implement memory leak detection with profiling",
                    "Add proactive health checks",
                    "Schedule rolling restart with zero downtime",
                    "Deploy memory monitoring dashboard"
                ],
                estimated_time="75-90 minutes",
                engineers_needed="2 Java SREs",
                manual_effort="High",
                confidence_score=0.85
            )
        )

        # MEDIUM scenario — raw metrics/impact only, no analysis attached.
        api_rate_limit = IncidentScenario(
            name="API Rate Limit Exceeded",
            severity=IncidentSeverity.MEDIUM,
            metrics={
                "429 Error Rate": "42.5%",
                "Successful Requests": "58.3%",
                "API Latency": "120ms",
                "Queue Depth": "1250",
                "Client Satisfaction": "65/100"
            },
            impact={
                "Revenue Loss": "$1,800/hour",
                "Affected Partners": "8",
                "Partner SLA Violations": "3",
                "Business Impact": "Medium"
            },
            arf_pattern="api_rate_limit"
        )

        return {
            "Cache Miss Storm": cache_miss,
            "Database Connection Pool Exhaustion": db_exhaustion,
            "Memory Leak in Production": memory_leak,
            "API Rate Limit Exceeded": api_rate_limit
        }