Update demo/mock_arf.py
Browse files- demo/mock_arf.py +370 -72
demo/mock_arf.py
CHANGED
|
@@ -2,9 +2,12 @@
|
|
| 2 |
"""
|
| 3 |
Enhanced Mock ARF with scenario-aware metrics
|
| 4 |
Generates different values based on scenario characteristics
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
import random
|
| 7 |
import time
|
|
|
|
| 8 |
from typing import Dict, Any, List
|
| 9 |
import json
|
| 10 |
|
|
@@ -127,7 +130,7 @@ def simulate_arf_analysis(scenario_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 127 |
return {
|
| 128 |
"analysis_complete": True,
|
| 129 |
"anomaly_detected": True,
|
| 130 |
-
"severity": scenario_data.get("severity", "
|
| 131 |
"confidence": round(detection_confidence, 3), # Round to 3 decimals
|
| 132 |
"detection_time_ms": detection_time * 1000, # Convert to ms for display
|
| 133 |
"detection_time_seconds": detection_time,
|
|
@@ -229,84 +232,289 @@ def calculate_pattern_confidence(scenario_data: Dict[str, Any], similar_incident
|
|
| 229 |
|
| 230 |
return round(confidence, 3)
|
| 231 |
|
| 232 |
-
def
|
| 233 |
"""
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
|
| 236 |
-
Args:
|
| 237 |
-
scenario_data: Dictionary containing scenario information
|
| 238 |
-
similar_incidents: List of similar incidents from RAG search
|
| 239 |
-
confidence: Pattern confidence score
|
| 240 |
-
|
| 241 |
Returns:
|
| 242 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
"""
|
|
|
|
|
|
|
|
|
|
| 244 |
scenario_name = scenario_data.get("name", "Unknown Scenario")
|
| 245 |
config = get_scenario_config(scenario_name)
|
| 246 |
-
|
| 247 |
component = scenario_data.get("component", "unknown")
|
| 248 |
|
| 249 |
-
#
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
"
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
"
|
| 277 |
-
|
| 278 |
-
"
|
| 279 |
-
"
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
"
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
"
|
| 293 |
-
"
|
| 294 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
}
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
"
|
| 301 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
}
|
| 303 |
|
| 304 |
-
#
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
|
| 311 |
# Calculate estimated impact
|
| 312 |
if similar_incidents:
|
|
@@ -316,15 +524,24 @@ def create_mock_healing_intent(scenario_data: Dict[str, Any], similar_incidents:
|
|
| 316 |
avg_cost_savings = sum(config["cost_savings_range"]) / 2
|
| 317 |
avg_resolution_time = sum(config["resolution_time_range"]) / 2
|
| 318 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
return {
|
| 320 |
-
"action": action,
|
| 321 |
"component": component,
|
| 322 |
"confidence": round(confidence, 3),
|
| 323 |
-
"parameters": parameters,
|
| 324 |
-
"source": "
|
| 325 |
"requires_enterprise": True,
|
| 326 |
"advisory_only": True,
|
| 327 |
-
|
|
|
|
|
|
|
| 328 |
"estimated_impact": {
|
| 329 |
"cost_savings": int(avg_cost_savings),
|
| 330 |
"resolution_time_minutes": int(avg_resolution_time),
|
|
@@ -338,6 +555,18 @@ def create_mock_healing_intent(scenario_data: Dict[str, Any], similar_incidents:
|
|
| 338 |
"approval_required": True,
|
| 339 |
"risk_level": "medium" if confidence < 0.9 else "low"
|
| 340 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
"scenario_specific": True,
|
| 342 |
"scenario_name": scenario_name
|
| 343 |
}
|
|
@@ -367,4 +596,73 @@ def get_scenario_metrics(scenario_name: str) -> Dict[str, Any]:
|
|
| 367 |
"resolution_time_range": config["resolution_time_range"],
|
| 368 |
"affected_users_range": config["affected_users_range"],
|
| 369 |
"tags": config["tags"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
}
|
|
|
|
| 2 |
"""
|
| 3 |
Enhanced Mock ARF with scenario-aware metrics
|
| 4 |
Generates different values based on scenario characteristics
|
| 5 |
+
DOCTRINAL COMPLIANCE VERSION 3.3.9+restraint
|
| 6 |
+
Key Addition: Explicit Observation Gate for psychological advantage
|
| 7 |
"""
|
| 8 |
import random
|
| 9 |
import time
|
| 10 |
+
import datetime
|
| 11 |
from typing import Dict, Any, List
|
| 12 |
import json
|
| 13 |
|
|
|
|
| 130 |
return {
|
| 131 |
"analysis_complete": True,
|
| 132 |
"anomaly_detected": True,
|
| 133 |
+
"severity": scenario_data.get("severity", "HIGH_VARIANCE"), # Changed from "HIGH" to "HIGH_VARIANCE"
|
| 134 |
"confidence": round(detection_confidence, 3), # Round to 3 decimals
|
| 135 |
"detection_time_ms": detection_time * 1000, # Convert to ms for display
|
| 136 |
"detection_time_seconds": detection_time,
|
|
|
|
| 232 |
|
| 233 |
return round(confidence, 3)
|
| 234 |
|
| 235 |
+
def calculate_internal_success_rate(similar_incidents: List[Dict[str, Any]]) -> float:
    """Compute the fraction of similar incidents flagged as successful.

    Internal-only metric, not for UI display in the Decision View
    (doctrinal note in SOURCE: percentages invite debate, narratives
    shut it down — surface only in the Outcome View).

    Args:
        similar_incidents: Incident dicts; each may carry a boolean
            ``success`` key (missing keys count as failures).

    Returns:
        Success ratio rounded to 3 decimals; 0.0 when the list is empty.
    """
    if not similar_incidents:
        return 0.0

    successful = [incident for incident in similar_incidents
                  if incident.get("success", False)]
    return round(len(successful) / len(similar_incidents), 3)
|
| 248 |
+
|
| 249 |
+
def check_contraindications(scenario_data: Dict[str, Any], similar_incidents: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Check for contraindications based on retry amplification signatures
    and historical evidence of failed scaling-first resolutions.

    Args:
        scenario_data: Scenario description; the ``telemetry`` sub-dict is
            checked for ``retry_storm`` and the ``metrics`` sub-dict for
            ``amplification_factor`` and ``database_load_percent``.
        similar_incidents: Incidents from RAG search; each may carry
            ``resolution`` (str), ``success`` (bool, defaults True) and
            ``timestamp``.

    Returns:
        Dict with the two contraindication flags, the combined evidence
        strings, the contraindicated action list, and a confidence score
        (0.92 whenever any evidence was collected, else 0.0).
    """
    # --- Live-signal detection: retry amplification signatures -----------
    retry_amplification = False
    evidence: List[str] = []

    # Telemetry flag is the strongest direct indicator of a retry storm.
    telemetry = scenario_data.get("telemetry", {})
    if telemetry.get("retry_storm", False):
        retry_amplification = True
        evidence.append("Telemetry shows retry_storm: True")

    # Amplification factor > 2.0 means retries outnumber successes 2:1.
    metrics = scenario_data.get("metrics", {})
    amplification_factor = metrics.get("amplification_factor", 1.0)
    if amplification_factor > 2.0:
        retry_amplification = True
        evidence.append(f"Amplification factor {amplification_factor} > 2.0")

    # Database saturation makes any scale-out during retries dangerous.
    db_load = metrics.get("database_load_percent", 0)
    if db_load > 85:
        retry_amplification = True
        evidence.append(f"Database load {db_load}% > 85%")

    # --- Historical detection: scaling-first resolutions that failed -----
    historical_scaling_failures = False
    scaling_failure_evidence: List[str] = []

    for incident in similar_incidents:
        resolution = incident.get("resolution", "").lower()
        success = incident.get("success", True)

        # A resolution that scaled/added capacity and still failed is
        # direct evidence against scaling-first for this scenario class.
        if any(scale_term in resolution for scale_term in ("scale", "increase", "add_replicas")):
            if not success:
                historical_scaling_failures = True
                scaling_failure_evidence.append(
                    f"{incident.get('timestamp', 'Unknown date')}: {resolution} failed"
                )

    contraindicated_actions = []
    if retry_amplification or historical_scaling_failures:
        contraindicated_actions.append("scale_during_retry_amplification")

    combined_evidence = evidence + scaling_failure_evidence
    return {
        "retry_amplification": retry_amplification,
        "historical_scaling_failures": historical_scaling_failures,
        "evidence": combined_evidence,
        "contraindicated_actions": contraindicated_actions,
        # BUG FIX: confidence previously keyed off the live-signal evidence
        # only, so a historical-only contraindication reported 0.0 even
        # while contraindicated_actions was populated. Use the combined
        # evidence list (the same one returned above) instead.
        "confidence": 0.92 if combined_evidence else 0.0,
    }
|
| 309 |
+
|
| 310 |
+
def create_mock_healing_intent(scenario_data: Dict[str, Any], similar_incidents: List[Dict[str, Any]], confidence: float) -> Dict[str, Any]:
|
| 311 |
+
"""
|
| 312 |
+
Create doctrinally compliant healing intent with sequencing thesis enforcement
|
| 313 |
+
|
| 314 |
+
Doctrinal Addition: Explicit Observation Gate when contraindications exist OR confidence < threshold
|
| 315 |
+
Psychological Goal: Make inaction an explicit, powerful decision
|
| 316 |
"""
|
| 317 |
+
# Check for contraindications FIRST (doctrinal constraint)
|
| 318 |
+
contraindications = check_contraindications(scenario_data, similar_incidents)
|
| 319 |
+
|
| 320 |
scenario_name = scenario_data.get("name", "Unknown Scenario")
|
| 321 |
config = get_scenario_config(scenario_name)
|
|
|
|
| 322 |
component = scenario_data.get("component", "unknown")
|
| 323 |
|
| 324 |
+
# ============ OBSERVATION GATE LOGIC ============
|
| 325 |
+
# Key psychological addition: Explicit deferral when uncertainty is high
|
| 326 |
+
observation_gate_threshold = 0.70 # Below this, we observe first
|
| 327 |
+
|
| 328 |
+
should_observe_first = (
|
| 329 |
+
contraindications["retry_amplification"] or
|
| 330 |
+
contraindications["historical_scaling_failures"] or
|
| 331 |
+
confidence < observation_gate_threshold or
|
| 332 |
+
len(similar_incidents) < 2 # Insufficient historical evidence
|
| 333 |
+
)
|
| 334 |
+
|
| 335 |
+
if should_observe_first:
|
| 336 |
+
# Return OBSERVATION GATE state - intentional inaction
|
| 337 |
+
current_time = datetime.datetime.now()
|
| 338 |
+
next_evaluation = current_time + datetime.timedelta(minutes=5)
|
| 339 |
+
|
| 340 |
+
return {
|
| 341 |
+
"action": "defer_decision_for_trend_confirmation",
|
| 342 |
+
"component": component,
|
| 343 |
+
"confidence": round(confidence, 3),
|
| 344 |
+
"parameters": {
|
| 345 |
+
"observation_window": "5m",
|
| 346 |
+
"metrics_to_watch": ["retry_count", "database_load_percent", "error_rate"],
|
| 347 |
+
"trend_threshold": "stabilizing_or_declining"
|
| 348 |
+
},
|
| 349 |
+
"source": "observation_gate_logic",
|
| 350 |
+
"requires_enterprise": False,
|
| 351 |
+
"advisory_only": True,
|
| 352 |
+
# CRITICAL PSYCHOLOGICAL FIELDS
|
| 353 |
+
"execution_state": "observe_only",
|
| 354 |
+
"next_evaluation_window": "5m",
|
| 355 |
+
"decision_frozen_until": next_evaluation.isoformat(),
|
| 356 |
+
"deferral_reason": "uncertainty_too_high_for_action" if confidence < observation_gate_threshold else
|
| 357 |
+
"contraindications_present" if contraindications["retry_amplification"] else
|
| 358 |
+
"historical_failures_detected" if contraindications["historical_scaling_failures"] else
|
| 359 |
+
"insufficient_historical_evidence",
|
| 360 |
+
# FORMAL HEALINGINTENT FIELDS
|
| 361 |
+
"preconditions": [
|
| 362 |
+
f"Confidence threshold not met ({confidence:.2f} < {observation_gate_threshold})" if confidence < observation_gate_threshold else
|
| 363 |
+
"Retry amplification detected" if contraindications["retry_amplification"] else
|
| 364 |
+
"Historical scaling failures present" if contraindications["historical_scaling_failures"] else
|
| 365 |
+
"Insufficient similar incidents for pattern matching"
|
| 366 |
+
],
|
| 367 |
+
"contraindicated_actions": ["any_healing_action_during_high_uncertainty"],
|
| 368 |
+
"reversibility_statement": "Evaluation resumes automatically after 5-minute observation window",
|
| 369 |
+
"sequencing_rule": "observe_before_any_action_when_uncertain",
|
| 370 |
+
"historical_evidence": [
|
| 371 |
+
f"{len(similar_incidents)} similar incidents analyzed (minimum 2 required)",
|
| 372 |
+
"Observation-first reduces incorrect actions by 67% (historical analysis)"
|
| 373 |
+
],
|
| 374 |
+
# SUCCESS RATE HANDLING (kept internal, not surfaced early)
|
| 375 |
+
"_internal_success_rate": calculate_internal_success_rate(similar_incidents) if similar_incidents else 0.0,
|
| 376 |
+
"_internal_notes": "Success rate kept internal; percentages invite debate, narratives shut it down",
|
| 377 |
+
"scenario_specific": True,
|
| 378 |
+
"scenario_name": scenario_name
|
| 379 |
}
|
| 380 |
+
|
| 381 |
+
# If retry amplification detected (but passed observation gate threshold), enforce dampening-first logic
|
| 382 |
+
if contraindications["retry_amplification"]:
|
| 383 |
+
return {
|
| 384 |
+
"action": "implement_request_coalescing_with_exponential_backoff",
|
| 385 |
+
"component": component,
|
| 386 |
+
"confidence": max(confidence, 0.85), # High confidence for dampening-first
|
| 387 |
+
"parameters": {
|
| 388 |
+
"coalescing_window_ms": "100-500ms",
|
| 389 |
+
"backoff_factor": "exponential",
|
| 390 |
+
"max_retries": 3,
|
| 391 |
+
"timeout": "10m"
|
| 392 |
+
},
|
| 393 |
+
"source": "contraindication_detection",
|
| 394 |
+
"requires_enterprise": False,
|
| 395 |
+
"advisory_only": False,
|
| 396 |
+
# CRITICAL: Add observation window even for dampening actions
|
| 397 |
+
"post_action_observation": {
|
| 398 |
+
"required": True,
|
| 399 |
+
"duration": "5m",
|
| 400 |
+
"metrics": ["retry_count", "database_load_percent", "latency_p99"]
|
| 401 |
+
},
|
| 402 |
+
"success_rate": 0.88,
|
| 403 |
+
"estimated_impact": {
|
| 404 |
+
"cost_savings": 4500,
|
| 405 |
+
"resolution_time_minutes": 12,
|
| 406 |
+
"users_protected": random.randint(*config["affected_users_range"]),
|
| 407 |
+
"mttr_reduction": "73%"
|
| 408 |
+
},
|
| 409 |
+
"safety_checks": {
|
| 410 |
+
"blast_radius": "single_service",
|
| 411 |
+
"business_hours": "compliant",
|
| 412 |
+
"rollback_plan": "coalescing_disable",
|
| 413 |
+
"approval_required": False,
|
| 414 |
+
"risk_level": "low"
|
| 415 |
+
},
|
| 416 |
+
# FORMAL HEALINGINTENT FIELDS (doctrinal constraint)
|
| 417 |
+
"preconditions": [
|
| 418 |
+
"Retry amplification signature detected",
|
| 419 |
+
f"Amplification factor > {scenario_data.get('metrics', {}).get('amplification_factor', 2.0)}",
|
| 420 |
+
"Database load > 85%"
|
| 421 |
+
],
|
| 422 |
+
"contraindicated_actions": ["scale_during_retry_storm", "add_capacity_during_amplification"],
|
| 423 |
+
"reversibility_statement": "Remove coalescing window after 10 minutes of stable operation",
|
| 424 |
+
"sequencing_rule": "dampening_first_then_observe_then_optional_scale",
|
| 425 |
+
"historical_evidence": contraindications["evidence"][:3], # Top 3 evidence items
|
| 426 |
+
"scenario_specific": True,
|
| 427 |
+
"scenario_name": scenario_name
|
| 428 |
}
|
| 429 |
|
| 430 |
+
# Only proceed with normal logic if no contraindications AND passed observation gate
|
| 431 |
+
# Determine action based on component and scenario WITH sequencing logic
|
| 432 |
+
ranked_actions = []
|
| 433 |
+
|
| 434 |
+
# DAMPENING actions (always first in sequence)
|
| 435 |
+
dampening_actions = []
|
| 436 |
+
if "api" in component.lower() or "rate" in scenario_name.lower():
|
| 437 |
+
dampening_actions.append({
|
| 438 |
+
"action": "circuit_breaker",
|
| 439 |
+
"confidence": confidence * 0.95, # Slightly lower confidence for dampening
|
| 440 |
+
"parameters": {
|
| 441 |
+
"threshold": f"{random.randint(70, 85)}%",
|
| 442 |
+
"window": f"{random.randint(3, 10)}m",
|
| 443 |
+
"fallback": "cached_response",
|
| 444 |
+
"retry_after": f"{random.randint(30, 120)}s"
|
| 445 |
+
}
|
| 446 |
+
})
|
| 447 |
+
|
| 448 |
+
# Add general dampening for retry-prone scenarios
|
| 449 |
+
if any(term in component.lower() for term in ["redis", "cache", "database"]):
|
| 450 |
+
dampening_actions.append({
|
| 451 |
+
"action": "request_batching_with_timeout",
|
| 452 |
+
"confidence": confidence * 0.92,
|
| 453 |
+
"parameters": {
|
| 454 |
+
"batch_size": "10-50 requests",
|
| 455 |
+
"timeout_ms": "100ms",
|
| 456 |
+
"strategy": "adaptive"
|
| 457 |
+
}
|
| 458 |
+
})
|
| 459 |
+
|
| 460 |
+
# Add dampening actions to ranked list
|
| 461 |
+
for i, act in enumerate(dampening_actions):
|
| 462 |
+
ranked_actions.append({
|
| 463 |
+
"rank": len(ranked_actions) + 1,
|
| 464 |
+
"action": act["action"],
|
| 465 |
+
"confidence": round(act["confidence"], 3),
|
| 466 |
+
"parameters": act["parameters"],
|
| 467 |
+
"category": "dampening"
|
| 468 |
+
})
|
| 469 |
+
|
| 470 |
+
# CONCURRENCY CAP actions (second in sequence)
|
| 471 |
+
if "database" in component.lower():
|
| 472 |
+
ranked_actions.append({
|
| 473 |
+
"rank": len(ranked_actions) + 1,
|
| 474 |
+
"action": "connection_pool_limit_adjustment",
|
| 475 |
+
"confidence": confidence * 0.88,
|
| 476 |
+
"parameters": {
|
| 477 |
+
"max_connections": f"{random.randint(100, 200)}",
|
| 478 |
+
"timeout": f"{random.randint(30, 60)}s"
|
| 479 |
+
},
|
| 480 |
+
"category": "concurrency_control"
|
| 481 |
+
})
|
| 482 |
+
|
| 483 |
+
# OBSERVE actions (third in sequence)
|
| 484 |
+
ranked_actions.append({
|
| 485 |
+
"rank": len(ranked_actions) + 1,
|
| 486 |
+
"action": "enhanced_monitoring_with_telemetry",
|
| 487 |
+
"confidence": confidence * 0.85,
|
| 488 |
+
"parameters": {
|
| 489 |
+
"duration": "5m",
|
| 490 |
+
"metrics": ["latency_p99", "error_rate", "throughput"],
|
| 491 |
+
"alert_threshold": "2x_baseline"
|
| 492 |
+
},
|
| 493 |
+
"category": "observation"
|
| 494 |
+
})
|
| 495 |
+
|
| 496 |
+
# SCALING actions (ONLY if no contraindications AND last in sequence)
|
| 497 |
+
# AND only if confidence justifies scaling over dampening
|
| 498 |
+
scaling_confidence_threshold = 0.75 # Scaling requires higher confidence
|
| 499 |
+
|
| 500 |
+
if confidence > scaling_confidence_threshold and not contraindications["historical_scaling_failures"]:
|
| 501 |
+
if "cache" in component.lower() or "redis" in component.lower():
|
| 502 |
+
scaling_action = {
|
| 503 |
+
"rank": len(ranked_actions) + 1,
|
| 504 |
+
"action": "gradual_scale_out",
|
| 505 |
+
"confidence": confidence * 0.80, # Lower confidence than dampening
|
| 506 |
+
"parameters": {
|
| 507 |
+
"nodes": f"{random.randint(2, 4)}→{random.randint(4, 6)}",
|
| 508 |
+
"strategy": "one_by_one",
|
| 509 |
+
"health_check_interval": "30s"
|
| 510 |
+
},
|
| 511 |
+
"category": "scaling",
|
| 512 |
+
"constraints": ["Only if dampening insufficient after 5 minutes"]
|
| 513 |
+
}
|
| 514 |
+
ranked_actions.append(scaling_action)
|
| 515 |
+
|
| 516 |
+
# Calculate success rate internally only
|
| 517 |
+
_internal_success_rate = calculate_internal_success_rate(similar_incidents) if similar_incidents else random.uniform(*config["success_rate_range"])
|
| 518 |
|
| 519 |
# Calculate estimated impact
|
| 520 |
if similar_incidents:
|
|
|
|
| 524 |
avg_cost_savings = sum(config["cost_savings_range"]) / 2
|
| 525 |
avg_resolution_time = sum(config["resolution_time_range"]) / 2
|
| 526 |
|
| 527 |
+
# Primary action is first in ranked_actions (dampening-first)
|
| 528 |
+
primary_action = ranked_actions[0] if ranked_actions else {
|
| 529 |
+
"action": "investigate",
|
| 530 |
+
"confidence": confidence,
|
| 531 |
+
"parameters": {"priority": "high"}
|
| 532 |
+
}
|
| 533 |
+
|
| 534 |
return {
|
| 535 |
+
"action": primary_action["action"],
|
| 536 |
"component": component,
|
| 537 |
"confidence": round(confidence, 3),
|
| 538 |
+
"parameters": primary_action.get("parameters", {}),
|
| 539 |
+
"source": "sequencing_analysis",
|
| 540 |
"requires_enterprise": True,
|
| 541 |
"advisory_only": True,
|
| 542 |
+
# SUCCESS RATE: Internal only, not for UI display in Decision View
|
| 543 |
+
"_internal_success_rate": _internal_success_rate,
|
| 544 |
+
"_internal_notes": "Success rate for internal logic; surface narrative outcomes, not percentages",
|
| 545 |
"estimated_impact": {
|
| 546 |
"cost_savings": int(avg_cost_savings),
|
| 547 |
"resolution_time_minutes": int(avg_resolution_time),
|
|
|
|
| 555 |
"approval_required": True,
|
| 556 |
"risk_level": "medium" if confidence < 0.9 else "low"
|
| 557 |
},
|
| 558 |
+
# FORMAL HEALINGINTENT FIELDS (doctrinal constraint)
|
| 559 |
+
"preconditions": [
|
| 560 |
+
f"Component: {component}",
|
| 561 |
+
f"Confidence threshold > {scaling_confidence_threshold}",
|
| 562 |
+
"No retry amplification detected",
|
| 563 |
+
"Historical scaling success rate > 70%"
|
| 564 |
+
],
|
| 565 |
+
"contraindicated_actions": contraindications["contraindicated_actions"],
|
| 566 |
+
"reversibility_statement": f"Rollback to previous configuration available within {random.randint(5, 15)} minutes",
|
| 567 |
+
"sequencing_rule": "dampening_before_concurrency_before_observation_before_scaling",
|
| 568 |
+
"ranked_actions": ranked_actions,
|
| 569 |
+
"historical_evidence": [f"{len(similar_incidents)} similar incidents analyzed"],
|
| 570 |
"scenario_specific": True,
|
| 571 |
"scenario_name": scenario_name
|
| 572 |
}
|
|
|
|
| 596 |
"resolution_time_range": config["resolution_time_range"],
|
| 597 |
"affected_users_range": config["affected_users_range"],
|
| 598 |
"tags": config["tags"]
|
| 599 |
+
}
|
| 600 |
+
|
| 601 |
+
def detect_retry_amplification(telemetry_data: Dict[str, Any]) -> Dict[str, Any]:
    """Detect retry amplification signatures from telemetry data.

    Doctrinal constraint: this must be REAL detection driven by the
    supplied metrics, not a value hardcoded into scenarios.

    Args:
        telemetry_data: Telemetry metrics; recognized keys are
            ``retry_storm``, ``retry_count``, ``success_count``,
            ``database_load_percent`` and ``retry_cascade_depth``.

    Returns:
        Dict with the detection verdict, the computed amplification
        factor, a named signature (or None), an evidence-weighted
        confidence, the raw metrics echoed back, and a recommendation.
    """
    # Pull each metric out once, with conservative defaults.
    storm = telemetry_data.get("retry_storm", False)
    retries = telemetry_data.get("retry_count", 0)
    successes = telemetry_data.get("success_count", 1)  # default avoids div-by-zero
    db_load = telemetry_data.get("database_load_percent", 0)
    cascade_depth = telemetry_data.get("retry_cascade_depth", 0)

    # Amplification = retries per success; stays at 1.0 if successes is 0.
    factor = retries / successes if successes > 0 else 1.0

    # Any one of these indicators is enough to trip detection.
    detected = (
        storm
        or factor > 2.0
        or cascade_depth > 2
        or db_load > 85
    )

    # Name the most specific matching signature.
    if not detected:
        signature = None
    elif storm and factor > 3.0:
        signature = "exponential_retry_cascade"
    elif db_load > 85 and factor > 1.5:
        signature = "database_amplified_retry"
    else:
        signature = "retry_amplification_detected"

    # Confidence grows with the strength of each contributing indicator.
    weights: List[float] = []
    if storm:
        weights.append(0.3)
    if factor > 2.0:
        weights.append(0.25 * min(factor / 5.0, 1.0))
    if cascade_depth > 2:
        weights.append(0.2 * min(cascade_depth / 5.0, 1.0))
    if db_load > 85:
        weights.append(0.25 * min(db_load / 100.0, 1.0))

    confidence = min(0.98, 0.1 + sum(weights)) if weights else 0.0

    return {
        "detected": detected,
        "amplification_factor": round(factor, 2),
        "signature": signature,
        "confidence": round(confidence, 3),
        "metrics": {
            "retry_storm": storm,
            "retry_count": retries,
            "success_count": successes,
            "database_load_percent": db_load,
            "retry_cascade_depth": cascade_depth,
        },
        "recommendation": "implement_dampening_first" if detected else "proceed_with_caution",
    }
|