Update demo/mock_arf.py
Browse files- demo/mock_arf.py +370 -72
demo/mock_arf.py
CHANGED
|
@@ -2,9 +2,12 @@
|
|
| 2 |
"""
|
| 3 |
Enhanced Mock ARF with scenario-aware metrics
|
| 4 |
Generates different values based on scenario characteristics
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
import random
|
| 7 |
import time
|
|
|
|
| 8 |
from typing import Dict, Any, List
|
| 9 |
import json
|
| 10 |
|
|
@@ -127,7 +130,7 @@ def simulate_arf_analysis(scenario_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 127 |
return {
|
| 128 |
"analysis_complete": True,
|
| 129 |
"anomaly_detected": True,
|
| 130 |
-
"severity": scenario_data.get("severity", "
|
| 131 |
"confidence": round(detection_confidence, 3), # Round to 3 decimals
|
| 132 |
"detection_time_ms": detection_time * 1000, # Convert to ms for display
|
| 133 |
"detection_time_seconds": detection_time,
|
|
@@ -229,84 +232,289 @@ def calculate_pattern_confidence(scenario_data: Dict[str, Any], similar_incident
|
|
| 229 |
|
| 230 |
return round(confidence, 3)
|
| 231 |
|
| 232 |
-
def
|
| 233 |
"""
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
|
| 236 |
-
Args:
|
| 237 |
-
scenario_data: Dictionary containing scenario information
|
| 238 |
-
similar_incidents: List of similar incidents from RAG search
|
| 239 |
-
confidence: Pattern confidence score
|
| 240 |
-
|
| 241 |
Returns:
|
| 242 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
"""
|
|
|
|
|
|
|
|
|
|
| 244 |
scenario_name = scenario_data.get("name", "Unknown Scenario")
|
| 245 |
config = get_scenario_config(scenario_name)
|
| 246 |
-
|
| 247 |
component = scenario_data.get("component", "unknown")
|
| 248 |
|
| 249 |
-
#
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
"
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
"
|
| 277 |
-
|
| 278 |
-
"
|
| 279 |
-
"
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
"
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
"
|
| 293 |
-
"
|
| 294 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
}
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
"
|
| 301 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
}
|
| 303 |
|
| 304 |
-
#
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
|
| 311 |
# Calculate estimated impact
|
| 312 |
if similar_incidents:
|
|
@@ -316,15 +524,24 @@ def create_mock_healing_intent(scenario_data: Dict[str, Any], similar_incidents:
|
|
| 316 |
avg_cost_savings = sum(config["cost_savings_range"]) / 2
|
| 317 |
avg_resolution_time = sum(config["resolution_time_range"]) / 2
|
| 318 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
return {
|
| 320 |
-
"action": action,
|
| 321 |
"component": component,
|
| 322 |
"confidence": round(confidence, 3),
|
| 323 |
-
"parameters": parameters,
|
| 324 |
-
"source": "
|
| 325 |
"requires_enterprise": True,
|
| 326 |
"advisory_only": True,
|
| 327 |
-
|
|
|
|
|
|
|
| 328 |
"estimated_impact": {
|
| 329 |
"cost_savings": int(avg_cost_savings),
|
| 330 |
"resolution_time_minutes": int(avg_resolution_time),
|
|
@@ -338,6 +555,18 @@ def create_mock_healing_intent(scenario_data: Dict[str, Any], similar_incidents:
|
|
| 338 |
"approval_required": True,
|
| 339 |
"risk_level": "medium" if confidence < 0.9 else "low"
|
| 340 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
"scenario_specific": True,
|
| 342 |
"scenario_name": scenario_name
|
| 343 |
}
|
|
@@ -367,4 +596,73 @@ def get_scenario_metrics(scenario_name: str) -> Dict[str, Any]:
|
|
| 367 |
"resolution_time_range": config["resolution_time_range"],
|
| 368 |
"affected_users_range": config["affected_users_range"],
|
| 369 |
"tags": config["tags"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
}
|
|
|
|
| 2 |
"""
|
| 3 |
Enhanced Mock ARF with scenario-aware metrics
|
| 4 |
Generates different values based on scenario characteristics
|
| 5 |
+
DOCTRINAL COMPLIANCE VERSION 3.3.9+restraint
|
| 6 |
+
Key Addition: Explicit Observation Gate for psychological advantage
|
| 7 |
"""
|
| 8 |
import random
|
| 9 |
import time
|
| 10 |
+
import datetime
|
| 11 |
from typing import Dict, Any, List
|
| 12 |
import json
|
| 13 |
|
|
|
|
| 130 |
return {
|
| 131 |
"analysis_complete": True,
|
| 132 |
"anomaly_detected": True,
|
| 133 |
+
"severity": scenario_data.get("severity", "HIGH_VARIANCE"), # Changed from "HIGH" to "HIGH_VARIANCE"
|
| 134 |
"confidence": round(detection_confidence, 3), # Round to 3 decimals
|
| 135 |
"detection_time_ms": detection_time * 1000, # Convert to ms for display
|
| 136 |
"detection_time_seconds": detection_time,
|
|
|
|
| 232 |
|
| 233 |
return round(confidence, 3)
|
| 234 |
|
| 235 |
+
def calculate_internal_success_rate(similar_incidents: List[Dict[str, Any]]) -> float:
    """Compute the fraction of similar incidents flagged as successful.

    Internal-only metric, not for UI display in the Decision View
    (doctrinal note in SOURCE: percentages invite debate, narratives
    shut it down — surface only in the Outcome View).

    Args:
        similar_incidents: Incident dicts; each may carry a boolean
            ``success`` key (missing keys count as failures).

    Returns:
        Success ratio rounded to 3 decimals; 0.0 when the list is empty.
    """
    if not similar_incidents:
        return 0.0

    successful = [incident for incident in similar_incidents
                  if incident.get("success", False)]
    return round(len(successful) / len(similar_incidents), 3)
|
| 248 |
+
|
| 249 |
+
def check_contraindications(scenario_data: Dict[str, Any], similar_incidents: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Check for contraindications based on retry amplification signatures
    and historical evidence of failed scaling-first resolutions.

    Args:
        scenario_data: Scenario description; the ``telemetry`` sub-dict is
            checked for ``retry_storm`` and the ``metrics`` sub-dict for
            ``amplification_factor`` and ``database_load_percent``.
        similar_incidents: Incidents from RAG search; each may carry
            ``resolution`` (str), ``success`` (bool, defaults True) and
            ``timestamp``.

    Returns:
        Dict with the two contraindication flags, the combined evidence
        strings, the contraindicated action list, and a confidence score
        (0.92 whenever any evidence was collected, else 0.0).
    """
    # --- Live-signal detection: retry amplification signatures -----------
    retry_amplification = False
    evidence: List[str] = []

    # Telemetry flag is the strongest direct indicator of a retry storm.
    telemetry = scenario_data.get("telemetry", {})
    if telemetry.get("retry_storm", False):
        retry_amplification = True
        evidence.append("Telemetry shows retry_storm: True")

    # Amplification factor > 2.0 means retries outnumber successes 2:1.
    metrics = scenario_data.get("metrics", {})
    amplification_factor = metrics.get("amplification_factor", 1.0)
    if amplification_factor > 2.0:
        retry_amplification = True
        evidence.append(f"Amplification factor {amplification_factor} > 2.0")

    # Database saturation makes any scale-out during retries dangerous.
    db_load = metrics.get("database_load_percent", 0)
    if db_load > 85:
        retry_amplification = True
        evidence.append(f"Database load {db_load}% > 85%")

    # --- Historical detection: scaling-first resolutions that failed -----
    historical_scaling_failures = False
    scaling_failure_evidence: List[str] = []

    for incident in similar_incidents:
        resolution = incident.get("resolution", "").lower()
        success = incident.get("success", True)

        # A resolution that scaled/added capacity and still failed is
        # direct evidence against scaling-first for this scenario class.
        if any(scale_term in resolution for scale_term in ("scale", "increase", "add_replicas")):
            if not success:
                historical_scaling_failures = True
                scaling_failure_evidence.append(
                    f"{incident.get('timestamp', 'Unknown date')}: {resolution} failed"
                )

    contraindicated_actions = []
    if retry_amplification or historical_scaling_failures:
        contraindicated_actions.append("scale_during_retry_amplification")

    combined_evidence = evidence + scaling_failure_evidence
    return {
        "retry_amplification": retry_amplification,
        "historical_scaling_failures": historical_scaling_failures,
        "evidence": combined_evidence,
        "contraindicated_actions": contraindicated_actions,
        # BUG FIX: confidence previously keyed off the live-signal evidence
        # only, so a historical-only contraindication reported 0.0 even
        # while contraindicated_actions was populated. Use the combined
        # evidence list (the same one returned above) instead.
        "confidence": 0.92 if combined_evidence else 0.0,
    }
|
| 309 |
+
|
| 310 |
+
def create_mock_healing_intent(scenario_data: Dict[str, Any], similar_incidents: List[Dict[str, Any]], confidence: float) -> Dict[str, Any]:
|
| 311 |
+
"""
|
| 312 |
+
Create doctrinally compliant healing intent with sequencing thesis enforcement
|
| 313 |
+
|
| 314 |
+
Doctrinal Addition: Explicit Observation Gate when contraindications exist OR confidence < threshold
|
| 315 |
+
Psychological Goal: Make inaction an explicit, powerful decision
|
| 316 |
"""
|
| 317 |
+
# Check for contraindications FIRST (doctrinal constraint)
|
| 318 |
+
contraindications = check_contraindications(scenario_data, similar_incidents)
|
| 319 |
+
|
| 320 |
scenario_name = scenario_data.get("name", "Unknown Scenario")
|
| 321 |
config = get_scenario_config(scenario_name)
|
|
|
|
| 322 |
component = scenario_data.get("component", "unknown")
|
| 323 |
|
| 324 |
+
# ============ OBSERVATION GATE LOGIC ============
|
| 325 |
+
# Key psychological addition: Explicit deferral when uncertainty is high
|
| 326 |
+
observation_gate_threshold = 0.70 # Below this, we observe first
|
| 327 |
+
|
| 328 |
+
should_observe_first = (
|
| 329 |
+
contraindications["retry_amplification"] or
|
| 330 |
+
contraindications["historical_scaling_failures"] or
|
| 331 |
+
confidence < observation_gate_threshold or
|
| 332 |
+
len(similar_incidents) < 2 # Insufficient historical evidence
|
| 333 |
+
)
|
| 334 |
+
|
| 335 |
+
if should_observe_first:
|
| 336 |
+
# Return OBSERVATION GATE state - intentional inaction
|
| 337 |
+
current_time = datetime.datetime.now()
|
| 338 |
+
next_evaluation = current_time + datetime.timedelta(minutes=5)
|
| 339 |
+
|
| 340 |
+
return {
|
| 341 |
+
"action": "defer_decision_for_trend_confirmation",
|
| 342 |
+
"component": component,
|
| 343 |
+
"confidence": round(confidence, 3),
|
| 344 |
+
"parameters": {
|
| 345 |
+
"observation_window": "5m",
|
| 346 |
+
"metrics_to_watch": ["retry_count", "database_load_percent", "error_rate"],
|
| 347 |
+
"trend_threshold": "stabilizing_or_declining"
|
| 348 |
+
},
|
| 349 |
+
"source": "observation_gate_logic",
|
| 350 |
+
"requires_enterprise": False,
|
| 351 |
+
"advisory_only": True,
|
| 352 |
+
# CRITICAL PSYCHOLOGICAL FIELDS
|
| 353 |
+
"execution_state": "observe_only",
|
| 354 |
+
"next_evaluation_window": "5m",
|
| 355 |
+
"decision_frozen_until": next_evaluation.isoformat(),
|
| 356 |
+
"deferral_reason": "uncertainty_too_high_for_action" if confidence < observation_gate_threshold else
|
| 357 |
+
"contraindications_present" if contraindications["retry_amplification"] else
|
| 358 |
+
"historical_failures_detected" if contraindications["historical_scaling_failures"] else
|
| 359 |
+
"insufficient_historical_evidence",
|
| 360 |
+
# FORMAL HEALINGINTENT FIELDS
|
| 361 |
+
"preconditions": [
|
| 362 |
+
f"Confidence threshold not met ({confidence:.2f} < {observation_gate_threshold})" if confidence < observation_gate_threshold else
|
| 363 |
+
"Retry amplification detected" if contraindications["retry_amplification"] else
|
| 364 |
+
"Historical scaling failures present" if contraindications["historical_scaling_failures"] else
|
| 365 |
+
"Insufficient similar incidents for pattern matching"
|
| 366 |
+
],
|
| 367 |
+
"contraindicated_actions": ["any_healing_action_during_high_uncertainty"],
|
| 368 |
+
"reversibility_statement": "Evaluation resumes automatically after 5-minute observation window",
|
| 369 |
+
"sequencing_rule": "observe_before_any_action_when_uncertain",
|
| 370 |
+
"historical_evidence": [
|
| 371 |
+
f"{len(similar_incidents)} similar incidents analyzed (minimum 2 required)",
|
| 372 |
+
"Observation-first reduces incorrect actions by 67% (historical analysis)"
|
| 373 |
+
],
|
| 374 |
+
# SUCCESS RATE HANDLING (kept internal, not surfaced early)
|
| 375 |
+
"_internal_success_rate": calculate_internal_success_rate(similar_incidents) if similar_incidents else 0.0,
|
| 376 |
+
"_internal_notes": "Success rate kept internal; percentages invite debate, narratives shut it down",
|
| 377 |
+
"scenario_specific": True,
|
| 378 |
+
"scenario_name": scenario_name
|
| 379 |
}
|
| 380 |
+
|
| 381 |
+
# If retry amplification detected (but passed observation gate threshold), enforce dampening-first logic
|
| 382 |
+
if contraindications["retry_amplification"]:
|
| 383 |
+
return {
|
| 384 |
+
"action": "implement_request_coalescing_with_exponential_backoff",
|
| 385 |
+
"component": component,
|
| 386 |
+
"confidence": max(confidence, 0.85), # High confidence for dampening-first
|
| 387 |
+
"parameters": {
|
| 388 |
+
"coalescing_window_ms": "100-500ms",
|
| 389 |
+
"backoff_factor": "exponential",
|
| 390 |
+
"max_retries": 3,
|
| 391 |
+
"timeout": "10m"
|
| 392 |
+
},
|
| 393 |
+
"source": "contraindication_detection",
|
| 394 |
+
"requires_enterprise": False,
|
| 395 |
+
"advisory_only": False,
|
| 396 |
+
# CRITICAL: Add observation window even for dampening actions
|
| 397 |
+
"post_action_observation": {
|
| 398 |
+
"required": True,
|
| 399 |
+
"duration": "5m",
|
| 400 |
+
"metrics": ["retry_count", "database_load_percent", "latency_p99"]
|
| 401 |
+
},
|
| 402 |
+
"success_rate": 0.88,
|
| 403 |
+
"estimated_impact": {
|
| 404 |
+
"cost_savings": 4500,
|
| 405 |
+
"resolution_time_minutes": 12,
|
| 406 |
+
"users_protected": random.randint(*config["affected_users_range"]),
|
| 407 |
+
"mttr_reduction": "73%"
|
| 408 |
+
},
|
| 409 |
+
"safety_checks": {
|
| 410 |
+
"blast_radius": "single_service",
|
| 411 |
+
"business_hours": "compliant",
|
| 412 |
+
"rollback_plan": "coalescing_disable",
|
| 413 |
+
"approval_required": False,
|
| 414 |
+
"risk_level": "low"
|
| 415 |
+
},
|
| 416 |
+
# FORMAL HEALINGINTENT FIELDS (doctrinal constraint)
|
| 417 |
+
"preconditions": [
|
| 418 |
+
"Retry amplification signature detected",
|
| 419 |
+
f"Amplification factor > {scenario_data.get('metrics', {}).get('amplification_factor', 2.0)}",
|
| 420 |
+
"Database load > 85%"
|
| 421 |
+
],
|
| 422 |
+
"contraindicated_actions": ["scale_during_retry_storm", "add_capacity_during_amplification"],
|
| 423 |
+
"reversibility_statement": "Remove coalescing window after 10 minutes of stable operation",
|
| 424 |
+
"sequencing_rule": "dampening_first_then_observe_then_optional_scale",
|
| 425 |
+
"historical_evidence": contraindications["evidence"][:3], # Top 3 evidence items
|
| 426 |
+
"scenario_specific": True,
|
| 427 |
+
"scenario_name": scenario_name
|
| 428 |
}
|
| 429 |
|
| 430 |
+
# Only proceed with normal logic if no contraindications AND passed observation gate
|
| 431 |
+
# Determine action based on component and scenario WITH sequencing logic
|
| 432 |
+
ranked_actions = []
|
| 433 |
+
|
| 434 |
+
# DAMPENING actions (always first in sequence)
|
| 435 |
+
dampening_actions = []
|
| 436 |
+
if "api" in component.lower() or "rate" in scenario_name.lower():
|
| 437 |
+
dampening_actions.append({
|
| 438 |
+
"action": "circuit_breaker",
|
| 439 |
+
"confidence": confidence * 0.95, # Slightly lower confidence for dampening
|
| 440 |
+
"parameters": {
|
| 441 |
+
"threshold": f"{random.randint(70, 85)}%",
|
| 442 |
+
"window": f"{random.randint(3, 10)}m",
|
| 443 |
+
"fallback": "cached_response",
|
| 444 |
+
"retry_after": f"{random.randint(30, 120)}s"
|
| 445 |
+
}
|
| 446 |
+
})
|
| 447 |
+
|
| 448 |
+
# Add general dampening for retry-prone scenarios
|
| 449 |
+
if any(term in component.lower() for term in ["redis", "cache", "database"]):
|
| 450 |
+
dampening_actions.append({
|
| 451 |
+
"action": "request_batching_with_timeout",
|
| 452 |
+
"confidence": confidence * 0.92,
|
| 453 |
+
"parameters": {
|
| 454 |
+
"batch_size": "10-50 requests",
|
| 455 |
+
"timeout_ms": "100ms",
|
| 456 |
+
"strategy": "adaptive"
|
| 457 |
+
}
|
| 458 |
+
})
|
| 459 |
+
|
| 460 |
+
# Add dampening actions to ranked list
|
| 461 |
+
for i, act in enumerate(dampening_actions):
|
| 462 |
+
ranked_actions.append({
|
| 463 |
+
"rank": len(ranked_actions) + 1,
|
| 464 |
+
"action": act["action"],
|
| 465 |
+
"confidence": round(act["confidence"], 3),
|
| 466 |
+
"parameters": act["parameters"],
|
| 467 |
+
"category": "dampening"
|
| 468 |
+
})
|
| 469 |
+
|
| 470 |
+
# CONCURRENCY CAP actions (second in sequence)
|
| 471 |
+
if "database" in component.lower():
|
| 472 |
+
ranked_actions.append({
|
| 473 |
+
"rank": len(ranked_actions) + 1,
|
| 474 |
+
"action": "connection_pool_limit_adjustment",
|
| 475 |
+
"confidence": confidence * 0.88,
|
| 476 |
+
"parameters": {
|
| 477 |
+
"max_connections": f"{random.randint(100, 200)}",
|
| 478 |
+
"timeout": f"{random.randint(30, 60)}s"
|
| 479 |
+
},
|
| 480 |
+
"category": "concurrency_control"
|
| 481 |
+
})
|
| 482 |
+
|
| 483 |
+
# OBSERVE actions (third in sequence)
|
| 484 |
+
ranked_actions.append({
|
| 485 |
+
"rank": len(ranked_actions) + 1,
|
| 486 |
+
"action": "enhanced_monitoring_with_telemetry",
|
| 487 |
+
"confidence": confidence * 0.85,
|
| 488 |
+
"parameters": {
|
| 489 |
+
"duration": "5m",
|
| 490 |
+
"metrics": ["latency_p99", "error_rate", "throughput"],
|
| 491 |
+
"alert_threshold": "2x_baseline"
|
| 492 |
+
},
|
| 493 |
+
"category": "observation"
|
| 494 |
+
})
|
| 495 |
+
|
| 496 |
+
# SCALING actions (ONLY if no contraindications AND last in sequence)
|
| 497 |
+
# AND only if confidence justifies scaling over dampening
|
| 498 |
+
scaling_confidence_threshold = 0.75 # Scaling requires higher confidence
|
| 499 |
+
|
| 500 |
+
if confidence > scaling_confidence_threshold and not contraindications["historical_scaling_failures"]:
|
| 501 |
+
if "cache" in component.lower() or "redis" in component.lower():
|
| 502 |
+
scaling_action = {
|
| 503 |
+
"rank": len(ranked_actions) + 1,
|
| 504 |
+
"action": "gradual_scale_out",
|
| 505 |
+
"confidence": confidence * 0.80, # Lower confidence than dampening
|
| 506 |
+
"parameters": {
|
| 507 |
+
"nodes": f"{random.randint(2, 4)}→{random.randint(4, 6)}",
|
| 508 |
+
"strategy": "one_by_one",
|
| 509 |
+
"health_check_interval": "30s"
|
| 510 |
+
},
|
| 511 |
+
"category": "scaling",
|
| 512 |
+
"constraints": ["Only if dampening insufficient after 5 minutes"]
|
| 513 |
+
}
|
| 514 |
+
ranked_actions.append(scaling_action)
|
| 515 |
+
|
| 516 |
+
# Calculate success rate internally only
|
| 517 |
+
_internal_success_rate = calculate_internal_success_rate(similar_incidents) if similar_incidents else random.uniform(*config["success_rate_range"])
|
| 518 |
|
| 519 |
# Calculate estimated impact
|
| 520 |
if similar_incidents:
|
|
|
|
| 524 |
avg_cost_savings = sum(config["cost_savings_range"]) / 2
|
| 525 |
avg_resolution_time = sum(config["resolution_time_range"]) / 2
|
| 526 |
|
| 527 |
+
# Primary action is first in ranked_actions (dampening-first)
|
| 528 |
+
primary_action = ranked_actions[0] if ranked_actions else {
|
| 529 |
+
"action": "investigate",
|
| 530 |
+
"confidence": confidence,
|
| 531 |
+
"parameters": {"priority": "high"}
|
| 532 |
+
}
|
| 533 |
+
|
| 534 |
return {
|
| 535 |
+
"action": primary_action["action"],
|
| 536 |
"component": component,
|
| 537 |
"confidence": round(confidence, 3),
|
| 538 |
+
"parameters": primary_action.get("parameters", {}),
|
| 539 |
+
"source": "sequencing_analysis",
|
| 540 |
"requires_enterprise": True,
|
| 541 |
"advisory_only": True,
|
| 542 |
+
# SUCCESS RATE: Internal only, not for UI display in Decision View
|
| 543 |
+
"_internal_success_rate": _internal_success_rate,
|
| 544 |
+
"_internal_notes": "Success rate for internal logic; surface narrative outcomes, not percentages",
|
| 545 |
"estimated_impact": {
|
| 546 |
"cost_savings": int(avg_cost_savings),
|
| 547 |
"resolution_time_minutes": int(avg_resolution_time),
|
|
|
|
| 555 |
"approval_required": True,
|
| 556 |
"risk_level": "medium" if confidence < 0.9 else "low"
|
| 557 |
},
|
| 558 |
+
# FORMAL HEALINGINTENT FIELDS (doctrinal constraint)
|
| 559 |
+
"preconditions": [
|
| 560 |
+
f"Component: {component}",
|
| 561 |
+
f"Confidence threshold > {scaling_confidence_threshold}",
|
| 562 |
+
"No retry amplification detected",
|
| 563 |
+
"Historical scaling success rate > 70%"
|
| 564 |
+
],
|
| 565 |
+
"contraindicated_actions": contraindications["contraindicated_actions"],
|
| 566 |
+
"reversibility_statement": f"Rollback to previous configuration available within {random.randint(5, 15)} minutes",
|
| 567 |
+
"sequencing_rule": "dampening_before_concurrency_before_observation_before_scaling",
|
| 568 |
+
"ranked_actions": ranked_actions,
|
| 569 |
+
"historical_evidence": [f"{len(similar_incidents)} similar incidents analyzed"],
|
| 570 |
"scenario_specific": True,
|
| 571 |
"scenario_name": scenario_name
|
| 572 |
}
|
|
|
|
| 596 |
"resolution_time_range": config["resolution_time_range"],
|
| 597 |
"affected_users_range": config["affected_users_range"],
|
| 598 |
"tags": config["tags"]
|
| 599 |
+
}
|
| 600 |
+
|
| 601 |
+
def detect_retry_amplification(telemetry_data: Dict[str, Any]) -> Dict[str, Any]:
    """Detect retry amplification signatures from telemetry data.

    Doctrinal constraint: this must be REAL detection driven by the
    supplied metrics, not a value hardcoded into scenarios.

    Args:
        telemetry_data: Telemetry metrics; recognized keys are
            ``retry_storm``, ``retry_count``, ``success_count``,
            ``database_load_percent`` and ``retry_cascade_depth``.

    Returns:
        Dict with the detection verdict, the computed amplification
        factor, a named signature (or None), an evidence-weighted
        confidence, the raw metrics echoed back, and a recommendation.
    """
    # Pull each metric out once, with conservative defaults.
    storm = telemetry_data.get("retry_storm", False)
    retries = telemetry_data.get("retry_count", 0)
    successes = telemetry_data.get("success_count", 1)  # default avoids div-by-zero
    db_load = telemetry_data.get("database_load_percent", 0)
    cascade_depth = telemetry_data.get("retry_cascade_depth", 0)

    # Amplification = retries per success; stays at 1.0 if successes is 0.
    factor = retries / successes if successes > 0 else 1.0

    # Any one of these indicators is enough to trip detection.
    detected = (
        storm
        or factor > 2.0
        or cascade_depth > 2
        or db_load > 85
    )

    # Name the most specific matching signature.
    if not detected:
        signature = None
    elif storm and factor > 3.0:
        signature = "exponential_retry_cascade"
    elif db_load > 85 and factor > 1.5:
        signature = "database_amplified_retry"
    else:
        signature = "retry_amplification_detected"

    # Confidence grows with the strength of each contributing indicator.
    weights: List[float] = []
    if storm:
        weights.append(0.3)
    if factor > 2.0:
        weights.append(0.25 * min(factor / 5.0, 1.0))
    if cascade_depth > 2:
        weights.append(0.2 * min(cascade_depth / 5.0, 1.0))
    if db_load > 85:
        weights.append(0.25 * min(db_load / 100.0, 1.0))

    confidence = min(0.98, 0.1 + sum(weights)) if weights else 0.0

    return {
        "detected": detected,
        "amplification_factor": round(factor, 2),
        "signature": signature,
        "confidence": round(confidence, 3),
        "metrics": {
            "retry_storm": storm,
            "retry_count": retries,
            "success_count": successes,
            "database_load_percent": db_load,
            "retry_cascade_depth": cascade_depth,
        },
        "recommendation": "implement_dampening_first" if detected else "proceed_with_caution",
    }
|