shaikhsalman committed on
Commit
dd25ceb
·
verified ·
1 Parent(s): fa7a46b

Upload incident-response/auto-remediation/auto-remediate.sh with huggingface_hub

Browse files
incident-response/auto-remediation/auto-remediate.sh ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# =============================================================================
# Autonomous Incident Remediation Engine
# =============================================================================
# Triggered by Alertmanager webhook. Auto-remediates known patterns.
#
# Usage: auto-remediate.sh [ALERT_NAME [NAMESPACE [POD_NAME]]]
#   ALERT_NAME  PodCrashLooping | HighMemoryUsage | FalcoRuntimeAlert
#               (default: "unknown", which is rejected with exit status 1)
#   NAMESPACE   namespace of the affected pod (default: "default")
#   POD_NAME    name of the affected pod; required for PodCrashLooping
#
# Environment:
#   REMEDIATION_MEMORY_LIMIT  memory limit applied to an OOM-killed container
#                             (default: 2Gi)
#
# Requires: kubectl, jq
# =============================================================================

set -euo pipefail

ALERT_NAME="${1:-unknown}"
NAMESPACE="${2:-default}"
POD_NAME="${3:-}"
readonly MEMORY_LIMIT="${REMEDIATION_MEMORY_LIMIT:-2Gi}"

# Print a timestamped, tagged message on stdout.
log() { printf '[%s] [REMEDIATE] %s\n' "$(date +%H:%M:%S)" "$*"; }

#######################################
# Remediate a crash-looping pod: raise the memory limit on the owning
# Deployment when a container was OOM-killed, otherwise delete the pod so
# that its controller recreates it.
# Globals:   NAMESPACE, POD_NAME, MEMORY_LIMIT (read)
# Outputs:   progress messages on stdout
# Returns:   exits non-zero when the pod cannot be inspected or remediated
#######################################
remediate_crash_loop() {
  local pod_json oom_container replica_set deployment patch

  if [[ -z "${POD_NAME}" ]]; then
    log "PodCrashLooping requires a pod name (argument 3)"
    exit 1
  fi
  log "Remediating crash-looping pod: ${NAMESPACE}/${POD_NAME}"

  # Fetch the pod once and fail loudly: a failed lookup must never be
  # mistaken for "not OOM-killed", which would lead to deleting the pod.
  if ! pod_json=$(kubectl get pod "${POD_NAME}" -n "${NAMESPACE}" -o json); then
    log "Unable to read pod ${NAMESPACE}/${POD_NAME}"
    exit 1
  fi

  # Name of the first container (any index) whose last termination was an
  # OOM kill; empty when there is none.
  oom_container=$(jq -r '
    [ .status.containerStatuses[]?
      | select(.lastState.terminated.reason == "OOMKilled")
      | .name ][0] // empty' <<<"${pod_json}")

  if [[ -n "${oom_container}" ]]; then
    log "OOM detected - increasing memory limit"

    # Pods of a Deployment are owned by a ReplicaSet named
    # "<deployment>-<pod-template-hash>"; dropping the final dash-separated
    # segment of that name gives the Deployment name.
    replica_set=$(jq -r '
      [ .metadata.ownerReferences[]?
        | select(.kind == "ReplicaSet")
        | .name ][0] // empty' <<<"${pod_json}")
    if [[ -z "${replica_set}" ]]; then
      log "Pod ${NAMESPACE}/${POD_NAME} is not owned by a ReplicaSet"
      log "Manual investigation required"
      exit 1
    fi
    deployment="${replica_set%-*}"

    # Build the patch with jq so the container name and limit are always
    # emitted as valid JSON strings.
    # NOTE(review): this sets the limit unconditionally; a container whose
    # limit already exceeds MEMORY_LIMIT would be lowered - confirm intent.
    patch=$(jq -cn --arg name "${oom_container}" --arg mem "${MEMORY_LIMIT}" \
      '{spec: {template: {spec: {containers: [
         {name: $name, resources: {limits: {memory: $mem}}}
       ]}}}}')
    kubectl patch deployment "${deployment}" -n "${NAMESPACE}" -p "${patch}"
    log "Memory limit increased to ${MEMORY_LIMIT}"
  else
    log "Non-OOM crash - restarting pod"
    kubectl delete pod "${POD_NAME}" -n "${NAMESPACE}" --grace-period=30
  fi
}

#######################################
# Log the five lowest-priority running pods as eviction candidates.
# Nothing is evicted here; the candidates are only reported.
# Outputs:   one "Considering eviction" line per candidate on stdout
#######################################
report_eviction_candidates() {
  local pods ns pod

  log "Node memory pressure detected"

  # --sort-by orders ascending, so the lowest-priority pods come first.
  # The listing is captured in full before truncating so that kubectl is
  # never killed by SIGPIPE (which pipefail would turn into a script failure).
  pods=$(kubectl get pods -A --no-headers --sort-by=.spec.priority \
    --field-selector=status.phase=Running)
  if [[ -z "${pods}" ]]; then
    return 0
  fi

  while read -r ns pod _; do
    log "Considering eviction: ${ns}/${pod}"
  done < <(head -n 5 <<<"${pods}")
}

# Dispatch on the alert name received from the webhook.
main() {
  case "${ALERT_NAME}" in
    PodCrashLooping)
      remediate_crash_loop
      ;;

    HighMemoryUsage)
      report_eviction_candidates
      ;;

    FalcoRuntimeAlert)
      # Only notify - never auto-remediate security alerts
      log "Runtime security alert - do NOT auto-remediate"
      log "Escalate to security team: #security-alerts"
      ;;

    *)
      log "Unknown alert pattern: ${ALERT_NAME}"
      log "Manual investigation required"
      exit 1
      ;;
  esac
}

main "$@"