Upload incident-response/auto-remediation/auto-remediate.sh with huggingface_hub
Browse files
incident-response/auto-remediation/auto-remediate.sh
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# =============================================================================
|
| 3 |
+
# Autonomous Incident Remediation Engine
|
| 4 |
+
# =============================================================================
|
| 5 |
+
# Triggered by Alertmanager webhook. Auto-remediates known patterns.
|
| 6 |
+
# =============================================================================
|
| 7 |
+
|
| 8 |
+
# Strict mode: abort on any unhandled command failure, treat references to
# unset variables as errors, and fail a pipeline when any stage fails.
set -o errexit -o nounset -o pipefail
|
| 9 |
+
|
| 10 |
+
# Alert name (first argument); selects the remediation branch in the case
# statement below. Falls back to "unknown" when missing or empty.
ALERT_NAME=${1:-unknown}
|
| 11 |
+
# Kubernetes namespace (second argument) passed to kubectl via -n.
# Falls back to "default" when missing or empty.
NAMESPACE=${2:-default}
|
| 12 |
+
# Name of the affected pod (third argument); empty when not supplied.
POD_NAME=${3:-}
|
| 13 |
+
|
| 14 |
+
# Write one log line to stdout: "[HH:MM:SS] [REMEDIATE] <message>".
# All arguments are joined into a single message.
log() {
  printf '[%s] [REMEDIATE] %s\n' "$(date +%H:%M:%S)" "$*"
}
|
| 15 |
+
|
| 16 |
+
# Dispatch on the alert name. Each arm handles one known alert pattern;
# anything unrecognised is logged and the script exits 1.
case "${ALERT_NAME}" in
  PodCrashLooping)
    # Every command in this arm targets a specific pod, so refuse to run
    # kubectl with an empty pod name.
    if [[ -z "${POD_NAME}" ]]; then
      log "PodCrashLooping alert received without a pod name"
      log "Manual investigation required"
      exit 1
    fi
    log "Remediating crash-looping pod: ${NAMESPACE}/${POD_NAME}"

    # Fetch the pod once. Kept as a plain assignment (no `|| true`) so a
    # kubectl failure aborts under `set -e` instead of being mistaken for a
    # non-OOM crash.
    pod_json=$(kubectl get pod "${POD_NAME}" -n "${NAMESPACE}" -o json)

    # Build a strategic-merge patch that raises the memory limit of every
    # container whose last termination reason was OOMKilled. The list is
    # empty when no container was OOM killed.
    oom_patch=$(jq -c '
      [.status.containerStatuses[]?
        | select(.lastState.terminated.reason == "OOMKilled")
        | {name: .name, resources: {limits: {memory: "2Gi"}}}]
      | {spec: {template: {spec: {containers: .}}}}
    ' <<<"${pod_json}")
    OOM_COUNT=$(jq '.spec.template.spec.containers | length' <<<"${oom_patch}")

    if (( OOM_COUNT > 0 )); then
      log "OOM detected - increasing memory limit"
      # Deployment-managed pods are named <deployment>-<rs-hash>-<pod-hash>;
      # strip BOTH generated suffixes. Stripping only one yields the
      # ReplicaSet name, which is not a Deployment.
      deployment="${POD_NAME%-*-*}"
      if [[ "${deployment}" == "${POD_NAME}" ]]; then
        log "Cannot derive a Deployment name from pod: ${POD_NAME}"
        log "Manual investigation required"
        exit 1
      fi
      kubectl patch deployment "${deployment}" -n "${NAMESPACE}" -p "${oom_patch}"
      log "Memory limit increased to 2Gi"
    else
      log "Non-OOM crash - restarting pod"
      kubectl delete pod "${POD_NAME}" -n "${NAMESPACE}" --grace-period=30
    fi
    ;;

  HighMemoryUsage)
    log "Node memory pressure detected"
    # List eviction candidates (log only - nothing is evicted here).
    # kubectl sorts ascending, so the lowest-priority pods come FIRST;
    # --no-headers keeps the column header from being read as a pod.
    # The listing is captured before looping so that stopping after five
    # entries cannot SIGPIPE kubectl and trip `pipefail`.
    running_pods=$(kubectl get pods -A --no-headers \
      --sort-by=.spec.priority --field-selector=status.phase=Running)
    shown=0
    while read -r ns pod _; do
      # Skip the blank line a here-string produces when there are no pods.
      [[ -n "${ns}" && -n "${pod}" ]] || continue
      log "Considering eviction: ${ns}/${pod}"
      shown=$(( shown + 1 ))
      if (( shown >= 5 )); then
        break
      fi
    done <<<"${running_pods}"
    ;;

  FalcoRuntimeAlert)
    # Only notify - never auto-remediate security alerts
    log "Runtime security alert - do NOT auto-remediate"
    log "Escalate to security team: #security-alerts"
    ;;

  *)
    log "Unknown alert pattern: ${ALERT_NAME}"
    log "Manual investigation required"
    exit 1
    ;;
esac
|