#!/usr/bin/env bash
# =============================================================================
# Incident Response Runbook — Automated Response
# =============================================================================
#
# Usage: <script> <incident-type> [namespace]
#
#   incident-type  One of: pod-crash, oom, security, node-down, dns.
#   namespace      Namespace inspected by the pod-crash runbook
#                  (default: "default"). The other runbooks query cluster-wide
#                  or fixed namespaces.
#
# Requires kubectl and jq on PATH. Exits 1 when the incident type is missing
# or unknown. Because of `set -euo pipefail`, a failing kubectl/jq step aborts
# the run unless the step is explicitly handled below.
# =============================================================================

set -euo pipefail

INCIDENT_TYPE="${1:?Usage: $0 <incident-type> [namespace]}"
NAMESPACE="${2:-default}"
readonly INCIDENT_TYPE NAMESPACE

# A container is reported as crash-looping once its restart count exceeds this.
readonly RESTART_THRESHOLD=3

readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[0;33m'
readonly NC='\033[0m'

# Logging helpers. Colour codes are expanded through %b; the message itself is
# printed with %s so backslashes inside it are never interpreted.
log() { printf '%b[%s]%b %s\n' "${YELLOW}" "$(date +%H:%M:%S)" "${NC}" "$*"; }
ok() { printf '%b[OK]%b %s\n' "${GREEN}" "${NC}" "$*"; }
fail() { printf '%b[FAIL]%b %s\n' "${RED}" "${NC}" "$*" >&2; }

#######################################
# Print at most the first N lines of stdin.
# Unlike `head`, awk keeps reading until EOF, so the upstream command is never
# killed by SIGPIPE (which would abort the script under `set -o pipefail`).
# Arguments: $1 - maximum number of lines to print
#######################################
limit_lines() {
  awk -v max="$1" 'NR <= max'
}

#######################################
# pod-crash: list unhealthy pods in NAMESPACE, report containers whose restart
# count exceeds RESTART_THRESHOLD, and print recent logs for those pods.
# Globals: NAMESPACE, RESTART_THRESHOLD (read)
#######################################
investigate_pod_crash() {
  local pods_json pod

  log "Investigating crash-looping pods in ${NAMESPACE}..."
  # Chained field selectors are ANDed: neither Running nor Succeeded.
  # Succeeded pods (completed jobs) are excluded because they are not failing.
  kubectl get pods -n "${NAMESPACE}" \
    --field-selector=status.phase!=Running,status.phase!=Succeeded

  echo ""
  pods_json=$(kubectl get pods -n "${NAMESPACE}" -o json)

  # One entry per container over the threshold, carrying that container's own
  # restart count and last termination reason.
  jq -r --argjson threshold "${RESTART_THRESHOLD}" '
    .items[]
    | .metadata.name as $pod
    | .status.containerStatuses[]?
    | select(.restartCount > $threshold)
    | {
        name: $pod,
        container: .name,
        restarts: .restartCount,
        reason: .lastState.terminated.reason
      }
  ' <<<"${pods_json}"

  echo ""
  log "Recent logs from failing pods:"
  # A pod in CrashLoopBackOff is still in phase Running, so select on restart
  # count as well as on phase.
  while IFS= read -r pod; do
    [[ -n "${pod}" ]] || continue
    echo "--- ${pod} ---"
    # Try the current container first, then the previous (crashed) instance.
    kubectl logs -n "${NAMESPACE}" "${pod}" --tail=50 2>/dev/null </dev/null \
      || kubectl logs -n "${NAMESPACE}" "${pod}" --tail=50 --previous \
        2>/dev/null </dev/null \
      || echo "(no logs available)"
  done < <(
    jq -r --argjson threshold "${RESTART_THRESHOLD}" '
      .items[]
      | select(
          (.status.phase != "Running" and .status.phase != "Succeeded")
          or any(.status.containerStatuses[]?; .restartCount > $threshold)
        )
      | "pod/" + .metadata.name
    ' <<<"${pods_json}"
  )
}

#######################################
# oom: show OOMKilling events, the top memory consumers, and the nodes that
# report MemoryPressure=True.
#######################################
investigate_oom() {
  log "Investigating OOM kills..."
  kubectl get events -A --field-selector=reason=OOMKilling \
    --sort-by='.lastTimestamp'

  echo ""
  log "Pods with high memory usage:"
  # `kubectl top` depends on the metrics API; report and carry on if it fails.
  kubectl top pods -A --sort-by=memory | limit_lines 20 \
    || fail "Could not read pod metrics (is the metrics API available?)"

  echo ""
  log "Nodes under memory pressure:"
  kubectl get nodes -o json \
    | jq -r '
        .items[]
        | select(any(.status.conditions[]?;
                     .type == "MemoryPressure" and .status == "True"))
        | .metadata.name
      '
}

#######################################
# security: show sandbox/OOM events, failed policy report results, the number
# of CRITICAL vulnerabilities, and the number of critical Falco log lines.
#######################################
investigate_security() {
  local reason critical_vulns falco_alerts

  log "Checking security events..."
  # Field selectors joined with "," are ANDed, so "reason=A,reason=B" can never
  # match; query each reason separately instead.
  # NOTE(review): the kubelet appears to report sandbox failures as
  # FailedCreatePodSandBox; FailedSandbox is kept from the original list —
  # confirm which reasons this cluster actually emits.
  for reason in FailedSandbox FailedCreatePodSandBox OOMKilling; do
    kubectl get events -A --field-selector="reason=${reason}" \
      --sort-by='.lastTimestamp' | limit_lines 20
  done

  echo ""
  log "Kyverno policy violations:"
  # NOTE(review): reads `.resource` from each result as the original did;
  # verify the field name against the installed PolicyReport CRD version.
  kubectl get policyreports -A -o json \
    | jq -r '
        .items[].results[]?
        | select(.result == "fail")
        | {policy: .policy, resource: .resource}
      ' \
    || fail "Could not read policy reports (is Kyverno installed?)"

  echo ""
  log "Trivy vulnerability reports:"
  # Best-effort: print 0 when the reports cannot be read.
  critical_vulns=$(
    kubectl get vulnerabilityreports -A -o json 2>/dev/null \
      | jq -r '[.items[].report.vulnerabilities[]?
                | select(.severity == "CRITICAL")] | length' 2>/dev/null
  ) || critical_vulns=0
  echo "${critical_vulns:-0}"

  echo ""
  log "Falco alerts (last hour):"
  # grep -c already prints "0" (and exits 1) when nothing matches, so swallow
  # the status instead of echoing a second "0". --tail=-1 lifts the per-pod
  # line cap that kubectl applies when a label selector is used.
  falco_alerts=$(
    kubectl logs -n security -l app=falco --since=1h --tail=-1 2>/dev/null \
      | grep -ci 'critical' || true
  )
  echo "${falco_alerts:-0}"
}

#######################################
# node-down: show all nodes, the nodes whose Ready condition is not "True",
# and every node's condition list.
#######################################
investigate_node_down() {
  log "Checking node health..."
  kubectl get nodes -o wide

  echo ""
  log "NotReady nodes:"
  # A node that stopped reporting has Ready=Unknown rather than Ready=False,
  # so match anything other than "True".
  kubectl get nodes -o json \
    | jq -r '
        .items[]
        | select(any(.status.conditions[]?;
                     .type == "Ready" and .status != "True"))
        | .metadata.name
      '

  echo ""
  log "Node conditions:"
  kubectl get nodes -o json \
    | jq -r '
        .items[]
        | {
            name: .metadata.name,
            conditions: [.status.conditions[] | {type, status}]
          }
      '
}

#######################################
# dns: resolve the API service name from a throwaway busybox pod and show
# recent CoreDNS logs.
#######################################
investigate_dns() {
  log "Testing DNS resolution..."
  # The PID suffix keeps a stale pod from an earlier run from causing a false
  # "DNS FAILED" through a name collision.
  kubectl run "dns-test-$$" --image=busybox:1.36 --rm -it --restart=Never -- \
    nslookup kubernetes.default.svc.cluster.local 2>/dev/null \
    || echo "DNS FAILED"

  log "CoreDNS logs:"
  kubectl logs -n kube-system -l k8s-app=kube-dns --tail=30
}

#######################################
# Dispatch to the runbook selected by INCIDENT_TYPE.
# Globals: INCIDENT_TYPE (read)
# Returns: exits 1 on an unknown incident type
#######################################
main() {
  case "${INCIDENT_TYPE}" in
    pod-crash)
      investigate_pod_crash
      ;;
    oom)
      investigate_oom
      ;;
    security)
      investigate_security
      ;;
    node-down)
      investigate_node_down
      ;;
    dns)
      investigate_dns
      ;;
    *)
      fail "Unknown incident type: ${INCIDENT_TYPE}"
      echo "Available: pod-crash, oom, security, node-down, dns" >&2
      exit 1
      ;;
  esac

  echo ""
  log "Incident investigation complete. Check dashboards at https://grafana.platform.internal"
}

main "$@"