devsecops-platform / scripts /bash /incident-response.sh
shaikhsalman's picture
feat: DevSecOps Platform - Full production reference architecture
7c19d46 verified
raw
history blame
3.6 kB
#!/usr/bin/env bash
# =============================================================================
# Incident Response Runbook — Automated Response
#
# Usage:
#   incident-response.sh <pod-crash|oom|security|node-down|dns> [namespace]
#
# Arguments:
#   $1  incident type (required); selects which set of diagnostics to run
#   $2  Kubernetes namespace to inspect (optional, defaults to "default")
#
# Requires: kubectl (configured for the target cluster) and jq on PATH.
# =============================================================================
# Strict mode: abort on command failure, on unset variables, and when any
# stage of a pipeline fails.
set -euo pipefail

# Positional arguments. A missing incident type aborts with the usage text.
INCIDENT_TYPE="${1:?Usage: $0 <pod-crash|oom|security|node-down|dns> [namespace]}"
NAMESPACE="${2:-default}"
readonly INCIDENT_TYPE NAMESPACE

# ANSI colour escape sequences. They are stored as literal backslash
# sequences, so whatever prints them must interpret escapes (echo -e or
# printf %b).
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[0;33m'
readonly NC='\033[0m' # reset ("no colour")
# Print a progress message prefixed with the current wall-clock time
# (HH:MM:SS) in yellow. All arguments are joined into one message; backslash
# escapes in the message are interpreted.
log() {
  printf '%b\n' "${YELLOW}[$(date +%H:%M:%S)]${NC} $*"
}
# Print a success message prefixed with a green [OK] tag. All arguments are
# joined into one message; backslash escapes in the message are interpreted.
ok() {
  printf '%b\n' "${GREEN}[OK]${NC} $*"
}
# Print an error message prefixed with a red [FAIL] tag.
# The message goes to stderr so that diagnostics stay visible when stdout is
# redirected or captured, and never mix into command output.
# All arguments are joined into one message; this function does not exit.
fail() {
  echo -e "${RED}[FAIL]${NC} $*" >&2
}
# -----------------------------------------------------------------------------
# Dispatch on the incident type and run the matching diagnostics.
#
#   pod-crash  non-running / crash-looping pods in ${NAMESPACE} and their logs
#   oom        OOM events, top memory consumers, nodes under memory pressure
#   security   selected events, Kyverno failures, Trivy criticals, Falco alerts
#   node-down  node list, nodes that are not Ready, per-node conditions
#   dns        in-cluster lookup from a throwaway pod, then CoreDNS logs
#
# Every arm only reads cluster state, except `dns`, which starts a temporary
# pod that is removed again when the lookup finishes. An unknown incident
# type prints the accepted values on stderr and exits with status 1.
# -----------------------------------------------------------------------------
case "${INCIDENT_TYPE}" in
  pod-crash)
    log "Investigating crash-looping pods in ${NAMESPACE}..."
    kubectl get pods -n "${NAMESPACE}" --field-selector='status.phase!=Running'
    echo ""

    # One snapshot feeds both the restart report and the log loop, so the two
    # views are consistent with each other.
    pods_json="$(kubectl get pods -n "${NAMESPACE}" -o json)"

    # One record per container with more than 3 restarts, reporting that
    # container's own counters (not always the first container of the pod).
    jq -r '
      .items[]
      | .metadata.name as $pod
      | .status.containerStatuses[]?
      | select(.restartCount > 3)
      | {name: $pod, container: .name, restarts: .restartCount,
         reason: .lastState.terminated.reason}' <<<"${pods_json}"
    echo ""

    log "Recent logs from failing pods:"
    # A crash-looping pod still reports phase "Running", so a phase filter
    # alone would skip exactly the pods under investigation. Select pods that
    # are either not Running or have a container past the restart threshold.
    failing_pods="$(jq -r '
      .items[]
      | select(.status.phase != "Running"
               or any(.status.containerStatuses[]?; .restartCount > 3))
      | "pod/" + .metadata.name' <<<"${pods_json}")"
    while IFS= read -r pod; do
      [[ -n "${pod}" ]] || continue
      echo "--- ${pod} ---"
      # Best effort: pods that never started have no logs to show.
      kubectl logs -n "${NAMESPACE}" "${pod}" --tail=50 </dev/null 2>/dev/null \
        || echo "(no logs available)"
    done <<<"${failing_pods}"
    ;;

  oom)
    log "Investigating OOM kills..."
    kubectl get events -A --field-selector=reason=OOMKilling --sort-by='.lastTimestamp'
    echo ""

    log "Pods with high memory usage:"
    # sed reads its whole input, unlike `head`, so kubectl cannot be killed by
    # SIGPIPE and trip `pipefail` on large clusters. The listing is best
    # effort: a missing metrics API must not stop the node check below.
    kubectl top pods -A --sort-by=memory | sed -n '1,20p' \
      || echo "(pod metrics unavailable)"
    echo ""

    log "Nodes under memory pressure:"
    kubectl get nodes -o json \
      | jq -r '
          .items[]
          | select(any(.status.conditions[]?;
                       .type == "MemoryPressure" and .status == "True"))
          | .metadata.name'
    ;;

  security)
    log "Checking security events..."
    # Comma-separated field selectors are ANDed, and an event has exactly one
    # reason, so each reason needs its own query. kubectl sorts ascending, so
    # the most recent events are the last lines of the output.
    for reason in FailedSandbox OOMKilling; do
      kubectl get events -A --field-selector="reason=${reason}" \
        --sort-by='.lastTimestamp' | tail -n 20
    done
    echo ""

    log "Kyverno policy violations:"
    # Best effort: the PolicyReport CRD may not be installed. A result names
    # its targets in `resources`; reports scoped to a single object carry the
    # target in the report-level `scope` instead.
    if policy_reports="$(kubectl get policyreports -A -o json 2>/dev/null)"; then
      jq -r '
        .items[]
        | . as $report
        | .results[]?
        | select(.result == "fail")
        | {policy: .policy,
           resources: [(.resources // [$report.scope])[]?
                       | select(. != null)
                       | "\(.kind)/\(.namespace // "-")/\(.name)"]}' \
        <<<"${policy_reports}"
    else
      echo "(policy reports unavailable)"
    fi
    echo ""

    log "Trivy vulnerability reports:"
    # Report "unavailable" rather than 0 when the query itself fails, so a
    # missing scanner is not mistaken for a clean result.
    if vuln_reports="$(kubectl get vulnerabilityreports -A -o json 2>/dev/null)"; then
      jq -r '[.items[].report.vulnerabilities[]? | select(.severity == "CRITICAL")] | length' \
        <<<"${vuln_reports}"
    else
      echo "(vulnerability reports unavailable)"
    fi
    echo ""

    log "Falco alerts (last hour):"
    # --since=1h matches the heading; --tail=-1 lifts the 10-line-per-pod
    # default that applies when a label selector is used. The match is
    # case-insensitive because Falco's text output writes the priority as
    # "Critical". grep -c prints the count itself and exits 1 on zero matches,
    # hence `|| true` instead of echoing a second "0".
    if falco_logs="$(kubectl logs -n security -l app=falco --since=1h --tail=-1 2>/dev/null)"; then
      grep -ci 'critical' <<<"${falco_logs}" || true
    else
      echo "(Falco logs unavailable)"
    fi
    ;;

  node-down)
    log "Checking node health..."
    kubectl get nodes -o wide
    echo ""

    nodes_json="$(kubectl get nodes -o json)"

    log "NotReady nodes:"
    # A node whose kubelet stopped reporting has Ready=Unknown, not False, so
    # list every node that does not have Ready=True.
    jq -r '
      .items[]
      | select(any(.status.conditions[]?;
                   .type == "Ready" and .status == "True") | not)
      | .metadata.name' <<<"${nodes_json}"
    echo ""

    log "Node conditions:"
    jq -r '
      .items[]
      | {name: .metadata.name,
         conditions: [.status.conditions[]? | {type, status}]}' <<<"${nodes_json}"
    ;;

  dns)
    log "Testing DNS resolution..."
    # The PID suffix keeps a leftover pod from an interrupted run from being
    # reported as a DNS failure. --attach (without -t) works without a TTY.
    # stderr stays visible so the cause of a failure can be seen.
    dns_pod="dns-test-$$"
    kubectl run "${dns_pod}" --image=busybox:1.36 --rm --attach --restart=Never -- \
      nslookup kubernetes.default.svc.cluster.local || echo "DNS FAILED"
    log "CoreDNS logs:"
    kubectl logs -n kube-system -l k8s-app=kube-dns --tail=30
    ;;

  *)
    # Usage errors belong on stderr.
    fail "Unknown incident type: ${INCIDENT_TYPE}" >&2
    echo "Available: pod-crash, oom, security, node-down, dns" >&2
    exit 1
    ;;
esac
# Closing note: blank separator line, then a pointer to the dashboards.
printf '\n'
log "Incident investigation complete. Check dashboards at https://grafana.platform.internal"