#!/usr/bin/env bash
#
# Kubernetes incident triage helper: runs a fixed set of kubectl/jq
# diagnostics for one incident type and prints the results.
#
# Usage: <script> <pod-crash|oom|security|node-down|dns> [namespace]
#   $1  incident type (required)
#   $2  namespace for the pod-crash checks (optional, defaults to "default")

# Strict mode: abort on unhandled errors, unset variables and failed pipelines.
set -euo pipefail

# ${1:?...} prints the usage text and exits when no incident type is given.
INCIDENT_TYPE="${1:?Usage: $0 <pod-crash|oom|security|node-down|dns> [namespace]}"
NAMESPACE="${2:-default}"

# ANSI colour escapes, kept as literal backslash sequences; the helpers below
# expand them with printf's %b so the caller's message is never re-interpreted.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
NC='\033[0m' # reset to the terminal's default colour
readonly RED GREEN YELLOW NC

#######################################
# Print a timestamped progress message.
# Globals:   YELLOW, NC (read)
# Arguments: message words; joined with single spaces
# Outputs:   "[HH:MM:SS] <message>" on stdout, timestamp in yellow
#######################################
log() {
  printf '%b[%s]%b %s\n' "${YELLOW}" "$(date +%H:%M:%S)" "${NC}" "$*"
}

#######################################
# Print a success message.
# Globals:   GREEN, NC (read)
# Arguments: message words; joined with single spaces
# Outputs:   "[OK] <message>" on stdout, tag in green
#######################################
ok() {
  printf '%b[OK]%b %s\n' "${GREEN}" "${NC}" "$*"
}

#######################################
# Print a failure message. Does not exit; callers decide what happens next.
# Globals:   RED, NC (read)
# Arguments: message words; joined with single spaces
# Outputs:   "[FAIL] <message>" on stderr, tag in red
#######################################
fail() {
  printf '%b[FAIL]%b %s\n' "${RED}" "${NC}" "$*" >&2
}

#######################################
# Report a failed best-effort diagnostic and keep going. The script runs
# under `set -euo pipefail`, so every diagnostic below is guarded with
# `|| step_failed ...`; otherwise one missing CRD or metrics source would
# abort the rest of the investigation.
# Arguments: description of the step that failed
# Outputs:   whatever fail() prints
#######################################
step_failed() {
  fail "$* (continuing)"
}

#######################################
# pod-crash: list unhealthy pods in NAMESPACE, summarise restarting
# containers, and show recent logs of the affected pods.
# Globals:   NAMESPACE (read)
# Outputs:   kubectl/jq output on stdout
#######################################
investigate_pod_crash() {
  local pods_json pod

  log "Investigating crash-looping pods in ${NAMESPACE}..."
  kubectl get pods -n "${NAMESPACE}" --field-selector='status.phase!=Running' \
    || step_failed "could not list non-Running pods"
  echo ""

  # Fetch the pod list once and reuse it for both the summary and the log loop.
  if ! pods_json="$(kubectl get pods -n "${NAMESPACE}" -o json)"; then
    step_failed "could not fetch pod details"
    return 0
  fi

  # One record per container with more than 3 restarts, so each pod/container
  # pair appears exactly once and the reported counts belong to the container
  # that is actually restarting.
  jq -r '
    .items[]
    | .metadata.name as $pod
    | .status.containerStatuses[]?
    | select(.restartCount > 3)
    | {name: $pod, container: .name, restarts: .restartCount,
       reason: .lastState.terminated.reason}
  ' <<<"${pods_json}" || step_failed "could not summarise container restarts"
  echo ""

  log "Recent logs from failing pods:"
  # A crash-looping pod normally stays in phase "Running", so select on the
  # restart count as well as on the phase.
  while IFS= read -r pod; do
    [[ -n "${pod}" ]] || continue
    echo "--- ${pod} ---"
    # </dev/null keeps kubectl from consuming the loop's input.
    kubectl logs -n "${NAMESPACE}" "${pod}" --tail=50 2>/dev/null </dev/null \
      || echo "(no logs available)"
  done < <(
    jq -r '
      .items[]
      | select(.status.phase != "Running"
               or any(.status.containerStatuses[]?; .restartCount > 3))
      | "pod/\(.metadata.name)"
    ' <<<"${pods_json}"
  )
}

#######################################
# oom: show OOMKilling events, the top memory consumers and nodes that
# report MemoryPressure.
# Outputs:   kubectl/jq output on stdout
#######################################
investigate_oom() {
  log "Investigating OOM kills..."
  kubectl get events -A --field-selector=reason=OOMKilling --sort-by='.lastTimestamp' \
    || step_failed "could not list OOMKilling events"
  echo ""

  log "Pods with high memory usage:"
  # sed reads its whole input (unlike head), so kubectl cannot die from
  # SIGPIPE and trip pipefail on long listings.
  kubectl top pods -A --sort-by=memory | sed -n '1,20p' \
    || step_failed "kubectl top failed"
  echo ""

  log "Nodes under memory pressure:"
  kubectl get nodes -o json \
    | jq -r '
        .items[]
        | select(any(.status.conditions[]?;
                     .type == "MemoryPressure" and .status == "True"))
        | .metadata.name
      ' \
    || step_failed "could not evaluate node memory pressure"
}

#######################################
# security: show selected cluster events, failed policy report results,
# the number of CRITICAL vulnerabilities and the number of CRITICAL lines
# in the Falco logs of the last hour.
# Outputs:   kubectl/jq output and counts on stdout
#######################################
investigate_security() {
  local reason falco_hits

  log "Checking security events..."
  # Comma-separated field selectors are ANDed, so two values for the same
  # field can never match; query each reason on its own instead.
  # NOTE(review): kubelet sandbox failures are usually reported with reason
  # "FailedCreatePodSandBox" - confirm that "FailedSandbox" is what you want.
  for reason in FailedSandbox OOMKilling; do
    kubectl get events -A --field-selector="reason=${reason}" --sort-by='.lastTimestamp' \
      | sed -n '1,20p' \
      || step_failed "could not list ${reason} events"
  done
  echo ""

  log "Kyverno policy violations:"
  # `[]?` tolerates reports that carry no results.
  kubectl get policyreports -A -o json \
    | jq -r '.items[].results[]? | select(.result=="fail") | {policy: .policy, resource: .resource}' \
    || step_failed "could not read policy reports"
  echo ""

  log "Trivy vulnerability reports:"
  kubectl get vulnerabilityreports -A -o json \
    | jq -r '[.items[].report.vulnerabilities[]? | select(.severity=="CRITICAL")] | length' 2>/dev/null \
    || echo "0"
  echo ""

  log "Falco alerts (last hour):"
  # grep -c always prints a count but exits 1 on zero matches; swallow that
  # status rather than printing a second "0". --since=1h matches the heading.
  falco_hits="$(kubectl logs -n security -l app=falco --since=1h --tail=-1 2>/dev/null \
    | grep -c "CRITICAL")" || true
  echo "${falco_hits:-0}"
}

#######################################
# node-down: list nodes, name those whose Ready condition is not "True",
# and dump every node's conditions.
# Outputs:   kubectl/jq output on stdout
#######################################
investigate_node_down() {
  local nodes_json

  log "Checking node health..."
  kubectl get nodes -o wide || step_failed "could not list nodes"
  echo ""

  if ! nodes_json="$(kubectl get nodes -o json)"; then
    step_failed "could not fetch node details"
    return 0
  fi

  log "NotReady nodes:"
  # Ready can be "False" or "Unknown" (kubelet stopped reporting); both mean
  # the node is not usable, so match anything other than "True".
  jq -r '
    .items[]
    | select(any(.status.conditions[]?; .type == "Ready" and .status != "True"))
    | .metadata.name
  ' <<<"${nodes_json}" || step_failed "could not evaluate node readiness"
  echo ""

  log "Node conditions:"
  jq -r '.items[] | {name: .metadata.name, conditions: [.status.conditions[] | {type, status}]}' \
    <<<"${nodes_json}" || step_failed "could not print node conditions"
}

#######################################
# dns: resolve the API service name from a throw-away busybox pod and show
# recent CoreDNS logs.
# Outputs:   nslookup output or "DNS FAILED", then CoreDNS logs, on stdout
#######################################
investigate_dns() {
  log "Testing DNS resolution..."
  # -i without -t works when no terminal is attached (cron, CI); the PID
  # suffix avoids colliding with a pod left over from an interrupted run.
  kubectl run "dns-test-$$" --image=busybox:1.36 --rm -i --restart=Never -- \
    nslookup kubernetes.default.svc.cluster.local 2>/dev/null || echo "DNS FAILED"

  log "CoreDNS logs:"
  kubectl logs -n kube-system -l k8s-app=kube-dns --tail=30 \
    || step_failed "could not read CoreDNS logs"
}

# Dispatch on the requested incident type.
case "${INCIDENT_TYPE}" in
  pod-crash) investigate_pod_crash ;;
  oom) investigate_oom ;;
  security) investigate_security ;;
  node-down) investigate_node_down ;;
  dns) investigate_dns ;;
  *)
    fail "Unknown incident type: ${INCIDENT_TYPE}"
    echo "Available: pod-crash, oom, security, node-down, dns"
    exit 1
    ;;
esac

echo ""
log "Incident investigation complete. Check dashboards at https://grafana.platform.internal"