#!/usr/bin/env bash
# =============================================================================
# Incident Response Runbook — Automated Response
# =============================================================================

# Strict mode: abort on a failing command (errexit), on expansion of an unset
# variable (nounset), and when any stage of a pipeline fails (pipefail).
set -o errexit -o nounset -o pipefail

# Positional arguments:
#   $1  incident type to investigate; required — when missing or empty the
#       script aborts and prints the usage text below.
#   $2  namespace passed to the namespaced kubectl queries; optional, falls
#       back to "default".
INCIDENT_TYPE="${1:?Usage: $0 <pod-crash|oom|security|node-down|dns>}"
NAMESPACE="${2:-default}"

# ANSI colour escape sequences used by the log/ok/fail helpers.
# They are stored as literal backslash sequences (single-quoted) and are only
# turned into real escape bytes by the escape-interpreting print command that
# emits them. Declared readonly so nothing later in the script can overwrite
# them by accident.
readonly RED='\033[0;31m'     # red    — failure prefix
readonly GREEN='\033[0;32m'   # green  — success prefix
readonly YELLOW='\033[0;33m'  # yellow — timestamp prefix
readonly NC='\033[0m'         # "no colour": resets all attributes

# log MESSAGE...
# Print MESSAGE on stdout, prefixed with the current time (HH:MM:SS) wrapped in
# the YELLOW/NC colour sequences. Only the colour variables are
# escape-expanded (%b); the message is printed verbatim (%s), so a backslash
# inside it (e.g. in a user-supplied namespace) is never interpreted.
log() {
  printf '%b[%s]%b %s\n' "${YELLOW}" "$(date +%H:%M:%S)" "${NC}" "$*"
}
# ok MESSAGE...
# Print MESSAGE on stdout behind a green "[OK]" tag. The colour variables are
# escape-expanded (%b); the message is printed verbatim (%s) so backslashes in
# it are not interpreted.
ok() {
  printf '%b[OK]%b %s\n' "${GREEN}" "${NC}" "$*"
}
# fail MESSAGE...
# Print MESSAGE on stdout behind a red "[FAIL]" tag. It only reports: it does
# not exit and returns success, so callers decide whether to abort. The colour
# variables are escape-expanded (%b); the message is printed verbatim (%s) so
# backslashes in it are not interpreted.
fail() {
  printf '%b[FAIL]%b %s\n' "${RED}" "${NC}" "$*"
}

# -----------------------------------------------------------------------------
# Dispatch on the incident type and run the matching read-only diagnostics.
#
# The script runs under `set -euo pipefail`, so every probe below is guarded
# (`|| fail "..."` or an `if`): a probe that cannot run is reported and the
# remaining probes still execute, instead of the whole investigation aborting
# at the first error.
#
# Output is never truncated with `head`: `head` closes the pipe early, which
# can kill kubectl with SIGPIPE and, under pipefail, fail the pipeline.
# `awk`/`tail` read their input to the end.
# -----------------------------------------------------------------------------
case "${INCIDENT_TYPE}" in
  pod-crash)
    log "Investigating crash-looping pods in ${NAMESPACE}..."
    kubectl get pods -n "${NAMESPACE}" --field-selector='status.phase!=Running' \
      || fail "Could not list non-Running pods in ${NAMESPACE}"
    echo ""

    # Fetch the pod list once and reuse it for the restart summary and for
    # choosing which pods to pull logs from.
    if pods_json="$(kubectl get pods -n "${NAMESPACE}" -o json)"; then
      # One record per *container* with more than 3 restarts, so multi-container
      # pods report the container that is actually restarting.
      jq -r '
        .items[]
        | .metadata.name as $pod
        | .status.containerStatuses[]?
        | select(.restartCount > 3)
        | {name: $pod, container: .name, restarts: .restartCount,
           reason: .lastState.terminated.reason}
      ' <<<"${pods_json}" || fail "Could not summarise container restarts"
      echo ""
      log "Recent logs from failing pods:"
      # A crash-looping pod normally stays in phase Running while its container
      # is being restarted, so select by restart count as well as by phase.
      failing_pods="$(jq -r '
        .items[]
        | select(.status.phase != "Running"
                 or any(.status.containerStatuses[]?; .restartCount > 3))
        | "pod/\(.metadata.name)"
      ' <<<"${pods_json}")" || failing_pods=""
      while IFS= read -r pod; do
        [[ -n "${pod}" ]] || continue
        echo "--- ${pod} ---"
        # Best effort: pods that never started have no logs.
        kubectl logs -n "${NAMESPACE}" "${pod}" --all-containers --tail=50 \
          2>/dev/null </dev/null || echo "(no logs available)"
      done <<<"${failing_pods}"
    else
      fail "Could not list pods in ${NAMESPACE}"
    fi
    ;;

  oom)
    log "Investigating OOM kills..."
    kubectl get events -A --field-selector=reason=OOMKilling --sort-by='.lastTimestamp' \
      || fail "Could not list OOMKilling events"
    echo ""
    log "Pods with high memory usage:"
    # awk keeps the first 20 lines but still drains kubectl's output.
    kubectl top pods -A --sort-by=memory \
      | awk 'NR <= 20' \
      || fail "kubectl top failed (is metrics-server available?)"
    echo ""
    log "Nodes under memory pressure:"
    kubectl get nodes -o json \
      | jq -r '
          .items[]
          | select(any(.status.conditions[]?;
                       .type == "MemoryPressure" and .status == "True"))
          | .metadata.name' \
      || fail "Could not evaluate node memory pressure"
    ;;

  security)
    log "Checking security events..."
    # Chained field selectors are ANDed, so "reason=A,reason=B" can never
    # match; query each reason separately. kubectl sorts ascending, so the
    # newest events are last and `tail` keeps those.
    # NOTE(review): confirm "FailedSandbox" is the event reason emitted in
    # this cluster.
    for reason in FailedSandbox OOMKilling; do
      kubectl get events -A --field-selector="reason=${reason}" --sort-by='.lastTimestamp' \
        | tail -n 20 \
        || fail "Could not list ${reason} events"
    done
    echo ""
    log "Kyverno policy violations:"
    # Results may be absent on a report (hence `[]?`). The affected objects
    # are in results[].resources, or in the report-level .scope for
    # per-resource reports.
    kubectl get policyreports -A -o json \
      | jq -r '
          .items[]
          | . as $report
          | .results[]?
          | select(.result == "fail")
          | {policy: .policy,
             resources: (.resources // [$report.scope // empty])}' \
      || fail "Could not read policyreports (is the PolicyReport CRD installed?)"
    echo ""
    log "Trivy vulnerability reports:"
    # Report a failed query as a failure rather than as "0 critical findings".
    kubectl get vulnerabilityreports -A -o json \
      | jq -r '[.items[].report.vulnerabilities[]? | select(.severity == "CRITICAL")] | length' \
      || fail "Could not read vulnerabilityreports (is Trivy Operator installed?)"
    echo ""
    log "Falco alerts (last hour):"
    # --since=1h matches the label above; --tail=-1 lifts the per-pod line cap
    # that kubectl applies when a label selector is used.
    # NOTE(review): the match is case-sensitive — confirm Falco emits
    # "CRITICAL" in this spelling.
    if falco_logs="$(kubectl logs -n security -l app=falco --since=1h --tail=-1 2>/dev/null)"; then
      # grep -c already prints 0 when nothing matches but exits 1; swallow
      # only the exit status so the count is printed exactly once.
      grep -c "CRITICAL" <<<"${falco_logs}" || true
    else
      fail "Could not read Falco logs (pods labelled app=falco in namespace security)"
    fi
    ;;

  node-down)
    log "Checking node health..."
    kubectl get nodes -o wide || fail "Could not list nodes"
    echo ""
    log "NotReady nodes:"
    if nodes_json="$(kubectl get nodes -o json)"; then
      # A node that stopped reporting has Ready=Unknown, not Ready=False, so
      # treat anything other than "True" as not ready.
      jq -r '
        .items[]
        | select(any(.status.conditions[]?;
                     .type == "Ready" and .status != "True"))
        | .metadata.name
      ' <<<"${nodes_json}" || fail "Could not evaluate node readiness"
      echo ""
      log "Node conditions:"
      jq -r '
        .items[]
        | {name: .metadata.name,
           conditions: [.status.conditions[]? | {type, status}]}
      ' <<<"${nodes_json}" || fail "Could not list node conditions"
    else
      fail "Could not fetch node status"
    fi
    ;;

  dns)
    log "Testing DNS resolution..."
    # Per-run pod name: a leftover "dns-test" pod from an interrupted or
    # concurrent run would make `kubectl run` fail and be misreported as a
    # DNS failure.
    dns_pod="dns-test-$$"
    kubectl run "${dns_pod}" --image=busybox:1.36 --rm -it --restart=Never -- \
      nslookup kubernetes.default.svc.cluster.local 2>/dev/null || fail "DNS FAILED"
    log "CoreDNS logs:"
    kubectl logs -n kube-system -l k8s-app=kube-dns --tail=30 \
      || fail "Could not read CoreDNS logs"
    ;;

  *)
    # Usage errors go to stderr and exit non-zero.
    fail "Unknown incident type: ${INCIDENT_TYPE}" >&2
    echo "Available: pod-crash, oom, security, node-down, dns" >&2
    exit 1
    ;;
esac

# Closing banner: one blank separator line, then a timestamped pointer to the
# dashboards.
printf '\n'
log "Incident investigation complete. Check dashboards at https://grafana.platform.internal"